Example no. 1
 def insert_article(self, art):
     exists = self.session.query(Article).filter_by(url=art['url']).first()
     if not exists:
         article = Article(url=art['url'], title=art['title'], date=art['date'])
         self.session.add(article)
         logger.write_log("added: " + art['title'] + " to database")
     self.session.commit()
Example no. 2
 def insert_location(self, location):
     exists = self.session.query(Location).filter_by(
         name=location['name']).first()
     if not exists:
         loc = Location(name=location['name'])
         self.session.add(loc)
         logger.write_log("added: " + location['name'] + " to database")
     self.session.commit()
Example no. 3
 def insert_organisation(self, organisation):
     exists = self.session.query(Organisation).filter_by(
         name=organisation['name']).first()
     if not exists:
         org = Organisation(name=organisation['name'])
         self.session.add(org)
         logger.write_log("added: " + organisation['name'] + " to database")
     self.session.commit()
Example no. 4
 def insert_person(self, person):
     exists = self.session.query(Person).filter_by(
         name=person['name']).first()
     if not exists:
         pers = Person(name=person['name'])
         self.session.add(pers)
         logger.write_log("added: " + person['name'] + " to database")
     self.session.commit()
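Examples no. 1 to no. 4 repeat the same check-then-insert pattern and differ only in the mapped class and the columns involved. A minimal sketch of how they could collapse into one helper (the name get_or_create is hypothetical; it assumes the same SQLAlchemy session and logger as above, and that the lookup fields are exactly the constructor fields, which holds for Examples 2 to 4 but not for insert_article, which also stores title and date):

 def get_or_create(self, model, **fields):
     # Insert a row of the given model only if an identical lookup fails
     exists = self.session.query(model).filter_by(**fields).first()
     if not exists:
         self.session.add(model(**fields))
         logger.write_log("added: " + repr(fields) + " to database")
     self.session.commit()

Example no. 4 would then reduce to self.get_or_create(Person, name=person['name']).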
Example no. 5
 def get_internal_links(self):
     logger.write_log("visiting: " + self.url)
     links = self.get_links()
     internal_links = []
     for link in links:
         if "brainpickings.org/20" in link and link not in internal_links:
             internal_links.append(link)
     return internal_links
Example no. 6
 def insert_reference(self, ref):
     # Generate unique id for db
     sha_id = hashlib.sha1(bytes(ref['url'] + ref['ref'], 'utf-8'))
     ref['id'] = sha_id.hexdigest()
     exists = self.session.query(Reference).filter_by(id=ref['id']).first()
     if not exists:
         reference = Reference(id=ref['id'], url=ref['url'], ref=ref['ref'])
         self.session.add(reference)
         logger.write_log("added reference from: " + ref['url'] + " to: " +
                          ref['ref'])
     self.session.commit()
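Deriving the primary key from a SHA-1 over the concatenated strings makes insert_reference idempotent: the same (url, ref) pair always produces the same id, so a repeated call finds the existing row and only commits. A standalone check of that property (plain hashlib, no database involved; the example strings are made up):

import hashlib

first = hashlib.sha1(bytes("https://example.org/a" + "ref-1", 'utf-8')).hexdigest()
again = hashlib.sha1(bytes("https://example.org/a" + "ref-1", 'utf-8')).hexdigest()
assert first == again  # identical input -> identical id -> filter_by(id=...) hits

One caveat of plain concatenation: pairs like ('ab', 'c') and ('a', 'bc') hash to the same id, so a separator between the two parts would make the key unambiguous.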
Example no. 7
 def get_people(self):
     logger.write_log("getting people from: " + self.url)
     persons = self.ner.get_persons()
     if persons:
         # Flag the person with the highest mention count as the main subject
         main = 0
         for i, pers in enumerate(persons):
             pers['am_i_main'] = False
             if pers['count'] > persons[main]['count']:
                 main = i
         persons[main]['am_i_main'] = True
     return persons
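For illustration, the selection logic above run on a hypothetical, hand-made NER result (the strict > comparison means the first entry wins a tie):

persons = [{'name': 'Alan Turing', 'count': 2},
           {'name': 'Ada Lovelace', 'count': 5}]
main = 0
for i, pers in enumerate(persons):
    pers['am_i_main'] = False
    if pers['count'] > persons[main]['count']:
        main = i
persons[main]['am_i_main'] = True
# Only 'Ada Lovelace' is left with am_i_main == True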
Example no. 8
 def insert_location_rel(self, rel):
     sha_id = hashlib.sha1(bytes(rel['article'] + rel['location'], 'utf-8'))
     rel['id'] = sha_id.hexdigest()
     exists = self.session.query(LocationRel).filter_by(
         id=rel['id']).first()
     if not exists:
         relation = LocationRel(id=rel['id'],
                                article=rel['article'],
                                location=rel['location'],
                                count=rel['count'])
         self.session.add(relation)
         logger.write_log("added relation from: " + rel['article'] +
                          " to: " + rel['location'])
     self.session.commit()
Example no. 9
def optimize_my_database():
    logger.write_log("Optimize one-offs")
    delete_one_offs()

    logger.write_log("Optimize too-longs")
    delete_five_or_more()

    logger.write_log("Optimize duplication")
    fix_name_duplication()
    fix_case_duplication()

    logger.write_log("Optimize not-letters")
    fix_symbols()

    # logger.write_log("Fix link-errors")
    # link_errors()

    logger.write_log("Verify")
    verify_all_with_dbpedia()
Example no. 10
    def bp_index(self):
        """
        Main index method, iterates all dates within a range
        :return: a list of dicts { 'url': url, 'title': title, 'date': date object }
        """
        logger.write_log("Indexing from: " + str(self.start_date) + " to: " +
                         str(self.end_date))

        db = DBSession()
        articlelist = []
        delta = dt.timedelta(days=1)

        while self.start_date <= self.end_date:
            page = self.fetch_page(self.start_date.year, self.start_date.month,
                                   self.start_date.day)
            if page != "empty":
                for article in page:
                    articlelist.append(article)
                    db.insert_article(article)
            self.start_date += delta
        return articlelist
Example no. 11
    def fetch_page(self, y, m, d):
        """
        Constructs an url and checks if it contains a page.
        :param y: year
        :param m: month
        :param d: day
        :return: a list of dicts { 'url': url, 'title': title, 'date': date object }
        """

        url = self.baseurl + "/" + dts(y) + "/" + dts(m) + "/" + dts(d) + "/"
        response = read(url, self.local)
        if response == "empty":
            return response
        else:
            logger.write_log("visiting: " + url)
            title = url.replace("https://", "")
            title = title.replace("/", ":")
            # Save the article locally
            if self.save and not self.local:
                save_html(response, "html_collection_pages/" + title)
            articles = self.fetch_articles(response)
            for article in articles:
                article['date'] = dt.date(y, m, d)
            return articles
Example no. 12
import urllib.error
from urllib.request import urlopen


def read_url(url):
    try:
        response = urlopen(url)
    except (urllib.error.HTTPError, urllib.error.URLError) as e:
        logger.write_log(url + " : " + str(e))
        return "empty"
    # Only index pages served as UTF-8 HTML
    if response.getheader('Content-Type') == "text/html; charset=UTF-8":
        html_bytes = response.read()
        html_string = html_bytes.decode("utf-8")
        logger.write_log(url + " : " + "Added to index")
    else:
        print(url + " not crawlable")
        logger.write_log(url + " : " + "Page not html/utf-8")
        return "empty"
    return html_string
Example no. 13
def save_html(html_string, location):
    location += ".html"
    # Context manager closes the file even if the write fails
    with open(location, "w", encoding="utf-8") as file:
        file.write(html_string)
    logger.write_log(location + " written to file")
Example no. 14
 def get_organisations(self):
     logger.write_log("getting organisations from: " + self.url)
     return self.ner.get_organisations()
Example no. 15
 def get_locations(self):
     logger.write_log("getting locations from: " + self.url)
     return self.ner.get_locations()