Ejemplo n.º 1
0
 def web_download_articles(self):
     '''
     Download the latest articles from this source's RSS feed and insert
     the ones that are not already present in the `articles` table.

     This is a generator. For every feed item that is not yet stored it
     yields a progress string: "Adding candidate article <title>" when the
     article produced values to insert, or "Ignoring article <title>" when
     it did not. The INSERT and commit only run once the generator has
     been consumed past the last feed item.

     Side effects:
         self.rows_affected    -- cursor.rowcount of the INSERT, or 0 when
                                  nothing was inserted.
         self.duplicates_found -- number of counted candidates that did
                                  not end up in the INSERT.

     Guid lookups use driver-side parameter binding (`%s` placeholders)
     because the guid comes straight from the remote feed.
     '''
     cursor = self.db.cursor()
     try:
         rss_reader = pyrss.RSSReader(self.rss_url)
         # guid -> pre-rendered "(...)" VALUES tuple for the INSERT below
         articles_to_add = dict()
         num_candidate_articles = 0
         for rss_item in rss_reader.GetItems():
             if not rss_item:
                 continue
             # Cheap per-item lookup so that already-stored articles are
             # skipped before an Article object is built for them.
             cursor.execute(
                 "SELECT guid FROM articles WHERE guid = %s",
                 (str(rss_item.guid),))
             existing_articles = database.get_first_column(cursor.fetchall())
             if len(existing_articles) != 0:
                 continue
             item_article = article.Article(tag_to_find = self.article_tags,article_rss_item = rss_item,source = self)
             article_query_values = item_article.get_article_values()
             if article_query_values:
                 articles_to_add[item_article.guid] = "(" + article_query_values + ")"
                 yield "Adding candidate article " + item_article.title
                 num_candidate_articles += 1
             else:
                 yield "Ignoring article " + item_article.title

         # Re-check all candidates in one query: control left this method
         # at every yield, so rows may have appeared in the meantime.
         if articles_to_add:
             candidate_guids = tuple(articles_to_add.keys())
             placeholders = ", ".join(["%s"] * len(candidate_guids))
             cursor.execute(
                 "SELECT guid FROM articles WHERE guid IN (" + placeholders + ")",
                 candidate_guids)
             existing_articles = database.get_first_column(cursor.fetchall())
         else:
             # nothing to look up; avoid a pointless round trip
             existing_articles = []

         articles_to_add = [values for guid, values in articles_to_add.items()
                            if guid not in existing_articles]
         if articles_to_add:
             # NOTE(review): each entry is SQL text already rendered by
             # Article.get_article_values(); it cannot be parameterized
             # from here, so escaping has to be guaranteed there -- confirm.
             query = "INSERT INTO `articles`"\
                 " (`guid`, `url`, `source`, `title`, `author`, `originaltext`, `datepublished`, `dateretrieved`) "\
                 "VALUES " + ",".join(articles_to_add)
             cursor.execute(query)
             self.rows_affected = cursor.rowcount
         else:
             self.rows_affected = 0
     finally:
         # always release the cursor, including on errors and when
         # nothing was inserted
         cursor.close()
     if articles_to_add:
         self.db.commit()
     self.duplicates_found = num_candidate_articles - len(articles_to_add)
Ejemplo n.º 2
0
 def db_load_info(self):
     '''
     Load this source's row and related settings from the database into
     attributes of this object.

     Reads the `sources` row whose id equals self.id and sets
     newspaper_name, website_name, website_url, rss_url and description
     from columns 1, 2, 3, 4 and 6 of that row. Also sets
     bad_attribute_values from `bad_tag_attributes` and article_tags from
     `target_tags` (ordered by `order`), the latter passed through
     self.get_attrs_and_tags().

     Errors are handled on a best-effort basis: any exception, including
     the RuntimeError raised when the source row is not found, is printed
     and swallowed, so the attributes may be left unset.
     '''
     try:
         # create a database cursor (emulated by MySQLdb)
         cursor = self.db.cursor()
         try:
             # Bind the id as a parameter instead of concatenating it into
             # the SQL text; str() keeps the original string comparison
             # and also tolerates a non-string id.
             source_id = str(self.id)
             # get the source with the ID the same as this one
             cursor.execute("SELECT * FROM sources WHERE id = %s", (source_id,))
             source_info = cursor.fetchone()
             if cursor.rowcount != 1:
                 raise RuntimeError("Could not find the source '" + str(self.id) + "' in the database")
             self.newspaper_name = source_info[1]
             self.website_name = source_info[2]
             self.website_url = source_info[3]
             self.rss_url = source_info[4]
             # column 5 used to hold the single tag per source; tags are
             # now loaded from target_tags below
             self.description = source_info[6]
             cursor.execute("SELECT tag_attribute_value FROM bad_tag_attributes WHERE source_id = %s", (source_id,))
             self.bad_attribute_values = database.get_first_column(cursor.fetchall())
             cursor.execute("SELECT article_tag FROM target_tags WHERE source_id = %s ORDER BY `order` ASC", (source_id,))
             self.article_tags = self.get_attrs_and_tags(database.get_first_column(cursor.fetchall()))
         finally:
             # release the cursor on failure as well as on success
             cursor.close()
         self.db.commit()
     except Exception as err:
         # deliberate best-effort: report the problem and carry on
         print(err)