def web_download_articles(self):
    '''
    Download the latest articles from this source's RSS feed and insert
    the ones whose guid is not already present in the `articles` table.

    This is a generator. While scanning the feed it yields one progress
    message per item that was not already stored:
    "Adding candidate article <title>" when the article produced values
    to insert, "Ignoring article <title>" when it did not. The final
    duplicate check, the INSERT and the commit only run once the
    generator has been consumed to the end.

    On completion the following attributes are set on the instance:
        rows_affected    -- cursor.rowcount of the INSERT, or 0 when
                            nothing was inserted
        duplicates_found -- number of candidates that were not inserted
                            (already stored, or repeated in the feed)
    '''
    cursor = self.db.cursor()
    try:
        rss_reader = pyrss.RSSReader(self.rss_url)
        # guid -> pre-rendered "(...)" VALUES tuple for the bulk INSERT
        candidates = dict()
        num_candidate_articles = 0

        for rss_item in rss_reader.GetItems():
            if not rss_item:
                continue

            # The guid comes straight from the feed, so it is bound as a
            # query parameter (DB-API "format" paramstyle) instead of being
            # concatenated into the SQL text.
            cursor.execute(
                "SELECT guid FROM articles WHERE guid = %s",
                (str(rss_item.guid),))
            if database.get_first_column(cursor.fetchall()):
                # already stored: nothing to download for this item
                continue

            item_article = article.Article(
                tag_to_find=self.article_tags,
                article_rss_item=rss_item,
                source=self)
            article_query_values = item_article.get_article_values()
            if article_query_values:
                candidates[item_article.guid] = "(" + article_query_values + ")"
                yield "Adding candidate article " + item_article.title
                num_candidate_articles += 1
            else:
                yield "Ignoring article " + item_article.title

        # Re-check all candidates in one query just before inserting, since
        # building the articles above may have taken a while. Skipped when
        # there are no candidates, so no pointless query is sent.
        rows_to_insert = []
        if candidates:
            guids = list(candidates.keys())
            placeholders = ", ".join(["%s"] * len(guids))
            cursor.execute(
                "SELECT guid FROM articles WHERE guid IN (" + placeholders + ")",
                guids)
            already_stored = set(database.get_first_column(cursor.fetchall()))
            rows_to_insert = [values for guid, values in candidates.items()
                              if guid not in already_stored]

        if rows_to_insert:
            # NOTE(review): the VALUES tuples are SQL text pre-rendered by
            # Article.get_article_values(); they cannot be parameterized
            # from here -- confirm that method escapes its fields.
            query = "INSERT INTO `articles`"\
                    " (`guid`, `url`, `source`, `title`, `author`, `originaltext`, `datepublished`, `dateretrieved`) "\
                    "VALUES " + ",".join(rows_to_insert)
            cursor.execute(query)
            self.rows_affected = cursor.rowcount
            self.db.commit()
        else:
            self.rows_affected = 0

        self.duplicates_found = num_candidate_articles - len(rows_to_insert)
    finally:
        # Runs on normal completion, on error, and when the generator is
        # closed early, so the cursor is never leaked.
        cursor.close()
def db_load_info(self):
    '''
    Load this source's details from the database onto the instance.

    Reads the row of `sources` whose id equals self.id and copies its
    columns into newspaper_name, website_name, website_url, rss_url and
    description. Then loads bad_attribute_values from
    `bad_tag_attributes` and article_tags from `target_tags` (ordered by
    `order`, passed through self.get_attrs_and_tags).

    This is best-effort: any exception (including the RuntimeError raised
    when the source row cannot be found) is printed and swallowed, so
    attributes assigned before the failure keep their new values and the
    rest are left untouched.
    '''
    try:
        # create a database cursor (emulated by MySQLdb)
        cursor = self.db.cursor()
        try:
            # self.id is bound as a query parameter rather than concatenated,
            # which also works when the id is not a string.
            cursor.execute("SELECT * FROM sources WHERE id = %s", (self.id,))
            source_info = cursor.fetchone()
            if source_info is None or cursor.rowcount != 1:
                raise RuntimeError("Could not find the source '" + str(self.id) + "' in the database")

            self.newspaper_name = source_info[1]
            self.website_name = source_info[2]
            self.website_url = source_info[3]
            self.rss_url = source_info[4]
            # Column 5 used to hold the single article tag per source; the
            # tags are now read from target_tags below.
            self.description = source_info[6]

            cursor.execute(
                "SELECT tag_attribute_value FROM bad_tag_attributes WHERE source_id = %s",
                (self.id,))
            self.bad_attribute_values = database.get_first_column(cursor.fetchall())

            cursor.execute(
                "SELECT article_tag FROM target_tags WHERE source_id = %s ORDER BY `order` ASC",
                (self.id,))
            self.article_tags = self.get_attrs_and_tags(
                database.get_first_column(cursor.fetchall()))
        finally:
            # close the cursor even when a query or lookup above failed
            cursor.close()
        self.db.commit()
    except Exception as err:
        # best-effort load: report the problem instead of propagating it
        print(err)