def run(self):
    """Upload one article for each keyword listed in the keywords file."""
    for kw in KeywordExtractor.extract(config.KEYWORDS_FILE_PATH):
        self._upload_article(kw)
def main(): include_regex = re.compile(r'\bnfl\b', re.IGNORECASE) exclude_regex = re.compile(r'\bnba\b|\bnhl\b', re.IGNORECASE) extractor = KeywordExtractor() idx_db = sqlite3.connect('../idx.db') idx_db_cur = idx_db.cursor() nfl_db = sqlite3.connect('../nfl_tweets.db') nfl_db_cur = nfl_db.cursor() print "Getting already added ids" already_added_ids = set() nfl_db_cur.execute('SELECT idx_id FROM nfl_tweets') for row in nfl_db_cur: already_added_ids.add(row[0]) print "Found {0} ids".format(len(already_added_ids)) count = 0 idx_db_cur.execute("SELECT rowid, utc_timestamp, content FROM tweets") for row in idx_db_cur: if row[0] not in already_added_ids: if include_regex.search(row[2]) is not None and exclude_regex.search(row[2]) is None: keywords = extractor.extract(row[2]) if 'nfl' in keywords: count += 1 nfl_db_cur.execute( 'INSERT INTO nfl_tweets(idx_id,created,keywords) VALUES(?,?,?)', (row[0],row[1],keywords) ) if count > 0 and count % 10000 == 0: sys.stdout.write('.') sys.stdout.flush() if count > 10000: print print 'added', count, 'records' print 'committing' else: print 'nothing new found' nfl_db.commit() print 'closing' nfl_db_cur.close() nfl_db.close() idx_db_cur.close() idx_db.close()
def main(): extractor = KeywordExtractor() idx_db = sqlite3.connect('../idx.db') idx_db_cur = idx_db.cursor() kwd_db = sqlite3.connect('../kwd.db') kwd_db_cur = kwd_db.cursor() print "Getting already added ids" already_added_ids = set() kwd_db_cur.execute('SELECT idx_id FROM keywords') for row in kwd_db_cur: already_added_ids.add(row[0]) print "Getting found {0} ids".format(len(already_added_ids)) count = 0 idx_db_cur.execute("SELECT rowid, utc_timestamp, content FROM tweets") for row in idx_db_cur: if row[0] not in already_added_ids: count += 1 keywords = extractor.extract(row[2]) kwd_db_cur.execute( 'INSERT INTO keywords(idx_id,created,content) VALUES(?,?,?)', (row[0],row[1],keywords) ) if count > 0 and count % 10000 == 0: sys.stdout.write('.') sys.stdout.flush() if count > 10000: print print 'committing' else: print 'nothing new found' kwd_db.commit() print 'closing' kwd_db_cur.close() kwd_db.close() idx_db_cur.close() idx_db.close()
class KeywordExtractorTests(unittest.TestCase):
    """Sanity checks for KeywordExtractor.extract against the sample CSV."""

    def setUp(self):
        self.extractor = KeywordExtractor()

    def tearDown(self):
        pass

    def test_extract_sanity(self):
        # The extractor must return a plain list...
        keyword_list = self.extractor.extract(CSV_PATH)
        self.assertEqual(type(keyword_list), types.ListType)
        # ...containing exactly the first column of every data row
        # (order-insensitive comparison).
        with open(CSV_PATH) as f:
            reader = csv.reader(f)
            reader.next()  # skip the header row
            expected = [row[0] for row in reader]
        self.assertItemsEqual(keyword_list, expected)
from keyword_extractor import KeywordExtractor
import argparse

# Command-line interface: input file is mandatory, embeddings are optional.
parser = argparse.ArgumentParser()
parser.add_argument("--word2vec", default=None,
                    help="path to word2vec pre-trained embeddings")
parser.add_argument("--data", required=True,
                    help="path to file from which keywords are to be extracted")
args = parser.parse_args()

with open(args.data, 'r') as data_file:
    lines = data_file.readlines()

# One extraction pass per input line; print each scored keyword on its own line.
extractor = KeywordExtractor(word2vec=args.word2vec)
for text in lines:
    for keyword in extractor.extract(text, ratio=0.2, split=True, scores=True):
        print(keyword)
class IgdbFetcher: GET_GAME_INFO_TEMPLATE = \ "https://igdbcom-internet-game-database-v1.p.mashape.com/games/" \ "?fields=name,summary,storyline,publishers,themes,keywords," \ "game_modes,genres,first_release_date,release_dates" \ "&limit=20&offset=0&search=%s" ENDPOINT_API_TEMPLATE = \ "https://igdbcom-internet-game-database-v1.p.mashape.com/" \ "%s/%s?fields=name" STOP_WORDS = {'game', 'player', 'gameplay'} def __init__(self): self.keyword_extractor = KeywordExtractor() self.publisher_id_to_name = {} self.platform_id_to_name = {} self.theme_id_to_name = {} self.genre_id_to_name = {} self.game_mode_id_to_name = {} self.game_keyword_id_to_name = {} self.fetch_publishers = self.__add_attr_to_game_data( 'publishers', 'companies', self.publisher_id_to_name) self.fetch_platforms = self.__add_attr_to_game_data( 'platform', 'platforms', self.platform_id_to_name) self.fetch_themes = self.__add_attr_to_game_data( 'themes', 'themes', self.theme_id_to_name) self.fetch_genres = self.__add_attr_to_game_data( 'genres', 'genres', self.genre_id_to_name) self.fetch_game_modes = self.__add_attr_to_game_data( 'game_modes', 'game_modes', self.game_mode_id_to_name) def get_game_info(self, game_data): response = unirest.get(self.GET_GAME_INFO_TEMPLATE % game_data.name, headers={ "X-Mashape-Key": os.environ['IGDB_KEY'], "Accept": "application/json" } ) game_info = None game_name_lower = game_data.name.lower().strip() for response_game in response.body: if 'name' not in response_game: continue if game_name_lower == response_game['name'].lower().strip(): game_info = response_game break if not game_info: return False if not self.__validate_field(game_info, 'release_dates'): return False if not self.__validate_field(game_info, 'publishers'): return False if not self.__validate_field(game_info, 'themes') and \ not self.__validate_field(game_info, 'genres'): return False if not self.__validate_field(game_info, 'game_modes'): return False if 'first_release_date' not in game_info: return False 
if 'summary' not in game_info and 'storyline' not in game_info: return False for release_date in game_info['release_dates']: self.fetch_platforms(release_date, game_data.add_platform) if 'themes' in game_info: self.fetch_themes(game_info, game_data.add_genre) if 'genres' in game_info: self.fetch_genres(game_info, game_data.add_genre) self.fetch_publishers(game_info, game_data.add_publisher) self.fetch_game_modes(game_info, game_data.add_game_mode) release_date_timestamp = game_info['first_release_date'] release_date = datetime.datetime.fromtimestamp( release_date_timestamp / 1000) game_data.release_date = release_date release_day_of_year = release_date.timetuple().tm_yday quarter = int(release_day_of_year / (367 / 4.0)) game_data.release_quarter = quarter if 'summary' in game_info: summary = game_info['summary'] summary_keywords = self.__extract_keywords(summary) game_data.add_keywords(summary_keywords) if 'storyline' in game_info: storyline = game_info['storyline'] storyline_keywords = self.__extract_keywords(storyline) game_data.add_keywords(storyline_keywords) print "response body = " + str(response.body) return True def __validate_field(self, game_info, field_name): return field_name in game_info and len(game_info[field_name]) > 0 def __is_valid_keyword(self, keyword): return keyword not in self.STOP_WORDS and \ re.match("^[A-Za-z]+$", keyword) def __extract_keywords(self, text): keyword_tuples = self.keyword_extractor.extract(text) keywords = [] for keyword, _, _ in keyword_tuples: if self.__is_valid_keyword(keyword): keywords.append(keyword) return keywords def __add_attr_to_game_data(self, attr_name, endpoint_name, attr_map): def f(game_info, add_func): if attr_name not in game_info: print "Attribute %s is empty, skipping." 
% attr_name return if type(game_info[attr_name]) == list: for attr_id in game_info[attr_name]: if attr_id not in attr_map: fetched_name = self.__fetch_endpoint( endpoint_name, attr_id) if not fetched_name: continue attr_map[attr_id] = fetched_name add_func(attr_map[attr_id]) else: attr_id = game_info[attr_name] if attr_id not in attr_map: fetched_name = self.__fetch_endpoint( endpoint_name, attr_id) attr_map[attr_id] = fetched_name add_func(attr_map[attr_id]) return f def __fetch_endpoint(self, endpoint_name, id): url = self.ENDPOINT_API_TEMPLATE % (endpoint_name, id) response = unirest.get(url, headers={ "X-Mashape-Key": os.environ['IGDB_KEY'] }) if not type(response.body) == list or len(response.body) == 0 or \ 'name' not in response.body[0]: return None return response.body[0]['name']