def run(self):
    """Read keywords from the configured keywords file and upload one
    article per keyword."""
    for kw in KeywordExtractor.extract(config.KEYWORDS_FILE_PATH):
        self._upload_article(kw)
Example #2
0
def main():

    include_regex = re.compile(r'\bnfl\b', re.IGNORECASE)
    exclude_regex = re.compile(r'\bnba\b|\bnhl\b', re.IGNORECASE)

    extractor = KeywordExtractor()

    idx_db = sqlite3.connect('../idx.db')
    idx_db_cur = idx_db.cursor()

    nfl_db = sqlite3.connect('../nfl_tweets.db')
    nfl_db_cur = nfl_db.cursor()

    print "Getting already added ids"
    already_added_ids = set()
    nfl_db_cur.execute('SELECT idx_id FROM nfl_tweets')
    for row in nfl_db_cur:
        already_added_ids.add(row[0])

    print "Found {0} ids".format(len(already_added_ids))

    count = 0

    idx_db_cur.execute("SELECT rowid, utc_timestamp, content FROM tweets")
    for row in idx_db_cur:
        if row[0] not in already_added_ids:
            if include_regex.search(row[2]) is not None and exclude_regex.search(row[2]) is None:

                keywords = extractor.extract(row[2])

                if 'nfl' in keywords:
                    count += 1
                    nfl_db_cur.execute(
                        'INSERT INTO nfl_tweets(idx_id,created,keywords) VALUES(?,?,?)',
                        (row[0],row[1],keywords)
                    )

                    if count > 0 and count % 10000 == 0:
                        sys.stdout.write('.')
                        sys.stdout.flush()

    if count > 10000:
        print
        print 'added', count, 'records'
        print 'committing'
    else:
        print 'nothing new found'

    nfl_db.commit()
    print 'closing'

    nfl_db_cur.close()
    nfl_db.close()    

    idx_db_cur.close()
    idx_db.close()
Example #3
0
def main():
    extractor = KeywordExtractor()

    idx_db = sqlite3.connect('../idx.db')
    idx_db_cur = idx_db.cursor()

    kwd_db = sqlite3.connect('../kwd.db')
    kwd_db_cur = kwd_db.cursor()

    print "Getting already added ids"
    already_added_ids = set()
    kwd_db_cur.execute('SELECT idx_id FROM keywords')
    for row in kwd_db_cur:
        already_added_ids.add(row[0])

    print "Getting found {0} ids".format(len(already_added_ids))

    count = 0

    idx_db_cur.execute("SELECT rowid, utc_timestamp, content FROM tweets")
    for row in idx_db_cur:
        if row[0] not in already_added_ids:
            count += 1
            keywords = extractor.extract(row[2])

            kwd_db_cur.execute(
                'INSERT INTO keywords(idx_id,created,content) VALUES(?,?,?)',
                (row[0],row[1],keywords)
            )

            if count > 0 and count % 10000 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()

    if count > 10000:
        print
        print 'committing'
    else:
        print 'nothing new found'

    kwd_db.commit()
    print 'closing'

    kwd_db_cur.close()
    kwd_db.close()    

    idx_db_cur.close()
    idx_db.close()
class KeywordExtractorTests(unittest.TestCase):
    """Sanity checks for KeywordExtractor against the fixture CSV."""

    def setUp(self):
        self.extractor = KeywordExtractor()

    def tearDown(self):
        pass

    def test_extract_sanity(self):
        # extract() must return a plain list whose elements match the first
        # CSV column (header row excluded), in any order.
        extracted = self.extractor.extract(CSV_PATH)
        self.assertEqual(type(extracted), types.ListType)

        with open(CSV_PATH) as csv_file:
            rows = csv.reader(csv_file)
            rows.next()  # skip the header row
            expected = [record[0] for record in rows]

        self.assertItemsEqual(extracted, expected)
Example #5
0
from keyword_extractor import KeywordExtractor
import argparse

# Command-line interface: optional word2vec embeddings, required input file.
ap = argparse.ArgumentParser()
ap.add_argument(
    "--word2vec",
    default=None,
    help="path to word2vec pre-trained embeddings",
)
ap.add_argument(
    "--data",
    required=True,
    help="path to file from which keywords are to be extracted",
)
args = ap.parse_args()

with open(args.data, 'r') as data_file:
    lines = data_file.readlines()

extractor = KeywordExtractor(word2vec=args.word2vec)

# Print the extracted (keyword, score) entries, one per line, per input line.
for text in lines:
    for keyword in extractor.extract(text, ratio=0.2, split=True, scores=True):
        print(keyword)
Example #6
0
class IgdbFetcher:
    """Fetch game metadata from the IGDB API (via the Mashape gateway) and
    copy it into a game_data object: platforms, themes/genres, publishers,
    game modes, release date/quarter, and keywords mined from the game's
    summary and storyline text.
    """

    # Search endpoint: returns up to 20 candidate games for a name query.
    GET_GAME_INFO_TEMPLATE = \
        "https://igdbcom-internet-game-database-v1.p.mashape.com/games/" \
        "?fields=name,summary,storyline,publishers,themes,keywords," \
        "game_modes,genres,first_release_date,release_dates" \
        "&limit=20&offset=0&search=%s"
    # Generic id lookup: first %s is the endpoint name (e.g. 'genres'),
    # second %s is the numeric id; only the display name is requested.
    ENDPOINT_API_TEMPLATE = \
        "https://igdbcom-internet-game-database-v1.p.mashape.com/" \
        "%s/%s?fields=name"
    # Keywords too generic to be informative; dropped by __is_valid_keyword.
    STOP_WORDS = {'game', 'player', 'gameplay'}

    def __init__(self):
        self.keyword_extractor = KeywordExtractor()

        # Per-category caches (IGDB numeric id -> display name) so each id
        # is resolved via the API at most once per fetcher instance.
        self.publisher_id_to_name = {}
        self.platform_id_to_name = {}
        self.theme_id_to_name = {}
        self.genre_id_to_name = {}
        self.game_mode_id_to_name = {}
        self.game_keyword_id_to_name = {}

        # Bound helpers produced by the closure factory below: each reads one
        # attribute of a raw game_info dict, resolves its id(s) to names
        # through the matching cache, and feeds them to a game_data add_*
        # callback.
        self.fetch_publishers = self.__add_attr_to_game_data(
            'publishers', 'companies', self.publisher_id_to_name)
        self.fetch_platforms = self.__add_attr_to_game_data(
            'platform', 'platforms', self.platform_id_to_name)
        self.fetch_themes = self.__add_attr_to_game_data(
            'themes', 'themes', self.theme_id_to_name)
        self.fetch_genres = self.__add_attr_to_game_data(
            'genres', 'genres', self.genre_id_to_name)
        self.fetch_game_modes = self.__add_attr_to_game_data(
            'game_modes', 'game_modes', self.game_mode_id_to_name)

    def get_game_info(self, game_data):
        """Search IGDB for game_data.name and populate game_data.

        Returns True when an exact (case-insensitive) name match with all
        required fields was found and applied, False otherwise.
        """
        response = unirest.get(self.GET_GAME_INFO_TEMPLATE % game_data.name,
                               headers={
                                   "X-Mashape-Key": os.environ['IGDB_KEY'],
                                   "Accept": "application/json"
                               }
                               )
        # Pick the first search result whose name matches exactly
        # (case-insensitive, whitespace-trimmed).
        game_info = None
        game_name_lower = game_data.name.lower().strip()
        for response_game in response.body:
            if 'name' not in response_game:
                continue
            if game_name_lower == response_game['name'].lower().strip():
                game_info = response_game
                break

        # Reject matches that lack any field the downstream consumers need.
        # Themes/genres: at least one of the two; summary/storyline: ditto.
        if not game_info:
            return False
        if not self.__validate_field(game_info, 'release_dates'):
            return False
        if not self.__validate_field(game_info, 'publishers'):
            return False
        if not self.__validate_field(game_info, 'themes') and \
                not self.__validate_field(game_info, 'genres'):
            return False
        if not self.__validate_field(game_info, 'game_modes'):
            return False
        if 'first_release_date' not in game_info:
            return False
        if 'summary' not in game_info and 'storyline' not in game_info:
            return False

        # One platform per release-date entry.
        for release_date in game_info['release_dates']:
            self.fetch_platforms(release_date, game_data.add_platform)

        # Both themes and genres funnel into game_data.add_genre.
        if 'themes' in game_info:
            self.fetch_themes(game_info, game_data.add_genre)
        if 'genres' in game_info:
            self.fetch_genres(game_info, game_data.add_genre)

        self.fetch_publishers(game_info, game_data.add_publisher)
        self.fetch_game_modes(game_info, game_data.add_game_mode)

        # NOTE(review): assumes first_release_date is in milliseconds, hence
        # the /1000 before fromtimestamp — confirm against the IGDB API docs.
        # Under Python 2, `/ 1000` is integer division for int timestamps.
        release_date_timestamp = game_info['first_release_date']
        release_date = datetime.datetime.fromtimestamp(
            release_date_timestamp / 1000)
        game_data.release_date = release_date
        # Map day-of-year (1..366) onto a 0..3 release quarter.
        release_day_of_year = release_date.timetuple().tm_yday
        quarter = int(release_day_of_year / (367 / 4.0))
        game_data.release_quarter = quarter

        # Mine keywords from whichever descriptive texts are present.
        if 'summary' in game_info:
            summary = game_info['summary']
            summary_keywords = self.__extract_keywords(summary)
            game_data.add_keywords(summary_keywords)

        if 'storyline' in game_info:
            storyline = game_info['storyline']
            storyline_keywords = self.__extract_keywords(storyline)
            game_data.add_keywords(storyline_keywords)

        # Debug output of the raw API response.
        print "response body = " + str(response.body)
        return True

    def __validate_field(self, game_info, field_name):
        # Field must exist AND be non-empty.
        return field_name in game_info and len(game_info[field_name]) > 0

    def __is_valid_keyword(self, keyword):
        # Keep purely alphabetic keywords that are not generic stop words.
        return keyword not in self.STOP_WORDS and \
            re.match("^[A-Za-z]+$", keyword)

    def __extract_keywords(self, text):
        # KeywordExtractor.extract yields 3-tuples; only the first element
        # (the keyword string) is used here.
        keyword_tuples = self.keyword_extractor.extract(text)
        keywords = []
        for keyword, _, _ in keyword_tuples:
            if self.__is_valid_keyword(keyword):
                keywords.append(keyword)
        return keywords

    def __add_attr_to_game_data(self, attr_name, endpoint_name, attr_map):
        # Closure factory: returns f(game_info, add_func) that resolves the
        # id(s) stored under game_info[attr_name] to names (caching them in
        # attr_map) and passes each name to add_func.
        def f(game_info, add_func):
            if attr_name not in game_info:
                print "Attribute %s is empty, skipping." % attr_name
                return
            if type(game_info[attr_name]) == list:
                for attr_id in game_info[attr_name]:
                    if attr_id not in attr_map:
                        fetched_name = self.__fetch_endpoint(
                            endpoint_name, attr_id)
                        # Skip ids the API could not resolve.
                        if not fetched_name:
                            continue
                        attr_map[attr_id] = fetched_name
                    add_func(attr_map[attr_id])
            else:
                # Scalar id.
                # NOTE(review): unlike the list branch, a failed fetch here
                # caches None and passes it to add_func — verify whether
                # callers tolerate None.
                attr_id = game_info[attr_name]
                if attr_id not in attr_map:
                    fetched_name = self.__fetch_endpoint(
                        endpoint_name, attr_id)
                    attr_map[attr_id] = fetched_name
                add_func(attr_map[attr_id])
        return f

    def __fetch_endpoint(self, endpoint_name, id):
        """Resolve a numeric IGDB id to its display name, or None when the
        response is missing/malformed."""
        url = self.ENDPOINT_API_TEMPLATE % (endpoint_name, id)
        response = unirest.get(url, headers={
            "X-Mashape-Key": os.environ['IGDB_KEY']
        })
        if not type(response.body) == list or len(response.body) == 0 or \
                'name' not in response.body[0]:
            return None
        return response.body[0]['name']