Example #1
import collections
import os

from tqdm import tqdm
# KeywordExtractor is assumed to be importable from the project's own keyword
# extractor module; it must provide a candi_extract(utterance) method.


def _obtain_candidate_keywords(list_all_dialogs, candi_kw_path, min_kw_freq=1, load_file_if_exists=True):
    r"""Obtain and save the candidate keywords used for extracting keywords.

    Inputs: list_all_dialogs, candi_kw_path, min_kw_freq, load_file_if_exists
        - **list_all_dialogs**: a 'list' of dialogs, each dialog being a 'list' of utterance strings
        - **candi_kw_path**: path of the file that the candidate keywords are saved to / loaded from
        - **min_kw_freq**: an 'int'; keywords occurring fewer than this many times are discarded
        - **load_file_if_exists**: a 'bool' indicating whether to load the candidate keywords file if it exists

    Outputs: candi_keywords
        - **candi_keywords**: a 'list' containing all the candidate keywords
    """
    if load_file_if_exists:
        if os.path.isfile(candi_kw_path):
            with open(candi_kw_path,'r') as f:
                candi_keywords = [kw.strip() for kw in f.readlines()]
            print('Loading candidate keywords from {}'.format(candi_kw_path))
            print('Total candidate keywords count: ', len(candi_keywords))
            return candi_keywords

    if not list_all_dialogs:
        raise Exception('no dialogs provided for obtaining candidate keywords')

    candi_kw_dir = os.path.dirname(candi_kw_path)
    if not os.path.exists(candi_kw_dir):
        os.makedirs(candi_kw_dir)

    print('Obtaining candidate keywords...')

    # initialization
    candi_keywords = []
    kw_counter = collections.Counter()
    kw_extractor = KeywordExtractor()

    # extract possible keywords
    for dialog in tqdm(list_all_dialogs):
        for utterance in dialog:
            cur_keywords = kw_extractor.candi_extract(utterance)
            kw_counter.update(cur_keywords)
            candi_keywords.extend(cur_keywords)

    # delete the keywords occurring fewer than 'min_kw_freq' times
    rare_keywords = [kw for kw, freq in kw_counter.most_common() if freq < min_kw_freq]
    candi_keywords = [kw for kw, freq in kw_counter.most_common() if freq >= min_kw_freq]
    # delete keywords consisting of a single character
    single_letter_keywords = [kw for kw in candi_keywords if len(kw) < 2]
    candi_keywords = [kw for kw in candi_keywords if len(kw) >= 2]

    # print information about the candidate keywords
    print('rare keywords count: ', len(rare_keywords))
    print('single letter keywords count: ', len(single_letter_keywords))
    print('total candidate keywords count (before cleaning): ', len(kw_counter))
    print('total candidate keywords count (after cleaning):  ', len(candi_keywords))

    print('Saving candi_keywords into {}...'.format(candi_kw_path))
    with open(candi_kw_path,'w') as f:
        for keyword in candi_keywords:
            f.write(keyword + '\n')

    return candi_keywords
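
A minimal usage sketch for the function above, assuming KeywordExtractor (imported from the project's own module) provides a candi_extract(utterance) method that returns a list of keyword strings; the dialog data and path below are made up for illustration:

# Hypothetical input: each dialog is a list of utterance strings.
toy_dialogs = [
    ["i love playing football on weekends", "nice, which team do you support?"],
    ["my dog hates the rain", "mine too, he refuses to go outside"],
]
candi_keywords = _obtain_candidate_keywords(
    toy_dialogs,
    candi_kw_path='data/demo/candi_keywords.txt',  # illustrative path
    min_kw_freq=1,
    load_file_if_exists=False,
)
print('kept keywords:', candi_keywords)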
def create_tf_idf(file_path):
    reader = TrainingTextReader(file_path)
    keywords = KeywordExtractor(reader.articles[10], 'useless.txt')
    vector_index = Vectorizer(keywords.article_sents_tokened)
    freq_mat = vector_index.frequencyMatrix
    normalized_vector = VectorNormalizer(freq_mat)
    norm_mat = normalized_vector.l2_norm_matrice
    tf_idf = InverseDocumentFrequency(norm_mat)
    return tf_idf.tf_idf_matrice
    def _obtain_candidate_keywords(self, load_file_if_exists=True):
        r"""Obtains and saves the candidate keywords used for extracting keywords.

        Args:
            load_file_if_exists: A 'bool' indicating whether load candi_keywords file if it exists.

        Returns:
            candi_keywords: A 'list' containing all the candidate keywords.
        """
        if load_file_if_exists:
            candi_keywords_name = '../data/{}/candi_keywords.txt'.format(self.output_data_dir)
            if os.path.isfile(candi_keywords_name):
                with open(candi_keywords_name,'r') as f:
                    candi_keywords = [kw.strip() for kw in f.readlines()]
                print('Loading candidate keywords from {}'.format(candi_keywords_name))
                print('Total candidate keywords count: ', len(candi_keywords))
                return candi_keywords

        print('Obtaining candidate keywords...')

        # Initialization
        candi_keywords = []
        kw_counter = collections.Counter()
        kw_extractor = KeywordExtractor()

        # Extracts possible keywords.
        for dialog in tqdm(self.list_all_dialogs):
            for utterance in dialog:
                cur_keywords = kw_extractor.candi_extract(utterance)
                kw_counter.update(cur_keywords)
                candi_keywords.extend(cur_keywords)

        # Deletes the keywords occurring fewer than 'self.min_kw_freq' times
        rare_keywords = [kw for kw, freq in kw_counter.most_common()
            if freq < self.min_kw_freq]
        candi_keywords = [kw for kw, freq in kw_counter.most_common()
            if freq >= self.min_kw_freq]
        # Deletes keywords consisting of a single character
        single_letter_keywords = [kw for kw in candi_keywords if len(kw) < 2]
        candi_keywords = [kw for kw in candi_keywords if len(kw) >= 2]

        # Writes candidate keywords into file
        candidate_keywords_output_path = '../data/{}/candi_keywords.txt'.format(
            self.output_data_dir)
        with open(candidate_keywords_output_path,'w') as f:
            for keyword in candi_keywords:
                f.write(keyword + '\n')

        return candi_keywords
Example #4
    def handle(self, *args, **options):
        # Re-extracts keywords for every question asked in July 2016.
        questions = Question.objects.all()
        ke = KeywordExtractor()
        for question in questions:
            if question.date.year != 2016 or question.date.month != 7:
                continue
            question.keywords = []
            keywords = ke.get_keywords(question.question)
            print(",".join(keywords))
            for keyword in keywords:
                m, created = Keyword.objects.get_or_create(keyword=keyword)
                m.keyword = keyword
                question.keywords.add(m)
                m.save()
            question.save()
Example #5
    def __init__(self):
        self.keyword_extractor = KeywordExtractor()

        self.publisher_id_to_name = {}
        self.platform_id_to_name = {}
        self.theme_id_to_name = {}
        self.genre_id_to_name = {}
        self.game_mode_id_to_name = {}
        self.game_keyword_id_to_name = {}

        self.fetch_publishers = self.__add_attr_to_game_data(
            'publishers', 'companies', self.publisher_id_to_name)
        self.fetch_platforms = self.__add_attr_to_game_data(
            'platform', 'platforms', self.platform_id_to_name)
        self.fetch_themes = self.__add_attr_to_game_data(
            'themes', 'themes', self.theme_id_to_name)
        self.fetch_genres = self.__add_attr_to_game_data(
            'genres', 'genres', self.genre_id_to_name)
        self.fetch_game_modes = self.__add_attr_to_game_data(
            'game_modes', 'game_modes', self.game_mode_id_to_name)
    def __init__(self, dataset_name, output_data_dir,
                 separator, min_kw_freq,
                 context_turns, set_names):
        self.dataset_name = dataset_name
        self.output_data_dir = output_data_dir
        self.separator = separator
        self.min_kw_freq = min_kw_freq
        self.context_turns = context_turns
        self.set_names = set_names

        self._make_data_dir_if_not_exists()
        self._load_raw_dialog_data()

        # Initializes keyword extractor
        candi_keywords = self._obtain_candidate_keywords()
        idf_dict = self._calculate_idf()
        self.kw_extractor = KeywordExtractor(candi_keywords, idf_dict)

        self._obtain_and_save_uttr_kw_mapping()  # uttr_kw_mapping: (utterances -> keywords) mapping
        self._obtain_and_save_vocab()
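
For context, a hedged sketch of how a constructor like the one above might be invoked; the class name DialogDataProcessor and all argument values are assumptions, since the snippet only shows the method body:

# Hypothetical instantiation; class name and argument values are illustrative only.
processor = DialogDataProcessor(
    dataset_name='dailydialog',
    output_data_dir='dailydialog_processed',
    separator='__eou__',
    min_kw_freq=1,
    context_turns=2,
    set_names=['train', 'valid', 'test'],
)
# After __init__ finishes, processor.kw_extractor can be used for keyword extraction.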
Example #7
    parser.add_argument('--candi_kw_path', type=str, help='path of the candidate keywords file')
    parser.add_argument('--input_text_path', type=str, help='path of the dialog text file to extract keywords from')
    parser.add_argument('--kw_output_path', type=str, help='path of the output file for the extracted keywords')
    args = parser.parse_args()

    output_info = 'Start keyword extraction [dataset: {}, file: {}]'.format(
        args.dataset_name, args.input_text_path)
    print('-' * len(output_info))
    print(output_info)
    print('-' * len(output_info))

    # initialize keyword extractor
    try:
        candi_keywords = _obtain_candidate_keywords(None, args.candi_kw_path)
        idf_dict = _calculate_idf(None, args.idf_path)
        kw_extractor = KeywordExtractor(candi_keywords, idf_dict)
    except Exception as err:
        print('Exception: ', err)
        # load all dialogs of the specified dataset
        dataset = load_dataset(args.dataset_name, args.dataset_dir)
        candi_keywords = _obtain_candidate_keywords(dataset, args.candi_kw_path)
        idf_dict = _calculate_idf(dataset, args.idf_path)
        kw_extractor = KeywordExtractor(candi_keywords, idf_dict)


    # load the texts to extract keywords from
    texts = load_texts(args.input_text_path)
    # extract keywords
    extract_keywords(texts, kw_extractor, args.kw_output_path)
    print('Done.')
Example #8
from keyword_extractor import KeywordExtractor
import argparse

ap = argparse.ArgumentParser()
ap.add_argument("--word2vec",
                default=None,
                help="path to word2vec pre-trained embeddings")
ap.add_argument("--data",
                required=True,
                help="path to file from which keywords are to be extracted")

args = ap.parse_args()

with open(args.data, 'r') as data_file:
    lines = data_file.readlines()

extractor = KeywordExtractor(word2vec=args.word2vec)

for text in lines:
    keywords = extractor.extract(text, ratio=0.2, split=True, scores=True)
    for keyword in keywords:
        print(keyword)
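
Assuming the script above is saved as, say, extract.py (the file name is not given in the snippet), it could be invoked roughly like this; the data and embedding file names are placeholders:

# Hypothetical command lines; --word2vec is optional per the argument definition.
#   python extract.py --data sentences.txt --word2vec embeddings.bin
#   python extract.py --data sentences.txt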
from bs4 import BeautifulSoup
import pickle  # needed by load_articles below
import urllib3
import random

# Custom Libs
from article_lister import ArticleLister
from keyword_extractor import KeywordExtractor
from news_db_storer import NewsDBStorer

cache_dir = "./pkl_cache/"

dbstore = NewsDBStorer(db_name="newsarticlesdb",
                       table_name="politician_based_newsarticlestable")
dbstore.set_up_connection()

keyword_xtractor = KeywordExtractor()


class GenericNewsScraper:
    def __init__(self, paper_name="cnn", base_url="https://www.cnn.com/"):
        self.articles = []
        self.base_url = base_url
        self.paper_name = paper_name
        self.art_obj = set()

    # Loads article cache
    def load_articles(self):
        f = open(cache_dir + self.paper_name + ".pkl", 'rb')
        cache_obj = pickle.load(f)
        f.close()
        return cache_obj
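
A small hedged usage sketch for the scraper class above; it assumes a pickle file for the given paper already exists under ./pkl_cache/ (only the cache-loading side is shown in this snippet):

# Hypothetical usage; assumes ./pkl_cache/cnn.pkl was written by an earlier run.
scraper = GenericNewsScraper(paper_name="cnn", base_url="https://www.cnn.com/")
cached_articles = scraper.load_articles()
print("Loaded cached articles for", scraper.paper_name)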