Example #1
def pre_keywords(self, _):
    """
    Setter of pre_keywords. The argument is ignored; the keywords are
    recomputed from the instance's cleaned documents.
    """
    # Tokenize every cleaned document into a list of words.
    cleaned = [
        list(textcleaner.tokenize_by_word(x))
        for x in self.clean_it(return_it=True)
    ]
    # Minimum document frequency used for n-gram detection.
    min_count = max(1, int(len(cleaned) * self.min_df))
    ngrams = self.make_ngrams(cleaned, min_count, self.nlp)
    # Extract (keyword, score) pairs from each n-gram document, then deduplicate.
    kw = list(
        set((x[0].strip().replace('_', ' '), x[1]) for y in ngrams
            for x in keywords(' '.join(y), **self.opt)))
    self.__pre_keywords = [x for x in kw if len(x[0]) > 2]
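
A minimal standalone sketch of the tokenize-then-score pipeline this setter runs, assuming gensim 3.x; the toy document and the scores=True option are illustrative, not taken from the original project:

from gensim.summarization import keywords
from gensim.summarization import textcleaner

doc = ("Natural language processing studies how computers analyze human "
       "language. Modern language models learn statistical patterns of "
       "language from large text corpora, and these language models drive "
       "most natural language processing applications today.")

tokens = list(textcleaner.tokenize_by_word(doc))   # lowercased word tokens
scored = keywords(' '.join(tokens), scores=True)   # list of (keyword, score) pairs
pre_keywords = [(kw.replace('_', ' '), score) for kw, score in scored if len(kw) > 2]
print(pre_keywords)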
Example #2
from typing import List

from gensim.summarization.textcleaner import tokenize_by_word


def highlight_keywords(sentence: str, keywords_list: List[str]) -> str:
    """
    Generates a sentence block with keywords highlighted

    Parameters
    ----------
    sentence: str
        Input string to highlight
    keywords_list: List[str]
        List containing keywords to highlight

    Returns
    -------
    result: str
        Output html string
    """

    words = sentence.split(" ")
    for i in range(len(words)):
        if any(item in keywords_list for item in tokenize_by_word(words[i])):
            words[i] = f'<span class="highlight">{words[i]}</span>'

    output_str: str = '<p class="sentence">' + " ".join(words) + "</p>"
    return output_str
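
A hypothetical call (the sentence and keyword list below are made up for illustration):

html = highlight_keywords("gensim makes keyword extraction easy",
                          ["gensim", "keyword"])
print(html)  # 'gensim' and 'keyword' are wrapped in <span class="highlight"> tags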
Example #3
                for index in range(0, len(sentences), sentences_per_chunk)
            ]
        else:
            raise ValueError(
                'pieces strategy can only be one of {} but is [{}]'.format(
                    pieces_strategies, pieces_strategy))

        lower_pieces = [piece.lower() for piece in pieces]
        logger.info('context size: {} pieces: {}'.format(
            context_limit_, len(pieces)))
        if mode == modes[0]:
            # TF-IDF mode: vectorize the lowercased pieces directly.
            vectorizer = TfidfVectorizer().fit(lower_pieces)
            pieces_ = vectorizer.transform(lower_pieces)
        elif mode == modes[1]:
            # LSI mode: tokenize, drop stop words and hapaxes, then fit a topic model.
            texts = [[
                word for word in tokenize_by_word(document.lower())
                if word not in ENGLISH_STOP_WORDS
            ] for document in pieces]
            # remove words that appear only once
            frequency = Counter([token for text in texts for token in text])
            texts = [[token for token in text if frequency[token] > 1]
                     for text in texts]
            dictionary = corpora.Dictionary(texts)
            logger.info('dictionary size: {}'.format(len(dictionary)))
            corpus_ = [dictionary.doc2bow(text) for text in texts]
            lsi = models.LsiModel(corpus_,
                                  id2word=dictionary,
                                  num_topics=lsi_topic_count)
            lsi.show_topics(num_topics=lsi_topic_count,
                            num_words=100,
                            log=True)
Example #4
import csv
import os

import gensim
from gensim.models.phrases import Phrases
from gensim.summarization.textcleaner import split_sentences, tokenize_by_word

test_acc = False
force_retrain = False  # set True to retrain even if a saved model exists

# currently available: nyt, washpo
source_name = 'nyt'
source_path = 'source_embeddings/' + source_name

data_source = 'nexis'

if not os.path.isfile(source_path) or force_retrain:
    with open('../data/%s.csv' % source_name) as f:
        reader = csv.reader(f)
        articles = [r[1] for r in reader]
    sentences = []
    for article in articles:
        art = split_sentences(article)
        sentences += [list(tokenize_by_word(sen)) for sen in art]
    bigram_transformer = Phrases(sentences)
    sentences = bigram_transformer[sentences]
    model = gensim.models.Word2Vec(sentences,
                                   size=100,
                                   window=10,
                                   min_count=2,
                                   workers=10)
    model.train(sentences, total_examples=len(sentences), epochs=50)
    model.save(source_path)
else:
    model = gensim.models.Word2Vec.load(source_path)

if test_acc:
    model.accuracy('questions-words.txt')
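
A possible follow-up query against the trained embeddings, using the gensim 3.x KeyedVectors API; the probe word is illustrative and must be in the model's vocabulary:

for word, score in model.wv.most_similar('economy', topn=5):
    print(word, score)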
Example #5
from gensim.summarization import textcleaner


def cut_word(texts):
    """Tokenize each input text into a list of word tokens."""
    output = []
    for text in texts:
        output.append(list(textcleaner.tokenize_by_word(text)))
    return output
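
A hypothetical call (the sample texts are made up):

texts = ["Gensim tokenizes text by word.",
         "Each document becomes a list of tokens."]
print(cut_word(texts))  # one list of word tokens per input text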