Example #1
from gensim.summarization import keywords
from nltk.corpus import stopwords


def extract_keywords(text, lemmatizer):
    """
    Extract keywords using the TextRank algorithm, lemmatize them, and
    remove lemmatized stopwords.

    :param text: string
    :param lemmatizer: nltk WordNetLemmatizer
    :return: dictionary {keyword: score, ...}
    """

    lemmatized_text = " ".join(lemmatize_all_pos_text(text, lemmatizer))

    # use gensim.summarization.keywords to extract keywords
    keywords_scores_pair = keywords(lemmatized_text, lemmatize=True, scores=True)

    # lemmatize the keywords
    lemmatized_keywords_mapping = lemmatize_all_pos_pair(keywords_scores_pair, lemmatizer)
    lemmatized_keywords = lemmatized_keywords_mapping.keys()

    # lemmatize stopwords
    stop_words = set(lemmatize_all_pos_text(stopwords.words('english'), lemmatizer, tokenize=False))

    # remove stopwords from keywords
    keywords_without_stopwords = [keyword for keyword in lemmatized_keywords if keyword not in stop_words]

    # keep the score entries only for the surviving keywords
    keywords_mapping = {keyword: lemmatized_keywords_mapping[keyword]
                        for keyword in keywords_without_stopwords}

    return keywords_mapping
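
A quick usage sketch for Example #1. The lemmatize_all_pos_* helpers are project-specific and assumed importable, and the NLTK wordnet and stopwords corpora must already be downloaded:

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
text = ("Deep learning models learn layered representations of data. "
        "Layered representations transfer well across learning tasks.")
# print keywords from highest to lowest TextRank score
for keyword, score in sorted(extract_keywords(text, lemmatizer).items(),
                             key=lambda kv: -kv[1]):
    print(keyword, score)
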
Example #2
import numpy as np
from gensim.summarization import keywords


def extract_top_n_percent_keywords(lemmatized_text, lemmatizer, top_n_percent):
    """
    Extract keywords using the TextRank algorithm, lemmatize them, and
    return the top n percent of keywords by score.

    :param lemmatized_text: string of already-lemmatized text
    :param lemmatizer: nltk WordNetLemmatizer
    :param top_n_percent: int (0 <= top_n_percent <= 100)
    :return: list of strings (top n percent keywords)
    """

    # use gensim.summarization.keywords to extract keywords;
    # with scores=True it returns a list of (keyword, score) pairs
    keyword_score_pair = keywords(lemmatized_text, lemmatize=True, scores=True)
    keywords_only = [x[0] for x in keyword_score_pair]

    # lemmatize the keywords
    lemmatized_keywords = lemmatize_all_pos(keywords_only, lemmatizer, tokenize=False)

    # pair the lemmatized keywords with their scores
    scores = [x[1] for x in keyword_score_pair]
    lemmatized_pairs = list(zip(lemmatized_keywords, scores))

    # select top keywords
    # the (100 - top_n_percent)th percentile is the cutoff for the top n percent,
    # e.g. the 90th percentile keeps the top 10% of scores
    top_percentile = np.percentile(scores, 100 - int(top_n_percent))
    # pair[0] is the keyword and pair[1] is the corresponding score
    top_keywords = [pair[0] for pair in lemmatized_pairs if pair[1] >= top_percentile]

    return top_keywords
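
The percentile arithmetic above is easy to sanity-check in isolation: with five scores and top_n_percent=40, the cutoff is the 60th percentile, so the two highest scores survive.

import numpy as np

scores = [0.1, 0.2, 0.3, 0.4, 0.5]
cutoff = np.percentile(scores, 100 - 40)   # 60th percentile = 0.34
print([s for s in scores if s >= cutoff])  # [0.4, 0.5], i.e. the top 40%
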
Example #3
from gensim.summarization import keywords


def extract_keywords(text):
    """
    Returns the keywords from a given body of text. Requires more than one
    sentence as input.

    Parameters
    text: A body of text with multiple sentences
    """
    return keywords(text, words=5)
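
A minimal usage sketch for Example #3; with the default split=False, gensim returns the keywords as one newline-separated string:

text = ("Graphs model pairwise relations between objects. "
        "A graph is made up of vertices connected by edges. "
        "Graph algorithms traverse these vertices and edges.")
print(extract_keywords(text))  # e.g. "graph\nvertices\nedges"
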
Example #4
from gensim.summarization import keywords


def set_textrank_keywords(self, n=5, ratio=1.0):
    try:
        # extract TextRank keywords; deacc=False keeps accented characters
        kws = keywords(self.spaced, split=True, ratio=ratio, deacc=False)
        # collapse multi-word keywords into single tokens
        kws = [kw.replace(" ", "") for kw in kws]
        n = min(len(kws), n)
        self.textrank_keywords = kws[:n]
    except (IndexError, ZeroDivisionError) as e:
        # gensim can raise on very short or degenerate input
        print(e)
        print("setting textrank_keywords to empty")
        self.textrank_keywords = []
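
Example #4 is a method, and its host class is not shown. A minimal, hypothetical stand-in only needs the spaced attribute the method reads:

class Document:
    def __init__(self, spaced):
        self.spaced = spaced

Document.set_textrank_keywords = set_textrank_keywords  # attach the method above

doc = Document("Language models assign probabilities to text. "
               "These probabilities let models rank candidate sentences.")
doc.set_textrank_keywords(n=3)
print(doc.textrank_keywords)
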
Example #5
from gensim.summarization import keywords


def get_keywords(text):
    # ask for three keywords, backing off to fewer if extraction fails
    words = 3
    raw_keywords = []
    while True:
        try:
            if words <= 0:
                # even a single keyword could not be extracted
                return []
            raw_keywords = keywords(text, scores=True, words=words)
        except Exception:
            # gensim raises on short or degenerate input; retry with fewer words
            words -= 1
            continue
        return raw_keywords
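
A short usage sketch for Example #5; thanks to the back-off loop, short inputs degrade to fewer keywords instead of raising:

text = ("Cats chase mice around the garden. "
        "Mice avoid cats whenever possible. "
        "Dogs sometimes chase cats as well.")
print(get_keywords(text))  # e.g. [('cats', 0.4), ('mice', 0.3), ('chase', 0.2)]
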
Example #6
from typing import List, Tuple, Union

from gensim.summarization import keywords


def predict(self,
            X: str,
            y=None) -> Union[List[Tuple[str, float]], List[str], str]:
    # delegate to gensim's TextRank keyword extractor, configured by the
    # instance attributes; the return type depends on split and scores
    return keywords(
        X,
        ratio=self.ratio,
        words=self.n_keywords,
        split=self.split,
        scores=self.scores,
        pos_filter=self.pos_filter,
        lemmatize=False,
        deacc=self.deacc,
    )
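
As with Example #4, the host class is not shown; a hypothetical minimal class only needs attributes mirroring gensim's keyword parameters:

class TextRankExtractor:
    def __init__(self, ratio=0.2, n_keywords=5, split=True,
                 scores=False, pos_filter=('NN', 'JJ'), deacc=True):
        self.ratio = ratio
        self.n_keywords = n_keywords
        self.split = split
        self.scores = scores
        self.pos_filter = pos_filter
        self.deacc = deacc

TextRankExtractor.predict = predict  # attach the method above

extractor = TextRankExtractor()
print(extractor.predict("TextRank ranks words in a graph. "
                        "The graph connects words that co-occur. "
                        "Highly connected words become keywords."))
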
Example #7
# _clean_text_by_sentences and _format_results are assumed to come from
# gensim.summarization.summarizer; _tokenize_sentence is a project-specific
# helper that returns (sentence, word_count, tagged_words) tuples.
from gensim.summarization import keywords
from gensim.summarization.summarizer import _clean_text_by_sentences, _format_results


def get_title(text):
    sentences = _clean_text_by_sentences(text)

    if len(sentences) == 0:
        return ""

    if len(sentences) == 1:
        return _format_results(sentences, False)

    sentence_tokenized = _tokenize_sentence(_format_results(sentences, True))
    # drop sentences longer than 10 words; shorter sentences make better titles
    sentence_tokenized = [s for s in sentence_tokenized if s[1] <= 10]

    if len(sentence_tokenized) == 0:
        return ""

    if len(sentence_tokenized) == 1:
        return sentence_tokenized[0][0]

    # extract the top TextRank keywords from the original sentences
    keyword_list = keywords(_format_results(sentences, False),
                            words=4,
                            split=True)

    # score each candidate sentence by the fraction of its words that are
    # TextRank keywords, and return the best-scoring sentence as the title
    best_index, best_score = 0, 0
    for index, sen_tuple in enumerate(sentence_tokenized):
        count = 0
        for word_pos in sen_tuple[2]:
            if word_pos[0] in keyword_list:
                count += 1
        score = count / sen_tuple[1]
        if score > best_score:
            best_index, best_score = index, score

    return sentence_tokenized[best_index][0]
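
A usage sketch for Example #7, which returns the short sentence densest in TextRank keywords as the title; it assumes the project-specific _tokenize_sentence helper is defined:

article = ("TextRank builds a graph of words. "
           "Edges connect words that co-occur within a window. "
           "PageRank then scores every node in the graph. "
           "The highest scoring words become the extracted keywords.")
print(get_title(article))
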