Ejemplos de Rake.extract_keywords_from_sentences en Python, ejemplos de rake_nltk.Rake.extract_keywords_from_sentences en Python

Ejemplo n.º 1

0

Mostrar archivo

    def process_text(self):
        """Extract RAKE keyphrases from ``self.text`` and post-process them.

        Reads ``self.text`` and ``self.text_type`` and stores every
        intermediate result on the instance: ``all_phrases``,
        ``article_text_tokenized``, ``all_phrases_tokenized``,
        ``all_phrases_tagged``, ``all_phrases_tagged_nouns`` and
        ``string_phrases_nouns``. Returns nothing.
        """
        # Remove new lines and turn to lower case
        text = re.sub('\n', ' ', self.text).lower()

        # Extract keyphrases using Rake
        rake = Rake()
        if self.text_type == 'article':
            rake.extract_keywords_from_text(text)
        elif self.text_type == 'social':
            # extract_keywords_from_sentences expects a list of sentence
            # strings. Passing the bare string made Rake iterate it character
            # by character, so the post is wrapped as a single sentence.
            rake.extract_keywords_from_sentences([text])
        # NOTE(review): any other text_type performs no extraction before the
        # ranked phrases are requested below - confirm callers never do that.
        self.all_phrases = rake.get_ranked_phrases_with_scores()

        # Tokenize text
        self.article_text_tokenized = word_tokenize(text)

        # Tokenize phrases
        self.all_phrases_tokenized = self.tokenize_phrases()

        # Tag all phrases and remove all but noun words
        self.all_phrases_tagged = pos_tag_phrase_pairs(self.all_phrases)
        self.all_phrases_tagged_nouns = filter_pos(self.all_phrases_tagged,
                                                   "nouns")

        # Convert list of tagged nouns back to a string phrase
        self.string_phrases_nouns = self.tuple_list_to_string_list()

Ejemplo n.º 2

0

Mostrar archivo

def keywords_by_rake(
        texts_by_chapters: Sequence[Tuple[str, str]],
        word_count: int,
        filter_words: Optional[Sequence[str]] = None) -> Sequence[str]:
    """Return the ``word_count`` highest-ranked RAKE keywords of all chapters.

    The text blocks of every ``(header, text_block)`` pair are handed to RAKE
    (configured with the project's stop words and punctuation, and with both
    ``min_length`` and ``max_length`` set to 1). The ranked keywords are then
    preprocessed and de-duplicated; when ``filter_words`` is given, keywords
    contained in it are dropped. The ranking order is kept.
    """
    # Only the text blocks are used; the chapter headers are ignored.
    text_blocks = [block for _header, block in texts_by_chapters]

    extractor = Rake(stopwords=mod_config.STOP_WORDS,
                     punctuations=mod_config.PUNCTUATION,
                     max_length=1,
                     min_length=1)
    extractor.extract_keywords_from_sentences(text_blocks)

    # Ranked from highest to lowest score
    ranked = extractor.ranked_phrases
    _logger.debug("Raw RAKE keywords: {}...".format(ranked[:20]))

    cleaned = _remove_duplicates(_preprocess_words(ranked))

    if filter_words is not None:
        # A filter was supplied: keep only keywords that are not listed in it.
        cleaned = [word for word in cleaned if word not in filter_words]

    return cleaned[:word_count]

Ejemplo n.º 3

0

Mostrar archivo

Archivo: keywords.py Proyecto: ben-kolber/first_eye_witness_tweet_detection

    def rake(self):
        """Return the words of the phrases that both RAKE metrics agree on.

        Two passes are run, each with a word-degree and a word-frequency
        ranked ``Rake``:

        1. over ``self.article.text``, keeping the phrases found in both
           top-10 lists;
        2. over those shared phrases, again keeping the phrases found in both
           top-10 lists.

        The surviving phrases are split on whitespace and every word that
        contains no ``-`` is returned lower-cased. Results follow the
        word-degree ranking, so the output order is deterministic.

        :return: list of lower-cased keyword strings (may contain repeats
            when a word occurs in several phrases)
        """

        def shared_in_rank_order(primary, secondary):
            # Unique phrases of ``primary`` that also occur in ``secondary``,
            # in the order ``primary`` ranks them.
            allowed = set(secondary)
            return [p for p in dict.fromkeys(primary) if p in allowed]

        def top_phrases(metric, extract):
            # Build a Rake for ``metric``, run ``extract`` on it and return
            # its ten best phrases.
            ranker = Rake(ranking_metric=metric)
            extract(ranker)
            return ranker.get_ranked_phrases()[:10]

        # First pass: rank the phrases of the whole article text.
        list_1 = top_phrases(
            Metric.WORD_DEGREE,
            lambda r: r.extract_keywords_from_text(self.article.text))
        list_2 = top_phrases(
            Metric.WORD_FREQUENCY,
            lambda r: r.extract_keywords_from_text(self.article.text))
        dups = shared_in_rank_order(list_1, list_2)

        # Second pass: re-rank the shared phrases, each treated as a sentence.
        list_3 = top_phrases(
            Metric.WORD_DEGREE,
            lambda r: r.extract_keywords_from_sentences(dups))
        list_4 = top_phrases(
            Metric.WORD_FREQUENCY,
            lambda r: r.extract_keywords_from_sentences(dups))

        return [
            word.lower()
            for phrase in shared_in_rank_order(list_3, list_4)
            for word in phrase.split()
            if "-" not in word
        ]

Ejemplo n.º 4

0

Mostrar archivo

def get_keywords(tweets):
    """Return the two- and three-word RAKE phrases found in ``tweets``.

    ``tweets`` is passed to RAKE as a list of sentences. A ranked phrase is
    kept only when it has 2 or 3 words, does not start with ``rt`` (any
    case), does not contain ``http`` and consists solely of letters and
    spaces. Kept phrases are returned in ranking order with surrounding
    punctuation stripped.
    """
    extractor = Rake()
    extractor.extract_keywords_from_sentences(tweets)

    selected = []
    for candidate in extractor.get_ranked_phrases():
        if not 1 < len(candidate.split()) < 4:
            continue  # wrong number of words
        if candidate[:2].lower() == 'rt':
            continue  # retweet marker
        if 'http' in candidate:
            continue  # link fragment
        if not candidate.replace(" ", "").isalpha():
            continue  # digits or symbols present
        selected.append(candidate.strip(string.punctuation))
    return selected

Ejemplo n.º 5

0

Mostrar archivo

Archivo: Recommender.py Proyecto: BitPhinix/Barbra

    def run(self, text, val):
        """
        Extract RAKE keyphrases from ``text``, look up wikipedia and medium
        links for them through this object's helper methods and print the
        combined mapping as JSON on stdout.

        :param text: raw input text; newlines are replaced by spaces and the
            text is lower-cased before extraction
        :param val: ``'article'`` to extract with ``extract_keywords_from_text``
            or ``'social'`` to treat the whole text as a single sentence;
            also stored on ``self.val``
        :return: None

        TODO Improvements:
        1. casual_tokenize can't handle 'words-with-hyphens-like-this' & reduces coverage
        """

        # Remove new lines and turn to lower case
        # TODO what if only wanting to read first x lines, but that should only be for purposes of ML
        self.val = val

        text = re.sub('\n', ' ', text).lower()

        # Extract keyphrases using Rake
        rake = Rake()
        if val == 'article':
            rake.extract_keywords_from_text(text)
        elif val == 'social':
            # extract_keywords_from_sentences expects a list of sentence
            # strings. Passing the bare string made Rake iterate it character
            # by character, so the post is wrapped as a single sentence.
            rake.extract_keywords_from_sentences([text])
        # NOTE(review): any other ``val`` performs no extraction before the
        # ranked phrases are requested below - confirm callers never do that.
        all_phrases = rake.get_ranked_phrases_with_scores()

        # Tokenize text
        article_text_tokenized = casual_tokenize(text)

        # Tokenize phrases
        all_phrases_tokenized = self.tokenize_phrases(all_phrases)

        # Tag all phrases and remove all but noun words
        all_phrases_tagged = self.pos_tag_phrase_pairs(all_phrases)
        all_phrases_tagged_nouns = self.filter_nouns(all_phrases_tagged)

        # Convert list of tagged nouns back to a string phrase
        string_phrases_nouns = self.tuple_list_to_string_list(
            all_phrases_tagged_nouns)

        # Get the indexes from the non-filtered suggested phrases in the original text
        all_surrounding_tokens, all_context_tokens = self.get_all_surrounding_tokens(
            all_phrases_tokenized, article_text_tokenized)

        # Get wikipedia urls for the top phrases (the helper is given a limit of 10)
        mapping_list = self.get_wiki_urls_top_n_phrases(
            string_phrases_nouns, all_surrounding_tokens, 10)

        # Convert the wikipedia suggestions into their JSON mapping
        wiki_mapping = self.write_suggestions_to_json(mapping_list)

        # Get page links on medium by phrase
        medium_mapping = self.get_n_listed_medium_posts(
            string_phrases_nouns, 2)

        # Combine jsons
        mapping = self.combine_mappings(wiki_mapping, medium_mapping)
        print(json.dumps(mapping))

Ejemplo n.º 6

0

Mostrar archivo

def rake_keyword_extractor_raw_text(filename):
    """Return the filtered RAKE phrases of the UTF-8 text file ``filename``.

    Every line of the file is handed to RAKE as one sentence and the ranked
    phrases are passed through ``filtering_condition_for_words``. When the
    file cannot be decoded, a message is printed and an empty list is
    returned.
    """
    extractor = Rake()
    with open(filename, "r", encoding="utf-8") as source:
        try:
            raw_lines = source.readlines()
            extractor.extract_keywords_from_sentences(raw_lines)
            return filtering_condition_for_words(
                extractor.get_ranked_phrases())
        except UnicodeDecodeError:
            print("Cant extract data from file: " + filename)
            return list()

Ejemplo n.º 7

0

Mostrar archivo

Archivo: ananse.py Proyecto: baasare/ananse

    def extract_terms(self, DataFrame, min_len=2, max_len=4):
        """
        This method uses the RAKE Algorithm to extract keywords from the text column of the DataFrame of naive search
        results and merges them with the author keywords of the ``keywords`` column.

        Keywords containing a digit, or any character matched by the symbol pattern below, are dropped. The remaining
        keywords are stripped, de-duplicated and sorted ascending.

        :param DataFrame: table with a ``text`` column (one text per row) and a ``keywords`` column holding
            ``;``-separated author keywords (missing values appear as NaN)
        :param min_len: minimum keyword length - NOTE(review): currently not used by the implementation
        :param max_len: maximum keyword length - NOTE(review): currently not used by the implementation
        :return: a sorted list consisting of a combination of extracted keywords and author keywords
        """
        r = Rake(language='english',
                 punctuations='!"#$%&\'()*+,-),./“:;≥≤<=|‘>©?@[\\]^_`{|}~',
                 ranking_metric=Metric.WORD_DEGREE)

        # Extraction using the text column; every row is treated as one sentence.
        texts = list(DataFrame['text'])

        r.extract_keywords_from_sentences(texts)
        raked_keywords = r.get_ranked_phrases()  # raked keywords

        # Extract author keywords from naive search results, skipping NaN cells.
        # Each row is split on ';' separately: joining all rows with "" before
        # splitting fused the last keyword of a row with the first of the next.
        real_keywords = []
        for row_keywords in DataFrame['keywords']:
            if str(row_keywords) != 'nan':
                real_keywords.extend(row_keywords.lower().split(";"))

        # merge raked keywords with author keywords
        keywords = raked_keywords + real_keywords

        # remove every keyword with a digit in it
        digits_cleaned_all_keywords = [
            x for x in keywords if not any(char.isdigit() for char in x)
        ]

        # NOTE(review): this character class also contains the plain letters
        # 'n' and 'o', so every keyword containing either letter is rejected -
        # confirm that this is intended.
        regex = re.compile('[@_!#$%^&""*..,≈·ακ⩽(∼苔草沼泽的no排放量天)<>?•η°/|}{~:]')

        # remove every keyword with a symbol in it and strip the survivors
        all_keywords = [
            x.strip() for x in digits_cleaned_all_keywords
            if regex.search(x) is None
        ]

        # De-duplicate through a set and return the keywords in ascending order
        return sorted(set(all_keywords))

Ejemplo n.º 8

0

Mostrar archivo

Archivo: py_readpaper.py Proyecto: sungcheolkim78/py_readpaper

    def keywords_rake_nltk(self, texts=None, words=10, **kwargs):
        """ Return the first ``words`` ranked rake_nltk phrases of ``texts``.

        When ``texts`` is None it is taken from ``self.contents(**kwargs)``.
        A list is handed to RAKE as a list of sentences, anything else as one
        text.
        """

        extractor = Rake()
        source = self.contents(**kwargs) if texts is None else texts

        # Choose the extraction entry point that matches the input type.
        extract = (extractor.extract_keywords_from_sentences
                   if isinstance(source, list)
                   else extractor.extract_keywords_from_text)
        extract(source)

        return extractor.get_ranked_phrases()[:words]

Ejemplo n.º 9

0

Mostrar archivo

Archivo: ldt.py Proyecto: TeamWork24by07/Language_Detect

def detect_and_translate(text):
    """Detect the language of ``text``, translate it to English and print
    the RAKE phrases (with scores) of the translation.

    Everything is reported on stdout; nothing is returned. The original text
    is echoed only when the detected language code is not ``'en'``. Each line
    of the translation is handed to RAKE as one sentence.
    """
    translator = google_translator()
    extractor = Rake()

    detected = translator.detect(text)
    print("\nSource Language was : ", detected[1])
    if detected[0] != 'en':
        print(text)

    english_text = translator.translate(text, lang_tgt='en')
    print("\nEnglish Translation \n")
    translated_lines = english_text.split('\n')
    extractor.extract_keywords_from_sentences(translated_lines)
    print(english_text)

    scored_phrases = extractor.get_ranked_phrases_with_scores()
    print("\nPhrases with Scores")
    print(scored_phrases)

Ejemplo n.º 10

0

Mostrar archivo

def rake_keyword_extractor(filename):
    """Return the filtered RAKE phrases of ``filename`` grouped by category.

    The file is loaded with ``load_as_json``; every record contributes its
    ``text`` to the group named by its ``category``. For each category (in
    order of first appearance) the texts are handed to RAKE as sentences, the
    ranked phrases are passed through ``filtering_condition_for_words``, and
    the category name and number of kept phrases are printed.

    :return: dict mapping category -> filtered phrases
    """
    extractor = Rake()

    # Group the record texts by category.
    texts_by_category = dict()
    for record in load_as_json(filename):
        texts_by_category.setdefault(record['category'], []).append(
            record['text'])

    keywords_by_category = dict()
    for name, texts in texts_by_category.items():
        print(name)
        extractor.extract_keywords_from_sentences(texts)
        filtered = filtering_condition_for_words(
            extractor.get_ranked_phrases())
        print(len(filtered))
        keywords_by_category[name] = filtered
    return keywords_by_category

Ejemplo n.º 11

0

Mostrar archivo

Archivo: nlp.py Proyecto: aakankshaduggal/tableQA

def extract_keywords_from_doc(doc, phrases=True, return_scores=False):
    """Extract keywords from ``doc`` (a string, or a list/tuple of strings).

    With ``phrases`` true, RAKE is used: a list/tuple is treated as
    sentences, anything else as one text. The ranked phrases are returned,
    or ``(phrase, score)`` pairs when ``return_scores`` is true.

    With ``phrases`` false, the input is word-tokenized with NLTK and every
    token whose lower-cased form is not in ``stop_words`` is returned.
    """
    if not phrases:
        documents = doc if isinstance(doc, (list, tuple)) else [doc]
        return [
            token
            for text in documents
            for token in nltk.word_tokenize(text)
            if token.lower() not in stop_words
        ]

    extractor = Rake()
    if isinstance(doc, (list, tuple)):
        extractor.extract_keywords_from_sentences(doc)
    else:
        extractor.extract_keywords_from_text(doc)

    if not return_scores:
        return extractor.get_ranked_phrases()
    # RAKE yields (score, phrase); callers get (phrase, score).
    return [(phrase, score)
            for score, phrase in extractor.get_ranked_phrases_with_scores()]

Ejemplo n.º 12

0

Mostrar archivo

Archivo: word_parser.py Proyecto: DongOnee/lova-backend

    def parse_keywords(self):
        """Extract RAKE keywords from ``self.sentence``.

        When ``self.keyword_limit`` is 0, every phrase scoring above 1 is
        appended to ``self.keywords`` and that list is returned. Otherwise
        the sentence is treated as a single sentence and the first
        ``self.keyword_limit`` ranked phrases are returned.
        """

        extractor = Rake()

        if self.keyword_limit != 0:
            extractor.extract_keywords_from_sentences([self.sentence])
            return extractor.ranked_phrases[0:self.keyword_limit]

        extractor.extract_keywords_from_text(self.sentence)
        scored = extractor.get_ranked_phrases_with_scores()
        # Each item is (score, phrase); keep the phrases scoring above 1.
        self.keywords.extend(item[1] for item in scored if item[0] > 1)
        return self.keywords

Ejemplo n.º 13

0

Mostrar archivo

Archivo: AnalyseDataset.py Proyecto: vishalkesti382/lies-have-short-legs

def do_keyword_extraction(words):
    """Return the ``(word, degree)`` pair of the word in ``words`` with the
    highest word degree in the reference corpus.

    The reference degrees come from RAKE run over the distinct values of the
    module-level ``_t["context"]`` column; the candidate words come from RAKE
    run over ``words``. Intermediate values are printed when the module-level
    ``debug`` flag is set.
    """
    if debug:
        print("---\n", words)

    # Word degrees over the whole reference corpus
    corpus_rake = Rake()
    corpus_rake.extract_keywords_from_sentences(
        _t["context"].value_counts().index.values)
    corpus_degrees = dict(corpus_rake.get_word_degrees())

    # Candidate words of the given text
    local_rake = Rake()
    local_rake.extract_keywords_from_text(words)
    keywords = dict(local_rake.get_word_degrees())

    if debug:
        print(keywords)

    # Replace each local degree by the degree the word has in the corpus.
    keywords = {word: corpus_degrees[word] for word in keywords}

    if debug:
        print(keywords)

    return Counter(keywords).most_common(1)[0]

Ejemplo n.º 14

0

Mostrar archivo

Archivo: data.py Proyecto: ldzhangyx/music-nlp-chatbot

def lyrics_preprocessing(folder_path, tags_csv_path, output_csv_path):
    """Build a tab-separated ``tags / keywords / lyrics`` file from lyric files.

    For every file in ``folder_path`` the stripped lines are joined with
    ``;`` into the lyrics, the first three ranked RAKE phrases are joined
    with ``,`` into the keywords, and the tags are looked up in the tab
    separated ``tags_csv_path`` table (indexed by ``id``) under the file name
    up to its first dot. Lyrics shorter than 50 characters, or whose first
    100 characters are not detected as English, are skipped. The rows are
    written without a header to ``output_csv_path``.
    """
    # genre
    tags_table = pd.read_csv(tags_csv_path, sep='\t', index_col='id')

    rows = []
    for index, file_name in enumerate(os.listdir(folder_path)):
        file_id = file_name.split('.')[0]

        # lyrics
        lyric_path = os.path.join(folder_path, file_name)
        with open(lyric_path, 'r', encoding='utf-8') as handle:
            lines = [raw.strip() for raw in handle.readlines()]
        lyrics = ';'.join(lines)

        # keywords
        extractor = Rake()
        extractor.extract_keywords_from_sentences(lines)
        keywords_str = ','.join(extractor.get_ranked_phrases()[:3])
        tags = tags_table.loc[file_id, 'tags']

        # clean data: drop very short or non-English lyrics
        if len(lyrics) < 50 or detect(lyrics[:100]) != 'en':
            continue

        rows.append([tags, keywords_str, lyrics])

        # progress output (only reached for rows that were kept)
        if index % 100 == 0:
            print(index)

    with open(output_csv_path, 'w', encoding='utf-8', newline='') as out:
        csv.writer(out, delimiter='\t').writerows(rows)

Ejemplo n.º 15

0

Mostrar archivo

    def get_keywords(messages, topics):
        """Return the unique noun bi-gram keywords of ``messages`` plus the
        lower-cased texts of ``topics``.

        The ``text`` of every message is handed to RAKE as one sentence. Only
        two-word phrases whose POS tags and WordNet look-ups mark both words
        as nouns are kept, and of those only the first
        ``EXTRACT_KEYWORDS_COUNT`` in ranking order. The result is
        de-duplicated through a set, so its order is not defined.
        """
        extractor = Rake()
        extractor.extract_keywords_from_sentences(
            [message['text'] for message in messages])

        def _is_noun_bigram(phrase):
            # True when both words are tagged as nouns and the first WordNet
            # synset of each word is a noun; a word unknown to WordNet
            # rejects the phrase immediately.
            words = phrase.split()
            tags = pos_tag(words)
            verdict = 'NN' in tags[0][1] and 'NN' == tags[1][1]
            for word in words:
                synsets = wn.synsets(word)
                if not synsets:
                    return False
                if synsets[0].pos() != 'n':
                    verdict = False
            return verdict

        # Items are (score, phrase); keep noun bi-grams in ranking order.
        noun_bigrams = [
            item for item in extractor.get_ranked_phrases_with_scores()
            if len(item[1].split()) == 2 and _is_noun_bigram(item[1])
        ]

        extracted = [item[1] for item in noun_bigrams[:EXTRACT_KEYWORDS_COUNT]]
        extracted.extend(topic['text'].lower() for topic in topics)
        return list(set(extracted))

Ejemplo n.º 16

0

Mostrar archivo

Archivo: NLTK-Rake test.py Proyecto: pabdelmalik/pythonplay

# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""

import gensim

import pdfminer3
txt = "C:/RandD/Ex1/test-EVD-SEARO.txt"

# Read every line of the input file; the context manager closes the handle
# as soon as the lines are in memory (it was previously left open).
with open(txt, "r") as openfile:
    lines = openfile.readlines()

import rake_nltk
from rake_nltk import Rake

# Uses stopwords for english from NLTK, and all punctuation characters.
r = Rake()

# Every line of the file is treated as one sentence.
r.extract_keywords_from_sentences(lines)

# Keyword phrases with their scores, ranked highest to lowest.
phraselist = r.get_ranked_phrases_with_scores()

# Show the five best phrases; each item is (score, phrase).
for i in phraselist[:5]:
    print("Line: ", i[1], " score: ", i[0])

Ejemplo n.º 17

0

Mostrar archivo

# Row index used for words that are missing from the ``tokens`` mapping.
_UNKNOWN_TOKEN_INDEX = 19536

# POS tags kept when ``postag`` filtering is switched on.
_KEPT_POS_TAGS = ['RB', 'RBR', 'RBS', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
                  'VBZ', 'WRB', 'JJ', 'JJR', 'JJS', 'NN']


def _mean_word_vector(tokens, wordVectors, words):
    """Return the average word vector of *words* (zero vector when empty)."""
    total = np.zeros((wordVectors.shape[1],))
    count = len(words)
    if count == 0:
        # Nothing to average: avoid the 0/0 division that produced NaNs.
        return total
    for word in words:
        total += wordVectors[tokens.get(word, _UNKNOWN_TOKEN_INDEX)]
    total /= count
    return total


def getSentenceFeature(tokens, wordVectors, sentence, keyword='off', postag='off'):
    """ Obtain the sentence feature for sentiment analysis by averaging its word vectors

    Inputs:
    - tokens: a dictionary that maps words to their indices in
              the word vector list
    - wordVectors: word vectors (each row) for all tokens
    - sentence: a list of words in the sentence of interest
    - keyword: when 'on', average only the RAKE phrases extracted from the
               sentence (takes precedence over ``postag``)
    - postag: when 'on', average only the words whose NLTK POS tag is one of
              the kept adverb/verb/adjective/noun tags

    Whenever the selected filter keeps nothing, the whole sentence is
    averaged instead. Words missing from ``tokens`` use the fallback row
    index 19536. An empty sentence yields a zero vector.

    Output:
    - sentVector: feature vector for the sentence
    """
    if keyword == 'on':
        r = Rake()
        r.extract_keywords_from_sentences(sentence)
        selected = r.get_ranked_phrases()
    elif postag == 'on':
        selected = [word for word, tag in nltk.pos_tag(sentence)
                    if tag in _KEPT_POS_TAGS]
    else:
        selected = sentence

    # Fall back to every word of the sentence when the filter kept nothing.
    words = selected if len(selected) > 0 else sentence
    sentVector = _mean_word_vector(tokens, wordVectors, words)

    return sentVector

Ejemplo n.º 18

0

Mostrar archivo

Archivo: l2_rake.py Proyecto: coder352/shellscript

#!/usr/bin/python3
# coding: utf-8
# pip install rake-nltk
from rake_nltk import Rake
from nltk import tokenize
r = Rake()  # Uses stopwords for english from NLTK, and all punctuation characters by default
##################################################################
## Extraction given the text.
mytext = '''Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered.
            Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given.
            These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.'''
r.extract_keywords_from_text(mytext)
print(r.get_ranked_phrases())  # To get keyword phrases ranked highest to lowest.
# ['linear diophantine equations', 'minimal generating sets', 'minimal supporting set', 'minimal set', 'upper bounds', 'strict inequations', 'nonstrict inequations', 'mixed types', 'corresponding algorithms', 'considered types', 'types', 'considered', 'algorithms', 'used', 'systems', 'system', 'solving', 'solutions', 'given', 'criteria', 'construction', 'constructing', 'components', 'compatibility']
print(r.get_ranked_phrases_with_scores())  # To get keyword phrases ranked highest to lowest with scores.
# [(9.0, 'linear diophantine equations'), (8.666666666666666, 'minimal generating sets'), (8.166666666666666, 'minimal supporting set'), (5.166666666666666, 'minimal set'), (4.0, 'upper bounds'), (4.0, 'strict inequations'), (4.0, 'nonstrict inequations'), (3.666666666666667, 'mixed types'), (3.5, 'corresponding algorithms'), (3.166666666666667, 'considered types'), (1.6666666666666667, 'types'), (1.5, 'considered'), (1.5, 'algorithms'), (1.0, 'used'), (1.0, 'systems'), (1.0, 'system'), (1.0, 'solving'), (1.0, 'solutions'), (1.0, 'given'), (1.0, 'criteria'), (1.0, 'construction'), (1.0, 'constructing'), (1.0, 'components'), (1.0, 'compatibility')]
##################################################################
## Extraction given the list of strings where each string is a sentence.
## The text is first split into sentences with nltk's sent_tokenize; the
## recorded output below is the same as for the whole-text extraction above.
r.extract_keywords_from_sentences(tokenize.sent_tokenize(mytext))
print(r.get_ranked_phrases())
# ['linear diophantine equations', 'minimal generating sets', 'minimal supporting set', 'minimal set', 'upper bounds', 'strict inequations', 'nonstrict inequations', 'mixed types', 'corresponding algorithms', 'considered types', 'types', 'considered', 'algorithms', 'used', 'systems', 'system', 'solving', 'solutions', 'given', 'criteria', 'construction', 'constructing', 'components', 'compatibility']
print(r.get_ranked_phrases_with_scores())
# [(9.0, 'linear diophantine equations'), (8.666666666666666, 'minimal generating sets'), (8.166666666666666, 'minimal supporting set'), (5.166666666666666, 'minimal set'), (4.0, 'upper bounds'), (4.0, 'strict inequations'), (4.0, 'nonstrict inequations'), (3.666666666666667, 'mixed types'), (3.5, 'corresponding algorithms'), (3.166666666666667, 'considered types'), (1.6666666666666667, 'types'), (1.5, 'considered'), (1.5, 'algorithms'), (1.0, 'used'), (1.0, 'systems'), (1.0, 'system'), (1.0, 'solving'), (1.0, 'solutions'), (1.0, 'given'), (1.0, 'criteria'), (1.0, 'construction'), (1.0, 'constructing'), (1.0, 'components'), (1.0, 'compatibility')]

Ejemplo n.º 19

0

Mostrar archivo

Archivo: rake-nltk.py Proyecto: hangle2/tiki-reviews-discover

from data_helper import get_reviews_from_database
from pre_processing import pre_process, get_stop_words

# Load the reviews for the hard-coded id and pre-process each review's content.
sentences = get_reviews_from_database(4536405)
formated_sentences = [
    pre_process(sentence['content']) for sentence in sentences
]

from rake_nltk import Rake

# Rake is given the project's own stop word list (read from
# data/stopwords.txt) and "vietnamese" as its language.
stop_words = get_stop_words("data/stopwords.txt")
r = Rake(stopwords=stop_words, language="vietnamese")

# Extraction given the list of pre-processed reviews, one sentence each.
r.extract_keywords_from_sentences(formated_sentences)

# Print the phrases ranked highest to lowest, one per line.
phrases = r.get_ranked_phrases()
for phrase in phrases:
    print(phrase)

Ejemplo n.º 20

0

Mostrar archivo

Archivo: Extraction.py Proyecto: lrxzhy/CyberEventExtraction

def extractKeywords(document):
    """Return the RAKE key phrases of ``document``, ranked highest to lowest.

    :param document: list of strings, each treated as one sentence
    :return: list of ranked phrase strings
    """
    r = Rake()
    # extract_keywords_from_sentences only fills the Rake instance and
    # returns None, so the phrases must be read back from the instance.
    r.extract_keywords_from_sentences(document)
    return r.get_ranked_phrases()

Ejemplo n.º 21

0

Mostrar archivo

)

title_id = 0
for l in range(len(lines)):
    if lines[l].strip().startswith("<EOS>"):
        continue
    title = lines_title[title_id].strip()
    title_id += 1
    document = lines[l].replace('t outline . <s>', '').replace(
        ' <p> ', ' ').replace('  ', ' ').strip().replace(' <s> ',
                                                         '\n').split('\n')
    body = lines[l].replace('t outline . <s>', '').strip()

    try:
        r = Rake()
        r.extract_keywords_from_sentences(document)
        top_features = r.get_ranked_phrases()
        top_features = clean_top_features(top_features, topK)
    except Exception:
        print(document)
        continue

    keywordsSTR = convert_keys_to_str(top_features)

    if len(title) > 2:
        title = title.lower().replace("paid notice :",
                                      "").replace("paid notice:",
                                                  "").replace("journal;",
                                                              "").strip()
        keywordsSTR = title + '[SEP]' + keywordsSTR
        if len(keywordsSTR.split(' ')) > 100:

Ejemplo n.º 22

0

Mostrar archivo

    def generate_questions(self, data):
        """Generate fill-in-the-gap questions from ``data['text']``.

        Key phrases come from ``self.generate_keywords``; when that returns
        None, RAKE (word-degree metric, 1-5 words) is used instead and the
        better-ranked half of its phrases is kept. Phrases from
        ``generate_keywords`` are kept only when their score is above 0.5 and
        their similarity to both the title and the subject is below
        ``self.title_similarity``.

        Every occurrence of a phrase in a sentence is then blanked out, with
        at most three questions per sentence and only for phrases that are
        not too similar (``self.title_similarity``) to phrases already used.

        :param data: dict with the keys ``text``, ``title`` and ``subject``
        :return: list of dicts with the keys ``question`` (gap as long as the
            answer), ``answer``, ``sentence`` and ``short_gap`` (fixed-size gap)
        """
        text = data['text']
        title = self.nlp(data['title'])
        subject = self.nlp(data['subject'])
        doc = self.nlp(text)
        ranking_metrics = [Metric.WORD_DEGREE]
        sentences = [s.text for s in doc.sents]
        phrases_swisscom = self.generate_keywords(text)
        phrases = []
        # EG: [['egyptian president gamal abdel nasser', 'suez canal', 'israeli war', 'arab world', 'egypt', 'suez crisis', 'soviet union', 'nasser', 'tripartite aggression', 'israel'], [1.0, 0.8614248633384705, 0.8030354976654053, 0.7896698713302612, 0.811191737651825, 0.8514521718025208, 0.6438262462615967, 0.8813737034797668, 0.5584405660629272, 0.7795075178146362], [['nasser'], ['canal'], [], [], [], [], [], ['egyptian president gamal abdel nasser'], [], []]]

        if phrases_swisscom is None:
            # fallback option just in case swisscom isn't working
            for metric in ranking_metrics:
                r = Rake(ranking_metric=metric, min_length=1, max_length=5)

                # Extraction given the sentences as a list of strings.
                r.extract_keywords_from_sentences(sentences)

                # To get keyword phrases ranked highest to lowest and strip out the last half.
                keywords = r.get_ranked_phrases()
                keywords = keywords[0:round(len(keywords) * 0.5)]
                phrases.extend(keywords)
        else:
            print(phrases_swisscom)
            # Pair every phrase with its score, drop the ones too close to
            # the title or subject, and sort by score (best first).
            phrases = sorted(
                [(p, phrases_swisscom[1][i])
                 for i, p in enumerate(phrases_swisscom[0])
                 if title.similarity(self.nlp(p)) < self.title_similarity
                 and subject.similarity(self.nlp(p)) < self.title_similarity],
                key=lambda x: x[1],
                reverse=True)
            phrases = [p for p, s in phrases if s > 0.5]

        generated = []
        sentences_used = {s: 0 for s in doc.sents}
        phrases_used = []

        for tok in doc:
            for phrase in phrases:
                # Token span of the sentence that would hold the phrase if it
                # started at this token.
                tok_sent_i = tok.i - tok.sent.start
                tok_sent_end = tok_sent_i + len(phrase.split())
                same_len = tok.sent[tok_sent_i:tok_sent_end]
                if [t.lower_ for t in same_len] == phrase.lower().split():
                    # Parse the phrase once and reuse it for every comparison
                    # and for the used-phrase list.
                    phrase_doc = self.nlp(phrase)
                    similarity = [
                        phrase_doc.similarity(p) for p in phrases_used
                    ]  # figure out empty vector
                    if sentences_used[tok.sent] < 3 and max(
                            similarity, default=0.0) < self.title_similarity:
                        toks_with_ws = [
                            token.text_with_ws for token in tok.sent
                        ]
                        long_gap_toks = deepcopy(toks_with_ws)
                        for i in range(tok_sent_i, tok_sent_end):
                            # Long gap: underscores as long as each token.
                            # Short gap: one fixed blank for the whole phrase.
                            long_gap_toks[i] = '_' * len(tok.sent[i])
                            if i == tok_sent_i:
                                toks_with_ws[i] = '_____'
                            else:
                                toks_with_ws[i] = ''
                        # Restore the whitespace that followed the last
                        # blanked token.
                        long_gap_toks[tok_sent_end - 1] = long_gap_toks[
                            tok_sent_end - 1] + tok.sent[tok_sent_end -
                                                         1].whitespace_
                        toks_with_ws[tok_sent_end - 1] = toks_with_ws[
                            tok_sent_end - 1] + tok.sent[tok_sent_end -
                                                         1].whitespace_
                        pair = {
                            "question": "".join(long_gap_toks),
                            "answer":
                            "".join([t.text_with_ws for t in same_len]),
                            "sentence": tok.sent.text,
                            "short_gap": "".join(toks_with_ws)
                        }
                        sentences_used[tok.sent] += 1
                        phrases_used.append(phrase_doc)
                        generated.append(pair)

        return generated

Ejemplo n.º 23

0

Mostrar archivo

from rake_nltk import Rake

# Uses stopwords for english from NLTK, and all puntuation characters by
# default
r = Rake()

# Read every line of the test file; each line is one question.
with open("question_keyword_tests.txt") as f:
    x = list(f)

# Extraction given the text: rank and print the phrases of each line on its own.
for line in x:
    r.extract_keywords_from_text(line)
    print(r.get_ranked_phrases())

# Extraction given the list of strings where each string is a sentence.
r.extract_keywords_from_sentences(x)

# To get keyword phrases ranked highest to lowest.
phrases = r.get_ranked_phrases()

# To get keyword phrases ranked highest to lowest with scores.
phrases_with_text = r.get_ranked_phrases_with_scores()

Ejemplo n.º 24

0

Mostrar archivo

Archivo: main.py Proyecto: Finspire13/ML-Homework

for entry in patterns:
    print(f'{entry[0]}: {entry[1]}')

####################### Question 2.1 ##############################

# Get keywords using RAKE: every (sorted) title is one sentence
titles = sorted(get_values(data, 'title'))

# NLTK's english stop words plus a few title-specific filler words
extra_stopwords = ['using', 'via', 'without',
                   'towards', 'toward', 'based']
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(extra_stopwords)

extractor = Rake(stopwords=stopwords)
extractor.extract_keywords_from_sentences(titles)

# Keep only keywords of at most 5 words, capped at the 20000 best ranked
keywords = [
    phrase for phrase in extractor.get_ranked_phrases()
    if len(phrase.split()) <= 5
][:20000]

# Get keyword of each team's research
print_divider('=', 'Question 2.1: Team Insterests')
for team in teams:
    team_data = get_team_data(data, team)
    team_titles = sorted(get_values(team_data, 'title'))
    team_freq = get_ranked_keyword_frequency(team_titles, keywords)

Ejemplo n.º 25

0

Mostrar archivo

Archivo: extract_Keywords_Article.py Proyecto: JoaoSantos123/PythonProjects

#Project 3.1 - Extract Keywords Article

#rake-nltk short for Rapid Automatic Keyword Extraction algorithm
from rake_nltk import Rake

# One Rake instance with its default settings is reused for both extractions.
r = Rake()

# Article text to analyse. The trailing backslashes continue the string
# literal, so the whole article is a single line without newlines.
text = " The Platform of the Future? \
\
The survival of any organization depends on its ability to outperform competitors and marketplaces in attracting and rewarding talent, ideas and capital. As communication and transaction costs have drastically declined because of the internet, new platforms have emerged, delivering goods and services at a speed and efficiency previously unimaginable. These new digital players took advantage of the changes in the underlying technology to challenge established business models and rethink pre-existing value chains. The ones that succeeded did so because they achieved a level of efficiency that their brick and mortar counterparts had trouble replicating. Through online reputation and feedback systems, digital players were able to create global marketplaces where individuals, products and services could be matched more effectively than ever before. By providing curation and ensuring the safety of transactions, these new types of intermediaries were able to reap the returns of this first wave of digitization. \
\
A similar transformation is about to happen as blockchain technology and cryptocurrencies mature and mainstream applications emerge. Under this new wave of technological change, intermediaries will still be able to add value to transactions, but the nature of intermediation will fundamentally change. Whereas some established players will be able to use this opportunity to further scale their operations, others will be challenged by new entrants proposing entirely new approaches to value creation and value capture.\
\
Complementing Artificial Intelligence with Human Intelligence "

# Extraction given a text.
# NOTE(review): nothing reads the result of this extraction before the
# extraction below runs on the same Rake instance - confirm whether the
# article's phrases were meant to be printed.
r.extract_keywords_from_text(text)

# Extraction given a list of strings, where each string is treated as a sentence
myList = ["ability to outperform", "blockchain", "suvival organization", "cryptocurrencies emerge", "artifical intelligence blockchain"]
r.extract_keywords_from_sentences(myList)

# To get keyword phrases ranked highest to lowest
# NOTE(review): the returned list is discarded here.
r.get_ranked_phrases()

# To get keyword phrases ranked highest to lowest with scores
# (these are the phrases of myList, the most recent extraction)
print(r.get_ranked_phrases_with_scores())

Ejemplo n.º 26

0

Mostrar archivo

Archivo: ml.py Proyecto: jackvandrunen/hackuci18

def menu_response(reviews):
    """Derive scored food phrases from a collection of review texts.

    Steps, as implemented below:

    1. Split every review on ``'.'``, tokenize each piece with
       ``utils.simple_preprocess`` and drop empty pieces / empty reviews.
    2. Run RAKE on each review and keep its five ranked phrases with the
       highest ``similarity_to_food`` value.
    3. Compute ``n_similarity`` for every pair of kept phrases and walk the
       pairs from most to least similar, stopping below 0.75, to select the
       final set of food phrases.
    4. Score the selected phrases with ``score_phrases``.

    Args:
        reviews: iterable of review strings.

    Returns:
        A ``(payload, status)`` tuple: ``([], 400)`` when ``reviews`` is
        empty/falsy, otherwise the ``score_phrases`` results sorted by their
        second element in descending order, paired with ``200``.
        (The integers are presumably HTTP status codes -- confirm with caller.)
    """
    if not reviews:
        return [], 400

    # Build concrete lists instead of nesting filter(): on Python 3 filter()
    # returns a lazy object, so len() on the inner result raises TypeError and
    # `reviews` would already be exhausted by the time score_phrases() sees it.
    tokenized_reviews = []
    for raw_review in reviews:
        # len() mirrors the predicate of the original filter(len, ...).
        sentence_tokens = [
            tokens
            for tokens in map(utils.simple_preprocess, raw_review.split('.'))
            if len(tokens)
        ]
        if sentence_tokens:
            tokenized_reviews.append(sentence_tokens)
    reviews = tokenized_reviews

    r = Rake()
    key_phrases = []
    for review in reviews:
        r.extract_keywords_from_sentences(sentences(review))
        # NOTE(review): this line reads `ml.sentence_model` while the pairwise
        # step below reads the bare name `sentence_model` -- confirm both
        # resolve to the same model.
        by_food_similarity = sorted(
            r.get_ranked_phrases(),
            key=lambda s: similarity_to_food(ml.sentence_model, s),
            reverse=True)
        key_phrases.extend(by_food_similarity[:5])

    similar_phrases = []
    for first, second in itertools.combinations(key_phrases, 2):
        p1, p2 = first.split(), second.split()
        try:
            similar_phrases.append(
                (p1, p2, sentence_model.wv.n_similarity(p1, p2)))
        except KeyError:
            # Best effort: pairs the model cannot compare are skipped
            # (presumably out-of-vocabulary words -- confirm).
            pass
    similar_phrases.sort(key=lambda t: t[2], reverse=True)

    # Greedy selection over pairs in descending similarity. The first time a
    # pair shows up, both phrases are parked in `queue`. When a later pair
    # contains an already-queued phrase, the shorter phrase of that pair (by
    # character count; the second one on a tie) is accepted and the other is
    # rejected. Pairs touching an accepted or rejected phrase are ignored.
    food_phrases, rejected, queue = set(), set(), set()
    for p1, p2, similarity in similar_phrases:
        if similarity < 0.75:
            # Sorted descending, so every remaining pair is below the cutoff.
            break
        p1 = ' '.join(p1)
        p2 = ' '.join(p2)
        settled = food_phrases | rejected
        if p1 in settled or p2 in settled:
            continue
        if p1 in queue or p2 in queue:
            if len(p1) < len(p2):
                food_phrases.add(p1)
                rejected.add(p2)
            else:
                food_phrases.add(p2)
                rejected.add(p1)
        else:
            queue.add(p1)
            queue.add(p2)

    return sorted(score_phrases(food_phrases, reviews),
                  key=lambda t: t[1],
                  reverse=True), 200

Ejemplo n.º 27

0

Mostrar archivo

def get_key_words(comments):
    """Return RAKE's ranked phrases, with scores, for the given comments.

    ``comments`` is handed unchanged to
    ``Rake.extract_keywords_from_sentences``; the result of
    ``get_ranked_phrases_with_scores()`` on that same fresh ``Rake``
    instance is returned as-is.
    """
    extractor = Rake()
    extractor.extract_keywords_from_sentences(comments)
    scored_phrases = extractor.get_ranked_phrases_with_scores()
    return scored_phrases