Example #1
    def process_text(self):
        # Remove new lines and turn to lower case
        text = re.sub('\n', ' ', self.text).lower()

        # Extract keyphrases using Rake
        # TODO also possible to extract keywords from sentence
        rake = Rake()
        if self.text_type == 'article':
            rake.extract_keywords_from_text(text)
        elif self.text_type == 'social':
            # extract_keywords_from_sentences expects an iterable of
            # sentence strings, so tokenize into sentences first
            rake.extract_keywords_from_sentences(sent_tokenize(text))
        self.all_phrases = rake.get_ranked_phrases_with_scores()
        # word_freq_dist = rake.get_word_frequency_distribution()

        # Tokenize text
        self.article_text_tokenized = word_tokenize(text)

        # Tokenize phrases
        self.all_phrases_tokenized = self.tokenize_phrases()

        # Tag all phrases and remove all but noun words
        self.all_phrases_tagged = pos_tag_phrase_pairs(self.all_phrases)
        self.all_phrases_tagged_nouns = filter_pos(self.all_phrases_tagged,
                                                   "nouns")

        # Convert list of tagged nouns back to a string phrase
        self.string_phrases_nouns = self.tuple_list_to_string_list()
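The helper functions used above (tokenize_phrases, pos_tag_phrase_pairs, filter_pos, tuple_list_to_string_list) live elsewhere in the project and are not shown. A minimal sketch, assuming phrases arrive as (score, phrase) pairs from get_ranked_phrases_with_scores(), of what the two tagging helpers might look like:

from nltk import pos_tag, word_tokenize

def pos_tag_phrase_pairs(phrases):
    # POS-tag the words of every (score, phrase) pair.
    return [(score, pos_tag(word_tokenize(phrase)))
            for score, phrase in phrases]

def filter_pos(tagged_phrases, pos="nouns"):
    # Keep only the noun tokens (NN, NNS, NNP, NNPS) of each tagged phrase.
    if pos != "nouns":
        raise ValueError("only 'nouns' is supported in this sketch")
    return [(score, [(w, t) for w, t in tagged if t.startswith("NN")])
            for score, tagged in tagged_phrases]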
Example #2
def keywords_by_rake(
        texts_by_chapters: Sequence[Tuple[str, str]],
        word_count: int,
        filter_words: Optional[Sequence[str]] = None) -> Sequence[str]:
    """Extract keywords from the raw complete text (by appending the chapter-divided
    text blocks into a complete text block) using RAKE. The RAKE-ranked keywords shall
    be preprocessed and duplicates shall be removed. If filter_words is provided,
    those words will be filtered out from the list of keywords. Only the word_count
    most highly ranked keywords shall be returned."""
    complete_text_by_chapters = [text_block for _, text_block in texts_by_chapters]

    r = Rake(stopwords=mod_config.STOP_WORDS,
             punctuations=mod_config.PUNCTUATION,
             max_length=1,
             min_length=1)
    # Extract keywords from the chapter text blocks
    r.extract_keywords_from_sentences(complete_text_by_chapters)

    # Get list of ranked keywords (highest-lowest)
    keywords = r.ranked_phrases
    _logger.debug("Raw RAKE keywords: {}...".format(keywords[:20]))

    # Preprocess the keywords
    keywords = _preprocess_words(keywords)
    keywords = _remove_duplicates(keywords)

    if filter_words is not None:
        # Filter is specified. Filter out the specified words from the keywords.
        keywords = list(filter(lambda x: x not in filter_words, keywords))
    # Return (possibly filtered) list of preprocessed keywords in ranked order
    return keywords[:word_count]
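The module-private helpers _preprocess_words and _remove_duplicates are not shown. Plausible stand-ins, assuming they normalize case and deduplicate while preserving the RAKE ranking order:

def _preprocess_words(words):
    # Lower-case and strip surrounding whitespace; the real project may
    # also lemmatize or strip punctuation.
    return [w.strip().lower() for w in words]

def _remove_duplicates(words):
    # Drop repeats while keeping the first (highest-ranked) occurrence.
    seen = set()
    return [w for w in words if not (w in seen or seen.add(w))]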
Example #3
    def rake(self):
        r_1 = Rake(ranking_metric=Metric.WORD_DEGREE)
        r_2 = Rake(ranking_metric=Metric.WORD_FREQUENCY)

        # Extraction given the text.
        r_1.extract_keywords_from_text(self.article.text)
        r_2.extract_keywords_from_text(self.article.text)

        # Get the top 10 keyword phrases, ranked highest to lowest.
        list_1 = r_1.get_ranked_phrases()[:10]
        list_2 = r_2.get_ranked_phrases()[:10]

        # Keep the phrases that both ranking metrics agree on
        dups = set(list_1) & set(list_2)
        r_3 = Rake(ranking_metric=Metric.WORD_DEGREE)
        r_4 = Rake(ranking_metric=Metric.WORD_FREQUENCY)
        r_3.extract_keywords_from_sentences(dups)
        r_4.extract_keywords_from_sentences(dups)
        list_3 = r_3.get_ranked_phrases()[:10]
        list_4 = r_4.get_ranked_phrases()[:10]

        rake_keywords = []
        tmp_keywords = list(set(list_3) & set(list_4))
        for keyword in tmp_keywords:
            for word in keyword.split():
                if "-" not in word:
                    rake_keywords.append(word.lower())

        return rake_keywords
Example #4
def get_keywords(tweets):
    rake = Rake()
    rake.extract_keywords_from_sentences(tweets)
    rake_return = []
    for phrase in rake.get_ranked_phrases():
        words = phrase.split()
        if (1 < len(words) < 4
                and phrase[:2].lower() != 'rt'
                and 'http' not in phrase
                and phrase.replace(" ", "").isalpha()):
            rake_return.append(phrase.strip(string.punctuation))
    return rake_return
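A quick usage sketch with made-up tweets; the filter keeps two- and three-word alphabetic phrases and drops retweet markers and links:

tweets = [
    "rt @user check out this great coffee shop",
    "loving the new espresso machine at the office",
]
print(get_keywords(tweets))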
Example #5
    def run(self, text, val):
        """
        TODO Improvements:
        1. casual_tokenize can't handle 'words-with-hyphens-like-this' & reduces coverage
        """

        # Remove new lines and turn to lower case
        # TODO: what if we only want to read the first x lines? That should only matter for ML purposes.
        self.val = val

        text = re.sub('\n', ' ', text).lower()

        # Extract keyphrases using Rake
        # TODO also possible to extract keywords from sentence
        rake = Rake()
        if val == 'article':
            rake.extract_keywords_from_text(text)
        elif val == 'social':
            # extract_keywords_from_sentences expects an iterable of
            # sentence strings, so tokenize into sentences first
            rake.extract_keywords_from_sentences(sent_tokenize(text))
        all_phrases = rake.get_ranked_phrases_with_scores()
        word_freq_dist = rake.get_word_frequency_distribution()

        # Tokenize text
        article_text_tokenized = casual_tokenize(text)

        # Tokenize phrases
        all_phrases_tokenized = self.tokenize_phrases(all_phrases)

        # Tag all phrases and remove all but noun words
        all_phrases_tagged = self.pos_tag_phrase_pairs(all_phrases)
        all_phrases_tagged_nouns = self.filter_nouns(all_phrases_tagged)

        # Convert list of tagged nouns back to a string phrase
        string_phrases_nouns = self.tuple_list_to_string_list(
            all_phrases_tagged_nouns)

        # Get the indexes from the non-filtered suggested phrases in the original text
        all_surrounding_tokens, all_context_tokens = self.get_all_surrounding_tokens(
            all_phrases_tokenized, article_text_tokenized)

        # Get wikipedia urls for the top 10 phrases
        mapping_list = self.get_wiki_urls_top_n_phrases(
            string_phrases_nouns, all_surrounding_tokens, 10)

        # Return mapping to console
        wiki_mapping = self.write_suggestions_to_json(mapping_list)
        # print(json.dumps(wiki_mapping))

        # Get page links on medium by phrase
        medium_mapping = self.get_n_listed_medium_posts(
            string_phrases_nouns, 2)
        # print(json.dumps(medium_mapping))

        # Combine jsons
        mapping = self.combine_mappings(wiki_mapping, medium_mapping)
        print(json.dumps(mapping))
Example #6
def rake_keyword_extractor_raw_text(filename):
    rake = Rake()
    word_array = list()
    with open(filename, "r", encoding="utf-8") as file:
        try:
            extractor_data = file.readlines()
            rake.extract_keywords_from_sentences(extractor_data)
            word_array = filtering_condition_for_words(
                rake.get_ranked_phrases())
        except UnicodeDecodeError:
            print("Cant extract data from file: " + filename)
    return word_array
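filtering_condition_for_words is a project helper that is not shown (it also appears in Example #10 below). One plausible implementation, assuming it keeps short, purely alphabetic phrases:

def filtering_condition_for_words(phrases):
    # Keep phrases of one to three purely alphabetic words.
    return [p for p in phrases
            if 1 <= len(p.split()) <= 3 and p.replace(" ", "").isalpha()]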
Example #7
    def extract_terms(self, DataFrame, min_len=2, max_len=4):
        """
        This method uses the RAKE Algorithm to extract keywords from the text column of the DataFrame of naive search
        results.


        :param DataFrame:
        :param min_len: minimum keyword length
        :param max_len: maximum keyword length
        :return: a list consisting of  a combination of extracted keywords and author keyword
        """
        r = Rake(language='english',
                 punctuations='!"#$%&\'()*+,-),./“:;≥≤<=|‘>©?@[\\]^_`{|}~',
                 ranking_metric=Metric.WORD_DEGREE,
                 min_length=min_len,
                 max_length=max_len)

        # Extraction using the text column.
        texts = list(DataFrame['text'])

        r.extract_keywords_from_sentences(texts)
        raked_keywords = r.get_ranked_phrases()  # raked keywords

        # Extract author keywords from naive search results and remove blank values
        author_keywords = list(DataFrame['keywords'])
        real_keywords = [
            x.lower() for x in author_keywords if str(x) != 'nan'
        ]  # removing nan values from list of author keywords

        # merge all keywords and split into list
        real_keywords = "".join(real_keywords)
        real_keywords = real_keywords.split(";")

        # merge raked keywords with author keywords
        keywords = raked_keywords + real_keywords

        # loop through all keywords, remove every keyword with a digit in it and create new cleaned list
        digits_cleaned_all_keywords = [
            x for x in keywords if not any(char.isdigit() for char in x)
        ]

        regex = re.compile('[@_!#$%^&""*..,≈·ακ⩽(∼苔草沼泽的no排放量天)<>?•η°/|}{~:]')

        # loop through all keywords, remove every keyword with a symbol in it using regex and create new cleaned list
        all_keywords = [
            x.strip() for x in digits_cleaned_all_keywords
            if (regex.search(x) is None)
        ]

        # Convert keyword list to set and then back to list to deduplicate keyword list
        all_keywords = list(set(all_keywords))
        all_keywords.sort(reverse=False)

        return all_keywords
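A usage sketch with a tiny DataFrame that mirrors the expected 'text' and 'keywords' columns; the hosting object, called searcher here, is hypothetical:

import pandas as pd

df = pd.DataFrame({
    "text": ["Machine learning for peatland carbon flux.",
             "Remote sensing of wetland vegetation."],
    "keywords": ["machine learning;carbon flux", float("nan")],
})
terms = searcher.extract_terms(df, min_len=2, max_len=4)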
Example #8
    def keywords_rake_nltk(self, texts=None, words=10, **kwargs):
        """ extract keywords using rake_nltk """

        r = Rake()
        if texts is None:
            texts = self.contents(**kwargs)

        if isinstance(texts, list):
            r.extract_keywords_from_sentences(texts)
        else:
            r.extract_keywords_from_text(texts)

        res = r.get_ranked_phrases()
        return res[:words]
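A usage sketch; the hosting object (called corpus here) and its contents() method are assumptions, not shown in the example:

top_phrases = corpus.keywords_rake_nltk(
    texts=["First review text.", "Second review text."], words=5)
print(top_phrases)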
Example #9
def detect_and_translate(text):
    translator = google_translator()
    r = Rake()
    original_lang = translator.detect(text)
    print("\nSource Language was : ", original_lang[1])
    if (original_lang[0] != 'en'):
        print(text)
    translate_text = translator.translate(text, lang_tgt='en')
    print("\nEnglish Translation \n")
    r.extract_keywords_from_sentences(translate_text.split('\n'))
    print(translate_text)
    ranked = r.get_ranked_phrases_with_scores()
    print("\nPhrases with Scores")
    print(ranked)
Example #10
def rake_keyword_extractor(filename):
    rake = Rake()
    extractor_data = load_as_json(filename)
    categories = dict()
    result_dict = dict()
    for content in extractor_data:
        if content['category'] not in categories:
            categories[content['category']] = []
            result_dict[content['category']] = []
        categories[content['category']].append(content['text'])
    for category, categoryArray in categories.items():
        print(category)
        rake.extract_keywords_from_sentences(categoryArray)
        word_array = filtering_condition_for_words(rake.get_ranked_phrases())
        print(len(word_array))
        result_dict[category] = word_array
    return result_dict
Example #11
def extract_keywords_from_doc(doc, phrases=True, return_scores=False):
    if phrases:
        r = Rake()
        if isinstance(doc, (list, tuple)):
            r.extract_keywords_from_sentences(doc)
        else:
            r.extract_keywords_from_text(doc)
        if return_scores:
            return [(b, a) for a, b in r.get_ranked_phrases_with_scores()]
        else:
            return r.get_ranked_phrases()
    else:
        if not isinstance(doc, (list, tuple)):
            doc = [doc]
        ret = []
        for x in doc:
            for t in nltk.word_tokenize(x):
                if t.lower() not in stop_words:
                    ret.append(t)
        return ret
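A usage sketch covering both branches; stop_words is a module-level collection in the original and is assumed here:

import nltk
from nltk.corpus import stopwords
from rake_nltk import Rake

stop_words = set(stopwords.words("english"))

print(extract_keywords_from_doc("Compilers translate source code to machine code."))
print(extract_keywords_from_doc(["Compilers translate source code."], phrases=False))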
Example #12
    def parse_keywords(self):

        r = Rake()

        if self.keyword_limit == 0:
            sentence = self.sentence
            r.extract_keywords_from_text(sentence)
            score_words = r.get_ranked_phrases_with_scores()

            for keyword in score_words:
                if keyword[0] > 1:
                    self.keywords.append(keyword[1])

            return self.keywords

        else:
            sentences = [self.sentence]
            r.extract_keywords_from_sentences(sentences)
            keywords = r.ranked_phrases
            return keywords[0:self.keyword_limit]
Example #13
def do_keyword_extraction(words):
    if debug: print("---\n", words)
        
    rake_all = Rake()
    rake_all.extract_keywords_from_sentences(_t["context"].value_counts().index.values)

    word_degrees = dict(rake_all.get_word_degrees())
    
    r = Rake()
    r.extract_keywords_from_text(words)

    keywords = dict(r.get_word_degrees())
    
    if debug: print(keywords)
        
    for k, v in keywords.items():
        # Use the corpus-wide degree when available; fall back to this
        # text's own degree for words the corpus has not seen.
        keywords[k] = word_degrees.get(k, v)
    
    if debug: print(keywords)

    return Counter(keywords).most_common(1)[0]
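This function leans on module globals: a pandas DataFrame _t with a "context" column, a debug flag, and collections.Counter. A minimal setup sketch with made-up data:

import pandas as pd
from collections import Counter
from rake_nltk import Rake

debug = False
_t = pd.DataFrame({"context": ["the quick brown fox", "lazy dogs sleep all day"]})
print(do_keyword_extraction("quick brown fox jumps"))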
Example #14
def lyrics_preprocessing(folder_path, tags_csv_path, output_csv_path):
    collection = list()

    # genre
    tags_table = pd.read_csv(tags_csv_path, sep='\t', index_col='id')

    # lyrics
    for i, file_name in enumerate(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        file_id = file_name.split('.')[0]

        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            # don't reuse the loop variable i here; it is needed for the
            # progress print at the bottom of the loop
            lines = [line.strip() for line in lines]
            lyrics = ';'.join(lines)

        # keywords
        r = Rake()
        r.extract_keywords_from_sentences(lines)
        keywords = r.get_ranked_phrases()
        keywords_str = ','.join(keywords[:3])
        tags = tags_table.loc[file_id, 'tags']

        # clean data
        if len(lyrics) < 50:
            continue
        if detect(lyrics[:100]) != 'en':
            continue

        # add line
        collection.append([tags, keywords_str, lyrics])

        if i % 100 == 0:
            print(i)

    with open(output_csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        # writer.writerow(['tags','keywords','lyrics'])
        writer.writerows(collection)
Example #15
    def get_keywords(messages, topics):
        rake = Rake()
        rake.extract_keywords_from_sentences(
            [message['text'] for message in messages])

        # Only bi-grams
        filtered = [
            item for item in rake.get_ranked_phrases_with_scores()
            if len(item[1].split()) == 2
        ]

        # Filter only nouns bi-grams
        keywords_with_score = []
        for item in filtered:
            score = item[0]
            keyword = item[1]
            words = keyword.split()
            tags = pos_tag(words)
            should_include = 'NN' in tags[0][1] and 'NN' == tags[1][1]
            for word in words:
                synset = wn.synsets(word)
                if not synset:
                    should_include = False
                    break
                if synset[0].pos() != 'n':
                    should_include = False

            if should_include:
                keywords_with_score.append(item)

        extracted_keywords = [
            item[1] for item in keywords_with_score[:EXTRACT_KEYWORDS_COUNT]
        ]
        extracted_keywords.extend([topic['text'].lower() for topic in topics])
        return list(set(extracted_keywords))
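The names used above come from the surrounding module; a minimal setup sketch for them (the value of EXTRACT_KEYWORDS_COUNT is an assumption):

from nltk import pos_tag
from nltk.corpus import wordnet as wn
from rake_nltk import Rake

EXTRACT_KEYWORDS_COUNT = 10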
Example #16
# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""

import gensim

import pdfminer3
txt = "C:/RandD/Ex1/test-EVD-SEARO.txt"

openfile = open(txt, "r")

lines = openfile.readlines()

import rake_nltk
from rake_nltk import Rake

r = Rake()  # Uses stopwords for English from NLTK, and all punctuation characters.

r.extract_keywords_from_sentences(lines)

phraselist = r.get_ranked_phrases_with_scores()  # Keyword phrases ranked highest to lowest, with scores.

for i in phraselist[:5]:
    print("Phrase: ", i[1], " score: ", i[0])
Example #17
def getSentenceFeature(tokens, wordVectors, sentence, keyword = 'off', postag = 'off'):
    """ Obtain the sentence feature for sentiment analysis by averaging its word vectors """
    # Implement computation for the sentence features given a sentence.                                                       
    
    # Inputs:                                                         
    # - tokens: a dictionary that maps words to their indices in    
    #          the word vector list                                
    # - wordVectors: word vectors (each row) for all tokens                
    # - sentence: a list of words in the sentence of interest 

    # Output:                                                         
    # - sentVector: feature vector for the sentence
    if keyword == 'on':
        r = Rake()
        r.extract_keywords_from_sentences(sentence)
        sentence_new = r.get_ranked_phrases()
        sentVector = np.zeros((wordVectors.shape[1],))
        n = len(sentence_new)
        ### YOUR CODE HERE
        if n == 0:
            for word in sentence:
                token = tokens.get(word, 19536)
                wordVector = wordVectors[token]
                sentVector += wordVector
            n = len(sentence)
        else:
            for word in sentence_new:
                token = tokens.get(word, 19536)
                wordVector = wordVectors[token]
                sentVector += wordVector
            n = len(sentence_new)
        sentVector /= n
    elif postag == 'on':
        sentVector = np.zeros((wordVectors.shape[1],))
        tags = nltk.pos_tag(sentence)
        sentence_new = []
        tag_list = ['RB','RBR','RBS','UH','VB','VBD','VBG','VBN','VBP','VBZ','WRB','JJ','JJR','JJS','NN']
        for tag in tags:
            if tag[1] in tag_list:
                sentence_new.append(tag[0])
        n = len(sentence_new)
        if n == 0:
            for word in sentence:
                token = tokens.get(word, 19536)
                wordVector = wordVectors[token]
                sentVector += wordVector
            n = len(sentence)
        else:
            for word in sentence_new:
                token = tokens.get(word, 19536)
                wordVector = wordVectors[token]
                sentVector += wordVector
            n = len(sentence_new)
        sentVector /= n

    else:
        sentVector = np.zeros((wordVectors.shape[1],))

        ### YOUR CODE HERE
        for word in sentence:
            token = tokens.get(word, 19536)
            wordVector = wordVectors[token]
            sentVector += wordVector
        n = len(sentence)
        sentVector /= n

    ### END YOUR CODE
    
    return sentVector
Example #18
#!/usr/bin/python3
# coding: utf-8
# pip install rake-nltk
from rake_nltk import Rake
from nltk import tokenize
r = Rake()  # Uses stopwords for English from NLTK, and all punctuation characters by default
##################################################################
## Extraction given the text.
mytext = '''Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered.
            Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given.
            These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.'''
r.extract_keywords_from_text(mytext)
print(r.get_ranked_phrases())  # To get keyword phrases ranked highest to lowest.
# ['linear diophantine equations', 'minimal generating sets', 'minimal supporting set', 'minimal set', 'upper bounds', 'strict inequations', 'nonstrict inequations', 'mixed types', 'corresponding algorithms', 'considered types', 'types', 'considered', 'algorithms', 'used', 'systems', 'system', 'solving', 'solutions', 'given', 'criteria', 'construction', 'constructing', 'components', 'compatibility']
print(r.get_ranked_phrases_with_scores())  # To get keyword phrases ranked highest to lowest with scores.
# [(9.0, 'linear diophantine equations'), (8.666666666666666, 'minimal generating sets'), (8.166666666666666, 'minimal supporting set'), (5.166666666666666, 'minimal set'), (4.0, 'upper bounds'), (4.0, 'strict inequations'), (4.0, 'nonstrict inequations'), (3.666666666666667, 'mixed types'), (3.5, 'corresponding algorithms'), (3.166666666666667, 'considered types'), (1.6666666666666667, 'types'), (1.5, 'considered'), (1.5, 'algorithms'), (1.0, 'used'), (1.0, 'systems'), (1.0, 'system'), (1.0, 'solving'), (1.0, 'solutions'), (1.0, 'given'), (1.0, 'criteria'), (1.0, 'construction'), (1.0, 'constructing'), (1.0, 'components'), (1.0, 'compatibility')]
##################################################################
## Extraction given the list of strings where each string is a sentence.
r.extract_keywords_from_sentences(tokenize.sent_tokenize(mytext))
print(r.get_ranked_phrases())
# ['linear diophantine equations', 'minimal generating sets', 'minimal supporting set', 'minimal set', 'upper bounds', 'strict inequations', 'nonstrict inequations', 'mixed types', 'corresponding algorithms', 'considered types', 'types', 'considered', 'algorithms', 'used', 'systems', 'system', 'solving', 'solutions', 'given', 'criteria', 'construction', 'constructing', 'components', 'compatibility']
print(r.get_ranked_phrases_with_scores())
# [(9.0, 'linear diophantine equations'), (8.666666666666666, 'minimal generating sets'), (8.166666666666666, 'minimal supporting set'), (5.166666666666666, 'minimal set'), (4.0, 'upper bounds'), (4.0, 'strict inequations'), (4.0, 'nonstrict inequations'), (3.666666666666667, 'mixed types'), (3.5, 'corresponding algorithms'), (3.166666666666667, 'considered types'), (1.6666666666666667, 'types'), (1.5, 'considered'), (1.5, 'algorithms'), (1.0, 'used'), (1.0, 'systems'), (1.0, 'system'), (1.0, 'solving'), (1.0, 'solutions'), (1.0, 'given'), (1.0, 'criteria'), (1.0, 'construction'), (1.0, 'constructing'), (1.0, 'components'), (1.0, 'compatibility')]
Example #19
from data_helper import get_reviews_from_database
from pre_processing import pre_process, get_stop_words

sentences = get_reviews_from_database(4536405)
formatted_sentences = []

for sentence in sentences:
    formatted_sentences.append(pre_process(sentence['content']))

from rake_nltk import Rake

# Uses stopwords for English from NLTK, and all punctuation characters by
# default

stop_words = get_stop_words("data/stopwords.txt")
r = Rake(stopwords=stop_words, language="vietnamese")

# Extraction given the list of sentences.
r.extract_keywords_from_sentences(formatted_sentences)

phrases = r.get_ranked_phrases()
for phrase in phrases:
    print(phrase)
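A plausible implementation of the get_stop_words helper imported above, assuming one stop word per line in the file:

def get_stop_words(path):
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]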
Example #20
def extractKeywords(document):
    r = Rake()
    # extract_keywords_from_sentences() returns None; read the ranked
    # phrases back from the Rake instance instead
    r.extract_keywords_from_sentences(document)
    return r.get_ranked_phrases()
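A usage sketch; document is expected to be a list of sentence strings:

print(extractKeywords(["RAKE ranks candidate phrases by word degree.",
                       "Stop words split the text into candidates."]))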
Example #21
)

title_id = 0
for l in range(len(lines)):
    if lines[l].strip().startswith("<EOS>"):
        continue
    title = lines_title[title_id].strip()
    title_id += 1
    document = lines[l].replace('t outline . <s>', '').replace(
        ' <p> ', ' ').replace('  ', ' ').strip().replace(' <s> ',
                                                         '\n').split('\n')
    body = lines[l].replace('t outline . <s>', '').strip()

    try:
        r = Rake()
        r.extract_keywords_from_sentences(document)
        top_features = r.get_ranked_phrases()
        top_features = clean_top_features(top_features, topK)
    except Exception:
        print(document)
        continue

    keywordsSTR = convert_keys_to_str(top_features)

    if len(title) > 2:
        title = title.lower().replace("paid notice :",
                                      "").replace("paid notice:",
                                                  "").replace("journal;",
                                                              "").strip()
        keywordsSTR = title + '[SEP]' + keywordsSTR
        if len(keywordsSTR.split(' ')) > 100:
Example #22
    def generate_questions(self, data):
        text = data['text']
        title = self.nlp(data['title'])
        subject = self.nlp(data['subject'])
        doc = self.nlp(text)
        generated = []
        ranking_metrics = [Metric.WORD_DEGREE]
        sentences = [s.text for s in doc.sents]
        phrases_swisscom = self.generate_keywords(text)
        phrases = []
        # EG: [['egyptian president gamal abdel nasser', 'suez canal', 'israeli war', 'arab world', 'egypt', 'suez crisis', 'soviet union', 'nasser', 'tripartite aggression', 'israel'], [1.0, 0.8614248633384705, 0.8030354976654053, 0.7896698713302612, 0.811191737651825, 0.8514521718025208, 0.6438262462615967, 0.8813737034797668, 0.5584405660629272, 0.7795075178146362], [['nasser'], ['canal'], [], [], [], [], [], ['egyptian president gamal abdel nasser'], [], []]]

        if phrases_swisscom is None:
            # fallback option just in case swisscom isn't working
            for metric in ranking_metrics:
                r = Rake(ranking_metric=metric, min_length=1, max_length=5)

                # Extraction given the sentences as a list of strings.
                r.extract_keywords_from_sentences(sentences)

                # To get keyword phrases ranked highest to lowest and strip out the last half.
                keywords = r.get_ranked_phrases()
                keywords = keywords[0:round(len(keywords) * 0.5)]
                phrases.extend(keywords)
        else:
            print(phrases_swisscom)
            phrases = sorted(
                [(p, phrases_swisscom[1][i])
                 for i, p in enumerate(phrases_swisscom[0])
                 if title.similarity(self.nlp(p)) < self.title_similarity
                 and subject.similarity(self.nlp(p)) < self.title_similarity],
                key=lambda x: x[1],
                reverse=True)
            phrases = [p for p, s in phrases if s > 0.5]

        generated = []
        sentences_used = {s: 0 for s in doc.sents}
        phrases_used = []

        for tok in doc:
            for phrase in phrases:
                tok_sent_i = tok.i - tok.sent.start
                tok_sent_end = tok_sent_i + len(phrase.split())
                same_len = tok.sent[tok_sent_i:tok_sent_end]
                if [t.lower_ for t in same_len] == phrase.lower().split():
                    similarity = [
                        self.nlp(phrase).similarity(p) for p in phrases_used
                    ]  # figure out empty vector
                    if sentences_used[tok.sent] < 3 and max(
                            similarity, default=0.0) < self.title_similarity:
                        toks_with_ws = [
                            token.text_with_ws for token in tok.sent
                        ]
                        long_gap_toks = deepcopy(toks_with_ws)
                        for i in range(tok_sent_i, tok_sent_end):
                            long_gap_toks[i] = '_' * len(tok.sent[i])
                            if i == tok_sent_i:
                                toks_with_ws[i] = '_____'
                            else:
                                toks_with_ws[i] = ''
                        long_gap_toks[tok_sent_end - 1] = long_gap_toks[
                            tok_sent_end - 1] + tok.sent[tok_sent_end -
                                                         1].whitespace_
                        toks_with_ws[tok_sent_end - 1] = toks_with_ws[
                            tok_sent_end - 1] + tok.sent[tok_sent_end -
                                                         1].whitespace_
                        pair = {
                            "question": "".join(long_gap_toks),
                            "answer":
                            "".join([t.text_with_ws for t in same_len]),
                            "sentence": tok.sent.text,
                            "short_gap": "".join(toks_with_ws)
                        }
                        sentences_used[tok.sent] += 1
                        phrases_used.append(self.nlp(phrase))
                        generated.append(pair)

        return generated
Example #23
from rake_nltk import Rake

# Uses stopwords for English from NLTK, and all punctuation characters by
# default
r = Rake()

# Extraction given the text.
#r.extract_keywords_from_text("keyword_tests.txt")

x = []
with open("question_keyword_tests.txt") as f:
    for line in f:
        x.append(line)
        r.extract_keywords_from_text(line)
        #print(r.get_ranked_phrases_with_scores())
        print(r.get_ranked_phrases())


# Extraction given the list of strings where each string is a sentence.
r.extract_keywords_from_sentences(x)

# To get keyword phrases ranked highest to lowest.
phrases = r.get_ranked_phrases()

# To get keyword phrases ranked highest to lowest with scores.
phrases_with_scores = r.get_ranked_phrases_with_scores()

# print(phrases)
# print(phrases_with_scores)
Example #24
for entry in patterns:
    print('{}: {}'.format(entry[0], entry[1]))

####################### Question 2.1 ##############################

# Get keywords using RAKE
titles = get_values(data, 'title')
titles = sorted(titles)

stopwords = nltk.corpus.stopwords.words('english')
extra_stopwords = ['using', 'via', 'without',
                   'towards', 'toward', 'based']
stopwords += extra_stopwords

extractor = Rake(stopwords=stopwords)
extractor.extract_keywords_from_sentences(titles)

keywords = extractor.get_ranked_phrases()
# Only keep keywords with at most 5 words
keywords = [k for k in keywords if len(k.split()) <= 5]
# Keep the top 20000 keywords
keywords = keywords[:20000]

# Get keyword of each team's research
print_divider('=', 'Question 2.1: Team Interests')
for team in teams:
    team_data = get_team_data(data, team)
    team_titles = get_values(team_data, 'title')
    team_titles = sorted(team_titles)
    team_freq = get_ranked_keyword_frequency(team_titles, keywords)
Example #25
#Project 3.1 - Extract Keywords Article

#rake-nltk short for Rapid Automatic Keyword Extraction algorithm
from rake_nltk import Rake

r = Rake()

text = " The Platform of the Future? \
\
The survival of any organization depends on its ability to outperform competitors and marketplaces in attracting and rewarding talent, ideas and capital. As communication and transaction costs have drastically declined because of the internet, new platforms have emerged, delivering goods and services at a speed and efficiency previously unimaginable. These new digital players took advantage of the changes in the underlying technology to challenge established business models and rethink pre-existing value chains. The ones that succeeded did so because they achieved a level of efficiency that their brick and mortar counterparts had trouble replicating. Through online reputation and feedback systems, digital players were able to create global marketplaces where individuals, products and services could be matched more effectively than ever before. By providing curation and ensuring the safety of transactions, these new types of intermediaries were able to reap the returns of this first wave of digitization. \
\
A similar transformation is about to happen as blockchain technology and cryptocurrencies mature and mainstream applications emerge. Under this new wave of technological change, intermediaries will still be able to add value to transactions, but the nature of intermediation will fundamentally change. Whereas some established players will be able to use this opportunity to further scale their operations, others will be challenged by new entrants proposing entirely new approaches to value creation and value capture.\
\
Complementing Artificial Intelligence with Human Intelligence "

#Extraction given a text
r.extract_keywords_from_text(text)

#Extraction given a list of strings, where each string is a sentence
myList = ["ability to outperform", "blockchain", "survival organization", "cryptocurrencies emerge", "artificial intelligence blockchain"]
r.extract_keywords_from_sentences(myList)

#To get keyword phrases ranked highest to lowest
print(r.get_ranked_phrases())

#To get Keyword phrases ranked highest to lowest with scores
print(r.get_ranked_phrases_with_scores())


Example #26
def menu_response(reviews):
    if not reviews:
        return [], 400

    # Materialise the nested filters as lists: Python 3 filter objects have
    # no len() and can only be consumed once, but the reviews are reused by
    # score_phrases below.
    reviews = [
        [s for s in map(utils.simple_preprocess, i.split('.')) if s]
        for i in reviews
    ]
    reviews = [review for review in reviews if review]
    r = Rake()
    key_phrases = []
    for review in reviews:
        r.extract_keywords_from_sentences(sentences(review))
        for i in sorted(r.get_ranked_phrases(),
                        key=lambda s: similarity_to_food(ml.sentence_model, s),
                        reverse=True)[:5]:
            key_phrases.append(i)

    similar_phrases = []
    for p1, p2 in map(lambda t: (t[0].split(), t[1].split()),
                      itertools.combinations(key_phrases, 2)):
        try:
            similar_phrases.append(
                (p1, p2, ml.sentence_model.wv.n_similarity(p1, p2)))
        except KeyError:
            pass
    similar_phrases = sorted(similar_phrases, key=lambda t: t[2], reverse=True)
    '''food_phrases = set()
    while similar_phrases and similar_phrases[0][2] > 0.9:
        shrt, lng = sorted([' '.join(similar_phrases[0][0]), ' '.join(similar_phrases[0][1])], key=len)
        if shrt not in food_phrases and lng not in food_phrases:
            food_phrases.add(shrt)
        try:
            key_phrases.remove(shrt)
        except KeyError, e:
            try:
                key_phrases.remove(lng)
            except KeyError:
                pass
        similar_phrases.pop(0)'''

    food_phrases, rejected, queue = set(), set(), set()
    for p1, p2, similarity in similar_phrases:
        if similarity < 0.75:
            break
        p1 = ' '.join(p1)
        p2 = ' '.join(p2)
        if p1 in food_phrases | rejected or p2 in food_phrases | rejected:
            continue
        if p1 in queue or p2 in queue:
            if len(p1) < len(p2):
                food_phrases.add(p1)
                rejected.add(p2)
            else:
                food_phrases.add(p2)
                rejected.add(p1)
        else:
            queue.add(p1)
            queue.add(p2)

    return sorted(score_phrases(food_phrases, reviews),
                  key=lambda t: t[1],
                  reverse=True), 200
Example #27
def get_key_words(comments):
    r = Rake()
    r.extract_keywords_from_sentences(comments)
    return r.get_ranked_phrases_with_scores()
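A usage sketch with made-up review comments:

comments = ["The battery life is excellent.",
            "The screen quality could be better."]
for score, phrase in get_key_words(comments):
    print(score, phrase)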