Example #1
def GetRelevanteKeyWords(text, quantity, language):

    r = Rake(language=language)

    r.extract_keywords_from_text(text)

    teste = r.get_word_degrees()

    keyWordsSorted = sorted(teste.items(), key=lambda x: x[1], reverse=True)

    keyWordsRelevante = [x[0] for x in keyWordsSorted[0:quantity]]

    return {'keywords': keyWordsRelevante, 'result': keyWordsSorted}
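A minimal usage sketch of the function above; the sample text and printed output are illustrative assumptions, and rake-nltk must be importable:

from rake_nltk import Rake  # also required by GetRelevanteKeyWords itself

text = ("Compatibility of systems of linear constraints over the set of "
        "natural numbers is studied here.")
result = GetRelevanteKeyWords(text, quantity=3, language="english")
print(result['keywords'])  # e.g. ['linear', 'constraints', 'natural'] (tie order may vary)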
Example #2
def extract_and_update_video_keywords(video_id, video_content):
    n = 5
    rake = Rake(stopwords=stop_words)  # stop_words is assumed to be defined elsewhere in the module
    rake.extract_keywords_from_text(video_content)
    # list of (word, count) tuples ordered by count, descending
    top_n_keywords = rake.get_word_frequency_distribution().most_common(n)
    top_n_keywords_str = ", ".join(
        [kw_tuple[0] for kw_tuple in top_n_keywords])
    update_videos_meta_data(video_id, "Keywords", top_n_keywords_str)
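This snippet assumes stop_words and update_videos_meta_data are defined elsewhere in the module. Hypothetical stand-ins for a self-contained run (both are assumptions, not the original code):

from nltk.corpus import stopwords
from rake_nltk import Rake

stop_words = stopwords.words("english")  # assumes nltk.download('stopwords') has run

def update_videos_meta_data(video_id, field, value):
    # Placeholder: the real project presumably persists this to a data store.
    print(f"video {video_id}: {field} = {value}")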
Example #3
    def run(self, text, val):
        """
        TODO Improvements:
        1. casual_tokenize can't handle 'words-with-hyphens-like-this' & reduces coverage
        """

        # Remove new lines and convert to lower case
        # TODO: support reading only the first x lines (only needed for ML)
        self.val = val

        text = re.sub('\n', ' ', text).lower()

        # Extract keyphrases using Rake
        # TODO also possible to extract keywords from sentence
        rake = Rake()
        if val == 'article':
            rake.extract_keywords_from_text(text)
        elif val == 'social':
            # NOTE: extract_keywords_from_sentences expects a list of
            # sentences, so 'text' should be a list here rather than the
            # plain string produced above.
            rake.extract_keywords_from_sentences(text)
        all_phrases = rake.get_ranked_phrases_with_scores()
        word_freq_dist = rake.get_word_frequency_distribution()

        # Tokenize text
        article_text_tokenized = casual_tokenize(text)

        # Tokenize phrases
        all_phrases_tokenized = self.tokenize_phrases(all_phrases)

        # Tag all phrases and remove all but noun words
        all_phrases_tagged = self.pos_tag_phrase_pairs(all_phrases)
        all_phrases_tagged_nouns = self.filter_nouns(all_phrases_tagged)

        # Convert list of tagged nouns back to a string phrase
        string_phrases_nouns = self.tuple_list_to_string_list(
            all_phrases_tagged_nouns)

        # Get the indexes from the non-filtered suggested phrases in the original text
        all_surrounding_tokens, all_context_tokens = self.get_all_surrounding_tokens(
            all_phrases_tokenized, article_text_tokenized)

        # Get wikipedia urls for the top 10 phrases
        mapping_list = self.get_wiki_urls_top_n_phrases(
            string_phrases_nouns, all_surrounding_tokens, 10)

        # Return mapping to console
        wiki_mapping = self.write_suggestions_to_json(mapping_list)
        # print(json.dumps(wiki_mapping))

        # Get page links on medium by phrase
        medium_mapping = self.get_n_listed_medium_posts(
            string_phrases_nouns, 2)
        # print(json.dumps(medium_mapping))

        # Combine jsons
        mapping = self.combine_mappings(wiki_mapping, medium_mapping)
        print(json.dumps(mapping))
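The helpers used here (tokenize_phrases, pos_tag_phrase_pairs, filter_nouns, ...) live elsewhere in the class. As one example of their likely shape, a guess at tokenize_phrases, assuming all_phrases holds the (score, phrase) tuples returned by get_ranked_phrases_with_scores():

from nltk.tokenize import casual_tokenize

def tokenize_phrases(self, scored_phrases):
    # scored_phrases: list of (score, phrase) tuples from
    # get_ranked_phrases_with_scores(); tokenize only the phrase part
    return [casual_tokenize(phrase) for _, phrase in scored_phrases]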
Example #4
    def rakeResult(self):
        text_doc = self.getDoc()
        r = Rake()
        r.extract_keywords_from_text(text_doc)
        ranked_w_score = r.get_ranked_phrases_with_scores()
        ranked = r.get_ranked_phrases()
        word_degree = r.get_word_degrees()
        word_freq_dist = r.get_word_frequency_distribution()

        return ranked_w_score, ranked, word_degree, word_freq_dist
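For reference, the four rake-nltk accessors returned above can be exercised directly; a small sketch (the sample text is an assumption):

from rake_nltk import Rake

r = Rake()
r.extract_keywords_from_text("Red apples have good flavour. Red is a colour.")
print(r.get_ranked_phrases_with_scores())   # [(score, phrase), ...], best first
print(r.get_ranked_phrases())               # phrases only, same order
print(r.get_word_degrees())                 # word -> degree
print(r.get_word_frequency_distribution())  # word -> frequency (Counter-like)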
Example #5
def calculate_rake_ranking(just_words):

    # Initializes the Rake object
    r = Rake()

    # Meant to contain each word in a string
    words_string = ''

    # Extracts only the word itself as a string
    for word_array in just_words:
        words_string += word_array[0] + " "

    # The Rake object ranks all the words in the string
    r.extract_keywords_from_text(words_string)

    # Both accessors below return a dict-like mapping (word -> value)
    frequency_distribution = r.get_word_frequency_distribution()  # word -> frequency
    word_degrees = r.get_word_degrees()  # word -> degree (co-occurrence count)

    # Meant to contain RAKE ranking which aren't scaled yet
    rake_not_scaled = []

    # Appends the ranking to each word's array
    for word_array in just_words:

        # Dictionary lookups (defaulting to 1) match the word to its
        # frequency and degree
        word_frequency = frequency_distribution.get(word_array[0], 1)
        word_degree = word_degrees.get(word_array[0], 1)

        # Formula in accordance with the chosen metric: degree / frequency
        ranking = word_degree / word_frequency

        rake_not_scaled.append(ranking)

    # Scales the values of the RAKE rankings to [0, 2]
    scaler = MinMaxScaler(feature_range=(0, 2))
    rake_scaled = scaler.fit_transform(
        np.asarray(rake_not_scaled).reshape(-1, 1))
    rake_scaled = [float(ranking) for ranking in rake_scaled]

    return rake_scaled
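To make the scaling step concrete, a toy run of MinMaxScaler with feature_range=(0, 2) on made-up degree/frequency ratios:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

raw = [1.0, 2.5, 4.0]  # made-up degree/frequency ratios
scaler = MinMaxScaler(feature_range=(0, 2))
scaled = scaler.fit_transform(np.asarray(raw).reshape(-1, 1))
print([float(x) for x in scaled])  # [0.0, 1.0, 2.0]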
Example #6
    def test_build_frequency_dist(self):
        r = Rake()

        phrase_list = [['red', 'apples'], ['good'], ['red'], ['flavour']]
        freq = defaultdict(lambda: 0)
        freq['apples'] = 1
        freq['good'] = 1
        freq['flavour'] = 1
        freq['red'] = 2
        r._build_frequency_dist(phrase_list)
        self.assertEqual(r.get_word_frequency_distribution(), freq)
Example #7
    def test_build_frequency_dist(self):
        r = Rake()

        phrase_list = [["red", "apples"], ["good"], ["red"], ["flavour"]]
        freq = defaultdict(lambda: 0)
        freq["apples"] = 1
        freq["good"] = 1
        freq["flavour"] = 1
        freq["red"] = 2
        r._build_frequency_dist(phrase_list)
        self.assertEqual(r.get_word_frequency_distribution(), freq)
Example #8
def extract_lemmatize_dictionary(file_list):
    with open(stopwords_file_path + 'stopwords.json') as fp:
        stopwords = json.load(fp)

    r = Rake(stopwords=stopwords)

    file_to_words = {}

    lemmatizer = WordNetLemmatizer()

    file_list = sorted(file_list)

    i = 0

    for f in file_list:

        counter = collections.Counter()

        i = i + 1
        print("Processed {:.1f}%".format((i * 100.0) / len(file_list)))

        with open(file_path + f) as doc:
            lines = [line.rstrip('\n') for line in doc]
        for line in lines[6:-2]:

            r.extract_keywords_from_text(line)
            d = dict(r.get_word_frequency_distribution())
            lemmatized = {}

            # Drop non-alphabetic and short tokens; build a new dict rather
            # than mutating d while iterating over it
            d = {k: v for k, v in d.items() if is_alpha(k) and len(k) >= 3}

            for k, v in d.items():
                tag = get_wordnet_pos(nltk.pos_tag([k])[0][1])
                if tag != '':
                    lem = lemmatizer.lemmatize(k, tag)
                    if len(lem) < 3:
                        continue
                    lemmatized[lem] = lemmatized.get(lem, 0) + v

            counter.update(lemmatized)

        file_to_words[f] = dict(counter)

    with open('file_to_words.json', 'w') as fp:
        json.dump(file_to_words, fp, sort_keys=True, indent=3)
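The function relies on a get_wordnet_pos helper that is not shown. A common implementation (an assumption here, not the original) maps Penn Treebank tags to the WordNet POS constants expected by lemmatize():

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # An empty string makes the caller skip the word (see "if tag != ''" above)
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return ''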
Example #9
def tagging(filename):

    df = pd.read_excel(filename)
    cols = df.columns.values
    r = Rake()
    df2 = pd.DataFrame()
    for i in cols:
        # print(i)
        sent = [str(j) for j in df[i].values if j != 0]
        # print(sent)
        r.extract_keywords_from_text(" ".join(sent))
        # print(r.get_word_frequency_distribution())
        # print(r.get_word_degrees())
        fdis = r.get_word_frequency_distribution()
        wdig = r.get_word_degrees()
        fdis_ls = []
        wdig_ls = []
        wdig = dict(
            sorted(wdig.items(), key=lambda item: item[1], reverse=True))
        # print(wdig)
        for j in fdis.most_common():
            # print(j[0])
            if len(j[0]) > 3:
                fdis_ls.append(j[0])
        for j in wdig.keys():
            if len(j) > 3:
                wdig_ls.append(j)
        print(fdis_ls[:5])
        print(wdig_ls[:5])
        res = [fdis_ls[:5], wdig_ls[:5]]
        df2[i] = res

        # break
    df2.to_excel("datafile/tagged.xlsx")
Example #10
    #formatted_string = re.sub(r' [0-9]{1,3} ', ' ', formatted_string)
    formatted_string = re.sub(r' [a-z]{1,2} ', ' ', formatted_string)

    print("FINAL", formatted_string)
    return formatted_string


def get_email(html_string):
    email_words = whitespace_wt.tokenize(html_string.lower())
    for element in email_words:
        #element = "*****@*****.**"
        if re.match(r".*@.*\.(de|com|net)", element) is not None:
            print("Email found:" + element)


browser.get(
    "https://www.schwaebische.de/landkreis/bodenseekreis/tettnang_artikel,-junge-union-will-partty-bus-verwirklichen-_arid,10701303.html"
)
html_of_search = browser.page_source
html = BeautifulSoup(html_of_search, "html.parser")
html_string = str(html.body.text)
html_string = formate_input_text(html_string)
#html_string = test(html_string)

r = Rake()  # not shown in this fragment; needed before extracting keywords
r.extract_keywords_from_text(html_string)

print(r.get_ranked_phrases())
print(r.get_ranked_phrases_with_scores())
print(r.get_word_degrees())
print(r.get_word_frequency_distribution())
import csv
import json

from rake_nltk import Rake

cons = []
pros = []
conString = ''
proString = ''

# with open('../data/cons.csv') as csvFile:
#     pros = list(filter(lambda row: len(row) > 0, csv.reader(csvFile, delimiter='\n')))
#     pros = list(map(lambda row: row[0], pros))
#     proString = ''.join(pros)

# r = Rake(min_length=3, max_length=4)
# r.extract_keywords_from_text(proString)
# conFrequency = r.get_word_frequency_distribution()

# with open('../data/confrequency.json', 'w') as jsonfile:
#     json.dump(conFrequency, jsonfile)


with open('../data/pros.csv') as csvFile:
    pros = list(filter(lambda row: len(row) > 0, csv.reader(csvFile, delimiter='\n')))
    pros = list(map(lambda row: row[0], pros))
    proString = ' '.join(pros)  # join with spaces so words at row boundaries don't fuse

r = Rake(min_length=3, max_length=4)
r.extract_keywords_from_text(proString)
proFrequency = r.get_word_frequency_distribution()

with open('../data/profrequency.json', 'w') as jsonfile:
    json.dump(proFrequency, jsonfile)
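A quick round-trip check (illustrative, not part of the original): the dumped frequency distribution loads back as a plain word -> count dict:

with open('../data/profrequency.json') as jsonfile:
    freq = json.load(jsonfile)
print(sorted(freq.items(), key=lambda kv: kv[1], reverse=True)[:10])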