Example #1
def get_spam_keywords(spam_features, ham_features):
    #POPULATE THE SPAM AND HAM TEXT BLOBS
    text_spam = ''
    text_ham = ''

    for pr in spam_features:
        text_spam += get_keywords(pr)

    for pr in ham_features:
        text_ham += get_keywords(pr)

    text_spam = re.sub('[^a-zA-Z0-9 \n\.]', ' ', text_spam)
    text_ham = re.sub('[^a-zA-Z0-9 \n\.]', ' ', text_ham)
    # print(text_spam,"\n------------------------------------------\n",text_ham)
    #INITIALISE RAKE FOR POPULAR WORDS
    rake = Rake(max_words=2, min_freq=5)

    #EXTRACT POPULAR KEYWORDS FOR SPAM AND HAM
    keywords_spam = rake.apply(text_spam.lower())
    keywords_ham = rake.apply(text_ham.lower())

    # print(keywords_ham)
    # print(keywords_spam)

    spam = [spam_keyword[0] for spam_keyword in keywords_spam[:50]]
    ham = [ham_keyword[0] for ham_keyword in keywords_ham[:50]]

    # GENERATE KEYWORDS PRESENT IN SPAM WHICH ARE NOT PRESENT IN HAM
    spam_final = []
    for word in spam:
        if word not in ham:
            spam_final.append(word)

    # print(spam_final)
    return spam_final
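Example #1 leans on a get_keywords(pr) helper that is not shown in the excerpt; the RAKE-specific step is just two apply() calls and a set difference over the top-ranked phrases. A minimal, self-contained sketch of that step (the function name and plain-string inputs here are illustrative, not the original author's):

from multi_rake import Rake

def spam_only_keywords(text_spam, text_ham, top_n=50):
    rake = Rake(max_words=2, min_freq=5)
    # rake.apply returns (keyword, score) tuples sorted by descending score
    spam = [kw for kw, _ in rake.apply(text_spam.lower())[:top_n]]
    ham = {kw for kw, _ in rake.apply(text_ham.lower())[:top_n]}
    # keep spam keywords that never appear among the ham keywords
    return [kw for kw in spam if kw not in ham]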
Example #2
    def get_keyword_polarity(sentence):

        #adds the polarity value of a keyword to the keyword_polarity list, so we can compute the mean polarity for the keyword later
        def add_pol_to_keyword_pol(keyword, pol):
            for e in keyword_polarity:
                if e["word"] == keyword:
                    e["pol"].append(pol)

        sentences = tokenizer.tokenize(sentence)

        r = Rake()

        for s in sentences:
            #extract phrases -> no fill words
            r.extract_keywords_from_text(s)  #rake-nltk
            phrases = r.get_ranked_phrases()  #rake-nltk
            #phrases = r.apply(s) #multi-rake
            for phrase in phrases:
                for k in keywords:
                    #check if the keywords appear in the phrase
                    if k in phrase:
                        print(k)
                        polarity_phrase = get_review_sentiment(phrase)
                        if polarity_phrase != 0.0:  #if the polarity != 0, the phrase is something like "good beers" -> save the polarity value in keyword_polarity
                            add_pol_to_keyword_pol(k, polarity_phrase)
                            #print((phrase, polarity_phrase))
                            #print("\n")
                        else:  #if the polarity == 0, the phrase is the keyword itself, like 'beer', so we take the polarity of the whole sentence instead -> save it in keyword_polarity
                            polarity_sentence = get_review_sentiment(s)
                            add_pol_to_keyword_pol(k, polarity_sentence)
Example #3
    def get(self, language_code=None):
        rake = Rake(language_code=language_code)

        text = request.form.get('text')
        if text:
            return rake.apply(text)

        return 'No text given', 400
Example #4
    def post():
        posted_data = request.get_json()
        text = posted_data['text']
        rake = Rake()
        keywords = rake.apply(text)
        text = [i[0] for i in keywords]

        return jsonify({'Keywords': text})
Example #5
def get_RAKE(article):
    rake = Rake()
    keywords = rake.apply(article)
    # keep the top 10 keyword strings (or fewer if RAKE finds fewer)
    topKeywords = [keyword for keyword, _ in keywords[:10]]
    #print(topKeywords)
    return topKeywords
Example #6
def getKeyWords(text):
    rake = Rake()
    keywords = rake.apply(text)
    sortedKw = []
    for keyword in keywords:
        # RANK is a score threshold defined elsewhere in the project
        if keyword[1] > RANK:
            sortedKw.append(keyword[0])
    return sortedKw
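Example #6 (and Example #18 below) filters on a RANK constant defined elsewhere in the project. A small, self-contained sketch of the same score-threshold pattern; the RANK value here is an arbitrary placeholder, not the original setting:

from multi_rake import Rake

RANK = 4.0  # placeholder threshold; the original projects define their own value

def keywords_above_rank(text):
    rake = Rake()
    # keep only the keyword strings whose RAKE score exceeds the threshold
    return [kw for kw, score in rake.apply(text) if score > RANK]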
Example #7
    def get_keywords(self, article):
        """
        Find the keywords in article and return them in a convenient way.
        :params:
            article, list of sentences, each sentence a list of "word label" strings
        :returns:
            keywords, dict mapping each keyword (string) to the index of the sentence it occurs in
        """
        #here we save the labels that will NOT be changed
        labeltype = set()
        if self.label is not None:
            if self.label == 'neutral':
                labeltype.add('B-SPAN')
                labeltype.add('I-SPAN')
            elif self.label == 'propaganda':
                labeltype.add('0')

        #additional variable that holds parts of speech that will NOT be changed
        pos_shortcuts = {'NN': 'n', 'JJ': 'adj', 'RB': 'adv', 'VB': 'v'}

        wordtypes = set()
        if self.postype is not None:
            wordtypes = {'n', 'adj', 'adv', 'v'}
            for fig in self.postype:
                wordtypes.discard(fig)

        text = self.get_text(article)
        text = ''.join(c for c in text if c not in '\'\"')
        rake = Rake()
        try:
            fig = rake.apply(text)
        except Exception:
            print('Couldn\'t find keywords, falling back to all words.')
            fig = self.get_words(article)
            return fig

        raw_keywords = []
        for string, _ in fig:
            raw_keywords += string.split()

        raw_keywords = set(raw_keywords)

        keywords = {}

        for i in range(len(article)):
            sentence = article[i]
            for comb in sentence:
                word, label = comb.split()
                word = word.lower()
                pos = nltk.pos_tag([word])[0][1]
                if pos in pos_shortcuts:
                    pos = pos_shortcuts[pos]
                if word in raw_keywords and label not in labeltype and pos not in wordtypes:
                    keywords[word] = i
        return keywords
Example #8
 def _fetch_all_sentences_keywords(self):
     """
     For each sentence object in self.video.sentences, the method gets the keywords and saves them in sentence.keywords
     :return: None
     """
     rake = Rake()
     for sentence in self.video.sentences:
         keywords_result = rake.apply(sentence.text,
                                      text_for_stopwords=None)
         keywords = [keyword[0] for keyword in keywords_result
                     ]  # Getting just keywords, without accuracy
         sentence.keywords = keywords
Example #9
def getKeywords(text):
    tokens = text.split()
    processedTokens = []
    if len(tokens) < 3:
        processedTokens = text.split()
    else:
        rake = Rake()
        keywords = rake.apply(text)
        for i in keywords:
            tempHold = i[0].split()
            for j in tempHold:
                processedTokens.append(j)
    return processedTokens
Example #10
def extract_keywords(text):
    nltk_text = word_tokenize(text)
    val = nltk.pos_tag(nltk_text)
    rake = Rake()
    keywords = rake.apply(text)
    new_val = []
    keyword = 'nothing'
    for i in range(len(val)):
        if val[i][1] == 'NN':
            new_val.append(val[i][0])
    for i in range(len(keywords)):
        for j in range(len(new_val)):
            if keywords[i][0] == new_val[j]:
                keyword = new_val[j]
    return keyword
Example #11
    def extract_keywords(self, max_words=1, min_freq=5, num_top_words=10):

        stop_words = get_stop_words('fr')

        rake = Rake(max_words=max_words,
                    min_freq=min_freq,
                    language_code="fr",
                    stopwords=stop_words)

        for i, label in enumerate(np.unique(self.labels)):
            corpus_fr = ' '.join(self.data[self.labels == label])
            keywords = rake.apply(corpus_fr)
            top_words = np.array(keywords[:num_top_words])[:, 0]
            self.keywords["Cluster {0}".format(label)] = top_words

        return self.keywords
Example #12
class TextPreprocessor(object):
    def __init__(self, lang=None):
        self.lang = lang
        self.rake = Rake(language_code=self.lang, max_words=5)

    def key_words(self, text):
        return self.rake.apply(text)
Example #13
def test_extraction_with_RAKE():
    rake = Rake(language_code="el")
    while True:
        input_doc = input()
        if input_doc == "end":
            break
        output = extract_keywords_RAKE(rake, input_doc)
        print(output)
Example #14
def process():
    keywords = []
    text = request.form['text_to_process']
    max_kw_length = int(request.form['max_kw_length'])

    if not text:
        abort(404)

    if request.method == 'POST':
        with open("data/stopwords.txt", "r") as f:
            sw = f.read()
        rake = Rake(language_code='id',
                    max_words=max_kw_length,
                    stopwords=set(sw.split("\n")))
        keywords = rake.apply(text)

    return render_template('process.html',
                           keywords=keywords,
                           text=text,
                           max_kw_length=max_kw_length)
Example #15
 def _RAKE_search(self, text, score_threshold=):
     rake = Rake(min_chars=,  # Minimum number of characters a term must have to count as a keyword.
                 max_words=,  # Maximum number of terms a candidate keyword may contain.
                 min_freq=,  # Minimum frequency of occurrence.
                 language_code='')
     keywords = rake.apply(text)

     result = []
     for (x, y) in keywords:
         if y > score_threshold:  # Minimum score.
             result.append(x)

     return result
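The parameter values in Example #15 were lost when the snippet was excerpted, so they are left blank above. A runnable sketch of the same score-threshold search, with placeholder settings chosen purely for illustration:

from multi_rake import Rake

def rake_search(text, score_threshold=1.0):  # placeholder threshold
    rake = Rake(min_chars=3,   # placeholder values; the original settings are unknown
                max_words=3,
                min_freq=1,
                language_code='es')
    # keep keywords whose RAKE score exceeds the threshold
    return [kw for kw, score in rake.apply(text) if score > score_threshold]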
Example #16
def get_keywords(text):
        rake = Rake(
            min_chars=3,
            max_words=1,
            min_freq=1,
            language_code='es',
            stopwords=None,
            lang_detect_threshold=100,
            max_words_unknown_lang=10,
            generated_stopwords_percentile=80,
            generated_stopwords_max_len=10,
            generated_stopwords_min_freq=2,
            )

        keywords = rake.apply(
            text,
            text_for_stopwords=None,
        )
        return keywords
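Example #16 spells out every constructor argument and passes text_for_stopwords=None. Example #21 (the library's own test, further down) shows what that second argument is for: when the language cannot be detected, extra text can be supplied from which stopwords are generated. A small sketch of that usage; the strings below are illustrative only:

from multi_rake import Rake

rake = Rake(max_words=2)

text = 'Liberalismo estas politika filozofio konstruita en ideoj de libereco kaj egaleco.'
reference = 'de en la kaj al ' * 20  # extra text mined only for stopword generation

keywords = rake.apply(text, text_for_stopwords=reference)
print(keywords[:5])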
Example #17
from multi_rake import Rake
import csv

#text = input()
text = "please tell me the good"
rake = Rake()

keywords = rake.apply(text)

#good = 1, bad = 0
words = dict(good=1, bad=0)
totalcount = 0
goodcount = 0
for word in keywords:
    if word[0] in words:
        totalcount += 1
        if words[word[0]]:  # count only words labelled "good" (1); "bad" maps to 0
            goodcount += 1
print(goodcount / totalcount if totalcount else 0)
Example #18
import pandas as pd
from multi_rake import Rake
import ipfshttpclient
from config import *
import arxiv

client = ipfshttpclient.connect('/ip4/127.0.0.1/tcp/5001/http')
rake = Rake()

def create_clinks_set(item, dirpath):
    paper = {'pdf_url': item['pdf_url'], "title": item['title']}
    summary = item['summary']
    title = item['title']
    summary = title + '. ' + summary

    keywords = rake.apply(summary)
    path = arxiv.download(paper, dirpath=dirpath)

    cid_to = client.add(path)['Hash']
    cid_from_list = []

    for keyword in keywords:
        # RANK is a score threshold imported from config
        if keyword[1] > RANK:
            temp = client.add_str(keyword[0])
            cid_from_list.append(temp)

    cid_from_list.append(client.add_str(title))

    data = {'title': title, 'cid_from': cid_from_list, 'cid_to': cid_to}
Example #19
    def __init__(self,
                 classifier_dims,
                 num_classes,
                 embedding_dims,
                 gaussian_noise,
                 dropout,
                 internal_dims,
                 n_layers,
                 featurizer,
                 final_layer_builder,
                 n_tokens_in=64,
                 n_tokens_out=16,
                 capabilities2dims=dict(),
                 use_as_super=False,
                 **kwargs):
        super(LangFeaturesModel, self).__init__(classifier_dims,
                                                num_classes,
                                                embedding_dims,
                                                gaussian_noise,
                                                dropout,
                                                internal_dims,
                                                n_layers,
                                                featurizer,
                                                final_layer_builder,
                                                n_tokens_in,
                                                n_tokens_out,
                                                use_as_super=True,
                                                **kwargs)
        assert "capabilities" in kwargs
        capabilities = kwargs["capabilities"]
        kwargs[
            "rake_dims"] = kwargs["rake_dims"] if "rake_dims" in kwargs else 32
        kwargs[
            "yake_dims"] = kwargs["yake_dims"] if "yake_dims" in kwargs else 32
        assert "key_phrases" not in capabilities or (
            "key_phrases" in capabilities and "spacy" in capabilities)
        use_layer_norm = kwargs[
            "use_layer_norm"] if "use_layer_norm" in kwargs else False
        self.capabilities = capabilities
        embedding_dim = 8
        cap_to_dim_map = {
            "spacy": 128,
            "snlp": 32,
            "key_phrases": 64,
            "nltk": 192,
            "full_view": 64,
            "tmoji": 32,
            "ibm_max": 16,
            "gensim": 256,
            "fasttext_crawl": 256
        }
        cap_to_dim_map.update(capabilities2dims)
        all_dims = sum([cap_to_dim_map[c] for c in capabilities])
        self.cap_to_dim_map = cap_to_dim_map
        self.all_dims = all_dims

        if "spacy" in capabilities:
            tr = pytextrank.TextRank(token_lookback=7)
            self.nlp = spacy.load("en_core_web_lg", disable=[])
            self.nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
            spacy_in_dims = (96 * 2) + (11 * embedding_dim) + 2
            self.spacy_nn = ExpandContract(spacy_in_dims,
                                           cap_to_dim_map["spacy"],
                                           dropout,
                                           use_layer_norm=use_layer_norm,
                                           groups=(2, 4))

        if "fasttext_crawl" in capabilities:
            self.bpe = BPEmb(dim=200)
            self.cngram = CharNGram()
            fasttext_crawl_file = kwargs[
                "fasttext_crawl_file"] if "fasttext_crawl_file" in kwargs else "crawl-300d-2M-subword.bin"
            self.crawl = fasttext.load_model(fasttext_crawl_file)
            self.crawl_nn = ExpandContract(200 + 300 + 100,
                                           cap_to_dim_map["fasttext_crawl"],
                                           dropout,
                                           use_layer_norm=use_layer_norm,
                                           groups=(4, 4))

        if "gensim" in capabilities:
            gensim = [
                api.load("glove-twitter-50"),
                api.load("glove-wiki-gigaword-50"),
                api.load("word2vec-google-news-300"),
                api.load("conceptnet-numberbatch-17-06-300")
            ]
            self.gensim = gensim
            self.gensim_nn = ExpandContract(400,
                                            cap_to_dim_map["gensim"],
                                            dropout,
                                            use_layer_norm=use_layer_norm,
                                            groups=(4, 4))

        if "full_view" in capabilities:
            full_sent_in_dims = 300
            self.full_sent_nn = ExpandContract(full_sent_in_dims,
                                               cap_to_dim_map["full_view"],
                                               dropout,
                                               use_layer_norm=use_layer_norm,
                                               groups=(4, 4))

        if "snlp" in capabilities:
            import stanza
            self.snlp = stanza.Pipeline(
                'en',
                processors='tokenize,pos,lemma,depparse,ner',
                use_gpu=False,
                pos_batch_size=2048)
            self.snlp_nn = ExpandContract(embedding_dim * 5,
                                          cap_to_dim_map["snlp"],
                                          dropout,
                                          use_layer_norm=use_layer_norm)
        if "key_phrases" in capabilities:
            import yake
            self.kw_extractor = yake.KeywordExtractor(lan="en",
                                                      n=3,
                                                      dedupLim=0.9,
                                                      dedupFunc='seqm',
                                                      windowsSize=3,
                                                      top=10,
                                                      features=None)

            self.key_occ_cnt_pytextrank = nn.Embedding(8, embedding_dim)
            nn.init.normal_(self.key_occ_cnt_pytextrank.weight,
                            std=1 / embedding_dim)
            self.key_wc_pytextrank = nn.Embedding(4, embedding_dim)
            nn.init.normal_(self.key_wc_pytextrank.weight,
                            std=1 / embedding_dim)

            yake_dims = kwargs["yake_dims"] if "yake_dims" in kwargs else 32
            self.yake_dims = yake_dims
            self.yake_nn = ExpandContract(300,
                                          yake_dims,
                                          dropout,
                                          use_layer_norm=use_layer_norm,
                                          groups=(2, 2))

            try:
                from multi_rake import Rake
                rake_dims = kwargs["rake_dims"] if "rake_dims" in kwargs else 32
                self.rake_dims = rake_dims
                self.rake_nn = ExpandContract(300,
                                              rake_dims,
                                              dropout,
                                              use_layer_norm=use_layer_norm,
                                              groups=(2, 2))
                self.rake = Rake(language_code="en")
                keyphrases_dim = 2 * embedding_dim + rake_dims + yake_dims
            except Exception:
                self.rake = None
                keyphrases_dim = 2 * embedding_dim + yake_dims
            self.keyphrase_nn = ExpandContract(keyphrases_dim,
                                               cap_to_dim_map["key_phrases"],
                                               dropout,
                                               use_layer_norm=use_layer_norm,
                                               groups=(4, 4))

        fasttext_file = kwargs[
            "fasttext_file"] if "fasttext_file" in kwargs else "wiki-news-300d-1M-subword.bin"
        if not set(capabilities).isdisjoint(
            {"key_phrases", "full_view", "nltk"}):
            self.text_model = fasttext.load_model(fasttext_file)

        self.pdict = get_all_tags()
        self.tag_em = nn.Embedding(len(self.pdict) + 1, embedding_dim)
        nn.init.normal_(self.tag_em.weight, std=1 / embedding_dim)

        self.sw_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.sw_em.weight, std=1 / embedding_dim)

        self.sent_start_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.sent_start_em.weight, std=1 / embedding_dim)

        self.is_oov_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.is_oov_em.weight, std=1 / embedding_dim)

        self.has_digit_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.has_digit_em.weight, std=1 / embedding_dim)

        self.is_mask_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.is_mask_em.weight, std=1 / embedding_dim)

        self.w_len = nn.Embedding(16, embedding_dim)
        nn.init.normal_(self.w_len.weight, std=1 / embedding_dim)

        self.wc_emb = nn.Embedding(16, embedding_dim)
        nn.init.normal_(self.wc_emb.weight, std=1 / embedding_dim)

        if "nltk" in capabilities:
            import rake_nltk
            from textblob import TextBlob
            from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VaderSentimentIntensityAnalyzer
            self.stop_words = set(stopwords.words('english'))
            self.rake_nltk = rake_nltk.Rake()
            self.key_wc_rake_nltk = nn.Embedding(4, embedding_dim)
            nn.init.normal_(self.key_wc_rake_nltk.weight,
                            std=1 / embedding_dim)
            self.nltk_sid = SentimentIntensityAnalyzer()
            self.vader_sid = VaderSentimentIntensityAnalyzer()
            in_dims = 310 + 5 * embedding_dim
            self.nltk_nn = ExpandContract(in_dims,
                                          cap_to_dim_map["nltk"],
                                          dropout,
                                          use_layer_norm=use_layer_norm,
                                          groups=(2, 4))

        if "ibm_max" in capabilities:
            from ..external import ModelWrapper
            self.ibm_max = ModelWrapper()
            for p in self.ibm_max.model.parameters():
                p.requires_grad = False
            self.ibm_nn = ExpandContract(6,
                                         cap_to_dim_map["ibm_max"],
                                         dropout,
                                         use_layer_norm=use_layer_norm,
                                         groups=(1, 1))

        if "tmoji" in capabilities:
            from torchmoji.sentence_tokenizer import SentenceTokenizer
            from torchmoji.model_def import torchmoji_emojis
            from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
            with open(VOCAB_PATH, 'r') as f:
                maxlen = self.n_tokens_in
                self.vocabulary = json.load(f)
                self.st = SentenceTokenizer(self.vocabulary, maxlen)
                self.tmoji = torchmoji_emojis(PRETRAINED_PATH)
                for p in self.tmoji.parameters():
                    p.requires_grad = False
            self.tm_nn = ExpandContract(64,
                                        cap_to_dim_map["tmoji"],
                                        dropout,
                                        use_layer_norm=use_layer_norm,
                                        groups=(1, 1))

        self.contract_nn = ExpandContract(self.all_dims,
                                          embedding_dims,
                                          dropout,
                                          use_layer_norm=True,
                                          unit_norm=False,
                                          groups=(4, 4))
        if not use_as_super:
            if featurizer == "cnn":
                self.featurizer = CNN1DFeaturizer(n_tokens_in, embedding_dims,
                                                  n_tokens_out,
                                                  classifier_dims,
                                                  internal_dims, n_layers,
                                                  gaussian_noise, dropout)
            elif featurizer == "gru":
                self.featurizer = GRUFeaturizer(n_tokens_in, embedding_dims,
                                                n_tokens_out, classifier_dims,
                                                internal_dims, n_layers,
                                                gaussian_noise, dropout)
            elif featurizer == "basic":
                self.featurizer = BasicFeaturizer(n_tokens_in, embedding_dims,
                                                  n_tokens_out,
                                                  classifier_dims,
                                                  internal_dims, n_layers,
                                                  gaussian_noise, dropout)

            elif featurizer == "transformer":
                self.attention_drop_proba = kwargs[
                    "attention_drop_proba"] if "attention_drop_proba" in kwargs else 0.0
                n_encoders = kwargs.pop("n_encoders", n_layers)
                n_decoders = kwargs.pop("n_decoders", n_layers)
                self.featurizer = TransformerFeaturizer(
                    n_tokens_in, embedding_dims, n_tokens_out, classifier_dims,
                    internal_dims, n_encoders, n_decoders, gaussian_noise,
                    dropout, self.attention_drop_proba)
            else:
                raise NotImplementedError()

            self.final_layer = final_layer_builder(classifier_dims,
                                                   n_tokens_out, num_classes,
                                                   dropout, **kwargs)
        if "stored_model" in kwargs:
            load_stored_params(self, kwargs["stored_model"])
        self.reg_layers = get_regularization_layers(self)
Example #20
import numpy
import tflearn
import tensorflow
import random
import pickle
from multi_rake import Rake
import re
import os

size = 10000

sentences = []
questions, ans = [], []
files = ["movie_subtitles_en.txt"]
rake = Rake()

for f in files:
    # get all sentences
    with open(f, 'r', errors='ignore') as file:
        for line in file.readlines():
            if len(sentences) == 0:
                sentences.append(line.lower())
            else:
                if line.lower() != sentences[-1]:
                    sentences.append(line.lower())
    
    # separate sentences to questions and responses
    for idx, s in enumerate(sentences[:-1]):
        questions.append(sentences[idx])
        ans.append(sentences[idx + 1])
Example #21
def test_rake():
    rake = Rake(
        min_chars=3,
        max_words=3,
        min_freq=1,
        lang_detect_threshold=50,
        max_words_unknown_lang=2,
        generated_stopwords_percentile=80,
        generated_stopwords_max_len=3,
        generated_stopwords_min_freq=2,
    )
    text_en = (
        'Compatibility of systems of linear constraints over the set of '
        'natural numbers. Criteria of compatibility of a system of linear '
        'Diophantine equations, strict inequations, and nonstrict inequations '
        'are considered. Upper bounds for components of a minimal set of '
        'solutions and algorithms of construction of minimal generating sets '
        'of solutions for all types of systems are given. These criteria and '
        'the corresponding algorithms for constructing a minimal supporting '
        'set of solutions can be used in solving all the considered types of '
        'systems and systems of mixed types.')
    result = rake.apply(text_en)
    result = _postprocess_result(result)
    expected = [
        ('minimal generating sets', 8.666666666666666),
        ('linear diophantine equations', 8.5),
        ('minimal supporting set', 7.666666666666666),
        ('minimal set', 4.666666666666666),
        ('linear constraints', 4.5),
        ('natural numbers', 4.0),
        ('strict inequations', 4.0),
        ('nonstrict inequations', 4.0),
        ('upper bounds', 4.0),
        ('mixed types', 3.666666666666667),
        ('considered types', 3.166666666666667),
        ('set', 2.0),
        ('types', 1.6666666666666667),
        ('considered', 1.5),
        ('compatibility', 1.0),
        ('systems', 1.0),
        ('criteria', 1.0),
        ('system', 1.0),
        ('components', 1.0),
        ('solutions', 1.0),
        ('algorithms', 1.0),
        ('construction', 1.0),
        ('constructing', 1.0),
        ('solving', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected

    rake_en = Rake(
        min_chars=3,
        max_words=3,
        min_freq=1,
        language_code='en',
    )
    result = rake_en.apply(text_en)
    result = _postprocess_result(result)
    assert result == expected

    rake_with_stopwords = Rake(
        min_chars=3,
        max_words=3,
        min_freq=1,
        stopwords={'of', 'the', 'a', 'and'},
        lang_detect_threshold=50,
        max_words_unknown_lang=2,
        generated_stopwords_percentile=80,
        generated_stopwords_max_len=3,
        generated_stopwords_min_freq=2,
    )
    result = rake_with_stopwords.apply(text_en)
    result = _postprocess_result(result)
    expected = [
        ('linear constraints over', 9.0),
        ('linear diophantine equations', 9.0),
        ('minimal generating sets', 8.666666666666666),
        ('minimal supporting set', 7.666666666666666),
        ('systems are given', 7.5),
        ('minimal set', 4.666666666666666),
        ('natural numbers', 4.0),
        ('strict inequations', 4.0),
        ('considered types', 4.0),
        ('mixed types', 4.0),
        ('these criteria', 3.5),
        ('set', 2.0),
        ('systems', 1.5),
        ('criteria', 1.5),
        ('compatibility', 1.0),
        ('system', 1.0),
        ('solutions', 1.0),
        ('algorithms', 1.0),
        ('construction', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected

    text_esperanto = (
        'Liberalismo estas politika filozofio aŭ mondrigardo konstruita en '
        'ideoj de libereco kaj egaleco. Liberaluloj apogas larĝan aron de '
        'vidpunktoj depende de sia kompreno de tiuj principoj, sed ĝenerale '
        'ili apogas ideojn kiel ekzemple liberaj kaj justaj elektoj, '
        'civitanrajtoj, gazetara libereco, religia libereco, libera komerco, '
        'kaj privata posedrajto. Liberalismo unue iĝis klara politika movado '
        'dum la Klerismo, kiam ĝi iĝis populara inter filozofoj kaj '
        'ekonomikistoj en la okcidenta mondo. Liberalismo malaprobis heredajn '
        'privilegiojn, ŝtatan religion, absolutan monarkion kaj la Didevena '
        'Rajto de Reĝoj. La filozofo John Locke de la 17-a jarcento ofte '
        'estas meritigita pro fondado de liberalismo kiel klara filozofia '
        'tradicio. Locke argumentis ke ĉiu h**o havas naturon rekte al vivo, '
        'libereco kaj posedrajto kaj laŭ la socia '
        'kontrakto, registaroj ne rajtas malobservi tiujn rajtojn. '
        'Liberaluloj kontraŭbatalis tradician konservativismon kaj serĉis '
        'anstataŭigi absolutismon en registaroj per reprezenta demokratio kaj '
        'la jura hegemonio.')
    result = rake.apply(text_esperanto)
    result = _postprocess_result(result)
    expected = [
        ('vidpunktoj depende', 4.0),
        ('sia kompreno', 4.0),
        ('tiuj principoj', 4.0),
        ('justaj elektoj', 4.0),
        ('libera komerco', 4.0),
        ('okcidenta mondo', 4.0),
        ('ŝtatan religion', 4.0),
        ('absolutan monarkion', 4.0),
        ('didevena rajto', 4.0),
        ('socia kontrakto', 4.0),
        ('jura hegemonio', 4.0),
        ('gazetara libereco', 3.5),
        ('religia libereco', 3.5),
        ('privata posedrajto', 3.5),
        ('libereco', 1.5),
        ('posedrajto', 1.5),
        ('ideoj', 1.0),
        ('egaleco', 1.0),
        ('civitanrajtoj', 1.0),
        ('klerismo', 1.0),
        ('ekonomikistoj', 1.0),
        ('reĝoj', 1.0),
        ('laŭ', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected

    rake_max_words_unknown_lang_none = Rake(
        min_chars=3,
        max_words=3,
        min_freq=1,
        lang_detect_threshold=50,
        max_words_unknown_lang=None,
        generated_stopwords_percentile=80,
        generated_stopwords_max_len=3,
        generated_stopwords_min_freq=2,
    )
    result = rake_max_words_unknown_lang_none.apply(text_esperanto)
    result = _postprocess_result(result)
    expected = [
        ('filozofo john locke', 9.0),
        ('serĉis anstataŭigi absolutismon', 9.0),
        ('vidpunktoj depende', 4.0),
        ('sia kompreno', 4.0),
        ('tiuj principoj', 4.0),
        ('justaj elektoj', 4.0),
        ('libera komerco', 4.0),
        ('okcidenta mondo', 4.0),
        ('ŝtatan religion', 4.0),
        ('absolutan monarkion', 4.0),
        ('didevena rajto', 4.0),
        ('socia kontrakto', 4.0),
        ('jura hegemonio', 4.0),
        ('gazetara libereco', 3.5),
        ('religia libereco', 3.5),
        ('privata posedrajto', 3.5),
        ('libereco', 1.5),
        ('posedrajto', 1.5),
        ('ideoj', 1.0),
        ('egaleco', 1.0),
        ('civitanrajtoj', 1.0),
        ('klerismo', 1.0),
        ('ekonomikistoj', 1.0),
        ('reĝoj', 1.0),
        ('laŭ', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected

    text_for_stopwords = 'de en la kaj al' * 20
    result = rake.apply(text_esperanto, text_for_stopwords)
    result = _postprocess_result(result)
    expected = [
        ('vidpunktoj depende', 4.0),
        ('sia kompreno', 4.0),
        ('tiuj principoj', 4.0),
        ('justaj elektoj', 4.0),
        ('libera komerco', 4.0),
        ('okcidenta mondo', 4.0),
        ('ŝtatan religion', 4.0),
        ('absolutan monarkion', 4.0),
        ('didevena rajto', 4.0),
        ('socia kontrakto', 4.0),
        ('jura hegemonio', 4.0),
        ('gazetara libereco', 3.5),
        ('religia libereco', 3.5),
        ('privata posedrajto', 3.5),
        ('libereco', 1.5),
        ('posedrajto', 1.5),
        ('ideoj', 1.0),
        ('egaleco', 1.0),
        ('civitanrajtoj', 1.0),
        ('klerismo', 1.0),
        ('ekonomikistoj', 1.0),
        ('reĝoj', 1.0),
        ('vivo', 1.0),
        ('laŭ', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected

    text_numbers = '123, 123, 123, 123'
    result = rake.apply(text_numbers)
    assert result == [('123', 0)]

    rake_min_freq2 = Rake(
        min_chars=3,
        max_words=3,
        min_freq=2,
        lang_detect_threshold=50,
        max_words_unknown_lang=2,
        generated_stopwords_percentile=80,
        generated_stopwords_max_len=3,
        generated_stopwords_min_freq=2,
    )
    text_starts_with_stopword = ('and keywords... keywords are the best words')
    result = rake_min_freq2.apply(text_starts_with_stopword)
    assert result == [('keywords', 1.0)]

    with pytest.raises(NotImplementedError):
        Rake(language_code='xxx')

    rake_uk = Rake(
        min_chars=3,
        max_words=4,
        min_freq=1,
        language_code='uk',
    )
    text_en_uk = (
        'Compatibility of systems of linear constraints над the set of '
        'natural numbers. Criteria of compatibility of a system of linear '
        'Diophantine equations, strict inequations, та nonstrict inequations '
        'are considered. Upper bounds для components of a minimal set of '
        'solutions та algorithms of construction of minimal generating sets '
        'of solutions для всіх types of systems are given. Ці criteria та '
        'the corresponding algorithms для constructing a minimal supporting '
        'set of solutions може бути used в solving всіх the considered types '
        'of systems та systems of mixed types.')
    result = rake_uk.apply(text_en_uk)
    result = _postprocess_result(result)
    expected = [
        ('minimal set of solutions', 15.6),
        ('systems of mixed types', 15.6),
        ('nonstrict inequations are considered', 15.0),
        ('criteria of compatibility of', 13.7),
        ('the corresponding algorithms', 9.0),
        ('components of', 5.6),
        ('strict inequations', 5.0),
        ('upper bounds', 4.0),
        ('criteria', 2.5),
        ('constructing', 1.0),
        ('used', 1.0),
        ('solving', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected
Example #22
    if stopwords:
        words = remove_stopwords(words, stopwords)
    return words


mystem = Mystem()

custom_stopwords = get_text('stopwords.txt').split("\n")
# text = get_text('skills.txt')
ru_stopwords = stopwords.words("russian")
en_stopwords = stopwords.words("english")
stopwords = ru_stopwords + en_stopwords + custom_stopwords

rake = Rake(min_chars=2,
            max_words=3,
            min_freq=1,
            language_code="ru",
            stopwords=stopwords)

# clear_text = preprocess_text(text)

# keywords = rake.apply(clear_text)
# print(keywords)

num_lines = open('skills.txt').read().count('\n')
bar = IncrementalBar('Processing', max=num_lines)

keywords_hash = {}
line_number = 0
with open('skills.txt') as input:
    for line in input:
Example #23
#for path in raw_files:
#    with io.open(path, encoding='utf-8') as file:
#        for line in file:
#            text_for_stopwords = text_for_stopwords + line

total_tp = 0
total_all = 0
total_sz = 0
for path in raw_files:
    text_ru = ''
    with io.open(path, encoding='utf-8') as file:
        for line in file:
            text_ru = text_ru + line
    rake = Rake(min_chars=3,
                max_words=5,
                min_freq=1,
                language_code='ru',
                stopwords=stopwords_list)

    # apply Mystem to the current text to remove all verb forms from it
    #text_ru = remove_verbs(text_ru)
    keywords = rake.apply(text_ru)
    #keywords = rake.apply(text_ru, text_for_stopwords=text_for_stopwords) # doesn't work better

    # TODO how to choose the last good candidate
    last_good = 0
    for el in keywords:
        if el[1] >= 2:
            last_good += 1

    with io.open('results/' + path.split('\\')[1], mode='w',
Example #24
def reviewscore(request):
    rake = Rake()
    positive = request.POST['positive']
    negative = request.POST['negative']

    if len(positive) == 0:
        positive = "No positive"
    if len(negative) == 0:
        #message = 'You searched for: %r' % request.GET['negative']
        negative = "No negative"

    positiveResult = rake.apply(positive)
    negativeResult = rake.apply(negative)

    positiveScore = 0
    negativeScore = 0

    if len(positiveResult) > 0:
        for i in range(0, len(positiveResult)):
            positiveScore = positiveScore + positiveResult[i][1]

    if len(negativeResult) > 0:
        for i in range(0, len(negativeResult)):
            negativeScore = negativeScore + negativeResult[i][1]

    totalScore = positiveScore - negativeScore

    expectedReviewScore = 0.18 * totalScore + 8.31

    # limit expected score range from 0 to 10
    if expectedReviewScore > 10.0:
        expectedReviewScore = 10.0
    elif expectedReviewScore < 0.0:
        expectedReviewScore = 0.0
    else:
        expectedReviewScore = round(expectedReviewScore, 2)

    # import actual data for actual user score
    reviewsRawData = pd.read_csv(
        "../data/Hotel_Reviews.csv",
        usecols=['Positive_Review', 'Negative_Review', 'Reviewer_Score'])
    resultTuple = reviewsRawData[
        reviewsRawData["Positive_Review"].str.contains(positive)]

    resultVal = ''
    actual = ' | This review is not from database'
    analysis = ''

    # handle if the review doesn't exist on database
    if len(resultTuple["Reviewer_Score"].values) > 0:
        resultVal = resultTuple["Reviewer_Score"].values
        if resultVal[0] > 0.0:
            tempVal = copy.deepcopy(resultVal[0])
            actual = ' | Actual '
            actual = actual + copy.deepcopy(str(resultVal[0]))
            actual = actual + ' | Accuracy: '
            if expectedReviewScore > resultVal[0]:
                analysis = (expectedReviewScore - resultVal[0])
                analysis = str(
                    round(100 - (analysis / expectedReviewScore * 100), 2))
            else:
                analysis = (resultVal[0] - expectedReviewScore)
                analysis = str(round(100 - (analysis / tempVal * 100), 2))
            analysis = analysis + '%'

    result = "User Rating: Predicted " + str(
        expectedReviewScore) + actual + analysis
    return HttpResponse(result)
Example #25
from multi_rake import Rake

rake = Rake(
    min_chars=4,
    max_words=1,
    language_code='de',  # 'en'
    generated_stopwords_percentile=50,
)


def get_keywords(text: str):
    keywords = rake.apply(text)
    return [item[0] for item in keywords]
Example #26
 def __init__(self, lang=None):
     self.lang = lang
     self.rake = Rake(language_code=self.lang, max_words=5)
Example #27
import glob
import json
import logging
from pathlib import Path
from pprint import pprint
import dariah
from multi_rake import Rake

rake = Rake(
    language_code="en",
    min_chars=10,
    max_words=2,
    min_freq=5,
)


class Topicist:
    def __init__(self, directory):
        self.directory = directory
        self.update()

    def update(self):
        self.docs_paths = list(glob.glob(self.directory + "/*.*"))
        self.state_file = self.directory.replace("\\", "") + ".topicstate"
        try:
            with open(f'{self.state_file}', "r") as f:
                self.state = json.loads(f.read())
            if self.state['state'] == str(self.docs_paths):
                self.headword2doc = self.state['result']
                return
            else:
Example #28
    count = 0
    pdftxt = ""
    #The while loop will read each page.
    while count < num_pages:
        pageObj = pdfReader.getPage(count)
        count +=1
        pdftxt += pageObj.extractText()
    txt = pdftxt
    st.write("File Upload Successful")
  
    lang = detect(txt)
    str1 = "Detected Origin of language : " + ilc.language_name(lang)
    st.write(str1)
    
    #----- RAKE 
    rake = Rake(language_code='es', max_words=1)
    rakekeywords = rake.apply(txt)

    if len(rakekeywords) > 25 :
        rakekeywords = rakekeywords[:25]
        
    #----- YAKE 
    max_ngram_size = 3
    deduplication_thresold = 0.9
    deduplication_algo = 'seqm'
    windowSize = 1
    numOfKeywords = 25
    
    custom_kw_extractor = yake.KeywordExtractor(lan=lang, n=max_ngram_size, dedupLim=deduplication_thresold, dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None)
    yakekeywords = custom_kw_extractor.extract_keywords(txt)
    st.write("Extracting keywords now ...\n")
Example #29
En_dict = enchant.Dict("en_US")

#API Object to interact with Twitter.
api = tweepy.API(auth, wait_on_rate_limit=True)
#api.update_status('The Bot is LIVE my dudes!') #(this just tweets the msg.)

#Object from tmdb...probably going to be used to match a movie using keywords from the user tweet, genre, and similar.
Movie_Search = tmdb.Search()
Genre = tmdb.Genres()
Discover = tmdb.Discover()

#filename to store last seen mention id, so that we don't keep replying to the same request again.
LAST_SEEN_FILE = 'last_mention_id.txt'

#Object used to extract keywords from user tweets.
keywords_Extract = Rake(min_chars=3, max_words=1)


def get_last_mention_id(file_name):
    #Reads the last seen mention id so that getmentions() does not grab the same mention again.
    last_read = open(file_name, 'r')
    id = int(last_read.read().strip())
    last_read.close()
    return id


def set_last_mention_id(last_id, file_name):
    #Writes the last seen mention id into a file.
    last_write = open(file_name, 'w')
    last_write.write(str(last_id))
    last_write.close()
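The excerpt in Example #29 sets up keywords_Extract but stops before applying it. A short sketch of how a mention's text might be reduced to single-word keywords with that extractor; the tweet text is invented here for illustration:

tweet_text = 'Can you recommend a good space adventure movie with aliens?'
# keywords_Extract was created above with min_chars=3, max_words=1
tweet_keywords = [kw for kw, score in keywords_Extract.apply(tweet_text)]
print(tweet_keywords)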
Example #30
def home():
    score = 0
    results_score = ''
    results_feedback = ''
    reaction = ''
    temp = ''
    temp2 = ''
    terms_form = ''
    res_good = []
    res_bad = []
    bad_words = {
        "proprietary notice language": 0,
        "reasonable attorneys’ fees": 0,
        "assume total responsibility": 0,
        "communication line failure": 0,
        "attorneys’ fees": 0,
        "similar fees": 0,
        "applicable prices": 0,
        "publicly displayed": 0,
        "manipulate identifiers": 0,
        "losses incurred": 0,
        "injuries caused": 0,
        "irreparable harm": 0,
        "computer virus": 0,
        "apple’s failure": 0,
        "apple’s control": 0,
        "governmental request": 0,
        "out-of-pocket expenses": 0,
        "oral agreements": 0,
        "destructive features": 0,
        "punitive damages": 0,
        "monetary damages": 0,
        "third-party applications connected": 0,
        "re-export control laws": 0,
        "modified additional terms": 0,
        "stop providing services": 0,
        "expressly override": 0,
        "constantly changing": 0,
        "non-exclusive license": 0,
        "remove functionalities": 0,
        "apply retroactively": 0,
        "alleged infringing material": 0,
        "affiliated companies": 0,
        "manual process": 0,
        "mail lists": 0,
        "reverse engineer": 0,
        "trade secret": 0,
        "accounting fees": 0,
        "lost data": 0,
        "external websites": 0,
        "fully responsible": 0,
        "password information": 0,
        "post advertisements": 0,
        "conditions waive": 0,
        "remove communications": 0
    }

    good_words = {
        "intellectual property rights": 0,
        "account information secure": 0,
        "completely private": 0,
        "good faith": 0,
        "accessible worldwide": 0,
        "equitable relief": 0,
        "relief granted": 0,
        "competent jurisdiction": 0,
        "reasonable time": 0,
        "copyrights rights": 0,
        "information secure": 0,
        "apple’s liability": 0,
        "reasonable advance notice": 0,
        "party beneficiary rights": 0,
        "open source license": 0,
        "open source software": 0,
        "legal notices displayed": 0,
        "safety laws": 0,
        "password confidential": 0,
        "malware detection": 0,
        "privacy": 0,
        "worldwide license": 0,
        "submit feedback": 0,
        "reasonable requests assisting": 0,
        "good faith belief": 0,
        "limitation security-related features": 0,
        "legally binding agreement": 0,
        "license rights granted": 0,
        "copyright owner's behalf": 0,
        "license includes access": 0,
        "royalty-free license": 0,
        "confidential information": 0
    }

    rake = Rake()

    if request.method == "POST":
        terms_form = request.form.get("input")
        kw = rake.apply(terms_form)
        for word in kw:
            if word[0] in good_words:
                good_words[word[0]] += 1
                res_good.append(word[0])
            if word[0] in bad_words:
                bad_words[word[0]] += 1
                res_bad.append(word[0])

        # guard against dividing by zero when no listed phrases are found
        total_hits = len(res_good) + len(res_bad)
        score = round(len(res_good) / total_hits * 175, 2) if total_hits else 0
        results_score = "The score is {}%".format(score)

        if 0 <= score <= 50:
            results_feedback = "Poor"
        elif 50 < score <= 65:
            results_feedback = "Average"
        elif 65 < score <= 80:
            results_feedback = "Good"
        elif 80 < score <= 100:
            results_feedback = "Excellent"

        temp = ", ".join(res_good)
        temp2 = ", ".join(res_bad)

    return render_template('index.html',
                           results_score=results_score,
                           results_feedback=results_feedback,
                           reaction=reaction,
                           output=temp,
                           output2=temp2)