Code example #1
import re

from multi_rake import Rake


def get_spam_keywords(spam_features, ham_features):
    # Build the spam and ham text blobs. get_keywords() is a
    # project-level helper that returns the raw text for one record.
    text_spam = ''
    text_ham = ''

    for pr in spam_features:
        text_spam += get_keywords(pr)

    for pr in ham_features:
        text_ham += get_keywords(pr)

    # Keep only alphanumerics, whitespace and periods.
    text_spam = re.sub(r'[^a-zA-Z0-9 \n.]', ' ', text_spam)
    text_ham = re.sub(r'[^a-zA-Z0-9 \n.]', ' ', text_ham)

    # Initialise RAKE for frequent one- and two-word keywords.
    rake = Rake(max_words=2, min_freq=5)

    # Extract the top keywords for spam and ham.
    keywords_spam = rake.apply(text_spam.lower())
    keywords_ham = rake.apply(text_ham.lower())

    spam = [spam_keyword[0] for spam_keyword in keywords_spam[:50]]
    ham = [ham_keyword[0] for ham_keyword in keywords_ham[:50]]

    # Keep keywords that appear in spam but not in ham.
    spam_final = []
    for word in spam:
        if word not in ham:
            spam_final.append(word)

    return spam_final
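
As a rough illustration, this hypothetical driver exercises the function end to end; the get_keywords stub and the record shape are assumptions, not the original project's code:

# Hypothetical stand-in for the project's get_keywords() helper.
def get_keywords(record):
    return record['text'] + ' '

spam_records = [{'text': 'claim your free prize. free prize inside.'}] * 5
ham_records = [{'text': 'meeting notes attached. see meeting notes.'}] * 5

print(get_spam_keywords(spam_records, ham_records))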
Code example #2
File: addtext.py Project: tezer/OppiWordsBot
class TextPreprocessor(object):
    def __init__(self, lang=None):
        self.lang = lang
        self.rake = Rake(language_code=self.lang, max_words=5)

    def key_words(self, text):
        return self.rake.apply(text)
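
A minimal usage sketch, assuming the built-in 'en' language code (also used in the test example further below):

pre = TextPreprocessor(lang='en')
print(pre.key_words('Compatibility of systems of linear constraints '
                    'over the set of natural numbers.'))
# Prints a list of (keyword, score) tuples, best first.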
Code example #3
    def get(self, language_code=None):
        rake = Rake(language_code=language_code)

        text = request.form.get('text')
        if text:
            return rake.apply(text)

        return 'No text given', 400
Code example #4
File: rake.py Project: yildirimyusuf/FlaskAPI
    def post():
        posted_data = request.get_json()
        text = posted_data['text']
        rake = Rake()
        keywords = rake.apply(text)
        # Keep just the keyword strings, dropping the scores.
        keyword_strings = [kw[0] for kw in keywords]

        return jsonify({'Keywords': keyword_strings})
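
For context, a minimal sketch of calling this endpoint from a client; the host and the /keywords route are assumptions:

# Hypothetical client call; host and route are assumptions.
import requests

resp = requests.post('http://localhost:5000/keywords',
                     json={'text': 'Natural language processing is fun.'})
print(resp.json())  # {'Keywords': [...]}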
Code example #5
def get_RAKE(article):
    rake = Rake()
    keywords = rake.apply(article)
    # Take at most the ten highest-scoring keywords; slicing avoids an
    # IndexError when fewer than ten are found.
    topKeywords = [keyword for keyword, _score in keywords[:10]]
    return topKeywords
Code example #6
File: helper.py Project: mumeblossom/arxiv_crawler
def getKeyWords(text):
    rake = Rake()
    keywords = rake.apply(text)
    # RANK is a module-level score threshold defined elsewhere in the project.
    sortedKw = []
    for keyword, score in keywords:
        if score > RANK:
            sortedKw.append(keyword)
    return sortedKw
Code example #7
    def get_keywords(self, article):
        """
        Find the keywords in article and return them in a convenient way.
        :params:
            article, list of sentences, sentences are lists of strings 
        :returns:
            keywords, list of strings -- extracted keywords
        """
        # Labels that will NOT be changed.
        labeltype = set()
        if self.label is not None:
            if self.label == 'neutral':
                labeltype.add('B-SPAN')
                labeltype.add('I-SPAN')
            elif self.label == 'propaganda':
                labeltype.add('0')

        # Map NLTK tags to short POS names; wordtypes ends up holding the
        # parts of speech that will NOT be considered.
        pos_shortcuts = {'NN': 'n', 'JJ': 'adj', 'RB': 'adv', 'VB': 'v'}

        wordtypes = set()
        if self.postype is not None:
            wordtypes = {'n', 'adj', 'adv', 'v'}
            for fig in self.postype:
                wordtypes.discard(fig)

        text = self.get_text(article)
        text = ''.join(c for c in text if c not in '\'\"')
        rake = Rake()
        try:
            fig = rake.apply(text)
        except:
            print('Couldn\'t find keywords, falling back to all words.')
            fig = self.get_words(article)
            return fig

        raw_keywords = []
        for string, _ in fig:
            raw_keywords += string.split()

        raw_keywords = set(raw_keywords)

        keywords = {}

        for i, sentence in enumerate(article):
            for comb in sentence:
                word, label = comb.split()
                word = word.lower()
                pos = nltk.pos_tag([word])[0][1]
                if pos in pos_shortcuts:
                    pos = pos_shortcuts[pos]
                if word in raw_keywords and label not in labeltype and pos not in wordtypes:
                    keywords[word] = i
        return keywords
Code example #8
    def _fetch_all_sentences_keywords(self):
        """
        For each sentence object in self.video.sentences, get the keywords
        and save them in sentence.keywords.
        :return: None
        """
        rake = Rake()
        for sentence in self.video.sentences:
            keywords_result = rake.apply(sentence.text,
                                         text_for_stopwords=None)
            # Keep just the keyword strings, without the scores.
            keywords = [keyword[0] for keyword in keywords_result]
            sentence.keywords = keywords
Code example #9
File: searchImpl.py Project: cyrijohnson/API
def getKeywords(text):
    tokens = text.split()
    processedTokens = []
    if len(tokens) < 3:
        # Too short for RAKE to be useful; fall back to the raw tokens.
        processedTokens = tokens
    else:
        rake = Rake()
        keywords = rake.apply(text)
        # Split each keyword phrase back into individual tokens.
        for keyword, _score in keywords:
            processedTokens.extend(keyword.split())
    return processedTokens
Code example #10
File: commands.py Project: adibees/mirrormirror
def extract_keywords(text):
    # POS-tag the text and collect its singular nouns.
    nltk_text = word_tokenize(text)
    tagged = nltk.pos_tag(nltk_text)
    nouns = [word for word, tag in tagged if tag == 'NN']

    rake = Rake()
    keywords = rake.apply(text)

    # Return the last RAKE keyword that is also a noun,
    # or 'nothing' if none matches.
    keyword = 'nothing'
    for kw, _score in keywords:
        if kw in nouns:
            keyword = kw
    return keyword
Code example #11
File: SCBert.py Project: ms2020bgd/SCBert
    def extract_keywords(self, max_words=1, min_freq=5, num_top_words=10):

        stop_words = get_stop_words('fr')

        rake = Rake(max_words=max_words,
                    min_freq=min_freq,
                    language_code="fr",
                    stopwords=stop_words)

        for i, label in enumerate(np.unique(self.labels)):
            corpus_fr = ' '.join(self.data[self.labels == label])
            keywords = rake.apply(corpus_fr)
            top_words = np.array(keywords[:num_top_words])[:, 0]
            self.keywords["Cluster {0}".format(label)] = top_words

        return self.keywords
Code example #12
File: main.py Project: w33ladalah/id-keywords-finder
def process():
    keywords = []
    text = request.form['text_to_process']
    max_kw_length = int(request.form['max_kw_length'])

    if not text:
        abort(404)

    if request.method == 'POST':
        # Load the Indonesian stopword list shipped with the project.
        with open("data/stopwords.txt", "r") as f:
            sw = f.read()
        rake = Rake(language_code='id',
                    max_words=max_kw_length,
                    stopwords=set(sw.split("\n")))
        keywords = rake.apply(text)

    return render_template('process.html',
                           keywords=keywords,
                           text=text,
                           max_kw_length=max_kw_length)
Code example #13
def get_keywords(text):
    # Explicitly spelled-out Rake configuration for single-word
    # Spanish keywords.
    rake = Rake(
        min_chars=3,
        max_words=1,
        min_freq=1,
        language_code='es',
        stopwords=None,
        lang_detect_threshold=100,
        max_words_unknown_lang=10,
        generated_stopwords_percentile=80,
        generated_stopwords_max_len=10,
        generated_stopwords_min_freq=2,
    )

    keywords = rake.apply(
        text,
        text_for_stopwords=None,
    )
    return keywords
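
A quick hypothetical call; the Spanish sample sentence is made up:

# Hypothetical usage of get_keywords defined above.
print(get_keywords('La inteligencia artificial permite extraer '
                   'palabras clave de un texto.'))
# Prints a list of (keyword, score) tuples, best first.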
Code example #14
from multi_rake import Rake

#text = input()
text = "please tell me the good"
rake = Rake()

keywords = rake.apply(text)

# good = 1, bad = 0
words = dict(good=1, bad=0)
totalcount = 0
goodcount = 0
for word, _score in keywords:
    if word in words:
        totalcount += 1
        # Count it as good when the sentiment flag is 1.
        if words[word] == 1:
            goodcount += 1
if totalcount:
    print(goodcount / totalcount)
Code example #15
File: app.py Project: shriya2909/TL-DR
    pdftxt = ""
    #The while loop will read each page.
    while count < num_pages:
        pageObj = pdfReader.getPage(count)
        count +=1
        pdftxt += pageObj.extractText()
    txt = pdftxt
    st.write("File Upload Successful")
  
    lang = detect(txt)
    str1 = "Detected language: " + ilc.language_name(lang)
    st.write(str1)

    #----- RAKE
    # Note: the language code is hard-coded to Spanish here even though
    # the text language was detected above.
    rake = Rake(language_code='es', max_words=1)
    rakekeywords = rake.apply(txt)

    if len(rakekeywords) > 25:
        rakekeywords = rakekeywords[:25]

    #----- YAKE
    max_ngram_size = 3
    deduplication_threshold = 0.9
    deduplication_algo = 'seqm'
    windowSize = 1
    numOfKeywords = 25

    # yake's own parameter is spelled 'windowsSize'.
    custom_kw_extractor = yake.KeywordExtractor(lan=lang,
                                                n=max_ngram_size,
                                                dedupLim=deduplication_threshold,
                                                dedupFunc=deduplication_algo,
                                                windowsSize=windowSize,
                                                top=numOfKeywords,
                                                features=None)
    yakekeywords = custom_kw_extractor.extract_keywords(txt)
    st.write("Extracting keywords now ...\n")
    
Code example #16
def getKey(text):
    rake = Rake()
    keywords = rake.apply(text)
    return keywords[:7]
Code example #17
File: analyze.py Project: isengupt/LyricsScraper
    print(len(wordsData))
    new_dict[dt]['words'] = wordsData
    important_words = {}
    if (len(wordsData) < 5000000):

        # rake_object (created above, outside this snippet) presumably
        # exposes the classic RAKE .run() API; 'rake' further below is
        # multi_rake.
        keyw = rake_object.run(wordsData)
        important_words['rake1'] = keyw[:10]

        print(dt)
        print("Normal Rake: ", keyw[:10])

        gen1 = keywords(wordsData, words=10, scores=True, lemmatize=True)
        important_words['gen1'] = gen1
        print("Gen: ", gen1)

        rake_keys = rake.apply(wordsData)
        important_words['rake2'] = rake_keys[:10]

        print("Second Rake:", rake_keys[:10])

    new_dict[dt]['key_words'] = important_words

with open('raked_dict_upssdated.json', 'w') as f:
    json.dump(new_dict, f)
Code example #18
import logging
from gensim.summarization import keywords
from multi_rake import Rake

from helper import Recipes

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S')

# load recipes
logging.info("Loading recipes from mongodb")
recipes = Recipes(limit=10000)
recipes = recipes.load()
recipes_text = [recipe["text"] for recipe in recipes]

# get keywords
logging.info("Extracting keywords.")
#keywords = keywords(text=" ".join(recipes_text), words=10, scores=True, pos_filter=("NN", "NNS"))
rake = Rake(max_words=1, language_code="de", min_freq=500)
keywords = rake.apply(" ".join(recipes_text))[:20]
print(keywords)
Code example #19
File: test_rake.py Project: vgrabovets/multi_rake
def test_rake():
    rake = Rake(
        min_chars=3,
        max_words=3,
        min_freq=1,
        lang_detect_threshold=50,
        max_words_unknown_lang=2,
        generated_stopwords_percentile=80,
        generated_stopwords_max_len=3,
        generated_stopwords_min_freq=2,
    )
    text_en = (
        'Compatibility of systems of linear constraints over the set of '
        'natural numbers. Criteria of compatibility of a system of linear '
        'Diophantine equations, strict inequations, and nonstrict inequations '
        'are considered. Upper bounds for components of a minimal set of '
        'solutions and algorithms of construction of minimal generating sets '
        'of solutions for all types of systems are given. These criteria and '
        'the corresponding algorithms for constructing a minimal supporting '
        'set of solutions can be used in solving all the considered types of '
        'systems and systems of mixed types.')
    result = rake.apply(text_en)
    result = _postprocess_result(result)
    expected = [
        ('minimal generating sets', 8.666666666666666),
        ('linear diophantine equations', 8.5),
        ('minimal supporting set', 7.666666666666666),
        ('minimal set', 4.666666666666666),
        ('linear constraints', 4.5),
        ('natural numbers', 4.0),
        ('strict inequations', 4.0),
        ('nonstrict inequations', 4.0),
        ('upper bounds', 4.0),
        ('mixed types', 3.666666666666667),
        ('considered types', 3.166666666666667),
        ('set', 2.0),
        ('types', 1.6666666666666667),
        ('considered', 1.5),
        ('compatibility', 1.0),
        ('systems', 1.0),
        ('criteria', 1.0),
        ('system', 1.0),
        ('components', 1.0),
        ('solutions', 1.0),
        ('algorithms', 1.0),
        ('construction', 1.0),
        ('constructing', 1.0),
        ('solving', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected

    rake_en = Rake(
        min_chars=3,
        max_words=3,
        min_freq=1,
        language_code='en',
    )
    result = rake_en.apply(text_en)
    result = _postprocess_result(result)
    assert result == expected

    rake_with_stopwords = Rake(
        min_chars=3,
        max_words=3,
        min_freq=1,
        stopwords={'of', 'the', 'a', 'and'},
        lang_detect_threshold=50,
        max_words_unknown_lang=2,
        generated_stopwords_percentile=80,
        generated_stopwords_max_len=3,
        generated_stopwords_min_freq=2,
    )
    result = rake_with_stopwords.apply(text_en)
    result = _postprocess_result(result)
    expected = [
        ('linear constraints over', 9.0),
        ('linear diophantine equations', 9.0),
        ('minimal generating sets', 8.666666666666666),
        ('minimal supporting set', 7.666666666666666),
        ('systems are given', 7.5),
        ('minimal set', 4.666666666666666),
        ('natural numbers', 4.0),
        ('strict inequations', 4.0),
        ('considered types', 4.0),
        ('mixed types', 4.0),
        ('these criteria', 3.5),
        ('set', 2.0),
        ('systems', 1.5),
        ('criteria', 1.5),
        ('compatibility', 1.0),
        ('system', 1.0),
        ('solutions', 1.0),
        ('algorithms', 1.0),
        ('construction', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected

    text_esperanto = (
        'Liberalismo estas politika filozofio aŭ mondrigardo konstruita en '
        'ideoj de libereco kaj egaleco. Liberaluloj apogas larĝan aron de '
        'vidpunktoj depende de sia kompreno de tiuj principoj, sed ĝenerale '
        'ili apogas ideojn kiel ekzemple liberaj kaj justaj elektoj, '
        'civitanrajtoj, gazetara libereco, religia libereco, libera komerco, '
        'kaj privata posedrajto. Liberalismo unue iĝis klara politika movado '
        'dum la Klerismo, kiam ĝi iĝis populara inter filozofoj kaj '
        'ekonomikistoj en la okcidenta mondo. Liberalismo malaprobis heredajn '
        'privilegiojn, ŝtatan religion, absolutan monarkion kaj la Didevena '
        'Rajto de Reĝoj. La filozofo John Locke de la 17-a jarcento ofte '
        'estas meritigita pro fondado de liberalismo kiel klara filozofia '
        'tradicio. Locke argumentis ke ĉiu homo havas naturon rekte al vivo, '
        'libereco kaj posedrajto kaj laŭ la socia '
        'kontrakto, registaroj ne rajtas malobservi tiujn rajtojn. '
        'Liberaluloj kontraŭbatalis tradician konservativismon kaj serĉis '
        'anstataŭigi absolutismon en registaroj per reprezenta demokratio kaj '
        'la jura hegemonio.')
    result = rake.apply(text_esperanto)
    result = _postprocess_result(result)
    expected = [
        ('vidpunktoj depende', 4.0),
        ('sia kompreno', 4.0),
        ('tiuj principoj', 4.0),
        ('justaj elektoj', 4.0),
        ('libera komerco', 4.0),
        ('okcidenta mondo', 4.0),
        ('ŝtatan religion', 4.0),
        ('absolutan monarkion', 4.0),
        ('didevena rajto', 4.0),
        ('socia kontrakto', 4.0),
        ('jura hegemonio', 4.0),
        ('gazetara libereco', 3.5),
        ('religia libereco', 3.5),
        ('privata posedrajto', 3.5),
        ('libereco', 1.5),
        ('posedrajto', 1.5),
        ('ideoj', 1.0),
        ('egaleco', 1.0),
        ('civitanrajtoj', 1.0),
        ('klerismo', 1.0),
        ('ekonomikistoj', 1.0),
        ('reĝoj', 1.0),
        ('laŭ', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected

    rake_max_words_unknown_lang_none = Rake(
        min_chars=3,
        max_words=3,
        min_freq=1,
        lang_detect_threshold=50,
        max_words_unknown_lang=None,
        generated_stopwords_percentile=80,
        generated_stopwords_max_len=3,
        generated_stopwords_min_freq=2,
    )
    result = rake_max_words_unknown_lang_none.apply(text_esperanto)
    result = _postprocess_result(result)
    expected = [
        ('filozofo john locke', 9.0),
        ('serĉis anstataŭigi absolutismon', 9.0),
        ('vidpunktoj depende', 4.0),
        ('sia kompreno', 4.0),
        ('tiuj principoj', 4.0),
        ('justaj elektoj', 4.0),
        ('libera komerco', 4.0),
        ('okcidenta mondo', 4.0),
        ('ŝtatan religion', 4.0),
        ('absolutan monarkion', 4.0),
        ('didevena rajto', 4.0),
        ('socia kontrakto', 4.0),
        ('jura hegemonio', 4.0),
        ('gazetara libereco', 3.5),
        ('religia libereco', 3.5),
        ('privata posedrajto', 3.5),
        ('libereco', 1.5),
        ('posedrajto', 1.5),
        ('ideoj', 1.0),
        ('egaleco', 1.0),
        ('civitanrajtoj', 1.0),
        ('klerismo', 1.0),
        ('ekonomikistoj', 1.0),
        ('reĝoj', 1.0),
        ('laŭ', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected

    text_for_stopwords = 'de en la kaj al' * 20
    result = rake.apply(text_esperanto, text_for_stopwords)
    result = _postprocess_result(result)
    expected = [
        ('vidpunktoj depende', 4.0),
        ('sia kompreno', 4.0),
        ('tiuj principoj', 4.0),
        ('justaj elektoj', 4.0),
        ('libera komerco', 4.0),
        ('okcidenta mondo', 4.0),
        ('ŝtatan religion', 4.0),
        ('absolutan monarkion', 4.0),
        ('didevena rajto', 4.0),
        ('socia kontrakto', 4.0),
        ('jura hegemonio', 4.0),
        ('gazetara libereco', 3.5),
        ('religia libereco', 3.5),
        ('privata posedrajto', 3.5),
        ('libereco', 1.5),
        ('posedrajto', 1.5),
        ('ideoj', 1.0),
        ('egaleco', 1.0),
        ('civitanrajtoj', 1.0),
        ('klerismo', 1.0),
        ('ekonomikistoj', 1.0),
        ('reĝoj', 1.0),
        ('vivo', 1.0),
        ('laŭ', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected

    text_numbers = '123, 123, 123, 123'
    result = rake.apply(text_numbers)
    assert result == [('123', 0)]

    rake_min_freq2 = Rake(
        min_chars=3,
        max_words=3,
        min_freq=2,
        lang_detect_threshold=50,
        max_words_unknown_lang=2,
        generated_stopwords_percentile=80,
        generated_stopwords_max_len=3,
        generated_stopwords_min_freq=2,
    )
    text_starts_with_stopword = ('and keywords... keywords are the best words')
    result = rake_min_freq2.apply(text_starts_with_stopword)
    assert result == [('keywords', 1.0)]

    with pytest.raises(NotImplementedError):
        Rake(language_code='xxx')

    rake_uk = Rake(
        min_chars=3,
        max_words=4,
        min_freq=1,
        language_code='uk',
    )
    text_en_uk = (
        'Compatibility of systems of linear constraints над the set of '
        'natural numbers. Criteria of compatibility of a system of linear '
        'Diophantine equations, strict inequations, та nonstrict inequations '
        'are considered. Upper bounds для components of a minimal set of '
        'solutions та algorithms of construction of minimal generating sets '
        'of solutions для всіх types of systems are given. Ці criteria та '
        'the corresponding algorithms для constructing a minimal supporting '
        'set of solutions може бути used в solving всіх the considered types '
        'of systems та systems of mixed types.')
    result = rake_uk.apply(text_en_uk)
    result = _postprocess_result(result)
    expected = [
        ('minimal set of solutions', 15.6),
        ('systems of mixed types', 15.6),
        ('nonstrict inequations are considered', 15.0),
        ('criteria of compatibility of', 13.7),
        ('the corresponding algorithms', 9.0),
        ('components of', 5.6),
        ('strict inequations', 5.0),
        ('upper bounds', 4.0),
        ('criteria', 2.5),
        ('constructing', 1.0),
        ('used', 1.0),
        ('solving', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected
Code example #20
def keyword_extraction(transcript):
    rake = Rake(max_words=2, min_freq=2)
    keywords = rake.apply(transcript)
    return [item[0] for item in keywords[:5]]
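
A quick hypothetical call; the transcript is made up and repeats its phrases so they clear min_freq=2:

transcript = ('machine learning is the study of algorithms. '
              'machine learning and algorithms are everywhere.')
print(keyword_extraction(transcript))  # at most five keyword strings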
Code example #21
def reviewscore(request):
    rake = Rake()
    positive = request.POST['positive']
    negative = request.POST['negative']

    if len(positive) == 0:
        positive = "No positive"
    if len(negative) == 0:
        #message = 'You searched for: %r' % request.GET['negative']
        negative = "No negative"

    positiveResult = rake.apply(positive)
    negativeResult = rake.apply(negative)

    # Sum the RAKE scores on each side of the review.
    positiveScore = sum(score for _keyword, score in positiveResult)
    negativeScore = sum(score for _keyword, score in negativeResult)

    totalScore = positiveScore - negativeScore

    expectedReviewScore = 0.18 * totalScore + 8.31

    # limit expected score range from 0 to 10
    if expectedReviewScore > 10.0:
        expectedReviewScore = 10.0
    elif expectedReviewScore < 0.0:
        expectedReviewScore = 0.0
    else:
        expectedReviewScore = round(expectedReviewScore, 2)

    # import actual data for actual user score
    reviewsRawData = pd.read_csv(
        "../data/Hotel_Reviews.csv",
        usecols=['Positive_Review', 'Negative_Review', 'Reviewer_Score'])
    resultTuple = reviewsRawData[
        reviewsRawData["Positive_Review"].str.contains(positive)]

    actual = ' | This review is not from database'
    analysis = ''

    # Handle the case where the review does not exist in the database.
    actualScores = resultTuple["Reviewer_Score"].values
    if len(actualScores) > 0 and actualScores[0] > 0.0:
        actualScore = actualScores[0]
        actual = ' | Actual ' + str(actualScore) + ' | Accuracy: '
        if expectedReviewScore > actualScore:
            diff = expectedReviewScore - actualScore
            analysis = str(round(100 - (diff / expectedReviewScore * 100), 2))
        else:
            diff = actualScore - expectedReviewScore
            analysis = str(round(100 - (diff / actualScore * 100), 2))
        analysis = analysis + '%'

    result = "User Rating: Predicted " + str(
        expectedReviewScore) + actual + analysis
    return HttpResponse(result)
Code example #22
File: nlpFilterSkills.py Project: lup-/tg_digger
# keywords = rake.apply(clear_text)
# print(keywords)

with open('skills.txt') as f:
    num_lines = f.read().count('\n')
bar = IncrementalBar('Processing', max=num_lines)

keywords_hash = {}
line_number = 0
with open('skills.txt') as infile:
    for line in infile:
        bar.next()
        keywords = []
        # Each line holds comma-separated skill entries; extract keywords
        # from every entry separately.
        for enum in line.split(','):
            clear_line = preprocess_text(enum)
            enum_keywords = rake.apply(clear_line)
            keywords += enum_keywords
        for keywordData in keywords:
            keyword, score = keywordData
            occurence_data = {'score': score, 'text': line.strip()}
            if keyword in keywords_hash:
                keywords_hash[keyword].append(occurence_data)
            else:
                keywords_hash[keyword] = [occurence_data]

bar.finish()

keywords = [{
    "normal": keyword,
    "variants": variants
Code example #23
def keywords():

    stopwords = {
        'a', 'á', 'acerca', 'además', 'adonde', 'al', 'algo', 'algún',
        'alguna', 'algunas', 'alguno', 'algunos', 'allende', 'ambos', 'amén',
        'ampleamos', 'ante', 'antes', 'aquel', 'aquella', 'aquellas',
        'aquellos', 'aqui', 'arriba', 'atras', 'aun', 'bajo', 'bastante',
        'bien', 'cabe', 'cabo', 'cada', 'cierta', 'ciertas', 'cierto',
        'ciertos', 'circa', 'como', 'con', 'conmigo', 'connosco',
        'conseguimos', 'conseguir', 'consigo', 'consigue', 'consiguen',
        'consigues', 'contigo', 'contra', 'convosco', 'convusco', 'cual',
        'cuando', 'de', 'dejante', 'del', 'delas', 'denominada', 'denominadas',
        'denominado', 'denominados', 'dentro', 'desde', 'después', 'donde',
        'dos', 'durante', 'e', 'el', 'él', 'ella', 'ellas', 'ellos',
        'empleais', 'emplean', 'emplear', 'empleas', 'empleo', 'en', 'encima',
        'entonces', 'entre', 'era', 'erais', 'eramos', 'éramos', 'eran',
        'erar', 'eras', 'eres', 'es', 'esa', 'esas', 'ese', 'eso', 'esos',
        'esta', 'está', 'estaba', 'estabais', 'estábamos', 'estaban',
        'estabas', 'estad', 'estada', 'estadas', 'estado', 'estados', 'estais',
        'estáis', 'estamos', 'estan', 'están', 'estando', 'estar', 'estará',
        'estarán', 'estarás', 'estaré', 'estaréis', 'estaremos', 'estaría',
        'estaríais', 'estaríamos', 'estarían', 'estarías', 'estas', 'estás',
        'este', 'esté', 'estéis', 'estemos', 'estén', 'estés', 'esto', 'estos',
        'estoy', 'estuve', 'estuviera', 'estuvierais', 'estuviéramos',
        'estuvieran', 'estuvieras', 'estuvieron', 'estuviese', 'estuvieseis',
        'estuviésemos', 'estuviesen', 'estuvieses', 'estuvimos', 'estuviste',
        'estuvisteis', 'estuvo', 'excepto', 'existente', 'existentes', 'fin',
        'fue', 'fuera', 'fuerais', 'fuéramos', 'fueran', 'fueras', 'fueron',
        'fuerza', 'fuese', 'fueseis', 'fuésemos', 'fuesen', 'fueses', 'fui',
        'fuimos', 'fuiste', 'fuisteis', 'gueno', 'ha', 'habéis', 'haber',
        'había', 'habíais', 'habíamos', 'habían', 'habías', 'habida',
        'habidas', 'habido', 'habidos', 'habiendo', 'habrá', 'habrán',
        'habrás', 'habré', 'habréis', 'habremos', 'habría', 'habríais',
        'habríamos', 'habrían', 'habrías', 'hace', 'haceis', 'hacemos',
        'hacen', 'hacer', 'haces', 'hacia', 'hago', 'han', 'has', 'hasta',
        'hay', 'haya', 'hayáis', 'hayamos', 'hayan', 'hayas', 'haz', 'he',
        'hemo', 'hemos', 'hube', 'hubiera', 'hubierais', 'hubiéramos',
        'hubieran', 'hubieras', 'hubieron', 'hubiese', 'hubieseis',
        'hubiésemos', 'hubiesen', 'hubieses', 'hubimos', 'hubiste',
        'hubisteis', 'hubo', 'incluso', 'intenta', 'intentais', 'intentamos',
        'intentan', 'intentar', 'intentas', 'intento', 'ir', 'la', 'largo',
        'las', 'le', 'les', 'lo', 'los', 'más', 'me', 'mediante', 'menos',
        'mi', 'mí', 'mía', 'miar', 'mías', 'mientras', 'mio', 'mío', 'míos',
        'mis', 'modode', 'mucho', 'muchos', 'muy', 'na', 'nada', 'ni', 'no',
        'nos', 'nosotras', 'nosotros', 'nuestra', 'nuestras', 'nuestro',
        'nuestros', 'nunca', 'o', 'os', 'otra', 'otras', 'otro', 'otros', 'pa',
        'pa\'', 'par', 'para', 'pero', 'poco', 'podeis', 'podemos', 'poder',
        'podria', 'podriais', 'podriamos', 'podrian', 'podrias', 'por',
        'porque', 'primero', 'pro', 'puede', 'pueden', 'puedo', 'pues', 'que',
        'qué', 'quien', 'quienes', 'sabe', 'sabeis', 'sabemos', 'saben',
        'saber', 'sabes', 'salvo', 'se', 'sea', 'seáis', 'seamos', 'sean',
        'seas', 'según', 'sentid', 'sentida', 'sentidas', 'sentido',
        'sentidos', 'sentir', 'ser', 'será', 'serán', 'serás', 'seré',
        'seréis', 'seremos', 'sería', 'seríais', 'seríamos', 'serían',
        'serías', 'si', 'sí', 'sido', 'siendo', 'siente', 'sin', 'sintiendo',
        'so', 'sobre', 'sois', 'solamente', 'solo', 'somos', 'son', 'soy',
        'su', 'sus', 'suya', 'suyas', 'suyo', 'suyos', 'también', 'tanto',
        'te', 'tendrá', 'tendrán', 'tendrás', 'tendré', 'tendréis',
        'tendremos', 'tendría', 'tendríais', 'tendríamos', 'tendrían',
        'tendrías', 'tened', 'teneis', 'tenéis', 'tenemos', 'tener', 'tenga',
        'tengáis', 'tengamos', 'tengan', 'tengas', 'tengo', 'tenía', 'teníais',
        'teníamos', 'tenían', 'tenías', 'tenida', 'tenidas', 'tenido',
        'tenidos', 'teniendo', 'ti', 'tiempo', 'tiene', 'tienen', 'tienes',
        'todo', 'todos', 'trabaja', 'trabajais', 'trabajamos', 'trabajan',
        'trabajar', 'trabajas', 'trabajo', 'tras', 'tu', 'tú', 'tus', 'tuve',
        'tuviera', 'tuvierais', 'tuviéramos', 'tuvieran', 'tuvieras',
        'tuvieron', 'tuviese', 'tuvieseis', 'tuviésemos', 'tuviesen',
        'tuvieses', 'tuvimos', 'tuviste', 'tuvisteis', 'tuvo', 'tuya', 'tuyas',
        'tuyo', 'tuyos', 'ultimar', 'ultimo', 'un', 'un', 'una', 'unas', 'uno',
        'unos', 'usa', 'usais', 'usamos', 'usan', 'usar', 'usas', 'utilizando',
        'uso', 'va', 'vais', 'valor', 'vamos', 'van', 'vaya', 'verdad',
        'verdadera', 'verdadero', 'versus', 'vía', 'vosostras', 'vosostros',
        'vosotras', 'vosotros', 'voy', 'vuestra', 'vuestras', 'vuestro',
        'vuestros', 'vusco', 'y', 'ya', 'yo', 'optimizando', 'actualmente',
        'llevar', 'manera', 'podrán', 'reduciendo', 'brindar'
    }

    text_es = request.json['text']
    stopwords_list = request.json['stopwords']
    stopwords_list_clean = re.sub(r'[\[\]]', '', stopwords_list).split(", ")
    stopwords.update(stopwords_list_clean)

    print(stopwords)

    # Convert markdown to plain text and strip symbols and URLs.
    html = markdown.markdown(text_es)
    plain_text = html2text.html2text(html)
    plain_text = re.sub(r'[!@·_•*\[\]/#$]', '', plain_text)
    plain_text = plain_text.replace('\\n', '. ')
    plain_text = re.sub(r'http\S+', '', plain_text, flags=re.MULTILINE)
    plain_text = re.sub(r'www\S+', '', plain_text, flags=re.MULTILINE)

    result = []

    # two-word keywords
    rake = Rake(language_code='es',
                max_words=2,
                generated_stopwords_max_len=20,
                stopwords=stopwords)
    key_words_2 = rake.apply(plain_text)
    keys2 = key_words_2[:20]

    for key in keys2:
        item = {"name": key[0], "score": key[1]}
        result.append(item)

    # one-word keywords
    rake = Rake(language_code='es',
                max_words=1,
                generated_stopwords_max_len=20,
                stopwords=stopwords)
    key_words_1 = rake.apply(plain_text)
    keys1 = key_words_1[:50]

    for key in keys1:
        if not resultContains(result, key[0]):
            item = {"name": key[0], "score": key[1]}
            if len(result) < 40:
                result.append(item)

    response = {"keywords": result}

    print(response)

    return response
Code example #24
def home():
    score = 0
    results_score = ''
    results_feedback = ''
    reaction = ''
    temp = ''
    temp2 = ''
    terms_form = ''
    res_good = []
    res_bad = []
    bad_words = {
        "proprietary notice language": 0,
        "reasonable attorneys’ fees": 0,
        "assume total responsibility": 0,
        "communication line failure": 0,
        "attorneys’ fees": 0,
        "similar fees": 0,
        "applicable prices": 0,
        "publicly displayed": 0,
        "manipulate identifiers": 0,
        "losses incurred": 0,
        "injuries caused": 0,
        "irreparable harm": 0,
        "computer virus": 0,
        "apple’s failure": 0,
        "apple’s control": 0,
        "governmental request": 0,
        "out-of-pocket expenses": 0,
        "oral agreements": 0,
        "destructive features": 0,
        "punitive damages": 0,
        "monetary damages": 0,
        "third-party applications connected": 0,
        "re-export control laws": 0,
        "modified additional terms": 0,
        "stop providing services": 0,
        "expressly override": 0,
        "constantly changing": 0,
        "non-exclusive license": 0,
        "remove functionalities": 0,
        "apply retroactively": 0,
        "alleged infringing material": 0,
        "affiliated companies": 0,
        "manual process": 0,
        "mail lists": 0,
        "reverse engineer": 0,
        "trade secret": 0,
        "accounting fees": 0,
        "lost data": 0,
        "external websites": 0,
        "fully responsible": 0,
        "password information": 0,
        "post advertisements": 0,
        "conditions waive": 0,
        "remove communications": 0
    }

    good_words = {
        "intellectual property rights": 0,
        "account information secure": 0,
        "completely private": 0,
        "good faith": 0,
        "accessible worldwide": 0,
        "equitable relief": 0,
        "relief granted": 0,
        "competent jurisdiction": 0,
        "reasonable time": 0,
        "copyrights rights": 0,
        "information secure": 0,
        "apple’s liability": 0,
        "reasonable advance notice": 0,
        "party beneficiary rights": 0,
        "open source license": 0,
        "open source software": 0,
        "legal notices displayed": 0,
        "safety laws": 0,
        "password confidential": 0,
        "malware detection": 0,
        "privacy": 0,
        "worldwide license": 0,
        "submit feedback": 0,
        "reasonable requests assisting": 0,
        "good faith belief": 0,
        "limitation security-related features": 0,
        "legally binding agreement": 0,
        "license rights granted": 0,
        "copyright owner's behalf": 0,
        "license includes access": 0,
        "royalty-free license": 0,
        "confidential information": 0
    }

    rake = Rake()

    if request.method == "POST":
        terms_form = request.form.get("input")
        kw = rake.apply(terms_form)
        for word in kw:
            if word[0] in good_words:
                good_words[word[0]] += 1
                res_good.append(word[0])
            if word[0] in bad_words:
                bad_words[word[0]] += 1
                res_bad.append(word[0])

        score = round(len(res_good) / (len(res_good) + len(res_bad)) * 175, 2)
        results_score = "The score is {}%".format(score)

        if 0 <= score <= 50:
            results_feedback = "Poor"
        elif 50 < score <= 65:
            results_feedback = "Average"
        elif 65 < score <= 80:
            results_feedback = "Good"
        elif 80 < score <= 100:
            results_feedback = "Excellent"

        temp = ", ".join(res_good)
        temp2 = ", ".join(res_bad)

    return render_template('index.html',
                           results_score=results_score,
                           results_feedback=results_feedback,
                           reaction=reaction,
                           output=temp,
                           output2=temp2)
Code example #25
    generated_stopwords_percentile=80,
    generated_stopwords_max_len=3,
    generated_stopwords_min_freq=2,
)

docs = []  # contents of each opened training file

for path in trainFiles:
    with open(path, encoding="latin-1") as raw:
        docs.append(raw.read())

wordsByRake = []  # RAKE keywords extracted from each document

for doc in docs:
    wordsByRake.append(rake.apply(doc))

def listOfLists(lst):
    # Keep keywords scoring above 4.0; change the threshold to control
    # the number of tags.
    return [keyword for keyword, score in lst if score > 4.0]

for i in range(len(wordsByRake)):
    trainDocs.insert(i, listOfLists(wordsByRake[i]))

wordsByRake = []  # release memory
docs = []  # release memory

nlp = en_core_web_sm.load() #loading the model
Code example #26
        ans.append(sentences[idx + 1])
    
    sentences = []

# further refine question and answer sentences
del questions[1::2]
del ans[1::2]

# trim data size to defined size
questions =  questions[:size]
ans = ans[:size]

# create keywords for each sentence using Rake
key = []
for q in questions:
    keyword = rake.apply(" ".join(re.findall(r"\w+", q.lower())))
    if len(keyword) != 0:
        # Use the top-scoring keyword for this question.
        key.append(keyword[0][0])
    else:
        # Fall back to the first word of the question itself.
        key.append(q.split()[0])
            


try:
    with open("data.pickle", "rb") as f:
        words, labels, training, output = pickle.load(f)
except Exception:
    words = []
    labels = []
    docs_x = []
    docs_y = []