Example #1
def get_phrases(text=''):
    rake = Rake()
    rake.extract_keywords_from_text(''.join(text))
    phrases = rake.get_ranked_phrases()
    # Slicing copes with fewer than five phrases, so no length check is needed.
    return phrases[:5]
Example #2
def phrase(ques):
    new_list = []
    r = Rake()
    r.extract_keywords_from_text(ques)
    for item in r.get_ranked_phrases():
        new_list.extend(item.lower().split())
    return new_list
Example #3
def extract_keywords_from_doc(doc, phrases=True, return_scores=False):
    if phrases:
        r = Rake()
        if isinstance(doc, (list, tuple)):
            r.extract_keywords_from_sentences(doc)
        else:
            r.extract_keywords_from_text(doc)
        if return_scores:
            return [(b, a) for a, b in r.get_ranked_phrases_with_scores()]
        else:
            return r.get_ranked_phrases()
    else:
        if not isinstance(doc, (list, tuple)):
            doc = [doc]
        ret = []
        for x in doc:
            for t in nltk.word_tokenize(x):
                if t.lower() not in stop_words:
                    ret.append(t)
        return ret
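A short usage sketch (not part of the original snippet), assuming `stop_words` is NLTK's English stopword set, as the token branch above expects:

import nltk
from nltk.corpus import stopwords
from rake_nltk import Rake

stop_words = set(stopwords.words('english'))

doc = "Compatibility of systems of linear constraints is considered."
print(extract_keywords_from_doc(doc))                      # ranked phrases
print(extract_keywords_from_doc(doc, return_scores=True))  # (phrase, score) pairs
print(extract_keywords_from_doc(doc, phrases=False))       # non-stopword tokens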
Example #4
def predict_rake(tasks, order, phrases):
    predictions = []
    expected = []
    durations = []
    r = Rake()
    for task, actual in zip(tasks, order):
        scores = dict()
        cover_scores = dict()
        expected.append(actual)
        words = []
        cover = {}
        
        for _, row in phrases.iterrows():
            search_terms = word_tokenize(row["phrase"])
            search_terms = [x for x in search_terms if not x in stop_words]
            search_terms = [x for x in search_terms if len(x) > 2]
            search_terms = [x for x in search_terms if x in model]

            occurs = 0
            coverage = 0
            covered = []
            
            for word in search_terms:
                if word in task:
                    occurs += task[word]
                    coverage += 1
                    covered.append(word)
            # Normalize once, after the loop, and guard against phrases
            # whose terms were all filtered out.
            if search_terms:
                coverage = coverage / len(search_terms)

            scores[row["expected"]] = occurs
            cover_scores[row["expected"]] = coverage

            cover[row["expected"]] = [(x, task[x]) for x in covered]

        scores = normalize_score(scores)
        cover_scores = normalize_score(cover_scores)

        # Blend occurrence and coverage scores; the current weights keep
        # only the occurrence component.
        for key in scores.keys():
            scores[key] = scores[key] * 1 + cover_scores[key] * 0

        predictions.append(get_prediction(scores))

    return predictions, expected
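normalize_score and get_prediction are project helpers that are not shown above; a plausible minimal sketch, under the assumption that scores are min-max normalized and the highest-scoring label wins:

def normalize_score(scores):
    # Hypothetical helper: min-max normalize a dict of scores.
    if not scores:
        return scores
    lo, hi = min(scores.values()), max(scores.values())
    span = (hi - lo) or 1.0
    return {k: (v - lo) / span for k, v in scores.items()}

def get_prediction(scores):
    # Hypothetical helper: return the label with the highest score.
    return max(scores, key=scores.get)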
Example #5
def get_key_phrases_from_text(text, max_length=None):
    """
    Find key phrases within an html page.

    :param text: the text from an html page.
    :type text: str
    :param max_length: the max length of each key phrase.
    :type max_length: int or None
    :return: a list of all key phrases within the text.
    :rtype: list of str
    """

    if max_length is not None:
        r = Rake(max_length=max_length)
    else:
        r = Rake()
    r.extract_keywords_from_text(text)
    return filter_key_phrases(r.get_ranked_phrases())
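filter_key_phrases is defined elsewhere in that project; a plausible stand-in that simply de-duplicates while preserving rank order (an assumption, not the original helper):

def filter_key_phrases(phrases):
    # Hypothetical stand-in: drop duplicate phrases, keep ranking order.
    seen = set()
    return [p for p in phrases if not (p in seen or seen.add(p))]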
Example #6
    def _get_keyphrases(self):
        # Extract keywords and phrases from the current document so we know
        # what to search for in ES.
        r = Rake()
        r.extract_keywords_from_text(self.instance.content)
        key_phrases = [
            keyphrase[1] for keyphrase in r.get_ranked_phrases_with_scores()
            if keyphrase[0] >= self.min_rank_score
        ]
        if not key_phrases:
            key_phrases = [
                keyphrase[1] for keyphrase in
                r.get_ranked_phrases_with_scores()[:self.default_list_length]
            ]

        return key_phrases
Example #7
class KeywordFinder:
    rake: Rake

    def __init__(self):
        self.rake = Rake(min_length=1, max_length=5)

    def find_keyword(self, text):
        self.rake.extract_keywords_from_text(text)
        return self.rake.get_ranked_phrases()[0]

    def find_keywords(self, text):
        self.rake.extract_keywords_from_text(text)
        return self.rake.get_ranked_phrases()
Example #8
def analyse(tab):
    keys = [
        'Наименование продукта/технологии', 'Уникальные характеристики',
        'Задачи, которые решает продукт', 'Технические характеристики',
        'Ожидаемые эффекты'
    ]
    res = []
    # Pass the language to the constructor so the Russian stopword list is
    # actually loaded; assigning r.language after construction has no effect.
    r = Rake(language="russian")
    # Extraction given the text.
    for i in tab:
        text = "\n".join(list(map(lambda x: i[x], keys)))
        r.extract_keywords_from_text(text)
        ranked = r.get_ranked_phrases_with_scores()
        res.append(ranked)
    return res
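Rake(language=...) only works when NLTK's stopword corpus for that language is available locally; a minimal setup sketch:

import nltk
nltk.download('stopwords')    # this corpus includes Russian stopwords
from rake_nltk import Rake

r = Rake(language="russian")  # loads stopwords.words("russian") internally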
Example #9
def search_keyword(request):
    if request.method == "POST":
        # .get() returns None when the field is missing instead of raising KeyError.
        search_title = request.POST.get("search_title")
        if search_title is None:
            return HttpResponse("found")
        user = User.objects.get(pk=UserDataManagement.MainData.EnteredUser.UserEmail)
        history = History()
        history.Date_Time = datetime.datetime.now()
        history.SearchTitle = search_title
        history.UserEmailFK = user
        history.save()
        topics = Topic.objects.all()
        for topic in topics:
            if (str(topic.TopicName).lower()) == (str(search_title).lower()):
                return render(request, "Search.html", {"articles": topic.article_set.all(),
                                                       "UserName": UserDataManagement.MainData.EnteredUser.UserName,
                                                       "check": "1"})

        Newslist = []
        RakeAlgoritm = Rake()
        RakeAlgoritm.extract_keywords_from_text(search_title)
        KeyWordsList1 = RakeAlgoritm.get_ranked_phrases()
        for topic in topics:
            for article in topic.article_set.all():
                RakeAlgoritm.extract_keywords_from_text(article.ArticleDescription)
                KeyWordsList = RakeAlgoritm.get_ranked_phrases()
                intersection = set(KeyWordsList) & set(KeyWordsList1)
                if intersection:
                    Newslist.append(article.ArticleDescription)
        if len(Newslist) != 0:
            return render(request, "Search.html", {"articles": Newslist,
                                                   "UserName": UserDataManagement.MainData.EnteredUser.UserName,
                                                   "check": "2"})

    return HttpResponse("Notfound")
Example #10
    def parse_keywords(self):

        r = Rake()

        if self.keyword_limit == 0:
            sentence = self.sentence
            r.extract_keywords_from_text(sentence)
            score_words = r.get_ranked_phrases_with_scores()

            for keyword in score_words:
                if keyword[0] > 1:
                    self.keywords.append(keyword[1])

            return self.keywords

        else:
            sentences = [self.sentence]
            r.extract_keywords_from_sentences(sentences)
            keywords = r.ranked_phrases
            return keywords[0:self.keyword_limit]
Example #11
class KeywordExtraction:
    THRESHOLD = 3.5

    def __init__(self, text):
        self.r = Rake()
        self.text = text

    def return_keywords(self) -> list:
        self.r.extract_keywords_from_text(self.text)
        return self.r.get_ranked_phrases()

    def return_keywords_with_score(self) -> list:
        self.r.extract_keywords_from_text(self.text)
        return self.r.get_ranked_phrases_with_scores()

    def return_keywords_with_score_more_than_threshold(self) -> list:
        return [tup[1] for tup in self.return_keywords_with_score() if tup[0] > self.THRESHOLD]
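A brief usage sketch for KeywordExtraction (not part of the original snippet):

ke = KeywordExtraction("Criteria of compatibility of a system of linear "
                       "Diophantine equations are considered.")
print(ke.return_keywords())                                 # all ranked phrases
print(ke.return_keywords_with_score_more_than_threshold())  # phrases scoring above 3.5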
Example #12
def filter_bert(res, query, w, num, num_bert):

    r = Rake()
    text_corpus = []
    score_arr = []
    query = str(query)
    query_corpus = [query]
    res_new = []

    for i in range(len(res)):

        text = str(res[i]['_source']['title_body'])
        # remove stop words
        #text_tokens = word_tokenize(text)
        #tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]
        #text = (" ").join(tokens_without_sw)
        key_text = extract_key(text, w, r)
        text_corpus.append(key_text)

    text_emb = np.array(model.encode(text_corpus))
    query_emb = np.array(model.encode(query_corpus))
    query_emb = query_emb[0]

    #print("text emb size: ", text_emb.shape)
    #print("query emb size: ", query_emb.shape)

    #for t, emb in zip(text_corpus, text_emb):
    for emb in text_emb:
        score = scoring_bert(query_emb, emb)
        score_arr.append(score)

    score_arr = np.array(score_arr)
    max_ind = score_arr.argsort()[-num_bert:][::-1]
    for i in max_ind:
        res_new.append(res[i])

    return res_new
Example #13
    def __init__(self, name):
        super(App, self).__init__(name)

        print("[INFO] Loading models")

        cur_dir = os.path.dirname(os.path.realpath(__file__))

        gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        # add the EOS token as PAD token to avoid warnings
        gpt2_model = GPT2LMHeadModel.from_pretrained(
            "gpt2", pad_token_id=gpt2_tokenizer.eos_token_id
        )

        t5_model = T5ForConditionalGeneration.from_pretrained("t5-base")
        t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")
        t5_state_path = os.path.join(cur_dir, "./models/final.pt")
        t5_model.load_state_dict(torch.load(t5_state_path))

        print("[INFO] Initializing classes")

        self.spacy_model = SpacyModel(size="md")
        self.gpt2 = GPT2(gpt2_tokenizer, gpt2_model)
        self.t5 = T5(t5_tokenizer, t5_model, 100)
        self.r = Rake()

        token = os.getenv("ACCESS_TOKEN")

        dataset = os.path.join(cur_dir, './data/artists.txt')
        self.artists = open(dataset).readlines()

        self.client = GeniusClient(token)
Example #14
def shorten_title(title, max_title_len, alpha_only=True):
    """
    Shortens a title using important phrases and keywords in the title.

    Args:
        title (str): Title to shorten.
        max_title_len (int): Maximum length of the final title.
        alpha_only (bool): Whether to only use alphabetic characters.

    Returns:
        str: Shortened, all lower-case title no longer than `max_title_len`.
    """
    title = title.lower()
    if len(title) <= max_title_len:
        # Title is already short enough.
        return title

    if alpha_only:
        # Avoid shadowing the builtin `filter`.
        pattern = re.compile("[^a-z ]")
        title = pattern.sub("", title)

    # Try using the highest ranked phrase from the title.
    r = Rake()
    r.extract_keywords_from_text(title)
    new_title = r.get_ranked_phrases()[0]
    if len(new_title) <= max_title_len:
        return new_title

    # Title is still too long. Use as many of the important words as will fit within the max
    # title length.
    # Sort words by degree so the most important words come first.
    degrees = r.get_word_degrees()
    words = sorted(degrees, key=degrees.get, reverse=True)
    new_title = words[0]
    if len(new_title) > max_title_len:
        # Cut the single-word title short.
        return new_title[:max_title_len]

    for w in words[1:]:
        append_title = "{} {}".format(new_title, w)
        if len(append_title) > max_title_len:
            break
        new_title = append_title

    return new_title
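A usage sketch (assumes rake_nltk and its NLTK data are installed; the exact output depends on RAKE's ranking):

title = "A Comprehensive Survey of Rapid Automatic Keyword Extraction Techniques"
print(shorten_title(title, max_title_len=30))
# e.g. 'rapid automatic keyword'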
Example #15
    def test_extract_keywords_from_text(self):
        r = Rake()

        r.extract_keywords_from_text(self.test_text)

        ranked_phrases = [
            'minimal generating sets', 'linear diophantine equations',
            'minimal supporting set', 'minimal set', 'linear constraints',
            'upper bounds', 'strict inequations', 'nonstrict inequations',
            'natural numbers', 'mixed types', 'corresponding algorithms',
            'considered types', 'set', 'types', 'considered', 'algorithms',
            'used', 'systems', 'system', 'solving', 'solutions', 'given',
            'criteria', 'construction', 'constructing', 'components',
            'compatibility'
        ]
        self.assertEqual(r.get_ranked_phrases(), ranked_phrases)
        self.assertEqual(
            [phrase for _, phrase in r.get_ranked_phrases_with_scores()],
            ranked_phrases)
Example #16
def test_extract_keywords_from_text_word_frequency_metric():
    r = Rake(ranking_metric=Metric.WORD_FREQUENCY)
    r.extract_keywords_from_text(text)
    ranked_phrases = [
        'minimal supporting set',
        'minimal set',
        'minimal generating sets',
        'considered types',
        'systems',
        'systems',
        'systems',
        'systems',
        'mixed types',
        'linear diophantine equations',
        'types',
        'strict inequations',
        'solutions',
        'solutions',
        'solutions',
        'set',
        'nonstrict inequations',
        'linear constraints',
        'corresponding algorithms',
        'upper bounds',
        'natural numbers',
        'criteria',
        'criteria',
        'considered',
        'compatibility',
        'compatibility',
        'algorithms',
        'used',
        'system',
        'solving',
        'given',
        'construction',
        'constructing',
        'components',
    ]

    assert r.get_ranked_phrases() == ranked_phrases
    assert [phrase for _, phrase in r.get_ranked_phrases_with_scores()
            ] == ranked_phrases
Example #17
def Extract(train_data, test_data, max_score, j, Enter_rank=True):
    train, test = Rake(), Rake()
    train.extract_keywords_from_text(train_data)
    test.extract_keywords_from_text(test_data)
    train_keywords = lematize(break_phrases(train.get_ranked_phrases()))
    test_keywords = lematize(break_phrases(test.get_ranked_phrases()))
    for x in test_keywords:
        print(x)
        testlist.append(x)
    result = 0
    # Avoid shadowing the builtin `dict`.
    keyword_scores = key[j]
    trainlist = list(keyword_scores.keys())
    print(keyword_scores)
    print(trainlist)
    i = 0
    for x in testlist:
        if x in keyword_scores:
            print(x)
            result = result + (keyword_scores[x] * max_score) / 100
            print(result, keyword_scores[x])
            i = i + 1
        else:
            syn = PyDictionary.PyDictionary().synonym(trainlist[i])
            if syn is None:
                continue
            print(syn)
            for s in syn:
                if s in testlist:
                    print(trainlist[i], s)
                    print(keyword_scores)
                    print(x)
                    # Credit the matched synonym with the training keyword's score.
                    keyword_scores[s] = (keyword_scores[trainlist[i]] * max_score) / 100
                    result = result + keyword_scores[s] * max_score
                    matched.append(i)
            i = i + 1
    app.startSubWindow("one", modal=True)
    app.addLabel("l1", result)
    app.stopSubWindow()

    app.addButton("get score", score)
Example #18
def keywords_extraction(article, method, k=20, with_weight=False):
    doc = ""
    if method == 0:
        model = lda.build_lda_model(article, 1)
        return lda.get_topic(model,
                             num_topics=1,
                             num_words=k,
                             with_weight=with_weight)[0]
    if method == 1:
        if isinstance(article, str):
            article = [article]
        text_list = text_process.general_processing_file(article)
        for arti in text_list:
            doc += arti
        return jieba.analyse.extract_tags(doc,
                                          topK=k,
                                          withWeight=with_weight,
                                          allowPOS=())
    elif method == 2:
        if isinstance(article, str):
            article = [article]
        article = text_process.general_processing_file(article)
        for arti in article:
            doc += arti
        return textrank.extract_key_phrases(doc)
    elif method == 3:
        if isinstance(article, str):
            article = [article]
        article = text_process.text_processing_rake(article)
        for arti in article:
            doc += arti
        r = Rake()
        r.extract_keywords_from_text(doc)
        rank = r.get_ranked_phrases()
        # Use integer division; a float slice index raises TypeError in Python 3.
        if not with_weight:
            return rank[:len(rank) // 2 + 1]
        score = r.get_ranked_phrases_with_scores()
        return score[:len(rank) // 2]
    #docs_phase
    else:
        raise ValueError('wrong method code')
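lda, jieba, textrank, and text_process are project-local modules that are not shown here; a call sketch for the RAKE path (method 3) only:

# Hypothetical call; text_process.text_processing_rake must be importable.
top_half = keywords_extraction("some article text ...", method=3)
scored = keywords_extraction("some article text ...", method=3, with_weight=True)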
Example #19
def extract_keywords():
    #read data
    user_data = dataset.iloc[:, 0]
    review_data = dataset.iloc[:, 1]
    #new list to remove stopwords
    review = []
    for data in review_data:
        review.append(data)

    #extracting keywords
    keywords = []
    #creating object for the class
    rake = Rake()

    for data in review:
        rake.extract_keywords_from_text(data)
        keywords.append(rake.get_ranked_phrases())

    #print(keywords)
    sentiment_result = []
    st = SentimentIntensityAnalyzer()
    # joining the keywords into space-separated strings
    for stmt in keywords:
        words = " ".join(str(e) for e in stmt)
        sentiment_result.append(words)

    #print(sentiment_result)
    result = []
    for statement in sentiment_result:
        ss = st.polarity_scores(statement)
        for k in ss:
            result.append([k, ss[k]])

    return sentiment_result
Example #20
def run_rake_model(posts, rake_limit):
    # from nltk.corpus import stopwords
    # stop_words = stopwords.words('english')
    # stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
    # data_words = list(sent_to_words(posts))
    # data_words_nostops = remove_stopwords(data_words)
    # data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ'])
    # print(data_lemmatized)
    # all_tokens = [j for i in data_lemmatized for j in i]
    # combined_text = " ".join(all_tokens)

    combined_text = " ".join(posts)

    # text = ["RAKE short for Rapid Automatic Keyword Extraction algorithm, " \
    #        "is a domain independent keyword extraction algorithm which tries " \
    #        "to determine key phrases in a body of text by analyzing the frequency " \
    #        "of word appearance and its co-occurance with other words in the text."]

    r = Rake(max_length=3,
             min_length=1,
             ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO)
    # print('lemmatized',data_lemmatized)
    # total_data = []
    # for each in data_lemmatized:
    #     total_data+=each
    # print(total_data)
    # cleaned_text = " ".join(total_data)
    # print('cleaned',cleaned_text)
    # print('combined',text)
    r.extract_keywords_from_text(combined_text)
    # To get keyword phrases ranked highest to lowest.
    res = r.get_ranked_phrases_with_scores()
    res_words = r.get_ranked_phrases()
    # print(res)
    # print(res_words)
    return res_words[:100]


# run_rake_model("F://Armitage_project/crawl_n_depth/extracted_json_files/www.axcelerate.com.au_0_data.json",50)
Example #21
def tagging(filename):

    df = pd.read_excel(filename)
    cols = df.columns.values
    r = Rake()
    df2 = pd.DataFrame()
    for i in cols:
        # print(i)
        sent = [str(j) for j in df[i].values if j != 0]
        # print(sent)
        r.extract_keywords_from_text(" ".join(sent))
        # print(r.get_word_frequency_distribution())
        # print(r.get_word_degrees())
        fdis = r.get_word_frequency_distribution()
        wdig = r.get_word_degrees()
        fdis_ls = []
        wdig_ls = []
        wdig = {
            a: b
            for a, b in sorted(
                wdig.items(), key=lambda item: item[1], reverse=True)
        }
        # print(wdig)
        for j in fdis.most_common():
            # print(j[0])
            if len(j[0]) > 3:
                fdis_ls.append(j[0])
        for j in wdig.keys():
            if len(j) > 3:
                wdig_ls.append(j)
        print(fdis_ls[:5])
        print(wdig_ls[:5])
        res = [fdis_ls[:5], wdig_ls[:5]]
        df2[i] = res

        # break
    df2.to_excel("datafile/tagged.xlsx")
Example #22
    def test_extract_keywords_from_text_word_frequency_metric(self):
        r = Rake(ranking_metric=Metric.WORD_FREQUENCY)
        r.extract_keywords_from_text(self.text)
        ranked_phrases = [
            "minimal supporting set",
            "minimal set",
            "minimal generating sets",
            "considered types",
            "mixed types",
            "linear diophantine equations",
            "types",
            "strict inequations",
            "set",
            "nonstrict inequations",
            "linear constraints",
            "corresponding algorithms",
            "upper bounds",
            "natural numbers",
            "considered",
            "algorithms",
            "used",
            "systems",
            "system",
            "solving",
            "solutions",
            "given",
            "criteria",
            "construction",
            "constructing",
            "components",
            "compatibility",
        ]
        self.assertEqual(r.get_ranked_phrases(), ranked_phrases)
        self.assertEqual(
            [phrase for _, phrase in r.get_ranked_phrases_with_scores()],
            ranked_phrases,
        )
Example #23
def index(request):
    textform = TextForm()
    if request.method == 'POST':
        text = request.POST.get("text")
        text_object = Text.objects.create(text=text)
        r = Rake()
        r.extract_keywords_from_text(text)
        list_phrases = r.get_ranked_phrases()
        for phrase in list_phrases:
            params = {
                'action': 'wbsearchentities',
                'format': 'json',
                'language': 'en',
                'search': phrase
            }
            # Use a separate name so the Rake instance bound to `r` above
            # is not clobbered inside the loop.
            resp = requests.get(API_ENDPOINT, params=params)
            result = resp.json()['search']
            exist = bool(result)
            disambiguation = len(result) > 1
            text_object.keyphrase_set.create(phrase=phrase,
                                             exist=exist,
                                             disambiguation=disambiguation)

        return render(request, "index.html", {"form": textform})

    return render(request, "index.html", {
        "form": textform,
        "top_keyphrase": df['phrase'].to_list()[::-1][:10]
    })
Example #24
        .map(lambda  row : extract_with_row_id(row["id"], row["summary"]))\
        .flatMap(lambda xs: [(x) for x in xs])

    all_keywords_list = [
        keywords_from_content, keywords_from_title, keywords_from_keywords_col,
        keywords_from_meta_keywords, keywords_from_meta_description,
        keywords_from_tags, keywords_from_summary
    ]

    all_keywords_rdd = sc.union(all_keywords_list)
    all_keywords_rdd = all_keywords_rdd\
        .filter(lambda row: len(row[0]) > 2)\
        .reduceByKey(concat)

    all_keywords_df = all_keywords_rdd.toDF(["Keyword", "RowId & Score"])

    all_keywords_df.write.csv(outputfolderpath,
                              header=True,
                              quote='"',
                              escape='"')

    sc.stop()


rake = Rake()
inputfolderpath = sys.argv[1]
outputfolderpath = sys.argv[2]
jobname = sys.argv[3]

main(inputfolderpath, outputfolderpath, jobname)
Example #25
df = pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')

df = df[['Title', 'Genre', 'Director', 'Actors', 'Plot']]

#cleaning

# initializing the new column
df['Key_words'] = ""

for index, row in df.iterrows():
    plot = row['Plot']

    # instantiating Rake; by default it uses English stopwords from NLTK
    # and discards all punctuation characters as well
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)

    # getting the dictionary with key words as keys and their scores as values
    key_words_dict_scores = r.get_word_degrees()

    # assigning the key words to the new column for the corresponding movie;
    # write through df.at, since assigning to the iterrows() row only
    # modifies a copy, not the DataFrame itself
    df.at[index, 'Key_words'] = list(key_words_dict_scores.keys())

# dropping the Plot column
df.drop(columns=['Plot'], inplace=True)

# processing all the rows into single lowercase words to avoid duplicates;
# the Title column is left uncleaned, since it is the target variable for our system
Example #26
def SA():
    r = Rake()
    # Opens file and reads in training data
    # NB classifier trains using the read in data
    with open("datasets/trainingData.csv", 'r') as trainingdata:
        classifier = NaiveBayesClassifier(trainingdata, format="csv")
        print("Training Data")
        classifier.show_informative_features(15)

    # Opens file and reads in testing data
    # Prints testing data accuracy
    # Not needed for final product

    with open("datasets/testingData.csv", 'r') as testingdata:
        print("Testing data accuracy", classifier.accuracy(testingdata))

    # Asks for user input
    userInput = input("Please provide a test input: ")

    # Removes all non letter characters
    regex = re.compile('[^a-zA-Z ]')
    punctuationRemoved = regex.sub('', userInput)
    print("Punctuation removed: ", punctuationRemoved)

    # Defines stopwords
    stop_words = set(stopwords.words('english'))

    # Takes user input, removes stopwords
    word_tokens = word_tokenize(punctuationRemoved)

    # Keeps only the words that are not stop words
    filtered_sentence = [w for w in word_tokens if w not in stop_words]

    # Prints list to see new sentence with stopwords removed
    print("Stopwords removed: ", filtered_sentence)

    # Converts the filtered stop word sentence to string
    stringWithoutStopwords = ' '.join(
        [str(elem) for elem in filtered_sentence])

    # Extracts keywords from the filtered sentence
    r.extract_keywords_from_text(stringWithoutStopwords)

    # Ranks the keywords that have been extracted
    ranked_phrases = r.get_ranked_phrases()

    print("Keywords extracted: ", ranked_phrases)

    # Converts extracted keywords list to string
    listToStr = ' '.join([str(elem) for elem in ranked_phrases])

    # Runs string through trained NB classifier
    finalString = TextBlob(listToStr, classifier=classifier)

    # Print string followed by classification
    print("String followed by classification: ", finalString,
          finalString.classify())
    if finalString.classify() == "pos":
        binaryClassify = 1
    else:
        binaryClassify = 0

    print(binaryClassify)
Example #27
def parser(text: str):
    r = Rake()
    r.extract_keywords_from_text(text)
    # get_ranked_phrases_with_scores() returns the ranked (score, phrase) list
    return r.get_ranked_phrases_with_scores()
Example #28
ES = ES[['Titulo', 'autores', 'materia']]

ES['materia'] = ES['materia'].astype(str)

# initializing the column
ES['palabras_clave'] = ""

for index, row in ES.iterrows():
    materia = row['materia']
    # instantiating Rake with Spanish stopwords; punctuation characters
    # are discarded by default
    r = Rake(language="spanish")

    # extracting the key words from the text
    r.extract_keywords_from_text(materia)
    key_words_dict_scores = r.get_word_degrees()

    # assigning the key words to the palabras_clave column; write through
    # ES.at so the assignment reaches the DataFrame itself
    ES.at[index, 'palabras_clave'] = list(key_words_dict_scores.keys())

ES = ES.drop("materia", axis=1)

ES.set_index('Titulo', inplace=True)
Example #29
                      error_bad_lines=False)

if __name__ == '__main__':

    #read data
    user_data = dataset.iloc[:, 0]
    review_data = dataset.iloc[:, 1]
    #new list to remove stopwords
    review = []
    for data in review_data:
        review.append(data)

    #extracting keywords
    keywords = []
    #creating object for the class
    rake = Rake()

    for data in review:
        extracted_keywords = rake.extract_keywords_from_text(data)
        ranked_phrase_keywords = rake.get_ranked_phrases()
        keywords.append(ranked_phrase_keywords)

    #print(keywords)
    sentiment_result = []
    st = SentimentIntensityAnalyzer()
    # joining the keywords into space-separated strings
    for stmt in keywords:
        words = " ".join(str(e) for e in stmt)
        sentiment_result.append(words)

    print(sentiment_result)
Example #30
    def __init__(self, text):
        self.text = text
        self.rake = Rake()
Example #31
def main():

    primary_ui = PrimaryUI()

    rake = Rake()

    while True:

        event, value = primary_ui.Read()

        if event is None: break

        else:

            if event == PrimaryUI.SUBMIT:

                extraction_type = value[TYPE_SELECTION]

                input_text = re.sub(r'[^A-Za-z0-9\.?!"\' ]', '',
                                    value[INPUT_TEXT].strip())

                if input_text:

                    if extraction_type == PrimaryUI.KEYWORD_COUNT:

                        keyword_count_dict = {}

                        for line in input_text.splitlines():

                            for keyword in line.split(' '):

                                if (keyword.upper() in words
                                        or not keyword.strip()):
                                    continue

                                keyword_count_dict[keyword] = (
                                    keyword_count_dict[keyword] + 1 if keyword
                                    in keyword_count_dict.keys() else 1)

                        output_text = []

                        # Sort the (keyword, count) pairs by count, descending.
                        sorted_counts = [
                            (k, keyword_count_dict[k])
                            for k in sorted(keyword_count_dict,
                                            key=keyword_count_dict.get,
                                            reverse=True)
                        ]

                        for keyword, count in sorted_counts:
                            output_text.append(f'{keyword} : {count}')

                        primary_ui.set_output_text('\n'.join(output_text))

                    elif extraction_type == PrimaryUI.RANKED_PHRASES:

                        rake.extract_keywords_from_text(input_text)

                        primary_ui.set_output_text('\n'.join(
                            rake.get_ranked_phrases()))

                else:
                    primary_ui.display_warning_dialog(
                        "No input text was provided. Please provide Input.")

            elif event == PrimaryUI.CLEAR:
                primary_ui.clear_input_text()
Example #32
def postdata():
    data = request.get_json()
    print(data)

    import pandas as pd
    from rake_nltk import Rake
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.feature_extraction.text import CountVectorizer

    pd.set_option('display.max_columns', 100)
    df = pd.read_csv(
        'https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')
    df.head()

    df = df[['Title', 'Genre', 'Director', 'Actors', 'Plot']]
    df.head()

    #df['Actors'] = df['Actors'].map(lambda x: x.split(',')[:3])
    #df['Genre'] = df['Genre'].map(lambda x: x.lower().split(','))
    #df['Director'] = df['Director'].map(lambda x: x.split(' '))

    #for index, row in df.iterrows():
    #row['Actors'] = [x.lower().replace(' ','') for x in row['Actors']]
    #row['Director'] = ''.join(row['Director']).lower()

    df['Key_words'] = ""
    for index, row in df.iterrows():
        plot = row['Plot']
        r = Rake()
        r.extract_keywords_from_text(plot)
        key_words_dict_scores = r.get_word_degrees()
        # write through df.at so the assignment reaches the DataFrame itself
        df.at[index, 'Key_words'] = list(key_words_dict_scores.keys())
    df.drop(columns=['Plot'], inplace=True)

    df.set_index('Title', inplace=True)

    df.head()

    df['bag_of_words'] = ''
    columns = df.columns
    for index, row in df.iterrows():
        words = ''
        for col in columns:
            if col != 'Director':
                words = words + ' '.join(row[col]) + ' '
            else:
                words = words + row[col] + ' '
        # write through df.at so the assignment reaches the DataFrame itself
        df.at[index, 'bag_of_words'] = words

    df.drop(columns=[col for col in df.columns if col != 'bag_of_words'],
            inplace=True)

    df.head()

    count = CountVectorizer()
    count_matrix = count.fit_transform(df['bag_of_words'])

    indices = pd.Series(df.index)
    indices[:5]

    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    cosine_sim
    recommended_movies = []

    def recommendations(title, cosine_sim=cosine_sim):

        print("You are in the recommendations section")
        idx = indices[indices == title].index[0]
        score_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False)
        top_10_indexes = list(score_series.iloc[1:11].index)
        for i in top_10_indexes:
            recommended_movies.append(list(df.index)[i])
        return recommended_movies

    for key, value in data.items():

        recommendations(value)

    return json.dumps(recommended_movies)
Example #33
#!/usr/bin/python3
# coding: utf-8
# pip install rake-nltk
from rake_nltk import Rake
from nltk import tokenize
r = Rake()  # Uses stopwords for English from NLTK, and all punctuation characters by default
##################################################################
## Extraction given the text.
mytext = '''Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered.
            Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given.
            These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.'''
r.extract_keywords_from_text(mytext)
print(r.get_ranked_phrases())  # To get keyword phrases ranked highest to lowest.
# ['linear diophantine equations', 'minimal generating sets', 'minimal supporting set', 'minimal set', 'upper bounds', 'strict inequations', 'nonstrict inequations', 'mixed types', 'corresponding algorithms', 'considered types', 'types', 'considered', 'algorithms', 'used', 'systems', 'system', 'solving', 'solutions', 'given', 'criteria', 'construction', 'constructing', 'components', 'compatibility']
print(r.get_ranked_phrases_with_scores())  # To get keyword phrases ranked highest to lowest with scores.
# [(9.0, 'linear diophantine equations'), (8.666666666666666, 'minimal generating sets'), (8.166666666666666, 'minimal supporting set'), (5.166666666666666, 'minimal set'), (4.0, 'upper bounds'), (4.0, 'strict inequations'), (4.0, 'nonstrict inequations'), (3.666666666666667, 'mixed types'), (3.5, 'corresponding algorithms'), (3.166666666666667, 'considered types'), (1.6666666666666667, 'types'), (1.5, 'considered'), (1.5, 'algorithms'), (1.0, 'used'), (1.0, 'systems'), (1.0, 'system'), (1.0, 'solving'), (1.0, 'solutions'), (1.0, 'given'), (1.0, 'criteria'), (1.0, 'construction'), (1.0, 'constructing'), (1.0, 'components'), (1.0, 'compatibility')]
##################################################################
## Extraction given the list of strings where each string is a sentence.
r.extract_keywords_from_sentences(tokenize.sent_tokenize(mytext))
print(r.get_ranked_phrases())
# ['linear diophantine equations', 'minimal generating sets', 'minimal supporting set', 'minimal set', 'upper bounds', 'strict inequations', 'nonstrict inequations', 'mixed types', 'corresponding algorithms', 'considered types', 'types', 'considered', 'algorithms', 'used', 'systems', 'system', 'solving', 'solutions', 'given', 'criteria', 'construction', 'constructing', 'components', 'compatibility']
print(r.get_ranked_phrases_with_scores())
# [(9.0, 'linear diophantine equations'), (8.666666666666666, 'minimal generating sets'), (8.166666666666666, 'minimal supporting set'), (5.166666666666666, 'minimal set'), (4.0, 'upper bounds'), (4.0, 'strict inequations'), (4.0, 'nonstrict inequations'), (3.666666666666667, 'mixed types'), (3.5, 'corresponding algorithms'), (3.166666666666667, 'considered types'), (1.6666666666666667, 'types'), (1.5, 'considered'), (1.5, 'algorithms'), (1.0, 'used'), (1.0, 'systems'), (1.0, 'system'), (1.0, 'solving'), (1.0, 'solutions'), (1.0, 'given'), (1.0, 'criteria'), (1.0, 'construction'), (1.0, 'constructing'), (1.0, 'components'), (1.0, 'compatibility')]
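For reference, rake_nltk ships three ranking metrics (Examples #16, #20, and #22 above use them); a minimal comparison sketch over the same mytext:

from rake_nltk import Rake, Metric

for metric in (Metric.DEGREE_TO_FREQUENCY_RATIO,  # default: deg(w) / freq(w)
               Metric.WORD_DEGREE,                # deg(w) only
               Metric.WORD_FREQUENCY):            # freq(w) only
    r = Rake(ranking_metric=metric)
    r.extract_keywords_from_text(mytext)
    print(metric, r.get_ranked_phrases()[:3])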