Beispiel #1
0
def get_phrases(text=''):
    """Return up to five of the highest-ranked RAKE phrases found in ``text``.

    ``text`` may be a string or an iterable of strings; its pieces are
    concatenated with no separator before extraction.
    """
    rake = Rake()
    rake.extract_keywords_from_text(''.join(text))
    # Slicing already copes with fewer than five phrases, so no explicit
    # length check is needed.
    return rake.get_ranked_phrases()[:5]
def phrase(ques):
    """Return the lower-cased words of every ranked RAKE phrase in ``ques``.

    Each phrase is split on whitespace and the words are flattened into a
    single list, keeping the ranking order of the phrases.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(ques)
    return [
        word
        for ranked in extractor.get_ranked_phrases()
        for word in ranked.lower().split()
    ]
Beispiel #3
0
def keywords_of(lang, text, k=5):
    ''' Return a list of at most k keywords from text written in the language lang.

    A keyword is a word of at least four characters that occurs in a RAKE
    phrase of text and that the enchant dictionary for lang accepts.
    Keywords are ordered by the number of phrases they occur in, most
    frequent first.
    '''
    r = Rake()
    dictionary = enchant.Dict(lang)
    r.extract_keywords_from_text(text)
    keywords = Counter()

    for phrase in r.get_ranked_phrases():
        # Test the length first: it is cheaper than a dictionary lookup and
        # keeps empty strings (produced by consecutive spaces) away from
        # dictionary.check().
        word_set = {word for word in phrase.split(' ')
                    if len(word) >= 4 and dictionary.check(word)}
        # A phrase contributes at most one count per distinct word.
        keywords.update(word_set)

    return [key for key, _ in keywords.most_common(k)]
Beispiel #4
0
def get_context_keywords(cumulative_text):
    """Return the ranked RAKE phrases of each text in ``cumulative_text``.

    The result holds one list of phrases per input text, in input order.
    A single ``Rake`` instance is reused for all texts.
    """
    extractor = Rake()

    def ranked_phrases(text):
        # Extraction replaces the extractor's previous result.
        extractor.extract_keywords_from_text(text)
        return extractor.get_ranked_phrases()

    return [ranked_phrases(text) for text in cumulative_text]
Beispiel #5
0
def tag_most_common(x, top_keywords):
    """Return the RAKE phrases of ``x`` that also appear in ``top_keywords``.

    Any phrase containing "data scientist" or "machine learning" is first
    collapsed to that tag. The result is a list of unique phrases in no
    particular order.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(x)
    ranked = extractor.get_ranked_phrases()

    # Collapse phrases around the important tags; when a phrase contains
    # both, the later tag ("machine learning") wins.
    important_tags = ("data scientist", "machine learning")
    for position, candidate in enumerate(ranked):
        for tag in important_tags:
            if tag in candidate:
                ranked[position] = tag

    # Keep only the phrases that are among the most common keywords.
    return list(set(ranked) & set(top_keywords))
    def keywords_rake_nltk(self, texts=None, words=10, **kwargs):
        """Return the first ``words`` ranked phrases extracted with rake_nltk.

        When ``texts`` is None it is taken from ``self.contents(**kwargs)``.
        A list is handled as a list of sentences; anything else is handled
        as one block of text.
        """
        extractor = Rake()
        if texts is None:
            texts = self.contents(**kwargs)

        # Pick the extraction entry point that matches the input shape.
        extract = (extractor.extract_keywords_from_sentences
                   if isinstance(texts, list)
                   else extractor.extract_keywords_from_text)
        extract(texts)

        return extractor.get_ranked_phrases()[:words]
Beispiel #7
0
def getkeywords(text):
    """Return the ranked RAKE phrases of ``text`` as one comma-separated string.

    An empty ``text`` (for example an empty abstract) is skipped and yields
    an empty string.
    """
    if text == "":
        return ""

    extractor = Rake()
    extractor.extract_keywords_from_text(text)
    return ",".join(extractor.get_ranked_phrases())
Beispiel #8
0
def keywordFinder(filename):
    """Count the top five RAKE phrases of ``filename`` into ``keywordDict``.

    Changes the working directory to ``folderPath``, stores the file's text
    in ``fileTexts[filename]`` and adds one to ``keywordDict`` for each of
    the file's five highest-ranked phrases.
    """
    os.chdir(folderPath)
    # The context manager closes the file even when reading fails.
    with open(filename, 'r', errors="replace") as dirFile:
        fileTexts[filename] = dirFile.read()

    r = Rake()
    r.extract_keywords_from_text(fileTexts[filename])

    for word in r.get_ranked_phrases()[:5]:
        keywordDict[word] = keywordDict.get(word, 0) + 1
Beispiel #9
0
 def note_image(self):
     """Return the URL of an Unsplash photo matching the note's top keyword.

     The highest-ranked RAKE phrase (one to three words, ranked by word
     frequency) of ``self.note_content`` is used as the search query.

     Returns ``None`` when the note yields no keywords or the search has no
     results.
     """
     r = Rake(min_length=1,
              max_length=3,
              ranking_metric=Metric.WORD_FREQUENCY)
     r.extract_keywords_from_text(str(self.note_content))
     ranked_phrases = r.get_ranked_phrases()
     if not ranked_phrases:
         return None

     # NOTE(review): the API key is hard-coded; consider loading it from
     # configuration instead of keeping it in source control.
     params = {
         "client_id": "8d1a8c2c11547e593d064b6b389f3d728d7556d09817209e61447ddfb246982c",
         "query": ranked_phrases[0],
         "page": 1,
         # Left empty, exactly as in the previously hand-built URL.
         "per_page": "",
         "orientation": "landscape",
     }
     # Passing the query through ``params`` lets requests URL-encode it;
     # phrases can contain spaces and other reserved characters.
     response = requests.get("https://api.unsplash.com/search/photos/",
                             params=params)
     results = response.json()["results"]
     if not results:
         return None
     return results[0]["urls"]["regular"]
Beispiel #10
0
 def post(self):
     """Return the top four RAKE phrases of the request's query as JSON.

     Reads ``lang`` and ``query`` from the parsed request arguments; a
     truthy ``lang`` selects the Rake language. The response carries an
     ``Access-Control-Allow-Origin: *`` header.
     """
     args = parser.parse_args()
     print(args)
     language = args['lang']
     extractor = Rake(language=language) if language else Rake()
     extractor.extract_keywords_from_text(args['query'])
     top_phrases = extractor.get_ranked_phrases()[:4]
     return Response(json.dumps(top_phrases),
                     headers={'Access-Control-Allow-Origin': '*'},
                     mimetype='application/json')
def get_suggestions(username, password, device_id):
    """Suggest podcasts based on keywords from the user's subscriptions.

    For each subscription, the five highest-ranked RAKE phrases of its
    description are searched for; the first hit of each search is kept
    unless it is already subscribed. The collected hits are sorted with
    ``appropriate_sort`` and returned via ``jsonify_podcast_list``.
    """
    extractor = Rake()
    suggestions = []
    subscriptions = get_client_subs(username, password, device_id)
    for podcast in subscriptions:
        extractor.extract_keywords_from_text(podcast.description)
        for keyword in extractor.get_ranked_phrases()[:5]:
            hits = public_client.search_podcasts(
                urllib.quote(keyword.encode('utf8')))
            if len(hits) == 0:
                continue
            # Only the best hit is considered, and only if it is new.
            if hits[0] not in subscriptions:
                suggestions.append(hits[0])
    return jsonify_podcast_list(appropriate_sort(suggestions, 1))
Beispiel #12
0
def extract_keywords(query, stem=True):
    """Return the unique RAKE phrases and word tokens of ``query`` as a list.

    With ``stem`` set, every entry is passed through the Porter stemmer;
    the stemmed entries are not de-duplicated again.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(query)
    ranked = extractor.get_ranked_phrases()
    tokens = nltk.word_tokenize(query)

    candidates = set(ranked + tokens)

    if not stem:
        return list(candidates)

    stemmer = PorterStemmer()
    return [stemmer.stem(candidate) for candidate in candidates]
def influencetopic(handle, influencer):
    """
    Determine topic similarity between users

    The statuses of both users are fetched and run through RAKE, but the
    ranked phrases are not compared yet; a fixed placeholder string is
    returned.

    :param handle: Handle of user to get.
    :type handle: str
    :param influencer: Handle of user to examine influence from.
    :type influencer: str

    :rtype: str
    """
    for user in (handle, influencer):
        texts = [status.text for status in statuses(user)]
        # Keyword extraction (result unused until the comparison exists)
        extractor = Rake()
        extractor.extract_keywords_from_text(" ".join(texts))
        extractor.get_ranked_phrases()
    #TODO list comparison
    return '["use the individual one for now"]'
def usertopics(handle):
    """
    Gets the topics a user uses as a JSON list of ranked RAKE phrases

    :param handle: Handle of user to get.
    :type handle: str

    :rtype: str
    """
    status_texts = [status.text for status in statuses(handle)]
    # Keyword extraction over all statuses joined into one text
    extractor = Rake()
    extractor.extract_keywords_from_text(" ".join(status_texts))
    ranked = extractor.get_ranked_phrases()
    return json.dumps(ranked)
Beispiel #15
0
def search_keyword(request):
    """Search articles by topic name, falling back to RAKE keyword overlap.

    For a POST request the search title is recorded in the user's history.
    If a topic's name equals the title (case-insensitively) that topic's
    articles are rendered with ``check`` "1". Otherwise the descriptions of
    all articles sharing at least one RAKE phrase with the title are
    rendered with ``check`` "2".

    Returns ``HttpResponse("found")`` when the POST carries no
    ``search_title`` field, and ``HttpResponse("Notfound")`` for non-POST
    requests or when nothing matches.
    """
    if request.method == "POST":
        # .get() yields None for a missing field so the guard below can
        # actually trigger; indexing would raise instead.
        search_title = request.POST.get("search_title")
        if search_title is None:
            return HttpResponse("found")
        user = User.objects.get(pk=UserDataManagement.MainData.EnteredUser.UserEmail)
        history = History()
        history.Date_Time = datetime.datetime.now()
        history.SearchTitle = search_title
        history.UserEmailFK = user
        history.save()

        topics = Topic.objects.all()
        wanted_name = str(search_title).lower()
        for topic in topics:
            if str(topic.TopicName).lower() == wanted_name:
                return render(request, "Search.html", {"articles": topic.article_set.all(),
                                                       "UserName": UserDataManagement.MainData.EnteredUser.UserName,
                                                       "check": "1"})

        # No topic matched by name: compare RAKE phrases instead.
        rake = Rake()
        rake.extract_keywords_from_text(search_title)
        title_phrases = set(rake.get_ranked_phrases())
        matching_descriptions = []
        for topic in topics:
            for article in topic.article_set.all():
                rake.extract_keywords_from_text(article.ArticleDescription)
                # Keep the article when it shares at least one phrase.
                if not title_phrases.isdisjoint(rake.get_ranked_phrases()):
                    matching_descriptions.append(article.ArticleDescription)
        if matching_descriptions:
            return render(request, "Search.html", {"articles": matching_descriptions,
                                                   "UserName": UserDataManagement.MainData.EnteredUser.UserName,
                                                   "check": "2"})

    return HttpResponse("Notfound")
def cleanDescriptionBOW(description: list, nouns_only: bool) -> dict:
    """Build a bag-of-words count from a list of description texts.

    Each text is lower-cased, stripped, and has digits and punctuation
    replaced by spaces. Two kinds of entries are counted over all texts:
    stemmed, stopword-free tokens and RAKE phrases of the cleaned text.
    With ``nouns_only`` set, only tokens tagged "NN" are kept, and phrases
    are replaced by the "NN" words they contain.

    Returns a dict mapping each token or keyword to its count. When the same
    string occurs as both token and keyword, the keyword count is the one
    kept.
    """
    # Loop-invariant helpers are built once instead of once per text; the
    # stopwords become a set so each membership test is constant time.
    stemmer = nltk.stem.porter.PorterStemmer()
    stopwords = set(nltk.corpus.stopwords.words('english'))
    r = Rake()

    document_tokens, document_keywords = [], []
    for text in description:
        # Pre-processing
        text = text.lower()  # to lower case
        text = text.strip()  # strip white space
        text = re.sub(r'\d+', ' ', text)  # remove digits
        text = re.sub(r'[^\w\s]', " ", text)  # remove punctuation
        text = re.sub("[ |\t]{2,}", " ", text)  # collapse runs of spaces/tabs

        # Tokenize
        if nouns_only:
            tokens = [
                word
                for word, tag in nltk.pos_tag(nltk.word_tokenize(text))
                if tag == "NN"
            ]
        else:
            tokens = nltk.word_tokenize(text)

        # Stem first, then remove stopwords (same order as before)
        tokens = [stemmer.stem(token) for token in tokens]
        tokens = [token for token in tokens if token not in stopwords]

        # Extract keywords from the cleaned (unstemmed) text
        r.extract_keywords_from_text(text)
        keywords = r.get_ranked_phrases()
        if nouns_only:
            keywords = [
                word
                for keyword in keywords
                for word, tag in nltk.pos_tag(nltk.word_tokenize(keyword))
                if tag == "NN"
            ]

        document_tokens += tokens
        document_keywords += keywords

    token_count = dict(Counter(document_tokens))
    keyword_count = dict(Counter(document_keywords))

    # NOTE(review): keyword counts replace (not add to) token counts for
    # identical strings - confirm this is intended.
    document_bow = {**token_count, **keyword_count}

    return (document_bow)
Beispiel #17
0
 def get_top_k_docs(self, query):
     """Return display data for the best-matching documents of ``query``.

     The query is preprocessed into words; queries of more than ten words
     are first reduced to the distinct words of their RAKE phrases. The top
     100 documents from ``self.model`` are then filtered: documents with an
     empty title, abstract or author field, or containing none of the query
     words, are dropped. In the remaining abstracts every query word is
     wrapped in ``<b>`` tags (case-insensitive match).

     Returns a dict of parallel lists keyed 'titles', 'abstracts', 'ids',
     'authors', 'links' and 'category'.
     """
     query_words = p.preprocess_query(query)  # preprocessing query
     if len(query_words) > 10:  # long query search
         extractor = Rake(min_length=1, max_length=4)
         extractor.extract_keywords_from_text(query)
         distinct_words = list(
             set(' '.join(extractor.get_ranked_phrases()).split()))
         query_words = p.preprocess_query(' '.join(distinct_words))
     # get top 100 docs
     top_k_docs = self.model.get_top_n(query_words, self.corpus, 100)
     # One case-insensitive pattern per query word, used for highlighting.
     highlighters = {
         qw: re.compile(re.escape(qw), re.IGNORECASE) for qw in query_words
     }
     results = {
         key: []
         for key in ('titles', 'abstracts', 'ids', 'authors', 'links',
                     'category')
     }
     for doc in top_k_docs:
         abstract = doc['abstract'].replace('\n', '')
         title = doc['title'].replace('\n', '')
         authors = doc['authors'].replace('\n', '')
         doc_id = doc['id']
         category = self.cat_data.iloc[doc_id]['pred_category']
         if '' in (abstract, title, authors):
             continue
         abstract = p.remove_punctuations(abstract)
         searchable = ' '.join(
             (title.lower(), abstract.lower(), authors.lower()))
         # Skip documents that mention none of the query words.
         if not any(qw in searchable for qw in query_words):
             continue
         for qw in query_words:
             abstract = highlighters[qw].sub('<b>' + qw + '</b>', abstract)
         results['titles'].append(title.title())
         results['authors'].append(authors)
         results['abstracts'].append(abstract)
         results['ids'].append(doc['id'])
         results['links'].append(doc['link'])
         results['category'].append(category)
     return results
def title(text):
    """
    Extracts keywords from the HTML text and constructs a title.

    Only the top-ranked RAKE phrase of each string is used; from it, nouns
    (tags NN, NNS, NNP, NNPS) longer than two characters become keywords.
    Up to four keywords go into the title; when there are more, four are
    picked at random.

    Parameters:
        text <list> strings contained in tags
    Return:
        title <string> keywords joined by ", " and ended with "."; an empty
        string when no keyword was found
    """
    noun_tags = {"NN", "NNS", "NNP", "NNPS"}
    r = Rake(min_length=4, max_length=14)  # Rake instance
    keywords = []
    for e in text:
        r.extract_keywords_from_text(e)
        result = r.get_ranked_phrases()
        if not result:
            continue
        # Pair the words of the best phrase with their POS tags.
        for word, tag in nltk.pos_tag(nltk.word_tokenize(result[0])):
            # Compare the tag itself; testing membership in the whole
            # (word, tag) pair would also accept a word spelled like a tag.
            if tag in noun_tags and len(word) > 2:
                keywords.append(word)

    if not keywords:
        return ""
    if len(keywords) > 4:
        # Too many candidates: choose four distinct positions at random.
        chosen = random.sample(range(len(keywords)), 4)
        keywords = [keywords[i] for i in chosen]
    # Joining avoids cutting the title short when a keyword repeats the
    # last one.
    return ", ".join(keywords) + "."
Beispiel #19
0
def main2():
    """Write the RAKE phrases of caption.txt as hashtags and print them.

    Reads ``caption.txt`` (ISO-8859-1), extracts its ranked phrases, writes
    each one prefixed with "#" to ``hashesfromcaption.txt`` and prints the
    phrase list.
    """
    # Uses stopwords for english from NLTK, and all punctuation characters by
    # default
    r = Rake()

    # Context managers make sure both files are closed (and the output
    # flushed) even if an error occurs.
    with open("caption.txt", 'r', encoding="iso-8859-1") as sample_file:
        text = sample_file.read()
    r.extract_keywords_from_text(text)
    keywords = r.get_ranked_phrases()

    with open("hashesfromcaption.txt", "w+") as hashtag_file:
        hashtag_file.writelines("#" + keyword for keyword in keywords)

    print("Keywords:", keywords)
def extract_labels(text_list):
    """Return the RAKE phrases of the joined texts for use as model labels.

    The strings in ``text_list`` are joined with spaces into one corpus and
    the ranked phrases extracted from it are returned as a list.

    Ideas not implemented here: rank phrases across a whole dataset of
    reports and use those rankings to select labels from user-supplied
    text, and drop phrases that do not occur across multiple reports (for
    example place names), since they may bias the model.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(" ".join(text_list))
    return extractor.get_ranked_phrases()
Beispiel #21
0
def nearest_neighbor_labelling(sentence):
    """Label ``sentence`` with the known entity nearest to its top keyword.

    The best RAKE phrase of two to four words is taken from the sentence;
    the last word of that phrase is matched against ``known_entities`` with
    ``vectors.most_similar_to_given``.

    Returns ``(phrase, entity_name, lookup_value)``, or
    ``('none', 'none', '9')`` when the sentence yields no phrase.
    """
    extractor = Rake(min_length=2, max_length=4)
    extractor.extract_keywords_from_text(sentence)
    ranked = extractor.get_ranked_phrases()

    if ranked == []:
        return ('none', 'none', '9')

    top_phrase = ranked[0]
    head_word = word_tokenize(top_phrase)[-1]
    nearest_entity = vectors.most_similar_to_given(head_word, known_entities)
    entity_name = entities_to_str[nearest_entity]
    return top_phrase, entity_name, lookup[entity_name]
Beispiel #22
0
def extract_keywords():
    """Fill ``topic_list['keywords']`` with one cleaned keyword phrase per row.

    For every row of ``topic_list`` the top-ranked RAKE phrase of its
    'narratives' text is taken, punctuation is removed, words of one to
    three characters are dropped and the result is stripped. Rows whose
    narrative yields no phrase get an empty string.
    """
    # Built once: pattern for short words and the punctuation-removal table.
    shortword = re.compile(r'\W*\b\w{1,3}\b')
    strip_punctuation = str.maketrans('', '', string.punctuation)
    r = Rake()

    keyword_list = []
    for _, row in topic_list.iterrows():
        r.extract_keywords_from_text(row['narratives'])
        ranked = r.get_ranked_phrases()
        # A narrative without any phrase gets an empty keyword instead of
        # raising IndexError.
        keywords = ranked[0] if ranked else ''
        # remove punctuation that can remain inside the phrase
        keywords = keywords.translate(strip_punctuation)
        # remove short words
        keywords = shortword.sub("", keywords)
        keyword_list.append(keywords.strip())

    # Assign once, after the loop: the column needs one value per row, which
    # the partially filled list inside the loop could not provide.
    topic_list['keywords'] = pd.Series(keyword_list).values
Beispiel #23
0
    def get_keywords(self):
        """Return the valid RAKE phrases (one to three words) of the PDF.

        The text of the PDF at ``self.path`` is reduced to letters and
        whitespace before extraction; phrases rejected by ``self.is_valid``
        are dropped.
        """
        extractor = Rake(min_length=1, max_length=3)

        # get text from pdf
        raw_text = pdf.extract_text(self.path)

        # cleaning extracted text: keep letters and whitespace only
        cleaned = "".join(
            ch for ch in raw_text if ch.isalpha() or ch.isspace())

        extractor.extract_keywords_from_text(cleaned)
        return [
            candidate
            for candidate in extractor.get_ranked_phrases()
            if self.is_valid(candidate)
        ]
Beispiel #24
0
 def generate_keywords(self):
     """
     Generate a list of keywords for the stored question

     The question is read from ``self.quesiton``; a hint is printed when no
     keyword could be extracted.
     Return: a list of string
     """
     r = Rake()
     r.extract_keywords_from_text(self.quesiton)
     keywords = r.get_ranked_phrases()
     if not keywords:
         print('No keywords generated. Please rephrase your question.')
     return keywords
def rake_extract(rake_out):
  """Append the top RAKE keyphrases of every document to ``rake_out``.

  For each id in ``docid`` the document contents are fetched, up to 60 of
  the highest-ranked phrases (2 to 10 words long) are joined with spaces,
  printed, and appended to ``rake_out``.

  Returns ``rake_out`` (the same list that was passed in).
  """
  # Uses stopwords for english from NLTK, and all punctuation characters.
  r = Rake(min_length=2, max_length=10)
  for i, doc_id in enumerate(docid):
    doc = index_utils.doc_contents(doc_id)
    print("=== %d ===" % i)
    # find keyphrase extraction
    r.extract_keywords_from_text(doc)
    # convert to string
    str_rake = " ".join(r.get_ranked_phrases()[:60])
    print(str_rake)
    # save in list, one entry per docid
    rake_out.append(str_rake)
  # Return only after every document was processed; returning inside the
  # loop stopped after the first one.
  return rake_out
Beispiel #26
0
def extract_intent_summary_sanitized(row):
    """Return up to four RAKE phrases of the row's sanitized summary.

    Phrases are two to seven words long and ranked by word degree. Rows
    whose 'activity' is one of the excluded ids yield ``None``.
    """
    excluded_activities = ('1420820', '1554108', '1662813', '80445')
    summary_text = row['model_gensim_summary_sanitized']
    if row['activity'] in excluded_activities:
        return None

    extractor = Rake(min_length=2, max_length=7,
                     ranking_metric=Metric.WORD_DEGREE)
    extractor.extract_keywords_from_text(summary_text)
    return extractor.get_ranked_phrases()[:4]
Beispiel #27
0
def get_keywords(tweets):
    """Extract the top keywords of ``tweets`` with per-keyword sentiment counts.

    Returns a tuple ``(keywords, positive, negative, pos, neg)``:
    ``keywords`` holds up to five RAKE phrases (at most two words, ranked by
    word frequency) of all tweets joined together; ``positive[i]`` and
    ``negative[i]`` count the tweets containing ``keywords[i]`` whose
    polarity is >= 0 and < 0 respectively; ``pos`` and ``neg`` are the same
    counts over all tweets.
    """
    pos = 0
    neg = 0
    for tweet in tweets:
        # Neutral tweets (polarity 0) are counted as positive.
        if TextBlob(tweet).sentiment.polarity < 0:
            neg += 1
        else:
            pos += 1

    r = Rake(ranking_metric=Metric.WORD_FREQUENCY, max_length=2)
    r.extract_keywords_from_text(" ".join(tweets))
    # Trim backslashes, spaces and typographic apostrophes from both ends.
    # The backslash is escaped explicitly; '\ ' is an invalid escape.
    keywords = [phrase.strip('\\ ’') for phrase in r.get_ranked_phrases()[:5]]

    positive = []
    negative = []
    for keyword in keywords:
        pos1 = 0
        neg1 = 0
        for tweet in tweets:
            lowered = tweet.lower()
            if keyword in lowered:
                polarity = TextBlob(lowered).sentiment.polarity
                if polarity >= 0:
                    pos1 += 1
                elif polarity < 0:
                    neg1 += 1
        positive.append(pos1)
        negative.append(neg1)

    return keywords, positive, negative, pos, neg
Beispiel #28
0
def tag_extraction(x):
    """Return the ranked RAKE phrases of ``x`` with important tags collapsed.

    A null ``x`` yields an empty list. Any phrase containing
    "data scientist" or "machine learning" is replaced by that tag.
    """
    if pd.isnull(x):
        return []

    extractor = Rake()
    extractor.extract_keywords_from_text(x)
    ranked = extractor.get_ranked_phrases()

    # Collapse phrases around the important tags; when a phrase contains
    # both, the later tag ("machine learning") wins.
    important_tags = ("data scientist", "machine learning")
    for position, candidate in enumerate(ranked):
        for tag in important_tags:
            if tag in candidate:
                ranked[position] = tag

    return ranked
Beispiel #29
0
def rake_keyword(doc):
    """
    Extract keywords from the given text using RAKE.

    Args:
        doc: Paragraph from which keywords are to be extracted.

    Returns:
        The ranked keyword phrases extracted from ``doc``.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(doc)
    return extractor.get_ranked_phrases()
Beispiel #30
0
def keyword_extract(pdf_name, below=" ", above=" ", pageNo=0):
    """Return scored RAKE phrases from (part of) one page of a PDF.

    The page text comes from ``convert_pdf_to_txt(path + pdf_name, pageNo)``.
    ``below`` and ``above`` are optional marker strings; a single space
    means "not given". With ``below``, only the text after its first
    occurrence (up to the next occurrence) is used; with ``above``, only
    the text before its first occurrence. With both, ``above`` is applied
    to the text already selected by ``below``. A marker that does not occur
    is ignored.

    Returns the list produced by ``get_ranked_phrases_with_scores`` (word
    frequency metric).
    """
    page = convert_pdf_to_txt(path + pdf_name, pageNo)

    # Start from the whole page so the text is always defined, even when a
    # marker is given but missing from the page.
    relevant_text = page
    # " " is the "no marker" sentinel and must not be used for splitting.
    if below != " " and below in relevant_text:
        relevant_text = relevant_text.split(below)[1]
    # NOTE(review): `above` is now applied after `below`, selecting the text
    # between the markers - confirm this matches callers' expectations.
    if above != " " and above in relevant_text:
        relevant_text = relevant_text.split(above)[0]

    r = Rake(ranking_metric=Metric.WORD_FREQUENCY)
    r.extract_keywords_from_text(relevant_text)
    return r.get_ranked_phrases_with_scores()
Beispiel #31
0
def extract_kwords(tweet_object):
    """Extract keywords from a tweet with the RAKE algorithm.

    Inputs
    ------
    Tweepy tweet object; its ``full_text`` attribute is analysed.
    Returns
    -------
    List of ranked keyword phrases.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(tweet_object.full_text)
    return extractor.get_ranked_phrases()
Beispiel #32
0
    def get_category_from_categories(self, phrases_list):
        """Return RAKE phrases extracted from the given category phrases.

        Every phrase is split into words; English stopwords and the words
        'articles', 'links' and 'containing' are dropped. The remaining
        words, with a '.' appended after each phrase, are turned into one
        text by ``list_to_string`` and run through RAKE.
        """
        # Build the exclusion set once; loading the stopword list again for
        # every single word is needlessly slow.
        excluded = {'articles', 'links', 'containing'}
        excluded.update(stopwords.words("english"))

        lst = []
        for phrase in phrases_list:
            lst.extend(word for word in phrase.split()
                       if word not in excluded)
            lst.append('.')  # marks the end of each phrase

        rec = Rake()
        rec.extract_keywords_from_text(list_to_string(lst))
        return rec.get_ranked_phrases()
Beispiel #33
0
#!/usr/bin/python3
# coding: utf-8
# pip install rake-nltk
from rake_nltk import Rake
from nltk import tokenize
# Demo: run RAKE keyword extraction on a sample abstract, first from the raw
# text and then from the same text pre-split into sentences, printing the
# ranked phrases with and without scores each time.
r = Rake()  # Uses stopwords for English from NLTK, and all punctuation characters by default
##################################################################
## Extraction given the text.
mytext = '''Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered.
            Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given.
            These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.'''
r.extract_keywords_from_text(mytext)
print(r.get_ranked_phrases())  # To get keyword phrases ranked highest to lowest.
# ['linear diophantine equations', 'minimal generating sets', 'minimal supporting set', 'minimal set', 'upper bounds', 'strict inequations', 'nonstrict inequations', 'mixed types', 'corresponding algorithms', 'considered types', 'types', 'considered', 'algorithms', 'used', 'systems', 'system', 'solving', 'solutions', 'given', 'criteria', 'construction', 'constructing', 'components', 'compatibility']
print(r.get_ranked_phrases_with_scores())  # To get keyword phrases ranked highest to lowest with scores.
# [(9.0, 'linear diophantine equations'), (8.666666666666666, 'minimal generating sets'), (8.166666666666666, 'minimal supporting set'), (5.166666666666666, 'minimal set'), (4.0, 'upper bounds'), (4.0, 'strict inequations'), (4.0, 'nonstrict inequations'), (3.666666666666667, 'mixed types'), (3.5, 'corresponding algorithms'), (3.166666666666667, 'considered types'), (1.6666666666666667, 'types'), (1.5, 'considered'), (1.5, 'algorithms'), (1.0, 'used'), (1.0, 'systems'), (1.0, 'system'), (1.0, 'solving'), (1.0, 'solutions'), (1.0, 'given'), (1.0, 'criteria'), (1.0, 'construction'), (1.0, 'constructing'), (1.0, 'components'), (1.0, 'compatibility')]
##################################################################
## Extraction given the list of strings where each string is a sentence.
r.extract_keywords_from_sentences(tokenize.sent_tokenize(mytext))
print(r.get_ranked_phrases())
# ['linear diophantine equations', 'minimal generating sets', 'minimal supporting set', 'minimal set', 'upper bounds', 'strict inequations', 'nonstrict inequations', 'mixed types', 'corresponding algorithms', 'considered types', 'types', 'considered', 'algorithms', 'used', 'systems', 'system', 'solving', 'solutions', 'given', 'criteria', 'construction', 'constructing', 'components', 'compatibility']
print(r.get_ranked_phrases_with_scores())
# [(9.0, 'linear diophantine equations'), (8.666666666666666, 'minimal generating sets'), (8.166666666666666, 'minimal supporting set'), (5.166666666666666, 'minimal set'), (4.0, 'upper bounds'), (4.0, 'strict inequations'), (4.0, 'nonstrict inequations'), (3.666666666666667, 'mixed types'), (3.5, 'corresponding algorithms'), (3.166666666666667, 'considered types'), (1.6666666666666667, 'types'), (1.5, 'considered'), (1.5, 'algorithms'), (1.0, 'used'), (1.0, 'systems'), (1.0, 'system'), (1.0, 'solving'), (1.0, 'solutions'), (1.0, 'given'), (1.0, 'criteria'), (1.0, 'construction'), (1.0, 'constructing'), (1.0, 'components'), (1.0, 'compatibility')]