def generate_by_textrank(input_dir, output_dir):
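    """Run TextRank key-phrase extraction over a fixed range of article JSON
    files and write one enriched JSON record per article to output_dir."""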
    article_id_start, article_id_end = 51158, 51218
    for cur_article_id in range(article_id_start, article_id_end + 1):
        input_file_path = input_dir + '/%s' % (cur_article_id)
        with open(input_file_path) as input_file:
            input_file_json_dict = json.load(input_file)
        # Keep the article body as str; encoding it to bytes would break
        # extract_key_phrases on Python 3.
        text = input_file_json_dict['text']
        key_phrases_with_scores = textrank.extract_key_phrases(text)
        within_doc_entities = [[{
            'start': None,
            'end': None,
            'id': None,
            'article': cur_article_id,
            'text': _[0]
        }] for _ in key_phrases_with_scores]
        entity_scores = [_[1] for _ in key_phrases_with_scores]
        result = {
            'id': input_file_json_dict['id'],
            'date': input_file_json_dict['date'],
            'source': input_file_json_dict['source'],
            'text': input_file_json_dict['text'],
            'title': input_file_json_dict['title'],
            'within_doc_entities': within_doc_entities,
            'entity_scores': entity_scores
        }
        with open(output_dir + '/%s.output' % (cur_article_id), 'w') as output_file:
            json.dump(result, output_file)
def extract_key_phrases(fileString, date, results_tr):
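    """Run TextRank on fileString, strip punctuation and short fragments from
    the phrases, lemmatize them, and append a (date, keyword_string) tuple to
    results_tr."""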

    textRankResult = textrank.extract_key_phrases(fileString)

    #print(ranked)
    #print(values)
    result = []

    for word in textRankResult:
        strippedWord = word.replace("-", " ").replace("_"," ").strip()
        strippedWord = re.sub(r'[^a-zA-Z\s]+', '', strippedWord).strip()


        swords = strippedWord.split(" ")
        if(len(swords) < 2):
            if(len(strippedWord) <= 1):
                continue
            result.append(strippedWord)
        else:
            string = ""
            for sword in swords:
                if(len(sword) <= 1):
                    continue
                string += " " + sword
            if string.strip() != "":
                result.append(string.strip())
    
    # Lower-case and lemmatize the cleaned phrases.
    lemmatizer = WordNetLemmatizer()
    ranked = np.asarray([lemmatizer.lemmatize(x.lower()) for x in result])

    keywordString = " ".join(ranked)
    results_tr.append((date, keywordString))
Example #3
def get_textrank_keywords(text):
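    """Return TextRank key phrases as a list of {'text', 'score'} dicts
    (score is a fixed placeholder of 0)."""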
    results = []
    keywords = textrank.extract_key_phrases(text)

    for word in keywords:
        results.append({ 'text': word.strip(), 'score': 0 })

    return(results)
Example #4
def get_toi_data():
    url3 = input("Enter a URL:")
    # Assumes Python 3: urllib.request stands in for the Python 2-only urllib2.
    request3 = urllib.request.Request(url3)
    result3 = urllib.request.urlopen(request3)
    soup3 = BeautifulSoup(result3.read(), 'html.parser')
    prop_add3 = soup3.find('div', {'class': 'Normal'})
    p3 = prop_add3.text
    phrases3 = textrank.extract_key_phrases(p3)
    summaryV3 = textrank.extract_sentences(p3)
    print("Summary:\n")
    print(summaryV3)
    print("phrases:\n")
    print(phrases3)
Example #5
def get_huffington_data():
    url2 = input("Enter a URL:")
    # Assumes Python 3: urllib.request stands in for the Python 2-only urllib2.
    request2 = urllib.request.Request(url2)
    result2 = urllib.request.urlopen(request2)
    soup2 = BeautifulSoup(result2.read(), 'html.parser')
    prop_add2 = soup2.find("div", {"class": "post-contents yr-entry-text"})
    p2 = prop_add2.text
    phrases2 = textrank.extract_key_phrases(p2)
    summaryV2 = textrank.extract_sentences(p2)
    print("Summary:\n")
    print(summaryV2)
    print("phrases:\n")
    print(phrases2)
Example #6
def get_hindustan_data():
    url = input("Enter a URL:")
    # Assumes Python 3: urllib.request stands in for the Python 2-only urllib2.
    request = urllib.request.Request(url)
    result = urllib.request.urlopen(request)
    soup = BeautifulSoup(result.read(), 'html.parser')
    prop_add = soup.find('div', {'class': 'story-details'})
    p = prop_add.text
    # Save the article text, then extract key phrases and a summary from it.
    with open('t1.txt', 'w') as f:
        f.write("{}".format(p))
    phrases = textrank.extract_key_phrases(p)
    summaryV = textrank.extract_sentences(p)
    print("Summary:\n")
    print(summaryV)
    print("phrases:\n")
    print(phrases)
Example #7
def summary(df):
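    """Summarise each row of df (text in column 0) with TextRank and return a
    DataFrame with 'summaries' and 'keywords' columns."""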

    lSum = []

    print('\nSummarising & Extracting: \n', end='')

    for i in range(0, len(df)):
        print('\r', end='')
        print("Completed: " + str(round((i + 1) / len(df) * 100, 1)) + "%",
              end="",
              flush=True)
        summary = textrank.extract_sentences(df.iloc[i, 0])
        kwords = textrank.extract_key_phrases(df.iloc[i, 0])
        lSum.append([summary, kwords])

    dfSummaries = pd.DataFrame(lSum)
    dfSummaries.columns = ["summaries", "keywords"]

    return dfSummaries
Example #8
def keywords_extraction(article, method, k=20, with_weight=False):
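    """Extract up to k keywords from `article` using the backend selected by
    `method`: 0 = LDA topic terms, 1 = jieba extract_tags, 2 = TextRank,
    3 = RAKE."""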
    doc = ""
    if method == 0:
        model = lda.build_lda_model(article, 1)
        return lda.get_topic(model,
                             num_topics=1,
                             num_words=k,
                             with_weight=with_weight)[0]
    elif method == 1:
        if isinstance(article, str):
            article = [article]
        text_list = text_process.general_processing_file(article)
        for arti in text_list:
            doc += arti
        return jieba.analyse.extract_tags(doc,
                                          topK=k,
                                          withWeight=with_weight,
                                          allowPOS=())
    elif method == 2:
        if isinstance(article, str):
            article = [article]
        article = text_process.general_processing_file(article)
        for arti in article:
            doc += arti
        return textrank.extract_key_phrases(doc)
    elif method == 3:
        if isinstance(article, str):
            article = [article]
        article = text_process.text_processing_rake(article)
        for arti in article:
            doc += arti
        r = Rake()
        r.extract_keywords_from_text(doc)
        rank = r.get_ranked_phrases()
        if not with_weight:
            # Integer division so the slice bound is an int.
            return rank[0:len(rank) // 2 + 1]
        score = r.get_ranked_phrases_with_scores()
        return score[0:len(rank) // 2]
    #docs_phase
    else:
        raise ValueError('wrong method code')
Example #9
def get_email_keywords_by_sender(gmail_object,
                                 sender_email,
                                 count=KEYWORDS_COUNT):
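    """Collect up to `count` unique keywords from the subjects of emails sent
    by sender_email, extracting key phrases per subject with TextRank."""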
    subjects = get_email_subjects_list_by_sender(gmail_object, sender_email,
                                                 SUBJECTS_COUNT)
    keywords = []
    for subject in subjects:
        key_phrases = textrank.extract_key_phrases(subject)
        key_phrases = strip_key_phrases(key_phrases)
        subject_keywords = ', '.join(key_phrases)
        if len(subject_keywords) > 0:
            keywords.append(subject_keywords)

            for chopped_phrase in key_phrases:
                keywords.append(chopped_phrase)

        unique_keywords = remove_duplicate_while_preserving_order(keywords)
        if len(unique_keywords) >= count:
            return unique_keywords[:count]

    return remove_duplicate_while_preserving_order(keywords)
Example #10
def tag_keywords(tree):
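    """Append a <textrank_kws> child to each element, holding the TextRank key
    phrases of its <narrative> text."""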
    print(dir(textrank))
    textrank.setup_environment()

    root = tree.getroot()
    for child in root:
        node = child.find("narrative")
        narr = ""
        narr_keywords = ""
        if node is not None:
            # Keep the narrative as str so the concatenation below works on
            # Python 3.
            narr = node.text.lower()

            keywords = textrank.extract_key_phrases(narr)
            for kw in keywords:
                narr_keywords = narr_keywords + kw + " "

        node = etree.SubElement(child, "textrank_kws")
        node.text = narr_keywords.strip()

    return tree
Example #11
    def _automatic_term_extract(self, page_id, content):
        """Intersect TextRank and RAKE key phrases for the page content and
        return them as nlp.Term objects."""
        page_name = page_id.replace(" ", "_")
        terms_textrank = set(
            textrank.extract_key_phrases(content, self.top_percent))
        logging.debug("textranks: %s" % terms_textrank)
        #with open("textrank.%s.txt" % page_name, "w", encoding="utf-8") as fh:
        #    for term in terms_textrank:
        #        fh.write("%s\n" % term.replace(" ", "-"))
        rake = Rake()
        rake.extract_keywords_from_text(content)
        rake_ranked_phrases = rake.get_ranked_phrases()
        #with open("rake.%s.txt" % page_name, "w", encoding="utf-8") as fh:
        #    for term in rake_ranked_phrases:
        #        fh.write("%s\n" % nlp.Term(term.split(" ")).name())
        terms_rake = set(rake_ranked_phrases[:max(
            int(len(rake_ranked_phrases) * self.top_percent), 1)])
        logging.debug("rakes: %s" % terms_rake)
        intersection_terms = terms_textrank.intersection(terms_rake)
        logging.debug("intersection: %s" % intersection_terms)
        return [
            nlp.Term(t.split(" "))
            for t in filter(lambda term: term, intersection_terms)
        ]
Example #12
for package_name in npm_data_dict.keys():
    readme = npm_data_dict[package_name]['readme']
    readme_rendered = mistune.markdown(readme, escape=False)
    soup = BeautifulSoup(readme_rendered, "html.parser")
    # Replace anchors with content where relevant and extract otherwise
    for link in soup.findAll('a'):
        if link.text.startswith('http'):
            link.extract()
        else:
            link.replaceWithChildren()
    # Remove all the images
    for image in soup.findAll('img'):
        image.extract()
    # Remove all the code blocks
    for code_block in soup.findAll('code'):
        code_block.extract()
    npm_data_dict[package_name]['readme_cleaned'] = clean_stopwords(soup.text)
npm_data.close()

tags = {}
for package_name in npm_data_dict.keys():
    print("Processing: " + package_name)
    tags[package_name] = list(
        textrank.extract_key_phrases(
            npm_data_dict[package_name]["readme_cleaned"]))
    print("tagged: " + package_name)

with open('package_tag_map.json', 'w') as package_tag_map:
    package_tag_map.write(json.dumps(tags))
Example #13
def getPhrases(text):
    # Use a name that does not shadow the built-in `list`.
    phrases = extract_key_phrases(text)
    return phrases
Example #14
def extract_phrases(filename):
    """Print key-phrases to stdout."""
    with open(filename) as f:
        phrases = textrank.extract_key_phrases(f.read())
        print(phrases)
Example #15
from gensim.models import KeyedVectors
from nltk.corpus import wordnet as wn
import textrank

# a = wn.synsets("good", pos="n")
# print (a)
#
# for b in a:
#     print (b.definition())
#
# synonyms = []
# antonyms = []
#
# for syn in a:
#     for l in syn.lemmas():
#         synonyms.append(l.name())
#         if l.antonyms():
#             antonyms.append(l.antonyms()[0].name())
#
# print("Synonyms: ", set(synonyms))
# print("Antonyms: ", set(antonyms))
# file = "/Users/nikhilmalhotra/Downloads/GoogleNews-vectors-negative300.bin"
# model = KeyedVectors.load_word2vec_format(file, binary=True)
#
# result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
# print (result)

print(
    textrank.extract_key_phrases(
        "There are numerous weaknesses with the bag of words model, especially when applied to natural language processing tasks, that graph ranking algorithms such as TextRank are able to address. "
    ))
Example #16
@click.argument('filename')
def extract_phrases(filename):
    """Print key-phrases to stdout."""
    with open(filename) as f:
        phrases = textrank.extract_key_phrases(f.read())
        print(phrases)


your_text = """
What Causes Diabetes?
Your pancreas makes a hormone called insulin. It's what lets your cells turn glucose from the food you eat into energy. People with type 2 diabetes make insulin, but their cells don't use it as well as they should. Doctors call this insulin resistance.

At first, the pancreas makes more insulin to try to get glucose into the cells. But eventually it can't keep up, and the sugar builds up in your blood instead.

Usually a combination of things cause type 2 diabetes, including:
Genes. Scientists have found different bits of DNA that affect how your body makes insulin.

Extra weight. Being overweight or obese can cause insulin resistance, especially if you carry your extra pounds around the middle. Now type 2 diabetes affects kids and teens as well as adults, mainly because of childhood obesity.
Metabolic syndrome. People with insulin resistance often have a group of conditions including high blood glucose, extra fat around the waist, high blood pressure, and high cholesterol and triglycerides.

Too much glucose from your liver. When your blood sugar is low, your liver makes and sends out glucose. After you eat, your blood sugar goes up, and usually the liver will slow down and store its glucose for later. But some people's livers don't. They keep cranking out sugar.

Bad communication between cells. Sometimes cells send the wrong signals or don't pick up messages correctly. When these problems affect how your cells make and use insulin or glucose, a chain reaction can lead to diabetes.

Broken beta cells. If the cells that make the insulin send out the wrong amount of insulin at the wrong time, your blood sugar gets thrown off. High blood glucose can damage these cells, too.
"""

keywords_dict = textrank.extract_key_phrases(your_text)
# print("keywords_dict",keywords_dict)
# {'Diabetes', 'communication', 'hormone', 'middle', 'glucose', 'Metabolic syndrome', 'combination', 'Broken', 'cholesterol', 'group', 'liver', 'extra', 'different', 'pressure', 'Extra weight', 'insulin resistance', 'insulin', 'sugar', 'pancreas', 'DNA', 'amount', 'energy', 'overweight', 'childhood obesity', 'reaction'}
Example #17
def extract_phrases(filename):
    """Print key-phrases to stdout."""
    with open(filename) as f:
        phrases = textrank.extract_key_phrases(f.read())
        print(phrases)
Example #18
def generate_titles(file_name,
                    random=False,
                    use_rake=False,
                    use_summa_text_rank=False,
                    use_text_rank=False):
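    """Build candidate titles for the text in file_name and return them ranked;
    the use_* flags additionally log RAKE, summa, and TextRank output for
    comparison."""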
    logger.info("Opening file")
    text_file = open(file_name)
    logger.info("Reading file")
    raw_text = text_file.read().lower()
    # Remove Unicode characters.
    raw_text = raw_text.decode('unicode_escape').encode('ascii', 'ignore')

    #Convert raw text to word tokens
    logger.info("Tokenizing")
    tokens = nltk.word_tokenize(raw_text.translate(None, string.punctuation))

    #Remove stopwords
    logger.info("Removing stopwords")
    stop_words = set(stopwords.words('english'))
    #NOTE: we need to include some more stopwords, as 'english' doesn't contain some stopwords
    #      related to journal articles (e.g., "et" and "al" in "et al.")
    stop_words.update(ADDITIONAL_STOPWORDS)
    filtered_text = [word for word in tokens if word not in stop_words]

    #Create Corpus object for input text
    logger.info("Creating corpus object")
    input_text = Corpus(raw_text, tokens, filtered_text)
    input_text.stop_words = stop_words

    logger.info("Filtered words to use")
    logger.info("\t %s" % input_text.filtered_tokens[:5])

    #NOTE: stopwords are removed before POS tags assigned, this could
    #      potentially degrade POS tagging performance - may want to
    #      switch this order
    #Demonstrate functions
    logger.info("Getting POS tags")
    input_text.pos_tags = pos_tagger(input_text)
    logger.info("\t %s" % input_text.pos_tags[:5])

    logger.info("Finding all used parts of speech.")
    input_text.used_pos = set([tag[1] for tag in input_text.pos_tags])
    logger.info(input_text.used_pos)

    logger.info("Getting stemmed words")
    input_text.stemmed_words = stem_tokens(input_text)
    logger.info("\t %s" % input_text.stemmed_words[:5])

    # split the stemmed words into ~equal-sized groups
    logger.info("Splitting the stemmed words into groups")
    #logger.info("There are %s words in this group" % len(input_text.stemmed_words))
    num_splits = 2
    input_text.splits = split_tokens(input_text, num_splits)
    #for s in input_text.splits:
    #    logger.info("%s %s\n\n" % (s,len(s)))

    logger.info("Getting word frequency and proximity")
    cutoff = 0.125
    if len(input_text.filtered_tokens) < 250:
        cutoff = 0.35  #33
    input_text.word_freq_proximity = stems_frequency_proximity(
        input_text, cutoff)
    #logger.info("\t %s" % (input_text.word_freq_proximity[u'becom'],))

    logger.info("Mapping filtered words and their stemmed forms")
    input_text.filtered_word_and_bases, input_text.filtered_bases_and_words = stems_and_bases(
        input_text)
    #logger.info("\t %s" % input_text.filtered_word_and_bases[u'becom'])

    logger.info("Mapping POS tags and words")
    input_text.pos_tag_and_words = pos_tags_and_words(input_text)
    #logger.info("\t %s" % input_text.pos_tag_and_words['NNS'][:5])

    logger.info("------ End Processing ------\n\n")

    ##########################

    if use_rake:
        logger.info("------ Begin Rake ------")
        """More information at: https://github.com/fabianvf/python-rake"""

        r = Rake(RAKE.SmartStopList())  #stop_words_list)
        sorted_keywords = r.run(input_text.raw_text)
        logger.info("Sorted keywords: %s" % sorted_keywords[:5])
        logger.info("------ End Rake ------\n\n")

    if use_summa_text_rank:
        logger.info("------ Begin SummaTextRank ------")
        """More information at https://github.com/summanlp/textrank"""
        logger.info("Sentence(s) summary: %s" % summarizer.summarize(raw_text))
        logger.info("Keywords: %s" % keywords.keywords(raw_text))

        logger.info("------ End SummaTextRank ------\n\n")

    if use_text_rank:
        logger.info("------ Begin TextRank ------")
        """More information at https://github.com/davidadamojr/TextRank"""

        logger.info("Sentence(s) summary: %s " %
                    textrank.extract_sentences(raw_text))
        logger.info("Keywords: %s" % textrank.extract_key_phrases(raw_text))

        logger.info("------ End TextRank ------\n\n")

    ##########################

    logger.info("------ Begin Weighting ------")

    logger.info("Calculating word weights")
    input_text.word_weights = get_word_weights(input_text, random)

    logger.info("Printing word weights")
    weight_thresh = -1
    print_words_with_weight_above(weight_thresh, input_text.word_weights,
                                  input_text)

    logger.info("------ End Weighting ------\n\n")

    ##########################

    logger.info("------ Begin Building ------")

    titles = build_titles(input_text)

    logger.info("------ End Building ------\n\n")

    ##########################

    logger.info("Closing file")
    text_file.close()

    ##########################

    logger.info("------ Begin Ranking ------")

    #NOTE: the scores denote the title rankings relative to one another
    #      1 denotes the title with the highest rank and 0 denotes the
    #      title with the lowest rank (determined by a combination of
    #      summed word weights and average word weight)
    titles_ranked = order_titles(titles, input_text)

    logger.info("------ End Ranking ------\n\n")

    ##########################

    return titles_ranked
def __extract_keywords(text):
    keywords = textrank.extract_key_phrases(text)
    return keywords
def extract(taskString):
    keyphrases = textrank.extract_key_phrases(taskString)
    return keyphrases