def generate_by_textrank(input_dir, output_dir):
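    """Run TextRank key-phrase extraction over a fixed range of article JSON
    files and write one enriched JSON record per article to output_dir."""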
    article_id_start, article_id_end = 51158, 51218
    for cur_article_id in range(article_id_start, article_id_end + 1):
        input_file_path = input_dir + '/%s' % (cur_article_id)
        with open(input_file_path) as input_file:
            input_file_json_dict = json.load(input_file)
        # Keep the article body as str; encoding it to bytes would break
        # extract_key_phrases on Python 3.
        text = input_file_json_dict['text']
        key_phrases_with_scores = textrank.extract_key_phrases(text)
        within_doc_entities = [[{
            'start': None,
            'end': None,
            'id': None,
            'article': cur_article_id,
            'text': _[0]
        }] for _ in key_phrases_with_scores]
        entity_scores = [_[1] for _ in key_phrases_with_scores]
        result = {
            'id': input_file_json_dict['id'],
            'date': input_file_json_dict['date'],
            'source': input_file_json_dict['source'],
            'text': input_file_json_dict['text'],
            'title': input_file_json_dict['title'],
            'within_doc_entities': within_doc_entities,
            'entity_scores': entity_scores
        }
        with open(output_dir + '/%s.output' % (cur_article_id), 'w') as output_file:
            json.dump(result, output_file)
def extract_key_phrases(fileString, date, results_tr):
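    """Run TextRank on fileString, strip punctuation and short fragments from
    the phrases, lemmatize them, and append a (date, keyword_string) tuple to
    results_tr."""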

    textRankResult = textrank.extract_key_phrases(fileString)

    #print(ranked)
    #print(values)
    result = []

    for word in textRankResult:
        strippedWord = word.replace("-", " ").replace("_"," ").strip()
        strippedWord = re.sub(r'[^a-zA-Z\s]+', '', strippedWord).strip()


        swords = strippedWord.split(" ")
        if(len(swords) < 2):
            if(len(strippedWord) <= 1):
                continue
            result.append(strippedWord)
        else:
            string = ""
            for sword in swords:
                if(len(sword) <= 1):
                    continue
                string += " " + sword
            if string.strip() != "":
                result.append(string.strip())
    
    # Lower-case and lemmatize the cleaned phrases.
    lemmatizer = WordNetLemmatizer()
    ranked = np.asarray([lemmatizer.lemmatize(x.lower()) for x in result])

    keywordString = " ".join(ranked)
    results_tr.append((date, keywordString))
Example #3
def get_textrank_keywords(text):
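    """Return TextRank key phrases as a list of {'text', 'score'} dicts
    (score is a fixed placeholder of 0)."""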
    results = []
    keywords = textrank.extract_key_phrases(text)

    for word in keywords:
        results.append({ 'text': word.strip(), 'score': 0 })

    return(results)
Example #4
def get_toi_data():
    url3 = input("Enter a URL:")
    # Assumes Python 3: urllib.request stands in for the Python 2-only urllib2.
    request3 = urllib.request.Request(url3)
    result3 = urllib.request.urlopen(request3)
    soup3 = BeautifulSoup(result3.read(), 'html.parser')
    prop_add3 = soup3.find('div', {'class': 'Normal'})
    p3 = prop_add3.text
    phrases3 = textrank.extract_key_phrases(p3)
    summaryV3 = textrank.extract_sentences(p3)
    print("Summary:\n")
    print(summaryV3)
    print("phrases:\n")
    print(phrases3)
Example #5
def get_huffington_data():
    url2 = input("Enter a URL:")
    # Assumes Python 3: urllib.request stands in for the Python 2-only urllib2.
    request2 = urllib.request.Request(url2)
    result2 = urllib.request.urlopen(request2)
    soup2 = BeautifulSoup(result2.read(), 'html.parser')
    prop_add2 = soup2.find("div", {"class": "post-contents yr-entry-text"})
    p2 = prop_add2.text
    phrases2 = textrank.extract_key_phrases(p2)
    summaryV2 = textrank.extract_sentences(p2)
    print("Summary:\n")
    print(summaryV2)
    print("phrases:\n")
    print(phrases2)
Example #6
def get_hindustan_data():
    url = input("Enter a URL:")
    # Assumes Python 3: urllib.request stands in for the Python 2-only urllib2.
    request = urllib.request.Request(url)
    result = urllib.request.urlopen(request)
    soup = BeautifulSoup(result.read(), 'html.parser')
    prop_add = soup.find('div', {'class': 'story-details'})
    p = prop_add.text
    # Save the article text, then extract key phrases and a summary from it.
    with open('t1.txt', 'w') as f:
        f.write("{}".format(p))
    phrases = textrank.extract_key_phrases(p)
    summaryV = textrank.extract_sentences(p)
    print("Summary:\n")
    print(summaryV)
    print("phrases:\n")
    print(phrases)
Example #7
def summary(df):
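    """Summarise each row of df (text in column 0) with TextRank and return a
    DataFrame with 'summaries' and 'keywords' columns."""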

    lSum = []

    print('\nSummarising & Extracting: \n', end='')

    for i in range(0, len(df)):
        print('\r', end='')
        print("Completed: " + str(round((i + 1) / len(df) * 100, 1)) + "%",
              end="",
              flush=True)
        summary = textrank.extract_sentences(df.iloc[i, 0])
        kwords = textrank.extract_key_phrases(df.iloc[i, 0])
        lSum.append([summary, kwords])

    dfSummaries = pd.DataFrame(lSum)
    dfSummaries.columns = ["summaries", "keywords"]

    return dfSummaries
Example #8
def keywords_extraction(article, method, k=20, with_weight=False):
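    """Extract up to k keywords from `article` using the backend selected by
    `method`: 0 = LDA topic terms, 1 = jieba extract_tags, 2 = TextRank,
    3 = RAKE."""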
    doc = ""
    if method == 0:
        model = lda.build_lda_model(article, 1)
        return lda.get_topic(model,
                             num_topics=1,
                             num_words=k,
                             with_weight=with_weight)[0]
    elif method == 1:
        if isinstance(article, str):
            article = [article]
        text_list = text_process.general_processing_file(article)
        for arti in text_list:
            doc += arti
        return jieba.analyse.extract_tags(doc,
                                          topK=k,
                                          withWeight=with_weight,
                                          allowPOS=())
    elif method == 2:
        if isinstance(article, str):
            article = [article]
        article = text_process.general_processing_file(article)
        for arti in article:
            doc += arti
        return textrank.extract_key_phrases(doc)
    elif method == 3:
        if isinstance(article, str):
            article = [article]
        article = text_process.text_processing_rake(article)
        for arti in article:
            doc += arti
        r = Rake()
        r.extract_keywords_from_text(doc)
        rank = r.get_ranked_phrases()
        if not with_weight:
            # Integer division so the slice bound is an int.
            return rank[0:len(rank) // 2 + 1]
        score = r.get_ranked_phrases_with_scores()
        return score[0:len(rank) // 2]
    #docs_phase
    else:
        raise ValueError('wrong method code')
Example #9
def get_email_keywords_by_sender(gmail_object,
                                 sender_email,
                                 count=KEYWORDS_COUNT):
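    """Collect up to `count` unique keywords from the subjects of emails sent
    by sender_email, extracting key phrases per subject with TextRank."""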
    subjects = get_email_subjects_list_by_sender(gmail_object, sender_email,
                                                 SUBJECTS_COUNT)
    keywords = []
    for subject in subjects:
        key_phrases = textrank.extract_key_phrases(subject)
        key_phrases = strip_key_phrases(key_phrases)
        subject_keywords = ', '.join(key_phrases)
        if len(subject_keywords) > 0:
            keywords.append(subject_keywords)

            for chopped_phrase in key_phrases:
                keywords.append(chopped_phrase)

        unique_keywords = remove_duplicate_while_preserving_order(keywords)
        if len(unique_keywords) >= count:
            return unique_keywords[:count]

    return remove_duplicate_while_preserving_order(keywords)
Example #10
def tag_keywords(tree):
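    """Append a <textrank_kws> child to each element, holding the TextRank key
    phrases of its <narrative> text."""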
    print(dir(textrank))
    textrank.setup_environment()

    root = tree.getroot()
    for child in root:
        node = child.find("narrative")
        narr = ""
        narr_keywords = ""
        if node is not None:
            # Keep the narrative as str so the concatenation below works on
            # Python 3.
            narr = node.text.lower()

            keywords = textrank.extract_key_phrases(narr)
            for kw in keywords:
                narr_keywords = narr_keywords + kw + " "

        node = etree.SubElement(child, "textrank_kws")
        node.text = narr_keywords.strip()

    return tree
Example #11
    def _automatic_term_extract(self, page_id, content):
        """Intersect TextRank and RAKE key phrases for the page content and
        return them as nlp.Term objects."""
        page_name = page_id.replace(" ", "_")
        terms_textrank = set(
            textrank.extract_key_phrases(content, self.top_percent))
        logging.debug("textranks: %s" % terms_textrank)
        #with open("textrank.%s.txt" % page_name, "w", encoding="utf-8") as fh:
        #    for term in terms_textrank:
        #        fh.write("%s\n" % term.replace(" ", "-"))
        rake = Rake()
        rake.extract_keywords_from_text(content)
        rake_ranked_phrases = rake.get_ranked_phrases()
        #with open("rake.%s.txt" % page_name, "w", encoding="utf-8") as fh:
        #    for term in rake_ranked_phrases:
        #        fh.write("%s\n" % nlp.Term(term.split(" ")).name())
        terms_rake = set(rake_ranked_phrases[:max(
            int(len(rake_ranked_phrases) * self.top_percent), 1)])
        logging.debug("rakes: %s" % terms_rake)
        intersection_terms = terms_textrank.intersection(terms_rake)
        logging.debug("intersection: %s" % intersection_terms)
        return [
            nlp.Term(t.split(" "))
            for t in filter(lambda term: term, intersection_terms)
        ]
Example #12
for package_name in npm_data_dict.keys():
    readme = npm_data_dict[package_name]['readme']
    readme_rendered = mistune.markdown(readme, escape=False)
    soup = BeautifulSoup(readme_rendered, "html.parser")
    # Replace anchors with content where relevant and extract otherwise
    for link in soup.findAll('a'):
        if link.text.startswith('http'):
            link.extract()
        else:
            link.replaceWithChildren()
    # Remove all the images
    for image in soup.findAll('img'):
        image.extract()
    # Remove all the code blocks
    for code_block in soup.findAll('code'):
        code_block.extract()
    npm_data_dict[package_name]['readme_cleaned'] = clean_stopwords(soup.text)
npm_data.close()

tags = {}
for package_name in npm_data_dict.keys():
    print("Processing: " + package_name)
    tags[package_name] = list(
        textrank.extract_key_phrases(
            npm_data_dict[package_name]["readme_cleaned"]))
    print("tagged: " + package_name)

with open('package_tag_map.json', 'w') as package_tag_map:
    package_tag_map.write(json.dumps(tags))
Example #13
def getPhrases(text):
    # Use a name that does not shadow the built-in `list`.
    phrases = extract_key_phrases(text)
    return phrases
Example #14
def extract_phrases(filename):
    """Print key-phrases to stdout."""
    with open(filename) as f:
        phrases = textrank.extract_key_phrases(f.read())
        print(phrases)
Example #15
from gensim.models import KeyedVectors
from nltk.corpus import wordnet as wn
import textrank

# a = wn.synsets("good", pos="n")
# print (a)
#
# for b in a:
#     print (b.definition())
#
# synonyms = []
# antonyms = []
#
# for syn in a:
#     for l in syn.lemmas():
#         synonyms.append(l.name())
#         if l.antonyms():
#             antonyms.append(l.antonyms()[0].name())
#
# print("Synonyms: ", set(synonyms))
# print("Antonyms: ", set(antonyms))
# file = "/Users/nikhilmalhotra/Downloads/GoogleNews-vectors-negative300.bin"
# model = KeyedVectors.load_word2vec_format(file, binary=True)
#
# result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
# print (result)

print(
    textrank.extract_key_phrases(
        "There are numerous weaknesses with the bag of words model, especially when applied to natural language processing tasks, that graph ranking algorithms such as TextRank are able to address. "
    ))
Example #16
@click.argument('filename')
def extract_phrases(filename):
    """Print key-phrases to stdout."""
    with open(filename) as f:
        phrases = textrank.extract_key_phrases(f.read())
        print(phrases)


your_text = """
What Causes Diabetes?
Your pancreas makes a hormone called insulin. It's what lets your cells turn glucose from the food you eat into energy. People with type 2 diabetes make insulin, but their cells don't use it as well as they should. Doctors call this insulin resistance.

At first, the pancreas makes more insulin to try to get glucose into the cells. But eventually it can't keep up, and the sugar builds up in your blood instead.

Usually a combination of things cause type 2 diabetes, including:
Genes. Scientists have found different bits of DNA that affect how your body makes insulin.

Extra weight. Being overweight or obese can cause insulin resistance, especially if you carry your extra pounds around the middle. Now type 2 diabetes affects kids and teens as well as adults, mainly because of childhood obesity.
Metabolic syndrome. People with insulin resistance often have a group of conditions including high blood glucose, extra fat around the waist, high blood pressure, and high cholesterol and triglycerides.

Too much glucose from your liver. When your blood sugar is low, your liver makes and sends out glucose. After you eat, your blood sugar goes up, and usually the liver will slow down and store its glucose for later. But some people's livers don't. They keep cranking out sugar.

Bad communication between cells. Sometimes cells send the wrong signals or don't pick up messages correctly. When these problems affect how your cells make and use insulin or glucose, a chain reaction can lead to diabetes.

Broken beta cells. If the cells that make the insulin send out the wrong amount of insulin at the wrong time, your blood sugar gets thrown off. High blood glucose can damage these cells, too.
"""

keywords_dict = textrank.extract_key_phrases(your_text)
# print("keywords_dict",keywords_dict)
# {'Diabetes', 'communication', 'hormone', 'middle', 'glucose', 'Metabolic syndrome', 'combination', 'Broken', 'cholesterol', 'group', 'liver', 'extra', 'different', 'pressure', 'Extra weight', 'insulin resistance', 'insulin', 'sugar', 'pancreas', 'DNA', 'amount', 'energy', 'overweight', 'childhood obesity', 'reaction'}
Example #17
def extract_phrases(filename):
    """Print key-phrases to stdout."""
    with open(filename) as f:
        phrases = textrank.extract_key_phrases(f.read())
        print(phrases)
Example #18
def generate_titles(file_name,
                    random=False,
                    use_rake=False,
                    use_summa_text_rank=False,
                    use_text_rank=False):
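    """Build candidate titles for the text in file_name and return them ranked;
    the use_* flags additionally log RAKE, summa, and TextRank output for
    comparison."""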
    logger.info("Opening file")
    text_file = open(file_name)
    logger.info("Reading file")
    raw_text = text_file.read().lower()
    # Remove Unicode characters.
    raw_text = raw_text.decode('unicode_escape').encode('ascii', 'ignore')

    #Convert raw text to word tokens
    logger.info("Tokenizing")
    tokens = nltk.word_tokenize(raw_text.translate(None, string.punctuation))

    #Remove stopwords
    logger.info("Removing stopwords")
    stop_words = set(stopwords.words('english'))
    #NOTE: we need to include some more stopwords, as 'english' doesn't contain some stopwords
    #      related to journal articles (e.g., "et" and "al" in "et al.")
    stop_words.update(ADDITIONAL_STOPWORDS)
    filtered_text = [word for word in tokens if word not in stop_words]

    #Create Corpus object for input text
    logger.info("Creating corpus object")
    input_text = Corpus(raw_text, tokens, filtered_text)
    input_text.stop_words = stop_words

    logger.info("Filtered words to use")
    logger.info("\t %s" % input_text.filtered_tokens[:5])

    #NOTE: stopwords are removed before POS tags assigned, this could
    #      potentially degrade POS tagging performance - may want to
    #      switch this order
    #Demonstrate functions
    logger.info("Getting POS tags")
    input_text.pos_tags = pos_tagger(input_text)
    logger.info("\t %s" % input_text.pos_tags[:5])

    logger.info("Finding all used parts of speech.")
    input_text.used_pos = set([tag[1] for tag in input_text.pos_tags])
    logger.info(input_text.used_pos)

    logger.info("Getting stemmed words")
    input_text.stemmed_words = stem_tokens(input_text)
    logger.info("\t %s" % input_text.stemmed_words[:5])

    # split the stemmed words into ~equal-sized groups
    logger.info("Splitting the stemmed words into groups")
    #logger.info("There are %s words in this group" % len(input_text.stemmed_words))
    num_splits = 2
    input_text.splits = split_tokens(input_text, num_splits)
    #for s in input_text.splits:
    #    logger.info("%s %s\n\n" % (s,len(s)))

    logger.info("Getting word frequency and proximity")
    cutoff = 0.125
    if len(input_text.filtered_tokens) < 250:
        cutoff = 0.35  #33
    input_text.word_freq_proximity = stems_frequency_proximity(
        input_text, cutoff)
    #logger.info("\t %s" % (input_text.word_freq_proximity[u'becom'],))

    logger.info("Mapping filtered words and their stemmed forms")
    input_text.filtered_word_and_bases, input_text.filtered_bases_and_words = stems_and_bases(
        input_text)
    #logger.info("\t %s" % input_text.filtered_word_and_bases[u'becom'])

    logger.info("Mapping POS tags and words")
    input_text.pos_tag_and_words = pos_tags_and_words(input_text)
    #logger.info("\t %s" % input_text.pos_tag_and_words['NNS'][:5])

    logger.info("------ End Processing ------\n\n")

    ##########################

    if use_rake:
        logger.info("------ Begin Rake ------")
        """More information at: https://github.com/fabianvf/python-rake"""

        r = Rake(RAKE.SmartStopList())  #stop_words_list)
        sorted_keywords = r.run(input_text.raw_text)
        logger.info("Sorted keywords: %s" % sorted_keywords[:5])
        logger.info("------ End Rake ------\n\n")

    if use_summa_text_rank:
        logger.info("------ Begin SummaTextRank ------")
        """More information at https://github.com/summanlp/textrank"""
        logger.info("Sentence(s) summary: %s" % summarizer.summarize(raw_text))
        logger.info("Keywords: %s" % keywords.keywords(raw_text))

        logger.info("------ End SummaTextRank ------\n\n")

    if use_text_rank:
        logger.info("------ Begin TextRank ------")
        """More information at https://github.com/davidadamojr/TextRank"""

        logger.info("Sentence(s) summary: %s " %
                    textrank.extract_sentences(raw_text))
        logger.info("Keywords: %s" % textrank.extract_key_phrases(raw_text))

        logger.info("------ End TextRank ------\n\n")

    ##########################

    logger.info("------ Begin Weighting ------")

    logger.info("Calculating word weights")
    input_text.word_weights = get_word_weights(input_text, random)

    logger.info("Printing word weights")
    weight_thresh = -1
    print_words_with_weight_above(weight_thresh, input_text.word_weights,
                                  input_text)

    logger.info("------ End Weighting ------\n\n")

    ##########################

    logger.info("------ Begin Building ------")

    titles = build_titles(input_text)

    logger.info("------ End Building ------\n\n")

    ##########################

    logger.info("Closing file")
    text_file.close()

    ##########################

    logger.info("------ Begin Ranking ------")

    #NOTE: the scores denote the title rankings relative to one another
    #      1 denotes the title with the highest rank and 0 denotes the
    #      title with the lowest rank (determined by a combination of
    #      summed word weights and average word weight)
    titles_ranked = order_titles(titles, input_text)

    logger.info("------ End Ranking ------\n\n")

    ##########################

    return titles_ranked
def __extract_keywords(text):
    keywords = textrank.extract_key_phrases(text)
    return keywords
def extract(taskString):
    keyphrases = textrank.extract_key_phrases(taskString)
    return keyphrases