def extract_summary(filename):
    """Print summary text to stdout."""

    with open(filename) as f:
        summary = textrank.extract_sentences(f.read())
        print(summary)
        print("U R HRRE MAIN")
Example #2
def get_huffington_data():
    url2 = input("Enter a URL: ")
    request2 = urllib.request.Request(url2)
    result2 = urllib.request.urlopen(request2)
    soup2 = BeautifulSoup(result2.read(), 'html.parser')
    # Grab the article body from the Huffington Post page layout.
    prop_add2 = soup2.find("div", {"class": "post-contents yr-entry-text"})
    p2 = prop_add2.text
    phrases2 = textrank.extract_key_phrases(p2)
    summaryV2 = textrank.extract_sentences(p2)
    print("Summary:\n")
    print(summaryV2)
    print("Phrases:\n")
    print(phrases2)
Example #3
def get_toi_data():
    url3 = input("Enter a URL: ")
    request3 = urllib.request.Request(url3)
    result3 = urllib.request.urlopen(request3)
    soup3 = BeautifulSoup(result3.read(), 'html.parser')
    # Grab the article body from the Times of India page layout.
    prop_add3 = soup3.find('div', {'class': 'Normal'})
    p3 = prop_add3.text
    phrases3 = textrank.extract_key_phrases(p3)
    summaryV3 = textrank.extract_sentences(p3)
    print("Summary:\n")
    print(summaryV3)
    print("Phrases:\n")
    print(phrases3)
Example #4
def get_hindustan_data():
    url = input("Enter a URL: ")
    request = urllib.request.Request(url)
    result = urllib.request.urlopen(request)
    soup = BeautifulSoup(result.read(), 'html.parser')
    # Grab the article body from the Hindustan Times page layout.
    prop_add = soup.find('div', {'class': 'story-details'})
    p = prop_add.text
    # Save the extracted article text, then summarise it from the file.
    with open('t1.txt', 'w+') as f:
        f.write(p)
    with open('t1.txt') as g:
        text = g.read()
        phrases = textrank.extract_key_phrases(text)
        summaryV = textrank.extract_sentences(text)
        print("Summary:\n")
        print(summaryV)
        print("Phrases:\n")
        print(phrases)
Example #5
def create_summary(page):
    resp = urllib.request.urlopen(page)
    soup = bs4.BeautifulSoup(resp, 'html.parser')

    title = soup.find('h1').text
    # Concatenate paragraph text, skipping the site's boilerplate footer.
    text = ""
    for pgraph in soup.find_all('p'):
        if "Use of this site constitutes" not in pgraph.text:
            text = text + pgraph.text

    text = remove_refs(text)

    summary = textrank.extract_sentences(text)
    summary = remove_trail(summary, ".")
    summary = summary + "."
    print('\n', title, '\n')
    print(summary)
Example #6
def get_summary(article):
    """Takes a cleaned article as input
    :returns: summry of article (str)
    """

    article = [buzzclean(line) for line in article]
    sentences = []
    sentences = [re.sub("\s+", " ", line) for line in article]
    sentences = [punkt.tokenize(sentence.lower()) for sentence in sentences]
    sentences = list(chain.from_iterable(sentences))
    article = ' '.join(sentences)
    summary = textrank.extract_sentences(article,
                                         summary_length=150,
                                         clean_sentences=True)
    sentsum = punkt.tokenize(summary)
    sentsum = [sent.capitalize() for sent in sentsum]
    summary = ' '.join(sentsum)
    return summary
Example #7
def summary(df):

    lSum = []

    print('\nSummarising & Extracting: \n', end='')

    for i in range(len(df)):
        print('\r', end='')
        print("Completed: " + str(round((i + 1) / len(df) * 100, 1)) + "%",
              end="",
              flush=True)
        summary = textrank.extract_sentences(df.iloc[i, 0])
        kwords = textrank.extract_key_phrases(df.iloc[i, 0])
        lSum.append([summary, kwords])

    dfSummaries = pd.DataFrame(lSum)
    dfSummaries.columns = ["summaries", "keywords"]

    return dfSummaries
Example #8
def textrank():
    body = request.get_json()
    return extract_sentences(body['data'], k=int(body['expectedLen']))
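
The handler above presumably runs inside a web framework; here is a minimal self-contained sketch assuming Flask, since request.get_json() matches Flask's request API. The route path, port, and the framing around the handler are illustrative, not taken from the original example.
from flask import Flask, request
from textrank import extract_sentences  # assumed: the handler calls extract_sentences unqualified

app = Flask(__name__)

@app.route('/textrank', methods=['POST'])
def textrank():
    body = request.get_json()
    # Summary length comes from the request payload, as in the handler above.
    return extract_sentences(body['data'], k=int(body['expectedLen']))

if __name__ == '__main__':
    # POST {"data": "<article text>", "expectedLen": "3"} to /textrank to get a summary.
    app.run(port=5000)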
Example #9
def generate_titles(file_name,
                    random=False,
                    use_rake=False,
                    use_summa_text_rank=False,
                    use_text_rank=False):
    logger.info("Opening file")
    text_file = open(file_name)
    logger.info("Reading file")
    raw_text = text_file.read().lower()
    # Drop non-ASCII characters.
    raw_text = raw_text.encode('ascii', 'ignore').decode('ascii')

    #Convert raw text to word tokens
    logger.info("Tokenizing")
    stripped_text = raw_text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(stripped_text)

    #Remove stopwords
    logger.info("Removing stopwords")
    stop_words = set(stopwords.words('english'))
    #NOTE: we need to include some more stopwords, as 'english' doesn't contain some stopwords
    #      related to journal articles (e.g., "et" and "al" in "et al.")
    stop_words.update(ADDITIONAL_STOPWORDS)
    filtered_text = [word for word in tokens if word not in stop_words]

    #Create Corpus object for input text
    logger.info("Creating corpus object")
    input_text = Corpus(raw_text, tokens, filtered_text)
    input_text.stop_words = stop_words

    logger.info("Filtered words to use")
    logger.info("\t %s" % input_text.filtered_tokens[:5])

    #NOTE: stopwords are removed before POS tags assigned, this could
    #      potentially degrade POS tagging performance - may want to
    #      switch this order
    #Demonstrate functions
    logger.info("Getting POS tags")
    input_text.pos_tags = pos_tagger(input_text)
    logger.info("\t %s" % input_text.pos_tags[:5])

    logger.info("Finding all used parts of speech.")
    input_text.used_pos = set([tag[1] for tag in input_text.pos_tags])
    logger.info(input_text.used_pos)

    logger.info("Getting stemmed words")
    input_text.stemmed_words = stem_tokens(input_text)
    logger.info("\t %s" % input_text.stemmed_words[:5])

    # split the stemmed words into ~equal-sized groups
    logger.info("Splitting the stemmed words into groups")
    #logger.info("There are %s words in this group" % len(input_text.stemmed_words))
    num_splits = 2
    input_text.splits = split_tokens(input_text, num_splits)
    #for s in input_text.splits:
    #    logger.info("%s %s\n\n" % (s,len(s)))

    logger.info("Getting word frequency and proximity")
    cutoff = 0.125
    if len(input_text.filtered_tokens) < 250:
        cutoff = 0.35  #33
    input_text.word_freq_proximity = stems_frequency_proximity(
        input_text, cutoff)
    #logger.info("\t %s" % (input_text.word_freq_proximity[u'becom'],))

    logger.info("Mapping filtered words and their stemmed forms")
    input_text.filtered_word_and_bases, input_text.filtered_bases_and_words = stems_and_bases(
        input_text)
    #logger.info("\t %s" % input_text.filtered_word_and_bases[u'becom'])

    logger.info("Mapping POS tags and words")
    input_text.pos_tag_and_words = pos_tags_and_words(input_text)
    #logger.info("\t %s" % input_text.pos_tag_and_words['NNS'][:5])

    logger.info("------ End Processing ------\n\n")

    ##########################

    if use_rake:
        logger.info("------ Begin Rake ------")
        """More information at: https://github.com/fabianvf/python-rake"""

        r = Rake(RAKE.SmartStopList())  #stop_words_list)
        sorted_keywords = r.run(input_text.raw_text)
        logger.info("Sorted keywords: %s" % sorted_keywords[:5])
        logger.info("------ End Rake ------\n\n")

    if use_summa_text_rank:
        logger.info("------ Begin SummaTextRank ------")
        """More information at https://github.com/summanlp/textrank"""
        logger.info("Sentence(s) summary: %s" % summarizer.summarize(raw_text))
        logger.info("Keywords: %s" % keywords.keywords(raw_text))

        logger.info("------ End SummaTextRank ------\n\n")

    if use_text_rank:
        logger.info("------ Begin TextRank ------")
        """More information at https://github.com/davidadamojr/TextRank"""

        logger.info("Sentence(s) summary: %s " %
                    textrank.extract_sentences(raw_text))
        logger.info("Keywords: %s" % textrank.extract_key_phrases(raw_text))

        logger.info("------ End TextRank ------\n\n")

    ##########################

    logger.info("------ Begin Weighting ------")

    logger.info("Calculating word weights")
    input_text.word_weights = get_word_weights(input_text, random)

    logger.info("Printing word weights")
    weight_thresh = -1
    print_words_with_weight_above(weight_thresh, input_text.word_weights,
                                  input_text)

    logger.info("------ End Weighting ------\n\n")

    ##########################

    logger.info("------ Begin Building ------")

    titles = build_titles(input_text)

    logger.info("------ End Building ------\n\n")

    ##########################

    logger.info("Closing file")
    text_file.close()

    ##########################

    logger.info("------ Begin Ranking ------")

    #NOTE: the scores denote the title rankings relative to one another
    #      1 denotes the title with the highest rank and 0 denotes the
    #      title with the lowest rank (determined by a combination of
    #      summed word weights and average word weight)
    titles_ranked = order_titles(titles, input_text)

    logger.info("------ End Ranking ------\n\n")

    ##########################

    return titles_ranked
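
An illustrative call to the pipeline above; the file name is hypothetical and only the TextRank comparison pass is switched on.
# Hypothetical usage: build and rank candidate titles for a local article,
# logging the TextRank summary and key phrases along the way.
ranked_titles = generate_titles('article_body.txt', use_text_rank=True)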
Example #10
def __extract_sentences(text):
    sentences = textrank.extract_sentences(text)
    return sentences
Example #11
def printSummaries(collection):
    for ind, text in enumerate(collection):
        print('Sample {:d}'.format(ind))
        print(textrank.extract_sentences(text, summary_length=200))
        #print(textrank.extract_key_phrases(text))
        print('')
Example #12
def getSummaries(collection):
    return [(row.id, textrank.extract_sentences(row.text, summary_length=200))
            for row in collection.itertuples()]
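
A hedged usage sketch for getSummaries: it assumes collection is a pandas DataFrame whose rows expose id and text attributes via itertuples(); the sample rows are illustrative.
import pandas as pd

# Toy DataFrame matching the attributes the function reads (row.id, row.text).
collection = pd.DataFrame({
    'id': [1, 2],
    'text': ["First article body ...", "Second article body ..."],
})
id_summary_pairs = getSummaries(collection)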
Example #13
def extract_summary(filename):
    """Write the summary of the given file to summarise.txt."""
    with open(filename) as f:
        summary = textrank.extract_sentences(f.read())
    with open("summarise.txt", "w") as fr:
        fr.write(summary)
Example #14
def extract_summary(filename):
    """Print summary text to stdout."""
    with open(filename) as f:
        summary = textrank.extract_sentences(f.read())
        print(summary)
Example #15
import textrank

textrank.setup_environment()
filename = 'tests/ari/editorial.txt'
with open(filename) as f:
    summary = textrank.extract_sentences(f.read(), 108)
    print(summary)
    with open("tests/ari/Output.txt", "w") as text_file:
        text_file.write(summary)