def summarizeAndBigram(filename, url):
    # Request the HTML from the given URL and build a BeautifulSoup object
    # (note: the filename parameter is not used in this function body)
    headers = {'User-Agent': 'Mozilla/5.0'}
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'html5lib')

    all_paras = soup.find_all("div", {"class": "has-content-area"})

    data_2018 = ""
    for para in all_paras:
        data_2018 = data_2018 + para.text

    article_sum = ru.summarize(data_2018)

    # Print summary gathered above
    print "Summary of data mining article"
    print "Three sentence summary"
    for sent in article_sum['top_n_summary']:
        print removeUnicode(sent)

    # Take the data extracted from the site and
    # create the bigrams based on that data.
    print "--------------------"
    print "Bigrams:"
    asc_2018 = removeUnicode(data_2018)
    bigWords = nltk.tokenize.word_tokenize(asc_2018)
    N = 25
    search = nltk.BigramCollocationFinder.from_words(bigWords)
    search.apply_freq_filter(2)
    search.apply_word_filter(
        lambda skips: skips in nltk.corpus.stopwords.words('english'))

    from nltk import BigramAssocMeasures
    idxJaccard = BigramAssocMeasures.jaccard
    bigrams = search.nbest(idxJaccard, N)

    # Print the bigrams after the filters have been applied
    for bigram in bigrams:
        print str(bigram[0]).encode('utf-8'), " ", str(
            bigram[1]).encode('utf-8')

    print
    print
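# A minimal usage sketch (added, not part of the original snippet). The URL is
# a placeholder: the function expects the article body to live inside
# <div class="has-content-area"> elements, and the filename argument is not
# used inside the function body above.
summarizeAndBigram("article.rtf", "https://example.com/some-article")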
Example #2
print "Sentiment Summation: %f" % overall_sentiment

print "***********Summary*******************"

fileObj = codecs.open("DT_Platform.rtf", "w", "UTF")
html = requests.get("https://donaldjtrump.com/positions/tax-reform")
soup = BeautifulSoup(html.text, "html5lib")
all_paras = soup.find_all('p')

data_trump = ""

for para in all_paras:
    fileObj.write(para.text)
    data_trump = data_trump + para.text

trump_sum = ru.summarize(data_trump)

print "Summary of Trump tax reform: "
for sent in trump_sum['top_n_summary']:
    print removeUnicode(sent)

articleAscii = removeUnicode(data_trump)
words = []

# N is the number of collocations (bigrams) to find
N = 25

# Need the list of words, collected sentence by sentence
sentences = nltk.tokenize.sent_tokenize(articleAscii)
for sentence in sentences:
    for word in nltk.tokenize.word_tokenize(sentence):
        words.append(word)
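# The original snippet breaks off after collecting the words; a hedged sketch
# of the likely continuation, mirroring the bigram steps in Example #1
# (frequency filter, stopword filter, then the top-N Jaccard-ranked bigrams).
search = nltk.BigramCollocationFinder.from_words(words)
search.apply_freq_filter(2)
search.apply_word_filter(
    lambda skips: skips in nltk.corpus.stopwords.words('english'))
for bigram in search.nbest(nltk.BigramAssocMeasures.jaccard, N):
    print bigram[0], bigram[1]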
Example #3
def scrapePage():
    page = "https://www.google.com/intl/en/policies/technologies/"
    print "------------------------"
    print "     Page: ", page
    print "------------------------"

    html = requests.get(page)
    soup = BeautifulSoup(html.text, 'html5lib')
    all_paras = soup.find_all('p')
    data_2017 = ""
    for para in all_paras:
        data_2017 = data_2017 + para.text

    # Summarize once, after all the paragraph text has been collected
    article_sum = ru.summarize(data_2017)

    print "------------------------"
    print "  Three Sentence Summary"
    print "------------------------"
    for sent in article_sum['top_n_summary']:
        print removeUnicode(sent)

    asc_2017 = removeUnicode(data_2017)
    lstSent = nltk.tokenize.sent_tokenize(asc_2017)
    sentWords = [nltk.tokenize.word_tokenize(s) for s in lstSent]
    posWords = [nltk.pos_tag(w) for w in sentWords]
    posWords = [token for sent in posWords for token in sent]

    chunkCollector = []
    foundChunk = []
    lastPos = None
    for (token, pos) in posWords:
        if pos == lastPos and pos.startswith('NN'):
            foundChunk.append(token)
        elif pos.startswith('NN'):
            if foundChunk != []:
                # Something is in the hopper, so add it to the collection;
                # join with spaces so multi-word chunks stay readable
                chunkCollector.append((' '.join(foundChunk), pos))
            foundChunk = [token]
        lastPos = pos

    dChunk = {}
    for chunk in chunkCollector:
        dChunk[chunk] = dChunk.get(chunk, 0) + 1

    print "------------------------"
    print " Most Common Noun Usage"
    print "------------------------"
    for (entity, pos) in sorted(dChunk, key=dChunk.get, reverse=True)[:7]:
        print '\t%s (%s)' % (entity, dChunk[entity, pos])

    chunkCollector = []
    foundChunk = []
    lastPos = None
    for (token, pos) in posWords:
        if pos == lastPos and pos.startswith('V'):
            foundChunk.append(token)
        elif pos.startswith('V'):
            if foundChunk != []:
                # Something is in the hopper, so add it to the collection
                chunkCollector.append((' '.join(foundChunk), pos))
            foundChunk = [token]
        lastPos = pos

    dChunk = {}
    for chunk in chunkCollector:
        dChunk[chunk] = dChunk.get(chunk, 0) + 1

    print "------------------------"
    print " Most Common Verb Usage"
    print "------------------------"
    for (entity, pos) in sorted(dChunk, key=dChunk.get, reverse=True)[:7]:
        print '\t%s (%s)' % (entity, dChunk[entity, pos])
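# The noun and verb passes above are nearly identical; a hedged sketch (added)
# of how they could be folded into one helper, parameterised on the POS-tag
# prefix, using exactly the same logic as the loops in scrapePage().
def collectChunks(posWords, prefix):
    chunkCollector = []
    foundChunk = []
    lastPos = None
    for (token, pos) in posWords:
        if pos == lastPos and pos.startswith(prefix):
            foundChunk.append(token)
        elif pos.startswith(prefix):
            if foundChunk != []:
                chunkCollector.append((' '.join(foundChunk), pos))
            foundChunk = [token]
        lastPos = pos
    return chunkCollector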
Example #4
fileObj = codecs.open("17_HO1.rtf", "w", "UTF")

html = requests.get("http://swe.umbc.edu/~rayg/econ_plan.html")

soup = BeautifulSoup(html.text, 'html5lib')

all_paras = soup.find_all('p')

# Write the text to the file and collect it into a string variable
data_2017 = ""

for para in all_paras:
    fileObj.write(para.text)
    data_2017 = data_2017 + para.text
    
Iceberg_sum = ru.summarize(data_2017)

print "Summary of new iceberg"
print "Print Three Sentence Summary"

for sentence in Iceberg_sum['top_n_summary']:
    print removeUnicode(sentence)
    
asc_2017 = removeUnicode(data_2017)

bigWords = nltk.tokenize.word_tokenize(asc_2017)
N = 25
search = nltk.BigramCollocationFinder.from_words(bigWords)

search.apply_freq_filter(2)
search.apply_word_filter(
    lambda skips: skips in nltk.corpus.stopwords.words('english'))
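# Example #4 stops here; a hedged sketch of the presumably intended final step,
# following the same pattern as Example #1: rank the surviving bigrams with the
# Jaccard index and print the top N.
from nltk import BigramAssocMeasures
for bigram in search.nbest(BigramAssocMeasures.jaccard, N):
    print bigram[0], bigram[1]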
Example #5
# Create a file to write the output to, fetch the HTML, and build the BeautifulSoup object
fileObj = codecs.open("proj2.rtf", "w", "utf-8")
html = requests.get("https://www.ecommercetimes.com/story/52616.html")
soup = BeautifulSoup(html.text, 'html5lib')

#part3
# Search through all the paragraph tags to gather data and
# use russell to summarize the incoming data
all_paras = soup.find_all('p')

data_2018 = ""
for para in all_paras:
    fileObj.write(para.text)
    data_2018 = data_2018 + para.text

article_sum = ru.summarize(data_2018)

# Print summary gathered above
print "Summary of data mining article"
print "Three sentence summary"
for sent in article_sum['top_n_summary']:
    print removeUnicode(sent)

#part4
# Take the data extracted from the site and
# create the bigrams based on that data.
print "--------------------"
print "Bigrams:"
asc_2018 = removeUnicode(data_2018)
bigWords = nltk.tokenize.word_tokenize(asc_2018)
N = 25
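# Example #5 is cut off here as well; the missing bigram steps would mirror the
# sketch added after Example #2 above (build the finder from bigWords, apply the
# frequency and stopword filters, then print the top-N Jaccard-ranked bigrams).
# The one extra step needed here is closing the output file opened above.
fileObj.close()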
print "Sentiment Summation: %f" % overall_sentiment

print "***********Summary*******************"

fileObj = codecs.open("DT_Platform.rtf", "w", "UTF")
html = requests.get("https://donaldjtrump.com/positions/tax-reform")
soup = BeautifulSoup(html.text, "html5lib")
all_paras = soup.find_all('p')

data_trump = ""

for para in all_paras:
    fileObj.write(para.text)
    data_trump = data_trump + para.text

trump_sum = ru.summarize(data_trump)

print "Summary of Trump tax reform: "
for sent in trump_sum['top_n_summary']:
    print removeUnicode(sent)

articleAscii = removeUnicode(data_trump)
words = []

#num_Co is the number of collocations to find
N=25

#need list of Words by sentence
sentences = nltk.tokenize.word_tokenize(articleAscii)
for sentence in sentences:
    for word in nltk.tokenize.word_tokenize(sentence):
Example #7
# Convert to a dictionary and count occurrences of each chunk
dChunk = {}
for chunk in chunkCollector:
    dChunk[chunk] = dChunk.get(chunk, 0) + 1

print "\nChunking"
for (entity, pos) in dChunk:
    if entity.istitle():
        print '\t%s (%s)' % (entity, dChunk[entity, pos])


# Create a three-sentence summary from the article text.
# This portion did not work with the 'russel.pyc' component that was on
# Blackboard, so it may need to be commented out.

import russell as ru
articleSum = ru.summarize(articleText)
print "Summary of Article"
print "Three Sentence Summary"
for each in articleSum['top_n_summary']:
    print removeUnicode(each)

# Tokenize first: from_words expects a sequence of words, not a raw string
search = nltk.BigramCollocationFinder.from_words(
    nltk.tokenize.word_tokenize(articleAscii))

# filter out collocations that do not occur at least 2 times
search.apply_freq_filter(2)

# Filter out collocations that have stopwords
search.apply_word_filter(lambda skip: skip in skips)

# We use the Jaccard Index to find our bigrams
# idxJaccard = nltk.metrics.BigramAssocMeasures.jaccard
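# The snippet ends with the Jaccard step commented out; a hedged sketch of how
# it would presumably be finished, matching the other examples in this listing.
N = 25  # assumed; not shown in this snippet, but used as the cutoff elsewhere
idxJaccard = nltk.BigramAssocMeasures.jaccard
for bigram in search.nbest(idxJaccard, N):
    print bigram[0], bigram[1]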
Example #8
def sentenceSummary(data):
    # Three sentence summary (#3 of grading rubric)
    summary = ru.summarize(data)
    print "----- Three sentence summary of the article -----"
    for sent in summary['top_n_summary']:
        print removeUnicode(sent)
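# A minimal usage sketch (added): pass in any of the concatenated paragraph
# strings built by the scraping snippets above, e.g. data_2018 from Example #5.
sentenceSummary(data_2018)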