Example 1
def gatherPostData(soup):
  # Find post title
  post_title = clean_html(str(soup.find('a', {'rel':'bookmark'})))
  
  # Find publish date
  date = clean_html(str(soup.find('div', {'class':'date'}).find('span')))
  
  # Find author
  author = clean_html(str(soup.find('a', {'rel':'author'})))
  
  # Find number of comments
  num_comments = int(clean_html(str(soup.find('div', {'class':'comm'}).find('a'))).split()[0].replace(',', ''))

  return [post_title, date, author, num_comments]
Example 2
def punctuation_marks(document):
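    # Strip the markup, then count basic punctuation marks as features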
    text = clean_html(document.content)
    return {
     "#dots": text.count("."),
     "#commas": text.count(","),
     "#questions": text.count("?"),
     "#exclamations": text.count("!")
    }
Example 3
def spelling_errors(document):
    d = enchant.Dict("en_US")
    num = 0
    tokens = nltk.word_tokenize(clean_html(document.content))
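    # Count tokens of length >= 2 that the en_US dictionary does not recognize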
    
    for token in tokens:
        if len(token) >= 2 and not d.check(token):
            num += 1
    
    return {
     '#spelling_errors': num
    }
Example 4
def sentiment(document):
    classifier = nltk.data.load("classifiers/polarity_NaiveBayes.pickle")
    words = nltk.word_tokenize(clean_html(document.content))
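    # Combine unigrams and bigrams (presumably nltk.util.ngrams) into one flat feature list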
    words_ngrams = reduce(operator.add, [words if n == 1 else ngrams(words, n) for n in [1,2]])
    
    features = dict([(words_ngram, True) for words_ngram in words_ngrams])    
    
    polarity = classifier.classify(features)
    
    return {
     "?polarity": (0 if polarity[0] == "neg" else 1) 
    }
Example 5
def text_complexity(document):
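    # Entropy-style complexity score computed from the lowercased word-frequency distribution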
    word_freq = nltk.FreqDist(w.lower() for w in nltk.word_tokenize(clean_html(document.content)))
    n = len(word_freq.samples())
    c = 0.0
    log10_n = math.log10(n)
    
    for word, freq in word_freq.items():
        c += freq * (log10_n - math.log10(freq))

    c = c * (1.0 / n)
    
    return {
     '@text_complexity': c
    }
Example 6
    def __call__(self, env, start_response):
        # While not necessary, webob makes this easy
        request = Request(env)
        # Call up the middleware pipeline
        response = request.get_response(self.app)

        # Is the body HTML? (This assumes our WSGI app is doing sane things
        # and setting a DOCTYPE.)
        if re.match('\s*?<!DOCTYPE\s*?html', response.body):
            # If the PATH ends in .tldr
            if re.search('\.tldr\s*?$', request.path):
                # Summarize the html
                response.body = self.summary_html(
                    summarize(self.number,
                              self.context,
                              clean_html(response.body)))

        return response(env, start_response)
Example 7
def main():
    if len(sys.argv) != 3:
        print "usage: ./html2text.py <urlfile> <outputdir>"
        sys.exit(1)

    with open(sys.argv[1]) as f:
        urls = [line.rstrip('\n') for line in f]


    for url in urls:
        filename = url.rsplit('/',1)[1] + ".txt"
        response = urllib2.urlopen(url)
        text = clean_html(response.read())
        cleaned_text = re.sub(remove_footnotes, '', text)
        # Note: race condition between the exists() check and makedirs()
        if not os.path.exists(sys.argv[2]):
            os.makedirs(sys.argv[2])
        with open(os.path.join(sys.argv[2], filename), 'w') as f:
            f.write(cleaned_text)
Example 8
def part_of_speach(document):
    text = clean_html(document.content)
    sentences = nltk.sent_tokenize(text)
    
    features = {}
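    # Tally occurrences of each POS tag (of length >= 2) as a '#<TAG>' feature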
    
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(tokens)
        
        for word, tag in tagged:
            if len(tag) < 2: continue
             
            fname = "#" + tag
            if fname in features:
                features[fname] += 1
            else:
                features[fname] = 1
                
    return features
Example 9
def getWebPg(link):
	# Get url in file format
	opener = urllib2.build_opener()
	opener.addheaders = [('User-agent', 'Mozilla/5.0')]
	f = opener.open(link)

	# Read in url and store page
	page = f.read()
	f.close()

	# Parse out text (clean html tags)
	page = clean_html(page)

	match = re.search('^\s*References\s*$', page, re.MULTILINE)
	if match:
		endPage = match.start()
		page = page[:endPage]

	#print page

	#page = page.decode('utf-8');

	return page
Example 10
# Web scraping from hw5
page_to_scrape = 'http://101books.net/archives/'  # A book blog
# headers = ["Date", "Title"]
# filename = "blog_info_ramram.csv"
# readFile = open(filename, "wb")
# csvwriter = csv.writer(readFile)
# csvwriter.writerow(headers)
webpage = urllib2.urlopen(page_to_scrape)
soup = BeautifulSoup(webpage.read())
soup.prettify()

# For title table
titles = soup.findAll("li", attrs = {'class':"clear"})
for title in titles:
    t = clean_html(title.find("a")['title'])
    t = t[18::]
    print "{0}".format(t.encode('ascii', 'ignore'))

title_array = []
author_array = ['Robert']*25
# P.S. The code for scraping the author name works but doesn't return a very clean string, which is why I generated the array this way
for i in range(25):
    title = titles[i]
    t = clean_html(title.find("a")['title'])[18::]
    t = t.encode('ascii', 'ignore')
    title_array.append(t)
# print title_array
# print author_array

# Connect to the local database
Example 11
    def _clean_content(self):
        raw_content_text = self._content
        cleaned_text = clean_html(raw_content_text)
        raw_content = sub(r'\w+ \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2} ', '', cleaned_text)
        tiny_raw_content = raw_content.lower()
        return tiny_raw_content
Example 12
# Extract author
authors = soup.findAll("span", attrs={'class':'author vcard'})
 
# Extract url
urls = soup.findAll("h2")
 
# Extract post_title
post_titles = soup.findAll("h2", attrs={'class': 'entry-title'})

# Extract comment count
comments_count = soup.findAll("a", href=re.compile('comments$'))


for i in range(48)[::-1]:  # [::-1] puts the posts in chronological order
    author = authors[i]
    a = clean_html(str(author))
    date = post_dates[i]
    d = clean_html(str(date))
    url = urls[i]
    u = clean_html(str(url.find("a")["href"]))
    title = post_titles[i]
    t = clean_html(str(title.find("a")))
    count = comments_count[i]
    c = clean_html(str(count))
    # is_post: a boolean value that is 1 if your crawler thinks the page is a post
    if authors[i] != "" and post_titles[i] != "" and urls[i] != "":
        p = 1
    else:
        p = 0
    csvwriter.writerow([d, a, u, t, c, p])

readFile.close()
Example 13
soup.prettify()

# Extract posts
post_entries = soup.findAll("div", attrs={'id':re.compile('^post')})

is_post = []
post_date = []
post_author = []
post_url = []
post_titles = []


# First loop is for the homepage!
for i in post_entries:
	temp_is_post = True
	temp_titles = clean_html(str(i.find("a")))
	temp_date = i.find("div", attrs={'class':'date'})
	temp_date = clean_html(str(temp_date))
	temp_author = i.find("a", attrs={'rel':'author'})
	temp_author = clean_html(str(temp_author))
	temp_url = i.find("a", attrs={'class' : 'more-link'}).get("href")
	is_post.append(temp_is_post)
	post_date.append(temp_date)
	post_author.append(temp_author)
	post_url.append(temp_url)
	post_titles.append(temp_titles)

# Scrape from the 2nd page onward...
n = 1
for j in range(2, 536):
Example 14
#!/usr/bin/env python
import sys
from nltk.util import clean_html

if len(sys.argv) != 3:
  sys.stderr.write('Usage: <input file> <output file>\n')
  sys.exit(1)

html = ''

inpFile = open(sys.argv[1], 'r')
outFile = open(sys.argv[2], 'w')

for line in inpFile.readlines():
  html = html + line

outFile.write(clean_html(html) + '\n')

inpFile.close()
outFile.close()
Example 15
webpage = urllib2.urlopen(page_to_scrape)

# Parse it
soup = BeautifulSoup(webpage.read())
soup.prettify()

Articles = soup.findAll("div", attrs={'class':'hnews hentry item'})

for article in Articles:
	observation = []
	for item in article.findAll("span", {"id": True}): #Determine if post
		if item["id"] == "mainentry":
			observation.append(1)
		else:
			observation.append(0)
	observation.append(clean_html(str(article.find("span", attrs={"class": "timestamp"})))) #Post date
	observation.append(clean_html(str(article.find("span", attrs={"class": "author vcard"})))) #Authors
	
	past_urls = []
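	# Keep only absolute http:// links, skipping "#comments" anchors and already-seen URLs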
	for item in article.fetch("a"): #URL
		temp_url = item["href"]
		if temp_url[-10:] == "/#comments":
			continue
		elif temp_url[0:7] != "http://":
			continue
		elif str(temp_url) in past_urls:
			continue
		else:
			observation.append(str(temp_url))
			past_urls.append(str(temp_url))
			
Example 16
# Parse it
  soup = BeautifulSoup(webpage.read())
  soup.prettify()
  
# Extract petitions on page
  dates = soup.findAll("h2", attrs={"class":"date-header"})
  posts = soup.findAll("div", attrs={"class":"post hentry uncustomized-post-template"})
  urls = soup.findAll("h3", attrs={"class":"post-title entry-title"})
  titles = soup.findAll("h3", attrs={"class":"post-title entry-title"})
  comments = soup.findAll("span", attrs={"class":"post-comment-link"})
  authors = soup.findAll("a", attrs={"class":"g-profile"})

  for i in range(len(dates)):
    post = posts[i]
    p = 0
    if clean_html(str(post.find("a")["name"])): p = 1
    date = dates[i]
    d = clean_html(str(date.find("span")))
    author = authors[i]
    a = clean_html(str(author.find("span")))
    url = urls[i]
    u = clean_html(str(url.find("a")["href"]))
    title = titles[i]
    t = clean_html(str(title.find("a")))
    comment = comments[i]
    c = clean_html(str(comment.find("a"))).split()[0]
    if c == "No": c = 0
  table_source = Source("Ph D talk", 'http://phdtalk.blogspot.com/')
  session.add(table_source)
  table_scrape = Scrapes(p,d,u,t,a,c)
  table_source.scrape.append(table_scrape)
Example 17
with con:
    cur = con.cursor()
    cur.execute("SELECT id, date_txt, text FROM Bioworld_Today")
    #cur.execute("SELECT id, date_txt, text FROM Bioworld_Today WHERE date > '1990-12-31' and date < '1993-01-01'")
    articles = cur.fetchall()

    # build in order!!!
    documents = {}
    for idx, article in enumerate(articles):
        print "%d of %d" %(idx, len(articles))
        
        aid, date_txt, html = article
        d = parser.parse(date_txt)
        #year = str(date).split("-")[0]
        if d in documents:
            documents[d].append(clean_html(html.decode('utf8')))
        else:
            documents[d] = [clean_html(html.decode('utf8'))]
        
    
    dates = documents.keys()
    dates.sort()
    
    
    years = {}
    raw = ""
    for date in dates:
        raw += "\n\n-----"+str(date)+"-----\n\n"+"\n\n".join(documents[date]) # build big text
        year = date.strftime("%Y")
        if year in years:
            years[year] += documents[date]
Example 18
def cleanXML(text):
    from nltk.util import clean_html
    return clean_html(text)
Example 19
# so add -c to avoid re-downloading when the remote file is the same size
os.system("wget -r -c -l0 -t1 -N -np -A.html,shtml -erobots=off http://www.thelatinlibrary.com/indices.html")

print "Removing indices and other non-Latin files ..."
os.system("rm -r www.thelatinlibrary.com/101/ www.thelatinlibrary.com/imperialism/ www.thelatinlibrary.com/ll2/ www.thelatinlibrary.com/law/ www.thelatinlibrary.com/romhist/ www.thelatinlibrary.com/satire/ www.thelatinlibrary.com/sallust/ www.thelatinlibrary.com/historians/ www.thelatinlibrary.com/certamen/ www.thelatinlibrary.com/caligula/ www.thelatinlibrary.com/caes/ www.thelatinlibrary.com/apul/ www.thelatinlibrary.com/august.html www.thelatinlibrary.com/ammianus.html www.thelatinlibrary.com/alanus.html www.thelatinlibrary.com/apicius.html www.thelatinlibrary.com/albertanus.html www.thelatinlibrary.com/albertofaix.html www.thelatinlibrary.com/alcuin.html www.thelatinlibrary.com/avienus.html www.thelatinlibrary.com/appverg.html www.thelatinlibrary.com/arnobius.html www.thelatinlibrary.com/apuleius.html www.thelatinlibrary.com/aquinas.html www.thelatinlibrary.com/alice.html www.thelatinlibrary.com/ausonius.html www.thelatinlibrary.com/abelard.html www.thelatinlibrary.com/about.html www.thelatinlibrary.com/anselm.html www.thelatinlibrary.com/addison.html www.thelatinlibrary.com/aug.html www.thelatinlibrary.com/ambrose.html www.thelatinlibrary.com/egeria.html www.thelatinlibrary.com/hyginus.html www.thelatinlibrary.com/iordanes.html www.thelatinlibrary.com/epubs.html www.thelatinlibrary.com/erasmus.html www.thelatinlibrary.com/decl.html www.thelatinlibrary.com/des.html www.thelatinlibrary.com/eutropius.html www.thelatinlibrary.com/florus.html www.thelatinlibrary.com/forsett.html www.thelatinlibrary.com/frame1.html www.thelatinlibrary.com/frame2.html www.thelatinlibrary.com/frontinus.html www.thelatinlibrary.com/commodianus.html www.thelatinlibrary.com/curtius.html www.thelatinlibrary.com/dante.html www.thelatinlibrary.com/contemp.html www.thelatinlibrary.com/cred.html www.thelatinlibrary.com/fulgentius.html www.thelatinlibrary.com/gaius.html www.thelatinlibrary.com/gellius.html www.thelatinlibrary.com/gestafrancorum.html www.thelatinlibrary.com/celtis.html www.thelatinlibrary.com/corvinus.html www.thelatinlibrary.com/godfrey.html www.thelatinlibrary.com/bultelius.html www.thelatinlibrary.com/claudian.html www.thelatinlibrary.com/cassiodorus.html www.thelatinlibrary.com/bible.html www.thelatinlibrary.com/caes.html www.thelatinlibrary.com/columba.html www.thelatinlibrary.com/campion.html www.thelatinlibrary.com/capellanus.html www.thelatinlibrary.com/columella.html www.thelatinlibrary.com/cato.html www.thelatinlibrary.com/certamen.html www.thelatinlibrary.com/christian.html www.thelatinlibrary.com/cic.html www.thelatinlibrary.com/classics.html www.thelatinlibrary.com/boethiusdacia.html www.thelatinlibrary.com/bede.html www.thelatinlibrary.com/bennett.html www.thelatinlibrary.com/bernardcluny.html www.thelatinlibrary.com/balde.html www.thelatinlibrary.com/bacon.html www.thelatinlibrary.com/manilius.html www.thelatinlibrary.com/miscmed.html www.thelatinlibrary.com/nemesianus.html www.thelatinlibrary.com/martial.html www.thelatinlibrary.com/malaterra.html www.thelatinlibrary.com/neo.html www.thelatinlibrary.com/nepos.html www.thelatinlibrary.com/marcellinus.html www.thelatinlibrary.com/liberpontificalis.html www.thelatinlibrary.com/may.html www.thelatinlibrary.com/medieval.html www.thelatinlibrary.com/melancthon.html www.thelatinlibrary.com/mirandola.html www.thelatinlibrary.com/misc.html www.thelatinlibrary.com/modinst.html www.thelatinlibrary.com/newton.html www.thelatinlibrary.com/leo.html www.thelatinlibrary.com/nithardus.html www.thelatinlibrary.com/lhomond.html 
www.thelatinlibrary.com/notitia.html www.thelatinlibrary.com/luther.html www.thelatinlibrary.com/phaedrus.html www.thelatinlibrary.com/lactantius.html www.thelatinlibrary.com/martinofbraga.html www.thelatinlibrary.com/leges.html www.thelatinlibrary.com/mapps.html www.thelatinlibrary.com/lucan.html www.thelatinlibrary.com/lucretius.html www.thelatinlibrary.com/orosius.html www.thelatinlibrary.com/ovid.html www.thelatinlibrary.com/ottofreising.html www.thelatinlibrary.com/papal.html www.thelatinlibrary.com/pascoli.html www.thelatinlibrary.com/patricius.html www.thelatinlibrary.com/pauldeacon.html www.thelatinlibrary.com/landor.html www.thelatinlibrary.com/leothegreat.html www.thelatinlibrary.com/liv.html www.thelatinlibrary.com/justin.html www.thelatinlibrary.com/justinian.html www.thelatinlibrary.com/juvenal.html www.thelatinlibrary.com/jerome.html www.thelatinlibrary.com/janus.html www.thelatinlibrary.com/sedulius.html www.thelatinlibrary.com/sall.html www.thelatinlibrary.com/ter.html www.thelatinlibrary.com/solinus.html www.thelatinlibrary.com/ritchie.html www.thelatinlibrary.com/sabinus.html www.thelatinlibrary.com/sidonius.html www.thelatinlibrary.com/sannazaro.html www.thelatinlibrary.com/sigebert.html www.thelatinlibrary.com/williamtyre.html www.thelatinlibrary.com/sen.html www.thelatinlibrary.com/tertullian.html www.thelatinlibrary.com/seneca.html www.thelatinlibrary.com/sha.html www.thelatinlibrary.com/vallauri.html www.thelatinlibrary.com/silius.html www.thelatinlibrary.com/waltarius.html www.thelatinlibrary.com/spinoza.html www.thelatinlibrary.com/statius.html www.thelatinlibrary.com/suet.html www.thelatinlibrary.com/sulpiciusseverus.html www.thelatinlibrary.com/tac.html www.thelatinlibrary.com/theodosius.html www.thelatinlibrary.com/tib.html www.thelatinlibrary.com/valeriusflaccus.html www.thelatinlibrary.com/vitruvius.html www.thelatinlibrary.com/readme2005.html www.thelatinlibrary.com/readme2007.html www.thelatinlibrary.com/richerus.html www.thelatinlibrary.com/readme2006.html www.thelatinlibrary.com/readme1999.html www.thelatinlibrary.com/readme.html www.thelatinlibrary.com/readme2000.html www.thelatinlibrary.com/readme1998.html www.thelatinlibrary.com/readme2001.html www.thelatinlibrary.com/readme2002.html www.thelatinlibrary.com/readme2003.html www.thelatinlibrary.com/readme2004.html www.thelatinlibrary.com/quintilian.html www.thelatinlibrary.com/livius/ www.thelatinlibrary.com/livy/liv.per.shtml www.thelatinlibrary.com/plautus.html www.thelatinlibrary.com/pliny.html www.thelatinlibrary.com/pliny1.html www.thelatinlibrary.com/augustine/serm.shtml www.thelatinlibrary.com/cicero/adbrutum.shtml www.thelatinlibrary.com/cicero/cat.shtml www.thelatinlibrary.com/cicero/fam.shtml www.thelatinlibrary.com/cicero/fin.shtml www.thelatinlibrary.com/cicero/inventione.shtml www.thelatinlibrary.com/cicero/fratrem.shtml www.thelatinlibrary.com/cicero/leg.shtml www.thelatinlibrary.com/cicero/legagr.shtml www.thelatinlibrary.com/cicero/oratore.shtml www.thelatinlibrary.com/cicero/phil.shtml www.thelatinlibrary.com/cicero/off.shtml www.thelatinlibrary.com/cicero/repub.shtml www.thelatinlibrary.com/cicero/tusc.shtml www.thelatinlibrary.com/cicero/ver.shtml www.thelatinlibrary.com/cicero/nd.shtml www.thelatinlibrary.com/cicero/epis.shtml www.thelatinlibrary.com/virgil/index.html www.thelatinlibrary.com/varro.html www.thelatinlibrary.com/valmax.html www.thelatinlibrary.com/prop.html www.thelatinlibrary.com/Voc.html www.thelatinlibrary.com/Vocab.html www.thelatinlibrary.com/Vocab2.html 
www.thelatinlibrary.com/tertullian/tertullian.cultu.shtml www.thelatinlibrary.com/tertullian/tertullian.marcionem.shtml www.thelatinlibrary.com/tertullian/tertullian.nationes.shtml www.thelatinlibrary.com/tertullian/tertullian.uxor.shtml www.thelatinlibrary.com/prud.html www.thelatinlibrary.com/pomponius.html www.thelatinlibrary.com/sedulius.html www.thelatinlibrary.com/vegetius.html www.thelatinlibrary.com/vell.html www.thelatinlibrary.com/verg.html www.thelatinlibrary.com/addison.html www.thelatinlibrary.com/albertanus.html")

print "Stripping HTML and changing extensions to .txt ..."
for r,d,f in os.walk("www.thelatinlibrary.com"):
    for files in f:
        if files.endswith("html"):
            path = os.path.join(r,files)
            opened = open(path, 'r')
            readed = opened.read()
            opened.close()
            new_opened = open(path, "w")
            new_opened.write(clean_html(readed))
            new_opened.close()
            fileName, fileExtension = os.path.splitext(path)
            os.rename(fileName + fileExtension, fileName + ".txt")

print "Creating Public Domain LICENSE ..."
os.system("touch www.thelatinlibrary.com/LICENSE.md")
os.system("printf 'Public Domain Mark 1.0\n----------------------\n### No Copyright\nThis work has been identified as being free of known restrictions under copyright law, including all related and neighboring rights.\n\nYou can copy, modify, distribute and perform the work, even for commercial purposes, all without asking permission. See Other Information below.\n\n### Other Information\n- The work may not be free of known copyright restrictions in all jurisdictions.\n- Persons may have other rights in or related to the work, such as patent or trademark rights, and others may have rights in how the work is used, such as publicity or privacy rights.\n- In some jurisdictions moral rights of the author may persist beyond the term of copyright. These rights may include the right to be identified as the author and the right to object to derogatory treatments.\n- Unless expressly stated otherwise, the person who identified the work makes no warranties about the work, and disclaims liability for all uses of the work, to the fullest extent permitted by applicable law.\n- When using or citing the work, you should not imply endorsement by the author or the person who identified the work.\n\nA copy of this Mark is available at: <https://creativecommons.org/publicdomain/mark/1.0/>.' >> LICENSE.md")

print "Creating README.md ..."
os.system("touch www.thelatinlibrary.com/README.md")
os.system('printf "About the Latin Library\n=======================\n\nThe Latin Library is a collection of a wide variety of texts from the archaic period to the modern era. Altogether the corpus is about 108 MB.\n\nThese files are in the public domain, [as explained here](http://thelatinlibrary.com/about.html). For a declaration of their status in public domain, see LICENSE.md." >> README.md')

print "Renaming corpus to thelatinlibrary ..."
os.system("mv www.thelatinlibrary.com thelatinlibrary")
Example 20
  # Parse it
  soup = BeautifulSoup(webpage.read())
  soup.prettify()

  # extract all post_titles, publish_dates, comment_counts, urls
  post_title = soup.findAll("h2", attrs={'class':'entry-title'})
  publish_date = soup.findAll("div", attrs={'class':'postMeta'})
  comment_count = soup.findAll("p", attrs={'class':'container'})
  urls = soup.findAll("h2", class_="entry-title")



  for i in range(10):
    date = publish_date[i]
    d = clean_html(str(date.find("span", attrs={'class':'date'})))
    # d_clean = d.encode('ascii', 'ignore')
    d_clean = d.decode('utf-8')
        
    url = urls[i]
    u = url('a')[0]['href']
    # u_clean = u.encode('ascii', 'ignore')
    u_clean = u.decode('utf-8')

    post = post_title[i]
    p = clean_html(str(post.find("a")))
    # p_clean = p.encode('ascii', 'ignore')
    p_clean = p.decode('utf-8')

    comment = comment_count[i]
    c = clean_html(str(comment.find("span", attrs={'class':'comments'})))
Example 21
def remove_html(text):
    # Strip the HTML tags and URLs from the text
    return clean_html(text)
Example 22
		links.append(link.find("a").get('href'))
	for link in soup.findAll("div", attrs = {'class':'pagenav clearfix'}):
		for ref in link.findAll("a"):  # not sure how this could have avoided a nested for loop
			links.append(ref.get("href"))

	# add all links on the current page that haven't been visited and aren't already on the list
	# inspiration for this step taken from David Carlson's code
	for link in links:
		if (link not in links2) and (link not in visited):
			links2.append(link)

	if len(soup.findAll(content = "article")) != 0: #is it a post? If so add info and if not move to next
		is_post = True
		date = soup.findAll("time", attrs = {'class':"post-date"})
		for i in date:
			post_date =  clean_html(str(i))
		author = soup.findAll("a", attrs = {'rel':"author"})
		for i in author:
			post_author =  clean_html(str(i))	
		title = soup.findAll("h1", attrs = {'class':'post-title'})
		for i in title:
			post_title =  clean_html(str(i))
		comments = soup.findAll("span", attrs = {'class':'post-comment'})
		for i in comments:
			post_comments =  clean_html(str(i))
			if post_comments == "No Comments":
				post_comments = 0
		csvwriter.writerow([page_to_scrape, is_post, post_date, post_author, post_title, post_comments]) #Write these to the CSV  
	else:
		csvwriter.writerow([page_to_scrape, False, None, None, None, None]) #Write these to the CSV
	
Example 23
# Parse it
soup = BeautifulSoup(webpage.read())
soup.prettify()

# Links to posts
links = soup.findAll("li", attrs = {'class':"clear"})
l = []
for link in links:
  l.append(str(link.find("a")['href']))
# print l

# Is post?
posts = soup.findAll("li", attrs = {'class':"clear"})
print len(posts)
for post in posts:
  p =  clean_html(str(post.find("a")['rel']))
  print p

# Date
dates = soup.findAll("li", attrs = {'class':"clear"})
print len(dates)
for date in dates:
  d = clean_html(str(date.find("span")))
  print d

# Title
titles = soup.findAll("li", attrs = {'class':"clear"})
print len(titles)
for title in titles:
  t = clean_html(title.find("a")['title'])
  t = t[18::]
Example 24
# links = []
# for tag in tags:
#   links.append(tag['href'])
# print links
# 
# for i in links:
#   webpages[i]=urllib2.urlopen(links[i]) # must be indices not unicode
#   time.sleep(1)
  
# soups = BeautifulSoup(webpages.read())

# Is post?
posts = soup.findAll("div", attrs = {'class':"entry"})
print len(posts)
for post in posts:
  p =  clean_html(str(post.find("a")['rel']))
  print p

# Author
# authors = soup.findAll("div", attrs = {'class': "single-post-meta"})
# for author in authors:
#   a = clean_html(str(author.find("div")))
#   print a

# Date
dates = soup.findAll("div", attrs = {'class':"post-date"})
print len(dates)
for date in dates:
  d = clean_html(str(date.find("p")))
  print d
Example 25
if __name__ == '__main__':	
	import sys
	from nltk.util import clean_html
	print sys.argv
	inHTML = sys.argv[1]
	outText = sys.argv[2]
	fd = open(inHTML)
	ct = fd.read()
	fd.close()
	text = clean_html(ct)
	fd = open(outText, 'w')
	fd.write(text)
	fd.close()
Example 26
webpage = urllib2.urlopen(page_to_scrape)	# Open homepage

soup = BeautifulSoup(webpage.read())		# Parse homepage
soup.prettify()		
	
flight_deals = soup.findAll("a", attrs={'rel':'bookmark'})	# Find first post from the homepage
first_deal = flight_deals[0]
new_url = first_deal.get('href')
print new_url

webpage = urllib2.urlopen(new_url)		# Open the first post page
soup = BeautifulSoup(webpage.read())
soup.prettify()

url_div = soup.find("div", attrs={'class':'prev_next'})
previous_checker = clean_html(str(url_div.find("p")))

flights_array = []		# Array used to store info from each post before putting into CSV file

while "Previous post" in previous_checker:		# Each post's html links to "previous post", except the final one
												# So, while loop continues until the final post on the blog
	post_page = urllib2.urlopen(new_url)		# Open the webpage with the blog post

	soup = BeautifulSoup(post_page.read())		# Parse the webpage
	soup.prettify()
	
	title_of_deal = soup.find("title")			# Extract title of deal on page
	clean_title = clean_html(str(title_of_deal))

	date_deal_posted = soup.find("p", attrs={'class':'headline_meta'})	# Extract date of deal post
	clean_date = clean_html(str(date_deal_posted))
Example 27
    def open_html_file(self, html_file):
        html_and_text = urlopen(html_file).read()
        raw_text_and_space = clean_html(html_and_text)
        return raw_text_and_space
Example 28
time.sleep(2)

# Extract the author
authors = soup.findAll("div", attrs={'class':'blog-byline'})

time.sleep(2)

# Extract the time and the date 
dates = soup.findAll("span", attrs={'class':'updated'})

# Indicate that the page is a post if articles are listed in reverse chronological order
dates_list = []
for i in range(95):
	date = dates[i]
	date = clean_html(str(date))  # Clean up the markup
	date = date.split(", ")[1::]  # Get rid of the time part
	date = ''.join(date)  # Join month, day, and year as a string
	date = date.replace('/','')  # Get rid of the forward slashes
	date = int(date)  # Make it an integer, which should be bigger for later dates in July 2014
	dates_list.append(date)
posts_list = []
for i in range(94):
	if dates_list[i] >= dates_list[i+1]:  # Compare two contiguous dates
		posts_list.append(True)
		if i + 1 == 94:  # Add True to the last article if the previous article got True
			posts_list.append(True)
	else:
		posts_list.append(False)
		if i + 1 == 94:  # Add False to the last article if the previous article got False
			posts_list.append(False)
Example 29
# Extract date
	dates = soup.findAll("span", attrs={'class':'postdate'}) 
  
# Extract author
	authors = soup.findAll("span", attrs={'class':'postauthor'})
  
# Extract url
	urls = soup.findAll("h2", attrs={'class':'posttitle'})
  
# Extract comments
	comments = soup.findAll("span", attrs={'class':'postcomment'})

	
	for i in range(len(titles)):
		title = titles[i]
		t = clean_html(str(title.find("a")))
		date = dates[i]
		d = clean_html(str(date))
		d = datetime.strptime(d, "%d %B %Y, %I:%M %p")		#converts it into standard date time format
		a = authors[i]
		a = clean_html(str(a.find("a")))
		url = urls[i]
		u = url.find("a").get("href")
		comment = comments[i]
		c = clean_html(str(comment.find("a")))
		c = re.findall(r"\d\S*", c)
		if c == []: c = 0
		else: c = int(c[0])
		ispost = False
		if (len(t) != 0) and (len(str(d)) != 0) and (len(u) != 0) and isinstance(c, int): ispost = True
		csvwriter.writerow([ispost, d, a, u, t, c])
Example 30
csvwriter = csv.writer(readFile)
csvwriter.writerow(headers)

# Open webpage
webpage = urllib2.urlopen(page_to_scrape)

# Parse it
soup = BeautifulSoup(webpage.read())
soup.prettify()

# Extract petitions on page
#petitions = soup.findAll("a", href=re.compile('^/petition'))
petitions = soup.findAll("div", attrs={'class':'title'})
print len(petitions)
for petition in petitions:
  p = clean_html(str(petition.find("a")))
  print p

signatures = soup.findAll("div", attrs={'class':'num-sig'})
print len(signatures)
for signature in signatures:
  s = clean_html(str(signature.find("span", attrs={'class':'num'})))
  print s

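# Write each petition title and its signature count as one CSV row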
for i in range(20):
  petition = petitions[i]
  p = clean_html(str(petition.find("a")))
  signature = signatures[i]
  s = clean_html(str(signature.find("span", attrs={'class':'num'})))
  csvwriter.writerow([p, s])
Example 32
readFile = open(filename, "wb")
csvwriter = csv.writer(readFile)
csvwriter.writerow(headers)

# Open webpage
webpage = urllib2.urlopen(page_to_scrape)

# Parse it
soup = BeautifulSoup(webpage.read())
soup.prettify()

# Extract titles on page
titles = soup.findAll("h2", attrs={'class':'entry-title'})
#print len(titles)
for title in titles:
	p = clean_html(str(title.find("a")))
#	print p

authors = soup.findAll("div", attrs={'class':'blog-byline'})
#print len(authors)
for author in authors:
	s = clean_html(str(author))
	s = "".join(s.split("By ")[1::])
#	print s

dates = soup.findAll("span", attrs={'class':'timestamp'})
#print len(dates)
for date in dates:
	d = clean_html(str(date))
	d = "".join(d.split("Posted at ")[1::])
#	print d