Example No. 1
import urlparse
from BeautifulSoup import BeautifulSoup as b

def get_similar_image_urls(html):
    soup = b(html)
    # Walk the anchors inside the div with id 'iur' that hold the image links.
    for item in soup.find('div', {'id': 'iur'}).findAll('a', {'class': 'bia uh_rl'}):
        url = item.get('href')
        # parse_qs returns a list for each key; take the first 'imgurl' value.
        yield urlparse.parse_qs(urlparse.urlparse(url).query)['imgurl'][0]
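A hypothetical usage sketch in the same Python 2 style; the search URL, and the assumption that the fetched page contains the expected markup, are illustrative only and not part of the original snippet:

import urllib2

html = urllib2.urlopen('https://www.google.com/search?q=sunset&tbm=isch').read()
for image_url in get_similar_image_urls(html):
    print image_url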
Example No. 2
import urllib2
from collections import Counter
from BeautifulSoup import BeautifulSoup as b

def find_party_get_party_party_on(url):
    # Fetch a Wikipedia biography page.
    response = urllib2.urlopen(url)
    html = response.read()

    soup = b(html)

    # Scan the infobox rows for the "Political party" header and take the
    # last party link in that row, skipping a trailing "[1]" citation link.
    party = 'Unknown'
    for tr in soup.findAll('tr'):
        headers = tr.findAll('th')
        if len(headers) >= 1 and headers[0].text == 'Political party':
            links = tr.findAll('td')[0].findAll('a')
            if len(links) >= 1:
                if links[-1].text == '[1]':
                    party = links[-2].text
                else:
                    party = links[-1].text
                break

    # Guess gender from pronoun counts in the first five paragraphs.
    text = [p.getText() for p in soup.findAll('p')[:5]]
    conc_text = "".join(text).lower()

    counts = Counter(conc_text.split())
    #print counts['he'], counts['she']

    if counts['he'] > counts['she']:
        gender = 'Male'
    elif counts['she'] > counts['he']:
        gender = 'Female'
    else:
        gender = 'Not Found'

    return party, gender
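A hypothetical call, assuming a Wikipedia biography URL; the article chosen here is illustrative only and does not appear in the original code:

party, gender = find_party_get_party_party_on('http://en.wikipedia.org/wiki/Tony_Blair')
print party, gender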
Example No. 3
#! /usr/bin/python2
# Save every image linked from a 4chan thread, using wget for the downloads.

import urllib2 as u
from BeautifulSoup import BeautifulSoup as b
import sys, os, tkMessageBox
from Tkinter import *

start_dir = os.getcwd()
while True:
    try:
        # Return to the starting directory so the relative output path is
        # resolved the same way on every pass through the loop.
        os.chdir(start_dir)
        args = sys.argv[1:]
        page = u.urlopen(args[0])
        # Default output directory is "<board>-<thread id>" unless one is given.
        newpath = args[0].split('/')[-3] + '-' + args[0].split('/')[-1]
        if len(args) > 1:
            newpath = args[1]
        if not os.path.exists(newpath): os.makedirs(newpath)
        os.chdir(newpath)
        soup = b(page)
        # Collect the protocol-relative image links and make them absolute.
        images = set()
        for link in soup("a"):
            href = link.get('href')
            if href is not None and "//images.4chan.org" in href:
                images.add("https:" + href)
        for x in images:
            print 'saving image %s' % x
            os.system('wget -c --limit-rate=30k %s' % x)
        if not tkMessageBox.askyesno("Done", "done saving 4chan thread %s, do it again?" % args[0]):
            break
    except Exception as e:
        if not tkMessageBox.askyesno("Error", "error %s in saving 4chan thread %s, do it again?" % (str(e), args[0])):
            break
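The script is run with the thread URL as its first argument and an optional output directory as the second, e.g. python2 save_thread.py http://boards.4chan.org/g/res/123456 my_thread (the script name, board, and thread number here are illustrative assumptions).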
Example No. 4
from BeautifulSoup import BeautifulSoup as b
from collections import Counter
import urllib2
import numpy
import matplotlib.pyplot as plt
import sqlite3 as lite
import seaborn as sns

# Creates or opens a file called QTdb with a SQLite3 DB
db = lite.connect('QTdb')

# Scrape the list of Question Time episodes from Wikipedia.
response = urllib2.urlopen('http://en.wikipedia.org/wiki/List_of_Question_Time_episodes')
html = response.read()

soup = b(html)

# Accumulators for panellist names, genders, and parties.
people_all = []
genders_all = []
parties_all = []

people_multi = []
genders_multi = []
parties_multi = []

tables = soup.findAll('table', 'wikitable')[2:]  # first two tables are other content
year_headers = soup.findAll('h2')[2:-4]          # likewise with headers
years = []

def find_party_get_party_party_on(url):
    response = urllib2.urlopen(url)
    html = response.read()
    # ... (the snippet is cut off here; the complete function appears in Example No. 2)
Example No. 5
#!/usr/bin/env python
# This will return Dell warranty information. Tested and works as of 2012-02-16.
# No promises if Dell updates their page and breaks stuff.

from BeautifulSoup import BeautifulSoup as b
import sys
import requests

serviceTag = sys.argv[1]

url = "http://www.dell.com/support/troubleshooting/us/en/usgen1/Index?c=us&l=en&s=gen&cs=&t=warranty&servicetag="

r = requests.get(url + serviceTag)
if r.ok:
    soup = b(r.content)
    x = soup.find("li", "TopTwoWarrantyListItem")  # search for the CSS class inside an <li>
    days = x.text.split('.')[0]  # only want the first sentence that we match on
    print days
else:
    print "Error retrieving url"
Example No. 6
    # export() method of the docx class defined earlier in the file
    # (the class definition is cut off above this snippet).
    def export(self, filename, append=True):
        if filename.split('.')[-1] != 'txt':
            filename += '.txt'
        mode = 'a' if append else 'w'
        with open(filename, mode) as f:
            for p in self.data:
                f.write(p + "\n")


class json:
    def __init__(self, data):
        self.data = data

    def export(self, filename, append=True):
        import demjson

        js = demjson.encode(self.data)
        if filename.split('.')[-1] != 'json':
            filename += '.json'
        mode = 'a' if append else 'w'
        with open(filename, mode) as f:
            f.write(js)

if __name__ == '__main__':
    import urllib
    from BeautifulSoup import BeautifulSoup as b
    bb = b(urllib.urlopen("http://www.thehindu.com/sci-tech/science/irnss1d-launch/article7043608.ece?homepage=true").read())
    bs = bb.findAll("p")
    # Export the article title and paragraph text to "a.txt" via the docx class.
    docx(bb.find("title").text, "\n".join([s.text for s in bs])).export("a")
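For completeness, a hypothetical line that could be appended to the __main__ block above to exercise the json exporter as well; the dictionary keys and output name are assumptions, and the demjson package must be installed:

    json({"title": bb.find("title").text, "body": [s.text for s in bs]}).export("article")  # writes article.json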