Code example #1
File: poll_predict.py Project: ealehman/poll-predict
def plot_colors(xml):
    dom = web.Element(xml)
    result = {}
    for graph in dom.by_tag('graph'):
        title = _strip(graph.attributes['title'])
        result[title] = graph.attributes['color']
    return result
Code example #2
def rcp_poll_data(xml):
    dom = web.Element(xml)
    result = {}

    dates = dom.by_tag('series')[0]
    dates = {
        n.attributes['xid']: str(n.content)
        for n in dates.by_tag('value')
    }

    keys = dates.keys()

    result['date'] = pd.to_datetime([dates[k] for k in keys])

    for graph in dom.by_tag('graph'):
        name = graph.attributes['title']
        data = {
            n.attributes['xid']: float(n.content) if n.content else np.nan
            for n in graph.by_tag('value')
        }
        result[name] = [data[k] for k in keys]

    result = pd.DataFrame(result)
    result = result.sort_values('date')  # .sort(columns=...) was removed from pandas; sort_values is the current API

    return result
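A minimal usage sketch for rcp_poll_data above, assuming the same imports the snippets rely on (pattern.web as web, pandas as pd, numpy as np) plus requests. The small helper stands in for the get_poll_xml function referenced in code example #12; the chart URL pattern and the id 1044 are taken from code examples #11 and #12.
import requests

def get_poll_xml(poll_id):
    # fetch the RealClearPolitics chart XML (same URL pattern as code example #11)
    url = "http://charts.realclearpolitics.com/charts/%i.xml" % int(poll_id)
    return requests.get(url).text

poll_df = rcp_poll_data(get_poll_xml(1044))
print poll_df.head()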
Code example #3
File: poll_predict.py Project: ealehman/poll-predict
def rcp_poll_data(xml):
    dom = web.Element(xml)
    result = {}

    # extract dates
    series = dom.by_tag('series')
    date_value = series[0].by_tag('value')
    date = []
    for d in date_value:
        date.append(pd.to_datetime(d.content))
    result['date'] = date
    
    #extract result data and titles
    graphs_tag = dom.by_tag('graphs')
    graph_tags = graphs_tag[0].by_tag('graph')
    
    for graph in graph_tags:
        title = graph.attributes['title']
        values = []
        for value in graph.by_tag('value'):
            try:
                values.append(float(value.content))
            except ValueError:
                values.append(np.nan)
        result[title] = values

    result = pd.DataFrame(result)
    return result
Code example #4
def parse_results(fname):
    with open('results_raw_2015/%s' % fname, 'r') as f:
        dom = web.Element(f.read())
    fields = dom('td')
    stud_det = {
        'roll_num': web.plaintext(fields[8].content),
        'name': web.plaintext(fields[10].content),
        'mother_name': web.plaintext(fields[12].content),
        'father_name': web.plaintext(fields[14].content)
    }
    for i in range(21, len(fields) - 7, 6):
        if web.plaintext(fields[i].content) == 'Additional Subject':
            i += 1
        stud_det.update({
            web.plaintext(fields[i + 1].content) + '_theory':
            web.plaintext(fields[i + 2].content),
            web.plaintext(fields[i + 1].content) + '_practical':
            web.plaintext(fields[i + 3].content),
            web.plaintext(fields[i + 1].content) + '_total':
            web.plaintext(fields[i + 4].content),
            web.plaintext(fields[i + 1].content) + '_grade':
            web.plaintext(fields[i + 5].content)
        })
    stud_det['fin_result'] = web.plaintext(fields[-5].content)[8:]
    return stud_det
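A short usage sketch, assuming the raw result pages were downloaded beforehand into results_raw_2015/; the file name below is purely illustrative.
details = parse_results('12345.html')  # hypothetical file name under results_raw_2015/
print details['name'], details['roll_num'], details['fin_result']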
Code example #5
File: pqarchive_wsj.py Project: 2dpodcast/CS109-1
def PQarchive_url_list(start_date,
                       end_date,
                       page,
                       newspaper_tag='latimes',
                       query='romney OR obama',
                       debug=False):
    '''
    Scrapes the PQ archive system to get a list of all URLs.
    
    Inputs: M(M)-D(D)-YYYY of start and end date, page number (1-indexed)
    Output: URL List
    '''

    # split dates into M, D, Y
    start_date = start_date.split('-')
    end_date = end_date.split('-')

    options = {}

    # run the query
    url = 'http://pqasb.pqarchiver.com/' + newspaper_tag + '/results.html'
    options['st'] = 'advanced'
    options['sortby'] = 'CHRON'
    options['datetype'] = 6
    options['frommonth'] = start_date[0]
    options['fromday'] = start_date[1]
    options['fromyear'] = start_date[2]
    options['tomonth'] = end_date[0]
    options['today'] = end_date[1]
    options['toyear'] = end_date[2]
    options['type'] = 'current'
    options['start'] = (page - 1) * 10
    options['QryTxt'] = query

    # try to get url with specified parameters
    try:
        r = requests.get(url, params=options)
        html = r.text
        if debug: print r.url
    except:
        print 'Unable to parse URL list for ' + str(url)
        return None

    # declare dom object to begin parsing the data
    dom = web.Element(html)

    url_list = []
    wp_pattern_good = re.compile(u'FMT=ABS')
    wp_pattern_bad = re.compile(u'washingtonpost_historical')

    # find each url
    for a in dom('table a'):
        # check if the a tag has a title, the title matches the Preview string, and the href is not from the header faq section
        if ('title' in a.attrs) and (a.attrs['title']
                                     == 'Preview (Abstract/Citation)'
                                     ) and (a.attrs['href'] != 'faq.html#abs'):
            # add url to url_list
            url_list += [str(a.attrs['href'])]

    return url_list
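A hedged usage sketch for the function above. The date range is illustrative and only follows the M(M)-D(D)-YYYY format stated in the docstring; the keyword arguments simply restate the defaults from the signature.
urls = PQarchive_url_list('10-1-2012', '11-6-2012', 1,
                          newspaper_tag='latimes',
                          query='romney OR obama',
                          debug=True)
if urls is not None:
    for u in urls[:5]:
        print u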
Code example #6
File: poll_predict.py Project: ealehman/poll-predict
def race_result(url):
    html = requests.get(url).text
    
    dom = web.Element(html)
    result = {}
    
    #find tags unique to candidate names
    tr_tags = dom.by_tag('tr.omit')
    th_tags = tr_tags[0].by_tag('th')
    

    #extract candidate names
    candidate = []
    
    #add names to candidate list without additional chars
    for tags in th_tags[3:-1]:
        if re.search(r"\(", tags.content):
            candidate.append(tags.content[:-4]) 
        else:
            candidate.append(tags.content)
    
    #find tags unique to final polling results
    td_tags = tr_tags[0].next.by_tag('td')
    
    # extract percentages
    percentage = []
    for tags in td_tags[3:-1]:
        percentage.append(float(tags.content))
    
    result = dict(zip(candidate, percentage))

    return result
Code example #7
    def convert_answers(self):
        """
        Convert Posts.xml to the required answers_df.
        """
        
        posts_dom = web.Element(file('data/'+self.site_name+"/Posts.xml").read())
        
        aids, uids, parentids, bodies, scores = [],[],[],[], []

        for row in posts_dom.by_tag('row'):

            if row.attributes['posttypeid'] == '2': # get the answers
                aids.append(row.attributes['id'])
                if 'owneruserid' in row.attributes.keys():
                    uids.append(row.attributes['owneruserid'])
                else: uids.append(u'-999')
                scores.append(int(row.attributes['score']))
                parentids.append(row.attributes['parentid'])
                bodies.append(row.attributes['body'].encode('unicode-escape'))

        self.answers_df = pd.DataFrame(data={'post_id':aids,
                                  'user_id':uids,
                                  'parent_id':parentids,
                                  'score':scores,
                                  'answer':bodies},
                            columns=['post_id','user_id','parent_id','score','answer'])
        self.answers_df = self.answers_df.set_index('post_id') # index by (unique) post_id

        return
Code example #8
File: naver_crawler.py Project: hsy159/etc
def parsing_headline(date):  # crawl the headline rankings 1 through 30
    html = get_headline_page(date)
    element = web.Element(html)

    lst = []

    for i in range(3):
        a = html.findAll('li', {'class': 'num' + str(i + 1)})
        lst.append(
            str(a[0]).split('<dt>')[1].split('title=')[1].split('>')[0].decode(
                'utf-8'))
    for i in range(7):
        b = html.findAll('li', {'class': 'gnum' + str(i + 4)})
        #print 'Rank' + str(i+4) +': ' + str(b[0]).split('title=')[1].split('"')[1]
        lst.append(
            (str(b[0]).split('title=')[1].split('"')[1]).decode('utf-8'))
    for i in range(10):
        c = html.findAll('li', {'class': 'gnum' + str(i + 11)})
        #print 'Rank' + str(i+11) +': ' + str(c[0]).split('title=')[1].split('"')[1]
        lst.append(
            (str(c[0]).split('title=')[1].split('"')[1]).decode('utf-8'))
    for i in range(10):
        d = html.findAll('li', {'class': 'gnum' + str(i + 21)})
        #print 'Rank' + str(i+21) +': ' + str(d[0]).split('title=')[1].split('"')[1]
        lst.append(
            (str(d[0]).split('title=')[1].split('"')[1]).decode('utf-8'))
    return lst
Code example #9
    def convert_questions(self):
        """
        Convert Posts.xml to the required questions_df.
        """
        
        posts_dom = web.Element(file('data/'+self.site_name+"/Posts.xml").read())
        
        # Warning: super-hacky code follows!
        qids, uids, tags, titles, bodies, creation_dates = [],[],[],[],[],[]

        for row in posts_dom.by_tag('row'):

            if row.attributes['posttypeid'] == '1': # get the questions
                qids.append(row.attributes['id'])
                if 'owneruserid' in row.attributes.keys():
                    uids.append(row.attributes['owneruserid'])
                else: uids.append(u'-999')
                if 'tags' in row.attributes.keys():
                    tags.append(re.sub('[^-a-zA-Z]',' ',row.attributes['tags']).strip().split())
                else: tags.append([u''])
                titles.append(row.attributes['title'].encode('unicode-escape').replace(r'\n',''))
                bodies.append(row.attributes['body'].encode('unicode-escape').replace(r'\n',''))
                creation_dates.append(pd.to_datetime(row.attributes['creationdate']))
                
        self.questions_df = pd.DataFrame(data={'post_id':qids,
                              'user_id':uids,
                              'title':titles,
                              'tags':tags,
                              'question':bodies,
                              'date':creation_dates},
                        columns=['post_id','user_id','title','tags','question','date'])
            
        self.questions_df = self.questions_df.set_index('post_id') # index by (unique) post_id
        
        return
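A minimal sketch of the input the two Posts.xml converters above (code examples #7 and #9) expect. The fragment below is an assumed, trimmed-down StackExchange dump row; the converters read attribute names in lowercase ('posttypeid', 'owneruserid'), so the sketch accesses them the same way.
sample = """<posts>
  <row Id="1" PostTypeId="1" OwnerUserId="8" Title="Sample question?"
       Tags="&lt;python&gt;" Body="&lt;p&gt;question body&lt;/p&gt;"
       CreationDate="2012-01-01T00:00:00" />
  <row Id="2" PostTypeId="2" ParentId="1" OwnerUserId="9" Score="3"
       Body="&lt;p&gt;answer body&lt;/p&gt;" />
</posts>"""

dom = web.Element(sample)
for row in dom.by_tag('row'):
    print row.attributes['posttypeid'], row.attributes['id']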
Code example #10
def find_governor_races(html):
    dom = web.Element(html)
    links = [a.attributes.get('href', '') for a in dom.by_tag('a')]
    links = [l for l in links if is_gov_race(l)]
    #eliminate duplicates!
    links = list(set(links))
    return links
Code example #11
def get_poll(id):
    url = "http://charts.realclearpolitics.com/charts/%i.xml" % int(id)
    xml = requests.get(url).text
    dom = web.Element(xml)
    result = {}

    dates = dom.by_tag('series')[0]

    temp = []
    temp_dates = []
    for n in dates.by_tag('value'):
        temp = temp + [str(n.content)]
    for t in temp:
        split = t.split('/')
        d = datetime.date(int(split[2]), int(split[0]), int(split[1]))
        temp_dates = temp_dates + [unicode(d)]
    dates = temp_dates

    for graph in dom.by_tag('graph'):
        name = graph.attributes['title']
        poll_data = []
        for n in graph.by_tag('value'):
            if n.content:
                poll_data = poll_data + [float(n.content)]
            else:
                poll_data = poll_data + [np.nan]
        result[name] = poll_data

    frame = pd.DataFrame(result, index=dates)
    frame.index.name = 'date'
    return frame
Code example #12
def rcp_poll_data(xml):
    '''dom = web.Element(xml)
    result = {}

    dates = dom.by_tag('series')[0]
    dates = {n.attributes['xid']: str(n.content) for n in dates.by_tag('value')}
    keys = dates.keys()

    result['date'] = pd.to_datetime([dates[k] for k in keys])

    for graph in dom.by_tag('graph'):
        name = graph.attributes['title']
        data = {n.attributes['xid']: float(n.content)
                if n.content else np.nan for n in graph.by_tag('value')}
        result[name] = [data[k] for k in keys]

    result = pd.DataFrame(result)
    result = result.sort_values("date",ascending=True)
    result = result.reset_index(drop=True)
    return result
xml = get_poll_xml(1044)
print rcp_poll_data(xml)'''
    '''result = {}
    dom = web.Element(xml)
    dates = dom.by_tag("series")[0]
    dates = {n.attributes["xid"]:str(n.content) for n in dates.by_tag("value")}
    result["date"] = pd.to_datetime([dates[k] for k in dates.keys()])
    for value in dom.by_tag("graph"):
        name = value.attributes['title']
        dic = {n.attributes["xid"]:float(n.content) if n.content else np.nan for n in value.by_tag("value")}
        result[name] = [dic[k] for k in dic.keys()]
    result = pd.DataFrame(result)
    result = result.sort_values("date",ascending=True)
    result = result.reset_index(drop=True)
    return result'''
    dom = web.Element(xml)
    result = {}
    general = dom.by_tag("div#polling-data-full")[0]
    if_name = general.by_tag("th")
    if len(if_name)>6:
        dates = [date.content.split("-")[1].strip()+"/10" for date in general.by_tag("td")[15::7]]
        dates = pd.to_datetime(dates)
        name_1 = str(if_name[3].content.split("(")[0].strip())
        name_2 = str(if_name[4].content.split("(")[0].strip())
        name_3 = str(if_name[5].content.split("(")[0].strip())
        result["date"] = dates
        result[name_1] = [float(c.content) if c.content!="--" else np.nan for c in general.by_tag("td")[17::7]]
        result[name_2] = [float(c.content) if c.content!="--" else np.nan for c in general.by_tag("td")[18::7]]
        result[name_3] = [float(c.content) if c.content!="--" else np.nan for c in general.by_tag("td")[19::7]]
    elif len(if_name)<=6:
        dates = [date.content.split("-")[1].strip()+"/10" for date in general.by_tag("td")[13::6]]
        dates = pd.to_datetime(dates)
        name_1 = str(if_name[3].content.split("(")[0].strip())
        name_2 = str(if_name[4].content.split("(")[0].strip())
        result["date"] = dates
        result[name_1] = [float(c.content) for c in general.by_tag("td")[15::6]]
        result[name_2] = [float(c.content) for c in general.by_tag("td")[16::6]]
    result = pd.DataFrame(result)
    return result
Code example #13
def url_to_text(input):  #scrape url, get content of news articles
    text = ''
    dom = web.Element(input)
    para = dom.by_tag('div.column1 grid8 grid-inside')[0]
    for p in para.by_tag('p'):
        for c in p.children:
            if c.type == 'text': text += c.source
    return text
Code example #14
def get_population_html_tables(html):
    
    dom = web.Element(html)

    #print(dom.by_tag('table'))
    tbls = [t for t in dom.by_tag('table') if t.attributes['class'] == ['sortable', 'wikitable']]
    
    return tbls
Code example #15
    def __init__(self, schedule):
        self.schedule = schedule
        self.browser_schedule = webdriver.PhantomJS()
        self.browser_schedule.set_window_size(1024, 768)
        self.browser_schedule.get(self.schedule)
        self.website_schedule = self.browser_schedule.page_source
        self.browser_schedule.quit()
        self.dom_schedule = web.Element(self.website_schedule)
Code example #16
def extract_dom(sort, start, url='http://www.imdb.com/search/title'):
    params = dict(sort=sort,
                  at=0,
                  start=start,
                  title_type='feature',
                  year='1950,2014')
    r = requests.get(url, params=params)
    print bcolors.WARNING + r.url + bcolors.ENDC
    return web.Element(r.text)
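A usage sketch for paging through results with extract_dom. Both details below are assumptions of mine, not the snippet's: that 'moviemeter,asc' is an accepted value for IMDb's sort parameter, and that the old search layout returned 50 titles per page with one td.title cell per movie.
for start in range(1, 151, 50):  # three pages, assuming 50 results per page
    dom = extract_dom('moviemeter,asc', start)
    print start, len(dom.by_tag('td.title'))  # td.title is the assumed per-movie cell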
Code example #17
def get_population_html_tables(html):
    dom = web.Element(html)
    ### 0. step: look at html source!

    #### 1. step: get all tables
    #tbls = dom('table')
    #### 2. step: get all tables we care about
    tbls = dom.by_class('sortable wikitable')
    return tbls
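A usage sketch; the Wikipedia article below is only an assumed example of a page whose population tables carry the 'sortable wikitable' classes, and requests is taken to be imported as in the other snippets.
url = 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population'  # assumed page
html = requests.get(url).text
tables = get_population_html_tables(html)
print len(tables)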
Code example #18
    def __init__(self, main_website):
        self.main_website = main_website
        self.browser_main = webdriver.PhantomJS()
        self.browser_main.set_window_size(1024, 768)
        self.browser_main.get(self.main_website)
        self.website_main = self.browser_main.page_source
        self.browser_main.quit()
        self.dom = web.Element(self.website_main)
        self.links = self.dom.by_class('expanded')
        self.main_url = URL(self.main_website)
Code example #19
def get_acestream(url):
    browser = webdriver.PhantomJS()
    browser.set_window_size(1024, 768)
    browser.get(url)
    website = browser.page_source
    browser.quit()
    dom = web.Element(website)
    aces = dom.by_attr(target='_blank')
    for ace in aces:
        if ace.href[:3] == 'ace':
            return str(ace.href)
Code example #20
def PQarchive_url_list(start_date,
                       end_date,
                       page,
                       newspaper_tag='latimes',
                       query='romney OR obama',
                       debug=False):
    '''
    Scrapes the PQ archive system to get a list of all URLs.
    
    Inputs: M(M)-D(D)-YYYY of start and end date, page number (1-indexed)
    Output: URL List
    '''

    # split dates into M, D, Y
    start_date = start_date.split('-')
    end_date = end_date.split('-')

    options = {}

    # run the query
    url = 'http://pqasb.pqarchiver.com/' + newspaper_tag + '/results.html'
    options['st'] = 'advanced'
    options['sortby'] = 'CHRON'
    options['datetype'] = 6
    options['frommonth'] = start_date[0]
    options['fromday'] = start_date[1]
    options['fromyear'] = start_date[2]
    options['tomonth'] = end_date[0]
    options['today'] = end_date[1]
    options['toyear'] = end_date[2]
    options['type'] = 'current'
    options['start'] = (page - 1) * 10
    options['QryTxt'] = query

    # try to get url with specified parameters
    try:
        r = requests.get(url, params=options)
        html = r.text
        if debug: print r.url
    except:
        print 'Unable to parse URL list for ' + str(url)
        return None

    # declare dom object to begin parsing the data
    dom = web.Element(html)

    url_list = []

    # find each url
    for a in dom('.result_title a'):
        url_list.append(str(a.attrs['href']))

    return url_list
Code example #21
def get_tables(html):
    """Parse html and return html tables of wikipedia population data."""

    dom = web.Element(html)

    ### 0. step: look at html source!
    #### 1. step: get all tables
    #tbls = dom('table')
    #### 2. step: get all tables we care about
    tbls = dom.by_class('collapse1')
    #     href = tbls.by_class('thumbnail')
    return tbls
Code example #22
def get_craigslist_postings():
    postings = json.load(open('results.json', 'r'))

    # filter cross postings
    filtered = {}
    for boat_name, posts in postings.items():
        founds = []
        urls = []
        for post in posts:
            _hash = post.split('/')[-2]
            if _hash not in founds:
                founds.append(_hash)
                urls.append(post)
        filtered[boat_name] = urls


    for boat_name, posts in filtered.items():
        for post in posts:
            r = requests.get(post)
            w = web.Element(r.content)
            body = w.by_id('postingbody')
            content = body.content
            links = body.by_tag('a')
            if links:
                content = content.replace(links[0].content, '')
            c = w.by_class("print-qrcode-label")
            if c:
                content = content.replace(c[0].content, '')
            
            content = web.plaintext(content)

            formatted_attrs = {}
            attrs = w.by_class('attrgroup')[0].by_tag('span')
            for attr in attrs:
                values = web.plaintext(attr.content).split(': ')
                if len(values) == 2:
                    key = values[0].replace(' ', '_').replace('/','')
                    value = values[1]
                    formatted_attrs[key] = value
            price = web.plaintext(w.by_class('price')[0].content.replace('$',''))
            post_data = {
                'body': {
                    'source': content,
                    'link': post,
                    'attrs': formatted_attrs,
                    'boat': boat_name,
                    'price': price
                },
                "index": "listings",
                'doc_type': 'listing'
            }
            res = es.index(**post_data)
Code example #23
def get_all_year_votes(year_url):
    senate_votes = []
    house_votes = []
    vote_page = requests.get(year_url).text
    element = web.Element(vote_page)
    link_list = element.by_tag("a")
    for link in link_list[0:5]:
        label = link[0]
        if str(label)[0] == "s":
            senate_votes.append(get_vote(str(label)))
        if str(label)[0] == "h":
            house_votes.append(get_vote(str(label)))
    return (senate_votes, house_votes)
Code example #24
    def get_region(self):

        if self.data:

            dom = web.Element(self.data)
            h = HTMLParser.HTMLParser()

            locality_section = dom.by_tag('span.locality')
            if locality_section:
                region = h.unescape(locality_section[0].content.strip())
                return region
        else:
            print self.name, 'get_region Error - no page given'
Code example #25
File: poll_predict.py Project: ealehman/poll-predict
def find_governor_races(html):
    dom = web.Element(html)
    option_tags = dom.by_tag('option')

    gov_links = []
    
    #iterate through option tags
    for op in option_tags:
        value = op.attributes['value']
        # only append governor links
        if re.search("2010/governor", value):
            gov_links.append(value)
    return gov_links
Code example #26
File: hw1.py Project: ThomasSan/harvardCS109
def find_governor_races(html):
    dom = web.Element(html)
    result = []

    for ahref in dom.by_tag('a'):
        name = ahref.attributes.get('href', '')
        reg1 = 'http://www.realclearpolitics.com/epolls/????/governor/??/*-*.html'
        reg2 = '/epolls/????/governor/??/*-*.html'
        if fnmatch.fnmatch(name, reg1):
            result.append(name)
        elif fnmatch.fnmatch(name, reg2):
            result.append("http://www.realclearpolitics.com" + name)
    return result
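A chaining sketch tying find_governor_races back to race_result from code example #6. The index URL is an assumption (a 2010 governor map page on RealClearPolitics), and requests plus both helpers are taken to be in scope.
index_url = 'http://www.realclearpolitics.com/epolls/2010/governor/2010_elections_governor_map.html'  # assumed URL
page = requests.get(index_url).text
for race_url in find_governor_races(page):
    print race_url, race_result(race_url)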
Code example #27
File: bill_scraper.py Project: nietfeld/cs284r
def get_year_bills(chamber, session):
    bills = []
    url = bill_url(chamber, session)
    html = requests.get(url).text
    element = web.Element(html)
    bill_list = element.by_tag("a")

    for link in bill_list[1:]:
        number = link[0]
        bill = get_bill(url, number)
        bills.append(bill)

    return bills
Code example #28
def Globe_url_list(start_date, end_date, page):
    '''
    Scrapes the Boston Globe archives to get a list of all URLs.
    
    Inputs: M(M)-D(D)-YYYY of start and end date, page number (1-indexed)
    Output: URL List
    '''

    # split dates into M, D, Y
    start_date = start_date.split('-')
    end_date = end_date.split('-')

    options = {}

    # run the query
    url = 'http://pqasb.pqarchiver.com/boston/results.html'
    options['st'] = 'advanced'
    options['sortby'] = 'CHRON'
    options['datetype'] = 6
    options['frommonth'] = start_date[0]
    options['fromday'] = start_date[1]
    options['fromyear'] = start_date[2]
    options['tomonth'] = end_date[0]
    options['today'] = end_date[1]
    options['toyear'] = end_date[2]
    options['type'] = 'current'
    options['start'] = (page - 1) * 10
    options['QryTxt'] = 'romney OR obama'

    # try to get url with specified parameters
    try:
        html = requests.get(url, params=options).text
    except:
        print 'Unable to parse URL list for ' + str(url)
        return None

    # declare dom object to begin parsing the data
    dom = web.Element(html)

    url_list = []

    # find each url
    for a in dom('table a'):
        # check if the a tag has a title, the title matches the Preview string, and the href is not from the header faq section
        if (('title' in a.attrs)
                and (a.attrs['title'] == 'Preview&nbsp;(Abstract/Citation)')
                and (a.attrs['href'] != 'faq.html#abs')):
            # add url to url_list
            url_list += [str(a.attrs['href'])]

    return url_list
Code example #29
def get_population_html_tables(html):
    """Parse html and return html tables of wikipedia population data."""

    dom = web.Element(html)

    ### 0. step: look at html source!
    
    #### 1. step: get all tables
    tbls = dom.by_class('sortable wikitable')
    

    #### 2. step: get all tables we care about

    return tbls
Code example #30
def race_result(url):
    dom = web.Element(requests.get(url).text)

    table = dom.by_tag('div#polling-data-rcp')[0]
    result = table.by_tag('tr.final')[0]
    td = result.by_tag('td')

    results = [float(t.content) for t in td[3:-1]]
    tot = sum(results) / 100

    headers = table.by_tag('th')
    labels = [str(t.content).split('(')[0].strip() for t in headers[3:-1]]  # a list, not a set, so order stays aligned with results

    return {l: r / tot for l, r in zip(labels, results)}