def page_extract(self, html):
    # Scrape one IMDb "lister" page (parsed lxml doc) into a list of actor
    # dicts keyed by keys.* / keys_hollywood.*. Malformed cards are skipped.
    actors = []
    for div in html.cssselect(
            'div.lister-list div.lister-item.mode-detail'):
        try:
            actor = {}
            # Rank text looks like "12." — keep the number before the dot.
            actor[keys.entity_rank] = parse.csstext(
                div.cssselect('span.lister-item-index.unbold.text-primary')
                [0]).split('.')[0]
            actor[keys.entity_pic] = div.cssselect(
                'div.lister-item-image a img')[0].attrib['src']
            actor[keys.entity_name] = parse.csstext(
                div.cssselect('h3.lister-item-header a')[0]).strip()
            actor[keys.entity_profile] = fixed.clean_url(
                'http://www.imdb.com' +
                div.cssselect('div.lister-item-image a')[0].attrib['href'])
            # Role line looks like "Actor | Some Movie" — keep the role only.
            actor[keys.entity_position] = parse.csstext(
                div.cssselect('p.text-muted.text-small')[0]).split(
                    '|')[0].strip()
            # "Known for" title and its IMDb link.
            actor[keys_hollywood.noted] = parse.csstext(
                div.cssselect('p.text-muted.text-small a')[0]).strip()
            actor[keys_hollywood.noted_profile] = fixed.clean_url(
                'http://www.imdb.com' + div.cssselect(
                    'p.text-muted.text-small a')[0].attrib['href'])
            print actor
            actors.append(actor)
        except Exception as e:
            # Best-effort scrape: log and continue on a bad card.
            print 'page_extract exception:', e
    return actors
def search_ufc(self, players): print 'ufc_find:', len(players) for i, p in enumerate(players): print 'p:', i, 'of', len(players) if keys.entity_profile not in p.keys(): search_term = p[keys.entity_name] + ' site:www.ufc.com' d = self.cv.bing(search_term) google_results = yield d print 'google results:', google_results try: profile = [ result for result in google_results if result.startswith('http://www.ufc.com/fighter') ][0] profile = profile.split('?')[0] print 'profile:', profile if not fixed.clean_url(profile).endswith('/media'): p[keys.entity_profile] = fixed.clean_url( profile.replace('%20', '')).lower() d = self.cv.goto_url(p[keys.entity_profile]) d.addCallback(self.cv.to_html) d.addCallback(self.parse_fighter, p) yield d except: print 'missing on ufc.com:', p[keys.entity_name] defer.returnValue([ p for p in players if keys.entity_profile in p and 'weight_class' not in p[keys.entity_profile] ])
def pullteam(self, h, players = []): doc = html.document_fromstring(h) for tr in doc.cssselect('a[title="List of sovereign states"]'): tr = tr.getparent().getparent() while tr.getnext() is not None: tr = tr.getnext() try: country = parse.csstext(tr.cssselect('th a')[0]) if country: for td in tr.cssselect('td'): if len(td.cssselect('a')) == 2: try: player = {} player[keys.entity_team] = 'World Leaders' player[keys.entity_position] = parse.csstext(td.cssselect('a')[0]).split('\xc2')[0] player[keys.entity_name] = parse.csstext(td.cssselect('a')[1]) player[keys.entity_profile] = fixed.clean_url('http://en.wikipedia.org' + td.cssselect('a')[1].attrib['href']) player[keys.entity_country] = country players.append(player) except Exception as e: print 'world leader exception:', e except Exception as e: print 'world leader exception:', e players.append({keys.entity_twitter: 'UN', keys.entity_profile: 'team:World Leaders'}) return players
def callbackExtractGovernors(self, h):
    # Parse the Wikipedia "State governors" table in raw HTML `h` into a
    # list of governor dicts, ending with a team marker entry.
    # Returns None when the page-level parse fails (exception printed).
    try:
        governors = []
        doc = html.document_fromstring(h)
        # Walk forward from the section heading to the first table.
        h2 = doc.cssselect('h2 span[id="State_governors"]')[0].getparent()
        while h2.tag != 'table':
            h2 = h2.getnext()
        #n = doc.xpath('/html/body/div[3]/div[4]/div[4]/div/table[1]')[0]
        for tr in h2.cssselect('tr'):
            g = {}
            try:
                g[keys.entity_team] = 'Governors'
                # Cell 0 holds flag image + state link in nested divs.
                g[keys.entity_flag] = 'http:' + tr[0].xpath(
                    'div[1]/a/img')[0].attrib['src']
                g[keys.entity_state] = tr[0].xpath('div[2]/a')[0].text
                try:
                    g[keys.entity_pic] = 'http:' + tr[1].find('a').find(
                        'img').attrib['src']
                except:
                    pass  # portrait is optional
                g[keys.entity_name] = tr[2].cssselect(
                    'center span.vcard a')[0].attrib['title']
                #t = tr[2].find(".//a")
                g[keys.entity_profile] = fixed.clean_url(
                    'http://en.wikipedia.org' + tr[2].cssselect(
                        'center span.vcard a')[0].attrib['href'])
                # Fixed column offsets: 4=party, 5=prior experience,
                # 6=assumed office, 7=term expires.
                g[keys.entity_party] = tr[4].find('a').text
                g[keys.entity_prior_exp] = parse.csstext(tr[5])
                g[keys.entity_assumed_office] = parse.csstext(tr[6])
                g[keys.entity_term_expires] = parse.csstext(tr[7])
                governors.append(g)
            except:
                pass  # skip header/malformed rows
        governors.append({
            keys.entity_twitter: 'NatlGovsAssoc',
            keys.entity_profile: 'team:Governors'})
        return governors
    except Exception as e:
        print e
def get_skater(self, html, community):
    # Parse theboardr.com skater ranking rows into `community` (mutated
    # in place; no return value). The header row is skipped.
    for tr in html.cssselect('table.vitals.vitalsshrink tr')[1:]:
        skater = {}
        # Rank cell carries a 2-char suffix — drop it.
        skater[keys.entity_rank] = parse.csstext(tr[0])[:-2]
        skater[keys.entity_profile] = fixed.clean_url(
            tr[1].cssselect('a')[0].attrib['href'])
        skater[keys.entity_name] = parse.csstext(
            tr[2].cssselect('a')[0]).replace(',', '')
        skater[keys.entity_country] = parse.csstext(
            tr[2].cssselect('a')[1])
        try:
            # Two <br> lines present: first tail is age, second points.
            skater[keys.entity_age] = tr.cssselect(
                'h3 br')[0].tail.strip().split(' ')[1]
            skater[keys.entity_points] = tr.cssselect(
                'h3 br')[1].tail.strip().split(' ')[1]
        except:
            try:
                # Single line: only points are listed.
                skater[keys.entity_points] = tr.cssselect(
                    'h3 br')[0].tail.strip().split(' ')[1]
            except:
                pass
        # Headshot URL is derived from the profile's 5th path segment;
        # HEAD-check it so only pictures that actually exist are stored.
        pic_url = 'https://theboardr.blob.core.windows.net/headshots/' + skater[
            keys.entity_profile].split('/')[4] + '_900.jpg'
        check = requests.head(pic_url, headers={
            'User-Agent': 'curl/7.35.0',
            'Accept': '*/*'
        }, verify=True)
        if check.status_code == 200:
            skater[keys.entity_pic] = pic_url
        skater[keys.entity_team] = self.skating
        community.append(skater)
def get_snow(self, html, community, gender, style):
    # Parse snowboarding ranking rows into `community` (mutated in
    # place). `gender`/`style` tag each rider; rows that fail to parse
    # at any step are silently dropped.
    print 'get_snow:', self.snowboarding
    for tr in html.cssselect('tr.ranking'):
        try:
            player = {}
            player[keys.entity_gender] = gender
            player[keys.entity_style] = style
            player[keys.entity_rank] = parse.csstext(
                tr.cssselect('td span')[0]).replace('.', '')
            #player[keys.entity_rank_change] = parse.csstext(tr.cssselect('td')[2])
            #player[keys.entity_rank_change] = player[keys.entity_rank_change].replace('--','-')
            player[keys.entity_name] = parse.csstext(tr.cssselect('td')[3])
            # Profile link: strip whitespace and any query string.
            player[keys.entity_profile] = fixed.clean_url(
                self.snow_rankings + tr.cssselect('td')[3].cssselect(
                    'a')[0].attrib['href'].strip().split('?')[0])
            # Fixed column offsets: 4=origin, 5=age, 6=sponsors, 8=points.
            player[keys.entity_origin] = parse.csstext(
                tr.cssselect('td')[4])
            player[keys.entity_age] = parse.csstext(tr.cssselect('td')[5])
            player[keys.entity_sponsors] = parse.csstext(
                tr.cssselect('td')[6])
            player[keys.entity_points] = parse.csstext(
                tr.cssselect('td')[8])
            if keys.entity_profile in player.keys():
                player[keys.entity_team] = self.snowboarding
                community.append(player)
        except:
            pass  # best-effort: skip malformed rows
def get_community(self, html, community, gender='Male'):
    # Parse the World Surf League athlete ranking table into `community`
    # (mutated in place). NOTE(review): `gender` is accepted but never
    # read in this body — confirm whether callers rely on it elsewhere.
    trs = html.cssselect(
        'table[class="tableType-athlete hasGroups"]')[0].cssselect('tr')
    print 'community length:', len(trs)
    for tr in trs:
        player = {}
        try:
            player[keys.entity_rank] = parse.csstext(
                tr.cssselect('td[class~="athlete-tour-rank"]')[0])
            #player[keys.entity_rank_change] = parse.csstext(tr.cssselect('td[class="athlete-tour-rank-change"]')[0])
            # Title-case the name and strip injured/recovering markers.
            name_element = parse.csstext(
                tr.cssselect('a[class="athlete-name"]')[0]).title()
            player[keys.entity_name] = name_element.replace(
                'INJU', '').replace('RECO', '').strip()
            player[keys.entity_profile] = fixed.clean_url(
                'http://www.worldsurfleague.com' +
                tr.cssselect('a[class="athlete-name"]')[0].attrib['href'])
            player[keys.entity_origin] = tr.cssselect(
                'span.athlete-country-flag')[0].attrib['title']
            player[keys.entity_points] = parse.csstext(
                tr.cssselect('span[class="tour-points"]')[0])
            player[keys.entity_prizemoney] = parse.csstext(
                tr.cssselect('td[class~="athlete-tour-prize-money"]')[0])
            if player[keys.entity_name]:
                player[keys.entity_team] = self.surfing
                community.append(player)
        except Exception as e:
            # Partial rows are still kept if at least the name parsed.
            if keys.entity_name in player:
                player[keys.entity_team] = self.surfing
                community.append(player)
def entrepreneurVC100(self, html):
    # Parse the Entrepreneur "VC 100" slideshow into component dicts.
    # Each h2.slides heading is followed by sibling elements holding the
    # picture, investment total and deal count, in one of two layouts.
    components = []
    for h2s in html.cssselect('h2[class="slides"]'):
        player = {keys.entity_team: 'Entrepreneur VC 100'}
        # Heading starts like "#12 Name, Location" — rank follows '#'.
        player[keys.entity_rank] = parse.csstext(h2s).split(' ')[0][1:]
        player[keys.entity_name] = h2s.find('./a').text
        player[keys.entity_location] = h2s.find('./a').tail[1:]
        # Profile is just the scheme+host of the heading's link.
        player[keys.entity_profile] = fixed.clean_url(
            'http://' + urlparse(h2s.cssselect('a')[0].attrib['href']).netloc)
        if len(player[keys.entity_profile]) > 7:
            # Non-trivial profile URL: re-derive location from the heading.
            player[keys.entity_location] = parse.csstext(h2s).split(
                ',', 1)[1].strip()
        if h2s.getnext()[0].tag.lower() == 'img':
            # Layout A: image block, then investments, then deals.
            player[keys.entity_pic] = h2s.getnext().cssselect(
                'img')[0].attrib['src']
            player[keys.entity_investments] = parse.csstext(
                h2s.getnext().getnext()).split(' ')[-1] + 'M'
            try:
                player[keys.entity_deals] = parse.csstext(
                    h2s.getnext().getnext().getnext())
            except:
                pass  # deals block is optional
        else:
            # Layout B: no image; investments then deals follow directly.
            player[keys.entity_investments] = parse.csstext(
                h2s.getnext()).split(' ')[-1] + 'M'
            player[keys.entity_deals] = parse.csstext(
                h2s.getnext().getnext()).split(' ')[-1]
        components.append(player)
    return components
def getplayers(self, html, team):
    # Parse IPL squad cards into team['players'] and return `team`.
    team['players'] = []
    for a in html.cssselect('a.squadPlayerCard'):
        print a
        #print 'hey:', player_span, player_span.cssselect('div.playerPhoto img')[0].attrib
        player = {}
        player[keys.entity_profile] = fixed.clean_url(
            ipl_base + a.attrib['href'])
        #print 'player 1:', player
        # Headshot is served by player id at a fixed S3 path.
        player[
            keys.
            entity_pic] = 'http://iplstatic.s3.amazonaws.com/players/210/' + a.cssselect(
                'div.playerPhoto')[0].cssselect('img[data-player-id]')[
                    0].attrib['data-player-id'] + '.png'
        #print 'player 2:', player
        player[keys.entity_name] = parse.csstext(
            a.cssselect('p.player-name')[0])
        #print 'player 3:', player
        # Badge spans mark captain / overseas / wicket-keeper.
        if len(a.cssselect('span.captain')) > 0:
            player[keys.entity_captain] = True
        if len(a.cssselect('span.overseas-player')) > 0:
            player[keys.entity_foreign] = True
        if len(a.cssselect('span.wicket-keeper')) > 0:
            player[keys.entity_position] = "Wicket Keeper"
        # Stat list items become lower-cased label -> value entries.
        for li in a.cssselect('ul.stats li'):
            label = parse.csstext(li.cssselect('span.label')[0])
            value = parse.csstext(li.cssselect('span.value')[0])
            player[label.lower()] = value
        print player
        team['players'].append(player)
    print 'length of team:', len(team['players'])
    return team
def createCeoTeam(self, components):
    # Twisted inlineCallbacks-style generator: for every company
    # component that lists CEOs, look each CEO up on Wikipedia via
    # Google, keep those whose page shows a birth date (self.is_born),
    # and return the CEO dicts via defer.returnValue.
    ceos = []
    for player in components:
        if keys.entity_ceo in player:
            for ceo in player[keys.entity_ceo]:
                ceo_player = {keys.entity_team: 'Billion Dollar CEO'}
                ceo_player[keys.entity_company] = player[keys.entity_name]
                ceo_player.update({keys.entity_name: ceo})
                print 'lookup:', ceo
                d = self.cv.google(ceo, domain='en.wikipedia.org')
                d.addErrback(self.error_league)
                res = yield d
                if res and res[0]:
                    print 'wikipedia to profile:', res[0]
                    ceo_player[keys.entity_profile] = fixed.clean_url(
                        res[0])
                    # NOTE(review): the lambda discards the fetched page
                    # and calls to_html() with no argument — presumably
                    # self.cv caches the last response internally; confirm.
                    isb = yield self.cv.goto_url(
                        ceo_player[keys.entity_profile]).addCallback(
                            lambda ign: self.cv.to_html()).addCallback(
                                self.is_born)
                    if isb:
                        ceos.append(ceo_player)
        else:
            print 'NO ceo!', player[keys.entity_name]
    defer.returnValue(ceos)
def getplayers(self, html, team):
    # Parse a DataTables squad-statistics page into team['players'].
    # NOTE(review): this body appends to team['players'] without
    # initializing it — assumes the caller created the list; confirm.
    print 'getplayers'
    team['team'] = parse.csstext(
        html.cssselect(
            'div[class="cabecera-seccion"] span[class="titulo"]')[0])
    print team['team']
    for tr in html.cssselect(
            'div[class="rotar-tabla margen"] div[id="DataTables_Table_0_wrapper"] table[id="DataTables_Table_0"] tr'
    )[1:]:
        #if parse.csstext(positions) != 'Coach':
        player = {
            keys.entity_position: parse.csstext(tr.cssselect('td')[0])
        }
        a = tr.cssselect('td')[1].cssselect('a')[0]
        player[keys.entity_profile] = fixed.clean_url(a.attrib['href'])
        player[keys.entity_pic] = a.cssselect('img')[0].attrib['src']
        try:
            jersey = parse.csstext(tr.cssselect('td')[2])
            if jersey:
                player[keys.entity_jersey] = jersey
        except:
            print 'no jersey'
        # Fixed column offsets: 15=yellow cards, 16=red cards, 18=goals.
        player[keys.entity_yellows] = parse.csstext(tr.cssselect('td')[15])
        player[keys.entity_reds] = parse.csstext(tr.cssselect('td')[16])
        player[keys.entity_goals] = parse.csstext(tr.cssselect('td')[18])
        team['players'].append(player)
    print[p[keys.entity_profile] for p in team['players']]
def pullteams(self, h):
    # Parse a Wikipedia PAC list page: every <li> in lists adjacent to
    # h2/h3 headings becomes a player dict tagged with its section topic.
    # Ends with a team marker entry.
    players = []
    doc = html.document_fromstring(h)
    h3 = doc.cssselect('h3 ~ ul li')
    print 'h3 length:', len(h3)
    h2 = doc.cssselect('h2 ~ ul li')
    print 'h2 length:', len(h2)
    h3.extend(h2)
    for li in h3:
        player = {}
        try:
            player[keys.entity_team] = 'PAC'
            # Section heading text: the element preceding the parent <ul>.
            player[keys.entity_topic] = parse.csstext(
                li.getparent().getprevious().getchildren()[0])
            if player[keys.entity_topic] != 'External links':
                try:
                    href = li.cssselect('a')[0].attrib['href']
                    # Relative links are Wikipedia-internal.
                    if not urlparse(href).scheme and href:
                        href = 'http://en.wikipedia.org' + href
                    player[keys.entity_profile] = fixed.clean_url(href)
                    player[keys.entity_name] = parse.csstext(
                        li.cssselect('a')[0])
                    if player[keys.entity_name] and player[keys.entity_profile]:
                        # "Name - Location" entries split at the last ' - '.
                        if player[keys.entity_name].rfind(' - ') > 0:
                            player[keys.entity_location] = player[
                                keys.entity_name][
                                    player[keys.entity_name].rfind(' - ') + 3:]
                            player[keys.entity_name] = player[
                                keys.entity_name][
                                    :player[keys.entity_name].rfind(' - ')]
                        #print player
                        players.append(player)
                except Exception as e2:
                    print 'exception inner:', e2
        except:
            pass  # list items without a usable heading/anchor are skipped
    players.append({
        keys.entity_profile: 'team:PAC',
        keys.entity_twitter: 'FEC'})
    return players
def sprintcup_drivers(self, html): drivers = [] for tr in html.cssselect('table.driver-list-table tr')[1:]: ''' <tr> <td class="driver-name-td"><a href="/drivers/driversaj-allmendinger/">Allmendinger, AJ</a></td> <td class="driver-number-td">47</td> <td class="driver-make-td"><img src="/wp-content/uploads/sites/7/2017/01/Chevy-Driver-Page-New-2-160x811-265x180.png"></td> <td class="driver-team-td">JTG Daugherty Racing</td> </tr> ''' driver = {} driver[keys.entity_profile] = fixed.clean_url( NASCAR.nascar_url + tr.cssselect('td.driver-name-td a')[0].attrib['href']) driver[keys.entity_name] = parse.csstext( tr.cssselect('td.driver-name-td a')[0]).strip() driver[keys. entity_name] = driver[keys.entity_name].split(',')[1].strip( ) + ' ' + driver[keys.entity_name].split(',')[0].strip() driver[keys.entity_carnumber] = parse.csstext( tr.cssselect('td.driver-number-td')[0]) team = parse.csstext(tr.cssselect('td.driver-team-td')[0]).strip() if not team: team = NASCAR.unaffiliated driver[keys.entity_team] = team driver[keys.entity_carnumber] = parse.csstext( tr.cssselect('td.driver-number-td')[0]) driver[keys.entity_circuit] = self.get_common_name() print driver drivers.append(driver) #driver[keys.entity_rank] = parse.csstext(div.cssselect('div.position')[0]).strip() #driver[keys.entity_name] = parse.csstext(div.cssselect('div.driver div.driver-first')[0]).split() + ' ' + parse.csstext(div.cssselect('div.driver div.driver-last')[0]).split() #<div class="driver"><div class="driver-first"> Martin</div><div class="driver-last">Truex Jr.</div><div class="legend-symbols"></div></div> #tr = driver_art.getparent().getparent().getparent().getparent().getparent() #driver[keys.entity_points] = parse.csstext(tr.cssselect('td')[3]) #driver[keys.entity_points_behind] = parse.csstext(tr.cssselect('td')[4]).replace('--','') #driver[keys.entity_starts] = parse.csstext(tr.cssselect('td')[5]) #driver[keys.entity_wins] = parse.csstext(tr.cssselect('td')[6]) #driver[keys.entity_top5] = 
parse.csstext(tr.cssselect('td')[7]) #driver[keys.entity_top10] = parse.csstext(tr.cssselect('td')[8]) #driver[keys.entity_dnf] = parse.csstext(tr.cssselect('td')[9]) #if not team: # team = NASCAR.unaffiliated #elif 'team:' + team not in [t[keys.entity_profile] for t in drivers]: # drivers.append({ keys.entity_profile: 'team:' + team }) driver[keys.entity_circuit] = self.get_common_name() drivers.append(driver) return drivers
def entities(self):
    """Attach a web profile to each firm via a Bing lookup.

    Twisted inlineCallbacks-style generator: yields one Bing deferred
    per firm, stores the cleaned first hit when there is one, and
    finally returns (via returnValue) only the firms that gained a
    keys.entity_profile entry.
    """
    firms = self.firms()
    for firm in firms:
        lookup = self.cv.bing(firm[keys.entity_name])
        lookup.addErrback(self.error_league)
        results = yield lookup
        if results:
            firm[keys.entity_profile] = fixed.clean_url(results[0])
    resolved = [f for f in firms if keys.entity_profile in f]
    defer.returnValue(resolved)
def studio_detail(self, html, studio): try: info = html.cssselect('table.infobox')[0] try: studio[keys.entity_name] = parse.csstext( info.cssselect('caption')[0]) except: studio[keys.entity_name] = studio[keys.entity_company] try: studio[keys.entity_pic] = fixed.clean_url( 'http:' + info.cssselect('.logo a img')[0].attrib['src']) except: pass for th in info.cssselect('tr th'): if parse.csstext(th) == 'Website': studio[keys.entity_website] = fixed.clean_url( th.getnext().cssselect('a')[0].attrib['href']) except: pass print 'studio:', studio return True
def gather_active_roster(self, h, team): doc = html.document_fromstring(h) #/html/body/div[1]/div[3]/div[1]/section/div/section[1]/table team[keys.entity_team] = doc.cssselect( 'meta[property="og:site_name"]')[0].attrib['content'] for t in doc.xpath('//table[@class="data roster_table"][@summary]'): for pt in t.xpath('preceding-sibling::h4'): position = pt.text if pt.text[-1] == 's': position = pt.text[:-1] for player in t.xpath('tbody/tr[position() > 0]'): #print etree.tostring(player) try: player_dict = {} player_dict[keys.entity_position] = position player_dict[keys.entity_profile] = fixed.clean_url( 'http://m.mlb.com' + player[2].xpath('a/@href')[0]) if player[0].text: player_dict[keys.entity_jersey] = player[0].text if player_dict[keys.entity_jersey] == '42': try: e = Entity().get_item( league='mlb', profile=player_dict[ keys.entity_profile]) player_dict[keys.entity_jersey] = e[ keys.entity_jersey] except: pass player_dict[keys.entity_name] = player[2].xpath( 'a[starts-with(@href, "/player/")]')[0].text try: player_dict[keys.entity_status] = etree.tostring( player[2], method="text").strip().split('<br>')[1] print 'has status:', player_dict[ keys.entity_status] except: pass player_dict[keys.entity_height] = player[4].text player_dict[keys.entity_weight] = player[5].text player_dict[keys.entity_born] = player[6].text bt = player[3].text player_dict['bats'] = bt.split("/")[0] player_dict['throws'] = bt.split("/")[1] #print player_dict team['players'].append(player_dict) except Exception as e: print 'player exception:', e print 'team:', team['team'], 'players length:', len(team['players']) return team
def get_company_details(self, company, doc):
    # Fill `company` (mutated in place, no return) from a company
    # overview page: closed flag, then HQ / description / founders /
    # sector / website / social handles from the Overview definition list.
    for dd in doc.cssselect('dd'):
        if 'has been closed' in parse.csstext(dd):
            company[keys.entity_closed] = True
    for h2 in doc.cssselect('h2'):
        if parse.csstext(h2) == 'Overview':
            for dt in h2.getparent().getnext().cssselect(
                    'div.definition-list.container dt'):
                dd = dt.getnext()
                dt_text = parse.csstext(dt)[:-1]  # drop trailing ':'
                if dt_text.lower() == 'headquarters':
                    company[keys.entity_headquarters] = parse.csstext(dd)
                elif dt_text.lower() == 'description':
                    company[keys.entity_description] = parse.csstext(dd)
                elif dt_text.lower() == 'founders':
                    company[keys.entity_founders] = parse.csstext(dd)
                elif dt_text.lower() == 'categories':
                    company[keys.entity_sector] = parse.csstext(dd)
                elif dt_text.lower() == 'website ':
                    # NOTE(review): compared against 'website ' with a
                    # trailing space — looks deliberate for this markup;
                    # confirm against the source page.
                    company[keys.entity_profile] = fixed.clean_url(
                        parse.csstext(dd))
                elif dt_text.lower() == 'social':
                    # Social anchors carry the network name in data-icons;
                    # the handle is the last URL path segment.
                    for a in dd.cssselect('a[data-icons]'):
                        if a.attrib['data-icons'] == keys.entity_facebook:
                            company[
                                keys.entity_facebook] = fixed.clean_url(
                                    a.attrib['href']).rsplit('/', 1)[1]
                        if a.attrib['data-icons'] == keys.entity_twitter:
                            company[keys.entity_twitter] = fixed.clean_url(
                                a.attrib['href']).rsplit('/', 1)[1]
                            # Normalize "@handle" to "handle".
                            if company[keys.entity_twitter].startswith(
                                    '@'):
                                company[keys.entity_twitter] = company[
                                    keys.entity_twitter][1:]
                        if a.attrib['data-icons'] == keys.entity_linkedin:
                            company[
                                keys.entity_linkedin] = fixed.clean_url(
                                    a.attrib['href']).replace(
                                        'http://www.linkedin.com/', '')
def table(self):
    """Return the hard-coded poker community seed accounts.

    Same six entries as always, built from a compact
    (name, twitter handle, url) tuple list instead of six repeated
    dict literals.
    """
    seeds = [
        ('Andrew Feldman', 'AFeldmanESPN',
         'http://en.wikipedia.org/wiki/andrew_feldman_(poker_player)'),
        ('The Hendon Mob', 'thehendonmob', 'http://www.thehendonmob.com'),
        ('World Poker Tour', 'WPT', 'http://www.worldpokertour.com'),
        ('Rio Las Vegas', 'RioVegas',
         'http://en.wikipedia.org/wiki/rio_all_suite_hotel_and_casino'),
        ('Party Poker', 'partypoker', 'http://www.partypoker.com/'),
        ('European Poker Tour', 'PokerStarsEPT',
         'http://www.europeanpokertour.com'),
    ]
    return [{
        keys.entity_name: name,
        'twitter': handle,
        keys.entity_profile: fixed.clean_url(url),
    } for name, handle, url in seeds]
def getICOs(self):
    # Twisted inlineCallbacks-style generator: scrape coinmarketcap's
    # all-coins table (header skipped, capped at 1200 rows) into dicts
    # with name, rank, symbol, profile URL and — when parseable —
    # market cap and circulating supply formatted via numTwitter.
    html = yield self.cv.goto_url(
        'https://coinmarketcap.com/all/views/all/').addCallback(
            self.cv.to_html)
    trs = html.cssselect(
        'div.table-responsive.compact-name-column div.dataTables_wrapper.no-footer table tr'
    )
    icos = []
    for tr in trs[1:][:1200]:
        name = parse.csstext(tr.cssselect('a.currency-name-container')[0])
        rank = parse.csstext(tr[0])
        symbol = parse.csstext(tr.cssselect('td.col-symbol')[0])
        try:
            href = tr.cssselect('span.currency-symbol a')[0].attrib['href']
            profile = fixed.clean_url('http://coinmarketcap.com' + href)
            print 'rank:', rank, 'name:', name, 'sybol:', symbol
            ico = {
                keys.entity_name: name,
                keys.entity_profile: profile,
                keys_market.symbol: symbol,
                keys.entity_rank: rank
            }
            try:
                # "$1,234,567" -> 1234567 -> human-formatted string.
                market_cap = twitter_keys.numTwitter(
                    int(
                        parse.csstext(
                            tr.cssselect('td.no-wrap.market-cap.text-right'
                                         )[0]).replace('$', '').replace(
                                             ',', '').strip()))
                ico[keys.entity_market_cap] = market_cap
            except:
                pass  # cap cell may be "?" or missing
            try:
                # Supply cell may carry a '*' marker — strip it too.
                supply = twitter_keys.numTwitter(
                    int(
                        parse.csstext(
                            tr.cssselect(
                                'td.no-wrap.text-right.circulating-supply')
                            [0]).replace('*', '').replace(',', '').strip()))
                ico[keys.entity_circulating_supply] = supply
            except:
                pass
            icos.append(ico)
        except:
            pass  # rows without a profile link are dropped
    defer.returnValue(icos)
def get_2010s(self, html):
    """Collect the performers listed under the '2010s' section heading.

    Walks from the section anchor up to its heading and over to the
    following element, then returns one dict per list entry with the
    Wikipedia profile URL and the link title as the name.
    """
    section_anchor = html.cssselect('h2 span[id="2010s"]')[0]
    listing = section_anchor.getparent().getnext()
    performers = []
    for item in listing.cssselect('ul li'):
        link = item.cssselect('a')[0]
        performers.append({
            keys.entity_profile: fixed.clean_url(
                'http://en.wikipedia.org' + link.attrib['href']),
            keys.entity_name: link.attrib['title'],
        })
    return performers
def callbackExtractSenate(self, h):
    # Parse the Wikipedia senators table (raw HTML `h`) into dicts.
    # Rows come in pairs per state: even rows (senior senator) include
    # the state cell, shifting the other cells by `offset`; odd rows
    # (junior senator) omit it and reuse the previous state/party.
    # Ends with a team marker entry.
    senators = []
    doc = html.document_fromstring(h)
    try:
        # Walk forward from the "Senators" heading to the first table.
        trs = doc.cssselect('h2 span[id="Senators"]')[0].getparent()
        while trs.tag != 'table':
            trs = trs.getnext()
        trs = trs.cssselect('tr')
        state = None
        party = None
        for i, tr in enumerate(trs[1:]):
            if i % 2 == 0:
                # Senior-senator row: carries the state cell.
                offset = 1
                if len(tr.cssselect('td')) == 9:
                    party = self.get_party(tr)
                if len(tr.cssselect('td')) == 8:
                    offset = 0
                state = tr[offset][0].text
            else:
                # Junior-senator row: offsets shift back by one.
                if len(tr.cssselect('td')) == 8:
                    party = self.get_party(tr)
                offset = 0
                if len(tr.cssselect('td')) == 7:
                    offset = -1
            senator = {}
            senator[keys.entity_team] = 'US Senate'
            senator[keys.entity_state] = state
            senator[keys.entity_pic] = fixed.clean_url(
                'http:' + tr[1 + offset].cssselect('img')[0].attrib['src'])
            senator[keys.entity_name] = tr[2 + offset].cssselect(
                'span.vcard a')[0].attrib['title']
            senator[keys.entity_profile] = fixed.clean_url(
                'http://en.wikipedia.org' +
                tr[2 + offset].cssselect('span.fn a')[0].attrib['href'])
            senator[keys.entity_party] = party
            senator[keys.entity_born] = parse.csstext(
                tr[3 + offset].cssselect('span.bday')[0])
            # Term cell ends with the expiry year.
            senator[keys.entity_term_expires] = parse.csstext(
                tr[7 + offset]).split(' ')[-1]
            senators.append(senator)
    except Exception as e:
        print 'senate exception:', e
    senators.append({
        keys.entity_twitter: 'SenateHistory',
        keys.entity_profile: 'team:US Senate'
    })
    return senators
def thefundedTopRatedVCs(self, html): components = [] for rank in html.cssselect('div[id="post"]')[0].cssselect( 'p[class="larger red"]'): player = {keys.entity_team: 'TheFunded Top Partners'} player[keys.entity_rank] = parse.csstext(rank)[:-1] player[keys.entity_name] = parse.csstext( rank.getnext().cssselect('a')[0]) player[keys.entity_profile] = fixed.clean_url( 'http://www.thefunded.com' + rank.getnext().cssselect('a')[0].attrib['href']) player[keys.entity_firm] = parse.csstext( rank.getparent().cssselect('a[class="fund"]')[0]) print 'player:', player components.append(player) return components
def get_community(self, html, community): subteams = {} for cycling_team in html.cssselect('.team_box')[0].cssselect('ul li'): jersey_pic = cycling_team.cssselect('a img')[0].attrib['src'] thref = 'http://www.cyclingnews.com' + cycling_team.cssselect( 'a')[0].attrib['href'] print 'team url:', thref subteam = {} subteam[keys.entity_jersey_pic] = jersey_pic subteams[thref] = subteam for k, st in subteams.iteritems(): d = self.cv.goto_url(k) d.addCallback(self.cv.to_html) d.addErrback(self.error_league) subhtml = yield d team_name = parse.csstext( subhtml.cssselect('div[class="team-name"]')[0]) st[keys.entity_profile] = 'team:' + team_name print 'cycle team:', st for rider in subhtml.cssselect('div.riders div.rider'): player = {} player[keys.entity_team] = team_name player[keys.entity_name] = parse.csstext( rider.cssselect('a')[0]) player[keys.entity_profile] = fixed.clean_url( 'http://www.cyclingnews.com' + rider.cssselect('a')[0].attrib['href']) #print 'found one!:', player community.append(player) for p in community: d = self.cv.goto_url(p[keys.entity_profile] + "/") d.addCallback(self.cv.to_html) d.addErrback(self.error_league) riderhtml = yield d try: rider = riderhtml.cssselect('rider-info-boxout')[0] p[keys.entity_pic] = rider.cssselect( 'img.rider-image')[0].attrib['src'] p[keys.entity_dob] = parse.csstext( rider.cssselect('span[itemprop="birthDate')[0]) p[keys.entity_nationality] = parse.csstext( rider.cssselect('span[itemprop="nationality')[0]) except: pass community.extend(subteams.values()) defer.returnValue(community)
def adjustments(self, components): print 'adjustments components len:', len(components) for c in components: cites = yield self.cv.bing(c[keys.entity_name]) if cites[0]: from urlparse import urlparse c[keys.entity_profile] = fixed.clean_url( 'http://' + urlparse(fixed.simpleurl(cites[0])).netloc) print c[keys.entity_name], 'bing profile:', c[ keys.entity_profile] for key in wbd: if key[0] == c[keys.entity_name] and c[ keys.entity_profile] != key[1]: c[keys.entity_profile] = key[1] print ' ', c[keys.entity_profile] print 'profile:', c[keys.entity_profile], c[keys.entity_name] defer.returnValue([c for c in components if keys.entity_profile in c])
def process_team(self, doc, team): print 'process team' team['players'] = [] for section in doc.cssselect('section.row.nba-player-index__row'): for p in section.cssselect('section.nba-player-index__trending-item'): player = {} player[keys.entity_jersey] = parse.csstext(p.cssselect('span.nba-player-trending-item__number')[0]) anchor = p.cssselect('a')[0] player[keys.entity_name] = anchor.attrib['title'] player[keys.entity_profile] = fixed.clean_url(NBA.nba_url + anchor.attrib['href']) player[keys.entity_pic] = 'http:' + anchor.cssselect('div.nba-player-index__image div.nba-player-index__headshot_wrapper img')[0].attrib['data-src'] player[keys.entity_position] = parse.csstext(p.cssselect('div.nba-player-index__details span')[0]) player[keys.entity_height] = parse.csstext(p.cssselect('div.nba-player-index__details strong')[0]).split(' ')[0] + '\' ' + parse.csstext(p.cssselect('div.nba-player-index__details strong')[1]).split(' ')[0] + '\"' player[keys.entity_weight] = parse.csstext(p.cssselect('div.nba-player-index__details strong')[2]) team['players'].append(player) print 'team:', team['team'], 'players length:', len(team['players']) return team
def extract_players(self, doc, team):
    # Parse nfl.com search-result rows into team['players'] (mutated in
    # place). Row cells: [0]=position, [1]=jersey, [2]=name link,
    # [3]=status. Per-player and per-team failures are logged, not raised.
    try:
        n = doc.xpath('//div[@id="searchResultsLargeTable"]//tbody[1]')[0]
        for a in n:
            player_data = {}
            player_data[keys.entity_name] = a[2][0].text
            player_data[keys.entity_profile] = fixed.clean_url(
                'http://www.nfl.com' + a[2][0].attrib['href'])
            player_data[keys.entity_position] = a[0].text
            player_data[keys.entity_status] = a[3].text
            try:
                # Jersey cell may be empty or missing.
                if a[1].text:
                    player_data[keys.entity_jersey] = a[1].text
            except Exception as e:
                print 'player exception:', e, team['team']
            team['players'].append(player_data)
    except Exception as e2:
        print 'team exception:', e2, team['team']
def scrape_page(self, html, team):
    # Twisted inlineCallbacks-style generator: parse a celebrity ranking
    # page, skip twitter handles already owned by a non-celebrity league,
    # resolve each name to a Wikipedia page via Google, and keep only
    # people whose page shows a birth date (self.is_born). Appends the
    # resulting partial dicts to `team`.
    for li in html.cssselect('li[data-pos]'):
        ranking = li.attrib['data-pos']
        # Handle is the profile href minus the leading '/'.
        celebrity_handle = li.cssselect(
            'div[class="clr"] div[class="name-bio"] a[class="uname"]'
        )[0].attrib['href'][1:]
        name = parse.csstext(
            li.cssselect(
                'div[class="clr"] div[class="name-bio"] a[class="name"] span'
            )[0])
        partial = {keys.entity_rank: ranking, keys.entity_name: name}
        # Don't claim a handle already assigned to another league.
        existing_league = None
        for count_e in Entity().query_2(index=Entity.index_twitter_league,
                                        twitter__eq=celebrity_handle):
            if count_e[keys.entity_league] != 'celebrity':
                existing_league = count_e[keys.entity_league]
        if not existing_league:
            partial[keys.entity_twitter] = celebrity_handle
        else:
            print 'already in league:', existing_league
        cite = yield self.cv.google(name + ' wikipedia',
                                    results=1,
                                    domain='en.wikipedia.org')
        try:
            clean_cite = self.check_cite(fixed.clean_url(cite[0]))
            print name, clean_cite, ranking
            html = yield getPage(str(clean_cite)).addCallback(etree.HTML)
            # is_born mutates `partial` (adds profile and maybe dob).
            self.is_born(html, partial, clean_cite)
            if keys.entity_profile in partial.keys():
                dob = ''
                try:
                    dob = partial[keys.entity_dob]
                except:
                    pass
                print '{:5s}'.format(
                    partial[keys.entity_rank]), '{:40s}'.format(
                        partial[keys.entity_name]), '{:20s}'.format(
                            celebrity_handle), dob
                team.append(partial)
            else:
                print '    not born:', ranking, '{:40s}'.format(
                    'https://twitter.com/' + celebrity_handle), name
        except:
            print 'cite exception:', ranking, celebrity_handle, name
def callbackDepartments(self, h):
    # Parse a department index page (raw HTML `h`) into department
    # dicts: link text trimmed of parenthesized suffixes and of known
    # leading stopword prefixes (self.sw); URLs in self.skip dropped.
    # Ends with a team marker entry.
    departments = []
    doc = html.document_fromstring(h)
    for a in doc.cssselect('li a, h3 a'):
        department_name = parse.csstext(a).split('(')[0]
        for k in self.sw:
            if department_name.startswith(k):
                # Slice off the matched prefix and tidy the remainder.
                department_name = department_name[len(k):]
                department_name = department_name.strip()
        # Final strip covers names where no prefix matched.
        department_name = department_name.strip()
        department_url = fixed.clean_url(a.attrib['href'])
        if department_url not in self.skip:
            department = {
                keys.entity_name: department_name,
                keys.entity_profile: department_url}
            department[keys.entity_team] = 'Departments'
            departments.append(department)
    departments.append({
        keys.entity_profile: 'team:Departments',
        keys.entity_twitter: 'USGAO',
        keys.entity_name: 'Oversight Committee'})
    return departments
def callbackExtractHouse(self, h):
    # Parse the Wikipedia House members table (raw HTML `h`) into dicts;
    # malformed rows are skipped. Ends with a team marker entry.
    representatives = []
    doc = html.document_fromstring(h)
    table = doc.cssselect(
        'h2 span[id="Voting_members_by_state"]')[0].getparent().getnext()
    trs = table.cssselect('tr')
    for tr in trs[1:]:
        try:
            congress = {}
            congress[keys.entity_team] = 'House of Representatives'
            # District link text ends with the district number — drop the
            # last word, then clean "at-large" leftovers from the state.
            s = parse.csstext(tr[0].cssselect("a")[0]).split(" ")[:-1]
            try:
                s.remove(' at')
                s.remove('At')
            except:
                pass
            congress[keys.entity_state] = ' '.join(s)
            if congress[keys.entity_state].endswith(' at'):
                congress[keys.entity_state] = congress[keys.entity_state][:-3]
            try:
                congress[keys.entity_pic] = 'http:' + tr[1].cssselect(
                    "a img")[0].attrib['src']
            except:
                pass  # portrait is optional
            congress[keys.entity_name] = tr[1].cssselect(
                'span.vcard a')[0].text
            congress[keys.entity_profile] = fixed.clean_url(
                'http://en.wikipedia.org' +
                tr[1].cssselect('span.vcard a')[0].attrib['href'])
            # 9-cell rows carry the party; 7-cell rows (rowspan carry-over)
            # inherit it from the previous representative.
            if len(tr) == 9:
                congress[keys.entity_party] = parse.csstext(tr[3])
            elif len(tr) == 7:
                congress[keys.entity_party] = representatives[-1][
                    keys.entity_party]
            # Trailing cells are addressed from the row end so both row
            # widths line up: -5=prior exp, -4=college, -3=assumed office,
            # -1=born.
            congress[keys.entity_prior_exp] = parse.csstext(tr[-5])
            congress[keys.entity_college] = parse.csstext(tr[-4])
            try:
                ao = tr[-3].text
                if '*' in ao:
                    ao = ao.replace('*', '')
                congress[keys.entity_assumed_office] = ao.strip()
            except:
                pass
            congress[keys.entity_born] = parse.csstext(tr[-1]).strip()
            representatives.append(congress)
        except:
            pass  # skip rows that don't match either layout
    representatives.append({
        keys.entity_twitter: 'USHouseHistory',
        keys.entity_profile: 'team:House of Representatives'
    })
    return representatives
def is_born(self, html, maybeperson, url):
    """Populate `maybeperson` with name, birth date and profile from a
    Wikipedia page (mutates the dict in place; no return value).

    The name comes from the biography infobox when present, otherwise
    from the page heading. The DOB is taken from any 'Born' /
    'Date of birth' header row carrying a span.bday; the profile is the
    cleaned `url` and is always set.
    """
    try:
        infobox_title = html.cssselect(
            'table[class="infobox biography vcard"] tr th span')[0]
        maybeperson[keys.entity_name] = parse.csstext(infobox_title)
    except:
        heading = html.cssselect(
            'h1[id="firstHeading"][class="firstHeading"]')[0]
        maybeperson[keys.entity_name] = parse.csstext(heading)
    for header_cell in html.cssselect('th'):
        label = parse.csstext(header_cell).lower()
        if label in ['born', 'date of birth']:
            try:
                bday = header_cell.getparent().cssselect(
                    'span[class="bday"]')[0]
                maybeperson[keys.entity_dob] = parse.csstext(
                    bday).replace(')', '')
            except:
                pass
    maybeperson[keys.entity_profile] = fixed.clean_url(url)