def fbs_standings(): teams = [t for t in Entity().query_2(index=Entity.index_site_profile, site__eq='d1tweets.com', profile__beginswith='team:')] standings_html = yield cv.goto_url('http://www.espn.com/college-football/standings').addCallback(cv.to_html) for td in standings_html.cssselect('tr.standings-row td.team'): record = parse.csstext(td.getnext().getnext().getnext().getnext()) fb_team_href = td.cssselect('a')[0].attrib['href'].rsplit('/', 1)[1] try: team = fbs_get_team(teams, fb_team_href) team[keys.entity_record] = record print team[keys.entity_profile].split(':', 1)[1], record team.partial_save() except: print 'no such luck:', fb_team_href rankings_html = yield cv.goto_url('http://www.espn.com/college-football/rankings').addCallback(cv.to_html) try: for h2 in rankings_html.cssselect('h2.table-caption'): if parse.csstext(h2) == 'AP Top 25': for r in h2.getparent().cssselect('table')[0].cssselect('span.number'): fb_team_href = r.getparent().getnext().cssselect('a.logo')[0].attrib['href'].rsplit('/', 1)[1] team = fbs_get_team(teams, fb_team_href) team[keys.entity_rank] = parse.csstext(r) team.partial_save() except Exception as e: print 'e:', e
def pullteam(self, h, players = []): doc = html.document_fromstring(h) for tr in doc.cssselect('a[title="List of sovereign states"]'): tr = tr.getparent().getparent() while tr.getnext() is not None: tr = tr.getnext() try: country = parse.csstext(tr.cssselect('th a')[0]) if country: for td in tr.cssselect('td'): if len(td.cssselect('a')) == 2: try: player = {} player[keys.entity_team] = 'World Leaders' player[keys.entity_position] = parse.csstext(td.cssselect('a')[0]).split('\xc2')[0] player[keys.entity_name] = parse.csstext(td.cssselect('a')[1]) player[keys.entity_profile] = fixed.clean_url('http://en.wikipedia.org' + td.cssselect('a')[1].attrib['href']) player[keys.entity_country] = country players.append(player) except Exception as e: print 'world leader exception:', e except Exception as e: print 'world leader exception:', e players.append({keys.entity_twitter: 'UN', keys.entity_profile: 'team:World Leaders'}) return players
def callbackExtractGovernors(self, h): try: governors = [] doc = html.document_fromstring(h) h2 = doc.cssselect('h2 span[id="State_governors"]')[0].getparent() while h2.tag != 'table': h2 = h2.getnext() #n = doc.xpath('/html/body/div[3]/div[4]/div[4]/div/table[1]')[0] for tr in h2.cssselect('tr'): g = {} try: g[keys.entity_team] = 'Governors' g[keys.entity_flag] = 'http:' + tr[0].xpath('div[1]/a/img')[0].attrib['src'] g[keys.entity_state] = tr[0].xpath('div[2]/a')[0].text try: g[keys.entity_pic] = 'http:' + tr[1].find('a').find('img').attrib['src'] except: pass g[keys.entity_name] = tr[2].cssselect('center span.vcard a')[0].attrib['title'] #t = tr[2].find(".//a") g[keys.entity_profile] = fixed.clean_url('http://en.wikipedia.org' + tr[2].cssselect('center span.vcard a')[0].attrib['href']) g[keys.entity_party] = tr[4].find('a').text g[keys.entity_prior_exp] = parse.csstext(tr[5]) g[keys.entity_assumed_office] = parse.csstext(tr[6]) g[keys.entity_term_expires] = parse.csstext(tr[7]) governors.append(g) except: pass governors.append({ keys.entity_twitter: 'NatlGovsAssoc', keys.entity_profile: 'team:Governors'}) return governors except Exception as e: print e
def pullteams(self, h): players=[] doc = html.document_fromstring(h) h3 = doc.cssselect('h3 ~ ul li') print 'h3 length:', len(h3) h2 = doc.cssselect('h2 ~ ul li') print 'h2 length:', len(h2) h3.extend(h2) for li in h3: player = {} try: player[keys.entity_team] = 'PAC' player[keys.entity_topic] = parse.csstext(li.getparent().getprevious().getchildren()[0]) if player[keys.entity_topic] != 'External links': try: href = li.cssselect('a')[0].attrib['href'] if not urlparse(href).scheme and href: href = 'http://en.wikipedia.org' + href player[keys.entity_profile] = fixed.clean_url(href) player[keys.entity_name] = parse.csstext(li.cssselect('a')[0]) if player[keys.entity_name] and player[keys.entity_profile]: if player[keys.entity_name].rfind(' - ') > 0: player[keys.entity_location] = player[keys.entity_name][player[keys.entity_name].rfind(' - ') + 3:] player[keys.entity_name] = player[keys.entity_name][:player[keys.entity_name].rfind(' - ')] #print player players.append(player) except Exception as e2: print 'exception inner:', e2 except: pass players.append({keys.entity_profile: 'team:PAC', keys.entity_twitter: 'FEC'}) return players
def entities(self): self.gamblers.extend(BOOTH().table()) yield self.cv.goto_url('http://www.wsop.com/players/').addCallback(lambda ign: task.deferLater(reactor, 5, defer.succeed, True)) self.cv.page().runJavaScript('data', self.cb) yield self.done for l in must_haves: for pid in l: purl = 'http://www.wsop.com/players/profile/?playerid=' + pid if purl not in [g[keys.entity_profile] for g in self.gamblers]: print purl try: html = yield self.cv.goto_url(purl).addCallback(self.cv.to_html) missing_gambler = {} missing_gambler[keys.entity_profile] = purl missing_gambler[keys.entity_name] = parse.csstext(html.cssselect('div.iRight div.iRightContent h3')[0]) try: missing_gambler[keys.entity_country] = html.cssselect('div.PPCountry')[0].cssselect('i')[0].attrib['title'] except: pass tr = html.cssselect('table[id="PPtotals"] tr')[0] missing_gambler[keys.entity_bracelets] = parse.csstext(tr[0].cssselect('b')[0]) missing_gambler[keys.entity_rings] = parse.csstext(tr[1].cssselect('b')[0]) missing_gambler[keys.entity_cashes] = parse.csstext(tr[2].cssselect('b')[0]) missing_gambler[keys.entity_earnings] = parse.csstext(tr[3].cssselect('b')[0]) print 'missing:', missing_gambler self.gamblers.append(missing_gambler) except Exception as e: print e defer.returnValue(self.gamblers)
def getplayers(self, html, team): team['players'] = [] for a in html.cssselect('a.squadPlayerCard'): print a #print 'hey:', player_span, player_span.cssselect('div.playerPhoto img')[0].attrib player = {} player[keys.entity_profile] = fixed.clean_url(ipl_base + a.attrib['href']) #print 'player 1:', player player[ keys. entity_pic] = 'http://iplstatic.s3.amazonaws.com/players/210/' + a.cssselect( 'div.playerPhoto')[0].cssselect('img[data-player-id]')[ 0].attrib['data-player-id'] + '.png' #print 'player 2:', player player[keys.entity_name] = parse.csstext( a.cssselect('p.player-name')[0]) #print 'player 3:', player if len(a.cssselect('span.captain')) > 0: player[keys.entity_captain] = True if len(a.cssselect('span.overseas-player')) > 0: player[keys.entity_foreign] = True if len(a.cssselect('span.wicket-keeper')) > 0: player[keys.entity_position] = "Wicket Keeper" for li in a.cssselect('ul.stats li'): label = parse.csstext(li.cssselect('span.label')[0]) value = parse.csstext(li.cssselect('span.value')[0]) player[label.lower()] = value print player team['players'].append(player) print 'length of team:', len(team['players']) return team
def get_teams(self, html): teams = [] for conference in html.cssselect('div.mt7'): conference_name = parse.csstext( conference.cssselect('div.headline')[0]) conference_name = conference_name.lower().replace(' ', '').replace( '-', '').replace('americanathletic', 'aac').replace('midamerican', 'mac').replace( 'conferenceusa', 'cusa').replace('fbsindependents', 'fbsindependent') if conference_name == self.get_league_name(): print 'conference:', conference_name, len( conference.cssselect('section.TeamLinks')) for section in conference.cssselect('section.TeamLinks'): team = { 'conference': conference_name, 'link': 'http://espn.go.com' + section.cssselect('a')[0].attrib['href'] } for a in section.cssselect( 'div.TeamLinks__Links span.TeamLinks__Link a'): if parse.csstext(a).lower() == 'roster': roster_link = 'http://espn.go.com' + a.attrib[ 'href'] team['roster_link'] = roster_link teams.append(team) print teams return teams
def page_extract(self, html): actors = [] for div in html.cssselect( 'div.lister-list div.lister-item.mode-detail'): try: actor = {} actor[keys.entity_rank] = parse.csstext( div.cssselect('span.lister-item-index.unbold.text-primary') [0]).split('.')[0] actor[keys.entity_pic] = div.cssselect( 'div.lister-item-image a img')[0].attrib['src'] actor[keys.entity_name] = parse.csstext( div.cssselect('h3.lister-item-header a')[0]).strip() actor[keys.entity_profile] = fixed.clean_url( 'http://www.imdb.com' + div.cssselect('div.lister-item-image a')[0].attrib['href']) actor[keys.entity_position] = parse.csstext( div.cssselect('p.text-muted.text-small')[0]).split( '|')[0].strip() actor[keys_hollywood.noted] = parse.csstext( div.cssselect('p.text-muted.text-small a')[0]).strip() actor[keys_hollywood.noted_profile] = fixed.clean_url( 'http://www.imdb.com' + div.cssselect( 'p.text-muted.text-small a')[0].attrib['href']) print actor actors.append(actor) except Exception as e: print 'page_extract exception:', e return actors
def get_skater(self, html, community):
    """Append skater dicts parsed from a boardr ranking table to community.

    Bare ``except:`` clauses narrowed to ``Exception``; the nested fallback
    (age+points vs points-only rows) is deliberate and preserved.
    """
    for tr in html.cssselect('table.vitals.vitalsshrink tr')[1:]:
        skater = {}
        # Drop the two trailing characters of the rank cell (suffix text).
        skater[keys.entity_rank] = parse.csstext(tr[0])[:-2]
        skater[keys.entity_profile] = fixed.clean_url(
            tr[1].cssselect('a')[0].attrib['href'])
        skater[keys.entity_name] = parse.csstext(
            tr[2].cssselect('a')[0]).replace(',', '')
        skater[keys.entity_country] = parse.csstext(tr[2].cssselect('a')[1])
        try:
            # Rows with two <br>s carry "Age: N" then "Points: N".
            skater[keys.entity_age] = tr.cssselect(
                'h3 br')[0].tail.strip().split(' ')[1]
            skater[keys.entity_points] = tr.cssselect(
                'h3 br')[1].tail.strip().split(' ')[1]
        except Exception:
            try:
                # Single-<br> rows carry points only.
                skater[keys.entity_points] = tr.cssselect(
                    'h3 br')[0].tail.strip().split(' ')[1]
            except Exception:
                pass
        # Only attach the headshot if it actually exists (HEAD probe).
        pic_url = ('https://theboardr.blob.core.windows.net/headshots/'
                   + skater[keys.entity_profile].split('/')[4] + '_900.jpg')
        check = requests.head(pic_url,
                              headers={
                                  'User-Agent': 'curl/7.35.0',
                                  'Accept': '*/*'
                              },
                              verify=True)
        if check.status_code == 200:
            skater[keys.entity_pic] = pic_url
        skater[keys.entity_team] = self.skating
        community.append(skater)
def get_community(self, html, community, gender='Male'): trs = html.cssselect( 'table[class="tableType-athlete hasGroups"]')[0].cssselect('tr') print 'community length:', len(trs) for tr in trs: player = {} try: player[keys.entity_rank] = parse.csstext( tr.cssselect('td[class~="athlete-tour-rank"]')[0]) #player[keys.entity_rank_change] = parse.csstext(tr.cssselect('td[class="athlete-tour-rank-change"]')[0]) name_element = parse.csstext( tr.cssselect('a[class="athlete-name"]')[0]).title() player[keys.entity_name] = name_element.replace( 'INJU', '').replace('RECO', '').strip() player[keys.entity_profile] = fixed.clean_url( 'http://www.worldsurfleague.com' + tr.cssselect('a[class="athlete-name"]')[0].attrib['href']) player[keys.entity_origin] = tr.cssselect( 'span.athlete-country-flag')[0].attrib['title'] player[keys.entity_points] = parse.csstext( tr.cssselect('span[class="tour-points"]')[0]) player[keys.entity_prizemoney] = parse.csstext( tr.cssselect('td[class~="athlete-tour-prize-money"]')[0]) if player[keys.entity_name]: player[keys.entity_team] = self.surfing community.append(player) except Exception as e: if keys.entity_name in player: player[keys.entity_team] = self.surfing community.append(player)
def fc_standings(league_name, urls, teams, sub = {}): print 'league:', league_name, 'team length:', len(teams) for url in urls: fc_standings_html = yield cv.goto_url(url).addCallback(cv.to_html) print 'fc_standings html length:', len(fc_standings_html), url, cv.page().url().toString() team_tds = fc_standings_html.cssselect('tr.standings-row') print 'team tds:', len(team_tds) for i, team_td in enumerate(team_tds): rank = i + 1 try: tn = parse.csstext(team_td.cssselect('span.team-names')[0]) if tn in sub: #print 'sub:', tn, sub[tn] tn = sub[tn] wdl = team_td.cssselect('td[style="white-space:no-wrap;"]') wins = parse.csstext(wdl[0]) ties = parse.csstext(wdl[1]) losses = parse.csstext(wdl[2]) record = wins + '-' + losses + '-' + ties print 'team:', tn, 'record:', record found = False for t in Entity().query_2(league__eq=league_name, profile__eq='team:' + tn): found = True t[keys.entity_record] = record t[keys.entity_rank] = rank print tn, rank, record t.partial_save() if not found: for t2 in Entity().query_2(league__eq=league_name, profile__beginswith='team:' + tn): found = True t2[keys.entity_record] = record t2[keys.entity_rank] = rank print tn, rank, record t2.partial_save() if not found: try: potential_teams = [t3 for t3 in teams if tn in t3[keys.entity_profile]] if len(potential_teams) == 1: found = True potential_teams[0] potential_teams[0][keys.entity_record] = record potential_teams[0][keys.entity_rank] = rank print tn, rank, record potential_teams[0].partial_save() except: pass if not found: print ' missing:', tn, rank except Exception as e: print 'fc exception:', e #<span class="team-names">Barcelona</span> '''
def sprintcup_drivers(self, html): drivers = [] for tr in html.cssselect('table.driver-list-table tr')[1:]: ''' <tr> <td class="driver-name-td"><a href="/drivers/driversaj-allmendinger/">Allmendinger, AJ</a></td> <td class="driver-number-td">47</td> <td class="driver-make-td"><img src="/wp-content/uploads/sites/7/2017/01/Chevy-Driver-Page-New-2-160x811-265x180.png"></td> <td class="driver-team-td">JTG Daugherty Racing</td> </tr> ''' driver = {} driver[keys.entity_profile] = fixed.clean_url( NASCAR.nascar_url + tr.cssselect('td.driver-name-td a')[0].attrib['href']) driver[keys.entity_name] = parse.csstext( tr.cssselect('td.driver-name-td a')[0]).strip() driver[keys. entity_name] = driver[keys.entity_name].split(',')[1].strip( ) + ' ' + driver[keys.entity_name].split(',')[0].strip() driver[keys.entity_carnumber] = parse.csstext( tr.cssselect('td.driver-number-td')[0]) team = parse.csstext(tr.cssselect('td.driver-team-td')[0]).strip() if not team: team = NASCAR.unaffiliated driver[keys.entity_team] = team driver[keys.entity_carnumber] = parse.csstext( tr.cssselect('td.driver-number-td')[0]) driver[keys.entity_circuit] = self.get_common_name() print driver drivers.append(driver) #driver[keys.entity_rank] = parse.csstext(div.cssselect('div.position')[0]).strip() #driver[keys.entity_name] = parse.csstext(div.cssselect('div.driver div.driver-first')[0]).split() + ' ' + parse.csstext(div.cssselect('div.driver div.driver-last')[0]).split() #<div class="driver"><div class="driver-first"> Martin</div><div class="driver-last">Truex Jr.</div><div class="legend-symbols"></div></div> #tr = driver_art.getparent().getparent().getparent().getparent().getparent() #driver[keys.entity_points] = parse.csstext(tr.cssselect('td')[3]) #driver[keys.entity_points_behind] = parse.csstext(tr.cssselect('td')[4]).replace('--','') #driver[keys.entity_starts] = parse.csstext(tr.cssselect('td')[5]) #driver[keys.entity_wins] = parse.csstext(tr.cssselect('td')[6]) #driver[keys.entity_top5] = 
parse.csstext(tr.cssselect('td')[7]) #driver[keys.entity_top10] = parse.csstext(tr.cssselect('td')[8]) #driver[keys.entity_dnf] = parse.csstext(tr.cssselect('td')[9]) #if not team: # team = NASCAR.unaffiliated #elif 'team:' + team not in [t[keys.entity_profile] for t in drivers]: # drivers.append({ keys.entity_profile: 'team:' + team }) driver[keys.entity_circuit] = self.get_common_name() drivers.append(driver) return drivers
def playerinfo(self, html, player):
    """Fill ``player`` with label/value pairs scraped from the profile page.

    Labels are mapped to entity keys via ``self.key_lookup``.
    """
    for key in html.cssselect('td[class="label"]'):
        # Numeric "labels" are table data, not field names.
        if not parse.csstext(key).isnumeric():
            value = key.getnext()
            player[self.key_lookup(
                parse.csstext(key).lower())] = parse.csstext(value)
    for key in html.cssselect('p[class="qsHeader"]'):
        value = key.getnext()
        # BUG fix: original passed the lxml element itself to key_lookup;
        # pass the lowered text, matching the td loop above.
        player[self.key_lookup(
            parse.csstext(key).lower())] = parse.csstext(value)
def getComponents(self):
    """Scrape the WSJ Billion Dollar Club table plus each company's detail
    card (opened via injected JavaScript) into player dicts.

    Twisted inlineCallbacks-style generator; result delivered via
    ``defer.returnValue``.  Fixed: removed a duplicated
    ``.split(', founder')[0]`` in the CEO-name cleanup chain; narrowed the
    bare except.
    """
    components = []
    yield self.cv.goto_url(
        'http://graphics.wsj.com/billion-dollar-club/').addCallback(
            lambda ign: task.deferLater(reactor, 5, defer.succeed, True))
    html = yield self.cv.to_html()
    for i, company in enumerate(
            html.cssselect('table[id="data-table"] tbody tr')):
        player = {keys.entity_team: 'WSJ Billion Dollar Startup'}
        player[keys.entity_rank] = i + 1
        player[keys.entity_name] = parse.csstext(
            company.cssselect('td.company')[0]).strip()
        player[keys.entity_valuation] = parse.csstext(
            company.cssselect('td.valuation')[0])
        player[keys.entity_total_funding] = parse.csstext(
            company.cssselect('td.total_funding')[0])
        player[keys.entity_last_valuation] = parse.csstext(
            company.cssselect('td.val_date')[0])
        # Expand the i-th detail card, give the page a second to render.
        self.cv.page().runJavaScript(js_link % str(i))
        d = task.deferLater(reactor, 1, defer.succeed, True)
        d.addCallback(self.cv.to_html)
        html2 = yield d
        dets = html2.cssselect('tr.card-tr')[0]
        try:
            player[keys.entity_rounds] = parse.csstext(
                dets.cssselect('div[class="rounds co-info"]')[0][1])
        except Exception:
            pass  # rounds info is optional
        for ceo in dets.cssselect('div[class="ceo co-info"]'):
            # Strip the label and any founder annotations, then split out
            # multiple names joined by ' and '.
            ceo_txt = parse.csstext(ceo).replace('CEO:', '')
            c = ceo_txt.split('(co-founder)')[0].split('(founder)')[
                0].split(', founder')[0].split(
                    '(co-founders)')[0].split(', co-founder')
            for rceo in c[0].split(' and '):
                rceo = rceo.strip()
                if keys.entity_ceo not in player:
                    player[keys.entity_ceo] = [rceo]
                else:
                    player[keys.entity_ceo].append(rceo)
        player[keys.entity_ratio] = parse.csstext(
            dets.cssselect('div[class="ratio co-info"] span[class="val"]')[0])
        player[keys.entity_location] = parse.csstext(
            dets.cssselect(
                'div[class="location co-info"] span[class="val"]')[0])
        player[keys.entity_competitors] = parse.csstext(
            dets.cssselect(
                'p[class="competitors co-info"] span[class="val"]')[0])
        player[keys.entity_investors] = parse.csstext(
            dets.cssselect(
                'p[class="investors co-info"] span[class="val"]')[0])
        components.append(player)
    defer.returnValue(components)
def getICOs(self): html = yield self.cv.goto_url( 'https://coinmarketcap.com/all/views/all/').addCallback( self.cv.to_html) trs = html.cssselect( 'div.table-responsive.compact-name-column div.dataTables_wrapper.no-footer table tr' ) icos = [] for tr in trs[1:][:1200]: name = parse.csstext(tr.cssselect('a.currency-name-container')[0]) rank = parse.csstext(tr[0]) symbol = parse.csstext(tr.cssselect('td.col-symbol')[0]) try: href = tr.cssselect('span.currency-symbol a')[0].attrib['href'] profile = fixed.clean_url('http://coinmarketcap.com' + href) print 'rank:', rank, 'name:', name, 'sybol:', symbol ico = { keys.entity_name: name, keys.entity_profile: profile, keys_market.symbol: symbol, keys.entity_rank: rank } try: market_cap = twitter_keys.numTwitter( int( parse.csstext( tr.cssselect('td.no-wrap.market-cap.text-right' )[0]).replace('$', '').replace( ',', '').strip())) ico[keys.entity_market_cap] = market_cap except: pass try: supply = twitter_keys.numTwitter( int( parse.csstext( tr.cssselect( 'td.no-wrap.text-right.circulating-supply') [0]).replace('*', '').replace(',', '').strip())) ico[keys.entity_circulating_supply] = supply except: pass icos.append(ico) except: pass defer.returnValue(icos)
def update_player(self, html, player): for info in html.cssselect('div.personalLists ul li div.info'): label = parse.csstext(info.getprevious()) if label == 'Weight': player[keys.entity_weight] = parse.csstext(info) elif label == 'Height': player[keys.entity_height] = parse.csstext(info) elif label == 'Date of Birth': player[keys.entity_dob] = parse.csstext(info) elif label == 'Age': player[keys.entity_age] = parse.csstext(info) print '' print player print ''
def add_players(self, html, team): for li in html.cssselect('ul.squadListContainer.squadList > li'): player = {} player[keys.entity_profile] = self.bpl_url + li.cssselect( 'a.playerOverviewCard')[0].attrib['href'] player[keys.entity_name] = parse.csstext( li.cssselect('h4.name')[0]) player[keys.entity_jersey] = parse.csstext( li.cssselect('span.number')[0]) player[keys.entity_position] = parse.csstext( li.cssselect('span.position')[0]) try: player[keys.entity_nationality] = parse.csstext( li.cssselect( 'li.nationality dl dd.info span.playerCountry')[0]) except: pass for l in li.cssselect('ul.squadPlayerStats li dl dd.info'): label = parse.csstext(l.getprevious()) if label == 'Appearances': player[keys.entity_appearances] = parse.csstext(l) elif label == 'Goals': player[keys.entity_goals] = parse.csstext(l) elif label == 'Assists': player[keys.entity_assists] = parse.csstext(l) try: player[keys.entity_pic] = 'http:' + li.cssselect( 'img.statCardImg')[0].attrib['src'] except: pass print 'player:', player team['players'].append(player)
def thefundedTopRatedVCs(self, html): components = [] for rank in html.cssselect('div[id="post"]')[0].cssselect( 'p[class="larger red"]'): player = {keys.entity_team: 'TheFunded Top Partners'} player[keys.entity_rank] = parse.csstext(rank)[:-1] player[keys.entity_name] = parse.csstext( rank.getnext().cssselect('a')[0]) player[keys.entity_profile] = fixed.clean_url( 'http://www.thefunded.com' + rank.getnext().cssselect('a')[0].attrib['href']) player[keys.entity_firm] = parse.csstext( rank.getparent().cssselect('a[class="fund"]')[0]) print 'player:', player components.append(player) return components
def get_teams_links(self, doc): teams = [] for a in doc.cssselect('div.team__list_wrapper div.team__list a'): href = a.attrib['href'] teams.append({'link': NBA.nba_url + href, 'team': parse.csstext(a) }) print 'nba links:', teams return teams
def nfl_standings(): nfl_standings = yield cv.goto_url('http://www.espn.com/nfl/standings').addCallback(cv.to_html) for span in nfl_standings.cssselect('span span.team-names'): td = span.getparent().getparent().getparent() wins = parse.csstext(td.getnext()) losses = parse.csstext(td.getnext().getnext()) ties = parse.csstext(td.getnext().getnext().getnext()) tn = parse.csstext(span) try: record = wins + '-' + losses + '-' + ties t = Entity().get_item(league='nfl', profile='team:' + tn) t[keys.entity_record] = record print tn, record t.partial_save() except Exception as e: print e
def getTweetKit(self, msg):
    # Paste `msg` into the Twitter compose box via the system clipboard and
    # keep trimming the trailing word until the character counter no longer
    # shows the over-limit warning; the (possibly shortened) message is
    # delivered through defer.returnValue.  Twisted inlineCallbacks-style
    # generator.
    #
    # NOTE(review): this only re-navigates when the page is ALREADY at
    # http://twitter.com (`==`); `!=` looks like it may have been intended —
    # confirm against the caller's expectations.
    if self.page().url().toString() == 'http://twitter.com':
        yield self.goto_url('http://twitter.com')
    # Stage the message on the clipboard, run the focus/setup JS, then
    # select-all + paste to replace the compose box contents.
    qt5.app.clipboard().setText(msg)
    self.page().runJavaScript(js_key)
    yield task.deferLater(reactor, 1, defer.succeed, True)
    self.page().triggerAction(QWebEnginePage.SelectAll)
    self.page().triggerAction(QWebEnginePage.Paste)
    yield task.deferLater(reactor, 1, defer.succeed, True)
    html = yield self.to_html()
    # While the counter span reports max-reached, drop the last
    # whitespace-separated word and re-paste.
    while len(
            html.cssselect(
                'span[class="tweet-counter superwarn max-reached"]')) > 0:
        print 'bad:', parse.csstext(
            html.cssselect(
                'span[class="tweet-counter superwarn max-reached"]')[0])
        msg = msg.rsplit(' ', 1)[0]
        qt5.app.clipboard().setText(msg)
        self.page().triggerAction(QWebEnginePage.SelectAll)
        self.page().triggerAction(QWebEnginePage.Paste)
        yield task.deferLater(reactor, .5, defer.succeed, True)
        html = yield self.to_html()
    # Debug snapshot of the final compose-page state.
    parse.dumpit(html, '/tmp/tweet_trim.html')
    defer.returnValue(msg)
def nhl_standings(): nhl_standings = yield cv.goto_url('https://www.nhl.com/standings').addCallback(cv.to_html) for span in nhl_standings.cssselect('a span.team--name'): try: tn = parse.csstext(span) td = span.getparent().getparent().getparent() wins = parse.csstext(td.getnext().getnext()) losses = parse.csstext(td.getnext().getnext().getnext()) ot = parse.csstext(td.getnext().getnext().getnext().getnext()) record = wins + '-' + losses + '-' + ot for t in Entity().query_2(league__eq='nhl', profile__beginswith='team:' + tn): t[keys.entity_record] = record print tn, record t.partial_save() except Exception as e: print e
def scrape_divisions(self, html, divisions):
    # Scrape fighter rows from each weight-class division section of a
    # Wikipedia rankings page.  `divisions` is a list of section titles;
    # "Women's X" titles flip the gender flag and are stripped down to the
    # bare class name.  Returns a flat list of fighter dicts.
    #
    # NOTE(review): `mens` is set False at the first Women's division and is
    # never reset to True, so any men's division processed AFTER a women's
    # one would be tagged 'Female' — confirm the divisions list always puts
    # women's classes last.
    print 'scrape_division:', divisions
    players = []
    mens = True
    for wc in divisions:
        # Wikipedia section ids: spaces -> underscores, apostrophe -> ".27".
        wccss = wc.replace(" ", "_").replace("'", ".27")
        if "Women's" in wc:
            mens = False
            wc = wc.replace("Women's ", "").capitalize()
        print 'team:', wc, wccss
        css_string = 'span[id^="' + wccss + '"]'
        print css_string
        try:
            css = CSSSelector(css_string)(html)[0]
        except:
            # Fallback: some section ids keep the original capitalization.
            css_string = css_string.replace(wc.lower(), wc)
            print css_string
            css = CSSSelector(css_string)(html)[0]
        # Walk from the section heading to the first following <table>.
        t = css.getparent()
        while t.tag != 'table':
            t = t.getnext()
        print 'finally:', t.tag
        # Skip the two header rows and the trailing footer row.
        for tr in t.findall('.//tr')[2:][:-1]:
            #print etree.tostring(tr)
            #country = parse.csstext(tr.findall('.//td')[0])
            fighter = {}
            try:
                # Nickname, when present, is italicized in the third cell.
                fighter[keys.entity_nickname] = parse.csstext(
                    tr.find('.//td[3]/i'))
            except:
                pass
            fighter[keys.entity_gender] = 'Male' if mens else 'Female'
            #fighter[keys.entity_origin] = country
            try:
                # Preferred: linked name inside the vcard span.
                a = tr.cssselect('td span.vcard span a')[0]
                fighter[keys.entity_name] = parse.csstext(a)
            except:
                # Fallback: plain text of the first cell.
                fighter[keys.entity_name] = parse.csstext(
                    tr.find('.//td[1]'))
            # "(C)" anywhere in the row marks the current champion.
            if '(C)' in parse.csstext(tr):
                fighter['titleholder'] = 'yes'
            fighter[keys.entity_weightclass] = wc
            print fighter
            players.append(fighter)
    print 'done: figther len', len(players)
    return players
def get_community(self, html, community): subteams = {} for cycling_team in html.cssselect('.team_box')[0].cssselect('ul li'): jersey_pic = cycling_team.cssselect('a img')[0].attrib['src'] thref = 'http://www.cyclingnews.com' + cycling_team.cssselect( 'a')[0].attrib['href'] print 'team url:', thref subteam = {} subteam[keys.entity_jersey_pic] = jersey_pic subteams[thref] = subteam for k, st in subteams.iteritems(): d = self.cv.goto_url(k) d.addCallback(self.cv.to_html) d.addErrback(self.error_league) subhtml = yield d team_name = parse.csstext( subhtml.cssselect('div[class="team-name"]')[0]) st[keys.entity_profile] = 'team:' + team_name print 'cycle team:', st for rider in subhtml.cssselect('div.riders div.rider'): player = {} player[keys.entity_team] = team_name player[keys.entity_name] = parse.csstext( rider.cssselect('a')[0]) player[keys.entity_profile] = fixed.clean_url( 'http://www.cyclingnews.com' + rider.cssselect('a')[0].attrib['href']) #print 'found one!:', player community.append(player) for p in community: d = self.cv.goto_url(p[keys.entity_profile] + "/") d.addCallback(self.cv.to_html) d.addErrback(self.error_league) riderhtml = yield d try: rider = riderhtml.cssselect('rider-info-boxout')[0] p[keys.entity_pic] = rider.cssselect( 'img.rider-image')[0].attrib['src'] p[keys.entity_dob] = parse.csstext( rider.cssselect('span[itemprop="birthDate')[0]) p[keys.entity_nationality] = parse.csstext( rider.cssselect('span[itemprop="nationality')[0]) except: pass community.extend(subteams.values()) defer.returnValue(community)
def process_team(self, doc, team): print 'process team' team['players'] = [] for section in doc.cssselect('section.row.nba-player-index__row'): for p in section.cssselect('section.nba-player-index__trending-item'): player = {} player[keys.entity_jersey] = parse.csstext(p.cssselect('span.nba-player-trending-item__number')[0]) anchor = p.cssselect('a')[0] player[keys.entity_name] = anchor.attrib['title'] player[keys.entity_profile] = fixed.clean_url(NBA.nba_url + anchor.attrib['href']) player[keys.entity_pic] = 'http:' + anchor.cssselect('div.nba-player-index__image div.nba-player-index__headshot_wrapper img')[0].attrib['data-src'] player[keys.entity_position] = parse.csstext(p.cssselect('div.nba-player-index__details span')[0]) player[keys.entity_height] = parse.csstext(p.cssselect('div.nba-player-index__details strong')[0]).split(' ')[0] + '\' ' + parse.csstext(p.cssselect('div.nba-player-index__details strong')[1]).split(' ')[0] + '\"' player[keys.entity_weight] = parse.csstext(p.cssselect('div.nba-player-index__details strong')[2]) team['players'].append(player) print 'team:', team['team'], 'players length:', len(team['players']) return team
def innerHtml(self, frag, ico):
    """Pull the @handle out of an embedded Twitter timeline fragment and
    store it on ``ico``.

    Fixed: the original did ``split('@')[1]`` unconditionally, raising
    IndexError when the header text contains no '@' — despite the
    ``if twitter`` guard implying a best-effort extraction.
    """
    frame_html = soupparser.fromstring(frag)
    frame_anchor = frame_html.cssselect(
        'h1.timeline-Header-title.u-inlineBlock a.customisable-highlight'
    )[0]
    header_text = parse.csstext(frame_anchor)
    parts = header_text.split('@')
    twitter = parts[1] if len(parts) > 1 else ''
    if twitter:
        ico[keys.entity_twitter] = twitter
def callbackExtractHouse(self, h):
    # Parse the Wikipedia "Voting members by state" table into a list of
    # House-of-Representatives dicts, plus a trailing team marker entry.
    # Row layouts vary (9 cells vs 7 when a party cell spans rows), so most
    # field lookups are negative-indexed from the row end.
    representatives = []
    doc = html.document_fromstring(h)
    # The table is the sibling immediately after the section heading.
    table = doc.cssselect('h2 span[id="Voting_members_by_state"]')[0].getparent().getnext()
    trs = table.cssselect('tr')
    for tr in trs[1:]:
        try:
            congress = {}
            congress[keys.entity_team] = 'House of Representatives'
            # District link text is "State N"; drop the trailing district number.
            s = parse.csstext(tr[0].cssselect("a")[0]).split(" ")[:-1]
            # NOTE(review): remove(' at') raising ValueError jumps straight to
            # the except, so remove('At') only runs when ' at' was present —
            # presumably handles "at-large" district labels; confirm.
            try:
                s.remove(' at')
                s.remove('At')
            except:
                pass
            congress[keys.entity_state] = ' '.join(s)
            if congress[keys.entity_state].endswith(' at'):
                congress[keys.entity_state] = congress[keys.entity_state][:-3]
            try:
                # Portrait is optional.
                congress[keys.entity_pic] = 'http:' + tr[1].cssselect("a img")[0].attrib['src']
            except:
                pass
            congress[keys.entity_name] = tr[1].cssselect('span.vcard a')[0].text
            congress[keys.entity_profile] = fixed.clean_url('http://en.wikipedia.org' + tr[1].cssselect('span.vcard a')[0].attrib['href'])
            if len(tr) == 9:
                congress[keys.entity_party] = parse.csstext(tr[3])
            elif len(tr) == 7:
                # Shorter rows inherit the party from the previous member
                # (rowspan-collapsed party cell).
                congress[keys.entity_party] = representatives[-1][keys.entity_party]
            congress[keys.entity_prior_exp] = parse.csstext(tr[-5])
            congress[keys.entity_college] = parse.csstext(tr[-4])
            try:
                # Assumed-office year may carry a footnote asterisk.
                ao = tr[-3].text
                if '*' in ao:
                    ao = ao.replace('*', '')
                congress[keys.entity_assumed_office] = ao.strip()
            except:
                pass
            congress[keys.entity_born] = parse.csstext(tr[-1]).strip()
            representatives.append(congress)
        except:
            # Skip malformed/header rows.
            pass
    # Sentinel entry identifying the team itself.
    representatives.append({keys.entity_twitter: 'USHouseHistory', keys.entity_profile: 'team:House of Representatives' })
    return representatives
def getteams(self, html):
    """Return team dicts for each club anchor, expanding the 'R.' prefix to
    'Real'."""
    result = []
    for anchor in html.cssselect('div[class="equipos"] a'):
        club = parse.csstext(anchor)
        if club.startswith('R.'):
            club = 'Real' + club[2:]
        result.append({
            'href': anchor.attrib['href'],
            'team': club,
            'players': [],
        })
    return result
def is_born(self, html, maybeperson, url):
    """Fill name, birth date (when available), and profile URL on
    ``maybeperson`` from a Wikipedia biography page."""
    # Prefer the infobox name; fall back to the page heading.
    try:
        maybeperson[keys.entity_name] = parse.csstext(
            html.cssselect(
                'table[class="infobox biography vcard"] tr th span')[0])
    except:
        maybeperson[keys.entity_name] = parse.csstext(
            html.cssselect('h1[id="firstHeading"][class="firstHeading"]')[0])
    for th in html.cssselect('th'):
        if parse.csstext(th).lower() not in ['born', 'date of birth']:
            continue
        try:
            # The bday span sits in the same row as the Born header.
            maybeperson[keys.entity_dob] = parse.csstext(
                th.getparent().cssselect(
                    'span[class="bday"]')[0]).replace(')', '')
        except:
            pass
    maybeperson[keys.entity_profile] = fixed.clean_url(url)
def nba_standings(): nba_standings = yield cv.goto_url('http://www.espn.com/nba/standings').addCallback(cv.to_html) for span in nba_standings.cssselect('span span.team-names'): try: tn = parse.csstext(span) td = span.getparent().getparent().getparent() wins = parse.csstext(td.getnext()) losses = parse.csstext(td.getnext().getnext()) record = wins + '-' + losses found = False for t in Entity().query_2(league__eq='nba', profile__eq='team:' + tn): found = True t[keys.entity_record] = record print tn, record t.partial_save() if not found: print 'missing:', tn except Exception as e: print e