Example #1
0
def fbs_standings(): 
    teams = [t for t in Entity().query_2(index=Entity.index_site_profile, site__eq='d1tweets.com', profile__beginswith='team:')]
    standings_html = yield cv.goto_url('http://www.espn.com/college-football/standings').addCallback(cv.to_html)
    for td in standings_html.cssselect('tr.standings-row td.team'):
        record = parse.csstext(td.getnext().getnext().getnext().getnext())
        fb_team_href = td.cssselect('a')[0].attrib['href'].rsplit('/', 1)[1]
        try:
            team = fbs_get_team(teams, fb_team_href)
            team[keys.entity_record] = record
            print team[keys.entity_profile].split(':', 1)[1], record
            team.partial_save()            
        except:
            print 'no such luck:', fb_team_href
    
    rankings_html = yield cv.goto_url('http://www.espn.com/college-football/rankings').addCallback(cv.to_html)
    try:
        for h2 in rankings_html.cssselect('h2.table-caption'):
            if parse.csstext(h2) == 'AP Top 25':
                for r in h2.getparent().cssselect('table')[0].cssselect('span.number'):
                    fb_team_href = r.getparent().getnext().cssselect('a.logo')[0].attrib['href'].rsplit('/', 1)[1]
                    team = fbs_get_team(teams, fb_team_href)
                    team[keys.entity_rank] = parse.csstext(r)
                    team.partial_save()
    except Exception as e:
        print 'e:', e
Example #2
0
 def pullteam(self, h, players = []):
     doc = html.document_fromstring(h)
     for tr in doc.cssselect('a[title="List of sovereign states"]'):
         tr = tr.getparent().getparent()            
         while tr.getnext() is not None:
             tr = tr.getnext()
             try:
                 country = parse.csstext(tr.cssselect('th a')[0])
                 if country:
                     for td in tr.cssselect('td'):
                         if len(td.cssselect('a')) == 2:
                             try:
                                 player = {}
                                 player[keys.entity_team] = 'World Leaders'
                                 player[keys.entity_position] = parse.csstext(td.cssselect('a')[0]).split('\xc2')[0]
                                 player[keys.entity_name] = parse.csstext(td.cssselect('a')[1])
                                 player[keys.entity_profile] = fixed.clean_url('http://en.wikipedia.org' + td.cssselect('a')[1].attrib['href'])
                                 player[keys.entity_country] = country
                                 players.append(player)
                             except Exception as e:
                                 print 'world leader exception:', e                        
             except Exception as e:
                 print 'world leader exception:', e
     players.append({keys.entity_twitter: 'UN', keys.entity_profile: 'team:World Leaders'})
     return players
Example #3
0
 def callbackExtractGovernors(self, h):
     try:
         governors = []
         doc = html.document_fromstring(h)
         h2 = doc.cssselect('h2 span[id="State_governors"]')[0].getparent()
         while h2.tag != 'table':
             h2 = h2.getnext()
         #n = doc.xpath('/html/body/div[3]/div[4]/div[4]/div/table[1]')[0]
         for tr in h2.cssselect('tr'):
             g = {}            
             try:
                 
                 g[keys.entity_team] = 'Governors'
                 g[keys.entity_flag] = 'http:' + tr[0].xpath('div[1]/a/img')[0].attrib['src']
                 g[keys.entity_state] = tr[0].xpath('div[2]/a')[0].text
                 try:
                     g[keys.entity_pic] = 'http:' + tr[1].find('a').find('img').attrib['src']
                 except:
                     pass
                 g[keys.entity_name] = tr[2].cssselect('center span.vcard a')[0].attrib['title']
                 #t = tr[2].find(".//a")
                 g[keys.entity_profile] = fixed.clean_url('http://en.wikipedia.org' + tr[2].cssselect('center span.vcard a')[0].attrib['href'])                
                 g[keys.entity_party] = tr[4].find('a').text
                 g[keys.entity_prior_exp] = parse.csstext(tr[5])
                 g[keys.entity_assumed_office] = parse.csstext(tr[6])
                 g[keys.entity_term_expires] = parse.csstext(tr[7])
                 governors.append(g)
             except:
                 pass
         governors.append({ keys.entity_twitter: 'NatlGovsAssoc', keys.entity_profile: 'team:Governors'})
         return governors
     except Exception as e:
         print e
Example #4
0
 def pullteams(self, h):
     players=[]
     doc = html.document_fromstring(h) 
     h3 = doc.cssselect('h3 ~ ul li')
     print 'h3 length:', len(h3)
     h2 = doc.cssselect('h2 ~ ul li')
     print 'h2 length:', len(h2)
     h3.extend(h2)
     for li in h3:
         player = {}
         try:
             player[keys.entity_team] = 'PAC'
             player[keys.entity_topic] = parse.csstext(li.getparent().getprevious().getchildren()[0])
             if player[keys.entity_topic] != 'External links':
                 try:
                     href = li.cssselect('a')[0].attrib['href']
                     if not urlparse(href).scheme and href:
                         href = 'http://en.wikipedia.org' + href
                     player[keys.entity_profile] = fixed.clean_url(href)                    
                     player[keys.entity_name] = parse.csstext(li.cssselect('a')[0])
                     if player[keys.entity_name] and player[keys.entity_profile]:
                         if player[keys.entity_name].rfind(' - ') > 0:
                             player[keys.entity_location] = player[keys.entity_name][player[keys.entity_name].rfind(' - ') + 3:]
                             player[keys.entity_name] = player[keys.entity_name][:player[keys.entity_name].rfind(' - ')]
                         #print player
                         players.append(player)
                 except Exception as e2:
                     print 'exception inner:', e2
         except: 
             pass
     players.append({keys.entity_profile: 'team:PAC', keys.entity_twitter: 'FEC'})
     return players
Example #5
0
 def entities(self):
     """Build the gambler list and hand it back through ``defer.returnValue``.

     Generator that yields deferreds (presumably run under Twisted's
     ``inlineCallbacks``; the decorator is not visible here -- TODO confirm).

     Steps:
       1. extend ``self.gamblers`` with ``BOOTH().table()``;
       2. open the wsop.com players page, wait 5 seconds, evaluate the
          JavaScript expression 'data' with ``self.cb`` as callback and wait
          on ``self.done``;
       3. for every player id in ``must_haves`` whose profile URL is not yet
          in ``self.gamblers``, scrape that profile page and append it.
     """
     self.gamblers.extend(BOOTH().table())
     # load the page, then give it 5 seconds before continuing
     yield self.cv.goto_url('http://www.wsop.com/players/').addCallback(lambda ign: task.deferLater(reactor, 5, defer.succeed, True))
     self.cv.page().runJavaScript('data', self.cb)
     # NOTE(review): self.done is presumably a Deferred fired from self.cb
     # once the page data has been processed -- confirm against self.cb.
     yield self.done
     # must_haves is iterated as a collection of collections of player ids
     for l in must_haves:
         for pid in l:
             purl = 'http://www.wsop.com/players/profile/?playerid=' + pid
             # the profile list is rebuilt on every check because
             # self.gamblers grows inside this loop
             if purl not in [g[keys.entity_profile] for g in self.gamblers]:
                 print purl
                 try:
                     html = yield self.cv.goto_url(purl).addCallback(self.cv.to_html)
                     missing_gambler = {}
                     missing_gambler[keys.entity_profile] = purl
                     missing_gambler[keys.entity_name] = parse.csstext(html.cssselect('div.iRight div.iRightContent h3')[0])
                     try:
                         missing_gambler[keys.entity_country] = html.cssselect('div.PPCountry')[0].cssselect('i')[0].attrib['title']
                     except:
                         # country is optional
                         pass
                     # first row of the totals table: bracelets, rings,
                     # cashes and earnings are read from its cells 0..3
                     tr = html.cssselect('table[id="PPtotals"] tr')[0]
                     missing_gambler[keys.entity_bracelets] = parse.csstext(tr[0].cssselect('b')[0])
                     missing_gambler[keys.entity_rings] = parse.csstext(tr[1].cssselect('b')[0])
                     missing_gambler[keys.entity_cashes] = parse.csstext(tr[2].cssselect('b')[0])
                     missing_gambler[keys.entity_earnings] = parse.csstext(tr[3].cssselect('b')[0])
                     print 'missing:', missing_gambler
                     self.gamblers.append(missing_gambler)
                 except Exception as e:
                     # a profile that fails to load or parse is skipped
                     print e
     defer.returnValue(self.gamblers)
Example #6
0
 def getplayers(self, html, team):
     team['players'] = []
     for a in html.cssselect('a.squadPlayerCard'):
         print a
         #print 'hey:', player_span, player_span.cssselect('div.playerPhoto img')[0].attrib
         player = {}
         player[keys.entity_profile] = fixed.clean_url(ipl_base +
                                                       a.attrib['href'])
         #print 'player 1:', player
         player[
             keys.
             entity_pic] = 'http://iplstatic.s3.amazonaws.com/players/210/' + a.cssselect(
                 'div.playerPhoto')[0].cssselect('img[data-player-id]')[
                     0].attrib['data-player-id'] + '.png'
         #print 'player 2:', player
         player[keys.entity_name] = parse.csstext(
             a.cssselect('p.player-name')[0])
         #print 'player 3:', player
         if len(a.cssselect('span.captain')) > 0:
             player[keys.entity_captain] = True
         if len(a.cssselect('span.overseas-player')) > 0:
             player[keys.entity_foreign] = True
         if len(a.cssselect('span.wicket-keeper')) > 0:
             player[keys.entity_position] = "Wicket Keeper"
         for li in a.cssselect('ul.stats li'):
             label = parse.csstext(li.cssselect('span.label')[0])
             value = parse.csstext(li.cssselect('span.value')[0])
             player[label.lower()] = value
         print player
         team['players'].append(player)
     print 'length of team:', len(team['players'])
     return team
Example #7
0
 def get_teams(self, html):
     teams = []
     for conference in html.cssselect('div.mt7'):
         conference_name = parse.csstext(
             conference.cssselect('div.headline')[0])
         conference_name = conference_name.lower().replace(' ', '').replace(
             '-', '').replace('americanathletic',
                              'aac').replace('midamerican', 'mac').replace(
                                  'conferenceusa',
                                  'cusa').replace('fbsindependents',
                                                  'fbsindependent')
         if conference_name == self.get_league_name():
             print 'conference:', conference_name, len(
                 conference.cssselect('section.TeamLinks'))
             for section in conference.cssselect('section.TeamLinks'):
                 team = {
                     'conference':
                     conference_name,
                     'link':
                     'http://espn.go.com' +
                     section.cssselect('a')[0].attrib['href']
                 }
                 for a in section.cssselect(
                         'div.TeamLinks__Links span.TeamLinks__Link a'):
                     if parse.csstext(a).lower() == 'roster':
                         roster_link = 'http://espn.go.com' + a.attrib[
                             'href']
                         team['roster_link'] = roster_link
                 teams.append(team)
     print teams
     return teams
Example #8
0
 def page_extract(self, html):
     actors = []
     for div in html.cssselect(
             'div.lister-list div.lister-item.mode-detail'):
         try:
             actor = {}
             actor[keys.entity_rank] = parse.csstext(
                 div.cssselect('span.lister-item-index.unbold.text-primary')
                 [0]).split('.')[0]
             actor[keys.entity_pic] = div.cssselect(
                 'div.lister-item-image a img')[0].attrib['src']
             actor[keys.entity_name] = parse.csstext(
                 div.cssselect('h3.lister-item-header a')[0]).strip()
             actor[keys.entity_profile] = fixed.clean_url(
                 'http://www.imdb.com' +
                 div.cssselect('div.lister-item-image a')[0].attrib['href'])
             actor[keys.entity_position] = parse.csstext(
                 div.cssselect('p.text-muted.text-small')[0]).split(
                     '|')[0].strip()
             actor[keys_hollywood.noted] = parse.csstext(
                 div.cssselect('p.text-muted.text-small a')[0]).strip()
             actor[keys_hollywood.noted_profile] = fixed.clean_url(
                 'http://www.imdb.com' + div.cssselect(
                     'p.text-muted.text-small a')[0].attrib['href'])
             print actor
             actors.append(actor)
         except Exception as e:
             print 'page_extract exception:', e
     return actors
Example #9
0
 def get_skater(self, html, community):
     """Append one skater dict per ranking row of ``html`` to ``community``.

     html      -- parsed document containing 'table.vitals.vitalsshrink'.
     community -- list that receives the skater dicts (mutated in place).

     The first table row is skipped.  Each remaining row provides rank,
     profile URL, name and country.  Age and points are read from the text
     following the ``<br>`` elements inside 'h3'; when two values cannot be
     read, the first one is stored as points only.  A headshot URL is added
     only if a HEAD request for it answers 200.  Returns None.
     """
     for tr in html.cssselect('table.vitals.vitalsshrink tr')[1:]:
         skater = {}
         # rank text minus its last two characters
         skater[keys.entity_rank] = parse.csstext(tr[0])[:-2]
         skater[keys.entity_profile] = fixed.clean_url(
             tr[1].cssselect('a')[0].attrib['href'])
         skater[keys.entity_name] = parse.csstext(
             tr[2].cssselect('a')[0]).replace(',', '')
         skater[keys.entity_country] = parse.csstext(
             tr[2].cssselect('a')[1])

         # Read both values before storing either one.  Previously the age
         # was stored first, so when the second value was missing the fallback
         # stored the very same token as points and left it behind as age too.
         breaks = []
         try:
             breaks = tr.cssselect('h3 br')
             age = breaks[0].tail.strip().split(' ')[1]
             points = breaks[1].tail.strip().split(' ')[1]
         except Exception:
             try:
                 skater[keys.entity_points] = breaks[0].tail.strip().split(' ')[1]
             except Exception:
                 # neither value is available; both stay unset
                 pass
         else:
             skater[keys.entity_age] = age
             skater[keys.entity_points] = points

         # the fifth '/'-separated piece of the profile URL names the headshot
         pic_url = 'https://theboardr.blob.core.windows.net/headshots/' + skater[
             keys.entity_profile].split('/')[4] + '_900.jpg'
         # NOTE(review): no timeout is passed, so this request can block
         # indefinitely on an unresponsive host.
         check = requests.head(pic_url,
                               headers={
                                   'User-Agent': 'curl/7.35.0',
                                   'Accept': '*/*'
                               },
                               verify=True)
         if check.status_code == 200:
             skater[keys.entity_pic] = pic_url
         skater[keys.entity_team] = self.skating
         community.append(skater)
Example #10
0
    def get_community(self, html, community, gender='Male'):
        trs = html.cssselect(
            'table[class="tableType-athlete hasGroups"]')[0].cssselect('tr')
        print 'community length:', len(trs)
        for tr in trs:
            player = {}
            try:

                player[keys.entity_rank] = parse.csstext(
                    tr.cssselect('td[class~="athlete-tour-rank"]')[0])
                #player[keys.entity_rank_change] = parse.csstext(tr.cssselect('td[class="athlete-tour-rank-change"]')[0])
                name_element = parse.csstext(
                    tr.cssselect('a[class="athlete-name"]')[0]).title()
                player[keys.entity_name] = name_element.replace(
                    'INJU', '').replace('RECO', '').strip()
                player[keys.entity_profile] = fixed.clean_url(
                    'http://www.worldsurfleague.com' +
                    tr.cssselect('a[class="athlete-name"]')[0].attrib['href'])
                player[keys.entity_origin] = tr.cssselect(
                    'span.athlete-country-flag')[0].attrib['title']
                player[keys.entity_points] = parse.csstext(
                    tr.cssselect('span[class="tour-points"]')[0])
                player[keys.entity_prizemoney] = parse.csstext(
                    tr.cssselect('td[class~="athlete-tour-prize-money"]')[0])
                if player[keys.entity_name]:
                    player[keys.entity_team] = self.surfing
                    community.append(player)
            except Exception as e:
                if keys.entity_name in player:
                    player[keys.entity_team] = self.surfing
                    community.append(player)
Example #11
0
def fc_standings(league_name, urls, teams, sub = {}):
    print 'league:', league_name, 'team length:', len(teams)
    for url in urls:
        fc_standings_html = yield cv.goto_url(url).addCallback(cv.to_html)
        print 'fc_standings html length:', len(fc_standings_html), url, cv.page().url().toString()
        team_tds = fc_standings_html.cssselect('tr.standings-row')
        print 'team tds:', len(team_tds)
        for i, team_td in enumerate(team_tds):
            rank = i + 1
            try:
                tn = parse.csstext(team_td.cssselect('span.team-names')[0])
                if tn in sub:
                    #print 'sub:', tn, sub[tn]
                    tn = sub[tn]
                
                wdl = team_td.cssselect('td[style="white-space:no-wrap;"]')
                wins = parse.csstext(wdl[0])
                ties = parse.csstext(wdl[1])
                losses = parse.csstext(wdl[2])

                record = wins + '-' + losses + '-' + ties
                print 'team:', tn, 'record:', record
                found = False
                for t in Entity().query_2(league__eq=league_name, profile__eq='team:' + tn):
                    found = True
                    t[keys.entity_record] = record
                    t[keys.entity_rank] = rank
                    print tn, rank, record
                    t.partial_save()
                if not found:
                    for t2 in Entity().query_2(league__eq=league_name, profile__beginswith='team:' + tn):
                        found = True
                        t2[keys.entity_record] = record
                        t2[keys.entity_rank] = rank
                        print tn, rank, record
                        t2.partial_save()
                if not found:
                    try:
                        potential_teams = [t3 for t3 in teams if tn in t3[keys.entity_profile]]
                        if len(potential_teams) == 1:
                            found = True
                            potential_teams[0]
                            potential_teams[0][keys.entity_record] = record
                            potential_teams[0][keys.entity_rank] = rank
                            print tn, rank, record
                            potential_teams[0].partial_save()
                    except:
                        pass
                if not found:
                    print '    missing:', tn, rank


                
            except Exception as e:
                print 'fc exception:', e
            #<span class="team-names">Barcelona</span>
        '''    
Example #12
0
    def sprintcup_drivers(self, html):
        drivers = []
        for tr in html.cssselect('table.driver-list-table tr')[1:]:
            '''
            <tr>
                <td class="driver-name-td"><a href="/drivers/driversaj-allmendinger/">Allmendinger, AJ</a></td>
                <td class="driver-number-td">47</td>
                <td class="driver-make-td"><img src="/wp-content/uploads/sites/7/2017/01/Chevy-Driver-Page-New-2-160x811-265x180.png"></td>
                <td class="driver-team-td">JTG Daugherty Racing</td>
    
            </tr>
            '''
            driver = {}
            driver[keys.entity_profile] = fixed.clean_url(
                NASCAR.nascar_url +
                tr.cssselect('td.driver-name-td a')[0].attrib['href'])
            driver[keys.entity_name] = parse.csstext(
                tr.cssselect('td.driver-name-td a')[0]).strip()
            driver[keys.
                   entity_name] = driver[keys.entity_name].split(',')[1].strip(
                   ) + ' ' + driver[keys.entity_name].split(',')[0].strip()

            driver[keys.entity_carnumber] = parse.csstext(
                tr.cssselect('td.driver-number-td')[0])
            team = parse.csstext(tr.cssselect('td.driver-team-td')[0]).strip()
            if not team:
                team = NASCAR.unaffiliated
            driver[keys.entity_team] = team

            driver[keys.entity_carnumber] = parse.csstext(
                tr.cssselect('td.driver-number-td')[0])
            driver[keys.entity_circuit] = self.get_common_name()
            print driver
            drivers.append(driver)
            #driver[keys.entity_rank] = parse.csstext(div.cssselect('div.position')[0]).strip()
            #driver[keys.entity_name] = parse.csstext(div.cssselect('div.driver div.driver-first')[0]).split() + ' ' + parse.csstext(div.cssselect('div.driver div.driver-last')[0]).split()

            #<div class="driver"><div class="driver-first"> Martin</div><div class="driver-last">Truex Jr.</div><div class="legend-symbols"></div></div>

            #tr = driver_art.getparent().getparent().getparent().getparent().getparent()

            #driver[keys.entity_points] = parse.csstext(tr.cssselect('td')[3])
            #driver[keys.entity_points_behind] = parse.csstext(tr.cssselect('td')[4]).replace('--','')
            #driver[keys.entity_starts] = parse.csstext(tr.cssselect('td')[5])
            #driver[keys.entity_wins] = parse.csstext(tr.cssselect('td')[6])
            #driver[keys.entity_top5] = parse.csstext(tr.cssselect('td')[7])
            #driver[keys.entity_top10] = parse.csstext(tr.cssselect('td')[8])
            #driver[keys.entity_dnf] = parse.csstext(tr.cssselect('td')[9])

            #if not team:
            #    team = NASCAR.unaffiliated
            #elif 'team:' + team not in [t[keys.entity_profile] for t in drivers]:
            #    drivers.append({ keys.entity_profile: 'team:' + team })

            driver[keys.entity_circuit] = self.get_common_name()
            drivers.append(driver)
        return drivers
Example #13
0
 def playerinfo(self, html, player):
     """Copy labelled values from ``html`` into the ``player`` dict.

     Two layouts are read: 'td' cells with class "label" (skipped when their
     text is numeric) and 'p' elements with class "qsHeader".  In both cases
     the value is the text of the element's next sibling and the dict key
     comes from ``self.key_lookup``.  Returns None; ``player`` is mutated.
     """
     for label_cell in html.cssselect('td[class="label"]'):
         label = parse.csstext(label_cell)
         if label.isnumeric():
             continue
         text = parse.csstext(label_cell.getnext())
         player[self.key_lookup(label.lower())] = text

     for header in html.cssselect('p[class="qsHeader"]'):
         text = parse.csstext(header.getnext())
         # NOTE(review): the element itself, not its lower-cased text, is
         # passed to key_lookup here, unlike the loop above -- confirm this
         # is intended.
         player[self.key_lookup(header)] = text
Example #14
0
    def getComponents(self):
        """Scrape the WSJ billion-dollar-club table into a list of dicts.

        Generator that yields deferreds and finishes with
        ``defer.returnValue`` (presumably run under Twisted's
        ``inlineCallbacks``; the decorator is not visible here -- TODO
        confirm).

        For every row of the data table the summary columns are read first.
        Then the JavaScript snippet ``js_link % str(i)`` is run, the page is
        re-read one second later, and the details are taken from the first
        'tr.card-tr' element of that fresh snapshot.
        """
        components = []
        # load the page, then give it 5 seconds before reading it
        yield self.cv.goto_url(
            'http://graphics.wsj.com/billion-dollar-club/').addCallback(
                lambda ign: task.deferLater(reactor, 5, defer.succeed, True))
        html = yield self.cv.to_html()
        for i, company in enumerate(
                html.cssselect('table[id="data-table"] tbody tr')):
            player = {keys.entity_team: 'WSJ Billion Dollar Startup'}
            # rank is the 1-based position of the row in the table
            player[keys.entity_rank] = i + 1
            player[keys.entity_name] = parse.csstext(
                company.cssselect('td.company')[0]).strip()
            player[keys.entity_valuation] = parse.csstext(
                company.cssselect('td.valuation')[0])
            player[keys.entity_total_funding] = parse.csstext(
                company.cssselect('td.total_funding')[0])
            player[keys.entity_last_valuation] = parse.csstext(
                company.cssselect('td.val_date')[0])

            # NOTE(review): js_link is defined elsewhere; presumably it opens
            # the detail card of row i -- confirm.
            self.cv.page().runJavaScript(js_link % str(i))

            # wait one second, then take a fresh snapshot of the page
            d = task.deferLater(reactor, 1, defer.succeed, True)
            d.addCallback(self.cv.to_html)
            html2 = yield d
            dets = html2.cssselect('tr.card-tr')[0]
            try:
                player[keys.entity_rounds] = parse.csstext(
                    dets.cssselect('div[class="rounds co-info"]')[0][1])
            except:
                # the rounds entry is optional
                pass
            for ceo in dets.cssselect('div[class="ceo co-info"]'):
                ceo_txt = parse.csstext(ceo).replace('CEO:', '')
                # cut the text at the first founder annotation, whichever
                # spelling is used (the ', founder' split appears twice; the
                # second one has no further effect)
                c = ceo_txt.split('(co-founder)')[0].split('(founder)')[
                    0].split(', founder')[0].split(', founder')[0].split(
                        '(co-founders)')[0].split(', co-founder')
                # several CEOs are separated by ' and '
                for rceo in c[0].split(' and '):
                    rceo = rceo.strip()
                    if keys.entity_ceo not in player:
                        player[keys.entity_ceo] = [rceo]
                    else:
                        player[keys.entity_ceo].append(rceo)
            player[keys.entity_ratio] = parse.csstext(
                dets.cssselect('div[class="ratio co-info"] span[class="val"]')
                [0])
            player[keys.entity_location] = parse.csstext(
                dets.cssselect(
                    'div[class="location co-info"] span[class="val"]')[0])
            player[keys.entity_competitors] = parse.csstext(
                dets.cssselect(
                    'p[class="competitors co-info"] span[class="val"]')[0])
            player[keys.entity_investors] = parse.csstext(
                dets.cssselect(
                    'p[class="investors co-info"] span[class="val"]')[0])
            components.append(player)
        defer.returnValue(components)
Example #15
0
    def getICOs(self):
        html = yield self.cv.goto_url(
            'https://coinmarketcap.com/all/views/all/').addCallback(
                self.cv.to_html)
        trs = html.cssselect(
            'div.table-responsive.compact-name-column div.dataTables_wrapper.no-footer table tr'
        )
        icos = []
        for tr in trs[1:][:1200]:

            name = parse.csstext(tr.cssselect('a.currency-name-container')[0])
            rank = parse.csstext(tr[0])
            symbol = parse.csstext(tr.cssselect('td.col-symbol')[0])

            try:
                href = tr.cssselect('span.currency-symbol a')[0].attrib['href']
                profile = fixed.clean_url('http://coinmarketcap.com' + href)

                print 'rank:', rank, 'name:', name, 'sybol:', symbol
                ico = {
                    keys.entity_name: name,
                    keys.entity_profile: profile,
                    keys_market.symbol: symbol,
                    keys.entity_rank: rank
                }

                try:
                    market_cap = twitter_keys.numTwitter(
                        int(
                            parse.csstext(
                                tr.cssselect('td.no-wrap.market-cap.text-right'
                                             )[0]).replace('$', '').replace(
                                                 ',', '').strip()))
                    ico[keys.entity_market_cap] = market_cap
                except:
                    pass
                try:
                    supply = twitter_keys.numTwitter(
                        int(
                            parse.csstext(
                                tr.cssselect(
                                    'td.no-wrap.text-right.circulating-supply')
                                [0]).replace('*', '').replace(',',
                                                              '').strip()))
                    ico[keys.entity_circulating_supply] = supply
                except:
                    pass

                icos.append(ico)
            except:
                pass
        defer.returnValue(icos)
Example #16
0
 def update_player(self, html, player):
     for info in html.cssselect('div.personalLists ul li div.info'):
         label = parse.csstext(info.getprevious())
         if label == 'Weight':
             player[keys.entity_weight] = parse.csstext(info)
         elif label == 'Height':
             player[keys.entity_height] = parse.csstext(info)
         elif label == 'Date of Birth':
             player[keys.entity_dob] = parse.csstext(info)
         elif label == 'Age':
             player[keys.entity_age] = parse.csstext(info)
     print ''
     print player
     print ''
Example #17
0
 def add_players(self, html, team):
     for li in html.cssselect('ul.squadListContainer.squadList > li'):
         player = {}
         player[keys.entity_profile] = self.bpl_url + li.cssselect(
             'a.playerOverviewCard')[0].attrib['href']
         player[keys.entity_name] = parse.csstext(
             li.cssselect('h4.name')[0])
         player[keys.entity_jersey] = parse.csstext(
             li.cssselect('span.number')[0])
         player[keys.entity_position] = parse.csstext(
             li.cssselect('span.position')[0])
         try:
             player[keys.entity_nationality] = parse.csstext(
                 li.cssselect(
                     'li.nationality dl dd.info span.playerCountry')[0])
         except:
             pass
         for l in li.cssselect('ul.squadPlayerStats li dl dd.info'):
             label = parse.csstext(l.getprevious())
             if label == 'Appearances':
                 player[keys.entity_appearances] = parse.csstext(l)
             elif label == 'Goals':
                 player[keys.entity_goals] = parse.csstext(l)
             elif label == 'Assists':
                 player[keys.entity_assists] = parse.csstext(l)
         try:
             player[keys.entity_pic] = 'http:' + li.cssselect(
                 'img.statCardImg')[0].attrib['src']
         except:
             pass
         print 'player:', player
         team['players'].append(player)
Example #18
0
 def thefundedTopRatedVCs(self, html):
     components = []
     for rank in html.cssselect('div[id="post"]')[0].cssselect(
             'p[class="larger red"]'):
         player = {keys.entity_team: 'TheFunded Top Partners'}
         player[keys.entity_rank] = parse.csstext(rank)[:-1]
         player[keys.entity_name] = parse.csstext(
             rank.getnext().cssselect('a')[0])
         player[keys.entity_profile] = fixed.clean_url(
             'http://www.thefunded.com' +
             rank.getnext().cssselect('a')[0].attrib['href'])
         player[keys.entity_firm] = parse.csstext(
             rank.getparent().cssselect('a[class="fund"]')[0])
         print 'player:', player
         components.append(player)
     return components
Example #19
0
 def get_teams_links(self, doc):    
     teams = []
     for a in doc.cssselect('div.team__list_wrapper div.team__list a'):
         href = a.attrib['href']
         teams.append({'link': NBA.nba_url + href, 'team': parse.csstext(a) })
     print 'nba links:', teams
     return teams
Example #20
0
def nfl_standings():
    nfl_standings = yield cv.goto_url('http://www.espn.com/nfl/standings').addCallback(cv.to_html)
    for span in nfl_standings.cssselect('span span.team-names'):
        td = span.getparent().getparent().getparent()
        wins = parse.csstext(td.getnext())
        losses = parse.csstext(td.getnext().getnext())
        ties = parse.csstext(td.getnext().getnext().getnext())
        tn = parse.csstext(span)
        try:
            record = wins + '-' + losses + '-' + ties
            t = Entity().get_item(league='nfl', profile='team:' + tn)
            t[keys.entity_record] = record
            print tn, record
            t.partial_save() 
        except Exception as e:
            print e
Example #21
0
    def getTweetKit(self, msg):
        if self.page().url().toString() == 'http://twitter.com':
            yield self.goto_url('http://twitter.com')

        qt5.app.clipboard().setText(msg)
        self.page().runJavaScript(js_key)
        yield task.deferLater(reactor, 1, defer.succeed, True)

        self.page().triggerAction(QWebEnginePage.SelectAll)
        self.page().triggerAction(QWebEnginePage.Paste)

        yield task.deferLater(reactor, 1, defer.succeed, True)
        html = yield self.to_html()
        while len(
                html.cssselect(
                    'span[class="tweet-counter superwarn max-reached"]')) > 0:
            print 'bad:', parse.csstext(
                html.cssselect(
                    'span[class="tweet-counter superwarn max-reached"]')[0])
            msg = msg.rsplit(' ', 1)[0]
            qt5.app.clipboard().setText(msg)
            self.page().triggerAction(QWebEnginePage.SelectAll)
            self.page().triggerAction(QWebEnginePage.Paste)
            yield task.deferLater(reactor, .5, defer.succeed, True)
            html = yield self.to_html()
        parse.dumpit(html, '/tmp/tweet_trim.html')
        defer.returnValue(msg)
Example #22
0
def nhl_standings():
    nhl_standings = yield cv.goto_url('https://www.nhl.com/standings').addCallback(cv.to_html)    
    for span in nhl_standings.cssselect('a span.team--name'):
        try:
            tn = parse.csstext(span)
            td = span.getparent().getparent().getparent()
            wins = parse.csstext(td.getnext().getnext())
            losses = parse.csstext(td.getnext().getnext().getnext())
            ot = parse.csstext(td.getnext().getnext().getnext().getnext())
            record = wins + '-' + losses + '-' + ot
            for t in Entity().query_2(league__eq='nhl', profile__beginswith='team:' + tn):
                t[keys.entity_record] = record
                print tn, record
                t.partial_save() 
        except Exception as e:
            print e
Example #23
0
    def scrape_divisions(self, html, divisions):
        print 'scrape_division:', divisions
        players = []
        mens = True
        for wc in divisions:
            wccss = wc.replace(" ", "_").replace("'", ".27")
            if "Women's" in wc:
                mens = False
            wc = wc.replace("Women's ", "").capitalize()
            print 'team:', wc, wccss
            css_string = 'span[id^="' + wccss + '"]'
            print css_string
            try:
                css = CSSSelector(css_string)(html)[0]
            except:
                css_string = css_string.replace(wc.lower(), wc)
                print css_string
                css = CSSSelector(css_string)(html)[0]
            t = css.getparent()
            while t.tag != 'table':
                t = t.getnext()
            print 'finally:', t.tag
            for tr in t.findall('.//tr')[2:][:-1]:
                #print etree.tostring(tr)
                #country = parse.csstext(tr.findall('.//td')[0])
                fighter = {}
                try:
                    fighter[keys.entity_nickname] = parse.csstext(
                        tr.find('.//td[3]/i'))
                except:
                    pass

                fighter[keys.entity_gender] = 'Male' if mens else 'Female'
                #fighter[keys.entity_origin] = country
                try:
                    a = tr.cssselect('td span.vcard span a')[0]
                    fighter[keys.entity_name] = parse.csstext(a)
                except:
                    fighter[keys.entity_name] = parse.csstext(
                        tr.find('.//td[1]'))
                if '(C)' in parse.csstext(tr):
                    fighter['titleholder'] = 'yes'
                fighter[keys.entity_weightclass] = wc
                print fighter
                players.append(fighter)
        print 'done: figther len', len(players)
        return players
Example #24
0
    def get_community(self, html, community):
        subteams = {}
        for cycling_team in html.cssselect('.team_box')[0].cssselect('ul li'):
            jersey_pic = cycling_team.cssselect('a img')[0].attrib['src']
            thref = 'http://www.cyclingnews.com' + cycling_team.cssselect(
                'a')[0].attrib['href']
            print 'team url:', thref
            subteam = {}
            subteam[keys.entity_jersey_pic] = jersey_pic
            subteams[thref] = subteam
        for k, st in subteams.iteritems():
            d = self.cv.goto_url(k)
            d.addCallback(self.cv.to_html)
            d.addErrback(self.error_league)
            subhtml = yield d
            team_name = parse.csstext(
                subhtml.cssselect('div[class="team-name"]')[0])
            st[keys.entity_profile] = 'team:' + team_name
            print 'cycle team:', st

            for rider in subhtml.cssselect('div.riders div.rider'):
                player = {}
                player[keys.entity_team] = team_name
                player[keys.entity_name] = parse.csstext(
                    rider.cssselect('a')[0])
                player[keys.entity_profile] = fixed.clean_url(
                    'http://www.cyclingnews.com' +
                    rider.cssselect('a')[0].attrib['href'])
                #print 'found one!:', player
                community.append(player)
        for p in community:
            d = self.cv.goto_url(p[keys.entity_profile] + "/")
            d.addCallback(self.cv.to_html)
            d.addErrback(self.error_league)
            riderhtml = yield d
            try:
                rider = riderhtml.cssselect('rider-info-boxout')[0]
                p[keys.entity_pic] = rider.cssselect(
                    'img.rider-image')[0].attrib['src']
                p[keys.entity_dob] = parse.csstext(
                    rider.cssselect('span[itemprop="birthDate')[0])
                p[keys.entity_nationality] = parse.csstext(
                    rider.cssselect('span[itemprop="nationality')[0])
            except:
                pass
        community.extend(subteams.values())
        defer.returnValue(community)
Example #25
0
 def process_team(self, doc, team):
     print 'process team'
     team['players'] = []
     for section in doc.cssselect('section.row.nba-player-index__row'):            
         for p in section.cssselect('section.nba-player-index__trending-item'):
             player = {}
             player[keys.entity_jersey] = parse.csstext(p.cssselect('span.nba-player-trending-item__number')[0])
             anchor = p.cssselect('a')[0]
             player[keys.entity_name] = anchor.attrib['title']
             player[keys.entity_profile] = fixed.clean_url(NBA.nba_url + anchor.attrib['href'])
             player[keys.entity_pic] = 'http:' + anchor.cssselect('div.nba-player-index__image div.nba-player-index__headshot_wrapper img')[0].attrib['data-src']
             player[keys.entity_position] = parse.csstext(p.cssselect('div.nba-player-index__details span')[0])
             player[keys.entity_height] = parse.csstext(p.cssselect('div.nba-player-index__details strong')[0]).split(' ')[0] + '\' ' + parse.csstext(p.cssselect('div.nba-player-index__details strong')[1]).split(' ')[0] + '\"'
             player[keys.entity_weight] = parse.csstext(p.cssselect('div.nba-player-index__details strong')[2])
             team['players'].append(player)
     print 'team:', team['team'], 'players length:', len(team['players'])
     return team
Example #26
0
 def innerHtml(self, frag, ico):
     """Copy the handle from a timeline header fragment into ``ico``.

     The fragment is parsed, the highlighted link inside the timeline header
     title is located, and the text after the first ``@`` is stored under
     ``keys.entity_twitter`` when it is non-empty.

     :param frag: HTML fragment string.
     :param ico: dict updated in place.
     """
     selector = 'h1.timeline-Header-title.u-inlineBlock a.customisable-highlight'
     header_link = soupparser.fromstring(frag).cssselect(selector)[0]
     handle = parse.csstext(header_link).split('@')[1]
     if not handle:
         return
     ico[keys.entity_twitter] = handle
Example #27
0
 def callbackExtractHouse(self, h):
     """Build one dict per member row of the "Voting members by state" table.

     ``h`` is parsed as an HTML document.  The table is taken to be the
     element that follows the parent of the ``h2 span`` whose id is
     ``Voting_members_by_state``.  Every row after the first is turned into
     a dict; a row that raises anywhere is skipped silently.  A final entry
     describing the chamber itself is always appended.

     :param h: HTML source of the page.
     :returns: list of member dicts followed by the chamber entry.
     """
     representatives = []
     doc = html.document_fromstring(h)
     table = doc.cssselect('h2 span[id="Voting_members_by_state"]')[0].getparent().getnext()
     trs = table.cssselect('tr')
     # trs[0] is skipped (presumably the header row).
     for tr in trs[1:]:
         try:
             congress = {}
             congress[keys.entity_team] = 'House of Representatives'
             # Link text of the first cell minus its last space-separated
             # token (presumably the district number -- TODO confirm).
             s = parse.csstext(tr[0].cssselect("a")[0]).split(" ")[:-1]
             try:
                 # NOTE(review): tokens produced by split(" ") cannot contain
                 # a space, so remove(' at') looks like it always raises and
                 # remove('At') is never reached -- confirm intent.
                 s.remove(' at')
                 s.remove('At')
             except:
                 pass
             congress[keys.entity_state] = ' '.join(s)
             # Drop a trailing " at" left over from the joined tokens.
             if congress[keys.entity_state].endswith(' at'):
                 congress[keys.entity_state] = congress[keys.entity_state][:-3]
             try:
                 # The picture is optional; rows without one keep no pic key.
                 congress[keys.entity_pic] = 'http:' + tr[1].cssselect("a img")[0].attrib['src']
             except:
                 pass
             congress[keys.entity_name] = tr[1].cssselect('span.vcard a')[0].text
             congress[keys.entity_profile] = fixed.clean_url('http://en.wikipedia.org' + tr[1].cssselect('span.vcard a')[0].attrib['href'])
             # Rows with 9 cells carry their own party cell; rows with 7
             # cells reuse the party of the previously accepted row.
             if len(tr) == 9:
                 congress[keys.entity_party] = parse.csstext(tr[3])
             elif len(tr) == 7:
                 congress[keys.entity_party] = representatives[-1][keys.entity_party]
             # The remaining fields are addressed from the end of the row so
             # they work for both row widths.
             congress[keys.entity_prior_exp] = parse.csstext(tr[-5])
             congress[keys.entity_college] = parse.csstext(tr[-4])
             try:
                 ao = tr[-3].text
                 if '*' in ao:
                     ao = ao.replace('*', '')
                 congress[keys.entity_assumed_office] = ao.strip()
             except:
                 # Assumed-office text is optional (e.g. when .text is unusable).
                 pass
             congress[keys.entity_born] = parse.csstext(tr[-1]).strip()
             
             representatives.append(congress)
             
         except:
             # Any failure above drops the whole row without reporting it.
             pass
     # Entry for the chamber itself, appended after all member rows.
     representatives.append({keys.entity_twitter: 'USHouseHistory', keys.entity_profile: 'team:House of Representatives' })        
     return representatives
Example #28
0
 def getteams(self, html):
     """Return one team dict per link inside the ``equipos`` block.

     Each dict has the link target under ``'href'``, the team name under
     ``'team'`` (a leading ``R.`` is expanded to ``Real``) and an empty
     ``'players'`` list.

     :param html: parsed page exposing ``cssselect``.
     :returns: list of team dicts, in document order.
     """
     found = []
     for link in html.cssselect('div[class="equipos"] a'):
         label = parse.csstext(link)
         name = 'Real' + label[2:] if label[:2] == 'R.' else label
         found.append({'href': link.attrib['href'], 'team': name, 'players': []})
     return found
Example #29
0
 def is_born(self, html, maybeperson, url):
     """Fill name, birth date and profile on ``maybeperson`` from a page.

     The name comes from the biography infobox heading, or from the page's
     first heading when the infobox lookup fails.  For every ``th`` whose
     lower-cased text is ``born`` or ``date of birth``, the ``bday`` span in
     the same row (if any) supplies the birth date, and the cleaned ``url``
     is stored as the profile.

     :param html: parsed page exposing ``cssselect``.
     :param maybeperson: dict updated in place.
     :param url: page URL used for the profile entry.
     """
     try:
         heading = html.cssselect(
             'table[class="infobox biography vcard"] tr th span')[0]
         maybeperson[keys.entity_name] = parse.csstext(heading)
     except:
         heading = html.cssselect(
             'h1[id="firstHeading"][class="firstHeading"]')[0]
         maybeperson[keys.entity_name] = parse.csstext(heading)

     birth_labels = ('born', 'date of birth')
     for header_cell in html.cssselect('th'):
         if parse.csstext(header_cell).lower() not in birth_labels:
             continue
         try:
             bday = header_cell.getparent().cssselect('span[class="bday"]')[0]
             maybeperson[keys.entity_dob] = parse.csstext(bday).replace(')', '')
         except:
             # No machine-readable birthday in this row.
             pass
         maybeperson[keys.entity_profile] = fixed.clean_url(url)
Example #30
0
def nba_standings():
    nba_standings = yield cv.goto_url('http://www.espn.com/nba/standings').addCallback(cv.to_html)    
    for span in nba_standings.cssselect('span span.team-names'):
        try:
            tn = parse.csstext(span)
            td = span.getparent().getparent().getparent()
            wins = parse.csstext(td.getnext())
            losses = parse.csstext(td.getnext().getnext())
            record = wins + '-' + losses
            found = False
            for t in Entity().query_2(league__eq='nba', profile__eq='team:' + tn):
                found = True
                t[keys.entity_record] = record
                print tn, record
                t.partial_save()
            if not found:
                print 'missing:', tn 
        except Exception as e:
            print e