def parseLeague(self, response):
    """Collect football tournament ids, filter out unwanted leagues,
    and issue one bulk AJAX request covering all remaining leagues.

    Probably most efficient to scrape all tournIds then build one big
    request for all leagues after filtering bad leagues.
    """
    lnames = response.xpath(
        '//li[@id="sport_2"]/div[@class="subCategories"]'
        '/ul/li/label/text()').extract()
    lids = response.xpath('//li[@id="sport_2"]/div[@class="subCategories"]'
                          '/ul/li/input/@id').extract()
    # Ids come back as e.g. 'checkboxTournament_5406'; chop the 19-char
    # prefix. (Renamed the loop var: `id` shadowed the builtin.)
    lids = [lid[19:] for lid in lids]
    # Pair names with ids so we can filter on the league name.
    lpairs = zip(lnames, lids)
    lids = [
        lid for (lname, lid) in lpairs if not linkFilter(self.name, lname)
    ]
    # Build a single GET request for every remaining league; joining is
    # linear, unlike repeated string +=.
    base_url = 'https://tonybet.com/cached_sports/football?'
    GETstr = 'country=gb&eo_format=eu&'
    GETstr += ''.join('tournaments_ids[]=%s&' % lid for lid in lids)
    GETstr += 't=t'
    headers = {
        'Referer': 'https://tonybet.com/football',
        'X-Requested-With': 'XMLHttpRequest',
        'Host': 'tonybet.com'
    }
    yield Request(url=base_url + GETstr, headers=headers,
                  callback=self.pre_parseData)
def parse(self, response):
    """Grab soccer league links from the odds menu and request the
    win-draw-win coupon page for each league that passes the filter."""
    # Restrict to the single <li> that contains the soccer section.
    soccer_li = response.xpath(
        '//div[@id="oddsmenu-inner"]/ul[@class="parent"]/'
        'li[descendant::div[@class="section "]/a[@id="betclass_soccer"]]')
    raw_links = soccer_li.xpath(
        'ul[@class="child"]/li/ul/li//a/@href').extract()
    # linkFilter returns True for links we want to drop.
    wanted_links = [
        href for href in raw_links if not linkFilter(self.name, href)
    ]
    base_url = 'https://sports.betway.com/?u='
    headers = {
        'Referer': 'https://sports.betway.com/',
        'X-Requested-With': 'XMLHttpRequest',
        'Host': 'sports.betway.com',
    }
    for href in wanted_links:
        yield Request(url=base_url + href + '&m=win-draw-win',
                      headers=headers, callback=self.parse_Data)
def parseLeague(self, response):
    """Pair league names with their links, filter by name, and request
    each remaining league page.

    If needed, the last cookie set can be read back with
    request.headers.getlist('Cookie').
    """
    name_xpath = ('//ul[@id="sb-sportlist"]/li[1]/ul[@id="lg1"]/li/'
                  'div[@class="line"]/a/text()')
    href_xpath = ('//ul[@id="sb-sportlist"]/li[1]/ul[@id="lg1"]/li/'
                  'div[@class="line"]/a/@href')
    league_names = response.xpath(name_xpath).extract()
    league_hrefs = response.xpath(href_xpath).extract()
    # Filter on the league NAME but keep the matching href.
    keep = [
        href
        for (league_name, href) in zip(league_names, league_hrefs)
        if not linkFilter(self.name, league_name)
    ]
    # There seems to be a cookie problem here: requests without cookies
    # get a 302 redirect that is not coped with well. Why are the
    # cookies not working consistently?
    headers = {
        'Referer': 'http://www.oddsring.com',
        'Host': 'www.oddsring.com'
    }
    for href in keep:
        yield Request(url=href, headers=headers,
                      callback=self.parseData, dont_filter=True)
def parse_leagues(self, response):
    """Parse the JSON league listing, filter competitions by name, and
    POST one request per remaining competition for its 1x2 market."""
    # Extract the needed params from the JSON response.
    try:
        jResp = json.loads(response.body)
    except ValueError:
        # Malformed/truncated body (lost connection perhaps?).
        # Log and STOP: the original code did `yield []` here, which
        # yielded an empty list and then fell through to use the
        # unbound `jResp`, raising NameError.
        log.msg('lostconn perhaps?', level=log.ERROR)
        log.msg('response dump: \n%s' % response.body, level=log.ERROR)
        return
    # The 'mod' key holds a nested JSON document (list of countries).
    jsonCountries = json.loads(jResp['mod'])
    # Reap the comp ids (cids), keeping the name for filtering.
    cids = [(comp['n'], comp['id'])
            for country in jsonCountries
            for comp in country['c']]
    # Filter the comps by name.
    cids = [(cname, cid) for (cname, cid) in cids
            if not linkFilter(self.name, cname)]
    base_url = 'http://sb.188bet.co.uk/en-gb/Service/CentralService?GetData'
    headers = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
               'X-Requested-With': 'XMLHttpRequest'}
    for (name, cid) in cids:
        # Seems the referer also gets set to the requested cid.
        # Would be nice to test whether 1x2 is available before making
        # the request; otherwise parse_Data will lack the needed keys.
        formdata = {'reqUrl': '/en-gb/sports/1/competition/1x2?competitionids='+str(cid)+'&'}
        headers['Referer'] = 'http://sb.188bet.co.uk/en-gb/sports/1/competition/1x2?competitionids='+str(cid)
        yield FormRequest(url=base_url, formdata=formdata, headers=headers,
                          meta={'league_name': name},
                          callback=self.parse_Data, dont_filter=True)
def parse(self, response):
    """Scrape league checkbox ids, filter by league name, and issue a
    single combined betting-offer request for all remaining leagues."""
    log.msg('Grabbing all checkbox ids..')
    cbids = response.xpath('//table[@id="TBL_Content_Leagues"]'
                           '/tr/td/input/@id').extract()
    # Ids look like 'cb12345'; drop the 'cb' prefix.
    leagueids = [cbid[2:] for cbid in cbids]
    leaguenames = response.xpath('//table[@id="TBL_Content_Leagues"]'
                                 '/tr/td/a/text()').extract()
    lpairs = zip(leagueids, leaguenames)
    # linkFilter returns True for leagues to drop. (Renamed the
    # comprehension var: `id` shadowed the builtin.)
    leagueids = [
        lid for (lid, lname) in lpairs if not linkFilter(self.name, lname)
    ]
    base_url = 'https://www.interwetten.com/en/SportsBook/Betting/BettingOffer.aspx'
    GETstr = '?leagueid=' + ','.join(leagueids) + '&type=0&ogPreselect=1'
    headers = {
        'Host': 'www.interwetten.com',
        'Referer': 'https://www.interwetten.com/en/sportsbook/o/10/fussball',
    }
    yield Request(url=base_url + GETstr, headers=headers,
                  callback=self.parse_ListMatches, dont_filter=True)
def parse_leagues(self, response):
    """Locate the 'Competitions' section, filter its league links, and
    request each remaining league page."""
    # Find the competitions section. The original left `compSec`
    # unbound (NameError) when no section matched; guard for that and
    # stop scanning at the first match.
    compSec = None
    for sec in response.xpath('//div[@class="section"]'):
        if sec.xpath('h3[@class="hecto"]/text()').extract() == [
                u'Competitions'
        ]:
            compSec = sec
            break
    if compSec is None:
        # Fail soft instead of crashing the callback.
        log.msg('No Competitions section at %s' % response.url,
                level=log.ERROR)
        return
    leagues = compSec.xpath(
        'ul[@class="limit-list"]//li/a/@href').extract()
    # linkFilter returns True for links to drop.
    leagues = [
        league for league in leagues
        if not linkFilter(self.name, league)
    ]
    # Request each remaining league.
    base_url = 'http://www.skybet.com'
    headers = {'Referer': 'http://www.skybet.com/football'}
    for league in leagues:
        yield Request(url=base_url + league, headers=headers,
                      callback=self.pre_parse_Data)
def traverseNav(self, response):
    '''
    Recursive callback: keeps requesting deeper levels of the
    bonavigation tree until a level with market groups is reached,
    then requests the XML data for each surviving market group.
    '''
    log.msg('traverseNav is at %s' % response.url, level=log.INFO)
    bonav_nodes = response.xpath('//bonavigationnodes/bonavigationnode')
    markets = response.xpath('//marketgroups//marketgroup')
    if bonav_nodes and not markets:
        # Not at the bottom yet: follow every unfiltered child node.
        base_url = 'http://www.betfred.com'
        headers = {'Accept': 'application/xml, text/xml, */*; q=0.01',
                   'X-Requested-With': 'XMLHttpRequest',
                   'Referer': 'http://www.betfred.com/sport'}
        log.msg('traverseNav there ARE bonav nodes', level=log.INFO)
        for node in bonav_nodes:
            node_id = take_first(node.xpath('idfwbonavigation/text()').extract())
            node_name = take_first(node.xpath('name/text()').extract())
            if linkFilter(self.name, node_name):
                continue
            # Cache-busting millisecond timestamp on the proxy URL.
            stamp = str(int(time.time() * 1000))
            GETstr = ('/__Admin/Proxy.aspx?proxyurl=http://warp.betfred.com/cache/'
                      'boNavigationList/2/UK/%s.xml&_=%s' % (node_id, stamp))
            yield Request(url=base_url + GETstr, headers=headers,
                          callback=self.traverseNav)
    else:
        log.msg('traverseNav there are NO MORE bonav nodes', level=log.INFO)
        base_url = 'http://warp.betfred.com'
        headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                   'Referer': 'http://warp.betfred.com/UK/2/bettingsc?bettingscIndex=3'}
        # Hit bottom: request event data for each market group. For
        # each marketId (i.e. each league) build an AJAX GET request to
        # receive that league's event data as XML. (lightMarketGroup
        # has no price data.)
        for market in markets:
            mname = take_first(market.xpath('name/text()').extract())
            if linkFilter(self.name, mname):
                continue
            mid = take_first(market.xpath('idfwmarketgroup/text()').extract())
            log.msg('traverseNav making market req for market %s with id %s'
                    % (mname, mid), level=log.INFO)
            GETstr = '/cache/marketGroup/UK/%s.xml' % mid
            yield Request(url=base_url + GETstr, headers=headers,
                          callback=self.parse_Data)
def parse_leagues(self, response):
    """Extract football league links, pull out each eventClassId, and
    request the win-draw-win coupon for each id.

    Fixes: the second half of the implicitly-concatenated pattern was
    a NON-raw string ('\\d' is an invalid escape, deprecated), the dots
    in the URL were unescaped, and the loop shadowed the builtin `id`.
    """
    # Compile once; reused for both link extraction and id capture.
    link_re = re.compile(
        r'http://www\.sportingbet\.com/sports-football/'
        r'[A-Za-z0-9-]+/1-102-(\d+)\.html')
    sx = SgmlLinkExtractor(allow=[link_re.pattern])
    league_links = sx.extract_links(response)
    # linkFilter returns True for links to drop.
    league_links = [
        link for link in league_links
        if not linkFilter(self.name, link.url)
    ]
    # Capture the eventClassId from each surviving link.
    eventClassIdList = []
    for link in league_links:
        m = link_re.search(link.url)
        if m:
            eventClassIdList.append(m.group(1))
    base_url = 'http://www.sportingbet.com/services/CouponTemplate.mvc/GetCoupon'
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'http://www.sportingbet.com/sports-football/0-102-410.html',
        'X-Requested-With': 'XMLHttpRequest',
        'Host': 'www.sportingbet.com',
    }
    for ecid in eventClassIdList:
        # Build the coupon query string for this event class.
        GETstr = '?couponAction=EVENTCLASSCOUPON&'
        GETstr += 'sportIds=102&'
        GETstr += 'marketTypeId=&'
        GETstr += 'eventId=&'
        GETstr += 'bookId=&'
        GETstr += 'eventClassId=' + str(ecid) + '&'
        GETstr += 'sportId=102&'
        GETstr += 'eventTimeGroup=ETG_NextFewHours_0_0'
        yield Request(url=base_url + GETstr, headers=headers,
                      meta={'eventClassId': str(ecid)},
                      callback=self.pre_parse_Data)
def parse(self, response):
    """Collect football league links from the sport hierarchy, filter
    them, and request each remaining league page."""
    hrefs = response.xpath(
        '//ul[@class="hierarchy"]/'
        'li[@class="expander expander-collapsed sport-FOOT"]/'
        'ul[@class="expander-content"]/'
        'li[@class="expander expander-collapsed"]/'
        'ul[@class="expander-content"]/li/a/@href').extract()
    hrefs = [href for href in hrefs if not linkFilter(self.name, href)]
    # NOTE(review): headers point at sports.titanbet.co.uk while the
    # request URL is built on sports.titanbet.com -- confirm this
    # domain mismatch is intentional.
    headers = {
        'Referer': 'http://sports.titanbet.co.uk/en/football',
        'Host': 'sports.titanbet.co.uk',
    }
    for href in hrefs:
        yield Request('http://sports.titanbet.com' + href,
                      headers=headers, callback=self.pre_parse_Data)
def parse(self, response):
    """Walk the league anchors, skip filtered league names, and request
    each remaining league's match page."""
    headers = {
        'Host': 'sports.williamhill.com',
        'Referer': 'http://sports.williamhill.com/bet/en-gb/betting/y/5/et/Football.html',
    }
    for anchor in response.xpath('//ul[@class="matrixB"]/li/ul/li/a'):
        # Extract the anchor text once and filter on it.
        league_name = take_first(anchor.xpath('text()').extract())
        if linkFilter(self.name, league_name):
            continue
        league_url = take_first(anchor.xpath('@href').extract())
        yield Request(url=league_url, headers=headers,
                      callback=self.parse_match)
def parseLeague(self, response):
    """Collect (name, data-id) pairs for each league, drop filtered
    names, and fire one AJAX request per remaining league id."""
    base_url = 'https://www.apostasonline.com/pt-PT/sportsbook/eventpaths/multi/'
    headers = {'Referer': 'https://www.apostasonline.com/',
               'X-Requested-With': 'XMLHttpRequest',
               'Host': 'www.apostasonline.com'}
    for li in response.xpath('//li[@class="sport_240"]/ul/li/ul/li'):
        league_name = take_first(li.xpath('a/text()').extract())
        # linkFilter returns True for leagues we want dropped.
        if linkFilter(self.name, league_name):
            continue
        league_id = take_first(li.xpath('a//@data-id').extract())
        GETstr = '[%s]?ajax=true&timezone=undefined' % league_id
        yield Request(url=base_url + GETstr, headers=headers,
                      callback=self.pre_parseData, dont_filter=True)