def fix_dates():
    """Convert string-typed date fields on match documents to ``datetime``.

    Selects every document in ``matches`` whose ``startDate`` is stored as a
    BSON string (``$type: 2``), highest ``matchId`` first, and rewrites
    ``startDate``, ``startTime`` and (when present) ``timeStamp`` as
    ``datetime`` values.

    ``startDate`` and ``startTime`` are parsed as ``%m/%d/%Y %H:%M:%S``.
    ``timeStamp`` is tried as ``%d/%m/%Y %H:%M:%S`` first and as
    ``%Y-%m-%d %H:%M:%S`` second.

    Fields that are missing or already not strings are left untouched instead
    of crashing ``strptime``. Only the converted fields are written back
    (``$set``), rather than re-saving the whole document.

    Raises:
        ValueError: if a string field matches none of its expected formats.
    """
    start_format = '%m/%d/%Y %H:%M:%S'
    stamp_format = '%d/%m/%Y %H:%M:%S'
    stamp_fallback_format = '%Y-%m-%d %H:%M:%S'

    cursor = matches.find({'startDate': {'$type': 2}})
    for match in cursor.sort('matchId', -1).batch_size(100):
        print(match['matchId'])
        converted = {}

        for field in ('startDate', 'startTime'):
            value = match.get(field)
            # The query only guarantees that startDate is a string.
            if isinstance(value, str):
                converted[field] = datetime.strptime(value, start_format)

        stamp = match.get('timeStamp')
        if isinstance(stamp, str):
            try:
                converted['timeStamp'] = datetime.strptime(stamp, stamp_format)
            except ValueError:
                converted['timeStamp'] = datetime.strptime(
                    stamp, stamp_fallback_format)

        # An empty $set is rejected by the server, so only write real changes.
        if converted:
            matches.update_one({'_id': match['_id']}, {'$set': converted})
def load_events():
    """Copy each match's embedded ``events`` list into the ``events`` collection.

    Iterates over every document in ``matches`` that has an ``events`` field,
    highest ``matchId`` first. A match is skipped when the ``events``
    collection already holds a non-zero number of documents for its
    ``matchId`` equal to the length of the embedded list. Otherwise the
    stored events for that match are deleted and the embedded ones are
    inserted again, each stamped with whichever of ``matchId``, ``stageId``,
    ``seasonId``, ``tournamentId`` and ``regionId`` the match document has.

    Uses ``count_documents`` / ``delete_many`` / ``insert_many`` in place of
    the deprecated ``count`` / ``remove`` and per-event ``insert_one`` calls
    (``count_documents`` requires PyMongo 3.7 or newer).
    """
    inherited_keys = ('matchId', 'stageId', 'seasonId', 'tournamentId',
                      'regionId')

    print(datetime.now(), 'Starting')
    for match in matches.find({'events': {'$exists': True}}).sort('matchId', -1):
        match_filter = {'matchId': match['matchId']}
        match_events = match['events']

        stored = events.count_documents(match_filter)
        if stored and stored == len(match_events):
            # Events for this match are already fully written.
            continue

        print(datetime.now(), match['matchId'],
              '{0} events'.format(len(match_events)))
        events.delete_many(match_filter)

        # The inherited values are the same for every event of this match.
        inherited = {key: match[key] for key in inherited_keys if key in match}
        for event in match_events:
            event.update(inherited)

        # insert_many raises on an empty list, so guard it.
        if match_events:
            events.insert_many(match_events)
    print(datetime.now(), 'Complete')
def update_matches(status_code=None):
    """Re-fetch matches that have a given status and have already started.

    Queries ``matches`` for documents with the requested ``statusCode``, no
    ``error`` field, and ``startDate`` / ``startTime`` not later than the
    current local time. Results are ordered by ``statusCode`` ascending and
    then ``startDate`` descending, and ``get_match(matchId, overwrite=True)``
    is called for each one.

    Status codes:
        0: Error
        1: Pending
        2: Postponed
        3: In-Play
        4: (Not seen)
        5: Abandoned
        6: Complete
        7: Cancelled
        8: Ignore

    Args:
        status_code: Integer status to refresh. Any non-integer value,
            including the default ``None``, selects Pending (1) and
            In-Play (3).
    """
    # bool is a subclass of int; keep treating it as "no status code given".
    if isinstance(status_code, bool) or not isinstance(status_code, int):
        status_code = {'$in': [1, 3]}

    # Evaluate "now" once so both bounds compare against the same instant.
    now = datetime.today()
    query = {
        'statusCode': status_code,
        'error': {'$exists': False},
        'startDate': {'$lte': now},
        'startTime': {'$lte': now},
    }
    ordering = [('statusCode', 1), ('startDate', -1)]
    for match in matches.find(query).sort(ordering):
        print(match['matchId'], match['statusCode'], match['startTime'])
        get_match(match['matchId'], overwrite=True)
# NOTE(review): the original line breaks and indentation of this line were
# lost, and it starts part-way through a function whose `def` is not visible
# here, so the code below is kept verbatim.
#
# Visible function tail: when `geocode_result` is truthy, the elevation for the
# first result's geometry location is requested via `gmaps.elevation`, and a
# document holding `venueName`, `location_data` (first geocode result) and
# `altitude_data` (first elevation result) is upserted into `stadiums`, keyed
# on `venueName`. When `wiki_name` is truthy, the page named by `wiki_name[0]`
# is parsed with wptools and its infobox is upserted into `wiki`, keyed on
# `venueName`.
#
# Script entry point: calls scrape() for every distinct, non-None `venueName`
# found in `matches`.
if geocode_result: elevation_result = gmaps.elevation( convert.normalize_lat_lng( geocode_result[0]['geometry']['location'])) stadium_data_to_insert = { 'venueName': stadium, 'location_data': geocode_result[0], 'altitude_data': elevation_result[0] } stadiums.replace_one({'venueName': stadium}, stadium_data_to_insert, upsert=True) if wiki_name: wiki_page = wptools.page(wiki_name[0], silent=True) wiki_page.get_parse() wiki_data = wiki_page.data['infobox'] wiki_data_to_insert = { 'venueName': stadium, 'wiki_name': wiki_name[0], 'wiki_data': wiki_data } wiki.replace_one({'venueName': stadium}, wiki_data_to_insert, upsert=True) if __name__ == "__main__": stads = matches.find().distinct('venueName') stads = [x for x in stads if x != None] for stadium in stads: scrape(stadium)
def missing_stadiums():
    """Print every venue name used by a match that has no ``stadiums`` entry.

    The known venue names are fetched from ``stadiums`` once and checked
    in memory, instead of issuing one ``find_one`` per venue.

    A null ``venueName`` is skipped: it is not a stadium name, and the
    scraping entry point in this file filters ``None`` out as well, so it
    would otherwise be reported as missing on every run.
    """
    # Assumes venue names are hashable scalars (strings) - TODO confirm.
    known_venues = set(stadiums.distinct('venueName'))
    for stadium in matches.distinct('venueName'):
        if stadium is None:
            continue
        if stadium not in known_venues:
            print(stadium)
def load_players():
    """Register every player named in a match's ``playerIdNameDictionary``.

    Walks the ``matches`` documents that carry a ``playerIdNameDictionary``
    field in ascending ``matchId`` order, printing each ``matchId``. For each
    id/name pair an upsert is issued against ``players`` keyed on the integer
    ``playerId``; the name is written only when the player document is newly
    inserted (``$setOnInsert``).
    """
    has_player_names = {'playerIdNameDictionary': {'$exists': True}}
    cursor = matches.find(has_player_names).sort('matchId', 1)
    for document in cursor:
        print(document['matchId'])
        names_by_id = document['playerIdNameDictionary']
        for raw_id, player_name in names_by_id.items():
            selector = {'playerId': int(raw_id)}
            on_insert = {'$setOnInsert': {'name': player_name}}
            players.update_one(selector, on_insert, upsert=True)
# NOTE(review): the original line breaks and indentation of this line were
# lost, and it starts part-way through a function whose `def` is not visible
# here, so the code below is kept verbatim.
#
# Visible function tail: each text node matched by the attendance XPath has
# ' Spectators' and '.' removed and is stored as an int in
# match['attendance']. For each referee link, the integer id is taken from the
# last path segment of its href; the referee's name (the link's title) is
# upserted into `referees` via $setOnInsert, and match['referee'] is set to
# that id. The match is then saved and wait() is called.
#
# Script entry point: calls get_regions(), then get_tournaments() per region,
# get_seasons() per tournament, get_fixtures() per season and get_lineups()
# per match. Whether these loops are nested or sequential cannot be determined
# from this collapsed line.
for attendance in content.xpath( '//div[@class="sb-spieldaten"]/p[3]/span/strong/text()'): match['attendance'] = int( attendance.replace(' Spectators', '').replace('.', '')) for referee in content.xpath('//div[@class="sb-spieldaten"]/p[3]/a'): referees.update_one( {'referee': int(referee.xpath('@href')[0].split('/')[-1])}, {'$setOnInsert': { 'name': referee.xpath('@title')[0] }}, upsert=True) match['referee'] = int(referee.xpath('@href')[0].split('/')[-1]) matches.save(match) wait() if __name__ == '__main__': get_regions() for region in regions.find().sort('name'): get_tournaments(region['region']) for tournament in tournaments.find().sort('tournament'): get_seasons(tournament['tournament']) for season in seasons.find().sort([('season', -1), ('tournament', 1)]).batch_size(1): get_fixtures(season['tournament'], season['season']) for match in matches.find().sort('date', -1): get_lineups(match['match'])