async def process_season(season):
    # Link championat.com players to racer documents in MongoDB,
    # then collect the teams listed for the season.
    text2 = get_text(
        'https://www.championat.com/biathlon/_biathlonworldcup/tournament/{}/players/'
        .format(season['cc_id']))
    if text2:
        soup = BeautifulSoup(text2, 'html.parser')
        nodes = soup.select('a[href]')
        for node in nodes:
            if ('/biathlon/_biathlonworldcup/' in node.get('href')
                    and '/players/' in node.get('href')):
                player = get_player(node)
                client.ibustats.racers.update_one(
                    {'wiki.ru': player['name']},
                    {'$set': {'champ.cc_id': player['cc_id']}},
                    upsert=True)
                client.ibustats.racers.update_one(
                    {'wiki.ru': player['name']},
                    {'$addToSet': {'champ.tournaments': player['tournament']}},
                    upsert=False)
    await asyncio.sleep(4 + random.randint(4, 12))
    text = get_text(
        'https://www.championat.com/biathlon/_biathlonworldcup/tournament/{}/teams/'
        .format(season['cc_id']))
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        nodes = soup.select('a[href]')
        for node in nodes:
            if ('/biathlon/_biathlonworldcup/' in node.get('href')
                    and '/teams/' in node.get('href')):
                country = get_country(node)

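# The get_text helper used throughout these scrapers is not shown here. A
# minimal sketch of the assumed contract -- return the page HTML as a string,
# or None on any failure -- could look like the following; the requests usage
# and the User-Agent header are illustrative assumptions, not the original
# implementation.
import requests


def get_text(url):
    """Fetch url and return the response body, or None on failure."""
    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0 (compatible; ibustats-bot)'},
            timeout=30)
        if response.status_code == 200:
            return response.text
    except requests.RequestException:
        pass
    return None
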
async def get_infobox(lang, title):
    print('--get_infobox--{}--{}'.format(lang, title))
    wikis = {'lang': lang, 'name': title, 'infobox': {}}
    text = get_text('https://{}.wikipedia.org/wiki/{}'.format(lang, title))
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        nodes = soup.select(
            'li.interlanguage-link a.interlanguage-link-target')
        for _ in nodes:
            pass
    return wikis

async def get_links(lang, title):
    text = get_text('https://{}.wikipedia.org/wiki/{}'.format(lang, title))
    if text is None:
        return {'title': title, 'links': []}
    category = {'title': title, 'links': []}
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        cat_nodes = soup.select('div#mw-content-text a[title]')
        if cat_nodes:
            for cat_node in cat_nodes:
                category['links'].append(cat_node.get('title'))
    return category

async def get_ci(lang, title):
    print('--get_ci--{}--{}'.format(lang, title))
    text = get_text('https://{}.wikipedia.org/wiki/{}'.format(lang, title))
    if text is None:
        return {'name': title}
    info = {'name': title}
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        info['flag'] = get_flag_info(soup)
        info['emblem'] = get_emblem_info(soup)
    print('INFO\tget_ci({}, {})\r\n\t{}'.format(lang, title, info))
    return info

async def get_pi(lang, title):
    print('--get_pi--{}--{}'.format(lang, title))
    text = get_text(
        'https://{}.wikipedia.org/w/index.php?title={}&action=info'.format(
            lang, title))
    if text is None:
        return {'name': title}
    category = {'name': title}
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        category['pvi_month'] = get_pvi_month(soup)
        category['lasttime'] = get_lasttime(soup)
    return category

def get_articles(url):
    # Collect article URLs from a listing page, skipping comment anchors
    # and keeping only 2020/2021 items; relative links are resolved
    # against the page's scheme://netloc.
    base_url = "{0.scheme}://{0.netloc}".format(urlsplit(url))
    urls = set()
    text = get_text(url)
    if text is None:
        return urls
    soup = BeautifulSoup(text, 'html.parser')
    pol_nodes = soup.select('a[href]')
    if pol_nodes:
        for pol_node in pol_nodes:
            ref = pol_node.get('href')
            if '#comments' in ref:
                continue
            if '/2021/' in ref or '/2020/' in ref:
                if 'https://' in ref:
                    urls.add(ref)
                else:
                    urls.add('{}{}'.format(base_url, ref))
    return urls

async def get_interwikis(lang, title):
    print('--get_interwikis--{}--{}'.format(lang, title))
    wikis = {'lang': lang, 'name': title, 'interwikis': {}}
    text = get_text('https://{}.wikipedia.org/wiki/{}'.format(lang, title))
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        nodes = soup.select(
            'li.interlanguage-link a.interlanguage-link-target')
        for node in nodes:
            # The link title looks like "<article title> – <language name>";
            # strip the trailing language name after the dash.
            lang_title = node.get('title')
            if '–' in lang_title:
                lang_title = lang_title[:lang_title.rfind('–')].strip()
            elif '—' in lang_title:
                lang_title = lang_title[:lang_title.rfind('—')].strip()
            wikis['interwikis'][node.get('lang')] = lang_title
            print('\t--interwiki--{}--{}'.format(node.get('lang'), lang_title))
    return wikis

async def get_info(lang, title):
    print('INFO\tget_info({}, {})'.format(lang, title))
    text = get_text('https://{}.wikipedia.org/wiki/{}'.format(lang, title))
    if text is None:
        return {'name': title, 'countries': []}
    category = {'name': title, 'countries': []}
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        category['countries'] = get_country_info(soup)
        category['image'] = get_image_info(soup)
        category['desc'] = get_desc(soup)
        category['name'] = get_name_info(soup)
        if category['name'] is None:
            category['name'] = title
        category['bday'] = get_bday_info(soup)
    print('INFO\tget_info({}, {})\r\n\t{}'.format(lang, title, category))
    return category

async def _get_pages(url):
    # Scrape one MediaWiki category listing page: collect member page
    # titles and the href of the next page of results, if any.
    text = get_text(url)
    await asyncio.sleep(4)
    if text is None:
        return {'pages': [], 'next': None}
    category = {'pages': [], 'next': None}
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        next_nodes = soup.select('div#mw-pages a[title]')
        if next_nodes:
            for next_node in next_nodes:
                # 'Следующая страница' is the "Next page" pager link
                # on the Russian Wikipedia.
                if 'Следующая страница' in next_node.text:
                    category['next'] = next_node.get('href')
        cat_nodes = soup.select('div#mw-pages div.mw-category li a[title]')
        if cat_nodes:
            for cat_node in cat_nodes:
                category['pages'].append(cat_node.get('title'))
    return category

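# A possible driver for _get_pages: follow the 'next' href until the category
# is exhausted. The ru.wikipedia.org base URL and the starting category URL
# are assumptions for illustration; the 'next' value returned above is a
# site-relative href, so it is joined back onto the wiki host.
async def get_all_pages(start_url, base='https://ru.wikipedia.org'):
    pages = []
    url = start_url
    while url:
        chunk = await _get_pages(url)
        pages.extend(chunk['pages'])
        url = base + chunk['next'] if chunk['next'] else None
    return pages
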
async def process_player(player):
    # Refresh a racer's images, team and birth date from the
    # championat.com player page.
    client.ibustats.racers.update_one(
        {'champ.cc_id': player['champ']['cc_id']},
        {'$set': {'images': []}},
        upsert=False)
    text2 = get_text(
        'https://www.championat.com/biathlon/_biathlonworldcup/tournament/{}/players/{}/'
        .format(player['champ']['tournaments'][0], player['champ']['cc_id']))
    if text2:
        soup = BeautifulSoup(text2, 'html.parser')
        images = soup.select(
            'div._player div.entity-header__info div.entity-header__img img')
        for image in images:
            update_image(player, image)
        nodes = soup.select('div._player.entity-header > div > ul > li')
        for node in nodes:
            if 'Команда:' in node.text:  # "Team:"
                update_team(player, node)
            if 'Дата рождения:' in node.text:  # "Date of birth:"
                update_bday(player, node)
    # Throttle before the caller moves on to the next player.
    await asyncio.sleep(16 + random.randint(8, 16))

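# A sketch of how process_player might be driven: iterate racers that already
# have a champ.cc_id and process them one at a time (the sleep inside
# process_player throttles the requests). The query shape is an assumption
# based on the fields used above; racers without champ.tournaments would need
# to be filtered out as well.
async def process_all_players():
    cursor = client.ibustats.racers.find({'champ.cc_id': {'$exists': True}})
    for player in cursor:
        await process_player(player)
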
async def get_externals(lang, title):
    print('--get_externals--{}--{}'.format(lang, title))
    wikis = {'lang': lang, 'name': title, 'externals': []}
    # External-link labels as they appear on the Russian Wikipedia
    # ('Твиттер' = Twitter, 'ВКонтакте' = VK).
    known_labels = {'Facebook', 'Instagram', 'Твиттер', 'ВКонтакте',
                    'biathlon.com.ua', 'IBU'}
    text = get_text('https://{}.wikipedia.org/wiki/{}'.format(lang, title))
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        nodes = soup.select('a.external.text')
        for node in nodes:
            if node.text in known_labels:
                wikis['externals'].append(node.get('href'))
        print('\t--externals--{}'.format(wikis['externals']))
    await asyncio.sleep(10 + random.randint(4, 8))
    return wikis

async def get(self, request):
    feed_id = request.path_params['feed_id']
    feed = feeds.find_one({"_id": ObjectId(feed_id)})
    text = get_text(feed['link'])
    return XmlResponse(text)

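# XmlResponse is not defined in this snippet. Assuming the endpoint above is a
# Starlette-style handler, it could be a thin Response subclass that only sets
# the media type; this is a sketch, not the original class.
from starlette.responses import Response


class XmlResponse(Response):
    media_type = 'application/xml'
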
threads4 = threads.find({}).sort([("thread_id", 1)]).limit(32)
for thread in threads4:
    thread_url = 'https://talks.by/showthread.php?t={}'.format(
        thread['thread_id'])
    urls.add(thread_url)


def parse_user(node):
    return None


while len(urls) > 0:
    url = urls.pop()
    print(url)
    text = get_text(url)
    if text:
        soup = BeautifulSoup(text, 'html.parser')
        ref_nodes = soup.select('a[href]')
        if ref_nodes:
            for ref_node in ref_nodes:
                # Follow the '>>' pagination links within a thread.
                if '>>' in ref_node.text:
                    urls.add('https://talks.by/{}'.format(
                        ref_node.get('href')))
        user_nodes = soup.select('div.row-user a.username')
        if user_nodes:
            for user_node in user_nodes:
                query = parse.urlsplit(user_node.get('href')).query
                params = parse.parse_qs(query)
                op_result = users.update_one(
                    {'u': params['u'][0]},
                    {'$set': {

# Collect thread URLs from the talks.by forum listing pages, then walk each
# thread.
client = MongoClient()
news = client.news
users = news.users

urls = set()
turls = {
    'https://talks.by/forumdisplay.php?f=45&page={}&order=desc'.format(i)
    for i in range(64)
}

while turls:
    turl = turls.pop()
    print(turl)
    threads_page = get_text(turl)
    if threads_page:
        soup = BeautifulSoup(threads_page, 'html.parser')
        ref_nodes = soup.select('a[href]')
        if ref_nodes:
            for ref_node in ref_nodes:
                query = parse.urlsplit(ref_node.get('href')).query
                params = parse.parse_qs(query)
                if 't' in params:
                    urls.add('https://talks.by/showthread.php?t={}'.format(
                        params['t'][0]))
print(len(urls))

while urls:
    url = urls.pop()
    print(url)