import random
from collections import OrderedDict

from selenium.common.exceptions import NoSuchElementException, \
                                       TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# `_utils`, `utils`, `get_url`, the `re0`..`re5` regexps, `ROOT_URL` and
# `NUM_ITEMS` are module-level definitions not shown in this fragment.


def crawl(channels_queue, min_words=_utils.MIN_CHUNK_WORDS,
          max_words=_utils.MAX_CHUNK_WORDS, post_limit=_utils.POST_LIMIT,
          channels_ignore=None, authors_ignore=None, driver=None,
          cookies=None, silent=True):
    need_quit = False
    if not driver:
        driver = _utils.selenium_init(silent=False)
        need_quit = True
    if channels_ignore is None:
        channels_ignore = OrderedDict()
    if authors_ignore is None:
        authors_ignore = OrderedDict()
    channel_names_ignore = set(channels_ignore.values())

    class LoopBreakException(Exception):
        pass

    while True:
        if not channels_queue:
            break
        page = text = None
        isrussian = False

        # search for a link to the channel's VIDEOS folder
        channel = channels_queue.popitem(last=False)
        url = channel[0]
        if not silent:
            print('START', url, end='')
        html = get_url(url).text
        match = re2.search(html)
        if match:
            path = match.group(1)
            #page_url = '{0.scheme}://{0.netloc}{1}'.format(urlparse(url), path)
            page_url = ROOT_URL + path
        else:
            page_url = url.replace('/channel/', '/c/', 1) + '/videos'
            url = url.replace('/channel/', '/user/', 1)
            html = get_url(url).text
            match = re2.search(html)
            if match:
                path = match.group(1)
                #page_url = '{0.scheme}://{0.netloc}{1}'.format(urlparse(url), path)
                page_url = ROOT_URL + path
            else:
                url = None
        if not silent:
            print(' ->', page_url, end='')
        driver.delete_all_cookies()

        # get a random video of the first NUM_ITEMS
        driver.get(page_url)
        try:
            channel_url = \
                driver.find_element_by_css_selector('link[rel="canonical"]') \
                      .get_attribute('href')
            channel_name = driver.find_element_by_id('inner-header-container') \
                                 .find_element_by_id('channel-name') \
                                 .find_element_by_id('text').text
        except NoSuchElementException:
            if not silent:
                print(': NOT FOUND')
            yield None, None, None  # possibility to save new status
            continue
        if not silent:
            print()
        if url:
            channels_ignore[url] = channel_name
        channels_ignore[channel_url] = channel_name
        channel_names_ignore.add(channel_name)
        elem = driver.find_element_by_id('contents')
        try:
            elem_items = elem.find_element_by_id('items')
        except NoSuchElementException:
            yield None, None, None  # possibility to save new status
            continue
        elem_cont = elem.find_element_by_id('continuations')
        items, num_items = None, 0
        while True:
            items = elem_items.find_elements_by_xpath(
                './ytd-grid-video-renderer')
            num_items_ = len(items)
            if num_items_ == num_items:
                break
            num_items = num_items_
            if num_items >= NUM_ITEMS:
                break
            _utils.selenium_scroll_into_view(driver, elem_cont)

        # abandon this channel if there are no videos (and don't scan related
        # videos because they are probably the same)
        if not items:
            yield None, None, None  # possibility to save new status
            continue
        item = items[random.randint(0, num_items - 1)]
        elem = item.find_element_by_id('thumbnail')
        url = elem.get_attribute('href')
        driver.get(url)

        # search for a comment of a size within the given limits
        tries_ = 0
        try:
            while True:
                try:
                    # if the player is visible, stop playback by removing it
                    _utils.selenium_click(driver,
                                          visible_elem=(By.ID, 'ytd-player'),
                                          max_tries=3)
                    _utils.selenium_remove(
                        driver, driver.find_element_by_id('ytd-player'))
                    raise LoopBreakException()
                except TimeoutException:
                    try:
                        # an error screen instead of the player: skip the video
                        _utils.selenium_click(driver,
                                              visible_elem=(By.ID,
                                                            'error-screen'),
                                              max_tries=1)
                        raise NoSuchElementException()
                    except TimeoutException:
                        if tries_ % 10 == 0:
                            print("Can't find a video")
                        tries_ += 1
        except NoSuchElementException:
            yield None, None, None  # possibility to save new status
            continue
        except LoopBreakException:
            pass
        try:
            _utils.selenium_click(driver, visible_elem=(By.ID, 'sections'),
                                  max_tries=10)
        except TimeoutException:
            yield None, None, None  # possibility to save new status
            continue
        elem = driver.find_element_by_id('sections')
        elem_items = elem.find_element_by_id('contents')
        elem_cont = elem.find_element_by_id('continuations')
        items, num_items = None, 0

        def notactive(elem):
            return elem.get_attribute('active') is None

        while True:
            _utils.selenium_scroll_into_view(driver, elem_cont)
            WebDriverWait(elem_cont, 10).until(notactive)
            items = elem_items.find_elements_by_xpath(
                './ytd-comment-thread-renderer')
            num_items_ = len(items)
            if num_items_ == num_items:
                break
            num_items = num_items_
            if num_items >= NUM_ITEMS:
                break
        if not items:
            yield None, None, None  # possibility to save new status
            continue
        _utils.selenium_scroll_to_top(driver)
        for item in reversed(items):
            try:
                # take only comments left by the video's author
                elem = item.find_element_by_id('author-comment-badge')
            except NoSuchElementException:
                continue
            elem = item.find_element_by_id('author-text')
            author_href = elem.get_attribute('href')
            author_name = elem.get_attribute('text').strip()
            if author_href in authors_ignore:
                continue
            text_elem = item.find_element_by_id('content-text')
            text = text_elem.text
            #text = unescape(text).replace('\u200b', '') \
            #                     .replace('\ufeff', '') \
            #                     .replace('й', 'й') \
            #                     .replace('ё', 'ё') \
            #                     .strip()
            text = utils.norm_text2(text)
            if not silent:
                print(text)
            text0 = re0.sub('', text)
            text1 = re1.sub('', text0)
            if text0 and len(text1) / len(text0) >= .9:
                num_words = len(
                    [x for x in re4.sub('', text).split() if re5.sub('', x)])
                isrussian = True
                if not silent:
                    print('<russian>')
                    print(num_words)
                if min_words <= num_words <= max_words:
                    page = text_elem.get_attribute('innerHTML')
                    authors_ignore[author_href] = author_name
                break
            elif not silent:
                print('<foreign>')
            text = None
        if not isrussian:
            yield None, None, None  # possibility to save new status
            continue

        # search related videos for new channels
        elem = driver.find_element_by_id('secondary-inner')
        elem_cont = elem.find_element_by_id('continuations')
        elem_items = elem.find_element_by_id('items')
        items, num_items = [], 0
        while True:
            items = elem_items.find_elements_by_xpath(
                './/ytd-compact-video-renderer')
            num_items_ = len(items)
            if num_items_ == num_items:
                break
            num_items = num_items_
            if num_items >= NUM_ITEMS:
                break
            try:
                # stop when no continuation item is left to load
                elem_items.find_element_by_xpath(
                    '..//ytd-continuation-item-renderer')
            except NoSuchElementException:
                break
            _utils.selenium_scroll_into_view(driver, elem_cont)
        if not silent:
            print('found', num_items, 'related videos')
        for item in items:
            channel_name = item.find_element_by_id('channel-name') \
                               .find_element_by_id('text').text
            if channel_name in channel_names_ignore:
                continue
            elem = item.find_element_by_id('thumbnail')
            url = elem.get_attribute('href')
            html = get_url(url).text
            match = re3.search(html)
            if not match:
                continue
            channel_url, channel_name = match.groups()
            channel_url = channel_url.replace('http:', 'https:')
            if not silent:
                print(' ', channel_url, channel_name, end='')
            if channel_url not in channels_ignore:
                channels_queue[channel_url] = channel_name
                channel_names_ignore.add(channel_name)
            elif not silent:
                print(': IGNORED', end='')
            if not silent:
                print()
        yield text, page, driver.current_url

    if need_quit:
        driver.quit()
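# A minimal sketch of driving the `crawl` generator, modelled on the caller
# below.  Every `(None, None, None)` yield is the generator's signal that the
# ignore lists may have changed and can be flushed to disk ("possibility to
# save new status").  The `save_status` helper and its file names are
# hypothetical, for illustration only.

def save_status(channels_ignore, authors_ignore,
                channels_fn='channels_ignore.txt',  # hypothetical file name
                authors_fn='authors_ignore.txt'):   # hypothetical file name
    # persist both ignore lists as tab-separated "url\tname" lines,
    # the same format the runner script reads them back in
    for fn, data in [(channels_fn, channels_ignore),
                     (authors_fn, authors_ignore)]:
        with open(fn, 'wt', encoding='utf-8') as f:
            f.write('\n'.join('\t'.join(x) for x in data.items()))

def demo_run(channels_queue, channels_ignore, authors_ignore):
    for text, page, link in crawl(channels_queue,
                                  channels_ignore=channels_ignore,
                                  authors_ignore=authors_ignore):
        save_status(channels_ignore, authors_ignore)  # after every yield
        if page:
            print(link, '-', len(text.split()), 'words')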
            x.split('\t') for x in f.read().split('\n') if x)
else:
    channels_queue = OrderedDict(x.split('\t') for x in links)

if os.path.isfile(CHANNELS_IGNORE_FN):
    with open(CHANNELS_IGNORE_FN, 'rt', encoding='utf-8') as f:
        channels_ignore = OrderedDict(
            x.split('\t') for x in f.read().split('\n') if x)
else:
    channels_ignore = OrderedDict()

if os.path.isfile(AUTHORS_IGNORE_FN):
    with open(AUTHORS_IGNORE_FN, 'rt', encoding='utf-8') as f:
        authors_ignore = OrderedDict(
            x.split('\t') for x in f.read().split('\n') if x)
else:
    authors_ignore = OrderedDict()

driver = _utils.selenium_init(silent=False)
for text, page, link in _youtube.crawl(channels_queue,
                                       min_words=_utils.MIN_CHUNK_WORDS,
                                       max_words=_utils.MAX_CHUNK_WORDS,
                                       post_limit=_utils.POST_LIMIT,
                                       channels_ignore=channels_ignore,
                                       authors_ignore=authors_ignore,
                                       driver=driver, silent=True):
    if page:
        text = parse_page(page)
        texts_total += 1
        page_fn = utils.get_data_path(utils.PAGES_DIR, MAX_FILES, texts_total)
        text_fn = utils.get_data_path(utils.TEXTS_DIR, MAX_FILES, texts_total)
def init(cookies=None, silent=False):
    driver = _utils.selenium_init(silent=silent)
    #login(driver, LOGIN, PASSWORD, cookies)
    return driver
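# A minimal usage sketch (assumed context): `init` only wraps
# `_utils.selenium_init` while the login step stays disabled, so the
# caller owns the driver's lifetime and must quit it explicitly.
if __name__ == '__main__':
    driver = init(silent=True)
    try:
        driver.get('https://www.youtube.com')  # hypothetical start page
    finally:
        driver.quit()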
if SEED:
    random.seed(SEED)


'''===========================================================================
Links download
==========================================================================='''
if os.path.isfile(utils.LINKS_FN):
    with open(utils.LINKS_FN, 'rt', encoding='utf-8') as f:
        #TIMESTAMP = f.readline().strip()
        links = [x for x in f.read().split('\n') if x]
else:
    #TIMESTAMP = int(time.time())
    links = []

if len(links) < MAX_LINKS:
    driver = _utils.selenium_init(silent=True)
    links = OrderedDict({x: 1 for x in links})
    if os.path.isfile(_utils.AUTHORS_IGNORE_FN):
        with open(_utils.AUTHORS_IGNORE_FN, 'rt', encoding='utf-8') as f:
            authors_ignore = set(x for x in f.read().split('\n') if x)
    else:
        authors_ignore = set()
    need_break = False
    while True:
        for page_no in itertools.count(1):
            #url = INIT_URL.format(TIMESTAMP, page_no)
            url = INIT_URL.format('', page_no)
            if not SILENT:
                print(url)
            driver.get(url)
            res = driver.page_source
'''===========================================================================
Links download
==========================================================================='''
if os.path.isfile(utils.LINKS_FN):
    with open(utils.LINKS_FN, 'rt', encoding='utf-8') as f:
        links = [x for x in f.read().split('\n') if x]
    if not links[0].startswith('http'):
        TIMESTAMP, OFFSET = links[0].split(':')
        OFFSET = int(OFFSET)
        links = links[1:]
else:
    TIMESTAMP, OFFSET = int(time.time()), 0
    links = []

if len(links) < MAX_LINKS:
    driver = _utils.selenium_init()
    links = OrderedDict({x: 1 for x in links})
    no_items = False
    while True:
        url = INIT_URL.format('', TIMESTAMP, OFFSET)
        if not SILENT:
            print(url)
        driver.get(url)
        res = driver.page_source
        if DUMP:
            with open('1111.html', 'wt', encoding='utf-8') as f:
                f.write(res)
        pos = res.find('{')
        assert pos >= 0, 'ERROR: No json start on page "{}"!'.format(url)
        res = res[pos:]
        pos = res.rfind('}')