def get_post_text(page_url, min_words=_utils.MIN_CHUNK_WORDS, max_words=_utils.MAX_CHUNK_WORDS, post_limit=_utils.POST_LIMIT, driver=None, cookies=None, silent=False): need_quit = False if not silent: print('START', page_url) if not driver: need_quit = True driver = init(cookies) driver.get(page_url) time.sleep(LOAD_TIMEOUT) page, text = None, None class PageEndException(Exception): pass class PostLimitException(Exception): pass class PostFoundException(Exception): pass try: labels = set() posts, prev_page_len = None, -1 tries, prev_num_labels = 0, 0 while True: try: posts = driver.find_elements_by_css_selector( #'article[role="article"]' 'div[data-testid="tweet"]') if not silent: print(len(posts)) post_no, post = 0, None for post_no, post in enumerate(posts): try: post = post.find_element_by_xpath( './div/div/div/div[@class="css-901oao r-18jsvk2 r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0"]' ) except NoSuchElementException: continue label = post.text if label not in labels: if not silent: print('post #{} has found'.format(len(labels))) elems = post.find_elements_by_xpath('./*') #text = ' '.join(x.text for x in elems if x.text).strip() text = '' for elem in elems: if elem.tag_name != 'span': text = '' break text += elem.text + ' ' #text = unescape(text).replace('\u200b', '') \ # .replace('\ufeff', '') \ # .replace('й', 'й') \ # .replace('ё', 'ё') \ # .strip() text = utils.norm_text2(text) if not silent: print(text) text0 = re0.sub('', text) text1 = re1.sub('', text0) if text0 and len(text1) / len(text0) >= .9: num_words = len([ x for x in re4.sub('', text).split() if re5.sub('', x) ]) if not silent: print('<russian>') print(num_words) if num_words >= min_words \ and num_words <= max_words: page = post.get_attribute('innerHTML') raise PostFoundException() elif not silent: print('<foreign>') labels.add(label) if len(labels) >= post_limit: text = None raise PostLimitException() text, tries = None, 0 else: if post: _utils.selenium_scroll_into_view(driver, post) if len(labels) > prev_num_labels: prev_num_labels = len(labels) continue if not silent: print('post #{} is not found'.format(len(labels))) page_len = \ _utils.selenium_scroll_to_bottom(driver, sleep=LOAD_TIMEOUT) if not silent: print('page_len =', page_len) if page_len == prev_page_len: if tries >= 2: raise PageEndException() tries += 1 else: tries = 0 prev_page_len = page_len except StaleElementReferenceException: _utils.selenium_scroll_to_bottom(driver, sleep=LOAD_TIMEOUT) posts = None except (PageEndException, PostLimitException, PostFoundException): pass if need_quit: driver.quit() return text, page
def get_trend_authors(trend, num_authors=10, skip_first=0, authors_ignore=None, driver=None, cookies=None, silent=False): page_url = ROOT_URL + '/search?q=' + trend + '&src=typed_query' need_quit = False if not silent: print('START', page_url) if not driver: need_quit = True driver = init(cookies) driver.get(page_url) time.sleep(LOAD_TIMEOUT) authors = OrderedDict() if authors_ignore is None: authors_ignore = OrderedDict() class PageEndException(Exception): pass class AuthorsEnoughException(Exception): pass try: post, prev_page_len = None, -1 while True: tries, errs = 0, 0 while True: try: time.sleep(1) posts = driver.find_elements_by_css_selector( 'article[role="article"]') if not silent: print('found {} posts'.format(len(posts))) #post = None if not posts: try: # it can continue many times. don't worry, just wait btn = driver.find_element_by_xpath( '//div[@class="css-18t94o4 css-1dbjc4n r-urgr8i r-42olwf r-sdzlij r-1phboty r-rs99b7 r-1w2pmg r-1vuscfd r-1dhvaqw r-1ny4l3l r-1fneopy r-o7ynqc r-6416eg r-lrvibr"' ' or @class="css-18t94o4 css-1dbjc4n r-1q3imqu r-42olwf r-sdzlij r-1phboty r-rs99b7 r-1w2pmg r-1vuscfd r-1dhvaqw r-1ny4l3l r-1fneopy r-o7ynqc r-6416eg r-lrvibr"]' ) #print(btn.get_attribute('class')) #print('Twitter raise an error. ' # 'Pushing the button...') _utils.selenium_click( driver, elem=btn, max_tries=3) # exit with error if timeout time.sleep(10) driver.refresh() time.sleep(LOAD_TIMEOUT) errs = 0 continue except NoSuchElementException: try: driver.find_element_by_xpath( '//div[@class="css-901oao r-18jsvk2 r-1qd0xha r-1b6yd1w r-b88u0q r-ad9z0x r-15d164r r-bcqeeo r-q4m81j r-qvutc0"]' ) raise PageEndException() except NoSuchElementException: if errs >= 2: print('Unknown situation, exiting. ' 'Manage it manually') exit() time.sleep(10) errs += 1 for post in enumerate(posts): try: post = post.find_element_by_css_selector( 'a[class="css-4rbku5 css-18t94o4 css-1dbjc4n r-1loqt21 r-1wbh5a2 r-dnmrzs r-1ny4l3l"]' ) href = post.get_attribute('href') text = post.get_attribute('text') if len(authors) < skip_first \ or href not in authors_ignore: authors[href] = text authors_ignore[href] = 1 if not silent: print(text, href) if len(authors) >= skip_first + num_authors: raise AuthorsEnoughException except NoSuchElementException: continue page_len = \ _utils.selenium_scroll_to_bottom(driver, sleep=LOAD_TIMEOUT) if not silent: print('page_len =', page_len) if page_len == prev_page_len: if tries >= 2: raise PageEndException() tries += 1 else: tries = 0 prev_page_len = page_len continue #if post: # _utils.selenium_scroll_into_view(driver, post_) except StaleElementReferenceException: _utils.selenium_scroll_to_bottom(driver, sleep=LOAD_TIMEOUT) except (PageEndException, AuthorsEnoughException): pass authors = list(authors.items())[skip_first:skip_first + num_authors] if not silent: print(authors) print(len(authors)) if need_quit: driver.quit() return authors
def get_post_text(page_url, min_words=_utils.MIN_CHUNK_WORDS, max_words=_utils.MAX_CHUNK_WORDS, post_limit=_utils.POST_LIMIT, driver=None, cookies=None, silent=False): need_quit = False if not silent: print('START', page_url) if not driver: need_quit = True driver = init(cookies) driver.get(page_url) page, text = None, None class PageEndException(Exception): pass try: labels = set() post, prev_page_len = None, -1 prev_post = None for post_no in range(1, post_limit + 1): tries = 0 while True: if not silent: print('post #{}...'.format(post_no)) post = None posts = driver.find_elements_by_css_selector( 'div[aria-labelledby]') if not silent: print(len(posts)) for post_ in posts: label = post_.get_attribute('aria-labelledby') #print(label, labels) if label not in labels: labels.add(label) try: # if repost, continue elem = \ post_.find_element_by_class_name('hqeojc4l') continue except NoSuchElementException: pass post = prev_post = post_ tries = 0 break else: if not silent: print('post #{} is not found'.format(post_no)) page_len = _utils.selenium_scroll_to_bottom(driver) if page_len == prev_page_len: if tries >= 2: raise PageEndException() if prev_post: _utils.selenium_scroll_into_view(driver, prev_post) tries += 1 else: tries = 0 prev_page_len = page_len if post: break if post: try: post = post.find_element_by_css_selector( 'div.cxmmr5t8.oygrvhab.hcukyx3x.c1et5uql.ii04i59q') post = post.find_element_by_xpath('..') except NoSuchElementException: continue elems = \ post.find_elements_by_css_selector('div[role="button"]') for elem in elems: try: if elem.text == "See more": if not silent: print('See more') for try_ in range(3): action = webdriver.common.action_chains \ .ActionChains(driver) action.move_to_element_with_offset(elem, 3, 3) action.perform() try: elem.click() break except ElementClickInterceptedException: _utils.selenium_scroll_by(driver, 0, 100) else: post = None break while True: try: WebDriverWait(driver, 10) \ .until(EC.staleness_of(elem)) break except TimeoutException: print('WARNING: Timeout while post ' 'expanding. Retrying...') except StaleElementReferenceException: pass if not post: break try: page = post.get_attribute('innerHTML') elems = post.find_elements_by_css_selector( 'div.cxmmr5t8.oygrvhab.hcukyx3x.c1et5uql.ii04i59q') except StaleElementReferenceException: continue #text = ''.join(x.text for x in elems if x.text).strip() text = '' for elem in elems: #print('[' + elem.text + ']') elem = elem.find_elements_by_xpath('./div') for elem_ in elem: text_ = re2.sub(' ', elem_.text.replace('\n', '')) \ .strip() #print('{' + text_ + '}') if text_: text += text_ + '\n' #text = unescape(text).replace('\u200b', '') \ # .replace('\ufeff', '') \ # .replace('й', 'й').replace('ё', 'ё') \ # .replace('\n\n', '\n').strip() text = _utils_add.norm_text2(text).replace('\n\n', '\n') if not silent: print(text) text0 = re0.sub('', text) text1 = re1.sub('', text0) if text0 and len(text1) / len(text0) >= .9: num_words = len([ x for x in re4.sub('', text).split() if re5.sub('', x) ]) if not silent: print('<russian>') print(num_words) if num_words >= min_words and num_words <= max_words: break elif not silent: print('<foreign>') page, text = None, None except PageEndException: pass if need_quit: driver.quit() return text, page
def get_comment_authors(page_url, num_authors=10, depth=_utils.SEARCH_DEPTH, post_limit=_utils.POST_LIMIT, authors_ignore=None, driver=None, cookies=None, silent=False): if not silent: print('START', page_url) if driver: _utils.selenium_open_new_window(driver, page_url) else: driver = init(cookies) driver.get(page_url) authors = OrderedDict() if authors_ignore is None: authors_ignore = OrderedDict() authors_ignore[page_url] = 1 class PageEndException(Exception): pass class AuthorsEnoughException(Exception): pass try: labels = set() post, prev_page_len = None, -1 prev_post = None for post_no in range(1, post_limit + 1): tries = 0 while True: if not silent: print('post #{}...'.format(post_no)) post = None posts = driver.find_elements_by_css_selector( 'div[aria-labelledby]') if not silent: print(len(posts)) for post_ in posts: label = post_.get_attribute('aria-labelledby') #print(label, labels) if label not in labels: labels.add(label) post = prev_post = post_ tries = 0 break else: if not silent: print('post #{} is not found'.format(post_no)) page_len = _utils.selenium_scroll_to_bottom(driver) if page_len == prev_page_len: if tries >= 2: raise PageEndException() if prev_post: _utils.selenium_scroll_into_view(driver, prev_post) tries += 1 else: tries = 0 prev_page_len = page_len if post: break if post: comment_elems, author_elems = set(), set() pass_no, need_more = 0, True while need_more: need_more = False pass_no += 1 if not silent: print('post {}, pass {}'.format(post_no, pass_no)) for elem in (x for x in post.find_elements_by_tag_name('a') if x not in author_elems): author_elems.add(elem) author = elem.get_attribute('href') #print('[[[ author =', author, ']]]') if author and author.startswith(ROOT_URL) \ and 'comment_id=' in author: if author.startswith(ROOT_URL + '/profile.php?id='): pos = author.find('&') else: pos = author.find('?') if pos > 0: author = author[:pos] if author not in authors_ignore and not ( author.endswith('.php') or author.endswith('/')): #print(author) if author[len(ROOT_URL) + 1:].find('/') < 0: try: author_name = \ elem.find_element_by_tag_name( 'span' ).text if not silent: print(author_name, author) authors_ignore[author] = 1 if depth > 1: authors.update( get_comment_authors( author, num_authors=num_authors - len(authors), depth=depth - 1, post_limit=post_limit, authors_ignore=\ authors_ignore, driver=driver, #cookies=\ # driver.get_cookies() silent=silent )) else: authors[author] = author_name if len(authors) >= num_authors: raise AuthorsEnoughException() except NoSuchElementException: pass for elem in ( x for x in post.find_elements_by_tag_name('span') if x not in comment_elems): comment_elems.add(elem) try: text = elem.text #print('[', text, ']') if (text.startswith('View') and ('more comment' in text or 'more repl' in text) ) or ('replied' in text and 'repl' in text): if not silent: print(' [', text, ']') need_more = True action = webdriver.common.action_chains \ .ActionChains(driver) action.move_to_element_with_offset(elem, 5, 5) action.perform() tries = 0 while True: try: elem.click() WebDriverWait(driver, 10) \ .until(EC.staleness_of(elem)) if tries: print() break except TimeoutException: print( '\rWARNING: Comments loading ' 'timeout. Retrying...', end='') if tries >= 2: print( '\rWARNING: Comments loading ' 'timeout. Skipped ') break tries += 1 except: pass except (PageEndException, AuthorsEnoughException): pass _utils.selenium_close_window(driver) if not silent: print(authors) print(len(authors)) return list(authors.items())[:num_authors]
def get_post_text(page_url, min_words=_utils.MIN_CHUNK_WORDS, max_words=_utils.MAX_CHUNK_WORDS, post_limit=_utils.POST_LIMIT, driver=None, cookies=None, silent=False): need_quit = False if not silent: print('START', page_url) if not driver: need_quit = True driver = init(cookies) driver.get(page_url) iferror(driver) link, page, text = None, None, None class PageEndException(Exception): pass try: labels = set() post_no, prev_page_len = 1, -1 tries = 0 while True: if not silent: print('post #{}...'.format(post_no)) posts = driver.find_elements_by_xpath('//article/div/div/div/div') if not silent: print(len(posts)) for post in posts: try: if post.get_attribute('class') == 'EcJQs': raise PageEndException() _utils.selenium_scroll_into_view(driver, post) except StaleElementReferenceException: break _utils.selenium_move_to_element(driver, post, 3) try: label = post.find_element_by_tag_name('a') \ .get_attribute('href') except NoSuchElementException: continue if not silent: print('url', label, end=' ') if label in labels: if not silent: print('old') continue if not silent: print('new') labels.add(label) post_no += 1 if post_no > post_limit: raise PageEndException() tries = 0 _utils.selenium_open_new_window(driver, label) try: WebDriverWait(driver, 3) \ .until(EC.visibility_of_element_located( (By.CLASS_NAME, 'XQXOT') )) elem = driver.find_element_by_class_name('XQXOT') elem = elem.find_element_by_xpath( './div[@class="ZyFrc"]/li/div/div/div[@class="C4VMK"]/span' ) except (TimeoutException, NoSuchElementException): _utils.selenium_close_window(driver) continue link = label page = elem.get_attribute('innerHTML') text = elem.text if not silent: print(text) _utils.selenium_close_window(driver) text = re3.sub( '\n', re2.sub( ' ', #unescape(text).replace('\u200b', '') \ # .replace('\ufeff', '') \ # .replace('й', 'й').replace('ё', 'ё') \ # .strip() utils.norm_text2(text))) if not silent: print(text) text0 = re0.sub('', re4.sub('', text)) text1 = re1.sub('', text0) if text0 and len(text1) / len(text0) >= .9: num_words = len([ x for x in re4.sub('', text).split() if re5.sub('', x) ]) if not silent: print('<russian>') print(num_words) if num_words >= min_words and num_words <= max_words: raise PageEndException() elif not silent: print('<foreign>') link, page, text = None, None, None else: if not silent: print('post #{} is not found'.format(post_no)) page_len = _utils.selenium_scroll_to_bottom(driver) if page_len == prev_page_len: if tries >= 2: raise PageEndException() tries += 1 else: tries = 0 prev_page_len = page_len except PageEndException: pass if need_quit: driver.quit() return text, page, link
def get_likers(page_url, num_likers=10, skip=(0, 0), post_limit=_utils.POST_LIMIT, likers_ignore=None, driver=None, cookies=None, silent=False): need_quit = False if not silent: print('START', page_url) if not driver: need_quit = True driver = init(cookies) driver.get(page_url) iferror(driver) likers = OrderedDict() if likers_ignore is None: likers_ignore = OrderedDict() likers_ignore[page_url] = 1 class PageEndException(Exception): pass class LikersEnoughException(Exception): pass try: labels = set() post_no, prev_page_len = 1, -1 tries = 0 while True: if not silent: print('post #{}...'.format(post_no)) posts = driver.find_elements_by_xpath('//article/div/div/div/div') if not silent: print(len(posts)) for post in posts: try: if post.get_attribute('class') == 'EcJQs': raise PageEndException() _utils.selenium_scroll_into_view(driver, post) except StaleElementReferenceException: break _utils.selenium_move_to_element(driver, post, 3) try: label = post.find_element_by_tag_name('a') \ .get_attribute('href') except NoSuchElementException: continue if not silent: print('url', label, end=' ') if label in labels: if not silent: print('old') continue if not silent: print('new') labels.add(label) post_no += 1 if post_no > post_limit: raise PageEndException() tries = 0 try: likelem = post.find_element_by_css_selector( 'span[class="_1P1TY coreSpriteHeartSmall"]') except NoSuchElementException: continue try: cnt = int(likelem.find_element_by_xpath('..').text.strip()) except ValueError: cnt = 1000 if cnt <= skip[0]: continue start_liker_no = min(max(skip[0], round(cnt * skip[1])), 100) if not silent: print(cnt, start_liker_no) _utils.selenium_open_new_window(driver, label) time.sleep(3) elem = driver.find_element_by_css_selector( 'button[class="sqdOP yWX7d _8A5w5 "]') css_selector = 'div[class=" Igw0E ' \ ' IwRSH YBx95 vwCYk ' \ ' ' \ ' ' \ ' "]' try: _utils.selenium_click(driver, elem=elem, visible_elem=(By.CSS_SELECTOR, css_selector), max_tries=1) except TimeoutException: time.sleep(60) _utils.selenium_close_window(driver) continue like_no = 0 likers_passed = set() while True: likers_passed_ = set() likelems = driver.find_elements_by_css_selector( css_selector) if not silent: print('found {} likers'.format(len(likelems))) if not likelems: break all_ignored = True for likelem in likelems: like_no += 1 elem = likelem.find_element_by_tag_name('a') link = elem.get_attribute('href') name = elem.get_attribute('title') likers_passed_.add(link) if link not in likers_passed: all_ignored = False if link in likers_ignore: continue likers_ignore[link] = 1 if not silent: print('like:', link) if like_no >= start_liker_no: try: name = \ likelem.find_element_by_css_selector( 'div[class="_7UhW9 xLCgt ' ' MMzan _0PwGv uL8Hv ' ' "]' ).text except NoSuchElementException: pass likers[link] = name if not silent: print(' ', name) if len(likers) >= num_likers: _utils.selenium_close_window(driver) raise LikersEnoughException() if all_ignored: time.sleep(60) break likers_passed = likers_passed_ _utils.selenium_scroll_into_view(driver, likelems[-1]) _utils.selenium_close_window(driver) else: if not silent: print('post #{} is not found'.format(post_no)) page_len = _utils.selenium_scroll_to_bottom(driver) if page_len == prev_page_len: if tries >= 2: raise PageEndException() tries += 1 else: tries = 0 prev_page_len = page_len except (PageEndException, LikersEnoughException): pass if not silent: print(likers) print(len(likers)) if need_quit: driver.quit() return list(likers.items())[:num_likers]