Exemple #1
0
def get_post_text(page_url,
                  min_words=_utils.MIN_CHUNK_WORDS,
                  max_words=_utils.MAX_CHUNK_WORDS,
                  post_limit=_utils.POST_LIMIT,
                  driver=None,
                  cookies=None,
                  silent=False):
    """Find the first post on *page_url* whose text passes the filters.

    The page is loaded and scrolled; every ``div[data-testid="tweet"]``
    element whose text was not seen before is examined. A post is
    accepted when its text passes the character-ratio check built on the
    module-level patterns ``re0``/``re1`` (logged as ``<russian>`` or
    ``<foreign>``) and its word count lies within
    ``[min_words, max_words]``.

    Args:
        page_url: address of the page to scan.
        min_words: minimum number of words in an accepted post.
        max_words: maximum number of words in an accepted post.
        post_limit: give up after this many distinct posts were examined.
        driver: an already started webdriver; if omitted, one is created
            with ``init(cookies)`` and shut down before returning.
        cookies: passed to ``init`` when a driver has to be created.
        silent: suppress progress output.

    Returns:
        ``(text, page)``: the normalized post text and the ``innerHTML``
        of the element holding it, or ``(None, None)`` when no post was
        accepted before the page ended or *post_limit* was reached.
    """
    need_quit = False
    if not silent:
        print('START', page_url)
    if not driver:
        need_quit = True
        driver = init(cookies)
    page, text = None, None

    class PageEndException(Exception):
        """The page stopped growing; nothing more can be loaded."""

    class PostLimitException(Exception):
        """*post_limit* distinct posts have been examined."""

    class PostFoundException(Exception):
        """An acceptable post was found."""

    try:
        driver.get(page_url)
        time.sleep(LOAD_TIMEOUT)
        labels = set()
        prev_page_len = -1
        tries, prev_num_labels = 0, 0
        while True:
            try:
                posts = driver.find_elements_by_css_selector(
                    'div[data-testid="tweet"]')
                if not silent:
                    print(len(posts))
                post = None
                for post in posts:
                    try:
                        post = post.find_element_by_xpath(
                            './div/div/div/div[@class="css-901oao r-18jsvk2 r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0"]'
                        )
                    except NoSuchElementException:
                        continue
                    # The visible text doubles as the identity of the post.
                    label = post.text
                    if label not in labels:
                        if not silent:
                            print('post #{} has found'.format(len(labels)))
                        elems = post.find_elements_by_xpath('./*')
                        text = ''
                        for elem in elems:
                            # Only posts consisting purely of <span>
                            # children are kept; anything else empties
                            # the text so the post is rejected below.
                            if elem.tag_name != 'span':
                                text = ''
                                break
                            text += elem.text + ' '
                        text = utils.norm_text2(text)
                        if not silent:
                            print(text)
                        text0 = re0.sub('', text)
                        text1 = re1.sub('', text0)
                        if text0 and len(text1) / len(text0) >= .9:
                            num_words = len([
                                x for x in re4.sub('', text).split()
                                if re5.sub('', x)
                            ])
                            if not silent:
                                print('<russian>')
                                print(num_words)
                            if min_words <= num_words <= max_words:
                                page = post.get_attribute('innerHTML')
                                raise PostFoundException()
                        elif not silent:
                            print('<foreign>')
                        labels.add(label)
                        if len(labels) >= post_limit:
                            text = None
                            raise PostLimitException()
                        text, tries = None, 0
                # Every currently loaded post has been looked at.
                if post:
                    _utils.selenium_scroll_into_view(driver, post)
                if len(labels) > prev_num_labels:
                    # New posts appeared during this pass: rescan first.
                    prev_num_labels = len(labels)
                    continue
                if not silent:
                    print('post #{} is not found'.format(len(labels)))
                page_len = \
                    _utils.selenium_scroll_to_bottom(driver,
                                                     sleep=LOAD_TIMEOUT)
                if not silent:
                    print('page_len =', page_len)
                if page_len == prev_page_len:
                    # Page length did not change: allow a few retries
                    # before deciding that the page has ended.
                    if tries >= 2:
                        raise PageEndException()
                    tries += 1
                else:
                    tries = 0
                prev_page_len = page_len
            except StaleElementReferenceException:
                # The DOM changed under us. Drop any half-built text so it
                # cannot be returned together with page=None.
                text = None
                _utils.selenium_scroll_to_bottom(driver, sleep=LOAD_TIMEOUT)

    except (PageEndException, PostLimitException, PostFoundException):
        pass
    finally:
        # Shut down a driver we created ourselves even on unexpected errors.
        if need_quit:
            driver.quit()

    return text, page
Exemple #2
0
def get_trend_authors(trend,
                      num_authors=10,
                      skip_first=0,
                      authors_ignore=None,
                      driver=None,
                      cookies=None,
                      silent=False):
    """Collect authors of the posts found by searching for *trend*.

    Opens ``ROOT_URL + '/search?q=' + trend + '&src=typed_query'`` and
    scrolls through the results, reading the author link of every
    ``article[role="article"]`` element.

    Args:
        trend: search query appended to the search URL as is.
        num_authors: number of authors to return.
        skip_first: number of leading authors to collect but leave out of
            the result.
        authors_ignore: mapping whose keys are author links to skip; it is
            updated in place with every link collected here.
        driver: an already started webdriver; if omitted, one is created
            with ``init(cookies)`` and shut down before returning.
        cookies: passed to ``init`` when a driver has to be created.
        silent: suppress progress output.

    Returns:
        A list of ``(href, text)`` pairs: at most *num_authors* items
        taken after the first *skip_first* collected authors.
    """
    page_url = ROOT_URL + '/search?q=' + trend + '&src=typed_query'
    need_quit = False
    if not silent:
        print('START', page_url)
    if not driver:
        need_quit = True
        driver = init(cookies)
    authors = OrderedDict()
    if authors_ignore is None:
        authors_ignore = OrderedDict()

    class PageEndException(Exception):
        """No more search results can be loaded."""

    class AuthorsEnoughException(Exception):
        """The requested number of authors has been collected."""

    try:
        driver.get(page_url)
        time.sleep(LOAD_TIMEOUT)
        prev_page_len = -1
        tries, errs = 0, 0
        while True:
            try:
                time.sleep(1)
                posts = driver.find_elements_by_css_selector(
                    'article[role="article"]')
                if not silent:
                    print('found {} posts'.format(len(posts)))
                if not posts:
                    try:
                        # This can repeat many times; just keep waiting.
                        # Presumably the button shown with the site's
                        # error notice (retry) -- verify the selector.
                        btn = driver.find_element_by_xpath(
                            '//div[@class="css-18t94o4 css-1dbjc4n r-urgr8i r-42olwf r-sdzlij r-1phboty r-rs99b7 r-1w2pmg r-1vuscfd r-1dhvaqw r-1ny4l3l r-1fneopy r-o7ynqc r-6416eg r-lrvibr"'
                            ' or @class="css-18t94o4 css-1dbjc4n r-1q3imqu r-42olwf r-sdzlij r-1phboty r-rs99b7 r-1w2pmg r-1vuscfd r-1dhvaqw r-1ny4l3l r-1fneopy r-o7ynqc r-6416eg r-lrvibr"]'
                        )
                        _utils.selenium_click(
                            driver, elem=btn,
                            max_tries=3)  # exit with error if timeout
                        time.sleep(10)
                        driver.refresh()
                        time.sleep(LOAD_TIMEOUT)
                        errs = 0
                        continue
                    except NoSuchElementException:
                        try:
                            # Presumably the "nothing found" notice: its
                            # presence is treated as the end of the page.
                            driver.find_element_by_xpath(
                                '//div[@class="css-901oao r-18jsvk2 r-1qd0xha r-1b6yd1w r-b88u0q r-ad9z0x r-15d164r r-bcqeeo r-q4m81j r-qvutc0"]'
                            )
                            raise PageEndException()
                        except NoSuchElementException:
                            if errs >= 2:
                                print('Unknown situation, exiting. '
                                      'Manage it manually')
                                # NOTE(review): terminates the whole
                                # interpreter via SystemExit.
                                exit()
                            time.sleep(10)
                            errs += 1
                # Iterate over the elements themselves: enumerate() would
                # yield (index, element) tuples, which have no find_* methods.
                for post in posts:
                    try:
                        link = post.find_element_by_css_selector(
                            'a[class="css-4rbku5 css-18t94o4 css-1dbjc4n r-1loqt21 r-1wbh5a2 r-dnmrzs r-1ny4l3l"]'
                        )
                    except NoSuchElementException:
                        continue
                    href = link.get_attribute('href')
                    text = link.get_attribute('text')
                    # The first `skip_first` authors are taken regardless
                    # of the ignore list; they are sliced off at the end.
                    if len(authors) < skip_first \
                    or href not in authors_ignore:
                        authors[href] = text
                        authors_ignore[href] = 1
                    if not silent:
                        print(text, href)
                    if len(authors) >= skip_first + num_authors:
                        raise AuthorsEnoughException
                page_len = \
                    _utils.selenium_scroll_to_bottom(driver,
                                                     sleep=LOAD_TIMEOUT)
                if not silent:
                    print('page_len =', page_len)
                if page_len == prev_page_len:
                    # Unchanged page length: retry a few times before
                    # concluding that the results are exhausted.
                    if tries >= 2:
                        raise PageEndException()
                    tries += 1
                else:
                    tries = 0
                prev_page_len = page_len
            except StaleElementReferenceException:
                _utils.selenium_scroll_to_bottom(driver,
                                                 sleep=LOAD_TIMEOUT)

    except (PageEndException, AuthorsEnoughException):
        pass
    finally:
        # Shut down a driver we created ourselves even on unexpected errors.
        if need_quit:
            driver.quit()

    authors = list(authors.items())[skip_first:skip_first + num_authors]
    if not silent:
        print(authors)
        print(len(authors))
    return authors
Exemple #3
0
def get_post_text(page_url,
                  min_words=_utils.MIN_CHUNK_WORDS,
                  max_words=_utils.MAX_CHUNK_WORDS,
                  post_limit=_utils.POST_LIMIT,
                  driver=None,
                  cookies=None,
                  silent=False):
    """Find the first post on *page_url* whose text passes the filters.

    Walks through up to *post_limit* ``div[aria-labelledby]`` elements
    (reposts are skipped), expands "See more" buttons, joins the text of
    the post's content blocks and accepts the post when it passes the
    character-ratio check built on the module-level patterns
    ``re0``/``re1`` (logged as ``<russian>`` or ``<foreign>``) and its
    word count lies within ``[min_words, max_words]``.

    Args:
        page_url: address of the page to scan.
        min_words: minimum number of words in an accepted post.
        max_words: maximum number of words in an accepted post.
        post_limit: maximum number of posts to examine.
        driver: an already started webdriver; if omitted, one is created
            with ``init(cookies)`` and shut down before returning.
        cookies: passed to ``init`` when a driver has to be created.
        silent: suppress progress output.

    Returns:
        ``(text, page)``: the normalized post text and the ``innerHTML``
        of the post container, or ``(None, None)`` when nothing suitable
        was found.
    """
    need_quit = False
    if not silent:
        print('START', page_url)
    if not driver:
        need_quit = True
        driver = init(cookies)
    page, text = None, None

    class PageEndException(Exception):
        """The page stopped growing; nothing more can be loaded."""

    try:
        driver.get(page_url)
        labels = set()
        post, prev_page_len = None, -1
        prev_post = None
        for post_no in range(1, post_limit + 1):
            tries = 0
            # Phase 1: locate the next post not processed yet, scrolling
            # the page down when all loaded posts are already known.
            while True:
                if not silent:
                    print('post #{}...'.format(post_no))
                post = None
                posts = driver.find_elements_by_css_selector(
                    'div[aria-labelledby]')
                if not silent:
                    print(len(posts))
                for post_ in posts:
                    label = post_.get_attribute('aria-labelledby')
                    if label not in labels:
                        labels.add(label)
                        try:
                            # if repost, continue
                            post_.find_element_by_class_name('hqeojc4l')
                            continue
                        except NoSuchElementException:
                            pass
                        post = prev_post = post_
                        tries = 0
                        break
                else:
                    if not silent:
                        print('post #{} is not found'.format(post_no))
                    page_len = _utils.selenium_scroll_to_bottom(driver)
                    if page_len == prev_page_len:
                        if tries >= 2:
                            raise PageEndException()
                        if prev_post:
                            _utils.selenium_scroll_into_view(driver, prev_post)
                        tries += 1
                    else:
                        tries = 0
                    prev_page_len = page_len
                if post:
                    break

            if post:
                # Phase 2: step up to the parent of the first content block.
                try:
                    post = post.find_element_by_css_selector(
                        'div.cxmmr5t8.oygrvhab.hcukyx3x.c1et5uql.ii04i59q')
                    post = post.find_element_by_xpath('..')
                except NoSuchElementException:
                    continue
                # Phase 3: expand truncated text.
                elems = \
                    post.find_elements_by_css_selector('div[role="button"]')
                for elem in elems:
                    try:
                        if elem.text == "See more":
                            if not silent:
                                print('See more')
                            for try_ in range(3):
                                action = webdriver.common.action_chains \
                                                         .ActionChains(driver)
                                action.move_to_element_with_offset(elem, 3, 3)
                                action.perform()
                                try:
                                    elem.click()
                                    break
                                except ElementClickInterceptedException:
                                    _utils.selenium_scroll_by(driver, 0, 100)
                            else:
                                # The button could not be clicked: give up.
                                post = None
                                break
                            # The button goes stale once the text expands.
                            while True:
                                try:
                                    WebDriverWait(driver, 10) \
                                        .until(EC.staleness_of(elem))
                                    break
                                except TimeoutException:
                                    print('WARNING: Timeout while post '
                                          'expanding. Retrying...')
                    except StaleElementReferenceException:
                        pass
                if not post:
                    break
                # Phase 4: extract and validate the text.
                try:
                    page = post.get_attribute('innerHTML')
                    elems = post.find_elements_by_css_selector(
                        'div.cxmmr5t8.oygrvhab.hcukyx3x.c1et5uql.ii04i59q')
                except StaleElementReferenceException:
                    # Do not keep the HTML of a post that is skipped.
                    page = None
                    continue
                text = ''
                for elem in elems:
                    for elem_ in elem.find_elements_by_xpath('./div'):
                        text_ = re2.sub(' ', elem_.text.replace('\n', '')) \
                                   .strip()
                        if text_:
                            text += text_ + '\n'
                text = _utils_add.norm_text2(text).replace('\n\n', '\n')
                if not silent:
                    print(text)
                text0 = re0.sub('', text)
                text1 = re1.sub('', text0)
                if text0 and len(text1) / len(text0) >= .9:
                    num_words = len([
                        x for x in re4.sub('', text).split() if re5.sub('', x)
                    ])
                    if not silent:
                        print('<russian>')
                        print(num_words)
                    if min_words <= num_words <= max_words:
                        break
                elif not silent:
                    print('<foreign>')
                page, text = None, None

    except PageEndException:
        pass
    finally:
        # Shut down a driver we created ourselves even on unexpected errors.
        if need_quit:
            driver.quit()

    return text, page
Exemple #4
0
def get_comment_authors(page_url,
                        num_authors=10,
                        depth=_utils.SEARCH_DEPTH,
                        post_limit=_utils.POST_LIMIT,
                        authors_ignore=None,
                        driver=None,
                        cookies=None,
                        silent=False):
    """Collect authors of comments under the posts of *page_url*.

    Walks through up to *post_limit* ``div[aria-labelledby]`` elements,
    expands the comment threads and reads author links (hrefs starting
    with ``ROOT_URL`` and containing ``comment_id=``). With ``depth > 1``
    the function recurses into every author's page instead of recording
    the author directly.

    Args:
        page_url: address of the page to scan.
        num_authors: number of authors to collect.
        depth: recursion depth; at ``1`` the authors found are recorded.
        post_limit: maximum number of posts to examine per page.
        authors_ignore: mapping whose keys are links to skip; updated in
            place with *page_url* and every author link visited.
        driver: an already started webdriver. If given, the page is opened
            in a new window that is closed before returning; otherwise a
            driver is created with ``init(cookies)`` and quit at the end.
        cookies: passed to ``init`` when a driver has to be created.
        silent: suppress progress output.

    Returns:
        A list of at most *num_authors* ``(link, name)`` pairs.
    """
    if not silent:
        print('START', page_url)
    need_quit = False
    if driver:
        _utils.selenium_open_new_window(driver, page_url)
    else:
        need_quit = True
        driver = init(cookies)
        driver.get(page_url)
    authors = OrderedDict()
    if authors_ignore is None:
        authors_ignore = OrderedDict()
    authors_ignore[page_url] = 1

    class PageEndException(Exception):
        """The page stopped growing; nothing more can be loaded."""

    class AuthorsEnoughException(Exception):
        """The requested number of authors has been collected."""

    try:
        labels = set()
        post, prev_page_len = None, -1
        prev_post = None
        for post_no in range(1, post_limit + 1):
            tries = 0
            # Locate the next post not processed yet, scrolling the page
            # down when all loaded posts are already known.
            while True:
                if not silent:
                    print('post #{}...'.format(post_no))
                post = None
                posts = driver.find_elements_by_css_selector(
                    'div[aria-labelledby]')
                if not silent:
                    print(len(posts))
                for post_ in posts:
                    label = post_.get_attribute('aria-labelledby')
                    if label not in labels:
                        labels.add(label)
                        post = prev_post = post_
                        tries = 0
                        break
                else:
                    if not silent:
                        print('post #{} is not found'.format(post_no))
                    page_len = _utils.selenium_scroll_to_bottom(driver)
                    if page_len == prev_page_len:
                        if tries >= 2:
                            raise PageEndException()
                        if prev_post:
                            _utils.selenium_scroll_into_view(driver, prev_post)
                        tries += 1
                    else:
                        tries = 0
                    prev_page_len = page_len
                if post:
                    break

            if post:
                comment_elems, author_elems = set(), set()
                pass_no, need_more = 0, True
                # Repeat while expanding threads keeps revealing comments.
                while need_more:
                    need_more = False
                    pass_no += 1
                    if not silent:
                        print('post {}, pass {}'.format(post_no, pass_no))
                    for elem in (x for x in post.find_elements_by_tag_name('a')
                                 if x not in author_elems):
                        author_elems.add(elem)
                        author = elem.get_attribute('href')
                        if author and author.startswith(ROOT_URL) \
                       and 'comment_id=' in author:
                            # Strip the query part, keeping the id of
                            # profile.php-style links.
                            if author.startswith(ROOT_URL +
                                                 '/profile.php?id='):
                                pos = author.find('&')
                            else:
                                pos = author.find('?')
                            if pos > 0:
                                author = author[:pos]
                            if author not in authors_ignore and not (
                                    author.endswith('.php')
                                    or author.endswith('/')):
                                # Only links with a single path component.
                                if author[len(ROOT_URL) + 1:].find('/') < 0:
                                    try:
                                        author_name = \
                                            elem.find_element_by_tag_name(
                                                'span'
                                            ).text
                                        if not silent:
                                            print(author_name, author)
                                        authors_ignore[author] = 1
                                        if depth > 1:
                                            authors.update(
                                                get_comment_authors(
                                                    author,
                                                    num_authors=num_authors
                                                              - len(authors),
                                                    depth=depth - 1,
                                                    post_limit=post_limit,
                                                    authors_ignore=\
                                                        authors_ignore,
                                                    driver=driver,
                                                    silent=silent
                                            ))
                                        else:
                                            authors[author] = author_name
                                        if len(authors) >= num_authors:
                                            raise AuthorsEnoughException()
                                    except NoSuchElementException:
                                        pass

                    for elem in (
                            x for x in post.find_elements_by_tag_name('span')
                            if x not in comment_elems):
                        comment_elems.add(elem)
                        try:
                            text = elem.text
                            if (text.startswith('View') and
                                ('more comment' in text or 'more repl' in text)
                                ) or ('replied' in text and 'repl' in text):
                                if not silent:
                                    print('    [', text, ']')
                                need_more = True
                                action = webdriver.common.action_chains \
                                                         .ActionChains(driver)
                                action.move_to_element_with_offset(elem, 5, 5)
                                action.perform()
                                click_tries = 0
                                while True:
                                    try:
                                        elem.click()
                                        # The link goes stale once the
                                        # comments are loaded.
                                        WebDriverWait(driver, 10) \
                                            .until(EC.staleness_of(elem))
                                        if click_tries:
                                            print()
                                        break
                                    except TimeoutException:
                                        print(
                                            '\rWARNING: Comments loading '
                                            'timeout. Retrying...',
                                            end='')
                                        if click_tries >= 2:
                                            print(
                                                '\rWARNING: Comments loading '
                                                'timeout. Skipped    ')
                                            break
                                        click_tries += 1
                        except Exception:
                            # Best effort: an element that cannot be read
                            # or clicked is skipped. Unlike a bare
                            # `except`, Ctrl-C and SystemExit still pass.
                            pass

    except (PageEndException, AuthorsEnoughException):
        pass
    finally:
        # Always restore the browser state, even on unexpected errors:
        # quit a driver we created ourselves, otherwise close our window.
        if need_quit:
            driver.quit()
        else:
            _utils.selenium_close_window(driver)

    if not silent:
        print(authors)
        print(len(authors))
    return list(authors.items())[:num_authors]
Exemple #5
0
def get_post_text(page_url,
                  min_words=_utils.MIN_CHUNK_WORDS,
                  max_words=_utils.MAX_CHUNK_WORDS,
                  post_limit=_utils.POST_LIMIT,
                  driver=None,
                  cookies=None,
                  silent=False):
    """Find the first post on *page_url* whose text passes the filters.

    Iterates over the ``//article/div/div/div/div`` tiles of the page,
    opens the link of every new tile in a separate window, reads the
    post text there and accepts the post when it passes the
    character-ratio check built on the module-level patterns
    ``re0``/``re1`` (logged as ``<russian>`` or ``<foreign>``) and its
    word count lies within ``[min_words, max_words]``.

    Args:
        page_url: address of the page to scan.
        min_words: minimum number of words in an accepted post.
        max_words: maximum number of words in an accepted post.
        post_limit: maximum number of posts to examine.
        driver: an already started webdriver; if omitted, one is created
            with ``init(cookies)`` and shut down before returning.
        cookies: passed to ``init`` when a driver has to be created.
        silent: suppress progress output.

    Returns:
        ``(text, page, link)``: the normalized text, the ``innerHTML`` of
        the text element and the post URL, or ``(None, None, None)`` when
        nothing suitable was found.
    """
    need_quit = False
    if not silent:
        print('START', page_url)
    if not driver:
        need_quit = True
        driver = init(cookies)
    link, page, text = None, None, None

    class PageEndException(Exception):
        """Stop scanning: page end, post limit or a post was accepted."""

    try:
        driver.get(page_url)
        iferror(driver)
        labels = set()
        post_no, prev_page_len = 1, -1
        tries = 0
        while True:
            if not silent:
                print('post #{}...'.format(post_no))
            posts = driver.find_elements_by_xpath('//article/div/div/div/div')
            if not silent:
                print(len(posts))
            for post in posts:
                try:
                    # A tile with this class is treated as the end of the
                    # post list.
                    if post.get_attribute('class') == 'EcJQs':
                        raise PageEndException()
                    _utils.selenium_scroll_into_view(driver, post)
                except StaleElementReferenceException:
                    # The list was re-rendered: fetch the tiles again.
                    break
                _utils.selenium_move_to_element(driver, post, 3)
                try:
                    label = post.find_element_by_tag_name('a') \
                                .get_attribute('href')
                except NoSuchElementException:
                    continue
                if not silent:
                    print('url', label, end=' ')
                if label in labels:
                    if not silent:
                        print('old')
                    continue
                if not silent:
                    print('new')
                labels.add(label)
                # Check the limit before counting this post, so exactly
                # `post_limit` posts get examined (not `post_limit - 1`).
                if post_no > post_limit:
                    raise PageEndException()
                post_no += 1
                tries = 0
                _utils.selenium_open_new_window(driver, label)
                try:
                    WebDriverWait(driver, 3) \
                        .until(EC.visibility_of_element_located(
                            (By.CLASS_NAME, 'XQXOT')
                        ))
                    elem = driver.find_element_by_class_name('XQXOT')
                    elem = elem.find_element_by_xpath(
                        './div[@class="ZyFrc"]/li/div/div/div[@class="C4VMK"]/span'
                    )
                    page = elem.get_attribute('innerHTML')
                    text = elem.text
                except (TimeoutException, NoSuchElementException):
                    page, text = None, None
                    continue
                finally:
                    # The extra window is closed on every path, including
                    # unexpected errors.
                    _utils.selenium_close_window(driver)
                link = label
                if not silent:
                    print(text)
                text = re3.sub('\n', re2.sub(' ', utils.norm_text2(text)))
                if not silent:
                    print(text)
                text0 = re0.sub('', re4.sub('', text))
                text1 = re1.sub('', text0)
                if text0 and len(text1) / len(text0) >= .9:
                    num_words = len([
                        x for x in re4.sub('', text).split() if re5.sub('', x)
                    ])
                    if not silent:
                        print('<russian>')
                        print(num_words)
                    if min_words <= num_words <= max_words:
                        # Accepted: leave with text/page/link set.
                        raise PageEndException()
                elif not silent:
                    print('<foreign>')
                link, page, text = None, None, None
            else:
                # All loaded tiles are known: load more of the page.
                if not silent:
                    print('post #{} is not found'.format(post_no))
                page_len = _utils.selenium_scroll_to_bottom(driver)
                if page_len == prev_page_len:
                    if tries >= 2:
                        raise PageEndException()
                    tries += 1
                else:
                    tries = 0
                prev_page_len = page_len

    except PageEndException:
        pass
    finally:
        # Shut down a driver we created ourselves even on unexpected errors.
        if need_quit:
            driver.quit()

    return text, page, link
Exemple #6
0
def get_likers(page_url,
               num_likers=10,
               skip=(0, 0),
               post_limit=_utils.POST_LIMIT,
               likers_ignore=None,
               driver=None,
               cookies=None,
               silent=False):
    """Collect users who liked the posts of *page_url*.

    Iterates over the ``//article/div/div/div/div`` tiles of the page,
    opens every new post having a like counter in a separate window,
    opens its list of likers and records them.

    Args:
        page_url: address of the page to scan.
        num_likers: number of likers to collect.
        skip: pair ``(count, share)``. Posts with at most ``count`` likes
            are skipped; within a post, likers are recorded starting from
            position ``min(max(count, round(likes * share)), 100)``.
        post_limit: maximum number of posts to examine.
        likers_ignore: mapping whose keys are links to skip; updated in
            place with *page_url* and every liker link seen.
        driver: an already started webdriver; if omitted, one is created
            with ``init(cookies)`` and shut down before returning.
        cookies: passed to ``init`` when a driver has to be created.
        silent: suppress progress output.

    Returns:
        A list of at most *num_likers* ``(link, name)`` pairs.
    """
    need_quit = False
    if not silent:
        print('START', page_url)
    if not driver:
        need_quit = True
        driver = init(cookies)
    likers = OrderedDict()
    if likers_ignore is None:
        likers_ignore = OrderedDict()
    likers_ignore[page_url] = 1

    class PageEndException(Exception):
        """Stop scanning: page end or post limit reached."""

    class LikersEnoughException(Exception):
        """The requested number of likers has been collected."""

    try:
        driver.get(page_url)
        iferror(driver)
        labels = set()
        post_no, prev_page_len = 1, -1
        tries = 0
        while True:
            if not silent:
                print('post #{}...'.format(post_no))
            posts = driver.find_elements_by_xpath('//article/div/div/div/div')
            if not silent:
                print(len(posts))
            for post in posts:
                try:
                    # A tile with this class is treated as the end of the
                    # post list.
                    if post.get_attribute('class') == 'EcJQs':
                        raise PageEndException()
                    _utils.selenium_scroll_into_view(driver, post)
                except StaleElementReferenceException:
                    # The list was re-rendered: fetch the tiles again.
                    break
                _utils.selenium_move_to_element(driver, post, 3)
                try:
                    label = post.find_element_by_tag_name('a') \
                                .get_attribute('href')
                except NoSuchElementException:
                    continue
                if not silent:
                    print('url', label, end=' ')
                if label in labels:
                    if not silent:
                        print('old')
                    continue
                if not silent:
                    print('new')
                labels.add(label)
                # Check the limit before counting this post, so exactly
                # `post_limit` posts get examined (not `post_limit - 1`).
                if post_no > post_limit:
                    raise PageEndException()
                post_no += 1
                tries = 0
                try:
                    likelem = post.find_element_by_css_selector(
                        'span[class="_1P1TY coreSpriteHeartSmall"]')
                except NoSuchElementException:
                    continue
                try:
                    cnt = int(likelem.find_element_by_xpath('..').text.strip())
                except ValueError:
                    # The counter is not a plain integer: fall back to a
                    # fixed value of 1000.
                    cnt = 1000
                if cnt <= skip[0]:
                    continue
                start_liker_no = min(max(skip[0], round(cnt * skip[1])), 100)
                if not silent:
                    print(cnt, start_liker_no)
                # Selector of a single row in the list of likers.
                css_selector = 'div[class="                     Igw0E  ' \
                               '   IwRSH        YBx95      vwCYk       ' \
                               '                                       ' \
                               '                                       ' \
                               '                          "]'
                _utils.selenium_open_new_window(driver, label)
                try:
                    time.sleep(3)
                    elem = driver.find_element_by_css_selector(
                        'button[class="sqdOP yWX7d     _8A5w5    "]')
                    try:
                        _utils.selenium_click(driver,
                                              elem=elem,
                                              visible_elem=(By.CSS_SELECTOR,
                                                            css_selector),
                                              max_tries=1)
                    except TimeoutException:
                        time.sleep(60)
                        continue
                    like_no = 0
                    likers_passed = set()
                    while True:
                        likers_passed_ = set()
                        likelems = driver.find_elements_by_css_selector(
                            css_selector)
                        if not silent:
                            print('found {} likers'.format(len(likelems)))
                        if not likelems:
                            break
                        # Stays True when this batch shows no link that
                        # was absent from the previous batch.
                        all_ignored = True
                        for likelem in likelems:
                            like_no += 1
                            elem = likelem.find_element_by_tag_name('a')
                            link = elem.get_attribute('href')
                            name = elem.get_attribute('title')
                            likers_passed_.add(link)
                            if link not in likers_passed:
                                all_ignored = False
                            if link in likers_ignore:
                                continue
                            likers_ignore[link] = 1
                            if not silent:
                                print('like:', link)
                            if like_no >= start_liker_no:
                                try:
                                    # Prefer the text of this element over
                                    # the link title when it is present.
                                    name = \
                                        likelem.find_element_by_css_selector(
                                            'div[class="_7UhW9   xLCgt     '
                                            ' MMzan   _0PwGv         uL8Hv '
                                            '        "]'
                                    ).text
                                except NoSuchElementException:
                                    pass
                                likers[link] = name
                                if not silent:
                                    print('   ', name)
                                if len(likers) >= num_likers:
                                    raise LikersEnoughException()
                        if all_ignored:
                            time.sleep(60)
                            break
                        likers_passed = likers_passed_
                        _utils.selenium_scroll_into_view(driver, likelems[-1])
                finally:
                    # The extra window is closed on every path, including
                    # unexpected errors.
                    _utils.selenium_close_window(driver)
            else:
                # All loaded tiles are known: load more of the page.
                if not silent:
                    print('post #{} is not found'.format(post_no))
                page_len = _utils.selenium_scroll_to_bottom(driver)
                if page_len == prev_page_len:
                    if tries >= 2:
                        raise PageEndException()
                    tries += 1
                else:
                    tries = 0
                prev_page_len = page_len

    except (PageEndException, LikersEnoughException):
        pass
    finally:
        # Shut down a driver we created ourselves even on unexpected errors.
        if need_quit:
            driver.quit()

    if not silent:
        print(likers)
        print(len(likers))
    return list(likers.items())[:num_likers]