Ejemplo n.º 1
0
def crawl(channels_queue,
          min_words=_utils.MIN_CHUNK_WORDS,
          max_words=_utils.MAX_CHUNK_WORDS,
          post_limit=_utils.POST_LIMIT,
          channels_ignore=None,
          authors_ignore=None,
          driver=None,
          cookies=None,
          silent=True):
    """Crawl YouTube channels, yielding Russian comments of a suitable size.

    Channels are popped FIFO from *channels_queue* (OrderedDict:
    url -> name).  For each channel a random video out of its first
    NUM_ITEMS uploads is opened, its comment threads are scanned
    bottom-up for a comment that looks Russian and whose word count lies
    in [min_words, max_words], and the video's related-videos pane is
    mined for new channels to enqueue.

    :param channels_queue: OrderedDict of channel url -> channel name;
        consumed and extended in place
    :param min_words: lower bound for the comment word count
    :param max_words: upper bound for the comment word count
    :param post_limit: accepted for interface compatibility; unused here
    :param channels_ignore: OrderedDict url -> name of already processed
        channels; updated in place (created if ``None``)
    :param authors_ignore: OrderedDict url -> name of already used comment
        authors; updated in place (created if ``None``)
    :param driver: an existing Selenium driver; a new one is created (and
        quit when the queue is exhausted) if ``None``
    :param cookies: accepted for interface compatibility; unused here
    :param silent: suppress progress printing when ``True``
    :yields: ``(text, page, url)`` — the normalized comment text, the
        comment's innerHTML and the current video url.  ``(None, None,
        None)`` (or partially ``None`` values) is yielded whenever a
        channel is skipped, giving the caller a chance to persist the
        updated queue/ignore state after every channel.
    """
    need_quit = False
    if not driver:
        # NOTE(review): `silent` is not forwarded here; confirm whether
        # selenium_init(silent=silent) was intended.
        driver = _utils.selenium_init(silent=False)
        need_quit = True
    if channels_ignore is None:
        channels_ignore = OrderedDict()
    if authors_ignore is None:
        authors_ignore = OrderedDict()
    channel_names_ignore = set(channels_ignore.values())

    class LoopBreakException(Exception):
        """Escape hatch for the nested player-wait retry loop below."""
        pass

    while True:
        if not channels_queue:
            break
        page = text = None
        isrussian = False

        # --- resolve the channel's VIDEOS page -------------------------
        channel = channels_queue.popitem(last=False)
        url = channel[0]
        if not silent:
            print('START', url, end='')
        html = get_url(url).text
        match = re2.search(html)
        if match:
            path = match.group(1)
            page_url = ROOT_URL + path
        else:
            # fall back to the legacy /c/ and /user/ url schemes
            page_url = url.replace('/channel/', '/c/', 1) + '/videos'
            url = url.replace('/channel/', '/user/', 1)
            html = get_url(url).text
            match = re2.search(html)
            if match:
                path = match.group(1)
                page_url = ROOT_URL + path
            else:
                url = None
        if not silent:
            print(' ->', page_url, end='')

        driver.delete_all_cookies()

        # --- pick a random video of the first NUM_ITEMS uploads --------
        driver.get(page_url)
        try:
            channel_url = \
                driver.find_element_by_css_selector('link[rel="canonical"]') \
                      .get_attribute('href')
            channel_name = driver.find_element_by_id('inner-header-container') \
                                 .find_element_by_id('channel-name') \
                                 .find_element_by_id('text').text
        except NoSuchElementException:
            if not silent:
                print(': NOT FOUND')
            yield None, None, None  # possibility to save new status
            continue
        if not silent:
            print()
        if url:
            channels_ignore[url] = channel_name
        channels_ignore[channel_url] = channel_name
        channel_names_ignore.add(channel_name)
        elem = driver.find_element_by_id('contents')
        try:
            elem_items = elem.find_element_by_id('items')
        except NoSuchElementException:
            yield None, None, None  # possibility to save new status
            continue
        elem_cont = elem.find_element_by_id('continuations')
        items, num_items = None, 0
        while True:
            items = elem_items.find_elements_by_xpath(
                './ytd-grid-video-renderer')
            num_items_ = len(items)
            if num_items_ == num_items:
                break
            num_items = num_items_
            if num_items >= NUM_ITEMS:
                break
            # scroll to trigger lazy loading of the next batch of videos
            _utils.selenium_scroll_into_view(driver, elem_cont)
        # abandon this channel if it has no videos at all
        # (and do not scan related videos because they are probably the same)
        if not items:
            yield None, None, None  # possibility to save new status
            continue
        item = items[random.randint(0, num_items - 1)]
        elem = item.find_element_by_id('thumbnail')
        url = elem.get_attribute('href')
        driver.get(url)

        # --- wait for the player (or an error screen) and remove it ----
        tries_ = 0
        try:
            while True:
                try:
                    _utils.selenium_click(driver,
                                          visible_elem=(By.ID, 'ytd-player'),
                                          max_tries=3)
                    # the player is not needed for comment scraping;
                    # removing it presumably saves resources
                    _utils.selenium_remove(
                        driver, driver.find_element_by_id('ytd-player'))
                    raise LoopBreakException()
                except TimeoutException:
                    try:
                        _utils.selenium_click(driver,
                                              visible_elem=(By.ID,
                                                            'error-screen'),
                                              max_tries=1)
                        # error screen shown -> treat video as unavailable
                        raise NoSuchElementException()
                    except TimeoutException:
                        if tries_ % 10 == 0:
                            print("Can't find a video")
                        tries_ += 1
        except NoSuchElementException:
            yield None, None, None  # possibility to save new status
            continue
        except LoopBreakException:
            pass

        # --- scan comments for a Russian one within the size limits ----
        try:
            _utils.selenium_click(driver,
                                  visible_elem=(By.ID, 'sections'),
                                  max_tries=10)
        except TimeoutException:
            yield None, None, None  # possibility to save new status
            continue
        elem = driver.find_element_by_id('sections')
        elem_items = elem.find_element_by_id('contents')
        elem_cont = elem.find_element_by_id('continuations')
        items, num_items = None, 0

        def notactive(elem):
            # the continuation spinner is done once its `active`
            # attribute is unset
            return elem.get_attribute('active') is None

        while True:
            _utils.selenium_scroll_into_view(driver, elem_cont)
            WebDriverWait(elem_cont, 10).until(notactive)
            items = elem_items.find_elements_by_xpath(
                './ytd-comment-thread-renderer')
            num_items_ = len(items)
            if num_items_ == num_items:
                break
            num_items = num_items_
            if num_items >= NUM_ITEMS:
                break
        if not items:
            yield None, None, None  # possibility to save new status
            continue
        _utils.selenium_scroll_to_top(driver)
        for item in reversed(items):
            try:
                # only consider comments carrying the author badge
                # (absence raises and skips the comment)
                elem = item.find_element_by_id('author-comment-badge')
            except NoSuchElementException:
                continue
            elem = item.find_element_by_id('author-text')
            author_href = elem.get_attribute('href')
            author_name = elem.get_attribute('text').strip()
            if author_href in authors_ignore:
                continue
            text_elem = item.find_element_by_id('content-text')
            text = text_elem.text
            text = utils.norm_text2(text)
            if not silent:
                print(text)
            # language heuristic: treat as Russian when >= 90% of the
            # re0-stripped text survives the re1 filter (regexes are
            # defined elsewhere in this module)
            text0 = re0.sub('', text)
            text1 = re1.sub('', text0)
            if text0 and len(text1) / len(text0) >= .9:
                num_words = len(
                    [x for x in re4.sub('', text).split() if re5.sub('', x)])
                isrussian = True
                if not silent:
                    print('<russian>')
                    print(num_words)
                if min_words <= num_words <= max_words:
                    page = text_elem.get_attribute('innerHTML')
                    authors_ignore[author_href] = author_name
                    break
            elif not silent:
                print('<foreign>')
            text = None
        if not isrussian:
            yield None, None, None  # possibility to save new status
            continue

        # --- mine related videos for new channels ----------------------
        elem = driver.find_element_by_id('secondary-inner')
        elem_cont = elem.find_element_by_id('continuations')
        elem_items = elem.find_element_by_id('items')
        items, num_items = [], 0
        while True:
            items = elem_items.find_elements_by_xpath(
                './/ytd-compact-video-renderer')
            num_items_ = len(items)
            if num_items_ == num_items:
                break
            num_items = num_items_
            if num_items >= NUM_ITEMS:
                break
            try:
                # a missing continuation renderer means nothing more
                # can be loaded (the lookup itself is the test)
                elem_items.find_element_by_xpath(
                    '..//ytd-continuation-item-renderer')
            except NoSuchElementException:
                break
            _utils.selenium_scroll_into_view(driver, elem_cont)
        if not silent:
            print('found', num_items, 'relative videos')
        for item in items:
            # FIX: compare the channel name *text* with the set of known
            # names -- the original compared the WebElement object itself,
            # so the membership test could never succeed
            channel_name = item.find_element_by_id('channel-name') \
                               .find_element_by_id('text').text
            if channel_name in channel_names_ignore:
                continue
            elem = item.find_element_by_id('thumbnail')
            url = elem.get_attribute('href')
            html = get_url(url).text
            match = re3.search(html)
            if not match:
                continue
            channel_url, channel_name = match.groups()
            channel_url = channel_url.replace('http:', 'https:')
            if not silent:
                print('   ', channel_url, channel_name, end='')
            if channel_url not in channels_ignore:
                channels_queue[channel_url] = channel_name
                channel_names_ignore.add(channel_name)
            elif not silent:
                print(': IGNORED', end='')
            if not silent:
                print()

        # return the found comment (text/page may be None if only
        # wrongly-sized Russian comments were seen)
        yield text, page, driver.current_url

    if need_quit:
        driver.quit()
Ejemplo n.º 2
0
             x.split('\t') for x in f.read().split('\n') if x)
 else:
     channels_queue = OrderedDict(x.split('\t') for x in links)
 if os.path.isfile(CHANNELS_IGNORE_FN):
     with open(CHANNELS_IGNORE_FN, 'rt', encoding='utf-8') as f:
         channels_ignore = OrderedDict(
             x.split('\t') for x in f.read().split('\n') if x)
 else:
     channels_ignore = OrderedDict()
 if os.path.isfile(AUTHORS_IGNORE_FN):
     with open(AUTHORS_IGNORE_FN, 'rt', encoding='utf-8') as f:
         authors_ignore = OrderedDict(
             x.split('\t') for x in f.read().split('\n') if x)
 else:
     authors_ignore = OrderedDict()
 driver = _utils.selenium_init(silent=False)
 for text, page, link in _youtube.crawl(channels_queue,
                                        min_words=_utils.MIN_CHUNK_WORDS,
                                        max_words=_utils.MAX_CHUNK_WORDS,
                                        post_limit=_utils.POST_LIMIT,
                                        channels_ignore=channels_ignore,
                                        authors_ignore=authors_ignore,
                                        driver=driver,
                                        silent=True):
     if page:
         text = parse_page(page)
         texts_total += 1
         page_fn = utils.get_data_path(utils.PAGES_DIR, MAX_FILES,
                                       texts_total)
         text_fn = utils.get_data_path(utils.TEXTS_DIR, MAX_FILES,
                                       texts_total)
Ejemplo n.º 3
0
def init(cookies=None, silent=False):
    """Initialise and return a Selenium driver for crawling.

    *cookies* is accepted for interface compatibility but is unused
    while the login step below stays disabled.
    """
    drv = _utils.selenium_init(silent=silent)
    #login(drv, LOGIN, PASSWORD, cookies)
    return drv
Ejemplo n.º 4
0
# Make runs reproducible when a seed is configured.
if SEED:
    random.seed(SEED)
'''===========================================================================
Links download
==========================================================================='''
# Resume the previously downloaded link list when it is present on disk;
# otherwise start from scratch.
if os.path.isfile(utils.LINKS_FN):
    with open(utils.LINKS_FN, 'rt', encoding='utf-8') as f:
        #TIMESTAMP = f.readline().strip()
        links = [line for line in f.read().split('\n') if line]
else:
    #TIMESTAMP = int(time.time())
    links = []

if len(links) < MAX_LINKS:
    driver = _utils.selenium_init(silent=True)
    links = OrderedDict({x: 1 for x in links})
    if os.path.isfile(_utils.AUTHORS_IGNORE_FN):
        with open(_utils.AUTHORS_IGNORE_FN, 'rt', encoding='utf-8') as f:
            authors_ignore = set(x for x in f.read().split('\n') if x)
    else:
        authors_ignore = set()
    need_break = False
    while True:
        for page_no in itertools.count(1):
            #url = INIT_URL.format(TIMESTAMP, page_no)
            url = INIT_URL.format('', page_no)
            if not SILENT:
                print(url)
            driver.get(url)
            res = driver.page_source
Ejemplo n.º 5
0
'''===========================================================================
Links download
==========================================================================='''
# Resume state from a previous run: the links file may start with a
# "TIMESTAMP:OFFSET" header line (recognized as any first line that does
# not look like a URL), followed by one link per line.
if os.path.isfile(utils.LINKS_FN):
    with open(utils.LINKS_FN, 'rt', encoding='utf-8') as f:
        links = [x for x in f.read().split('\n') if x]
        # FIX: guard against an empty file -- `links[0]` raised
        # IndexError when the file existed but contained no lines.
        # NOTE(review): when the file has no header line, TIMESTAMP and
        # OFFSET remain unset here -- confirm they are defined earlier
        # in this script before first use.
        if links and not links[0].startswith('http'):
            TIMESTAMP, OFFSET = links[0].split(':')
            OFFSET = int(OFFSET)
            links = links[1:]
else:
    TIMESTAMP, OFFSET = int(time.time()), 0
    links = []

if len(links) < MAX_LINKS:
    driver = _utils.selenium_init()
    links = OrderedDict({x: 1 for x in links})
    no_items = False
    while True:
        url = INIT_URL.format('', TIMESTAMP, OFFSET)
        if not SILENT:
            print(url)
        driver.get(url)
        res = driver.page_source
        if DUMP:
            with open('1111.html', 'wt', encoding='utf-8') as f:
                f.write(res)
        pos = res.find('{')
        assert pos >= 0, 'ERROR: No json start on page "{}"!'.format(url)
        res = res[pos:]
        pos = res.rfind('}')