Example #1
def videos_action(namespace, output_file):

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=REPORT_HEADERS,
                                 keep=namespace.select)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    def rows_with_videos_id():

        for row, ytb_data in enricher.cells(namespace.column, with_rows=True):
            video_id = None

            if is_youtube_video_id(ytb_data):
                video_id = ytb_data
            elif is_youtube_url(ytb_data):
                video_id = extract_video_id_from_youtube_url(ytb_data)

            yield row, video_id

    for chunk in chunks_iter(rows_with_videos_id(), 50):

        all_ids = [video_id for _, video_id in chunk if video_id]
        list_id = ",".join(all_ids)

        url = URL_TEMPLATE % {'list_id': list_id, 'key': namespace.key}
        err, response, result = request_json(http, url)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        data = get_data(result)

        id_available = set(data)
        not_available = set(all_ids).difference(id_available)

        loading_bar.update(len(chunk))

        for row, video_id in chunk:
            if video_id is None or video_id in not_available:
                enricher.writerow(row)
            else:
                enricher.writerow(row, data[video_id])
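
The helper chunks_iter used above is imported from elsewhere in minet and not shown. It is only expected to group the (row, video_id) pairs into batches of at most 50, since the YouTube videos endpoint accepts up to 50 ids per call. A minimal, hypothetical stand-in (not minet's actual implementation) could look like this:

from itertools import islice

def chunks_iter(iterable, chunk_size):
    # Yield successive lists of at most `chunk_size` items from `iterable`.
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, chunk_size))
        if not chunk:
            return
        yield chunk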
Example #2
def videos_action(namespace, output_file):

    enricher = CSVEnricher(
        namespace.file,
        namespace.column,
        output_file,
        report_headers=REPORT_HEADERS,
        select=namespace.select.split(',') if namespace.select else None
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    for chunk in gen_chunks(enricher):

        all_ids = [row[0] for row in chunk if row[0]]
        list_id = ",".join(all_ids)

        url = URL_TEMPLATE % {'list_id': list_id, 'key': namespace.key}
        err, response, result = request_json(http, url)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        data = get_data(result)

        id_available = set(data)
        not_available = set(all_ids).difference(id_available)

        loading_bar.update(len(chunk))

        for video_id, line in chunk:

            if video_id is None:
                enricher.write_empty(line)

            elif video_id in not_available:
                line_empty = [video_id] + [''] * (len(REPORT_HEADERS) - 1)
                enricher.write(line, line_empty)

            else:
                enricher.write(line, data[video_id])
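
Both examples back off with seconds_to_midnight_pacific_time() on a 403 response because YouTube's daily API quota resets at midnight, Pacific time. The helper is defined elsewhere in the project; a rough, self-contained sketch of the idea, assuming Python 3.9+ for zoneinfo:

from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

def seconds_to_midnight_pacific_time():
    # Seconds left before the next midnight in US Pacific time,
    # i.e. before the YouTube API quota resets.
    now = datetime.now(ZoneInfo('America/Los_Angeles'))
    midnight = (now + timedelta(days=1)).replace(hour=0, minute=0,
                                                 second=0, microsecond=0)
    return (midnight - now).total_seconds()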
Example #3
def search_action(namespace, output_file):

    # Handling output
    output_file = open_output_file(namespace.output)

    edit_namespace_with_csv_io(namespace, 'keyword')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CSV_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit='videos',
    )
    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)
    limit = namespace.limit

    for (row, keyword) in enricher.cells(namespace.column, with_rows=True):
        url = URL_template_accurate % {'subject': keyword, 'key': namespace.key}
        next_page = True
        while next_page:
            if next_page is True:
                err, response, result = request_json(http, url)
            else:
                url_next = url + '&pageToken=' + next_page
                err, response, result = request_json(http, url_next)
            if err:
                die(err)
            elif response.status == 403:
                error_file.write('Running out of API points. You will have to wait until midnight, Pacific time!')
                time.sleep(seconds_to_midnight_pacific_time())
                continue
            elif response.status >= 400:
                die(response.status)
            next_page, data_l = get_data(result)
            for data in data_l:
                if limit is not None:
                    if limit == 0:
                        return True
                    limit -= 1

                loading_bar.update()
                enricher.writerow(row, data)
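
search_action and both comments_action variants rely on the same pageToken loop: request a page, read nextPageToken from the payload, then request again with &pageToken= appended until no token is returned. A generic, hypothetical helper expressing that pattern (fetch stands in for request_json plus its error handling):

def paginate_json(fetch, base_url):
    # `fetch(url)` is any callable returning the decoded JSON payload.
    token = None
    while True:
        url = base_url if token is None else base_url + '&pageToken=' + token
        result = fetch(url)
        yield result
        token = result.get('nextPageToken')
        if not token:
            break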
Example #4
def comments_action(namespace, output_file):

    output_file = open_output_file(namespace.output)

    writer = csv.writer(output_file)
    writer.writerow(CSV_HEADERS)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()

    url = URL_TEMPLATE % {'id': namespace.id, 'key': namespace.key}
    next_page = True

    while next_page:

        if next_page is True:
            err, response, result = request_json(http, url)
        else:
            url_next = url + '&pageToken=' + next_page
            err, response, result = request_json(http, url_next)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        next_page, data = get_data(result)

        for comment in data:
            loading_bar.update()
            writer.writerow(comment)
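
URL_TEMPLATE is defined at module level and not reproduced here. Judging from the parameters it interpolates, it presumably targets the YouTube Data API v3 commentThreads endpoint; a plausible value, given here only as an assumption and not as the module's actual constant:

# Assumed shape of the constant used above (not copied from the source module)
URL_TEMPLATE = (
    'https://www.googleapis.com/youtube/v3/commentThreads'
    '?videoId=%(id)s&key=%(key)s&part=snippet,replies&maxResults=100'
)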
Example #5
def comments_action(namespace, output_file):

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input
    if is_youtube_video_id(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_id')
    elif is_youtube_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_url')

    # Enricher
    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 keep=namespace.select,
                                 add=CSV_HEADERS)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)

    def make_requests(current_url, http=http):
        return (request_json(http, current_url), current_url)

    for (row, url_id) in enricher.cells(namespace.column, with_rows=True):

        # Building the API url from either a full YouTube url or a raw video id
        if is_youtube_url(url_id):
            yt_id = extract_video_id_from_youtube_url(url_id)
            if not yt_id:
                continue
            url = URL_TEMPLATE % {'id': yt_id, 'key': namespace.key}
        elif is_youtube_video_id(url_id):
            url = URL_TEMPLATE % {'id': url_id, 'key': namespace.key}
        else:
            continue
        url_queue = deque([url])
        while url_queue:
            with ThreadPoolExecutor(max_workers=25) as executor:
                time.sleep(0.01)
                couche = executor.map(make_requests, url_queue)
            url_queue = deque()
            for resp in couche:
                ((err, response, result), current_url) = resp
                if err:
                    error_file.write('{} for {}'.format(err, current_url))
                    continue
                elif response.status == 403 and result.get('error').get(
                        'errors')[0].get('reason') == 'commentsDisabled':
                    error_file.write(
                        'Comments are disabled for {}'.format(current_url))
                    continue
                elif response.status == 403:
                    error_file.write(
                        'Running out of API points. You will have to wait until midnight, Pacific time!'
                    )
                    time.sleep(seconds_to_midnight_pacific_time())
                    continue
                elif response.status >= 400:
                    error_file.write('Error {} for {}'.format(
                        response.status, current_url))
                    continue
                kind = result.get('kind', None)
                next_page = result.get('nextPageToken', None)
                if next_page:
                    url_next = current_url + '&pageToken=' + next_page
                    url_queue.append(url_next)
                if kind == 'youtube#commentThreadListResponse':
                    # Handling comments pagination
                    items = result.get('items', [])
                    for item in items:
                        snippet = item['snippet']
                        replies = item.get('replies')
                        if replies:
                            # Checking whether YouTube's API sent only a subset of the replies
                            if snippet['totalReplyCount'] != len(
                                    replies['comments']) and namespace.full:
                                # If we want all replies and the API did not return them all, we queue
                                # the URL specific to this topLevelComment and handle the topLevelComment itself now
                                new_url = URL_PARENTID_TEMPLATE % {
                                    'id': snippet['topLevelComment']['id'],
                                    'key': namespace.key
                                }
                                url_queue.append(new_url)
                                data = get_data_full(snippet, True)
                                enricher.writerow(row, data)
                            else:
                                dataTop = get_data_full(snippet, True)
                                enricher.writerow(row, dataTop)
                                for rep in replies['comments']:
                                    enricher.writerow(
                                        row, get_data_full(rep, False))
                        else:
                            # If there is no 'replies' key, the comment we fetched is only a topLevelComment
                            top_comment = get_data_full(snippet, True)
                            enricher.writerow(row, top_comment)
                else:
                    # Handling commentList: nothing special here, dealing with comments one by one
                    items = result.get('items', [])
                    for item in items:
                        data = get_data_full(item, False)
                        enricher.writerow(row, data)
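
The last example drains url_queue one layer at a time: every queued URL is fetched in parallel through a thread pool, and follow-up URLs (extra comment pages, parent-comment lookups) are collected into a fresh queue for the next round. A stripped-down, self-contained sketch of that breadth-first pattern, with a stand-in fetch callable instead of make_requests:

from collections import deque
from concurrent.futures import ThreadPoolExecutor

def crawl_in_layers(start_urls, fetch, max_workers=25):
    # Fetch every queued URL in parallel, collect follow-up URLs,
    # then repeat until the queue is empty.
    queue = deque(start_urls)
    while queue:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(executor.map(fetch, queue))
        queue = deque()
        for result in results:
            yield result
            follow_up = result.get('next_url')  # stand-in for the nextPageToken handling
            if follow_up:
                queue.append(follow_up)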