Example #1
def url_extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=REPORT_HEADERS,
        keep=namespace.select.split(',') if namespace.select else None)

    extract = EXTRACTORS[getattr(namespace, 'from')]

    loading_bar = tqdm(desc='Extracting',
                       dynamic_ncols=True,
                       unit=' rows',
                       total=namespace.total)

    for row, content in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        content = content.strip()

        if not content:
            continue

        for url in extract(content):
            if namespace.base_url is not None:
                url = urljoin(namespace.base_url, url)

            enricher.writerow(row, [url])

    output_file.close()
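
A minimal, self-contained sketch of the casanova enricher pattern used above: read the input CSV, append a single extra 'url' column and write one output row per extracted URL. The file paths and the 'text' column are hypothetical, and ural's urls_from_text is assumed here as a stand-in for one of the EXTRACTORS.

import casanova
from urllib.parse import urljoin
from ural import urls_from_text  # assumed stand-in for one of the EXTRACTORS

# Hypothetical input/output paths and column name
with open('pages.csv') as input_file, open('report.csv', 'w') as output_file:
    enricher = casanova.enricher(input_file, output_file, add=['url'])

    for row, content in enricher.cells('text', with_rows=True):
        content = content.strip()

        if not content:
            continue

        for url in urls_from_text(content):
            # Resolve relative urls against an assumed base, as --base-url does above
            enricher.writerow(row, [urljoin('https://example.com/', url)])
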
Example #2
def extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(namespace.report,
                                 output_file,
                                 keep=namespace.select,
                                 add=OUTPUT_ADDITIONAL_HEADERS)

    loading_bar = tqdm(desc='Extracting content',
                       total=namespace.total,
                       dynamic_ncols=True,
                       unit=' docs')

    files = create_report_iterator(namespace,
                                   enricher,
                                   loading_bar=loading_bar)

    with Pool(namespace.processes) as pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, [report_error(error)] + PADDING)
                continue

            if result is None:
                enricher.writerow(row, ['no-content'] + PADDING)
                continue

            enricher.writerow(row, result)

    output_file.close()
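
The (error, row, result) convention used with imap_unordered above can be sketched with the standard library alone. The worker below and its fake rows are hypothetical; they only illustrate how errors, missing content and successful extractions travel back to the parent process.

from multiprocessing import Pool

PADDING = ['']  # same idea as above: pad the addendum when there is nothing extracted

def worker(payload):
    # Hypothetical worker: takes (row, raw_text), returns (error, row, result)
    row, raw_text = payload
    try:
        if not raw_text:
            return None, row, None                 # nothing to extract
        return None, row, ['', raw_text.upper()]   # (no error message, "extracted" text)
    except Exception as e:
        return e, row, None

if __name__ == '__main__':
    files = [(['doc-1'], 'hello world'), (['doc-2'], '')]

    with Pool(2) as pool:
        for error, row, result in pool.imap_unordered(worker, files):
            if error is not None:
                print(row + [str(error)] + PADDING)
            elif result is None:
                print(row + ['no-content'] + PADDING)
            else:
                print(row + result)
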
Example #3
def google_sheets_action(namespace):
    output_file = open_output_file(namespace.output, flag='w')

    try:
        data = export_google_sheets_as_csv(namespace.url,
                                           cookie=namespace.cookie,
                                           authuser=namespace.authuser)
    except GoogleSheetsInvalidTargetError:
        die('Could not extract a valid google sheets id from provided argument!')
    except BrowserCookieError:
        die('Could not extract cookie from %s!' % namespace.cookie)
    except GoogleSheetsMissingCookieError:
        die('Did not find a relevant cookie!')
    except GoogleSheetsInvalidContentTypeError:
        die('Could not export spreadsheet as CSV!')
    except GoogleSheetsNotFoundError:
        die('Could not find spreadsheet (404)!')
    except GoogleSheetsUnauthorizedError:
        die('You don\'t have access to this spreadsheet. Did you forget to set --cookie?')
    except GoogleSheetsMaxAttemptsExceeded:
        die('Maximum number of attempts exceeded! You can still set --authuser if you are logged into several Google accounts at once.')

    output_file.write(data)
    output_file.close()
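
Every example in this collection relies on a small die helper from minet's CLI utilities. Its implementation is not shown here, but judging from the calls above it accepts a single string, a list of lines or nothing at all. A plausible sketch (an assumption, not minet's actual code):

import sys

def die(msg=None):
    # Assumed behaviour inferred from the calls above: print one or several
    # lines to stderr and exit with a non-zero status
    if msg is not None:
        if isinstance(msg, str):
            msg = [msg]
        for line in msg:
            print(line, file=sys.stderr)
    sys.exit(1)
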
Example #4
def youtube_action(namespace):

    output_file = open_output_file(namespace.output, flag='w')

    if namespace.yt_action == 'videos':
        check_key(namespace)

        from minet.cli.youtube.videos import videos_action
        videos_action(namespace, output_file)

    elif namespace.yt_action == 'comments':
        check_key(namespace)

        from minet.cli.youtube.comments import comments_action
        comments_action(namespace, output_file)

    elif namespace.yt_action == 'captions':
        from minet.cli.youtube.captions import captions_action
        captions_action(namespace, output_file)

    elif namespace.yt_action == 'search':
        check_key(namespace)

        from minet.cli.youtube.search import search_action
        search_action(namespace, output_file)

    if namespace.output is not None:
        output_file.close()
Example #5
def twitter_action(namespace):

    # Credentials are required to be able to access the API
    if not namespace.api_key or \
       not namespace.api_secret_key or \
       not namespace.access_token or \
       not namespace.access_token_secret:
        die([
            'Full credentials are required to access Twitter API.',
            'You can provide them using various CLI arguments:',
            '    --api-key',
            '    --api-secret-key',
            '    --access-token',
            '    --access-token-secret'
        ])

    output_file = open_output_file(
        namespace.output,
        flag='a+' if getattr(namespace, 'resume', False) else 'w')

    if namespace.tw_action == 'friends':
        from minet.cli.twitter.friends import twitter_friends_action

        twitter_friends_action(namespace, output_file)

    # Cleanup
    if namespace.output is not None:
        output_file.close()
Example #6
def url_join_action(namespace):
    right_reader = casanova.reader(namespace.file2)
    left_reader = casanova.reader(namespace.file1)

    output_file = open_output_file(namespace.output)
    output_writer = csv.writer(output_file)

    left_headers = left_reader.fieldnames
    left_indices = None

    if namespace.select is not None:
        selected = namespace.select.split(',')
        left_headers = [h for h in left_headers if h in selected]
        left_indices = collect_column_indices(left_reader.pos, left_headers)

    empty = [''] * len(left_headers)

    output_writer.writerow(right_reader.fieldnames + left_headers)

    loading_bar = tqdm(desc='Indexing left file',
                       dynamic_ncols=True,
                       unit=' lines')

    # First step is to index left file
    trie = NormalizedLRUTrie(strip_trailing_slash=True)

    for row, url in left_reader.cells(namespace.column1, with_rows=True):
        url = url.strip()

        if left_indices is not None:
            row = [row[i] for i in left_indices]

        trie.set(url, row)

        loading_bar.update()

    loading_bar.close()

    loading_bar = tqdm(desc='Matching right file',
                       dynamic_ncols=True,
                       unit=' lines')

    for row, url in right_reader.cells(namespace.column2, with_rows=True):
        url = url.strip()

        match = None

        if url:
            match = trie.match(url)

        loading_bar.update()

        if match is None:
            output_writer.writerow(row + empty)
            continue

        row.extend(match)
        output_writer.writerow(row)

    output_file.close()
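
The join above relies on a NormalizedLRUTrie so the right file's urls can be matched by longest prefix against the indexed left file. A deliberately weaker, exact-match sketch of the same two-pass join can be written with the standard library only; file names and column names below are hypothetical.

import csv

# Hypothetical file names and column names; an exact-match dict replaces the
# longest-prefix NormalizedLRUTrie used above
index = {}

with open('left.csv') as f:
    for row in csv.DictReader(f):
        index[row['url'].strip().rstrip('/')] = row['webentity']

with open('right.csv') as f, open('joined.csv', 'w', newline='') as out:
    reader = csv.DictReader(f)
    writer = csv.writer(out)
    writer.writerow(reader.fieldnames + ['webentity'])

    for row in reader:
        match = index.get(row['url'].strip().rstrip('/'), '')
        writer.writerow(list(row.values()) + [match])
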
Example #7
def twitter_action(namespace):

    output_file = open_output_file(
        namespace.output,
        flag='a+' if getattr(namespace, 'resume', False) else 'w'
    )

    if namespace.tw_action == 'scrape':
        from minet.cli.twitter.scrape import twitter_scrape_action

        twitter_scrape_action(namespace, output_file)

    else:
        check_credentials(namespace)

        if namespace.tw_action == 'friends':
            from minet.cli.twitter.friends import twitter_friends_action

            twitter_friends_action(namespace, output_file)

        elif namespace.tw_action == 'followers':
            from minet.cli.twitter.followers import twitter_followers_action

            twitter_followers_action(namespace, output_file)

        elif namespace.tw_action == 'users':
            from minet.cli.twitter.users import twitter_users_action

            twitter_users_action(namespace, output_file)

    # Cleanup
    if namespace.output is not None:
        output_file.close()
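
The if/elif chains dispatching tw_action here (and ct_action, yt_action, mc_action elsewhere) are equivalent to a small dispatch table of lazily imported callables. A sketch, reusing the module paths shown in the example above:

from importlib import import_module

TWITTER_SUBCOMMANDS = {
    'friends': ('minet.cli.twitter.friends', 'twitter_friends_action'),
    'followers': ('minet.cli.twitter.followers', 'twitter_followers_action'),
    'users': ('minet.cli.twitter.users', 'twitter_users_action'),
}

def dispatch_twitter_action(namespace, output_file):
    # Lazy import, like the inline imports in the branches above
    module_path, name = TWITTER_SUBCOMMANDS[namespace.tw_action]
    action = getattr(import_module(module_path), name)
    return action(namespace, output_file)
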
Example #8
def facebook_comments_action(namespace):

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'post_url')

    try:
        scraper = FacebookCommentScraper(namespace.cookie)
    except FacebookInvalidCookieError:
        if namespace.cookie in ['firefox', 'chrome']:
            die('Could not extract cookies from %s.' % namespace.cookie)

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook post comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 keep=namespace.select,
                                 add=FACEBOOK_COMMENT_CSV_HEADERS)

    # Loading bar
    loading_bar = tqdm(desc='Scraping comments',
                       dynamic_ncols=True,
                       unit=' comments')

    for i, (row, url) in enumerate(enricher.cells(namespace.column, with_rows=True)):

        if not is_facebook_post_url(url):
            loading_bar.close()
            die('Given url (line %i) is not a Facebook post url: %s' %
                (i + 1, url))

        batches = scraper(url, per_call=True, detailed=True, format='csv_row')

        for details, batch in batches:
            for comment in batch:
                enricher.writerow(row, comment)

            loading_bar.update(len(batch))
            loading_bar.set_postfix(calls=details['calls'],
                                    replies=details['replies'],
                                    q=details['queue_size'],
                                    posts=i + 1)

    loading_bar.close()
Example #9
def url_parse_action(namespace):

    output_file = open_output_file(namespace.output)

    headers = REPORT_HEADERS

    if namespace.facebook:
        headers = FACEBOOK_REPORT_HEADERS
    elif namespace.youtube:
        headers = YOUTUBE_REPORT_HEADERS

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=headers,
        keep=namespace.select
    )

    loading_bar = tqdm(
        desc='Parsing',
        dynamic_ncols=True,
        unit=' rows',
        total=namespace.total
    )

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        loading_bar.update()

        if namespace.separator:
            urls = url.split(namespace.separator)
        else:
            urls = [url]

        for url in urls:
            if not is_url(url, allow_spaces_in_path=True, require_protocol=False):
                enricher.writerow(row)
                continue

            if namespace.facebook:
                addendum = extract_facebook_addendum(url)
            elif namespace.youtube:
                addendum = extract_youtube_addendum(url)
            else:
                addendum = extract_standard_addendum(namespace, url)

            if addendum is None:
                enricher.writerow(row)
                continue

            enricher.writerow(row, addendum)

    output_file.close()
Example #10
def extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.report,
        output_file,
        keep=namespace.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Extracting content',
        total=namespace.total,
        unit='doc'
    )

    def on_irrelevant_row(reason, row):
        loading_bar.update()
        enricher.writerow(row, format_error(reason))

    try:
        files = create_report_iterator(
            namespace,
            enricher,
            on_irrelevant_row=on_irrelevant_row
        )
    except NotADirectoryError:
        loading_bar.die([
            'Could not find the "%s" directory!' % namespace.input_dir,
            'Did you forget to specify it with -i/--input-dir?'
        ])

    pool = LazyPool(namespace.processes)

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, format_error(report_error(error)))
                continue

            if result is None:
                enricher.writerow(row, format_error('no-content'))
                continue

            enricher.writerow(row, result)

    loading_bar.close()
    output_file.close()
Example #11
def crowdtangle_action(namespace):

    # A token is needed to be able to access the API
    if not namespace.token:
        die([
            "A token is needed to be able to access CrowdTangle's API.",
            "You can provide one using the `--token` argument.",
        ])

    output_file = open_output_file(
        namespace.output,
        flag="a+" if getattr(namespace, "resume", False) else "w")

    if namespace.ct_action == "posts":
        from minet.cli.crowdtangle.posts import crowdtangle_posts_action

        crowdtangle_posts_action(namespace, output_file)

    elif namespace.ct_action == "posts-by-id":
        from minet.cli.crowdtangle.posts_by_id import crowdtangle_posts_by_id_action

        crowdtangle_posts_by_id_action(namespace, output_file)

    elif namespace.ct_action == "lists":
        from minet.cli.crowdtangle.lists import crowdtangle_lists_action

        crowdtangle_lists_action(namespace, output_file)

    elif namespace.ct_action == "leaderboard":
        from minet.cli.crowdtangle.leaderboard import crowdtangle_leaderboard_action

        crowdtangle_leaderboard_action(namespace, output_file)

    elif namespace.ct_action == "search":
        from minet.cli.crowdtangle.search import crowdtangle_search_action

        crowdtangle_search_action(namespace, output_file)

    elif namespace.ct_action == "summary":
        from minet.cli.crowdtangle.summary import crowdtangle_summary_action

        crowdtangle_summary_action(namespace, output_file)

    elif namespace.ct_action == "links":
        from crowdtangle.links import crowdtangle_links_action

        crowdtangle_links_action(namespace, output_file)

    # Cleanup
    if namespace.output is not None:
        output_file.close()
Example #12
def search_action(namespace, output_file):

    # Handling output
    output_file = open_output_file(namespace.output)

    edit_namespace_with_csv_io(namespace, 'keyword')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CSV_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit='videos',
    )
    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)
    limit = namespace.limit

    for (row, keyword) in enricher.cells(namespace.column, with_rows=True):
        url = URL_template_accurate % {'subject': keyword, 'key': namespace.key}
        next_page = True
        while next_page:
            if next_page is True:
                err, response, result = request_json(http, url)
            else:
                url_next = url + '&pageToken=' + next_page
                err, response, result = request_json(http, url_next)
            if err:
                die(err)
            elif response.status == 403:
                error_file.write('Running out of API points. You will have to wait until midnight, Pacific time!')
                time.sleep(seconds_to_midnight_pacific_time())
                continue
            elif response.status >= 400:
                die(response.status)
            next_page, data_l = get_data(result)
            for data in data_l:
                if limit is not None:
                    if limit == 0:
                        return True
                    else:
                        limit -= 1
                        enricher.writerow(row, data)
                else:
                    enricher.writerow(row, data)
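
The pageToken loop above is the standard YouTube API pagination pattern. A generic sketch of it as a generator, where fetch_json(url) is a hypothetical callable standing in for request_json(http, url) and the 'items'/'nextPageToken' keys follow the API's response shape:

from itertools import islice

def paginate(fetch_json, base_url):
    # Yield items page by page until the API stops returning a nextPageToken
    next_page = True
    while next_page:
        url = base_url if next_page is True else base_url + '&pageToken=' + next_page
        result = fetch_json(url)
        yield from result.get('items', [])
        next_page = result.get('nextPageToken')

# Capping the number of results, instead of the manual `limit` bookkeeping above:
# for item in islice(paginate(fetch_json, url), namespace.limit): ...
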
Example #13
def scrape_action(namespace):

    output_file = open_output_file(namespace.output)

    # Parsing scraper definition
    try:
        scraper = load_definition(namespace.scraper)
    except TypeError:
        die(['Unknown scraper format.', 'Expecting a JSON or YAML file.'])
    except:
        die('Invalid scraper file.')

    if namespace.format == 'csv':
        output_headers = headers_from_definition(scraper)
        output_writer = csv.DictWriter(output_file, fieldnames=output_headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(output_file)

    loading_bar = tqdm(desc='Scraping pages',
                       total=namespace.total,
                       dynamic_ncols=True,
                       unit=' pages')

    loading_bar.set_postfix(p=namespace.processes)

    if namespace.glob is not None:
        files = create_glob_iterator(namespace, scraper)
    else:
        reader = casanova.reader(namespace.report)
        files = create_report_iterator(namespace, reader, scraper, loading_bar)

    with Pool(namespace.processes) as pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            if not isinstance(items, list):
                items = [items]

            for item in items:
                if not isinstance(item, dict):
                    item = {'value': item}

                output_writer.writerow(item)

    output_file.close()
Example #14
def crowdtangle_action(namespace):

    # A token is needed to be able to access the API
    if not namespace.token:
        die([
            'A token is needed to be able to access CrowdTangle\'s API.',
            'You can provide one using the `--token` argument.'
        ])

    output_file = open_output_file(
        namespace.output,
        flag='a+' if getattr(namespace, 'resume', False) else 'w')

    if namespace.ct_action == 'posts':
        from minet.cli.crowdtangle.posts import crowdtangle_posts_action

        crowdtangle_posts_action(namespace, output_file)

    elif namespace.ct_action == 'posts-by-id':
        from minet.cli.crowdtangle.posts_by_id import crowdtangle_posts_by_id_action

        crowdtangle_posts_by_id_action(namespace, output_file)

    elif namespace.ct_action == 'lists':
        from minet.cli.crowdtangle.lists import crowdtangle_lists_action

        crowdtangle_lists_action(namespace, output_file)

    elif namespace.ct_action == 'leaderboard':
        from minet.cli.crowdtangle.leaderboard import crowdtangle_leaderboard_action

        crowdtangle_leaderboard_action(namespace, output_file)

    elif namespace.ct_action == 'search':
        from minet.cli.crowdtangle.search import crowdtangle_search_action

        crowdtangle_search_action(namespace, output_file)

    elif namespace.ct_action == 'summary':
        from minet.cli.crowdtangle.summary import crowdtangle_summary_action

        crowdtangle_summary_action(namespace, output_file)

    # Cleanup
    if namespace.output is not None:
        output_file.close()
Example #15
def facebook_url_likes_action(namespace):
    output_file = open_output_file(namespace.output)

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'url')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % namespace.column
        ])

    loading_bar = tqdm(
        desc='Retrieving likes',
        dynamic_ncols=True,
        unit=' urls',
        total=namespace.total
    )

    http = create_pool()

    for row, url in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        url = url.strip()

        err, html = make_request(http, url)

        if err is not None:
            loading_bar.close()
            die('An error occurred while fetching like button for this url: %s' % url)

        scraped = scrape(html)

        if scraped is None:
            loading_bar.close()
            die('Could not extract Facebook likes from this url\'s like button: %s' % url)

        enricher.writerow(row, scraped)

    output_file.close()
Example #16
def youtube_action(namespace):

    output_file = open_output_file(namespace.output, flag='w')

    if namespace.yt_action == 'url-parse':
        from minet.cli.youtube.url_parse import url_parse_action
        url_parse_action(namespace, output_file)

    elif namespace.yt_action == 'videos':
        from minet.cli.youtube.videos import videos_action
        videos_action(namespace, output_file)

    elif namespace.yt_action == 'comments':
        from minet.cli.youtube.comments import comments_action
        comments_action(namespace, output_file)

    if namespace.output is not None:
        output_file.close()
Example #17
def facebook_url_parse_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    loading_bar = tqdm(
        desc='Parsing',
        dynamic_ncols=True,
        unit=' lines',
    )

    for row, url in enricher.cells(namespace.column, with_rows=True):

        loading_bar.update()

        url_data = url.strip()

        parsed = parse_facebook_url(url_data)

        if parsed is None:
            enricher.writerow(row)
            continue

        if isinstance(parsed, FacebookPost):
            enricher.writerow(
                row,
                ['post', parsed.id, '', parsed.url]
            )

        elif isinstance(parsed, FacebookHandle):
            enricher.writerow(
                row,
                ['handle', '', parsed.handle, parsed.url]
            )

        elif isinstance(parsed, FacebookUser):
            enricher.writerow(
                row,
                ['user', parsed.id or '', parsed.handle or '', parsed.url]
            )
Example #18
def twitter_action(namespace):

    output_file = open_output_file(
        namespace.output,
        flag='a+' if getattr(namespace, 'resume', False) else 'w')

    if getattr(namespace, 'resume', False) and not namespace.output:
        die('Cannot --resume if -o/--output is not set!')

    if namespace.tw_action == 'scrape':
        from minet.cli.twitter.scrape import twitter_scrape_action

        twitter_scrape_action(namespace, output_file)

    else:
        check_credentials(namespace)

        if namespace.tw_action == 'friends':
            from minet.cli.twitter.friends import twitter_friends_action

            twitter_friends_action(namespace, output_file)

        elif namespace.tw_action == 'followers':
            from minet.cli.twitter.followers import twitter_followers_action

            twitter_followers_action(namespace, output_file)

        elif namespace.tw_action == 'users':
            from minet.cli.twitter.users import twitter_users_action

            twitter_users_action(namespace, output_file)

        elif namespace.tw_action == 'user-tweets':
            from minet.cli.twitter.user_tweets import twitter_user_tweets_action

            twitter_user_tweets_action(namespace, output_file)

        else:
            raise TypeError('unknown tw_action "%s"' % namespace.tw_action)

    # Cleanup
    if namespace.output is not None:
        output_file.close()
Example #19
def mediacloud_action(namespace):

    # A token is needed to be able to access the API
    if not namespace.token:
        die([
            'A token is needed to be able to access Mediacloud\'s API.',
            'You can provide one using the `--token` argument.'
        ])

    output_file = open_output_file(namespace.output)

    if namespace.mc_action == 'topic':
        from minet.cli.mediacloud.topic import mediacloud_topic_action
        mediacloud_topic_action(namespace, output_file)

    elif namespace.mc_action == 'search':
        from minet.cli.mediacloud.search import mediacloud_search_action
        mediacloud_search_action(namespace, output_file)

    output_file.close()
Example #20
def extract_action(namespace):
    input_headers, pos, reader = custom_reader(
        namespace.report, ('status', 'filename', 'encoding'))

    selected_fields = namespace.select.split(',') if namespace.select else None
    selected_pos = [input_headers.index(h)
                    for h in selected_fields] if selected_fields else None

    output_headers = (list(input_headers) if not selected_pos else
                      [input_headers[i] for i in selected_pos])
    output_headers += OUTPUT_ADDITIONAL_HEADERS

    output_file = open_output_file(namespace.output)

    output_writer = csv.writer(output_file)
    output_writer.writerow(output_headers)

    loading_bar = tqdm(desc='Extracting content',
                       total=namespace.total,
                       dynamic_ncols=True,
                       unit=' docs')

    namespace.report.close()
    namespace.report = open(namespace.report.name)
    files = create_report_iterator(namespace, loading_bar=loading_bar)

    with Pool(namespace.processes) as pool:
        for error, line, content in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                message = report_error(error)
                line.extend([message, ''])
                output_writer.writerow(line)
                continue

            line.extend(['', content])
            output_writer.writerow(line)

    output_file.close()
Example #21
def comments_action(namespace, output_file):

    output_file = open_output_file(namespace.output)

    writer = csv.writer(output_file)
    writer.writerow(CSV_HEADERS)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()

    url = URL_TEMPLATE % {'id': namespace.id, 'key': namespace.key}
    next_page = True
    all_data = []

    while next_page:

        if next_page is True:
            err, response, result = request_json(http, url)
        else:
            url_next = url + '&pageToken=' + next_page
            err, response, result = request_json(http, url_next)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        next_page, data = get_data(result)

        for comment in data:
            loading_bar.update()
            writer.writerow(comment)
Example #22
def url_parse_action(namespace):

    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=REPORT_HEADERS,
                                 keep=namespace.select)

    loading_bar = tqdm(desc='Parsing',
                       dynamic_ncols=True,
                       unit=' rows',
                       total=namespace.total)

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        loading_bar.update()

        if namespace.separator:
            urls = url.split(namespace.separator)
        else:
            urls = [url]

        for url in urls:
            if not is_url(url, allow_spaces_in_path=True):
                enricher.writerow(row)
                continue

            enricher.writerow(row, [
                normalize_url(url,
                              strip_protocol=namespace.strip_protocol,
                              strip_trailing_slash=True),
                get_domain_name(url),
                get_hostname(url),
                get_normalized_hostname(url)
            ])

    output_file.close()
Example #23
def cookies_action(namespace):
    output_file = open_output_file(namespace.output)

    if namespace.csv:
        output_writer = csv.writer(output_file)

    try:
        jar = getattr(browser_cookie3, namespace.browser)()
    except browser_cookie3.BrowserCookieError:
        die('Could not extract cookies from %s!' % namespace.browser)

    if namespace.url is not None:
        resolver = CookieResolver(jar)

        cookie = resolver(namespace.url)

        if cookie is not None:

            if namespace.csv:
                output_writer.writerow(MORSEL_CSV_HEADER)

                parsed = SimpleCookie(cookie)

                for morsel in parsed.values():
                    output_writer.writerow(format_morsel_for_csv(morsel))
            else:
                print(cookie, file=output_file)
        else:
            die('Could not find relevant cookie for %s in %s!' %
                (namespace.url, namespace.browser))
    else:
        if namespace.csv:
            output_writer.writerow(COOKIE_CSV_HEADER)

            for cookie in jar:
                output_writer.writerow(format_cookie_for_csv(cookie))
        else:
            write_jar_as_text_mozilla(jar, output_file)
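
A minimal sketch of the browser_cookie3 jar handling above: load a browser's cookies and print the ones matching a domain. browser_cookie3.firefox() is one of the loaders the example accesses through getattr, and the attributes used below are the standard http.cookiejar.Cookie fields; the domain is hypothetical.

import browser_cookie3

jar = browser_cookie3.firefox()

for cookie in jar:
    # Standard http.cookiejar.Cookie attributes
    if 'example.com' in cookie.domain:
        print(cookie.name, cookie.value, cookie.domain, cookie.path)
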
Example #24
def comments_action(namespace, output_file):

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input
    if is_youtube_video_id(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_id')
    elif is_youtube_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_url')

    # Enricher
    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 keep=namespace.select,
                                 add=CSV_HEADERS)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)

    def make_requests(current_url, http=http):
        return (request_json(http, current_url), current_url)

    for (row, url_id) in enricher.cells(namespace.column, with_rows=True):

        if is_youtube_url(url_id):
            yt_id = extract_video_id_from_youtube_url(url_id)
            if not yt_id:
                # No video id could be extracted from the url: skip the row
                continue
            url = URL_TEMPLATE % {'id': yt_id, 'key': namespace.key}
        elif is_youtube_video_id(url_id):
            url = URL_TEMPLATE % {'id': url_id, 'key': namespace.key}
        else:
            continue
        url_queue = deque([url])
        while len(url_queue) != 0:
            couche = []
            with ThreadPoolExecutor(max_workers=25) as executor:
                time.sleep(0.01)
                couche = executor.map(make_requests, url_queue)
            url_queue = deque()
            for resp in couche:
                ((err, response, result), current_url) = resp
                if err:
                    error_file.write('{} for {}'.format(err, current_url))
                    continue
                elif response.status == 403 and result.get('error').get(
                        'errors')[0].get('reason') == 'commentsDisabled':
                    error_file.write(
                        'Comments are disabled for {}'.format(current_url))
                    continue
                elif response.status == 403:
                    error_file.write(
                        'Running out of API points. You will have to wait until midnight, Pacific time!'
                    )
                    time.sleep(seconds_to_midnight_pacific_time())
                    continue
                elif response.status >= 400:
                    error_file.write('Error {} for {}'.format(
                        response.status, current_url))
                    continue
                kind = result.get('kind', None)
                next_page = result.get('nextPageToken', None)
                if next_page:
                    url_next = current_url + '&pageToken=' + next_page
                    url_queue.append(url_next)
                if kind == 'youtube#commentThreadListResponse':
                    # Handling comments pagination
                    items = result.get('items', None)
                    for item in items:
                        snippet = item['snippet']
                        replies = item.get('replies')
                        if replies:
                            # Checking whether youtube's API sent only a subset of the replies
                            if snippet['totalReplyCount'] != len(
                                    replies['comments']) and namespace.full:
                                # If we want all the replies and the API did not return them all,
                                # we add the URL specific to this topLevelComment to the queue
                                # and deal with the topLevelComment itself
                                new_url = URL_PARENTID_TEMPLATE % {
                                    'id': snippet['topLevelComment']['id'],
                                    'key': namespace.key
                                }
                                url_queue.append(new_url)
                                data = get_data_full(snippet, True)
                                enricher.writerow(row, data)
                            else:
                                dataTop = get_data_full(snippet, True)
                                enricher.writerow(row, dataTop)
                                for rep in replies['comments']:
                                    enricher.writerow(
                                        row, get_data_full(rep, False))
                        else:
                            # If there is no 'replies' key, the comment we fetched is only a topLevelComment
                            top_comment = get_data_full(snippet, True)
                            enricher.writerow(row, top_comment)
                else:
                    # Handling a plain commentList: nothing special here, we deal with comments one by one
                    items = result.get('items', None)
                    for item in items:
                        data = get_data_full(item, False)
                        enricher.writerow(row, data)
Example #25
def facebook_comments_action(namespace):

    # Reformatting url to hit mobile website
    url = force_protocol(namespace.url, 'https')
    url = convert_facebook_url_to_mobile(url)

    # Grabbing cookie
    cookie = grab_facebook_cookie(namespace)

    # Handling output
    output_file = open_output_file(namespace.output)

    writer = csv.writer(output_file)
    writer.writerow(CSV_HEADERS)

    http = create_pool()

    def request_page(target):
        error, result = request(http, target, cookie=cookie)

        if error is not None:
            raise error

        return result.data.decode('utf-8')

    # Loading bar
    loading_bar = tqdm(desc='Scraping comments',
                       dynamic_ncols=True,
                       unit=' comments')

    url_queue = deque([(url, None)])

    url_count = 0
    replies_count = 0

    while len(url_queue) != 0:
        current_url, in_reply_to = url_queue.popleft()

        html = request_page(current_url)
        data = scrape_comments(html, in_reply_to)

        url_count += 1

        for reply_url, commented_id in data['replies']:
            url_queue.append((reply_url, commented_id))

        if data['next'] is not None:
            url_queue.append((data['next'], in_reply_to))

        for comment in data['comments']:
            loading_bar.update()
            writer.writerow(format_csv_row(comment))

            if in_reply_to is not None:
                replies_count += 1

        loading_bar.set_postfix(urls=url_count,
                                replies=replies_count,
                                q=len(url_queue))

        # Don't be too greedy
        time.sleep(FACEBOOK_MOBILE_DEFAULT_THROTTLE)

    loading_bar.close()
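
The comment scraping above is a breadth-first traversal over a deque of urls: reply pages and pagination links are pushed back onto the queue as they are discovered. A sketch with the scraping details abstracted away; scrape(url) is a hypothetical callable returning a dict with the same 'comments', 'replies' and 'next' keys as scrape_comments above.

from collections import deque

def crawl_comments(start_url, scrape):
    # `scrape(url)` must return {'comments': [...], 'replies': [(url, parent_id), ...], 'next': url_or_None}
    queue = deque([(start_url, None)])

    while queue:
        current_url, in_reply_to = queue.popleft()
        data = scrape(current_url)

        for reply_url, commented_id in data['replies']:
            queue.append((reply_url, commented_id))

        if data['next'] is not None:
            queue.append((data['next'], in_reply_to))

        yield from data['comments']
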
Example #26
def fetch_action(namespace):

    # Are we resuming
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    single_url = namespace.file is sys.stdin and is_url(namespace.column)

    if single_url:
        edit_namespace_with_csv_io(namespace, 'url')

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    flag = 'w'
    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)

    # Resume listener
    listener = None
    resuming_reader_loading = None
    skipped = 0

    if resuming:
        resuming_reader_loading = tqdm(desc='Resuming',
                                       dynamic_ncols=True,
                                       unit=' lines')

        def listener(event, row):
            nonlocal skipped

            if event == 'resume.output':
                resuming_reader_loading.update()

            if event == 'resume.input':
                skipped += 1
                loading_bar.set_postfix(skipped=skipped)
                loading_bar.update()

    # Enricher
    enricher = casanova.threadsafe_enricher(
        namespace.file,
        output_file,
        resumable=resuming,
        auto_resume=False,
        add=OUTPUT_ADDITIONAL_HEADERS +
        (['raw_contents'] if namespace.contents_in_report else []),
        keep=namespace.select,
        listener=listener)

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.'
            % namespace.column
        ])

    url_pos = enricher.pos[namespace.column]

    filename_pos = None

    if namespace.filename is not None:
        if namespace.filename not in enricher.pos:
            die([
                'Could not find the "%s" column containing the filenames in the given CSV file.'
                % namespace.filename
            ])

        filename_pos = enricher.pos[namespace.filename]

    indexed_input_headers = {h: i for i, h in enumerate(enricher.fieldnames)}

    if resuming:
        enricher.resume()
        resuming_reader_loading.close()

    # Loading bar
    total = namespace.total

    loading_bar = tqdm(desc='Fetching pages',
                       total=total,
                       dynamic_ncols=True,
                       unit=' urls')

    def url_key(item):
        url = item[1][url_pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {'method': http_method, 'cookie': cookie, 'headers': headers}

    def write_output(index,
                     row,
                     resolved=None,
                     status=None,
                     error=None,
                     filename=None,
                     encoding=None,
                     data=None):

        addendum = [
            resolved or '',
            status or '',
            error or '',
            filename or '',
            encoding or ''
        ]

        if namespace.contents_in_report:
            addendum.append(data or '')

        enricher.writerow(index, row, addendum)

    errors = 0
    status_codes = Counter()

    fetch_kwargs = {
        'threads': namespace.threads,
        'throttle': namespace.throttle,
        'domain_parallelism': namespace.domain_parallelism
    }

    if namespace.timeout is not None:
        fetch_kwargs['timeout'] = namespace.timeout

    multithreaded_iterator = multithreaded_fetch(enricher,
                                                 key=url_key,
                                                 request_args=request_args,
                                                 **fetch_kwargs)

    for result in multithreaded_iterator:
        index, row = result.item

        if not result.url:

            write_output(index, row)

            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        content_write_flag = 'wb'

        # Updating stats
        if result.error is not None:
            errors += 1
        else:
            if response.status >= 400:
                status_codes[response.status] += 1

        postfix = {'errors': errors}

        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:

            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=row[filename_pos]
                            if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, row))
                    else:
                        filename = row[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            if data and (namespace.standardize_encoding or namespace.contents_in_report):
                if encoding is None or encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(
                        encoding if encoding is not None else 'utf-8',
                        errors='replace')
                    encoding = 'utf-8'
                    content_write_flag = 'w'

            # Writing file on disk
            if data and not namespace.contents_in_report:

                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)

                os.makedirs(resource_dir, exist_ok=True)

                with open(resource_path, content_write_flag) as f:

                    # TODO: what if standardize_encoding + compress?
                    f.write(
                        gzip.compress(data) if namespace.compress else data)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                index,
                row,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data)

        # Handling potential errors
        else:
            error_code = report_error(result.error)

            write_output(index, row, error=error_code)

    # Closing files
    output_file.close()
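
The disk-writing branch of the fetch loop above (uuid-based filename, optional encoding standardization, optional gzip compression) can be condensed into a small helper. This is a sketch under those assumptions, not minet's actual write path; the function name and signature are hypothetical.

import gzip
import os
from os.path import dirname, join
from uuid import uuid4

def write_resource(output_dir, data, ext, encoding=None, compress=False):
    # uuid-based filename, as in the branch without --filename above
    filename = str(uuid4()) + ext

    # Standardize the payload to utf-8 when the detected encoding differs
    if encoding is not None and encoding != 'utf-8':
        data = data.decode(encoding, errors='replace').encode('utf-8')

    if compress:
        filename += '.gz'
        data = gzip.compress(data)

    path = join(output_dir, filename)
    os.makedirs(dirname(path) or '.', exist_ok=True)

    with open(path, 'wb') as f:
        f.write(data)

    return filename
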
Example #27
def scrape_action(namespace):

    output_file = open_output_file(namespace.output)

    # Parsing scraper definition
    try:
        scraper = Scraper(namespace.scraper, strain=namespace.strain)
    except DefinitionInvalidFormatError:
        die(['Unknown scraper format!', 'It should be a JSON or YAML file.'])
    except FileNotFoundError:
        die('Could not find scraper file!')
    except InvalidScraperError as error:
        print('Your scraper is invalid! You need to fix the following errors:',
              file=sys.stderr)
        print(file=sys.stderr)
        sys.stderr.write(
            report_scraper_validation_errors(error.validation_errors))
        die()
    except CSSSelectorTooComplex:
        die([
            'Your strainer\'s CSS selector %s is too complex.' %
            colored(namespace.strain, 'blue'),
            'You cannot use relations to create a strainer.',
            'Try to simplify the selector you passed to --strain.'
        ])

    if namespace.validate:
        print('Your scraper is valid.', file=sys.stderr)
        sys.exit(0)

    if scraper.headers is None and namespace.format == 'csv':
        die([
            'Your scraper does not yield tabular data.',
            'Try changing it or setting --format to "jsonl".'
        ])

    loading_bar = LoadingBar(desc='Scraping pages',
                             total=namespace.total,
                             unit='page')

    proc_args = (namespace.format, namespace.separator)

    def on_irrelevant_row(reason, row):
        loading_bar.update()

    if namespace.glob is not None:
        files = create_glob_iterator(namespace, proc_args)
    else:
        reader = casanova.reader(namespace.report)

        try:
            files = create_report_iterator(namespace,
                                           reader,
                                           args=proc_args,
                                           on_irrelevant_row=on_irrelevant_row)
        except NotADirectoryError:
            loading_bar.die([
                'Could not find the "%s" directory!' % namespace.input_dir,
                'Did you forget to specify it with -i/--input-dir?'
            ])

    if namespace.format == 'csv':
        output_writer = csv.DictWriter(output_file, fieldnames=scraper.headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(output_file)

    pool = LazyPool(namespace.processes,
                    initializer=init_process,
                    initargs=(scraper.definition, namespace.strain))

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                if isinstance(error, (ScraperEvalError, ScraperEvalTypeError,
                                      ScraperEvalNoneError)):
                    loading_bar.print(report_scraper_evaluation_error(error),
                                      end='')
                loading_bar.inc('errors')
                continue

            for item in items:
                output_writer.writerow(item)

    loading_bar.close()
    output_file.close()
Example #28
def facebook_post_stats_action(namespace):

    # Handling output
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=REPORT_HEADERS,
                                 keep=namespace.select)

    http = create_pool()

    def fetch_facebook_page_stats(url):
        err, response = request(http, url, cookie='locale=en_US')

        if err:
            return 'http-error', None

        if response.status == 404:
            return 'not-found', None

        if response.status >= 400:
            return 'http-error', None

        html = response.data

        if CAPTCHA in html:
            die(['Rate limit reached!', 'Last url: %s' % url])

        if (CURRENT_AVAILABILITY_DISCLAIMER in html
                or AVAILABILITY_DISCLAIMER in html):
            return 'unavailable', None

        if LOGIN_DISCLAIMER in html:
            return 'private-or-unavailable', None

        # TODO: integrate into ural
        bpost_id = url.rsplit('/', 1)[-1].encode()

        # Extracting metadata
        meta_extractor = re.compile(META_EXTRACTOR_TEMPLATE % bpost_id)

        match = meta_extractor.search(html)

        if match is None:
            return 'extraction-failed', None

        data = json5.loads(match.group(1).decode())
        data = nested_get([
            'jsmods', 'pre_display_requires', 0, 3, 1, '__bbox', 'result',
            'data', 'feedback'
        ], data)

        if data is None:
            return 'extraction-failed', None

        # TODO: remove, this is here as a test
        # TODO: try to find a post where comments are disabled
        if get_count(data['seen_by_count']):
            print_err('Found seen_by_count: %i for %s' %
                      (get_count(data['seen_by_count']), url))

        if 'political_figure_data' in data and data['political_figure_data']:
            print_err('Found political_figure_data:')
            print_err(data['political_figure_data'])

        if get_count(data['reaction_count']) != get_count(data['reactors']):
            print_err('Found different reactions/reactors for %s' % url)

        # Extracting data from hidden html
        hidden_html_extractor = re.compile(HTML_EXTRACTOR_TEMPLATE % bpost_id)
        match = hidden_html_extractor.search(html)

        if match is not None:
            hidden_html = match.group(1).decode()
            soup = BeautifulSoup(hidden_html, 'lxml')

            # Sometimes fetching a post behaves weirdly
            if soup.select_one('h5 a') is None:
                return 'extraction-failed', None

            data['scraped'] = {}

            timestamp_elem = soup.select_one('[data-utime]')
            timestamp = int(timestamp_elem.get('data-utime'))

            data['scraped']['account_name'] = soup.select_one(
                'h5 a').get_text().strip()
            data['scraped']['timestamp'] = timestamp
            data['scraped']['time'] = datetime.fromtimestamp(
                timestamp).isoformat()

            # TODO: use a context manager
            try:
                data['scraped']['aria_label'] = timestamp_elem.parent.get(
                    'aria-label')
            except:
                pass

            try:
                data['scraped']['text'] = soup.select_one(
                    '[data-testid="post_message"]').get_text()
            except:
                pass

            # try:
            #     data['scraped']['link'] = soup.select_one('[data-lynx-uri]').get('href')
            # except:
            #     pass

        return None, data

    # Loading bar
    loading_bar = tqdm(desc='Fetching post stats',
                       dynamic_ncols=True,
                       unit=' posts',
                       total=namespace.total)

    for row, post_url in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        if (not post_url or not is_facebook_post_url(post_url)
                or not is_facebook_url(post_url)):
            enricher.writerow(row, format_err('not-facebook-post'))
            continue

        err, data = fetch_facebook_page_stats(post_url)

        if err:
            enricher.writerow(row, format_err(err))
        else:
            enricher.writerow(row, format(data))

        # Throttling
        sleep_with_entropy(FACEBOOK_WEB_DEFAULT_THROTTLE, 5.0)
Example #29
def fetch_action(namespace):

    # Are we resuming
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    if namespace.file is sys.stdin and is_url(namespace.column):
        namespace.file = StringIO('url\n%s' % namespace.column)
        namespace.column = 'url'

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    input_headers, pos, reader = custom_reader(namespace.file,
                                               namespace.column)
    filename_pos = input_headers.index(
        namespace.filename) if namespace.filename else None
    indexed_input_headers = {h: p for p, h in enumerate(input_headers)}

    selected_fields = namespace.select.split(',') if namespace.select else None
    selected_pos = [input_headers.index(h)
                    for h in selected_fields] if selected_fields else None

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    # Reading output
    output_headers = (list(input_headers) if not selected_pos else
                      [input_headers[i] for i in selected_pos])
    output_headers += OUTPUT_ADDITIONAL_HEADERS

    if namespace.contents_in_report:
        output_headers.append('raw_content')

    flag = 'w'

    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)

    output_writer = csv.writer(output_file)

    if not resuming:
        output_writer.writerow(output_headers)
    else:

        # Reading report to know what need to be done
        _, rpos, resuming_reader = custom_reader(output_file, 'line')

        resuming_reader_loading = tqdm(resuming_reader,
                                       desc='Resuming',
                                       dynamic_ncols=True,
                                       unit=' lines')

        already_done = ContiguousRangeSet()

        for line in resuming_reader_loading:
            index = line[rpos]

            already_done.add(int(index))

    # Loading bar
    total = namespace.total

    if total is not None and resuming:
        total -= len(already_done)

    loading_bar = tqdm(desc='Fetching pages',
                       total=total,
                       dynamic_ncols=True,
                       unit=' urls')

    def url_key(item):
        line = item[1]
        url = line[pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {'method': http_method, 'cookie': cookie, 'headers': headers}

    def write_output(index,
                     line,
                     resolved=None,
                     status=None,
                     error=None,
                     filename=None,
                     encoding=None,
                     data=None):

        if selected_pos:
            line = [line[p] for p in selected_pos]

        line.extend([
            index, resolved or '', status or '', error or '', filename or '',
            encoding or ''
        ])

        if namespace.contents_in_report:
            line.append(data or '')

        output_writer.writerow(line)

    errors = 0
    status_codes = Counter()

    target_iterator = enumerate(reader)

    if resuming:
        target_iterator = (pair for pair in target_iterator
                           if not already_done.stateful_contains(pair[0]))

    multithreaded_iterator = multithreaded_fetch(target_iterator,
                                                 key=url_key,
                                                 request_args=request_args,
                                                 threads=namespace.threads,
                                                 throttle=namespace.throttle)

    for result in multithreaded_iterator:
        line_index, line = result.item

        if not result.url:

            write_output(line_index, line)

            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        content_write_flag = 'wb'

        # Updating stats
        if result.error is not None:
            errors += 1
        else:
            if response.status >= 400:
                status_codes[response.status] += 1

        postfix = {'errors': errors}

        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:

            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=line[filename_pos]
                            if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, line))
                    else:
                        filename = line[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            if data and (namespace.standardize_encoding or namespace.contents_in_report):
                if encoding is None or encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(
                        encoding if encoding is not None else 'utf-8',
                        errors='replace')
                    encoding = 'utf-8'
                    content_write_flag = 'w'

            # Writing file on disk
            if data and not namespace.contents_in_report:

                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)

                os.makedirs(resource_dir, exist_ok=True)

                with open(resource_path, content_write_flag) as f:

                    # TODO: what if standardize_encoding + compress?
                    f.write(
                        gzip.compress(data) if namespace.compress else data)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                line_index,
                line,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data)

        # Handling potential errors
        else:
            error_code = report_error(result.error)

            write_output(line_index, line, error=error_code)

    # Closing files
    if namespace.output is not None:
        output_file.close()
Example #30
def facebook_comments_action(namespace):

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'post_url')

    try:
        scraper = FacebookMobileScraper(namespace.cookie,
                                        throttle=namespace.throttle)
    except FacebookInvalidCookieError:
        if namespace.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' %
                namespace.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook post comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 keep=namespace.select,
                                 add=FACEBOOK_COMMENT_CSV_HEADERS)

    # Loading bar
    loading_bar = tqdm(desc='Scraping comments',
                       dynamic_ncols=True,
                       unit=' comments')

    for i, (row, url) in enumerate(enricher.cells(namespace.column, with_rows=True)):

        if not has_facebook_comments(url):
            tqdm.write(
                'Given url (line %i) probably cannot have Facebook comments: %s'
                % (i + 1, url),
                file=sys.stderr)
            continue

        batches = scraper.comments(url, per_call=True, detailed=True)

        for details, batch in batches:
            for comment in batch:
                enricher.writerow(row, comment.as_csv_row())

            loading_bar.update(len(batch))
            loading_bar.set_postfix(calls=details['calls'],
                                    replies=details['replies'],
                                    q=details['queue_size'],
                                    posts=i + 1)

    loading_bar.close()