Example #1
def url_join_action(cli_args):
    left_reader = casanova.reader(cli_args.file1)
    left_headers = left_reader.fieldnames
    left_idx = None

    if cli_args.select:
        left_idx = left_reader.pos.collect(cli_args.select)
        left_headers = list(cli_args.select)

    # Applying column prefix now
    left_headers = [cli_args.match_column_prefix + h for h in left_headers]

    right_enricher = casanova.enricher(cli_args.file2,
                                       cli_args.output,
                                       add=left_headers)

    loading_bar = LoadingBar(desc='Indexing left file', unit='line')

    # First step is to index left file
    trie = NormalizedLRUTrie()

    for row, cell in left_reader.cells(cli_args.column1, with_rows=True):
        loading_bar.update()

        if left_idx is not None:
            row = [row[i] for i in left_idx]

        urls = [cell]

        if cli_args.separator is not None:
            urls = cell.split(cli_args.separator)

        for url in urls:
            url = url.strip()

            # NOTE: should we filter invalid urls here?
            if url:
                trie.set(url, row)

    loading_bar.close()

    loading_bar = LoadingBar(desc='Matching right file', unit='line')

    for row, url in right_enricher.cells(cli_args.column2, with_rows=True):
        loading_bar.update()

        url = url.strip()

        match = None

        # NOTE: should we filter invalid urls here?
        if url:
            match = trie.match(url)

        if match is None:
            right_enricher.writerow(row)
            continue

        right_enricher.writerow(row, match)
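
The example above first indexes every left-file URL into a prefix trie, then matches each right-file URL against it. A minimal sketch of that matching primitive, assuming NormalizedLRUTrie is the class shipped in the ural package (ural.lru) and reusing only the set/match calls seen above:

from ural.lru import NormalizedLRUTrie

trie = NormalizedLRUTrie()
trie.set('https://www.lemonde.fr/politique/', {'source': 'left-file-row'})

# Longer urls resolve to the metadata stored for their closest known prefix.
match = trie.match('https://www.lemonde.fr/politique/article-123.html')
print(match)  # expected: {'source': 'left-file-row'}; None when nothing matches
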
Example #2
def captions_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=YOUTUBE_CAPTIONS_CSV_HEADERS,
        keep=cli_args.select
    )

    loading_bar = LoadingBar(
        'Retrieving captions',
        unit='video'
    )

    for row, video in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        result = get_video_captions(video, langs=cli_args.lang)

        if result is None:
            continue

        track, lines = result

        prefix = [track.lang, '1' if track.generated else '']

        for line in lines:
            enricher.writerow(row, prefix + list(line))
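
Most examples in this list share the same casanova enrichment pattern: stream the input CSV and, for every source row, write the original row (optionally trimmed via keep/select) followed by an addendum of freshly computed columns. A minimal, self-contained sketch of that pattern, reusing only the calls seen above (file names, the 'video' column and the added header are placeholders):

import casanova

with open('videos.csv', newline='') as infile, \
        open('enriched.csv', 'w', newline='') as outfile:
    enricher = casanova.enricher(infile, outfile, add=['video_id_length'])

    for row, video in enricher.cells('video', with_rows=True):
        # One (or more) output rows per input row; the addendum must match `add`.
        enricher.writerow(row, [len(video.strip())])
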
Example #3
def url_extract_action(cli_args):
    enricher = casanova.enricher(cli_args.file,
                                 cli_args.output,
                                 add=REPORT_HEADERS,
                                 keep=cli_args.select)

    extract = EXTRACTORS[getattr(cli_args, 'from')]

    loading_bar = LoadingBar(desc='Extracting',
                             unit='row',
                             total=cli_args.total)

    for row, content in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        content = content.strip()

        if not content:
            continue

        for url in extract(content):
            if cli_args.base_url is not None:
                url = urljoin(cli_args.base_url, url)

            enricher.writerow(row, [url])
Example #4
def mediacloud_search_action(cli_args):
    writer = csv.writer(cli_args.output)
    writer.writerow(MEDIACLOUD_STORIES_CSV_HEADER)

    client = MediacloudAPIClient(cli_args.token)

    kwargs = {
        'collections': cli_args.collections,
        'medias': cli_args.medias,
        'publish_day': cli_args.publish_day,
        'publish_month': cli_args.publish_month,
        'publish_year': cli_args.publish_year,
        'filter_query': cli_args.filter_query
    }

    loading_bar = LoadingBar('Searching stories',
                             unit='story',
                             unit_plural='stories')

    try:
        if not cli_args.skip_count:
            count = client.count(cli_args.query, **kwargs)

            loading_bar.update_total(count)

        iterator = client.search(cli_args.query, **kwargs)

        for story in iterator:
            writer.writerow(story.as_csv_row())
            loading_bar.update()

    except MediacloudServerError as e:
        loading_bar.die(
            ['Aborted due to a mediacloud server error:', e.server_error])
Example #5
def mediacloud_search_action(namespace, output_file):
    writer = csv.writer(output_file)
    writer.writerow(MEDIACLOUD_STORIES_CSV_HEADER)

    client = MediacloudAPIClient(namespace.token)

    kwargs = {
        'collections': namespace.collections,
        'medias': namespace.medias,
        'publish_day': namespace.publish_day,
        'publish_month': namespace.publish_month,
        'publish_year': namespace.publish_year
    }

    loading_bar = LoadingBar('Searching stories',
                             unit='story',
                             unit_plural='stories')

    try:
        if not namespace.skip_count:
            count = client.count(namespace.query, **kwargs)

            loading_bar.update_total(count)

        iterator = client.search(namespace.query, format='csv_row', **kwargs)

        for story in iterator:
            writer.writerow(story)
            loading_bar.update()

    except MediacloudServerError as e:
        loading_bar.die(
            ['Aborted due to a mediacloud server error:', e.server_error])
Example #6
def captions_action(namespace, output_file):

    # Handling output
    single_video = namespace.file is sys.stdin and sys.stdin.isatty()

    if single_video:
        edit_namespace_with_csv_io(namespace, 'video')

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=YOUTUBE_CAPTIONS_CSV_HEADERS,
                                 keep=namespace.select)

    loading_bar = LoadingBar('Retrieving captions', unit='video')

    for row, video in enricher.cells(namespace.column, with_rows=True):
        result = get_video_captions(video, langs=namespace.lang)
        loading_bar.update()

        if result is None:
            continue

        track, lines = result

        prefix = [track.lang, '1' if track.generated else '']

        for line in lines:
            enricher.writerow(row, prefix + list(line))

    loading_bar.close()
Example #7
def comments_action(cli_args):
    enricher = casanova.enricher(cli_args.file,
                                 cli_args.output,
                                 add=YOUTUBE_COMMENT_CSV_HEADERS,
                                 keep=cli_args.select)

    loading_bar = LoadingBar('Retrieving comments',
                             unit='comment',
                             stats={'videos': 0})

    def before_sleep_until_midnight(seconds):
        loading_bar.print(
            'API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        cli_args.key, before_sleep_until_midnight=before_sleep_until_midnight)

    for row, video in enricher.cells(cli_args.column, with_rows=True):
        generator = client.comments(video)

        for comment in generator:
            loading_bar.update()
            enricher.writerow(row, comment.as_csv_row())

        loading_bar.inc('videos')
Example #8
def comments_action(namespace, output_file):

    # Handling output
    single_video = namespace.file is sys.stdin and sys.stdin.isatty()

    if single_video:
        edit_namespace_with_csv_io(namespace, 'video')

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=YOUTUBE_COMMENT_CSV_HEADERS,
                                 keep=namespace.select)

    loading_bar = LoadingBar('Retrieving comments',
                             unit='comment',
                             stats={'videos': 0})

    def before_sleep_until_midnight(seconds):
        loading_bar.print(
            'API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        namespace.key, before_sleep_until_midnight=before_sleep_until_midnight)

    for row, video in enricher.cells(namespace.column, with_rows=True):
        generator = client.comments(video)

        for comment in generator:
            loading_bar.update()
            enricher.writerow(row, comment.as_csv_row())

        loading_bar.inc('videos')

    loading_bar.close()
Example #9
def videos_action(namespace, output_file):

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=YOUTUBE_VIDEO_CSV_HEADERS,
                                 keep=namespace.select)

    loading_bar = LoadingBar('Retrieving videos',
                             unit='video',
                             total=namespace.total)

    def before_sleep_until_midnight(seconds):
        loading_bar.print(
            'API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        namespace.key, before_sleep_until_midnight=before_sleep_until_midnight)

    iterator = enricher.cells(namespace.column, with_rows=True)

    for (row, _), video in client.videos(iterator, key=itemgetter(1)):
        loading_bar.update()
        enricher.writerow(row, video.as_csv_row() if video else None)

    loading_bar.close()
Example #10
def search_action(cli_args):
    enricher = casanova.enricher(cli_args.file,
                                 cli_args.output,
                                 add=YOUTUBE_VIDEO_SNIPPET_CSV_HEADERS,
                                 keep=cli_args.select)

    loading_bar = LoadingBar('Searching videos', unit='video')

    def before_sleep_until_midnight(seconds):
        loading_bar.print(
            'API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        cli_args.key, before_sleep_until_midnight=before_sleep_until_midnight)

    for row, query in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.print('Searching for "%s"' % query)

        searcher = client.search(query, order=cli_args.order)

        if cli_args.limit:
            searcher = islice(searcher, cli_args.limit)

        for video in searcher:
            loading_bar.update()
            enricher.writerow(row, video.as_csv_row())
Example #11
def extract_action(cli_args):
    if cli_args.glob is None and cli_args.input_dir is None:
        cli_args.input_dir = DEFAULT_CONTENT_FOLDER

    input_data = cli_args.report

    if cli_args.glob is not None:
        input_data = dummy_csv_file_from_glob(cli_args.glob, cli_args.input_dir)

    enricher = casanova.enricher(
        input_data,
        cli_args.output,
        keep=cli_args.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Extracting content',
        total=cli_args.total,
        unit='doc'
    )

    def on_irrelevant_row(reason, row, i):
        loading_bar.update()
        loading_bar.print('Row n°{n} could not be processed: {reason}'.format(n=i + 1, reason=reason))
        enricher.writerow(row, format_error(reason))

    if (
        cli_args.glob is None and
        'raw_contents' not in enricher.headers and
        not isdir(cli_args.input_dir)
    ):
        loading_bar.die([
            'Could not find the "%s" directory!' % cli_args.input_dir,
            'Did you forget to specify it with -i/--input-dir?'
        ])

    files = create_report_iterator(
        cli_args,
        enricher,
        on_irrelevant_row=on_irrelevant_row
    )

    pool = LazyPool(cli_args.processes)

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, format_error(report_error(error)))
                continue

            if result is None:
                enricher.writerow(row, format_error('no-result'))
                continue

            enricher.writerow(row, result)
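
LazyPool above is minet-specific, but the streaming logic is the standard imap_unordered pattern: each worker returns an (error, ..., result) tuple and rows are written as soon as a document finishes, in completion order rather than input order. A hedged analogue using only the standard library (the row element is dropped for brevity):

from multiprocessing import Pool

def demo_worker(doc):
    # Stand-in for the real `worker`: return (error, result).
    try:
        return None, len(doc)
    except Exception as e:
        return e, None

if __name__ == '__main__':
    docs = ['first document', 'second', 'third one']

    with Pool(2) as pool:
        for error, result in pool.imap_unordered(demo_worker, docs):
            if error is not None:
                continue
            print(result)
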
Example #12
def facebook_posts_action(cli_args):
    try:
        scraper = FacebookMobileScraper(cli_args.cookie, throttle=cli_args.throttle)
    except FacebookInvalidCookieError:
        if cli_args.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' % cli_args.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to scrape Facebook groups.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=FACEBOOK_POST_CSV_HEADERS
    )

    # Loading bar
    loading_bar = LoadingBar(
        desc='Scraping posts',
        unit='post'
    )

    translated_langs = set()

    for i, (row, url) in enumerate(enricher.cells(cli_args.column, with_rows=True), 1):
        loading_bar.inc('groups')

        try:
            posts = scraper.posts(url)
        except FacebookInvalidTargetError:
            loading_bar.print('Given url (line %i) is probably not a Facebook group: %s' % (i, url))
            continue

        for post in posts:
            if post.translated_text and post.translated_from not in translated_langs:
                translated_langs.add(post.translated_from)
                lines = [
                    'Found text translated from %s!' % post.translated_from,
                    'Since it means original text may not be entirely retrieved you might want',
                    'to edit your Facebook language settings to add "%s" to' % post.translated_from,
                    'the "Languages you don\'t want to be offered translations for" list here:',
                    'https://www.facebook.com/settings/?tab=language'
                ]

                for line in lines:
                    loading_bar.print(line)

                loading_bar.print()

            loading_bar.update()
            enricher.writerow(row, post.as_csv_row())
Example #13
def twitter_users_action(cli_args):

    client = TwitterAPIClient(
        cli_args.access_token,
        cli_args.access_token_secret,
        cli_args.api_key,
        cli_args.api_secret_key
    )

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=USER_FIELDS
    )

    loading_bar = LoadingBar(
        desc='Retrieving users',
        total=cli_args.total,
        unit='user'
    )

    for chunk in as_chunks(100, enricher.cells(cli_args.column, with_rows=True)):
        users = ','.join(cell.lstrip('@') for _, cell in chunk)

        if cli_args.ids:
            client_args = {'user_id': users}
            key = 'id'
        else:
            client_args = {'screen_name': users}
            key = 'screen_name'

        try:
            result = client.call(['users', 'lookup'], **client_args)
        except TwitterHTTPError as e:
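            # users/lookup answers with a 404 when none of the requested users
            # can be found: pass the rows through without any addendum.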
            if e.e.code == 404:
                for row, user in chunk:
                    enricher.writerow(row)
            else:
                raise e

            continue

        indexed_result = {}

        for user in result:
            user = normalize_user(user)
            user_row = format_user_as_csv_row(user)
            indexed_result[user[key]] = user_row

        for row, user in chunk:
            user_row = indexed_result.get(user.lstrip('@'))

            enricher.writerow(row, user_row)

        loading_bar.update(len(chunk))
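
as_chunks(100, ...) above groups the enricher stream into batches of at most 100 items, the page size accepted by the users/lookup endpoint. A hedged sketch of what such a helper is assumed to do, using only the standard library:

from itertools import islice

def as_chunks_sketch(size, iterable):
    # Yield successive lists of at most `size` items from any iterable.
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, size))
        if not chunk:
            return
        yield chunk

assert list(as_chunks_sketch(2, 'abcde')) == [['a', 'b'], ['c', 'd'], ['e']]
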
Example #14
def twitter_scrape_action(cli_args):
    scraper = TwitterAPIScraper()

    # Stats
    loading_bar = LoadingBar('Collecting tweets',
                             total=cli_args.limit,
                             unit='tweet',
                             stats={
                                 'tokens': 1,
                                 'queries': 0
                             })

    enricher = casanova.enricher(cli_args.file,
                                 cli_args.output,
                                 add=TWEET_FIELDS + ADDITIONAL_TWEET_FIELDS,
                                 keep=cli_args.select)

    def before_sleep(retry_state):
        exc = retry_state.outcome.exception()

        if isinstance(exc, TwitterPublicAPIRateLimitError):
            loading_bar.inc('tokens')

        else:
            loading_bar.inc('failures')
            loading_bar.print(
                'Failed to call Twitter search. Will retry in %s' %
                prettyprint_seconds(retry_state.idle_for))

    for row, query in enricher.cells(cli_args.query, with_rows=True):

        # Templating?
        if cli_args.query_template is not None:
            query = CUSTOM_FORMATTER.format(cli_args.query_template,
                                            value=query)

        loading_bar.print('Searching for "%s"' % query)
        loading_bar.inc('queries')

        iterator = scraper.search(
            query,
            limit=cli_args.limit,
            before_sleep=before_sleep,
            include_referenced_tweets=cli_args.include_refs,
            with_meta=True)

        try:
            for tweet, meta in iterator:
                loading_bar.update()

                tweet_row = format_tweet_as_csv_row(tweet)
                enricher.writerow(row, tweet_row + format_meta_row(meta))
        except TwitterPublicAPIOverCapacityError:
            loading_bar.die('Got an "Over Capacity" error. Shutting down...')
Example #15
def crowdtangle_summary_action(cli_args):
    if not cli_args.start_date:
        die('Missing --start-date!')

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=CROWDTANGLE_SUMMARY_CSV_HEADERS
    )

    posts_writer = None

    if cli_args.posts is not None:
        posts_writer = csv.writer(cli_args.posts)
        posts_writer.writerow(CROWDTANGLE_POST_CSV_HEADERS_WITH_LINK)

    loading_bar = LoadingBar(
        desc='Collecting data',
        total=cli_args.total,
        unit='url'
    )

    client = CrowdTangleAPIClient(cli_args.token, rate_limit=cli_args.rate_limit)

    for row, url in enricher.cells(cli_args.column, with_rows=True):
        url = url.strip()

        try:
            stats = client.summary(
                url,
                start_date=cli_args.start_date,
                with_top_posts=cli_args.posts is not None,
                sort_by=cli_args.sort_by,
                platforms=cli_args.platforms
            )

        except CrowdTangleInvalidTokenError:
            die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])

        if cli_args.posts is not None:
            stats, posts = stats

            if posts is not None:
                for post in posts:
                    posts_writer.writerow(post.as_csv_row())

        enricher.writerow(row, stats.as_csv_row() if stats is not None else None)

        loading_bar.update()
Example #16
def crawl_action(cli_args, defer):

    # Loading crawler definition
    queue_path = join(cli_args.output_dir, 'queue')

    if cli_args.resume:
        print_err('Resuming crawl...')
    else:
        rmtree(queue_path, ignore_errors=True)

    # Scaffolding output directory
    os.makedirs(cli_args.output_dir, exist_ok=True)

    jobs_output_path = join(cli_args.output_dir, 'jobs.csv')
    jobs_output, jobs_writer = open_report(jobs_output_path,
                                           JOBS_HEADERS,
                                           resume=cli_args.resume)
    defer(jobs_output.close)

    # Creating crawler
    crawler = Crawler(cli_args.crawler,
                      throttle=cli_args.throttle,
                      queue_path=queue_path)

    reporter_pool = ScraperReporterPool(crawler,
                                        cli_args.output_dir,
                                        resume=cli_args.resume)
    defer(reporter_pool.close)

    # Loading bar
    loading_bar = LoadingBar(desc='Crawling', unit='page')

    def update_loading_bar(result):
        state = crawler.state

        loading_bar.update_stats(queued=state.jobs_queued,
                                 doing=state.jobs_doing + 1,
                                 spider=result.job.spider)
        loading_bar.update()

    # Starting crawler
    crawler.start()

    # Running crawler
    for result in crawler:
        update_loading_bar(result)
        jobs_writer.writerow(format_job_for_csv(result))

        if result.error is not None:
            continue

        reporter_pool.write(result.job.spider, result.scraped)
Example #17
def extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.report,
        output_file,
        keep=namespace.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Extracting content',
        total=namespace.total,
        unit='doc'
    )

    def on_irrelevant_row(reason, row):
        loading_bar.update()
        enricher.writerow(row, format_error(reason))

    try:
        files = create_report_iterator(
            namespace,
            enricher,
            on_irrelevant_row=on_irrelevant_row
        )
    except NotADirectoryError:
        loading_bar.die([
            'Could not find the "%s" directory!' % namespace.input_dir,
            'Did you forget to specify it with -i/--input-dir?'
        ])

    pool = LazyPool(namespace.processes)

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, format_error(report_error(error)))
                continue

            if result is None:
                enricher.writerow(row, format_error('no-content'))
                continue

            enricher.writerow(row, result)

    loading_bar.close()
    output_file.close()
Example #18
def mediacloud_medias_action(cli_args):
    added_headers = MEDIACLOUD_MEDIA_CSV_HEADER[1:]

    feeds_writer = None

    if cli_args.feeds:
        added_headers.append('feeds')
        feeds_writer = csv.writer(cli_args.feeds)
        feeds_writer.writerow(MEDIACLOUD_FEED_CSV_HEADER)

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=added_headers
    )

    loading_bar = LoadingBar(
        desc='Fetching medias',
        unit='media',
        total=cli_args.total
    )

    client = MediacloudAPIClient(cli_args.token)

    for row, media_id in enricher.cells(cli_args.column, with_rows=True):

        try:
            result = client.media(media_id)
            result = result.as_csv_row()[1:]

            if cli_args.feeds:
                feeds = client.feeds(media_id)

                enricher.writerow(row, result + [len(feeds)])

                for feed in feeds:
                    feeds_writer.writerow(feed.as_csv_row())
            else:
                enricher.writerow(row, result)
        except MediacloudServerError as e:
            loading_bar.die([
                'Aborted due to a mediacloud server error:',
                e.server_error
            ])

        loading_bar.update()
Example #19
def facebook_comments_action(cli_args):
    try:
        scraper = FacebookMobileScraper(cli_args.cookie,
                                        throttle=cli_args.throttle)
    except FacebookInvalidCookieError:
        if cli_args.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' %
                cli_args.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to scrape Facebook comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(cli_args.file,
                                 cli_args.output,
                                 keep=cli_args.select,
                                 add=FACEBOOK_COMMENT_CSV_HEADERS)

    # Loading bar
    loading_bar = LoadingBar(desc='Scraping comments', unit='comment')

    for i, (row, url) in enumerate(enricher.cells(cli_args.column, with_rows=True), 1):
        try:
            batches = scraper.comments(url, per_call=True, detailed=True)
        except FacebookInvalidTargetError:
            loading_bar.print(
                'Given url (line %i) is probably not a Facebook resource having comments: %s'
                % (i, url))
            continue

        for details, batch in batches:
            for comment in batch:
                enricher.writerow(row, comment.as_csv_row())

            loading_bar.update(len(batch))
            loading_bar.update_stats(calls=details['calls'],
                                     replies=details['replies'],
                                     q=details['queue_size'],
                                     posts=i)
Example #20
def mediacloud_topic_action(cli_args):
    writer = csv.writer(cli_args.output)
    writer.writerow(MEDIACLOUD_TOPIC_STORIES_CSV_HEADERS)

    loading_bar = LoadingBar(desc='Fetching stories',
                             unit='story',
                             unit_plural='stories')

    client = MediacloudAPIClient(cli_args.token)

    iterator = client.topic_stories(cli_args.topic_id,
                                    media_id=cli_args.media_id,
                                    from_media_id=cli_args.from_media_id)

    for story in iterator:
        writer.writerow(story.as_csv_row())
        loading_bar.update()
Example #21
def twitter_scrape_action(cli_args):
    scraper = TwitterAPIScraper()

    # Stats
    loading_bar = LoadingBar(
        'Collecting tweets',
        total=cli_args.limit,
        unit='tweet',
        stats={'tokens': 1, 'queries': 0}
    )

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=TWEET_FIELDS + ADDITIONAL_TWEET_FIELDS,
        keep=cli_args.select
    )

    for row, query in enricher.cells(cli_args.query, with_rows=True):

        # Templating?
        if cli_args.query_template is not None:
            query = CUSTOM_FORMATTER.format(
                cli_args.query_template,
                value=query
            )

        loading_bar.print('Searching for "%s"' % query)
        loading_bar.inc('queries')

        iterator = scraper.search(
            query,
            limit=cli_args.limit,
            include_referenced_tweets=cli_args.include_refs,
            with_meta=True
        )

        try:
            for tweet, meta in iterator:
                loading_bar.update()

                tweet_row = format_tweet_as_csv_row(tweet)
                enricher.writerow(row, tweet_row + format_meta_row(meta))
        except TwitterPublicAPIOverCapacityError:
            loading_bar.die('Got an "Over Capacity" error. Shutting down...')
Example #22
def url_parse_action(cli_args):
    headers = REPORT_HEADERS

    if cli_args.facebook:
        headers = FACEBOOK_REPORT_HEADERS
    elif cli_args.youtube:
        headers = YOUTUBE_REPORT_HEADERS

    enricher = casanova.enricher(cli_args.file,
                                 cli_args.output,
                                 add=headers,
                                 keep=cli_args.select)

    loading_bar = LoadingBar(desc='Parsing', unit='row', total=cli_args.total)

    for row, cell in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        if cli_args.separator:
            urls = cell.split(cli_args.separator)
        else:
            urls = [cell]

        for url in urls:
            url = url.strip()

            if not is_url(
                    url, allow_spaces_in_path=True, require_protocol=False):
                enricher.writerow(row)
                continue

            if cli_args.facebook:
                addendum = extract_facebook_addendum(url)
            elif cli_args.youtube:
                addendum = extract_youtube_addendum(url)
            else:
                addendum = extract_standard_addendum(cli_args, url)

            if addendum is None:
                enricher.writerow(row)
                continue

            enricher.writerow(row, addendum)
Example #23
def facebook_url_likes_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=REPORT_HEADERS,
        total=cli_args.total,
        prebuffer_bytes=DEFAULT_PREBUFFER_BYTES
    )

    if cli_args.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % cli_args.column
        ])

    loading_bar = LoadingBar(
        desc='Retrieving likes',
        unit='url',
        total=enricher.total
    )

    for row, url in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        url = url.strip()

        if not url or not is_url(url, require_protocol=False):
            enricher.writerow(row)
            continue

        err, html = make_request(url)

        if err is not None:
            loading_bar.die('An error occurred while fetching like button for this url: %s' % url)

        scraped = scrape(html)

        if scraped is None:
            loading_bar.die('Could not extract Facebook likes from this url\'s like button: %s' % url)

        enricher.writerow(row, scraped)
Example #24
def search_action(namespace, output_file):

    # Handling output
    single_query = namespace.file is sys.stdin and sys.stdin.isatty()

    if single_query:
        edit_namespace_with_csv_io(namespace, 'query')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=YOUTUBE_VIDEO_SNIPPET_CSV_HEADERS,
        keep=namespace.select
    )

    loading_bar = LoadingBar(
        'Searching videos',
        unit='video'
    )

    def before_sleep_until_midnight(seconds):
        loading_bar.print('API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        namespace.key,
        before_sleep_until_midnight=before_sleep_until_midnight
    )

    for row, query in enricher.cells(namespace.column, with_rows=True):
        loading_bar.print('Searching for "%s"' % query)

        searcher = client.search(query, order=namespace.order)

        if namespace.limit:
            searcher = islice(searcher, namespace.limit)

        for video in searcher:
            loading_bar.update()
            enricher.writerow(row, video.as_csv_row())

    loading_bar.close()
Example #25
def facebook_post_authors_action(cli_args):
    try:
        scraper = FacebookMobileScraper(cli_args.cookie, throttle=cli_args.throttle)
    except FacebookInvalidCookieError:
        if cli_args.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' % cli_args.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to scrape Facebook comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=FACEBOOK_USER_CSV_HEADERS
    )

    # Loading bar
    loading_bar = LoadingBar(
        desc='Finding authors',
        unit='post'
    )

    for i, (row, post_url) in enumerate(enricher.cells(cli_args.column, with_rows=True), 1):
        loading_bar.update()

        try:
            author = scraper.post_author(post_url)
        except FacebookInvalidTargetError:
            loading_bar.print('Given url (line %i) is probably not a Facebook group post: %s' % (i, post_url))
            continue

        enricher.writerow(row, author.as_csv_row() if author is not None else None)
Example #26
def hyphe_dump_action(cli_args):

    # Paths
    output_dir = 'hyphe_corpus_%s' % cli_args.corpus

    if cli_args.output_dir is not None:
        output_dir = cli_args.output_dir

    os.makedirs(output_dir, exist_ok=True)

    webentities_output_path = join(output_dir, 'webentities.csv')
    pages_output_path = join(output_dir, 'pages.csv')

    if cli_args.body:
        body_output_dir = join(output_dir, 'content')
        os.makedirs(body_output_dir, exist_ok=True)

    client = HypheAPIClient(cli_args.url)
    corpus = client.corpus(cli_args.corpus, password=cli_args.password)

    try:
        corpus.ensure_is_started()
    except HypheCorpusAuthenticationError:
        die([
            'Wrong password for the "%s" corpus!' % cli_args.corpus,
            'Don\'t forget to provide a password for this corpus using --password'
        ])

    # Then we gather some handy statistics
    counts = corpus.count(statuses=cli_args.statuses)

    # Then we fetch webentities
    webentities_file = open(webentities_output_path, 'w', encoding='utf-8')
    webentities_writer = csv.writer(webentities_file)
    webentities_writer.writerow(WEBENTITY_CSV_HEADERS)

    loading_bar = LoadingBar(desc='Paginating web entities',
                             unit='webentity',
                             unit_plural='webentities',
                             total=counts['webentities'])

    webentities = {}

    for webentity in corpus.webentities(statuses=cli_args.statuses):
        loading_bar.update()
        webentities[webentity['id']] = webentity
        webentities_writer.writerow(format_webentity_for_csv(webentity))

    webentities_file.close()
    loading_bar.close()

    # Finally we paginate pages
    pages_file = open(pages_output_path, 'w', encoding='utf-8')
    pages_writer = csv.writer(pages_file)
    pages_writer.writerow(PAGE_CSV_HEADERS +
                          (ADDITIONAL_PAGE_HEADERS if cli_args.body else []))

    loading_bar = LoadingBar(desc='Fetching pages',
                             unit='page',
                             total=counts['pages'])

    for webentity in webentities.values():
        for page in corpus.webentity_pages(webentity['id'],
                                           include_body=cli_args.body):
            loading_bar.update()

            filename = None

            if cli_args.body and 'body' in page:
                filename = format_page_filename(webentity, page)
                filepath = join(body_output_dir, filename)
                os.makedirs(dirname(filepath), exist_ok=True)

                with open(filepath, 'wb') as f:
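                    # The page body arrives base64-encoded and zlib-compressed;
                    # it is re-encoded as gzip before being written to disk.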
                    binary = base64.b64decode(page['body'])
                    binary = zlib.decompress(binary)
                    binary = gzip.compress(binary)

                    f.write(binary)

            pages_writer.writerow(
                format_page_for_csv(webentity, page, filename=filename))
Example #27
def fetch_action(cli_args, resolve=False, defer=None):

    # If we are hitting a single url we enable contents_in_report by default
    if not resolve and isinstance(cli_args.file, StringIO) and cli_args.contents_in_report is None:
        cli_args.contents_in_report = True

    if not resolve and cli_args.contents_in_report and cli_args.compress:
        raise InvalidArgumentsError('Cannot both --compress and output --contents-in-report!')

    # HTTP method
    http_method = cli_args.method

    # Cookie grabber
    get_cookie = None
    if cli_args.grab_cookies:
        get_cookie = grab_cookies(cli_args.grab_cookies)

    # Global headers
    global_headers = None
    if cli_args.headers:
        global_headers = {}

        for header in cli_args.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    # Resume listener
    skipped_rows = 0
    resuming_reader_loading = None

    if cli_args.resume and cli_args.output.can_resume():
        resuming_reader_loading = LoadingBar(
            desc='Resuming',
            unit='line'
        )

        def output_read_listener(event, row):
            nonlocal skipped_rows

            if event != 'output.row':
                return

            skipped_rows += 1
            resuming_reader_loading.update()

        cli_args.output.listener = output_read_listener

    if resolve:
        additional_headers = RESOLVE_ADDITIONAL_HEADERS
    else:
        additional_headers = FETCH_ADDITIONAL_HEADERS

        if cli_args.contents_in_report:
            additional_headers = additional_headers + ['raw_contents']

    # Enricher
    multiplex = None

    if cli_args.separator is not None:
        multiplex = (cli_args.column, cli_args.separator)

    enricher = casanova.threadsafe_enricher(
        cli_args.file,
        cli_args.output,
        add=additional_headers,
        keep=cli_args.select,
        total=cli_args.total,
        prebuffer_bytes=DEFAULT_PREBUFFER_BYTES,
        multiplex=multiplex
    )

    if resuming_reader_loading is not None:
        resuming_reader_loading.close()

    if cli_args.column not in enricher.headers:
        raise InvalidArgumentsError('Could not find the "%s" column containing the urls in the given CSV file.' % cli_args.column)

    url_pos = enricher.headers[cli_args.column]

    filename_pos = None

    if not resolve and cli_args.filename is not None:
        if cli_args.filename not in enricher.headers:
            raise InvalidArgumentsError('Could not find the "%s" column containing the filenames in the given CSV file.' % cli_args.filename)

        filename_pos = enricher.headers[cli_args.filename]

    # Loading bar
    loading_bar = LoadingBar(
        desc='Fetching pages',
        total=enricher.total,
        unit='url',
        initial=skipped_rows
    )
    defer(loading_bar.close)  # NOTE: with multithreaded execution, it could be dangerous not to close it ourselves

    def update_loading_bar(result):
        nonlocal errors

        if result.error is not None:
            errors += 1
        else:
            if resolve:
                status = result.stack[-1].status
            else:
                status = result.response.status

            if status >= 400:
                status_codes[status] += 1

        stats = {'errors': errors}

        for code, count in status_codes.most_common(1):
            stats[str(code)] = count

        loading_bar.update_stats(**stats)
        loading_bar.update()

    only_shortened = getattr(cli_args, 'only_shortened', False)

    def url_key(item):
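        # Returning None (empty or filtered-out url) tells the fetcher to skip
        # the request; such rows are still written back untouched by the
        # `if not result.url` branches below.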
        url = item[1][url_pos].strip()

        if not url:
            return

        if only_shortened and not is_shortened_url(url):
            return

        # Url templating
        if cli_args.url_template:
            return cli_args.url_template.format(value=url)

        return url

    def request_args(domain, url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {
            'method': http_method,
            'cookie': cookie,
            'headers': headers
        }

    # Worker callback internals
    filename_builder = None
    files_writer = None

    if not resolve:
        try:
            filename_builder = FilenameBuilder(
                folder_strategy=cli_args.folder_strategy,
                template=cli_args.filename_template
            )
        except TypeError:
            die([
                'Invalid "%s" --folder-strategy!' % cli_args.folder_strategy,
                'Check the list at the end of the command help:',
                '  $ minet fetch -h'
            ])

        files_writer = ThreadSafeFilesWriter(cli_args.output_dir)

    def worker_callback(result):
        # NOTE: at this point the callback is only fired on success
        row = result.item[1]
        response = result.response
        meta = result.meta

        if not cli_args.keep_failed_contents and response.status != 200:
            return

        # First we need to build a filename
        filename_cell = row[filename_pos] if filename_pos else None

        formatter_kwargs = {}

        if cli_args.filename_template and 'line' in cli_args.filename_template:
            formatter_kwargs['line'] = enricher.wrap(row)

        try:
            filename = filename_builder(
                result.resolved,
                filename=filename_cell,
                ext=meta.get('ext'),
                formatter_kwargs=formatter_kwargs,
                compressed=cli_args.compress
            )
        except FilenameFormattingError as e:
            result.error = e
            return

        meta['filename'] = filename

        # Decoding the response data?
        is_text = meta.get('is_text', False)
        original_encoding = meta.get('encoding', 'utf-8')

        data = response.data
        binary = True

        if is_text and (cli_args.standardize_encoding or cli_args.contents_in_report):
            data = data.decode(original_encoding, errors='replace')
            binary = False

            if cli_args.contents_in_report:
                meta['decoded_contents'] = data

        # Writing the file?
        # TODO: specify what should happen when contents are empty (e.g. POST queries)
        if data and not cli_args.contents_in_report:
            files_writer.write(
                filename,
                data,
                binary=binary,
                compress=cli_args.compress
            )

    def write_fetch_output(index, row, resolved=None, status=None, error=None,
                           filename=None, encoding=None, mimetype=None, data=None):

        addendum = [
            resolved or '',
            status or '',
            error or '',
            filename or '',
            mimetype or '',
            encoding or ''
        ]

        if cli_args.contents_in_report:
            addendum.append(data or '')

        enricher.writerow(index, row, addendum)

    def write_resolve_output(index, row, resolved=None, status=None, error=None,
                             redirects=None, chain=None):
        addendum = [
            resolved or '',
            status or '',
            error or '',
            redirects or '',
            chain or ''
        ]

        enricher.writerow(index, row, addendum)

    errors = 0
    status_codes = Counter()

    common_kwargs = {
        'key': url_key,
        'insecure': cli_args.insecure,
        'threads': cli_args.threads,
        'throttle': cli_args.throttle,
        'domain_parallelism': cli_args.domain_parallelism,
        'max_redirects': cli_args.max_redirects,
        'wait': False,
        'daemonic': True
    }

    if cli_args.timeout is not None:
        common_kwargs['timeout'] = cli_args.timeout

    # Normal fetch
    if not resolve:

        multithreaded_iterator = multithreaded_fetch(
            enricher,
            request_args=request_args,
            callback=worker_callback,
            **common_kwargs
        )

        for result in multithreaded_iterator:
            index, row = result.item

            if not result.url:

                write_fetch_output(
                    index,
                    row
                )

                loading_bar.update()
                continue

            # Updating stats
            update_loading_bar(result)

            # No error
            if result.error is None:
                meta = result.meta

                # Final url target
                resolved_url = result.resolved

                if resolved_url == result.url:
                    resolved_url = None

                # Reporting in output
                write_fetch_output(
                    index,
                    row,
                    resolved=resolved_url,
                    status=result.response.status,
                    filename=meta.get('filename'),
                    encoding=meta.get('encoding'),
                    mimetype=meta.get('mimetype'),
                    data=meta.get('decoded_contents')
                )

            # Handling potential errors
            else:
                error_code = report_error(result.error)

                resolved = None

                if isinstance(result.error, InvalidURLError):
                    resolved = result.error.url

                if isinstance(result.error, FilenameFormattingError):
                    loading_bar.print(report_filename_formatting_error(result.error))

                write_fetch_output(
                    index,
                    row,
                    error=error_code,
                    resolved=resolved
                )

    # Resolve
    else:

        multithreaded_iterator = multithreaded_resolve(
            enricher,
            resolve_args=request_args,
            follow_meta_refresh=cli_args.follow_meta_refresh,
            follow_js_relocation=cli_args.follow_js_relocation,
            infer_redirection=cli_args.infer_redirection,
            **common_kwargs
        )

        for result in multithreaded_iterator:
            index, row = result.item

            if not result.url:

                write_resolve_output(
                    index,
                    row
                )

                loading_bar.update()
                continue

            # Updating stats
            update_loading_bar(result)

            # No error
            if result.error is None:

                # Reporting in output
                last = result.stack[-1]

                write_resolve_output(
                    index,
                    row,
                    resolved=last.url,
                    status=last.status,
                    redirects=len(result.stack) - 1,
                    chain='|'.join(step.type for step in result.stack)
                )

            # Handling potential errors
            else:
                error_code = report_error(result.error)

                write_resolve_output(
                    index,
                    row,
                    error=error_code,
                    redirects=(len(result.stack) - 1) if result.stack else None,
                    chain='|'.join(step.type for step in result.stack) if result.stack else None
                )
Example #28
def facebook_post_stats_action(cli_args):
    enricher = casanova.enricher(cli_args.file,
                                 cli_args.output,
                                 add=REPORT_HEADERS,
                                 keep=cli_args.select)

    def fetch_facebook_page_stats(url):
        err, response = request(url, cookie='locale=en_US')

        if err:
            return 'http-error', None

        if response.status == 404:
            return 'not-found', None

        if response.status >= 400:
            return 'http-error', None

        html = response.data

        if CAPTCHA in html:
            die(['Rate limit reached!', 'Last url: %s' % url])

        if (CURRENT_AVAILABILITY_DISCLAIMER in html
                or AVAILABILITY_DISCLAIMER in html):
            return 'unavailable', None

        if LOGIN_DISCLAIMER in html:
            return 'private-or-unavailable', None

        # TODO: integrate into ural
        bpost_id = url.rsplit('/', 1)[-1].encode()

        # Extracting metadata
        meta_extractor = re.compile(META_EXTRACTOR_TEMPLATE % bpost_id)

        match = meta_extractor.search(html)

        if match is None:
            return 'extraction-failed', None

        data = json5.loads(match.group(1).decode())
        data = getpath(data, [
            'jsmods', 'pre_display_requires', 0, 3, 1, '__bbox', 'result',
            'data', 'feedback'
        ])

        if data is None:
            return 'extraction-failed', None

        # TODO: remove, this is here as a test
        # TODO: try to find a post where comments are disabled
        if get_count(data['seen_by_count']):
            print_err('Found seen_by_count: %i for %s' %
                      (get_count(data['seen_by_count']), url))

        if 'political_figure_data' in data and data['political_figure_data']:
            print_err('Found political_figure_data:')
            print_err(data['political_figure_data'])

        if get_count(data['reaction_count']) != get_count(data['reactors']):
            print_err('Found different reactions/reactors for %s' % url)

        # Extracting data from hidden html
        hidden_html_extractor = re.compile(HTML_EXTRACTOR_TEMPLATE % bpost_id)
        match = hidden_html_extractor.search(html)

        if match is not None:
            hidden_html = match.group(1).decode()
            soup = BeautifulSoup(hidden_html, 'lxml')

            # Sometimes fetching a post behaves weirdly
            if soup.select_one('h5 a') is None:
                return 'extraction-failed', None

            data['scraped'] = {}

            timestamp_elem = soup.select_one('[data-utime]')
            timestamp = int(timestamp_elem.get('data-utime'))

            data['scraped']['account_name'] = soup.select_one(
                'h5 a').get_text().strip()
            data['scraped']['timestamp'] = timestamp
            data['scraped']['time'] = datetime.fromtimestamp(
                timestamp).isoformat()

            # TODO: use a context manager
            try:
                data['scraped']['aria_label'] = timestamp_elem.parent.get(
                    'aria-label')
            except:
                pass

            try:
                data['scraped']['text'] = soup.select_one(
                    '[data-testid="post_message"]').get_text()
            except:
                pass

            # try:
            #     data['scraped']['link'] = soup.select_one('[data-lynx-uri]').get('href')
            # except:
            #     pass

        return None, data

    # Loading bar
    loading_bar = LoadingBar(desc='Fetching post stats',
                             unit='post',
                             total=cli_args.total)

    for row, post_url in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        if (not post_url or not is_facebook_post_url(post_url)):
            enricher.writerow(row, format_err('not-facebook-post'))
            continue

        err, data = fetch_facebook_page_stats(post_url)

        if err:
            enricher.writerow(row, format_err(err))
        else:
            enricher.writerow(row, format(data))

        # Throttling
        sleep_with_entropy(FACEBOOK_WEB_DEFAULT_THROTTLE, 5.0)
Example #29
    def action(cli_args):
        enricher = casanova.batch_enricher(cli_args.file,
                                           cli_args.output,
                                           keep=cli_args.select,
                                           add=csv_headers)

        loading_bar = LoadingBar(desc='Retrieving ids',
                                 unit=method_name[:-1],
                                 stats={'users': 0})

        # TODO: this is temp debug
        def listener(event, data):
            loading_bar.print(event)
            loading_bar.print(repr(data))

        wrapper = TwitterWrapper(cli_args.access_token,
                                 cli_args.access_token_secret,
                                 cli_args.api_key,
                                 cli_args.api_secret_key,
                                 listener=listener)

        resuming_state = None

        if cli_args.resume:
            resuming_state = cli_args.output.pop_state()

        for row, user in enricher.cells(cli_args.column, with_rows=True):
            loading_bar.update_stats(user=user)

            all_ids = []
            next_cursor = -1
            result = None

            if resuming_state is not None and resuming_state.last_cursor:
                next_cursor = int(resuming_state.last_cursor)

            if cli_args.ids:
                wrapper_kwargs = {'user_id': user}
            else:
                wrapper_kwargs = {'screen_name': user}

            while next_cursor != 0:
                wrapper_kwargs['cursor'] = next_cursor

                skip_in_output = None

                if resuming_state:
                    skip_in_output = resuming_state.values_to_skip
                    resuming_state = None

                try:
                    result = wrapper.call([method_name, 'ids'],
                                          **wrapper_kwargs)
                except TwitterHTTPError as e:

                    # The user does not exist
                    loading_bar.inc('users_not_found')
                    break

                if result is not None:
                    all_ids = result.get('ids', [])
                    next_cursor = result.get('next_cursor', 0)

                    loading_bar.update(len(all_ids))

                    batch = []

                    for user_id in all_ids:
                        if skip_in_output and user_id in skip_in_output:
                            continue

                        batch.append([user_id])

                    enricher.writebatch(row, batch, next_cursor or None)
                else:
                    next_cursor = 0

            loading_bar.inc('users')
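
The while loop above is a classic cursor pagination: keep calling the endpoint with the cursor returned by the previous call until it comes back as 0. A hedged sketch of that loop, stripped of the Twitter and enricher specifics:

def paginate_ids(call, **kwargs):
    # `call` is any function returning {'ids': [...], 'next_cursor': int}.
    cursor = -1
    while cursor != 0:
        page = call(cursor=cursor, **kwargs)
        yield from page.get('ids', [])
        cursor = page.get('next_cursor', 0)
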
Example #30
def scrape_action(cli_args):

    # Parsing scraper definition
    try:
        scraper = Scraper(cli_args.scraper, strain=cli_args.strain)
    except DefinitionInvalidFormatError:
        die(['Unknown scraper format!', 'It should be a JSON or YAML file.'])
    except FileNotFoundError:
        die('Could not find scraper file!')
    except InvalidScraperError as error:
        print('Your scraper is invalid! You need to fix the following errors:',
              file=sys.stderr)
        print(file=sys.stderr)
        sys.stderr.write(
            report_scraper_validation_errors(error.validation_errors))
        die()
    except CSSSelectorTooComplex:
        die([
            'Your strainer\'s CSS selector %s is too complex.' %
            colored(cli_args.strain, 'blue'),
            'You cannot use relations to create a strainer.',
            'Try to simplify the selector you passed to --strain.'
        ])

    if cli_args.validate:
        print('Your scraper is valid.', file=sys.stderr)
        sys.exit(0)

    if scraper.headers is None and cli_args.format == 'csv':
        die([
            'Your scraper does not yield tabular data.',
            'Try changing it or setting --format to "jsonl".'
        ])

    loading_bar = LoadingBar(desc='Scraping pages',
                             total=cli_args.total,
                             unit='page')

    worker_args = (cli_args.format, cli_args.separator)

    def on_irrelevant_row(reason, row):
        loading_bar.update()

    if cli_args.glob is not None:
        files = create_glob_iterator(cli_args, worker_args)
    else:
        reader = casanova.reader(cli_args.report)

        try:
            files = create_report_iterator(cli_args,
                                           reader,
                                           worker_args=worker_args,
                                           on_irrelevant_row=on_irrelevant_row)
        except NotADirectoryError:
            loading_bar.die([
                'Could not find the "%s" directory!' % cli_args.input_dir,
                'Did you forget to specify it with -i/--input-dir?'
            ])

    if cli_args.format == 'csv':
        output_writer = csv.DictWriter(cli_args.output,
                                       fieldnames=scraper.headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(cli_args.output)

    pool = LazyPool(cli_args.processes,
                    initializer=init_process,
                    initargs=(scraper.definition, cli_args.strain))

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                if isinstance(error, (ScraperEvalError, ScraperEvalTypeError,
                                      ScraperEvalNoneError)):
                    loading_bar.print(report_scraper_evaluation_error(error),
                                      end='')
                loading_bar.inc('errors')
                continue

            for item in items:
                output_writer.writerow(item)