Example #1
    def test_exceptions(self, tmpdir):
        with pytest.raises(EmptyFileError):
            casanova.enricher(StringIO(''), StringIO(''))

        output_path = str(tmpdir.join('./wrong_resumer.csv'))

        with pytest.raises(TypeError):
            resumer = ThreadSafeResumer(output_path)
            with open('./test/resources/people.csv') as f, resumer:
                casanova.enricher(f, resumer)
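All the snippets below share one core pattern: wrap an input CSV and an output file in casanova.enricher, declare appended columns with add= and retained input columns with keep=, then iterate and call writerow(row, addendum). As context, here is a minimal self-contained sketch of that pattern; the column names and data are illustrative only, not taken from any project above:

import casanova
from io import StringIO

input_csv = StringIO('name,surname\nJohn,Matthews\n')
output_csv = StringIO()

# Keep only the "name" input column and append a computed "name_length" column
enricher = casanova.enricher(
    input_csv,
    output_csv,
    add=('name_length',),
    keep=('name',)
)

for row, name in enricher.cells('name', with_rows=True):
    enricher.writerow(row, [len(name)])

# output_csv now holds two rows: "name,name_length" then "John,4"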
Example #2
    def test_resumable(self, tmpdir):

        log = defaultdict(list)

        def listener(name, row):
            log[name].append(list(row))

        output_path = str(tmpdir.join('./enriched_resumable.csv'))

        resumer = RowCountResumer(output_path, listener=listener)

        with open('./test/resources/people.csv') as f, resumer:

            enricher = casanova.enricher(
                f, resumer,
                add=('x2',),
                keep=('name',)
            )

            row = next(iter(enricher))
            enricher.writerow(row, [2])

        assert collect_csv(output_path) == [
            ['name', 'x2'],
            ['John', '2']
        ]

        with open('./test/resources/people.csv') as f, resumer:

            enricher = casanova.enricher(
                f, resumer,
                add=('x2',),
                keep=('name',)
            )

            for i, row in enumerate(enricher):
                enricher.writerow(row, [(i + 2) * 2])

        assert collect_csv(output_path) == [
            ['name', 'x2'],
            ['John', '2'],
            ['Mary', '4'],
            ['Julia', '6']
        ]

        assert log == {
            'output.row': [['John', '2']],
            'input.row': [['John', 'Matthews']]
        }
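This test assumes a ./test/resources/people.csv fixture. Its shape can be recovered from the assertions: a name column and a surname column, three rows, the first being John,Matthews. A compatible reconstruction follows; only John's surname is asserted, the other two are placeholder guesses:

name,surname
John,Matthews
Mary,Sue
Julia,Stone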
Example #3
def url_parse_action(namespace, output_file):
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=REPORT_HEADERS,
        keep=namespace.select
    )

    loading_bar = tqdm(
        desc='Parsing',
        dynamic_ncols=True,
        unit=' lines',
    )

    for row, url in enricher.cells(namespace.column, with_rows=True):

        loading_bar.update()

        url = url.strip()
        youtube_url = parse_youtube_url(url)

        if not youtube_url:
            enricher.writerow(row)
            continue

        enricher.writerow(
            row,
            [YOUTUBE_TYPES.get(type(youtube_url)), youtube_url.id, getattr(youtube_url, 'name', None)]
        )
Example #4
def captions_action(namespace, output_file):

    # Handling output
    single_video = namespace.file is sys.stdin and sys.stdin.isatty()

    if single_video:
        edit_namespace_with_csv_io(namespace, 'video')

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=YOUTUBE_CAPTIONS_CSV_HEADERS,
                                 keep=namespace.select)

    loading_bar = LoadingBar('Retrieving captions', unit='video')

    for row, video in enricher.cells(namespace.column, with_rows=True):
        result = get_video_captions(video, langs=namespace.lang)
        loading_bar.update()

        if result is None:
            continue

        track, lines = result

        prefix = [track.lang, '1' if track.generated else '']

        for line in lines:
            enricher.writerow(row, prefix + list(line))

    loading_bar.close()
Example #5
def comments_action(namespace, output_file):

    # Handling output
    single_video = namespace.file is sys.stdin and sys.stdin.isatty()

    if single_video:
        edit_namespace_with_csv_io(namespace, 'video')

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=YOUTUBE_COMMENT_CSV_HEADERS,
                                 keep=namespace.select)

    loading_bar = LoadingBar('Retrieving comments',
                             unit='comment',
                             stats={'videos': 0})

    def before_sleep_until_midnight(seconds):
        loading_bar.print(
            'API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        namespace.key, before_sleep_until_midnight=before_sleep_until_midnight)

    for row, video in enricher.cells(namespace.column, with_rows=True):
        generator = client.comments(video)

        for comment in generator:
            loading_bar.update()
            enricher.writerow(row, comment.as_csv_row())

        loading_bar.inc('videos')

    loading_bar.close()
Example #6
def videos_action(namespace, output_file):

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=YOUTUBE_VIDEO_CSV_HEADERS,
                                 keep=namespace.select)

    loading_bar = LoadingBar('Retrieving videos',
                             unit='video',
                             total=namespace.total)

    def before_sleep_until_midnight(seconds):
        loading_bar.print(
            'API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        namespace.key, before_sleep_until_midnight=before_sleep_until_midnight)

    iterator = enricher.cells(namespace.column, with_rows=True)

    for (row, _), video in client.videos(iterator, key=itemgetter(1)):
        loading_bar.update()
        enricher.writerow(row, video.as_csv_row() if video else None)

    loading_bar.close()
Example #7
def crowdtangle_posts_by_id_action(namespace, output_file):

    client = CrowdTangleClient(namespace.token, rate_limit=namespace.rate_limit)

    already_done = 0

    def listener(event, row):
        nonlocal already_done

        if event == 'resume.input':
            already_done += 1

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CROWDTANGLE_POST_CSV_HEADERS,
        resumable=namespace.resume,
        listener=listener
    )

    loading_bar = tqdm(
        desc='Retrieving posts',
        dynamic_ncols=True,
        total=namespace.total,
        unit=' posts'
    )

    loading_bar.update(already_done)
    loading_bar_context = LoadingBarContext(loading_bar)

    try:
        for row, url in enricher.cells(namespace.column, with_rows=True):
            with loading_bar_context:
                url = url.strip()

                if not url:
                    enricher.writerow(row)
                    continue

                url = ensure_protocol(url)

                if not is_facebook_post_url(url):
                    enricher.writerow(row)
                    continue

                post_id = facebook.post_id_from_url(url)

                if post_id is None:
                    enricher.writerow(row)
                    continue

                post = client.post(post_id, format='csv_row')
                enricher.writerow(row, post)

    except CrowdTangleInvalidTokenError:
        die([
            'Your API token is invalid.',
            'Check that you indicated a valid one using the `--token` argument.'
        ])
Example #8
def extract_action(cli_args):
    if cli_args.glob is None and cli_args.input_dir is None:
        cli_args.input_dir = DEFAULT_CONTENT_FOLDER

    input_data = cli_args.report

    if cli_args.glob is not None:
        input_data = dummy_csv_file_from_glob(cli_args.glob, cli_args.input_dir)

    enricher = casanova.enricher(
        input_data,
        cli_args.output,
        keep=cli_args.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Extracting content',
        total=cli_args.total,
        unit='doc'
    )

    def on_irrelevant_row(reason, row, i):
        loading_bar.update()
        loading_bar.print('Row n°{n} could not be processed: {reason}'.format(n=i + 1, reason=reason))
        enricher.writerow(row, format_error(reason))

    if (
        cli_args.glob is None and
        'raw_contents' not in enricher.headers and
        not isdir(cli_args.input_dir)
    ):
        loading_bar.die([
            'Could not find the "%s" directory!' % cli_args.input_dir,
            'Did you forget to specify it with -i/--input-dir?'
        ])

    files = create_report_iterator(
        cli_args,
        enricher,
        on_irrelevant_row=on_irrelevant_row
    )

    pool = LazyPool(cli_args.processes)

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, format_error(report_error(error)))
                continue

            if result is None:
                enricher.writerow(row, format_error('no-result'))
                continue

            enricher.writerow(row, result)
Example #9
def comments_action(cli_args):
    enricher = casanova.enricher(cli_args.file,
                                 cli_args.output,
                                 add=YOUTUBE_COMMENT_CSV_HEADERS,
                                 keep=cli_args.select)

    loading_bar = LoadingBar('Retrieving comments',
                             unit='comment',
                             stats={'videos': 0})

    def before_sleep_until_midnight(seconds):
        loading_bar.print(
            'API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        cli_args.key, before_sleep_until_midnight=before_sleep_until_midnight)

    for row, video in enricher.cells(cli_args.column, with_rows=True):
        generator = client.comments(video)

        for comment in generator:
            loading_bar.update()
            enricher.writerow(row, comment.as_csv_row())

        loading_bar.inc('videos')
Example #10
def extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(namespace.report,
                                 output_file,
                                 keep=namespace.select,
                                 add=OUTPUT_ADDITIONAL_HEADERS)

    loading_bar = tqdm(desc='Extracting content',
                       total=namespace.total,
                       dynamic_ncols=True,
                       unit=' docs')

    files = create_report_iterator(namespace,
                                   enricher,
                                   loading_bar=loading_bar)

    with Pool(namespace.processes) as pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, [report_error(error)] + PADDING)
                continue

            if result is None:
                enricher.writerow(row, ['no-content'] + PADDING)
                continue

            enricher.writerow(row, result)

    output_file.close()
Example #11
def filter_and_enrich_tweets_from_csv(f, cat_urls, of=sys.stdout, total=None):
    categories = list(cat_urls.keys())
    casa = casanova.enricher(f,
                             of,
                             add=["matched_urls", "webentities"] + categories)
    links_pos = casa.pos.links

    try:
        for row in tqdm(casa, total=total):
            cell = row[links_pos]

            # Skip rows with an empty links cell (split('|') would yield [''])
            if not cell:
                continue

            links = [normalize_url(u) for u in cell.split('|')]

            matched_urls = []
            webentities = set()
            cat_belongings = []
            for cat in categories:
                cat_match = False
                for we, urls in cat_urls[cat].items():
                    # Iterate over a copy: links is mutated inside the loop
                    for u in list(links):
                        if u in urls:
                            cat_match = True
                            matched_urls.append(u)
                            webentities.add(we)
                            links.remove(u)
                cat_belongings.append(cat_match)

            if webentities:
                casa.writerow(row,
                              ["|".join(matched_urls), "|".join(webentities)] +
                              cat_belongings)

    except Exception:
        print("ERROR while processing", row, file=sys.stderr)
        raise
Example #12
def url_extract_action(cli_args):
    enricher = casanova.enricher(cli_args.file,
                                 cli_args.output,
                                 add=REPORT_HEADERS,
                                 keep=cli_args.select)

    extract = EXTRACTORS[getattr(cli_args, 'from')]

    loading_bar = LoadingBar(desc='Extracting',
                             unit='row',
                             total=cli_args.total)

    for row, content in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        content = content.strip()

        if not content:
            continue

        for url in extract(content):
            if cli_args.base_url is not None:
                url = urljoin(cli_args.base_url, url)

            enricher.writerow(row, [url])
Example #13
def captions_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=YOUTUBE_CAPTIONS_CSV_HEADERS,
        keep=cli_args.select
    )

    loading_bar = LoadingBar(
        'Retrieving captions',
        unit='video'
    )

    for row, video in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        result = get_video_captions(video, langs=cli_args.lang)

        if result is None:
            continue

        track, lines = result

        prefix = [track.lang, '1' if track.generated else '']

        for line in lines:
            enricher.writerow(row, prefix + list(line))
Example #14
def search_action(cli_args):
    enricher = casanova.enricher(cli_args.file,
                                 cli_args.output,
                                 add=YOUTUBE_VIDEO_SNIPPET_CSV_HEADERS,
                                 keep=cli_args.select)

    loading_bar = LoadingBar('Searching videos', unit='video')

    def before_sleep_until_midnight(seconds):
        loading_bar.print(
            'API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        cli_args.key, before_sleep_until_midnight=before_sleep_until_midnight)

    for row, query in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.print('Searching for "%s"' % query)

        searcher = client.search(query, order=cli_args.order)

        if cli_args.limit:
            searcher = islice(searcher, cli_args.limit)

        for video in searcher:
            loading_bar.update()
            enricher.writerow(row, video.as_csv_row())
Example #15
def videos_action(namespace, output_file):

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=REPORT_HEADERS,
                                 keep=namespace.select)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    def rows_with_videos_id():

        for row, ytb_data in enricher.cells(namespace.column, with_rows=True):
            video_id = None

            if is_youtube_video_id(ytb_data):
                video_id = ytb_data
            elif is_youtube_url(ytb_data):
                video_id = extract_video_id_from_youtube_url(ytb_data)

            yield row, video_id

    for chunk in chunks_iter(rows_with_videos_id(), 50):

        all_ids = [video_id for _, video_id in chunk if video_id]
        list_id = ",".join(all_ids)

        url = URL_TEMPLATE % {'list_id': list_id, 'key': namespace.key}
        err, response, result = request_json(http, url)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        data = get_data(result)

        id_available = set(data)
        not_available = set(all_ids).difference(id_available)

        loading_bar.update(len(chunk))

        for row, video_id in chunk:
            if video_id is None or video_id in not_available:
                enricher.writerow(row)
            else:
                enricher.writerow(row, data[video_id])
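The chunks_iter helper used above is not shown in this snippet. Presumably it batches any iterable into lists of at most n items, which is what lets the code request video ids 50 at a time; a minimal sketch under that assumption:

from itertools import islice

def chunks_iter(iterable, n):
    # Yield successive lists of at most n items until the iterable is exhausted.
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, n))
        if not chunk:
            return
        yield chunk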
Example #16
def twitter_friends_action(namespace, output_file):

    TWITTER = {
        'access_token': namespace.access_token,
        'access_token_secret': namespace.access_token_secret,
        'api_key': namespace.api_key,
        'api_secret_key': namespace.api_secret_key
    }

    wrapper = TwitterWrapper(TWITTER)

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 keep=namespace.select,
                                 add=REPORT_HEADERS)

    loading_bar = tqdm(desc='Retrieving ids',
                       dynamic_ncols=True,
                       total=namespace.total,
                       unit=' line')

    for row, user_id in enricher.cells(namespace.column, with_rows=True):
        all_ids = []

        result = wrapper.call('friends.ids', args={'user_id': user_id})

        if result is not None:
            all_ids = result.get('ids', [])
            for friend_id in all_ids:
                enricher.writerow(row, [friend_id])

        loading_bar.update()

    loading_bar.close()
Example #17
def url_extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=REPORT_HEADERS,
        keep=namespace.select.split(',') if namespace.select else None)

    extract = EXTRACTORS[getattr(namespace, 'from')]

    loading_bar = tqdm(desc='Extracting',
                       dynamic_ncols=True,
                       unit=' rows',
                       total=namespace.total)

    for row, content in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        content = content.strip()

        if not content:
            continue

        for url in extract(content):
            if namespace.base_url is not None:
                url = urljoin(namespace.base_url, url)

            enricher.writerow(row, [url])

    output_file.close()
Example #18
def url_join_action(cli_args):
    left_reader = casanova.reader(cli_args.file1)
    left_headers = left_reader.fieldnames
    left_idx = None

    if cli_args.select:
        left_idx = left_reader.pos.collect(cli_args.select)
        left_headers = list(cli_args.select)

    # Applying column prefix now
    left_headers = [cli_args.match_column_prefix + h for h in left_headers]

    right_enricher = casanova.enricher(cli_args.file2,
                                       cli_args.output,
                                       add=left_headers)

    loading_bar = LoadingBar(desc='Indexing left file', unit='line')

    # First step is to index left file
    trie = NormalizedLRUTrie()

    for row, cell in left_reader.cells(cli_args.column1, with_rows=True):
        loading_bar.update()

        if left_idx is not None:
            row = [row[i] for i in left_idx]

        urls = [cell]

        if cli_args.separator is not None:
            urls = cell.split(cli_args.separator)

        for url in urls:
            url = url.strip()

            # NOTE: should we filter invalid urls here?
            if url:
                trie.set(url, row)

    loading_bar.close()

    loading_bar = LoadingBar(desc='Matching right file', unit='line')

    for row, url in right_enricher.cells(cli_args.column2, with_rows=True):
        loading_bar.update()

        url = url.strip()

        match = None

        # NOTE: should we filter invalid urls here?
        if url:
            match = trie.match(url)

        if match is None:
            right_enricher.writerow(row)
            continue

        right_enricher.writerow(row, match)
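NormalizedLRUTrie is not defined in this snippet; it appears to come from the ural library and matches URLs by longest normalized prefix, which is what makes this join tolerant of URL variations. A much rougher stand-in that only handles exact normalized matches, assuming ural's normalize_url, might look like this:

from ural import normalize_url

class ExactUrlIndex:
    # Simplified stand-in: exact normalized-url lookup, no prefix matching.
    def __init__(self):
        self._index = {}

    def set(self, url, value):
        self._index[normalize_url(url)] = value

    def match(self, url):
        return self._index.get(normalize_url(url))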
Example #19
def facebook_posts_action(cli_args):
    try:
        scraper = FacebookMobileScraper(cli_args.cookie, throttle=cli_args.throttle)
    except FacebookInvalidCookieError:
        if cli_args.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' % cli_args.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to scrape Facebook groups.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=FACEBOOK_POST_CSV_HEADERS
    )

    # Loading bar
    loading_bar = LoadingBar(
        desc='Scraping posts',
        unit='post'
    )

    translated_langs = set()

    for i, (row, url) in enumerate(enricher.cells(cli_args.column, with_rows=True), 1):
        loading_bar.inc('groups')

        try:
            posts = scraper.posts(url)
        except FacebookInvalidTargetError:
            loading_bar.print('Given url (line %i) is probably not a Facebook group: %s' % (i, url))
            continue

        for post in posts:
            if post.translated_text and post.translated_from not in translated_langs:
                translated_langs.add(post.translated_from)
                lines = [
                    'Found text translated from %s!' % post.translated_from,
                    'Since it means original text may not be entirely retrieved you might want',
                    'to edit your Facebook language settings to add "%s" to' % post.translated_from,
                    'the "Languages you don\'t want to be offered translations for" list here:',
                    'https://www.facebook.com/settings/?tab=language'
                ]

                for line in lines:
                    loading_bar.print(line)

                loading_bar.print()

            loading_bar.update()
            enricher.writerow(row, post.as_csv_row())
Example #20
    def test_combined_pos(self, tmpdir):
        output_path = str(tmpdir.join('./enriched.csv'))
        with open('./test/resources/people.csv') as f, \
             open(output_path, 'w', newline='') as of:
            enricher = casanova.enricher(f, of, add=('line',), keep=('surname',))

            assert len(enricher.output_headers) == 2
            assert enricher.output_headers.surname == 0
            assert enricher.output_headers.line == 1
Example #21
def crowdtangle_summary_action(namespace, output_file):
    if not namespace.start_date:
        die('Missing --start-date!')

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'url')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select.split(',') if namespace.select else None,
        add=CROWDTANGLE_SUMMARY_CSV_HEADERS)

    posts_writer = None

    if namespace.posts is not None:
        posts_writer = csv.writer(namespace.posts)
        posts_writer.writerow(CROWDTANGLE_POST_CSV_HEADERS_WITH_LINK)

    loading_bar = tqdm(desc='Collecting data',
                       dynamic_ncols=True,
                       total=namespace.total,
                       unit=' urls')

    client = CrowdTangleAPIClient(namespace.token,
                                  rate_limit=namespace.rate_limit)

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        try:
            stats = client.summary(url,
                                   start_date=namespace.start_date,
                                   with_top_posts=namespace.posts is not None,
                                   sort_by=namespace.sort_by,
                                   format='csv_row',
                                   platforms=namespace.platforms)

        except CrowdTangleInvalidTokenError:
            die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])

        if namespace.posts is not None:
            stats, posts = stats

            if posts is not None:
                for post in posts:
                    posts_writer.writerow([url] + post)

        enricher.writerow(row, stats)

        loading_bar.update()
Example #22
def twitter_users_action(cli_args):

    client = TwitterAPIClient(
        cli_args.access_token,
        cli_args.access_token_secret,
        cli_args.api_key,
        cli_args.api_secret_key
    )

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=USER_FIELDS
    )

    loading_bar = LoadingBar(
        desc='Retrieving users',
        total=cli_args.total,
        unit='user'
    )

    for chunk in as_chunks(100, enricher.cells(cli_args.column, with_rows=True)):
        users = ','.join(cell.lstrip('@') for _, cell in chunk)

        if cli_args.ids:
            client_args = {'user_id': users}
            key = 'id'
        else:
            client_args = {'screen_name': users}
            key = 'screen_name'

        try:
            result = client.call(['users', 'lookup'], **client_args)
        except TwitterHTTPError as e:
            if e.e.code == 404:
                for row, user in chunk:
                    enricher.writerow(row)
            else:
                raise e

            continue

        indexed_result = {}

        for user in result:
            user = normalize_user(user)
            user_row = format_user_as_csv_row(user)
            indexed_result[user[key]] = user_row

        for row, user in chunk:
            user_row = indexed_result.get(user.lstrip('@'))

            enricher.writerow(row, user_row)

        loading_bar.update(len(chunk))
Example #23
def facebook_comments_action(namespace):

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'post_url')

    try:
        scraper = FacebookCommentScraper(namespace.cookie)
    except FacebookInvalidCookieError:
        if namespace.cookie in ['firefox', 'chrome']:
            die('Could not extract cookies from %s.' % namespace.cookie)

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook post comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 keep=namespace.select,
                                 add=FACEBOOK_COMMENT_CSV_HEADERS)

    # Loading bar
    loading_bar = tqdm(desc='Scraping comments',
                       dynamic_ncols=True,
                       unit=' comments')

    for i, (row, url) in enumerate(enricher.cells(namespace.column, with_rows=True)):

        if not is_facebook_post_url(url):
            loading_bar.close()
            die('Given url (line %i) is not a Facebook post url: %s' %
                (i + 1, url))

        batches = scraper(url, per_call=True, detailed=True, format='csv_row')

        for details, batch in batches:
            for comment in batch:
                enricher.writerow(row, comment)

            loading_bar.update(len(batch))
            loading_bar.set_postfix(calls=details['calls'],
                                    replies=details['replies'],
                                    q=details['queue_size'],
                                    posts=i + 1)

    loading_bar.close()
Example #24
    def action(namespace, output_file):

        # TODO: this is temp debug
        def listener(event, data):
            tqdm.write(event, file=sys.stderr)
            tqdm.write(repr(data), file=sys.stderr)

        wrapper = TwitterWrapper(namespace.access_token,
                                 namespace.access_token_secret,
                                 namespace.api_key,
                                 namespace.api_secret_key,
                                 listener=listener)

        enricher = casanova.enricher(namespace.file,
                                     output_file,
                                     keep=namespace.select,
                                     add=csv_headers)

        loading_bar = tqdm(desc='Retrieving ids',
                           dynamic_ncols=True,
                           total=namespace.total,
                           unit=' followers',
                           postfix={'users': 0})

        users_done = 0

        for row, user in enricher.cells(namespace.column, with_rows=True):
            all_ids = []
            next_cursor = -1
            result = None

            if namespace.id:
                wrapper_kwargs = {'user_id': user}
            else:
                wrapper_kwargs = {'screen_name': user}

            while next_cursor != 0:
                wrapper_kwargs['cursor'] = next_cursor
                result = wrapper.call([method_name, 'ids'], **wrapper_kwargs)

                if result is not None:
                    all_ids = result.get('ids', [])
                    next_cursor = result.get('next_cursor', 0)

                    loading_bar.update(len(all_ids))

                    for user_id in all_ids:
                        enricher.writerow(row, [user_id])
                else:
                    next_cursor = 0

            users_done += 1
            loading_bar.set_postfix(users=users_done)

        loading_bar.close()
Example #25
def twitter_scrape_action(cli_args):
    scraper = TwitterAPIScraper()

    # Stats
    loading_bar = LoadingBar('Collecting tweets',
                             total=cli_args.limit,
                             unit='tweet',
                             stats={
                                 'tokens': 1,
                                 'queries': 0
                             })

    enricher = casanova.enricher(cli_args.file,
                                 cli_args.output,
                                 add=TWEET_FIELDS + ADDITIONAL_TWEET_FIELDS,
                                 keep=cli_args.select)

    def before_sleep(retry_state):
        exc = retry_state.outcome.exception()

        if isinstance(exc, TwitterPublicAPIRateLimitError):
            loading_bar.inc('tokens')

        else:
            loading_bar.inc('failures')
            loading_bar.print(
                'Failed to call Twitter search. Will retry in %s' %
                prettyprint_seconds(retry_state.idle_for))

    for row, query in enricher.cells(cli_args.query, with_rows=True):

        # Templating?
        if cli_args.query_template is not None:
            query = CUSTOM_FORMATTER.format(cli_args.query_template,
                                            value=query)

        loading_bar.print('Searching for "%s"' % query)
        loading_bar.inc('queries')

        iterator = scraper.search(
            query,
            limit=cli_args.limit,
            before_sleep=before_sleep,
            include_referenced_tweets=cli_args.include_refs,
            with_meta=True)

        try:
            for tweet, meta in iterator:
                loading_bar.update()

                tweet_row = format_tweet_as_csv_row(tweet)
                enricher.writerow(row, tweet_row + format_meta_row(meta))
        except TwitterPublicAPIOverCapacityError:
            loading_bar.die('Got an "Over Capacity" error. Shutting down...')
Example #26
def url_parse_action(namespace):

    output_file = open_output_file(namespace.output)

    headers = REPORT_HEADERS

    if namespace.facebook:
        headers = FACEBOOK_REPORT_HEADERS
    elif namespace.youtube:
        headers = YOUTUBE_REPORT_HEADERS

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=headers,
        keep=namespace.select
    )

    loading_bar = tqdm(
        desc='Parsing',
        dynamic_ncols=True,
        unit=' rows',
        total=namespace.total
    )

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        loading_bar.update()

        if namespace.separator:
            urls = url.split(namespace.separator)
        else:
            urls = [url]

        for url in urls:
            if not is_url(url, allow_spaces_in_path=True, require_protocol=False):
                enricher.writerow(row)
                continue

            if namespace.facebook:
                addendum = extract_facebook_addendum(url)
            elif namespace.youtube:
                addendum = extract_youtube_addendum(url)
            else:
                addendum = extract_standard_addendum(namespace, url)

            if addendum is None:
                enricher.writerow(row)
                continue

            enricher.writerow(row, addendum)

    output_file.close()
Example #27
def crowdtangle_summary_action(cli_args):
    if not cli_args.start_date:
        die('Missing --start-date!')

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=CROWDTANGLE_SUMMARY_CSV_HEADERS
    )

    posts_writer = None

    if cli_args.posts is not None:
        posts_writer = csv.writer(cli_args.posts)
        posts_writer.writerow(CROWDTANGLE_POST_CSV_HEADERS_WITH_LINK)

    loading_bar = LoadingBar(
        desc='Collecting data',
        total=cli_args.total,
        unit='url'
    )

    client = CrowdTangleAPIClient(cli_args.token, rate_limit=cli_args.rate_limit)

    for row, url in enricher.cells(cli_args.column, with_rows=True):
        url = url.strip()

        try:
            stats = client.summary(
                url,
                start_date=cli_args.start_date,
                with_top_posts=cli_args.posts is not None,
                sort_by=cli_args.sort_by,
                platforms=cli_args.platforms
            )

        except CrowdTangleInvalidTokenError:
            die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])

        if cli_args.posts is not None:
            stats, posts = stats

            if posts is not None:
                for post in posts:
                    posts_writer.writerow(post.as_csv_row())

        enricher.writerow(row, stats.as_csv_row() if stats is not None else None)

        loading_bar.update()
Example #28
def extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.report,
        output_file,
        keep=namespace.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Extracting content',
        total=namespace.total,
        unit='doc'
    )

    def on_irrelevant_row(reason, row):
        loading_bar.update()
        enricher.writerow(row, format_error(reason))

    try:
        files = create_report_iterator(
            namespace,
            enricher,
            on_irrelevant_row=on_irrelevant_row
        )
    except NotADirectoryError:
        loading_bar.die([
            'Could not find the "%s" directory!' % namespace.input_dir,
            'Did you forget to specify it with -i/--input-dir?'
        ])

    pool = LazyPool(namespace.processes)

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, format_error(report_error(error)))
                continue

            if result is None:
                enricher.writerow(row, format_error('no-content'))
                continue

            enricher.writerow(row, result)

    loading_bar.close()
    output_file.close()
Example #29
    def action(namespace, output_file):

        TWITTER = {
            'access_token': namespace.access_token,
            'access_token_secret': namespace.access_token_secret,
            'api_key': namespace.api_key,
            'api_secret_key': namespace.api_secret_key
        }

        wrapper = TwitterWrapper(TWITTER)

        enricher = casanova.enricher(
            namespace.file,
            output_file,
            keep=namespace.select,
            add=csv_headers
        )

        loading_bar = tqdm(
            desc='Retrieving ids',
            dynamic_ncols=True,
            total=namespace.total,
            unit=' line'
        )

        for row, user in enricher.cells(namespace.column, with_rows=True):
            all_ids = []
            next_cursor = -1
            result = None

            if namespace.id:
                wrapper_args = {'user_id': user}
            else:
                wrapper_args = {'screen_name': user}

            while next_cursor != 0:
                wrapper_args['cursor'] = next_cursor
                method = '%(method_name)s.ids' % {'method_name': method_name}
                result = wrapper.call(method, wrapper_args)

                if result is not None:
                    all_ids = result.get('ids', [])
                    next_cursor = result.get('next_cursor', 0)

                    for friend_id in all_ids:
                        enricher.writerow(row, [friend_id])
                else:
                    next_cursor = 0

            loading_bar.update()

        loading_bar.close()
Example #30
    def test_dialect(self, tmpdir):
        output_path = str(tmpdir.join('./enriched.csv'))
        with open('./test/resources/semicolons.csv') as f, \
             open(output_path, 'w', newline='') as of:
            enricher = casanova.enricher(f, of, add=('line',), delimiter=';')

            for i, row in enumerate(enricher):
                enricher.writerow(row, [i])

        assert collect_csv(output_path) == [
            ['name', 'surname', 'line'],
            ['Rose', 'Philips', '0'],
            ['Luke', 'Atman', '1']
        ]