Example 1
def videos_action(namespace, output_file):

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=REPORT_HEADERS,
                                 keep=namespace.select)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    def rows_with_videos_id():

        for row, ytb_data in enricher.cells(namespace.column, with_rows=True):
            video_id = None

            if is_youtube_video_id(ytb_data):
                video_id = ytb_data
            elif is_youtube_url(ytb_data):
                video_id = extract_video_id_from_youtube_url(ytb_data)

            yield row, video_id

    for chunk in chunks_iter(rows_with_videos_id(), 50):

        all_ids = [video_id for _, video_id in chunk if video_id]
        list_id = ",".join(all_ids)

        url = URL_TEMPLATE % {'list_id': list_id, 'key': namespace.key}
        err, response, result = request_json(http, url)

        if err:
            die(err)
        elif response.status == 403:
            # Out of daily API quota: sleep until the reset at midnight
            # Pacific time. Note the current chunk is skipped, not retried.
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        data = get_data(result)

        id_available = set(data)
        not_available = set(all_ids).difference(id_available)

        loading_bar.update(len(chunk))

        for row, video_id in chunk:
            if video_id is None or video_id in not_available:
                enricher.writerow(row)
            else:
                enricher.writerow(row, data[video_id])
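The batching above relies on a `chunks_iter` helper that is not shown. A minimal sketch, assuming it simply groups any iterable into lists of at most `size` items (the YouTube videos endpoint accepts up to 50 ids per request):

from itertools import islice

def chunks_iter(iterable, size):
    # Yield successive lists of at most `size` items.
    iterator = iter(iterable)

    while True:
        chunk = list(islice(iterator, size))

        if not chunk:
            return

        yield chunk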
Example 2
def twitter_action(namespace):

    # Credentials are required to be able to access the API
    if not namespace.api_key or \
       not namespace.api_secret_key or \
       not namespace.access_token or \
       not namespace.access_token_secret:
        die([
            'Full credentials are required to access Twitter API.',
            'You can provide them using various CLI arguments:',
            '    --api-key', '    --api-secret-key', '    --access-token',
            '    --access-token-secret'
        ])

    output_file = open_output_file(
        namespace.output,
        flag='a+' if getattr(namespace, 'resume', False) else 'w')

    if namespace.tw_action == 'friends':
        from minet.cli.twitter.friends import twitter_friends_action

        twitter_friends_action(namespace, output_file)

    # Cleanup
    if namespace.output is not None:
        output_file.close()
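The `open_output_file` helper is not shown. A plausible sketch, inferred from the cleanup step above, which only closes the file when `namespace.output` was actually given:

import sys

def open_output_file(path, flag='w'):
    # Fall back to stdout when no path is given so results can be piped;
    # callers are expected not to close it in that case.
    if path is None:
        return sys.stdout

    return open(path, flag, encoding='utf-8')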
Example 3
def crowdtangle_posts_by_id_action(namespace, output_file):

    client = CrowdTangleClient(namespace.token, rate_limit=namespace.rate_limit)

    already_done = 0

    def listener(event, row):
        nonlocal already_done

        if event == 'resume.input':
            already_done += 1

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CROWDTANGLE_POST_CSV_HEADERS,
        resumable=namespace.resume,
        listener=listener
    )

    loading_bar = tqdm(
        desc='Retrieving posts',
        dynamic_ncols=True,
        total=namespace.total,
        unit=' posts'
    )

    loading_bar.update(already_done)
    loading_bar_context = LoadingBarContext(loading_bar)

    try:
        for row, url in enricher.cells(namespace.column, with_rows=True):
            with loading_bar_context:
                url = url.strip()

                if not url:
                    enricher.writerow(row)
                    continue

                url = ensure_protocol(url)

                if not is_facebook_post_url(url):
                    enricher.writerow(row)
                    continue

                post_id = facebook.post_id_from_url(url)

                if post_id is None:
                    enricher.writerow(row)
                    continue

                post = client.post(post_id, format='csv_row')
                enricher.writerow(row, post)

    except CrowdTangleInvalidTokenError:
        die([
            'Your API token is invalid.',
            'Check that you indicated a valid one using the `--token` argument.'
        ])
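`LoadingBarContext` is not shown either. One way it could work, assuming its only job is to tick the bar exactly once per row, no matter which `continue` branch leaves the `with` block early:

class LoadingBarContext(object):
    def __init__(self, loading_bar):
        self.loading_bar = loading_bar

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Only advance the bar on a clean exit; let exceptions propagate.
        if exc_type is None:
            self.loading_bar.update()

        return False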
Example 4
def cookies_action(cli_args):
    if cli_args.csv:
        output_writer = csv.writer(cli_args.output)

    try:
        jar = getattr(browser_cookie3, cli_args.browser)()
    except browser_cookie3.BrowserCookieError:
        die('Could not extract cookies from %s!' % cli_args.browser)

    if cli_args.url is not None:
        resolver = CookieResolver(jar)

        cookie = resolver(cli_args.url)

        if cookie is not None:

            if cli_args.csv:
                output_writer.writerow(MORSEL_CSV_HEADER)

                parsed = SimpleCookie(cookie)

                for morsel in parsed.values():
                    output_writer.writerow(format_morsel_for_csv(morsel))
            else:
                print(cookie, file=cli_args.output)
        else:
            die('Could not find relevant cookie for %s in %s!' % (cli_args.url, cli_args.browser))
    else:
        if cli_args.csv:
            output_writer.writerow(COOKIE_CSV_HEADER)

            for cookie in jar:
                output_writer.writerow(format_cookie_for_csv(cookie))
        else:
            write_jar_as_text_mozilla(jar, cli_args.output)
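`CookieResolver` is assumed to turn a cookie jar into a callable that resolves a `Cookie` header string for a given url. A hypothetical sketch based on naive domain suffix matching:

from urllib.parse import urlparse

class CookieResolver(object):
    def __init__(self, jar):
        self.jar = jar

    def __call__(self, url):
        host = urlparse(url).netloc

        # Keep the cookies whose domain matches the url's host.
        matches = [
            '%s=%s' % (cookie.name, cookie.value)
            for cookie in self.jar
            if host.endswith(cookie.domain.lstrip('.'))
        ]

        return '; '.join(matches) if matches else None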
Example 5
def check_key(namespace):

    # A key is required to use the API
    if not namespace.key:
        die([
            'A key is required to access YouTube API.',
            'You can provide it using the --key argument.'
        ])
Example 6
def check_key(cli_args):

    # A key is required to use the API
    if not cli_args.key:
        die([
            'A key is required to access YouTube API.',
            'You can provide it using the --key argument.'
        ])
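Both variants of `check_key` rely on the `die` helper used throughout these examples. Its behavior, inferred from the call sites (it accepts a single message or a list of lines and never returns), could look like this:

import sys

def die(msg):
    if not isinstance(msg, list):
        msg = [msg]

    for line in msg:
        print(line, file=sys.stderr)

    sys.exit(1)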
Example 7
def facebook_posts_action(cli_args):
    try:
        scraper = FacebookMobileScraper(cli_args.cookie, throttle=cli_args.throttle)
    except FacebookInvalidCookieError:
        if cli_args.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' % cli_args.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to scrape Facebook groups.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=FACEBOOK_POST_CSV_HEADERS
    )

    # Loading bar
    loading_bar = LoadingBar(
        desc='Scraping posts',
        unit='post'
    )

    translated_langs = set()

    for i, (row, url) in enumerate(enricher.cells(cli_args.column, with_rows=True), 1):
        loading_bar.inc('groups')

        try:
            posts = scraper.posts(url)
        except FacebookInvalidTargetError:
            loading_bar.print('Given url (line %i) is probably not a Facebook group: %s' % (i, url))
            continue

        for post in posts:
            if post.translated_text and post.translated_from not in translated_langs:
                translated_langs.add(post.translated_from)
                lines = [
                    'Found text translated from %s!' % post.translated_from,
                    'Since it means original text may not be entirely retrieved you might want',
                    'to edit your Facebook language settings to add "%s" to' % post.translated_from,
                    'the "Languages you don\'t want to be offered translations for" list here:',
                    'https://www.facebook.com/settings/?tab=language'
                ]

                for line in lines:
                    loading_bar.print(line)

                loading_bar.print()

            loading_bar.update()
            enricher.writerow(row, post.as_csv_row())
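`LoadingBar` exposes more than plain tqdm: named counters (`inc`, `update_stats`), a `print` that does not corrupt the bar, and a `die` shortcut used in later examples. A plausible sketch as a thin tqdm subclass; the postfix rendering is an assumption:

from tqdm import tqdm

class LoadingBar(tqdm):
    def __init__(self, desc=None, unit=None, total=None, **kwargs):
        self.stats = {}
        super().__init__(desc=desc, total=total, dynamic_ncols=True,
                         unit=' ' + unit if unit else 'it', **kwargs)

    def inc(self, name, amount=1):
        # Named counters shown as a postfix next to the bar.
        self.stats[name] = self.stats.get(name, 0) + amount
        self.set_postfix(**self.stats)

    def update_stats(self, **kwargs):
        self.stats.update(kwargs)
        self.set_postfix(**self.stats)

    def print(self, *args):
        # tqdm.write prints above the bar without breaking it.
        tqdm.write(' '.join(str(arg) for arg in args))

    def die(self, msg):
        self.close()
        die(msg)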
Example 8
def check_dragnet():
    try:
        import dragnet
    except ImportError:
        die([
            'The `dragnet` library is not installed. The `extract` command won\'t work.',
            'To install it correctly, run the following commands in order:',
            '', '  pip install lxml numpy Cython', '  pip install dragnet'
        ])
Example 9
def crowdtangle_summary_action(namespace, output_file):
    if not namespace.start_date:
        die('Missing --start-date!')

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'url')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select.split(',') if namespace.select else None,
        add=CROWDTANGLE_SUMMARY_CSV_HEADERS)

    posts_writer = None

    if namespace.posts is not None:
        posts_writer = csv.writer(namespace.posts)
        posts_writer.writerow(CROWDTANGLE_POST_CSV_HEADERS_WITH_LINK)

    loading_bar = tqdm(desc='Collecting data',
                       dynamic_ncols=True,
                       total=namespace.total,
                       unit=' urls')

    client = CrowdTangleAPIClient(namespace.token,
                                  rate_limit=namespace.rate_limit)

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        try:
            stats = client.summary(url,
                                   start_date=namespace.start_date,
                                   with_top_posts=namespace.posts is not None,
                                   sort_by=namespace.sort_by,
                                   format='csv_row',
                                   platforms=namespace.platforms)

        except CrowdTangleInvalidTokenError:
            die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])

        if namespace.posts is not None:
            stats, posts = stats

            if posts is not None:
                for post in posts:
                    posts_writer.writerow([url] + post)

        enricher.writerow(row, stats)

        loading_bar.update()
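When `namespace.column` turns out to be a raw url rather than a column name, `edit_namespace_with_csv_io` presumably fakes a one-row CSV in memory so the enricher machinery can be reused unchanged. A hypothetical sketch:

import io

def edit_namespace_with_csv_io(namespace, column_name):
    # The single value the user passed becomes the lone cell of an
    # in-memory CSV with `column_name` as its header.
    namespace.file = io.StringIO('%s\n%s' % (column_name, namespace.column))
    namespace.column = column_name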
Example 10
def main():

    # Building parser
    parser, subparser_index = build_parser(MINET_COMMANDS)

    # Parsing arguments and triggering commands
    cli_args = parser.parse_args()

    action = subparser_index.get(cli_args.action)

    if action is not None:

        # Loading config
        config = get_rcfile(cli_args.rcfile)

        # Resolving namespace dependencies
        try:
            to_close = resolve_arg_dependencies(cli_args, config)
        except OSError as e:
            parser.error('Could not open output file (-o/--output): %s' %
                         str(e))
        except NotResumable:
            parser.error(
                'Cannot --resume without knowing where the output will be written (use -o/--output)'
            )

        # Lazy loading module for faster startup
        m = importlib.import_module(action['command']['package'])
        fn = getattr(m, action['command']['action'])

        with ExitStack() as stack:
            for buffer in to_close:
                stack.callback(buffer.close)

            try:
                fn(cli_args)
            except InvalidArgumentsError as e:
                parser.error(e.message)

    elif cli_args.action == 'help':

        if len(cli_args.subcommand) == 0:
            parser.print_help()
            return

        target = get_subparser(subparser_index, cli_args.subcommand)

        if target is None:
            die('Unknown command "%s"' % ' '.join(cli_args.subcommand))
        else:
            target.print_help()

    else:
        parser.print_help()
Example 11
def videos_action(namespace, output_file):

    enricher = CSVEnricher(
        namespace.file,
        namespace.column,
        output_file,
        report_headers=REPORT_HEADERS,
        select=namespace.select.split(',') if namespace.select else None
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    for chunk in gen_chunks(enricher):

        all_ids = [row[0] for row in chunk if row[0]]
        list_id = ",".join(all_ids)

        url = URL_TEMPLATE % {'list_id': list_id, 'key': namespace.key}
        err, response, result = request_json(http, url)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        data = get_data(result)

        id_available = set(data)
        not_available = set(all_ids).difference(id_available)

        loading_bar.update(len(chunk))

        for item in chunk:
            video_id, line = item

            if video_id is None:
                enricher.write_empty(line)

            elif video_id in not_available:
                line_empty = [video_id] + [''] * (len(REPORT_HEADERS) - 1)
                enricher.write(line, line_empty)

            else:
                enricher.write(line, data[video_id])
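`seconds_to_midnight_pacific_time` encodes the fact that YouTube API quotas reset at midnight Pacific time. A minimal sketch using only the standard library (zoneinfo requires Python 3.9+):

from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

def seconds_to_midnight_pacific_time():
    now = datetime.now(ZoneInfo('America/Los_Angeles'))
    midnight = (now + timedelta(days=1)).replace(
        hour=0, minute=0, second=0, microsecond=0)

    return (midnight - now).total_seconds()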
Example 12
def crowdtangle_summary_action(cli_args):
    if not cli_args.start_date:
        die('Missing --start-date!')

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=CROWDTANGLE_SUMMARY_CSV_HEADERS
    )

    posts_writer = None

    if cli_args.posts is not None:
        posts_writer = csv.writer(cli_args.posts)
        posts_writer.writerow(CROWDTANGLE_POST_CSV_HEADERS_WITH_LINK)

    loading_bar = LoadingBar(
        desc='Collecting data',
        total=cli_args.total,
        unit='url'
    )

    client = CrowdTangleAPIClient(cli_args.token, rate_limit=cli_args.rate_limit)

    for row, url in enricher.cells(cli_args.column, with_rows=True):
        url = url.strip()

        try:
            stats = client.summary(
                url,
                start_date=cli_args.start_date,
                with_top_posts=cli_args.posts is not None,
                sort_by=cli_args.sort_by,
                platforms=cli_args.platforms
            )

        except CrowdTangleInvalidTokenError:
            die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])

        if cli_args.posts is not None:
            stats, posts = stats

            if posts is not None:
                for post in posts:
                    posts_writer.writerow(post.as_csv_row())

        enricher.writerow(row, stats.as_csv_row() if stats is not None else None)

        loading_bar.update()
Example 13
def check_credentials(namespace):

    # Credentials are required to be able to access the API
    if not namespace.api_key or \
       not namespace.api_secret_key or \
       not namespace.access_token or \
       not namespace.access_token_secret:
        die([
            'Full credentials are required to access Twitter API.',
            'You can provide them using various CLI arguments:',
            '    --api-key', '    --api-secret-key', '    --access-token',
            '    --access-token-secret'
        ])
Example 14
def crowdtangle_action(namespace):

    # A token is needed to be able to access the API
    if not namespace.token:
        die([
            "A token is needed to be able to access CrowdTangle's API.",
            "You can provide one using the `--token` argument.",
        ])

    output_file = open_output_file(
        namespace.output,
        flag="a+" if getattr(namespace, "resume", False) else "w")

    if namespace.ct_action == "posts":
        from minet.cli.crowdtangle.posts import crowdtangle_posts_action

        crowdtangle_posts_action(namespace, output_file)

    elif namespace.ct_action == "posts-by-id":
        from minet.cli.crowdtangle.posts_by_id import crowdtangle_posts_by_id_action

        crowdtangle_posts_by_id_action(namespace, output_file)

    elif namespace.ct_action == "lists":
        from minet.cli.crowdtangle.lists import crowdtangle_lists_action

        crowdtangle_lists_action(namespace, output_file)

    elif namespace.ct_action == "leaderboard":
        from minet.cli.crowdtangle.leaderboard import crowdtangle_leaderboard_action

        crowdtangle_leaderboard_action(namespace, output_file)

    elif namespace.ct_action == "search":
        from minet.cli.crowdtangle.search import crowdtangle_search_action

        crowdtangle_search_action(namespace, output_file)

    elif namespace.ct_action == "summary":
        from minet.cli.crowdtangle.summary import crowdtangle_summary_action

        crowdtangle_summary_action(namespace, output_file)

    elif namespace.ct_action == "links":
        from minet.cli.crowdtangle.links import crowdtangle_links_action

        crowdtangle_links_action(namespace, output_file)

    # Cleanup
    if namespace.output is not None:
        output_file.close()
Example 15
def search_action(namespace, output_file):

    edit_namespace_with_csv_io(namespace, 'keyword')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CSV_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit='videos',
    )
    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)
    limit = namespace.limit

    for (row, keyword) in enricher.cells(namespace.column, with_rows=True):
        url = URL_template_accurate % {'subject': keyword, 'key': namespace.key}
        next_page = True
        while next_page:
            if next_page is True:
                err, response, result = request_json(http, url)
            else:
                url_next = url + '&pageToken=' + next_page
                err, response, result = request_json(http, url_next)
            if err:
                die(err)
            elif response.status == 403:
                error_file.write('Running out of API points. You will have to wait until midnight, Pacific time!')
                time.sleep(seconds_to_midnight_pacific_time())
                continue
            elif response.status >= 400:
                die(response.status)
            next_page, data_l = get_data(result)
            for data in data_l:
                if limit is not None:
                    if limit == 0:
                        return True
                    limit -= 1

                loading_bar.update()
                enricher.writerow(row, data)
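The pagination contract of `get_data` can be inferred from its call site: it returns the continuation token (falsy on the last page) together with the rows to write. A hypothetical sketch, where `format_item_for_csv` and the fields it picks are stand-ins for whatever flattening the real code performs:

def format_item_for_csv(item):
    # Hypothetical flattener matching CSV_HEADERS.
    snippet = item.get('snippet', {})
    return [item.get('id', {}).get('videoId'),
            snippet.get('title'),
            snippet.get('channelTitle'),
            snippet.get('publishedAt')]

def get_data(result):
    next_page = result.get('nextPageToken')
    items = [format_item_for_csv(item) for item in result.get('items', [])]
    return next_page, items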
Example 16
def main():

    # Building parser
    parser, subparser_index = build_parser(MINET_COMMANDS)

    # Parsing arguments and triggering commands
    args = parser.parse_args()

    action = subparser_index.get(args.action)

    if action is not None:

        # Loading config
        config = get_rcfile(args.rcfile)

        # Bootstrapping config
        for name in vars(args):
            value = getattr(args, name)

            if isinstance(value, WrappedConfigValue):
                setattr(args, name, value.resolve(config))

        # Need to check something?
        if 'before' in action['command']:
            action['command']['before']()

        # Lazy loading module for faster startup
        m = importlib.import_module(action['command']['package'])
        fn = getattr(m, action['command']['action'])

        fn(args)

    elif args.action == 'help':

        if len(args.subcommand) == 0:
            parser.print_help()
            return

        target = get_subparser(subparser_index, args.subcommand)

        if target is None:
            die('Unknown command "%s"' % ' '.join(args.subcommand))
        else:
            target.print_help()

    else:
        parser.print_help()
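The config bootstrapping loop above only makes sense if the parser leaves `WrappedConfigValue` placeholders for arguments the user did not provide on the command line. A plausible sketch:

class WrappedConfigValue(object):
    def __init__(self, key, default=None):
        self.key = key
        self.default = default

    def resolve(self, config):
        # Read the deferred value from the loaded rcfile config.
        return config.get(self.key, self.default)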
Example 17
def crowdtangle_lists_action(namespace, output_file):

    client = CrowdTangleAPIClient(namespace.token, rate_limit=namespace.rate_limit)
    writer = csv.writer(output_file)
    writer.writerow(CROWDTANGLE_LIST_CSV_HEADERS)

    try:
        lists = client.lists(format='csv_row')

        for list_row in lists:
            writer.writerow(list_row)

    except CrowdTangleInvalidTokenError:
        die([
            'Your API token is invalid.',
            'Check that you indicated a valid one using the `--token` argument.'
        ])
Example 18
def scrape_action(namespace):

    output_file = open_output_file(namespace.output)

    # Parsing scraper definition
    try:
        scraper = load_definition(namespace.scraper)
    except TypeError:
        die(['Unknown scraper format.', 'Expecting a JSON or YAML file.'])
    except Exception:
        die('Invalid scraper file.')

    if namespace.format == 'csv':
        output_headers = headers_from_definition(scraper)
        output_writer = csv.DictWriter(output_file, fieldnames=output_headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(output_file)

    loading_bar = tqdm(desc='Scraping pages',
                       total=namespace.total,
                       dynamic_ncols=True,
                       unit=' pages')

    loading_bar.set_postfix(p=namespace.processes)

    if namespace.glob is not None:
        files = create_glob_iterator(namespace, scraper)
    else:
        reader = casanova.reader(namespace.report)
        files = create_report_iterator(namespace, reader, scraper, loading_bar)

    with Pool(namespace.processes) as pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            if not isinstance(items, list):
                items = [items]

            for item in items:
                if not isinstance(item, dict):
                    item = {'value': item}

                output_writer.writerow(item)

    output_file.close()
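The `worker` function executed by the pool is not shown. A hypothetical sketch of its contract, judging from the `(error, items)` tuples consumed above; the payload layout and the `apply_scraper` helper are stand-in assumptions:

def worker(payload):
    path, encoding, scraper = payload

    try:
        with open(path, encoding=encoding or 'utf-8') as f:
            html = f.read()

        # `apply_scraper` stands in for whatever applies the parsed
        # definition to raw html and returns the scraped items.
        items = apply_scraper(scraper, html)
    except Exception as error:
        return error, []

    return None, items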
Example 19
def crowdtangle_action(namespace):

    # A token is needed to be able to access the API
    if not namespace.token:
        die([
            'A token is needed to be able to access CrowdTangle\'s API.',
            'You can provide one using the `--token` argument.'
        ])

    output_file = open_output_file(
        namespace.output,
        flag='a+' if getattr(namespace, 'resume', False) else 'w')

    if namespace.ct_action == 'posts':
        from minet.cli.crowdtangle.posts import crowdtangle_posts_action

        crowdtangle_posts_action(namespace, output_file)

    elif namespace.ct_action == 'posts-by-id':
        from minet.cli.crowdtangle.posts_by_id import crowdtangle_posts_by_id_action

        crowdtangle_posts_by_id_action(namespace, output_file)

    elif namespace.ct_action == 'lists':
        from minet.cli.crowdtangle.lists import crowdtangle_lists_action

        crowdtangle_lists_action(namespace, output_file)

    elif namespace.ct_action == 'leaderboard':
        from minet.cli.crowdtangle.leaderboard import crowdtangle_leaderboard_action

        crowdtangle_leaderboard_action(namespace, output_file)

    elif namespace.ct_action == 'search':
        from minet.cli.crowdtangle.search import crowdtangle_search_action

        crowdtangle_search_action(namespace, output_file)

    elif namespace.ct_action == 'summary':
        from minet.cli.crowdtangle.summary import crowdtangle_summary_action

        crowdtangle_summary_action(namespace, output_file)

    # Cleanup
    if namespace.output is not None:
        output_file.close()
Example 20
def facebook_comments_action(cli_args):
    try:
        scraper = FacebookMobileScraper(cli_args.cookie,
                                        throttle=cli_args.throttle)
    except FacebookInvalidCookieError:
        if cli_args.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' %
                cli_args.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to scrape Facebook comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(cli_args.file,
                                 cli_args.output,
                                 keep=cli_args.select,
                                 add=FACEBOOK_COMMENT_CSV_HEADERS)

    # Loading bar
    loading_bar = LoadingBar(desc='Scraping comments', unit='comment')

    for i, (row, url) in enumerate(enricher.cells(cli_args.column, with_rows=True), 1):
        try:
            batches = scraper.comments(url, per_call=True, detailed=True)
        except FacebookInvalidTargetError:
            loading_bar.print(
                'Given url (line %i) is probably not a Facebook resource having comments: %s'
                % (i, url))
            continue

        for details, batch in batches:
            for comment in batch:
                enricher.writerow(row, comment.as_csv_row())

            loading_bar.update(len(batch))
            loading_bar.update_stats(calls=details['calls'],
                                     replies=details['replies'],
                                     q=details['queue_size'],
                                     posts=i)
Example 21
def mediacloud_medias_action(namespace, output_file):
    added_headers = MEDIACLOUD_MEDIA_CSV_HEADER[1:]

    feeds_file = None
    feeds_writer = None

    if namespace.feeds:
        added_headers.append('feeds')
        feeds_file = open(namespace.feeds, 'w', encoding='utf-8')
        feeds_writer = csv.writer(feeds_file)
        feeds_writer.writerow(MEDIACLOUD_FEED_CSV_HEADER)

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 keep=namespace.select,
                                 add=added_headers)

    loading_bar = tqdm(desc='Fetching medias',
                       dynamic_ncols=True,
                       unit=' medias',
                       total=namespace.total)

    client = MediacloudAPIClient(namespace.token)

    for row, media_id in enricher.cells(namespace.column, with_rows=True):

        try:
            result = client.media(media_id, format='csv_row')

            if namespace.feeds:
                feeds = client.feeds(media_id, format='csv_row')

                enricher.writerow(row, result[1:] + [len(feeds)])

                for feed in feeds:
                    feeds_writer.writerow(feed)
            else:
                enricher.writerow(row, result[1:])
        except MediacloudServerError as e:
            loading_bar.close()
            die(['Aborted due to a mediacloud server error:', e.server_error])

        loading_bar.update()

    if feeds_file is not None:
        feeds_file.close()
Example 22
def crowdtangle_lists_action(cli_args):

    client = CrowdTangleAPIClient(cli_args.token,
                                  rate_limit=cli_args.rate_limit)
    writer = csv.writer(cli_args.output)
    writer.writerow(CROWDTANGLE_LIST_CSV_HEADERS)

    try:
        lists = client.lists()

        for list_row in lists:
            writer.writerow(list_row)

    except CrowdTangleInvalidTokenError:
        die([
            'Your API token is invalid.',
            'Check that you indicated a valid one using the `--token` argument.'
        ])
Example 23
def twitter_action(namespace):

    if getattr(namespace, 'resume', False) and not namespace.output:
        die('Cannot --resume if -o/--output is not set!')

    output_file = open_output_file(
        namespace.output,
        flag='a+' if getattr(namespace, 'resume', False) else 'w')

    if namespace.tw_action == 'scrape':
        from minet.cli.twitter.scrape import twitter_scrape_action

        twitter_scrape_action(namespace, output_file)

    else:
        check_credentials(namespace)

        if namespace.tw_action == 'friends':
            from minet.cli.twitter.friends import twitter_friends_action

            twitter_friends_action(namespace, output_file)

        elif namespace.tw_action == 'followers':
            from minet.cli.twitter.followers import twitter_followers_action

            twitter_followers_action(namespace, output_file)

        elif namespace.tw_action == 'users':
            from minet.cli.twitter.users import twitter_users_action

            twitter_users_action(namespace, output_file)

        elif namespace.tw_action == 'user-tweets':
            from minet.cli.twitter.user_tweets import twitter_user_tweets_action

            twitter_user_tweets_action(namespace, output_file)

        else:
            raise TypeError('unknown tw_action "%s"' % namespace.tw_action)

    # Cleanup
    if namespace.output is not None:
        output_file.close()
Example 24
def mediacloud_action(namespace):

    # A token is needed to be able to access the API
    if not namespace.token:
        die([
            'A token is needed to be able to access Mediacloud\'s API.',
            'You can provide one using the `--token` argument.'
        ])

    output_file = open_output_file(namespace.output)

    if namespace.mc_action == 'topic':
        from minet.cli.mediacloud.topic import mediacloud_topic_action
        mediacloud_topic_action(namespace, output_file)

    elif namespace.mc_action == 'search':
        from minet.cli.mediacloud.search import mediacloud_search_action
        mediacloud_search_action(namespace, output_file)

    # Cleanup
    if namespace.output is not None:
        output_file.close()
Example 25
def mediacloud_action(cli_args):

    # A token is needed to be able to access the API
    if not cli_args.token:
        die([
            'A token is needed to be able to access Mediacloud\'s API.',
            'You can provide one using the `--token` argument.'
        ])

    if cli_args.mc_action == 'medias':
        from minet.cli.mediacloud.medias import mediacloud_medias_action
        mediacloud_medias_action(cli_args)

    elif cli_args.mc_action == 'topic':
        from minet.cli.mediacloud.topic import mediacloud_topic_action
        mediacloud_topic_action(cli_args)

    elif cli_args.mc_action == 'search':
        from minet.cli.mediacloud.search import mediacloud_search_action
        mediacloud_search_action(cli_args)
Example 26
def grab_facebook_cookie(namespace):
    if namespace.cookie in ('firefox', 'chrome'):
        get_cookie_for_url = grab_cookies(namespace.cookie)

        if get_cookie_for_url is None:
            die('Could not extract cookies from %s.' % namespace.cookie)

        cookie = get_cookie_for_url(FACEBOOK_URL + '/')

    else:
        cookie = namespace.cookie.strip()

    if not cookie:
        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook pages.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    return fix_cookie(cookie)
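`grab_cookies` is assumed to load the named browser's cookie jar once and hand back a url-to-cookie resolver, or None on failure. A sketch reusing the `CookieResolver` sketched after Example 4:

import browser_cookie3

def grab_cookies(browser):
    try:
        jar = getattr(browser_cookie3, browser)()
    except browser_cookie3.BrowserCookieError:
        return None

    return CookieResolver(jar)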
Example 27
def facebook_url_likes_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=REPORT_HEADERS,
        total=cli_args.total,
        prebuffer_bytes=DEFAULT_PREBUFFER_BYTES
    )

    if cli_args.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % cli_args.column
        ])

    loading_bar = LoadingBar(
        desc='Retrieving likes',
        unit='url',
        total=enricher.total
    )

    for row, url in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        url = url.strip()

        if not url or not is_url(url, require_protocol=False):
            enricher.writerow(row)
            continue

        err, html = make_request(url)

        if err is not None:
            loading_bar.die('An error occurred while fetching like button for this url: %s' % url)

        scraped = scrape(html)

        if scraped is None:
            loading_bar.die('Could not extract Facebook likes from this url\'s like button: %s' % url)

        enricher.writerow(row, scraped)
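`make_request` follows the same `(err, result)` convention as `request_json` in the YouTube examples: it never raises and returns the error as a value instead. A hypothetical sketch with urllib3:

import urllib3

POOL = urllib3.PoolManager()

def make_request(url):
    try:
        response = POOL.request('GET', url)
    except Exception as error:
        return error, None

    if response.status >= 400:
        return response.status, None

    return None, response.data.decode('utf-8', errors='replace')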
Example 28
def mediacloud_search_action(namespace, output_file):
    writer = csv.writer(output_file)
    writer.writerow(MEDIACLOUD_STORIES_CSV_HEADER)

    client = MediacloudClient(namespace.token)

    kwargs = {
        'collections': namespace.collections
    }

    loading_bar = tqdm(
        desc='Searching stories',
        dynamic_ncols=True,
        unit=' stories'
    )

    try:
        if not namespace.skip_count:
            count = client.count(
                namespace.query,
                **kwargs
            )

            loading_bar.total = count

        iterator = client.search(
            namespace.query,
            format='csv_row',
            **kwargs
        )

        for story in iterator:
            writer.writerow(story)
            loading_bar.update()

    except MediacloudServerError as e:
        loading_bar.close()
        die([
            'Aborted due to a mediacloud server error:',
            e.server_error
        ])
Example 29
def comments_action(namespace, output_file):

    writer = csv.writer(output_file)
    writer.writerow(CSV_HEADERS)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()

    url = URL_TEMPLATE % {'id': namespace.id, 'key': namespace.key}
    next_page = True

    while next_page:

        if next_page is True:
            err, response, result = request_json(http, url)
        else:
            url_next = url + '&pageToken=' + next_page
            err, response, result = request_json(http, url_next)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        next_page, data = get_data(result)

        for comment in data:
            loading_bar.update()
            writer.writerow(comment)
Example 30
def facebook_post_authors_action(cli_args):
    try:
        scraper = FacebookMobileScraper(cli_args.cookie, throttle=cli_args.throttle)
    except FacebookInvalidCookieError:
        if cli_args.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' % cli_args.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to scrape Facebook post authors.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=FACEBOOK_USER_CSV_HEADERS
    )

    # Loading bar
    loading_bar = LoadingBar(
        desc='Finding authors',
        unit='post'
    )

    for i, (row, post_url) in enumerate(enricher.cells(cli_args.column, with_rows=True), 1):
        loading_bar.update()

        try:
            author = scraper.post_author(post_url)
        except FacebookInvalidTargetError:
            loading_bar.print('Given url (line %i) is probably not a Facebook group post: %s' % (i, post_url))
            continue

        enricher.writerow(row, author.as_csv_row() if author is not None else None)