Beispiel #1
0
def main():
    parser = get_argparser()
    args = parser.parse_args()

    command = args.command
    query = args.query or ""

    logging.basicConfig(
        filename=args.log,
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s"
    )

    # catch ctrl-c so users don't see a stack trace
    signal.signal(signal.SIGINT, lambda signal, frame: sys.exit(0))

    if command == "version":
        print("twarc v%s" % __version__)
        sys.exit()
    elif command == "help" or not command:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    # Don't validate the keys if the command is "configure"
    if command == "configure" or args.skip_key_validation:
        validate_keys = False
    else:
        validate_keys = True


    t = Twarc(
        consumer_key=args.consumer_key,
        consumer_secret=args.consumer_secret,
        access_token=args.access_token,
        access_token_secret=args.access_token_secret,
        connection_errors=args.connection_errors,
        http_errors=args.http_errors,
        config=args.config,
        profile=args.profile,
        tweet_mode=args.tweet_mode,
        protected=args.protected,
        validate_keys=validate_keys,
    )

    # calls that return tweets
    if command == "search":
        things = t.search(
            query,
            since_id=args.since_id,
            max_id=args.max_id,
            lang=args.lang,
            result_type=args.result_type,
            geocode=args.geocode
        )

    elif command == "filter":
        things = t.filter(
            track=query,
            follow=args.follow,
            locations=args.locations
        )

    elif command == "dehydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='r',
            openhook=fileinput.hook_compressed,
        )
        things = t.dehydrate(input_iterator)

    elif command == "hydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='r',
            openhook=fileinput.hook_compressed,
        )
        things = t.hydrate(input_iterator)

    elif command == "tweet":
        things = [t.tweet(query)]

    elif command == "sample":
        things = t.sample()

    elif command == "timeline":
        kwargs = {"max_id": args.max_id, "since_id": args.since_id}
        if re.match('^[0-9]+$', query):
            kwargs["user_id"] = query
        else:
            kwargs["screen_name"] = query
        things = t.timeline(**kwargs)

    elif command == "retweets":
        things = t.retweets(query)

    elif command == "users":
        if os.path.isfile(query):
            iterator = fileinput.FileInput(
                query,
                mode='r',
                openhook=fileinput.hook_compressed,
            )
            if re.match('^[0-9,]+$', next(open(query))):
                id_type = 'user_id'
            else:
                id_type = 'screen_name'
            things = t.user_lookup(ids=iterator, id_type=id_type)
        elif re.match('^[0-9,]+$', query):
            things = t.user_lookup(ids=query.split(","))
        else:
            things = t.user_lookup(ids=query.split(","), id_type='screen_name')

    elif command == "followers":
        things = t.follower_ids(query)

    elif command == "friends":
        things = t.friend_ids(query)

    elif command == "trends":
        # lookup woeid for geo-coordinate if appropriate
        geo = re.match('^([0-9\-\.]+),([0-9\-\.]+)$', query)
        if geo:
            lat, lon = map(float, geo.groups())
            if lat > 180 or lat < -180 or lon > 180 or lon < -180:
                parser.error('LAT and LONG must be within [-180.0, 180.0]')
            places = list(t.trends_closest(lat, lon))
            if len(places) == 0:
                parser.error("Couldn't find WOE ID for %s" % query)
            query = places[0]["woeid"]

        if not query:
            things = t.trends_available()
        else:
            trends = t.trends_place(query)
            if trends:
                things = trends[0]['trends']

    elif command == "replies":
        tweet = t.tweet(query)
        if not tweet:
            parser.error("tweet with id %s does not exist" % query)
        things = t.replies(tweet, args.recursive)

    elif command == "listmembers":
        list_parts = re.match('^https://twitter.com/(.+)/lists/(.+)$', query)
        if not list_parts:
            parser.error("provide the url for the list, e.g., https://twitter.com/USAFacts/lists/us-armed-forces")
        things = t.list_members(slug=list_parts.group(2),
                                owner_screen_name=list_parts.groups(1))

    elif command == "configure":
        t.configure()
        sys.exit()

    else:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    # get the output filehandle
    if args.output:
        if pyv == 3:
            fh = codecs.open(args.output, 'wb', 'utf8')
        else:
            fh = open(args.output, 'w')
    else:
        fh = sys.stdout

    # optionally create a csv writer
    csv_writer = None
    if args.format in ("csv", "csv-excel") and command not in ["filter", "hydrate", "replies",
            "retweets", "sample", "search", "timeline", "tweet"]:
        parser.error("csv output not available for %s" % command)
    elif args.format in ("csv", "csv-excel"):
        csv_writer = csv.writer(fh)
        csv_writer.writerow(get_headings())

    line_count = 0
    file_count = 0
    for thing in things:

        # rotate the files if necessary
        if args.output and args.split and line_count % args.split == 0:
            file_count += 1
            fh = codecs.open(numbered_filepath(args.output, file_count), 'wb', 'utf8')
            if csv_writer:
                csv_writer = csv.writer(fh)
                csv_writer.writerow(get_headings())

        line_count += 1

        # ready to output

        kind_of = type(thing)
        if kind_of == str_type:
            # user or tweet IDs
            print(thing, file=fh)
            logging.info("archived %s" % thing)
        elif 'id_str' in thing:
            # tweets and users
            if (args.format == "json"):
                print(json.dumps(thing), file=fh)
            elif (args.format == "csv"):
                csv_writer.writerow(get_row(thing))
            elif (args.format == "csv-excel"):
                csv_writer.writerow(get_row(thing, excel=True))
            logging.info("archived %s", thing['id_str'])
        elif 'woeid' in thing:
            # places
            print(json.dumps(thing), file=fh)
        elif 'tweet_volume' in thing:
            # trends
            print(json.dumps(thing), file=fh)
        elif 'limit' in thing:
            # rate limits
            t = datetime.datetime.utcfromtimestamp(
                float(thing['limit']['timestamp_ms']) / 1000)
            t = t.isoformat("T") + "Z"
            logging.warn("%s tweets undelivered at %s",
                         thing['limit']['track'], t)
            if args.warnings:
                print(json.dumps(thing), file=fh)
        elif 'warning' in thing:
            # other warnings
            logging.warn(thing['warning']['message'])
            if args.warnings:
                print(json.dumps(thing), file=fh)
Beispiel #2
0
def main():
    parser = get_argparser()
    args = parser.parse_args()

    command = args.command
    query = args.query or ""

    logging.basicConfig(filename=args.log,
                        level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(message)s")

    # log and stop when process receives SIGINT
    def stop(signal, frame):
        log.warn('process received SIGNT, stopping')
        sys.exit(0)

    signal.signal(signal.SIGINT, stop)

    if command == "version":
        print("twarc v%s" % __version__)
        sys.exit()
    elif command == "help" or not command:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    # Don't validate the keys if the command is "configure"
    if command == "configure" or args.skip_key_validation:
        validate_keys = False
    else:
        validate_keys = True

    t = Twarc(consumer_key=args.consumer_key,
              consumer_secret=args.consumer_secret,
              access_token=args.access_token,
              access_token_secret=args.access_token_secret,
              connection_errors=args.connection_errors,
              http_errors=args.http_errors,
              config=args.config,
              profile=args.profile,
              tweet_mode=args.tweet_mode,
              protected=args.protected,
              validate_keys=validate_keys,
              app_auth=args.app_auth,
              gnip_auth=args.gnip_auth)

    # calls that return tweets
    if command == "search":
        if len(args.lang) > 0:
            lang = args.lang[0]
        else:
            lang = None

        # if not using a premium endpoint do a standard search
        if not args.thirtyday and not args.fullarchive and not args.gnip_fullarchive:
            things = t.search(query,
                              since_id=args.since_id,
                              max_id=args.max_id,
                              lang=lang,
                              result_type=args.result_type,
                              geocode=args.geocode)
        else:
            # parse the dates if given
            from_date = parse_dt(args.from_date) if args.from_date else None
            to_date = parse_dt(args.to_date) if args.to_date else None
            if args.gnip_fullarchive:
                env = args.gnip_fullarchive
                product = 'gnip_fullarchive'
            elif args.thirtyday:
                env = args.thirtyday
                product = '30day'
            else:
                env = args.fullarchive
                product = 'fullarchive'
            things = t.premium_search(
                query,
                product,
                env,
                from_date=from_date,
                to_date=to_date,
                sandbox=args.sandbox,
                limit=args.limit,
            )

    elif command == "filter":
        things = t.filter(track=query,
                          follow=args.follow,
                          locations=args.locations,
                          lang=args.lang)

    elif command == "dehydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='r',
            openhook=fileinput.hook_compressed,
        )
        things = t.dehydrate(input_iterator)

    elif command == "hydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='r',
            openhook=fileinput.hook_compressed,
        )
        things = t.hydrate(input_iterator)

    elif command == "tweet":
        things = [t.tweet(query)]

    elif command == "sample":
        things = t.sample()

    elif command == "timeline":
        kwargs = {"max_id": args.max_id, "since_id": args.since_id}
        if re.match('^[0-9]+$', query):
            kwargs["user_id"] = query
        elif query:
            kwargs["screen_name"] = query
        things = t.timeline(**kwargs)

    elif command == "retweets":
        if os.path.isfile(query):
            iterator = fileinput.FileInput(
                query,
                mode='r',
                openhook=fileinput.hook_compressed,
            )
            things = t.retweets(tweet_ids=iterator)
        else:
            things = t.retweets(tweet_ids=query.split(','))

    elif command == "users":
        if os.path.isfile(query):
            iterator = fileinput.FileInput(
                query,
                mode='r',
                openhook=fileinput.hook_compressed,
            )
            if re.match('^[0-9,]+$', next(open(query))):
                id_type = 'user_id'
            else:
                id_type = 'screen_name'
            things = t.user_lookup(ids=iterator, id_type=id_type)
        elif re.match('^[0-9,]+$', query):
            things = t.user_lookup(ids=query.split(","))
        else:
            things = t.user_lookup(ids=query.split(","), id_type='screen_name')

    elif command == "followers":
        things = t.follower_ids(query)

    elif command == "friends":
        things = t.friend_ids(query)

    elif command == "trends":
        # lookup woeid for geo-coordinate if appropriate
        geo = re.match('^([0-9-.]+),([0-9-.]+)$', query)
        if geo:
            lat, lon = map(float, geo.groups())
            if lat > 180 or lat < -180 or lon > 180 or lon < -180:
                parser.error('LAT and LONG must be within [-180.0, 180.0]')
            places = list(t.trends_closest(lat, lon))
            if len(places) == 0:
                parser.error("Couldn't find WOE ID for %s" % query)
            query = places[0]["woeid"]

        if not query:
            things = t.trends_available()
        else:
            trends = t.trends_place(query)
            if trends:
                things = trends[0]['trends']

    elif command == "replies":
        tweet = t.tweet(query)
        if not tweet:
            parser.error("tweet with id %s does not exist" % query)
        things = t.replies(tweet, args.recursive)

    elif command == "listmembers":
        list_parts = re.match('^https://twitter.com/(.+)/lists/(.+)$', query)
        if not list_parts:
            parser.error(
                "provide the url for the list, e.g., https://twitter.com/USAFacts/lists/us-armed-forces"
            )
        things = t.list_members(slug=list_parts.group(2),
                                owner_screen_name=list_parts.groups(1))

    elif command == "configure":
        t.configure()
        sys.exit()

    else:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    # get the output filehandle
    if args.output:
        if pyv == 3:
            fh = codecs.open(args.output, 'wb', 'utf8')
        else:
            fh = open(args.output, 'w')
    else:
        fh = sys.stdout

    # optionally create a csv writer
    csv_writer = None
    if args.format in ("csv", "csv-excel") and command not in [
            "filter", "hydrate", "replies", "retweets", "sample", "search",
            "timeline", "tweet"
    ]:
        parser.error("csv output not available for %s" % command)
    elif args.format in ("csv", "csv-excel"):
        csv_writer = csv.writer(fh)
        csv_writer.writerow(get_headings())

    line_count = 0
    file_count = 0
    for thing in things:

        # rotate the files if necessary
        if args.output and args.split and line_count % args.split == 0:
            file_count += 1
            fh = codecs.open(numbered_filepath(args.output, file_count), 'wb',
                             'utf8')
            if csv_writer:
                csv_writer = csv.writer(fh)
                csv_writer.writerow(get_headings())

        line_count += 1

        # ready to output

        kind_of = type(thing)
        if kind_of == str_type:
            # user or tweet IDs
            print(thing, file=fh)
            log.info("archived %s" % thing)
        elif 'id_str' in thing:
            # tweets and users
            if (args.format == "json"):
                print(json.dumps(thing), file=fh)
            elif (args.format == "csv"):
                csv_writer.writerow(get_row(thing))
            elif (args.format == "csv-excel"):
                csv_writer.writerow(get_row(thing, excel=True))
            log.info("archived %s", thing['id_str'])
        elif 'woeid' in thing:
            # places
            print(json.dumps(thing), file=fh)
        elif 'tweet_volume' in thing:
            # trends
            print(json.dumps(thing), file=fh)
        elif 'limit' in thing:
            # rate limits
            t = datetime.datetime.utcfromtimestamp(
                float(thing['limit']['timestamp_ms']) / 1000)
            t = t.isoformat("T") + "Z"
            log.warning("%s tweets undelivered at %s", thing['limit']['track'],
                        t)
            if args.warnings:
                print(json.dumps(thing), file=fh)
        elif 'warning' in thing:
            # other warnings
            log.warning(thing['warning']['message'])
            if args.warnings:
                print(json.dumps(thing), file=fh)
        elif 'data' in thing:
            # Labs style JSON schema.
            print(json.dumps(thing), file=fh)
Beispiel #3
0
def main():
    parser = get_argparser()
    args = parser.parse_args()

    command = args.command
    query = args.query or ""

    logging.basicConfig(filename=args.log,
                        level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(message)s")

    if command == "version":
        print("twarc v%s" % __version__)
        sys.exit()
    elif command == "help" or not command:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    t = Twarc(consumer_key=args.consumer_key,
              consumer_secret=args.consumer_secret,
              access_token=args.access_token,
              access_token_secret=args.access_token_secret,
              connection_errors=args.connection_errors,
              http_errors=args.http_errors,
              config=args.config,
              profile=args.profile,
              tweet_mode=args.tweet_mode)

    # calls that return tweets
    if command == "search":
        things = t.search(query,
                          since_id=args.since_id,
                          max_id=args.max_id,
                          lang=args.lang,
                          result_type=args.result_type,
                          geocode=args.geocode)

    elif command == "filter":
        things = t.filter(track=query,
                          follow=args.follow,
                          locations=args.locations)

    elif command == "dehydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='rU',
            openhook=fileinput.hook_compressed,
        )
        things = t.dehydrate(input_iterator)

    elif command == "hydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='rU',
            openhook=fileinput.hook_compressed,
        )
        things = t.hydrate(input_iterator)

    elif command == "tweet":
        things = [t.tweet(query)]

    elif command == "sample":
        things = t.sample()

    elif command == "timeline":
        kwargs = {"max_id": args.max_id, "since_id": args.since_id}
        if re.match('^[0-9]+$', query):
            kwargs["user_id"] = query
        else:
            kwargs["screen_name"] = query
        things = t.timeline(**kwargs)

    elif command == "retweets":
        things = t.retweets(query)

    elif command == "users":
        if os.path.isfile(query):
            iterator = fileinput.FileInput(
                query,
                mode='rU',
                openhook=fileinput.hook_compressed,
            )
            things = t.user_lookup(iterator=iterator)
        elif re.match('^[0-9,]+$', query):
            things = t.user_lookup(user_ids=query.split(","))
        else:
            things = t.user_lookup(screen_names=query.split(","))

    elif command == "followers":
        things = t.follower_ids(query)

    elif command == "friends":
        things = t.friend_ids(query)

    elif command == "trends":
        # lookup woeid for geo-coordinate if appropriate
        geo = re.match('^([0-9\-\.]+),([0-9\-\.]+)$', query)
        if geo:
            lat, lon = map(float, geo.groups())
            if lat > 180 or lat < -180 or lon > 180 or lon < -180:
                parser.error('LAT and LONG must be within [-180.0, 180.0]')
            places = list(t.trends_closest(lat, lon))
            if len(places) == 0:
                parser.error("Couldn't find WOE ID for %s" % query)
            query = places[0]["woeid"]

        if not query:
            things = t.trends_available()
        else:
            trends = t.trends_place(query)
            if trends:
                things = trends[0]['trends']

    elif command == "replies":
        tweet = t.tweet(query)
        if not tweet:
            parser.error("tweet with id %s does not exist" % query)
        things = t.replies(tweet, args.recursive)

    elif command == "configure":
        t.input_keys()
        sys.exit()

    else:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    # get the output filehandle
    if args.output:
        fh = codecs.open(args.output, 'wb', 'utf8')
    else:
        fh = sys.stdout

    # optionally create a csv writer
    csv_writer = None
    if args.format == "csv" and command not in [
            "filter", "hydrate", "replies", "retweets", "sample", "search",
            "timeline", "tweet"
    ]:
        parser.error("csv output not available for %s" % command)
    elif args.format == "csv":
        csv_writer = csv.writer(fh)
        csv_writer.writerow(get_headings())

    line_count = 0
    file_count = 0
    for thing in things:

        # rotate the files if necessary
        if args.output and args.split and line_count % args.split == 0:
            file_count += 1
            fh = codecs.open(numbered_filepath(args.output, file_count), 'wb',
                             'utf8')
            if csv_writer:
                csv_writer = csv.writer(fh)
                csv_writer.writerow(get_headings())

        line_count += 1

        # ready to output

        kind_of = type(thing)
        if kind_of == str_type:
            # user or tweet IDs
            print(thing, file=fh)
            logging.info("archived %s" % thing)
        elif 'id_str' in thing:
            # tweets and users
            if (args.format == "json"):
                print(json.dumps(thing), file=fh)
            elif (args.format == "csv"):
                csv_writer.writerow(get_row(thing))
            logging.info("archived %s", thing['id_str'])
        elif 'woeid' in thing:
            # places
            print(json.dump(thing), file=fh)
        elif 'tweet_volume' in thing:
            # trends
            print(json.dump(thing), file=fh)
        elif 'limit' in thing:
            # rate limits
            t = datetime.datetime.utcfromtimestamp(
                float(thing['limit']['timestamp_ms']) / 1000)
            t = t.isoformat("T") + "Z"
            logging.warn("%s tweets undelivered at %s",
                         thing['limit']['track'], t)
            if args.warnings:
                print(json.dump(thing), file=fh)
        elif 'warning' in thing:
            # other warnings
            logging.warn(thing['warning']['message'])
            if args.warnings:
                print(json.dump(thing), file=fh)