def get_top_users(self, n=50):
    """Return the n users appearing most often in the retrieved data set.

    Builds a fresh frequency counter over author screen names and returns
    the token structure in descending order of occurrence count.
    """
    self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
    for activity in self.query.get_activity_set():
        self.freq.add(activity.screen_name)
    return self.freq.get_tokens(n)
def get_top_links(self, n=20):
    """Return the n most-shared links in the retrieved data set.

    Links are counted across every activity's unrolled URL list and
    returned in descending order of share count.
    """
    self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
    for activity in self.query.get_activity_set():
        for url in activity.most_unrolled_urls:
            self.freq.add(url)
    return self.freq.get_tokens(n)
def get_top_links(self, n=20):
    """Return the n most-shared links in the retrieved data set.

    Records whose link field holds a sentinel value ("GNIPEMPTYFIELD"
    or the string "None") are counted under the "NoLinks" bucket.
    """
    self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
    for record in self.query.get_list_set():
        link_str = record[LINKS_INDEX]
        if link_str in ("GNIPEMPTYFIELD", "None"):
            self.freq.add("NoLinks")
        else:
            self.freq.add(link_str)
    return self.freq.get_tokens(n)
def setup_analysis(do_conversation=False, do_audience=False, identifier=None, input_results=None):
    """Create placeholders for quantities of interest in the results
    structure; return the results data structure.

    If an identifier is specified, place the measurement accumulators at
    a particular key.
    """

    def weight_and_screennames():
        # Factory for defaultdict entries that track both a weight and
        # the set of screen names contributing to it.
        return {"weight": 0, "screennames": set()}

    results = {
        "tweet_count": 0,
        "non-tweet_lines": 0,
        "tweets_per_user": defaultdict(int),
        #"user_id_to_screenname":
    }
    results["do_conversation"] = bool(do_conversation)
    if do_conversation:
        # Accumulators for tweet-content ("conversation") measurements.
        results.update({
            "body_term_count": SimpleNGrams(char_lower_cutoff=3, n_grams=3, tokenizer="twitter"),
            "hashtags": defaultdict(int),
            "urls": defaultdict(int),
            "number_of_links": 0,
            "utc_timeline": defaultdict(int),
            "local_timeline": defaultdict(int),
            "at_mentions": defaultdict(weight_and_screennames),
            "in_reply_to": defaultdict(int),
            "RT_of_user": defaultdict(weight_and_screennames),
            "quote_of_user": defaultdict(weight_and_screennames),
            "url_content": SimpleNGrams(char_lower_cutoff=3, n_grams=3, tokenizer="twitter"),
        })
    results["do_audience"] = bool(do_audience)
    if do_audience:
        # Accumulators for user-profile ("audience") measurements.
        results.update({
            "bio_term_count": SimpleNGrams(char_lower_cutoff=3, n_grams=1, tokenizer="twitter"),
            "profile_locations_regions": defaultdict(int),
            "audience_api": "",
        })
    # in the future we could add custom fields by adding kwarg = func
    # where func is agg/extractor and kwarg is field name
    return results
def set_index(self, use_case, count_bucket):
    """Configure the record index and frequency counter for a use case.

    Args:
        use_case: String naming the analysis ("user", "wordc", "rate",
            "link", "time", ...); prefix-matched below.
        count_bucket: Bucket size for counts queries; must be one of
            'day', 'hour', or 'minute' for the "time" use case.

    Side effects: sets self.use_case, self.freq, self.index and may
    rewrite self.stream_url; exits the process on an invalid bucket.
    """
    self.use_case = use_case
    space_tokenizer = False
    char_upper_cutoff = 20  # longer than for normal words because of user names
    if use_case.startswith("links"):
        char_upper_cutoff = 100
        space_tokenizer = True
    # BUG FIX: the tokenizer was hard-coded to "space", leaving the
    # space_tokenizer flag unused; honor the flag so non-link use cases
    # get the twitter tokenizer (matching the get_top_* helpers).
    self.freq = SimpleNGrams(
        char_upper_cutoff=char_upper_cutoff,
        tokenizer="space" if space_tokenizer else "twitter")
    if use_case.startswith("user"):
        self.index = USER_NAME_INDEX
    elif use_case.startswith("wordc"):
        self.index = TEXT_INDEX
    elif use_case.startswith("rate"):
        self.index = DATE_INDEX
    elif use_case.startswith("link"):
        self.index = LINKS_INDEX
    elif use_case.startswith("time"):
        # Counts queries go to the counts endpoint; swap ".json" suffix.
        if not self.stream_url.endswith("counts.json"):
            self.stream_url = self.stream_url[:-5] + "/counts.json"
        if count_bucket not in ['day', 'minute', 'hour']:
            # Py2 "print >>" replaced with a portable stderr write.
            sys.stderr.write("Error. Invalid count bucket: %s \n" % str(count_bucket))
            sys.exit()
def get_top_users(self, n=50):
    """Return the n users tweeting most often in the retrieved data set.

    Users are returned in descending order of tweet count.
    """
    self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
    for record in self.query.get_list_set():
        self.freq.add(record[USER_NAME_INDEX])
    return self.freq.get_tokens(n)
def get_top_links(self, n=20):
    """Return the n most-shared links in the retrieved data set.

    Each record's link field holds a Python list literal of URLs (or a
    sentinel for no links); malformed fields are skipped with a warning.
    """
    import ast
    self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
    for x in self.query.get_list_set():
        link_str = x[LINKS_INDEX]
        if link_str != "GNIPEMPTYFIELD" and link_str != "None":
            # SECURITY FIX: the field was exec'd, which runs arbitrary
            # code from the data stream; literal_eval parses the list
            # literal without executing anything.
            try:
                link_list = ast.literal_eval(link_str)
            except (SyntaxError, ValueError):
                sys.stderr.write(
                    "WARNING: Something isn't right with this list: %s skipping it...\n" % link_str)
                continue
            for l in link_list:
                self.freq.add(l)
        else:
            self.freq.add("NoLinks")
    return self.freq.get_tokens(n)
class Results():
    """Class for aggregating and accessing search result sets and subsets.

    Returns derived values for the query specified.
    """

    def __init__(self, user, password, stream_url, paged=False,
                 output_file_path=None, pt_filter=None, max_results=100,
                 start=None, end=None, count_bucket=None, show_query=False,
                 hard_max=None):
        """Create a result set by passing all of the required parameters
        for a query. The Results class runs an API query once when
        initialized. This allows one to make multiple calls to analytics
        methods on a single query.
        """
        # run the query
        self.query = Query(user, password, stream_url, paged,
                           output_file_path, hard_max)
        self.query.execute(pt_filter=pt_filter, max_results=max_results,
                           start=start, end=end, count_bucket=count_bucket,
                           show_query=show_query)
        self.freq = None  # set by the get_top_* analytics methods

    def get_raw_results(self):
        """Generator of query results"""
        for x in self.query.get_raw_results():
            yield x

    def get_activities(self):
        """Generator of query tweet results."""
        for x in self.query.get_activity_set():
            yield x

    def get_time_series(self):
        """Generator of time series for query results."""
        for x in self.query.get_time_series():
            yield x

    def get_top_links(self, n=20):
        """Returns the links most shared in the data set retrieved in
        the order of how many times each was shared."""
        self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
        for x in self.query.get_activity_set():
            for link_str in x.most_unrolled_urls:
                self.freq.add(link_str)
        return self.freq.get_tokens(n)

    def get_top_users(self, n=50):
        """Returns the users tweeting the most in the data set retrieved.
        Users are returned in descending order of how many times they
        were tweeted."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        for x in self.query.get_activity_set():
            self.freq.add(x.screen_name)
        return self.freq.get_tokens(n)

    def get_users(self, n=None):
        """Returns the user ids for the tweets collected"""
        uniq_users = set()
        for x in self.query.get_activity_set():
            uniq_users.add(x.user_id)
        return uniq_users

    def get_top_grams(self, n=20):
        """Returns the n most frequent terms from tweet text, excluding
        common URL fragments via a session stop list."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        self.freq.sl.add_session_stop_list(["http", "https", "amp", "htt"])
        for x in self.query.get_activity_set():
            self.freq.add(x.all_text)
        return self.freq.get_tokens(n)

    def get_geo(self):
        """Generator of {id, postedTime, latitude, longitude} dicts for
        geo-tagged tweets."""
        for x in self.query.get_activity_set():
            if x.geo_coordinates is not None:
                lat_lon = x.geo_coordinates
                activity = {
                    "id": x.id,
                    # BUG FIX: str.strip(".000Z") strips any of the
                    # characters '.','0','Z' from both ends, which can eat
                    # trailing seconds digits; remove the exact suffix.
                    "postedTime": x.created_at_string.replace(".000Z", ""),
                    "latitude": lat_lon["latitude"],
                    "longitude": lat_lon["longitude"]
                }
                yield activity

    def get_frequency_items(self, size=20):
        """Retrieve the token list structure from the last query"""
        if self.freq is None:
            # BUG FIX: was misspelled "VallueError", which would raise
            # NameError instead of the intended exception.
            raise ValueError("No frequency available for use case")
        return self.freq.get_tokens(size)

    def __len__(self):
        return len(self.query)

    def __repr__(self):
        # NOTE(review): self.last_query_params is never assigned in this
        # class -- presumably set elsewhere or intended to live on
        # self.query; confirm before relying on __repr__.
        if self.last_query_params["count_bucket"] is None:
            res = [u"-" * OUTPUT_PAGE_WIDTH]
            rate = self.query.get_rate()
            unit = "Tweets/Minute"
            if rate < 0.01:
                rate *= 60.
                unit = "Tweets/Hour"
            res.append(" PowerTrack Rule: \"%s\"" %
                       self.last_query_params["pt_filter"])
            res.append(" Oldest Tweet (UTC): %s" % str(self.query.oldest_t))
            res.append(" Newest Tweet (UTC): %s" % str(self.query.newest_t))
            res.append(
                " Now (UTC): %s" %
                str(datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")))
            res.append(" %5d Tweets: %6.3f %s" %
                       (self.query.res_cnt, rate, unit))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_users()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(BIG_COLUMN_WIDTH)
            # BUG FIX: self.res_cnt is never set on Results; the count
            # lives on the query object (used above for the Tweets line).
            res.append(fmt_str % ("users", "tweets", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %5.2f%%".format(
                BIG_COLUMN_WIDTH)
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_links()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(
                int(2.5 * BIG_COLUMN_WIDTH))
            res.append(fmt_str % ("links", "mentions", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %5.2f%%".format(
                int(2.5 * BIG_COLUMN_WIDTH))
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_grams()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(BIG_COLUMN_WIDTH)
            res.append(fmt_str % ("terms", "mentions", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %6.2f%%".format(
                BIG_COLUMN_WIDTH)
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
        else:
            res = [
                "{:%Y-%m-%dT%H:%M:%S},{}".format(x[2], x[1])
                for x in self.get_time_series()
            ]
        return u"\n".join(res)
def get_top_grams(self, n=20):
    """Return the n most frequent terms from tweet text, ignoring
    common URL fragments."""
    url_noise = ["http", "https", "amp", "htt"]
    self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
    self.freq.sl.add_session_stop_list(url_noise)
    for activity in self.query.get_activity_set():
        self.freq.add(activity.all_text)
    return self.freq.get_tokens(n)
# NOTE(review): incomplete fragment -- the opening ArgumentParser(...) call
# and the body of the trailing "else:" are outside this chunk, so the code
# is left byte-identical. "docudment" is a typo inside a runtime help
# string and therefore not corrected here; the get_repr branches on
# opts.number_of_grams look inverted (None vs int cast) -- verify upstream.
description="See list of 1 and 2 grams (bag-of-words) for input corpus--1 docudment per line.") grams_parser.add_argument("file_name", metavar= "file_name", nargs="?", default=[], help="Input file name (optional).") grams_parser.add_argument("-n", "--number-of-grams", dest="number_of_grams", default=None, help="Limit list to top n 1-grams and top n 2-grams.") grams_parser.add_argument("-c", "--char-limit", dest="char_limit", default=2, help="The shortest grams to include in the count.") grams_parser.add_argument("-p", "--pretty-print", dest="pretty_print", action="store_true", default=False, help="Prettier output format") grams_parser.add_argument("-k", "--n-grams", dest="n_grams", default=2, help="N-gram depth (default 2)") grams_parser.add_argument("-f", "--filter", dest="filter", default=None, help="List of terms to filter \"the,and,happy\"") opts = grams_parser.parse_args() f = SimpleNGrams(charCutoff=int(opts.char_limit), n_grams=opts.n_grams) if opts.filter is not None: tmp = [x.lower().strip() for x in opts.filter.split(",")] f.sl.add_session_stop_list(tmp) for row in fileinput.FileInput(opts.file_name,openhook=fileinput.hook_encoded("utf-8")): f.add(row) if opts.number_of_grams is None: res = f.get_repr(opts.number_of_grams) else: res = f.get_repr(int(opts.number_of_grams)) if opts.pretty_print: fmt = ["%5s", "%9s", "%5s", "%9s", "%24s", "%7s"] for x in res.split('\n'): tmp_str = x.strip().split(",") sys.stdout.write(" ".join([j%i for i,j in zip(tmp_str,fmt)]) + "\n") else:
class Results():
    """Class for aggregating and accessing search result sets and subsets.

    Returns derived values for the query specified.
    """

    def __init__(self, user, password, stream_url, paged=False,
                 output_file_path=None, pt_filter=None, max_results=100,
                 start=None, end=None, count_bucket=None, show_query=False,
                 hard_max=None):
        """Create a result set by passing all of the required parameters
        for a query. The Results class runs an API query once when
        initialized. This allows one to make multiple calls to analytics
        methods on a single query.
        """
        # run the query
        self.query = Query(user, password, stream_url, paged,
                           output_file_path, hard_max)
        self.query.execute(pt_filter=pt_filter, max_results=max_results,
                           start=start, end=end, count_bucket=count_bucket,
                           show_query=show_query)
        self.freq = None  # set by the get_top_* analytics methods

    def get_activities(self):
        """Generator of query results."""
        for x in self.query.get_activity_set():
            yield x

    def get_time_series(self):
        """Generator of time series for query results."""
        for x in self.query.get_time_series():
            yield x

    def get_top_links(self, n=20):
        """Returns the links most shared in the data set retrieved in
        the order of how many times each was shared."""
        self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
        for x in self.query.get_list_set():
            link_str = x[LINKS_INDEX]
            if link_str != "GNIPEMPTYFIELD" and link_str != "None":
                self.freq.add(link_str)
            else:
                self.freq.add("NoLinks")
        return self.freq.get_tokens(n)

    def get_top_users(self, n=50):
        """Returns the users tweeting the most in the data set retrieved.
        Users are returned in descending order of how many times they
        were tweeted."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        for x in self.query.get_list_set():
            self.freq.add(x[USER_NAME_INDEX])
        return self.freq.get_tokens(n)

    def get_users(self, n=None):
        """Returns the user ids for the tweets collected"""
        uniq_users = set()
        for x in self.query.get_list_set():
            uniq_users.add(x[USER_ID_INDEX])
        return uniq_users

    def get_top_grams(self, n=20):
        """Returns the n most frequent terms from the tweet-text column,
        excluding common URL fragments via a session stop list."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        self.freq.sl.add_session_stop_list(["http", "https", "amp", "htt"])
        for x in self.query.get_list_set():
            self.freq.add(x[TEXT_INDEX])
        return self.freq.get_tokens(n)

    def get_geo(self):
        """Generator of {id, postedTime, latitude, longitude} dicts;
        coordinates are None when the record has no geo payload."""
        for rec in self.query.get_activity_set():
            lat, lng = None, None
            if "geo" in rec:
                if "coordinates" in rec["geo"]:
                    [lat, lng] = rec["geo"]["coordinates"]
            activity = {
                "id": rec["id"].split(":")[2],
                # BUG FIX: str.strip(".000Z") strips any of the characters
                # '.','0','Z' from both ends, which can eat trailing
                # seconds digits; remove the exact suffix instead.
                "postedTime": rec["postedTime"].replace(".000Z", ""),
                "latitude": lat,
                "longitude": lng
            }
            yield activity

    def get_frequency_items(self, size=20):
        """Retrieve the token list structure from the last query"""
        if self.freq is None:
            # BUG FIX: was misspelled "VallueError", which would raise
            # NameError instead of the intended exception.
            raise ValueError("No frequency available for use case")
        return self.freq.get_tokens(size)

    def __len__(self):
        return len(self.query)

    def __repr__(self):
        # NOTE(review): self.last_query_params is never assigned in this
        # class -- presumably set elsewhere or intended to live on
        # self.query; confirm before relying on __repr__.
        if self.last_query_params["count_bucket"] is None:
            res = [u"-" * OUTPUT_PAGE_WIDTH]
            rate = self.query.get_rate()
            unit = "Tweets/Minute"
            if rate < 0.01:
                rate *= 60.
                unit = "Tweets/Hour"
            res.append(" PowerTrack Rule: \"%s\"" %
                       self.last_query_params["pt_filter"])
            res.append(" Oldest Tweet (UTC): %s" % str(self.query.oldest_t))
            res.append(" Newest Tweet (UTC): %s" % str(self.query.newest_t))
            res.append(
                " Now (UTC): %s" %
                str(datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")))
            res.append(" %5d Tweets: %6.3f %s" %
                       (self.query.res_cnt, rate, unit))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_users()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(BIG_COLUMN_WIDTH)
            # BUG FIX: self.res_cnt is never set on Results; the count
            # lives on the query object (used above for the Tweets line).
            res.append(fmt_str % ("users", "tweets", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %5.2f%%".format(
                BIG_COLUMN_WIDTH)
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_links()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(
                int(2.5 * BIG_COLUMN_WIDTH))
            res.append(fmt_str % ("links", "mentions", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %5.2f%%".format(
                int(2.5 * BIG_COLUMN_WIDTH))
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_grams()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(BIG_COLUMN_WIDTH)
            res.append(fmt_str % ("terms", "mentions", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %6.2f%%".format(
                BIG_COLUMN_WIDTH)
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
        else:
            res = [
                "{:%Y-%m-%dT%H:%M:%S},{}".format(x[2], x[1])
                for x in self.get_time_series()
            ]
        return u"\n".join(res)
def get_top_grams(self, n=20):
    """Return the n most frequent terms from the tweet-text column,
    excluding common URL fragments."""
    url_noise = ["http", "https", "amp", "htt"]
    self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
    self.freq.sl.add_session_stop_list(url_noise)
    for record in self.query.get_list_set():
        self.freq.add(record[TEXT_INDEX])
    return self.freq.get_tokens(n)
def get_top_grams(self, n=20):
    """Return the n most frequent terms from the tweet-text column."""
    self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
    for record in self.query.get_list_set():
        text = record[TEXT_INDEX]
        self.freq.add(text)
    return self.freq.get_tokens(n)
class Results():
    """Class for aggregating and accessing search result sets and subsets.

    Returns derived values for the query specified.
    """

    def __init__(self, user, password, stream_url, paged=False,
                 output_file_path=None, pt_filter=None, max_results=100,
                 start=None, end=None, count_bucket=None, show_query=False,
                 search_v2=False):
        """Create a result set by passing all of the required parameters
        for a query. The Results class runs an API query once when
        initialized. This allows one to make multiple calls to analytics
        methods on a single query.
        """
        # run the query
        self.query = Query(user, password, stream_url, paged,
                           output_file_path, search_v2)
        self.query.execute(pt_filter=pt_filter, max_results=max_results,
                           start=start, end=end, count_bucket=count_bucket,
                           show_query=show_query)
        self.freq = None  # set by the get_top_* analytics methods

    def get_activities(self):
        """Generator of query results."""
        for x in self.query.get_activity_set():
            yield x

    def get_time_series(self):
        """Generator of time series for query results. If count_bucket
        is set to a valid string, then the returned values are from the
        counts endpoint. In the case of the data endpoint, the generator
        returns the createdDate for the activities retrieved."""
        for x in self.query.time_series:
            yield x

    def get_top_links(self, n=20):
        """Returns the links most shared in the data set retrieved in
        the order of how many times each was shared."""
        import ast
        self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
        for x in self.query.get_list_set():
            link_str = x[LINKS_INDEX]
            if link_str != "GNIPEMPTYFIELD" and link_str != "None":
                # SECURITY FIX: the field was exec'd, which runs arbitrary
                # code from the data stream; literal_eval safely parses
                # the Python-list literal instead.
                link_list = ast.literal_eval(link_str)
                for l in link_list:
                    self.freq.add(l)
            else:
                self.freq.add("NoLinks")
        return self.freq.get_tokens(n)

    def get_top_users(self, n=50):
        """Returns the users tweeting the most in the data set retrieved.
        Users are returned in descending order of how many times they
        were tweeted."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        for x in self.query.get_list_set():
            self.freq.add(x[USER_NAME_INDEX])
        return self.freq.get_tokens(n)

    def get_top_grams(self, n=20):
        """Returns the n most frequent terms from the tweet-text column."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        for x in self.query.get_list_set():
            self.freq.add(x[TEXT_INDEX])
        return self.freq.get_tokens(n)

    def get_geo(self):
        """Generator of {id, postedTime, latitude, longitude} dicts;
        coordinates are None when the record has no geo payload."""
        for rec in self.query.get_activity_set():
            lat, lng = None, None
            if "geo" in rec:
                if "coordinates" in rec["geo"]:
                    [lat, lng] = rec["geo"]["coordinates"]
            activity = {
                "id": rec["id"].split(":")[2],
                # BUG FIX: str.strip(".000Z") strips any of the characters
                # '.','0','Z' from both ends, which can eat trailing
                # seconds digits; remove the exact suffix instead.
                "postedTime": rec["postedTime"].replace(".000Z", ""),
                "latitude": lat,
                "longitude": lng
            }
            yield activity

    def get_frequency_items(self, size=20):
        """Retrieve the token list structure from the last query"""
        if self.freq is None:
            # BUG FIX: was misspelled "VallueError", which would raise
            # NameError instead of the intended exception.
            raise ValueError("No frequency available for use case")
        return self.freq.get_tokens(size)

    def __len__(self):
        return len(self.query)

    def __repr__(self):
        # NOTE(review): self.last_query_params is never assigned in this
        # class -- presumably set elsewhere or intended to live on
        # self.query; confirm before relying on __repr__.
        if self.last_query_params["count_bucket"] is None:
            res = [u"-" * OUTPUT_PAGE_WIDTH]
            rate = self.query.get_rate()
            unit = "Tweets/Minute"
            if rate < 0.01:
                rate *= 60.
                unit = "Tweets/Hour"
            res.append(" PowerTrack Rule: \"%s\"" %
                       self.last_query_params["pt_filter"])
            res.append(" Oldest Tweet (UTC): %s" % str(self.query.oldest_t))
            res.append(" Newest Tweet (UTC): %s" % str(self.query.newest_t))
            res.append(
                " Now (UTC): %s" %
                str(datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")))
            res.append(" %5d Tweets: %6.3f %s" %
                       (self.query.res_cnt, rate, unit))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_users()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(BIG_COLUMN_WIDTH)
            # BUG FIX: self.res_cnt is never set on Results; the count
            # lives on the query object (used above for the Tweets line).
            res.append(fmt_str % ("users", "tweets", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %5.2f%%".format(
                BIG_COLUMN_WIDTH)
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_links()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(
                int(2.5 * BIG_COLUMN_WIDTH))
            res.append(fmt_str % ("links", "mentions", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %5.2f%%".format(
                int(2.5 * BIG_COLUMN_WIDTH))
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_grams()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(BIG_COLUMN_WIDTH)
            res.append(fmt_str % ("terms", "mentions", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %6.2f%%".format(
                BIG_COLUMN_WIDTH)
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
        else:
            # BUG FIX: self.time_series does not exist on Results; route
            # through the accessor (which reads self.query.time_series).
            res = [
                "{:%Y-%m-%dT%H:%M:%S},{}".format(x[2], x[1])
                for x in self.get_time_series()
            ]
        return u"\n".join(res)
def __init__(self, token_list_size=20):
    """Parse command-line options and configure the search session.

    Sets up the argparse interface, the CSV record parser, the n-gram
    frequency counter, the record index for the chosen use case, and
    the normalized from/to date strings. Exits the process on invalid
    count-bucket or date options.
    """
    self.token_list_size = int(token_list_size)
    twitter_parser = argparse.ArgumentParser(
        description="GnipSearch supports the following use cases: %s" %
        str(self.USE_CASES))
    twitter_parser.add_argument("use_case", metavar="USE_CASE",
                                choices=self.USE_CASES,
                                help="Use case for this search.")
    twitter_parser.add_argument(
        "-f", "--filter", dest="filter",
        default="from:jrmontag OR from:gnip",
        help="PowerTrack filter rule (See: http://support.gnip.com/customer/portal/articles/901152-powertrack-operators)")
    twitter_parser.add_argument(
        "-l", "--stream-url", dest="stream_url",
        default="https://search.gnip.com/accounts/shendrickson/search/wayback.json",
        help="Url of search endpoint. (See your Gnip console.)")
    twitter_parser.add_argument(
        "-c", "--count", dest="csv_count", action="store_true",
        default=False,
        help="Return comma-separated 'date,counts' when using a counts.json endpoint.")
    twitter_parser.add_argument(
        "-b", "--bucket", dest="count_bucket", default="day",
        help="Bucket size for counts query. Options are day, hour, minute (default is 'day').")
    twitter_parser.add_argument(
        "-s", "--start-date", dest="start", default=None,
        help="Start of datetime window, format 'YYYY-mm-DDTHH:MM' (default: 30 days ago)")
    twitter_parser.add_argument(
        "-e", "--end-date", dest="end", default=None,
        help="End of datetime window, format 'YYYY-mm-DDTHH:MM' [Omit for most recent activities] (default: none)")
    twitter_parser.add_argument("-q", "--query", dest="query",
                                action="store_true", default=False,
                                help="View API query (no data)")
    twitter_parser.add_argument("-u", "--user-name", dest="user",
                                default="*****@*****.**",
                                help="User name")
    twitter_parser.add_argument("-p", "--password", dest="pwd",
                                help="Password")
    twitter_parser.add_argument(
        "-n", "--results-max", dest="max", default=100,
        help="Maximum results to return (default 100)")
    self.options = twitter_parser.parse_args()
    self.twitter_parser = TwacsCSV(",", False, False, True, False, True,
                                   False, False, False)
    # Column positions in the TwacsCSV record layout.
    DATE_INDEX = 1
    TEXT_INDEX = 2
    LINKS_INDEX = 3
    USER_NAME_INDEX = 7
    space_tokenizer = False
    char_upper_cutoff = 11  # links need more room and space tokenization
    if self.options.use_case.startswith("links"):
        char_upper_cutoff = 100
        space_tokenizer = True
    self.freq = SimpleNGrams(charUpperCutoff=char_upper_cutoff,
                             space_tokenizer=space_tokenizer)
    if self.options.use_case.startswith("user"):
        self.index = USER_NAME_INDEX
    elif self.options.use_case.startswith("wordc"):
        self.index = TEXT_INDEX
    elif self.options.use_case.startswith("rate"):
        self.index = DATE_INDEX
    elif self.options.use_case.startswith("link"):
        self.index = LINKS_INDEX
    elif self.options.use_case.startswith("time"):
        # Counts queries go to the counts endpoint; swap ".json" suffix.
        if not self.options.stream_url.endswith("counts.json"):
            self.options.stream_url = self.options.stream_url[:-5] + "/counts.json"
        if self.options.count_bucket not in ['day', 'minute', 'hour']:
            # Py2 "print >>" replaced with a portable stderr write; the
            # message string was split across lines in the original.
            sys.stderr.write("Error. Invalid count bucket: %s \n" %
                             str(self.options.count_bucket))
            sys.exit()
    # Accept any single-character separators in 'YYYY-mm-DDTHH:MM'.
    timeRE = re.compile(
        "([0-9]{4}).([0-9]{2}).([0-9]{2}).([0-9]{2}):([0-9]{2})")
    if self.options.start:
        dt = re.search(timeRE, self.options.start)
        if not dt:
            sys.stderr.write("Error. Invalid start-date format: %s \n" %
                             str(self.options.start))
            sys.exit()
        else:
            f = ''
            # timeRE is already compiled; re.compile(timeRE) was redundant.
            for i in range(timeRE.groups):
                f += dt.group(i + 1)
            self.fromDate = f
    if self.options.end:
        dt = re.search(timeRE, self.options.end)
        if not dt:
            sys.stderr.write("Error. Invalid end-date format: %s \n" %
                             str(self.options.end))
            sys.exit()
        else:
            e = ''
            for i in range(timeRE.groups):
                e += dt.group(i + 1)
            self.toDate = e