from collections import defaultdict


def setup_analysis(do_conversation=False, do_audience=False, identifier=None, input_results=None):
    """
    Create placeholders for quantities of interest in the results structure;
    return the results data structure. If an identifier is specified, place
    the measurement accumulators at a particular key.
    """
    def weight_and_screennames():
        return {"weight": 0, "screennames": set()}

    results = {
        "tweet_count": 0,
        "non-tweet_lines": 0,
        "tweets_per_user": defaultdict(int),
        #"user_id_to_screenname":
    }
    if do_conversation:
        results["do_conversation"] = True
        results["body_term_count"] = SimpleNGrams(char_lower_cutoff=3, n_grams=3, tokenizer="twitter")
        results["hashtags"] = defaultdict(int)
        results["urls"] = defaultdict(int)
        results["number_of_links"] = 0
        results["utc_timeline"] = defaultdict(int)
        results["local_timeline"] = defaultdict(int)
        results["at_mentions"] = defaultdict(weight_and_screennames)
        results["in_reply_to"] = defaultdict(int)
        results["RT_of_user"] = defaultdict(weight_and_screennames)
        results["quote_of_user"] = defaultdict(weight_and_screennames)
        results["url_content"] = SimpleNGrams(char_lower_cutoff=3, n_grams=3, tokenizer="twitter")
    else:
        results["do_conversation"] = False
    if do_audience:
        results["do_audience"] = True
        results["bio_term_count"] = SimpleNGrams(char_lower_cutoff=3, n_grams=1, tokenizer="twitter")
        results["profile_locations_regions"] = defaultdict(int)
        results["audience_api"] = ""
    else:
        results["do_audience"] = False
    # in the future we could add custom fields by adding kwarg=func,
    # where func is an aggregator/extractor and kwarg is the field name
    return results
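A minimal usage sketch of setup_analysis(). With both flags left at their False defaults the function needs no SimpleNGrams, so the plain accumulators can be exercised directly; the tweet dicts below are hypothetical stand-ins, not a real Gnip payload format.

# Usage sketch for setup_analysis() above; the tweet dicts are made up.
results = setup_analysis()  # both flags default to False, so SimpleNGrams is not needed

tweets = [
    {"user": {"screen_name": "alice"}, "text": "hello world"},
    {"user": {"screen_name": "alice"}, "text": "hello again"},
]
for tw in tweets:
    results["tweet_count"] += 1
    results["tweets_per_user"][tw["user"]["screen_name"]] += 1

print results["tweet_count"]            # 2
print dict(results["tweets_per_user"])  # {'alice': 2}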
def set_index(self, use_case, count_bucket):
    self.use_case = use_case
    space_tokenizer = False
    char_upper_cutoff = 20  # longer than for normal words because of user names
    if use_case.startswith("links"):
        char_upper_cutoff = 100
        space_tokenizer = True
    # self.freq = SimpleNGrams(charUpperCutoff=char_upper_cutoff, space_tokenizer=space_tokenizer)
    # space-tokenize only for the "links" use case, so URLs are not split
    # on punctuation the way ordinary words are
    tokenizer = "space" if space_tokenizer else "twitter"
    self.freq = SimpleNGrams(char_upper_cutoff=char_upper_cutoff, tokenizer=tokenizer)
    if use_case.startswith("user"):
        self.index = USER_NAME_INDEX
    elif use_case.startswith("wordc"):
        self.index = TEXT_INDEX
    elif use_case.startswith("rate"):
        self.index = DATE_INDEX
    elif use_case.startswith("link"):
        self.index = LINKS_INDEX
    elif use_case.startswith("time"):
        if not self.stream_url.endswith("counts.json"):
            self.stream_url = self.stream_url[:-5] + "/counts.json"
        if count_bucket not in ['day', 'minute', 'hour']:
            print >> sys.stderr, "Error. Invalid count bucket: %s \n" % str(count_bucket)
            sys.exit()
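For clarity, the URL surgery in the "time" branch trims the trailing ".json" (five characters) from the search endpoint and appends "/counts.json". A standalone check, using the wayback.json endpoint that appears elsewhere in this code as the sample URL:

# Standalone check of the stream_url rewrite in the "time" branch above.
url = "https://search.gnip.com/accounts/shendrickson/search/wayback.json"
print url[:-5] + "/counts.json"
# -> https://search.gnip.com/accounts/shendrickson/search/wayback/counts.json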
def get_top_users(self, n=50):
    """Returns the users tweeting most often in the retrieved data set.
    Users are returned in descending order of how many times they tweeted."""
    self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
    for x in self.query.get_activity_set():
        self.freq.add(x.screen_name)
    return self.freq.get_tokens(n)
def get_top_links(self, n=20):
    """Returns the most-shared links in the retrieved data set, in
    descending order of how many times each was shared."""
    self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
    for x in self.query.get_activity_set():
        for link_str in x.most_unrolled_urls:
            self.freq.add(link_str)
    return self.freq.get_tokens(n)
def get_top_links(self, n=20):
    """Returns the most-shared links in the retrieved data set, in
    descending order of how many times each was shared."""
    self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
    for x in self.query.get_list_set():
        link_str = x[LINKS_INDEX]
        if link_str != "GNIPEMPTYFIELD" and link_str != "None":
            self.freq.add(link_str)
        else:
            self.freq.add("NoLinks")
    return self.freq.get_tokens(n)
def get_top_grams(self, n=20):
    self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
    # drop URL fragments that the tokenizer would otherwise count as words
    self.freq.sl.add_session_stop_list(["http", "https", "amp", "htt"])
    for x in self.query.get_activity_set():
        self.freq.add(x.all_text)
    return self.freq.get_tokens(n)
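All of the get_top_* methods share the same accumulate-then-rank pattern around SimpleNGrams: construct with a tokenizer, optionally extend the stop list, add() raw text, then get_tokens(n) for the top n. A sketch of that pattern in isolation; the import path is a guess and the input strings are made up:

# Sketch of the accumulate-then-rank pattern shared by the get_top_* methods.
# Assumption: adjust the import path to wherever SimpleNGrams lives in your checkout.
from simple_n_grams.simple_n_grams import SimpleNGrams

freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
freq.sl.add_session_stop_list(["http", "https", "amp", "htt"])  # drop URL fragments
for text in ["gnip search api demo", "gnip search api example"]:
    freq.add(text)
print freq.get_tokens(5)  # top-5 tokens with their counts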
description="See list of 1 and 2 grams (bag-of-words) for input corpus--1 docudment per line.") grams_parser.add_argument("file_name", metavar= "file_name", nargs="?", default=[], help="Input file name (optional).") grams_parser.add_argument("-n", "--number-of-grams", dest="number_of_grams", default=None, help="Limit list to top n 1-grams and top n 2-grams.") grams_parser.add_argument("-c", "--char-limit", dest="char_limit", default=2, help="The shortest grams to include in the count.") grams_parser.add_argument("-p", "--pretty-print", dest="pretty_print", action="store_true", default=False, help="Prettier output format") grams_parser.add_argument("-k", "--n-grams", dest="n_grams", default=2, help="N-gram depth (default 2)") grams_parser.add_argument("-f", "--filter", dest="filter", default=None, help="List of terms to filter \"the,and,happy\"") opts = grams_parser.parse_args() f = SimpleNGrams(charCutoff=int(opts.char_limit), n_grams=opts.n_grams) if opts.filter is not None: tmp = [x.lower().strip() for x in opts.filter.split(",")] f.sl.add_session_stop_list(tmp) for row in fileinput.FileInput(opts.file_name,openhook=fileinput.hook_encoded("utf-8")): f.add(row) if opts.number_of_grams is None: res = f.get_repr(opts.number_of_grams) else: res = f.get_repr(int(opts.number_of_grams)) if opts.pretty_print: fmt = ["%5s", "%9s", "%5s", "%9s", "%24s", "%7s"] for x in res.split('\n'): tmp_str = x.strip().split(",") sys.stdout.write(" ".join([j%i for i,j in zip(tmp_str,fmt)]) + "\n") else:
help="Input file name (optional).") grams_parser.add_argument("-n", "--number-of-grams", dest="number_of_grams", default=None, help="Limit list to top n 1-grams and top n 2-grams.") grams_parser.add_argument("-c", "--char-limit", dest="char_limit", default=2, help="The shortest grams to include in the count.") grams_parser.add_argument("-p", "--pretty-print", dest="pretty_print", action="store_true", default=False, help="Prettier output format") grams_parser.add_argument("-k", "--n-grams", dest="n_grams", default=2, help="N-gram depth (default 2)") grams_parser.add_argument("-t", "--space-tokenizer", dest="space_tokenizer", default=False, action="store_true", help="Use alternate tokization on white-space only.") grams_parser.add_argument("-f", "--filter", dest="filter", default=None, help="List of terms to filter \"the,and,happy\"") opts = grams_parser.parse_args() f = SimpleNGrams(charCutoff=int(opts.char_limit), n_grams=opts.n_grams, space_tokenizer=opts.space_tokenizer) if opts.filter is not None: tmp = [x.lower().strip() for x in opts.filter.split(",")] f.sl.add_session_stop_list(tmp) for row in fileinput.FileInput(opts.file_name,openhook=fileinput.hook_encoded("utf-8")): f.add(row) if opts.number_of_grams is None: res = f.get_repr(opts.number_of_grams) else: res = f.get_repr(int(opts.number_of_grams)) if opts.pretty_print: fmt = ["%5s", "%9s", "%5s", "%9s", "%24s", "%7s"] for x in res.split('\n'): tmp_str = x.strip().split(",") sys.stdout.write(" ".join([j%i for i,j in zip(tmp_str,fmt)]) + "\n") else:
def get_top_grams(self, n=20):
    self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
    for x in self.query.get_list_set():
        self.freq.add(x[TEXT_INDEX])
    return self.freq.get_tokens(n)
def __init__(self, token_list_size=20):
    self.token_list_size = int(token_list_size)
    twitter_parser = argparse.ArgumentParser(
        description="GnipSearch supports the following use cases: %s" % str(self.USE_CASES))
    twitter_parser.add_argument("use_case", metavar="USE_CASE", choices=self.USE_CASES,
        help="Use case for this search.")
    twitter_parser.add_argument("-f", "--filter", dest="filter",
        default="from:jrmontag OR from:gnip",
        help="PowerTrack filter rule (See: http://support.gnip.com/customer/portal/articles/901152-powertrack-operators)")
    twitter_parser.add_argument("-l", "--stream-url", dest="stream_url",
        default="https://search.gnip.com/accounts/shendrickson/search/wayback.json",
        help="Url of search endpoint. (See your Gnip console.)")
    twitter_parser.add_argument("-c", "--count", dest="csv_count", action="store_true", default=False,
        help="Return comma-separated 'date,counts' when using a counts.json endpoint.")
    twitter_parser.add_argument("-b", "--bucket", dest="count_bucket", default="day",
        help="Bucket size for counts query. Options are day, hour, minute (default is 'day').")
    twitter_parser.add_argument("-s", "--start-date", dest="start", default=None,
        help="Start of datetime window, format 'YYYY-mm-DDTHH:MM' (default: 30 days ago)")
    twitter_parser.add_argument("-e", "--end-date", dest="end", default=None,
        help="End of datetime window, format 'YYYY-mm-DDTHH:MM' [Omit for most recent activities] (default: none)")
    twitter_parser.add_argument("-q", "--query", dest="query", action="store_true", default=False,
        help="View API query (no data)")
    twitter_parser.add_argument("-u", "--user-name", dest="user", default="*****@*****.**",
        help="User name")
    twitter_parser.add_argument("-p", "--password", dest="pwd", help="Password")
    twitter_parser.add_argument("-n", "--results-max", dest="max", default=100,
        help="Maximum results to return (default 100)")
    self.options = twitter_parser.parse_args()
    self.twitter_parser = TwacsCSV(",", False, False, True, False, True, False, False, False)
    DATE_INDEX = 1
    TEXT_INDEX = 2
    LINKS_INDEX = 3
    USER_NAME_INDEX = 7
    space_tokenizer = False
    char_upper_cutoff = 11
    if self.options.use_case.startswith("links"):
        char_upper_cutoff = 100
        space_tokenizer = True
    self.freq = SimpleNGrams(charUpperCutoff=char_upper_cutoff, space_tokenizer=space_tokenizer)
    if self.options.use_case.startswith("user"):
        self.index = USER_NAME_INDEX
    elif self.options.use_case.startswith("wordc"):
        self.index = TEXT_INDEX
    elif self.options.use_case.startswith("rate"):
        self.index = DATE_INDEX
    elif self.options.use_case.startswith("link"):
        self.index = LINKS_INDEX
    elif self.options.use_case.startswith("time"):
        if not self.options.stream_url.endswith("counts.json"):
            self.options.stream_url = self.options.stream_url[:-5] + "/counts.json"
        if self.options.count_bucket not in ['day', 'minute', 'hour']:
            print >> sys.stderr, "Error. Invalid count bucket: %s \n" % str(self.options.count_bucket)
            sys.exit()
    timeRE = re.compile("([0-9]{4}).([0-9]{2}).([0-9]{2}).([0-9]{2}):([0-9]{2})")
    if self.options.start:
        dt = re.search(timeRE, self.options.start)
        if not dt:
            print >> sys.stderr, "Error. Invalid start-date format: %s \n" % str(self.options.start)
            sys.exit()
        else:
            f = ''
            for i in range(timeRE.groups):
                f += dt.group(i + 1)
            self.fromDate = f
    if self.options.end:
        dt = re.search(timeRE, self.options.end)
        if not dt:
            print >> sys.stderr, "Error. Invalid end-date format: %s \n" % str(self.options.end)
            sys.exit()
        else:
            e = ''
            for i in range(timeRE.groups):
                e += dt.group(i + 1)
            self.toDate = e
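The start/end date handling above compacts 'YYYY-mm-DDTHH:MM' into a digits-only string by concatenating the regex's capture groups in order. A standalone check with a made-up sample date:

# Standalone check of the date compaction used for fromDate/toDate above.
import re

timeRE = re.compile("([0-9]{4}).([0-9]{2}).([0-9]{2}).([0-9]{2}):([0-9]{2})")
dt = re.search(timeRE, "2014-07-01T12:30")  # sample date for illustration
print "".join(dt.group(i + 1) for i in range(timeRE.groups))  # 201407011230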
grams_parser.add_argument("-t", "--space-tokenizer", dest="space_tokenizer", default=False, action="store_true", help="Use alternate tokization on white-space only.") grams_parser.add_argument("-w", "--twitter-tokenizer", dest="twitter_tokenizer", default=False, action="store_true", help="Use alternate Twitter tokization with hashtags and mentions intact.") grams_parser.add_argument("-f", "--filter", dest="filter", default=None, help="List of terms to filter \"the,and,happy\"") opts = grams_parser.parse_args() if opts.space_tokenizer: tokenizer = "space" elif opts.twitter_tokenizer: tokenizer = "twitter" else: tokenizer = "word" f = SimpleNGrams(char_lower_cutoff=int(opts.char_limit), n_grams=opts.n_grams, tokenizer=tokenizer) if opts.filter is not None: tmp = [x.lower().strip() for x in opts.filter.split(",")] f.sl.add_session_stop_list(tmp) for row in fileinput.FileInput(opts.file_name,openhook=fileinput.hook_encoded("utf-8")): f.add(row) if opts.number_of_grams is None: res = f.get_repr(opts.number_of_grams) else: res = f.get_repr(int(opts.number_of_grams)) if opts.pretty_print: fmt = ["%5s", "%9s", "%5s", "%9s", "%34s", "%7s"] for x in res.split('\n'): tmp_str = x.strip().split(",") sys.stdout.write(" ".join([j%i for i,j in zip(tmp_str,fmt)]) + "\n") else: