def get_top_users(self, n=50):
    """Return the n users appearing most often in the retrieved data set.

    Builds a fresh frequency counter over author screen names and returns
    the token structure in descending order of occurrence count.
    """
    self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
    for activity in self.query.get_activity_set():
        self.freq.add(activity.screen_name)
    return self.freq.get_tokens(n)
def get_top_links(self, n=20):
    """Return the n most-shared links in the retrieved data set.

    Links are counted across every activity's unrolled URL list and
    returned in descending order of share count.
    """
    self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
    for activity in self.query.get_activity_set():
        for url in activity.most_unrolled_urls:
            self.freq.add(url)
    return self.freq.get_tokens(n)
def get_top_links(self, n=20):
    """Return the n most-shared links in the retrieved data set.

    Records whose link field holds a sentinel value ("GNIPEMPTYFIELD"
    or the string "None") are counted under the "NoLinks" bucket.
    """
    self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
    for record in self.query.get_list_set():
        link_str = record[LINKS_INDEX]
        if link_str in ("GNIPEMPTYFIELD", "None"):
            self.freq.add("NoLinks")
        else:
            self.freq.add(link_str)
    return self.freq.get_tokens(n)
def setup_analysis(do_conversation=False, do_audience=False, identifier=None, input_results=None):
    """Create placeholders for quantities of interest in the results
    structure; return the results data structure.

    If an identifier is specified, place the measurement accumulators at
    a particular key.
    """

    def weight_and_screennames():
        # Factory for defaultdict entries that track both a weight and
        # the set of screen names contributing to it.
        return {"weight": 0, "screennames": set()}

    results = {
        "tweet_count": 0,
        "non-tweet_lines": 0,
        "tweets_per_user": defaultdict(int),
        #"user_id_to_screenname":
    }
    results["do_conversation"] = bool(do_conversation)
    if do_conversation:
        # Accumulators for tweet-content ("conversation") measurements.
        results.update({
            "body_term_count": SimpleNGrams(char_lower_cutoff=3, n_grams=3, tokenizer="twitter"),
            "hashtags": defaultdict(int),
            "urls": defaultdict(int),
            "number_of_links": 0,
            "utc_timeline": defaultdict(int),
            "local_timeline": defaultdict(int),
            "at_mentions": defaultdict(weight_and_screennames),
            "in_reply_to": defaultdict(int),
            "RT_of_user": defaultdict(weight_and_screennames),
            "quote_of_user": defaultdict(weight_and_screennames),
            "url_content": SimpleNGrams(char_lower_cutoff=3, n_grams=3, tokenizer="twitter"),
        })
    results["do_audience"] = bool(do_audience)
    if do_audience:
        # Accumulators for user-profile ("audience") measurements.
        results.update({
            "bio_term_count": SimpleNGrams(char_lower_cutoff=3, n_grams=1, tokenizer="twitter"),
            "profile_locations_regions": defaultdict(int),
            "audience_api": "",
        })
    # in the future we could add custom fields by adding kwarg = func
    # where func is agg/extractor and kwarg is field name
    return results
def set_index(self, use_case, count_bucket):
    """Configure the record index and frequency counter for a use case.

    Args:
        use_case: String naming the analysis ("user", "wordc", "rate",
            "link", "time", ...); prefix-matched below.
        count_bucket: Bucket size for counts queries; must be one of
            'day', 'hour', or 'minute' for the "time" use case.

    Side effects: sets self.use_case, self.freq, self.index and may
    rewrite self.stream_url; exits the process on an invalid bucket.
    """
    self.use_case = use_case
    space_tokenizer = False
    char_upper_cutoff = 20  # longer than for normal words because of user names
    if use_case.startswith("links"):
        char_upper_cutoff = 100
        space_tokenizer = True
    # BUG FIX: the tokenizer was hard-coded to "space", leaving the
    # space_tokenizer flag unused; honor the flag so non-link use cases
    # get the twitter tokenizer (matching the get_top_* helpers).
    self.freq = SimpleNGrams(
        char_upper_cutoff=char_upper_cutoff,
        tokenizer="space" if space_tokenizer else "twitter")
    if use_case.startswith("user"):
        self.index = USER_NAME_INDEX
    elif use_case.startswith("wordc"):
        self.index = TEXT_INDEX
    elif use_case.startswith("rate"):
        self.index = DATE_INDEX
    elif use_case.startswith("link"):
        self.index = LINKS_INDEX
    elif use_case.startswith("time"):
        # Counts queries go to the counts endpoint; swap ".json" suffix.
        if not self.stream_url.endswith("counts.json"):
            self.stream_url = self.stream_url[:-5] + "/counts.json"
        if count_bucket not in ['day', 'minute', 'hour']:
            # Py2 "print >>" replaced with a portable stderr write.
            sys.stderr.write("Error. Invalid count bucket: %s \n" % str(count_bucket))
            sys.exit()
def get_top_users(self, n=50):
    """Return the n users tweeting most often in the retrieved data set.

    Users are returned in descending order of tweet count.
    """
    self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
    for record in self.query.get_list_set():
        self.freq.add(record[USER_NAME_INDEX])
    return self.freq.get_tokens(n)
def get_top_links(self, n=20):
    """Return the n most-shared links in the retrieved data set.

    Each record's link field holds a Python list literal of URLs (or a
    sentinel for no links); malformed fields are skipped with a warning.
    """
    import ast
    self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
    for x in self.query.get_list_set():
        link_str = x[LINKS_INDEX]
        if link_str != "GNIPEMPTYFIELD" and link_str != "None":
            # SECURITY FIX: the field was exec'd, which runs arbitrary
            # code from the data stream; literal_eval parses the list
            # literal without executing anything.
            try:
                link_list = ast.literal_eval(link_str)
            except (SyntaxError, ValueError):
                sys.stderr.write(
                    "WARNING: Something isn't right with this list: %s skipping it...\n" % link_str)
                continue
            for l in link_list:
                self.freq.add(l)
        else:
            self.freq.add("NoLinks")
    return self.freq.get_tokens(n)
class Results():
    """Class for aggregating and accessing search result sets and subsets.

    Returns derived values for the query specified.
    """

    def __init__(self, user, password, stream_url, paged=False,
                 output_file_path=None, pt_filter=None, max_results=100,
                 start=None, end=None, count_bucket=None, show_query=False,
                 hard_max=None):
        """Create a result set by passing all of the required parameters
        for a query. The Results class runs an API query once when
        initialized. This allows one to make multiple calls to analytics
        methods on a single query.
        """
        # run the query
        self.query = Query(user, password, stream_url, paged,
                           output_file_path, hard_max)
        self.query.execute(pt_filter=pt_filter, max_results=max_results,
                           start=start, end=end, count_bucket=count_bucket,
                           show_query=show_query)
        self.freq = None  # set by the get_top_* analytics methods

    def get_raw_results(self):
        """Generator of query results"""
        for x in self.query.get_raw_results():
            yield x

    def get_activities(self):
        """Generator of query tweet results."""
        for x in self.query.get_activity_set():
            yield x

    def get_time_series(self):
        """Generator of time series for query results."""
        for x in self.query.get_time_series():
            yield x

    def get_top_links(self, n=20):
        """Returns the links most shared in the data set retrieved in
        the order of how many times each was shared."""
        self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
        for x in self.query.get_activity_set():
            for link_str in x.most_unrolled_urls:
                self.freq.add(link_str)
        return self.freq.get_tokens(n)

    def get_top_users(self, n=50):
        """Returns the users tweeting the most in the data set retrieved.
        Users are returned in descending order of how many times they
        were tweeted."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        for x in self.query.get_activity_set():
            self.freq.add(x.screen_name)
        return self.freq.get_tokens(n)

    def get_users(self, n=None):
        """Returns the user ids for the tweets collected"""
        uniq_users = set()
        for x in self.query.get_activity_set():
            uniq_users.add(x.user_id)
        return uniq_users

    def get_top_grams(self, n=20):
        """Returns the n most frequent terms from tweet text, excluding
        common URL fragments via a session stop list."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        self.freq.sl.add_session_stop_list(["http", "https", "amp", "htt"])
        for x in self.query.get_activity_set():
            self.freq.add(x.all_text)
        return self.freq.get_tokens(n)

    def get_geo(self):
        """Generator of {id, postedTime, latitude, longitude} dicts for
        geo-tagged tweets."""
        for x in self.query.get_activity_set():
            if x.geo_coordinates is not None:
                lat_lon = x.geo_coordinates
                activity = {
                    "id": x.id,
                    # BUG FIX: str.strip(".000Z") strips any of the
                    # characters '.','0','Z' from both ends, which can eat
                    # trailing seconds digits; remove the exact suffix.
                    "postedTime": x.created_at_string.replace(".000Z", ""),
                    "latitude": lat_lon["latitude"],
                    "longitude": lat_lon["longitude"]
                }
                yield activity

    def get_frequency_items(self, size=20):
        """Retrieve the token list structure from the last query"""
        if self.freq is None:
            # BUG FIX: was misspelled "VallueError", which would raise
            # NameError instead of the intended exception.
            raise ValueError("No frequency available for use case")
        return self.freq.get_tokens(size)

    def __len__(self):
        return len(self.query)

    def __repr__(self):
        # NOTE(review): self.last_query_params is never assigned in this
        # class -- presumably set elsewhere or intended to live on
        # self.query; confirm before relying on __repr__.
        if self.last_query_params["count_bucket"] is None:
            res = [u"-" * OUTPUT_PAGE_WIDTH]
            rate = self.query.get_rate()
            unit = "Tweets/Minute"
            if rate < 0.01:
                rate *= 60.
                unit = "Tweets/Hour"
            res.append(" PowerTrack Rule: \"%s\"" %
                       self.last_query_params["pt_filter"])
            res.append(" Oldest Tweet (UTC): %s" % str(self.query.oldest_t))
            res.append(" Newest Tweet (UTC): %s" % str(self.query.newest_t))
            res.append(
                " Now (UTC): %s" %
                str(datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")))
            res.append(" %5d Tweets: %6.3f %s" %
                       (self.query.res_cnt, rate, unit))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_users()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(BIG_COLUMN_WIDTH)
            # BUG FIX: self.res_cnt is never set on Results; the count
            # lives on the query object (used above for the Tweets line).
            res.append(fmt_str % ("users", "tweets", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %5.2f%%".format(
                BIG_COLUMN_WIDTH)
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_links()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(
                int(2.5 * BIG_COLUMN_WIDTH))
            res.append(fmt_str % ("links", "mentions", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %5.2f%%".format(
                int(2.5 * BIG_COLUMN_WIDTH))
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_grams()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(BIG_COLUMN_WIDTH)
            res.append(fmt_str % ("terms", "mentions", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %6.2f%%".format(
                BIG_COLUMN_WIDTH)
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
        else:
            res = [
                "{:%Y-%m-%dT%H:%M:%S},{}".format(x[2], x[1])
                for x in self.get_time_series()
            ]
        return u"\n".join(res)
def get_top_grams(self, n=20):
    """Return the n most frequent terms from tweet text, ignoring
    common URL fragments."""
    url_noise = ["http", "https", "amp", "htt"]
    self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
    self.freq.sl.add_session_stop_list(url_noise)
    for activity in self.query.get_activity_set():
        self.freq.add(activity.all_text)
    return self.freq.get_tokens(n)
# NOTE(review): incomplete fragment -- the opening ArgumentParser(...) call
# and the body of the trailing "else:" are outside this chunk, so the code
# is left byte-identical. "docudment" is a typo inside a runtime help
# string and therefore not corrected here; the get_repr branches on
# opts.number_of_grams look inverted (None vs int cast) -- verify upstream.
description="See list of 1 and 2 grams (bag-of-words) for input corpus--1 docudment per line.") grams_parser.add_argument("file_name", metavar= "file_name", nargs="?", default=[], help="Input file name (optional).") grams_parser.add_argument("-n", "--number-of-grams", dest="number_of_grams", default=None, help="Limit list to top n 1-grams and top n 2-grams.") grams_parser.add_argument("-c", "--char-limit", dest="char_limit", default=2, help="The shortest grams to include in the count.") grams_parser.add_argument("-p", "--pretty-print", dest="pretty_print", action="store_true", default=False, help="Prettier output format") grams_parser.add_argument("-k", "--n-grams", dest="n_grams", default=2, help="N-gram depth (default 2)") grams_parser.add_argument("-f", "--filter", dest="filter", default=None, help="List of terms to filter \"the,and,happy\"") opts = grams_parser.parse_args() f = SimpleNGrams(charCutoff=int(opts.char_limit), n_grams=opts.n_grams) if opts.filter is not None: tmp = [x.lower().strip() for x in opts.filter.split(",")] f.sl.add_session_stop_list(tmp) for row in fileinput.FileInput(opts.file_name,openhook=fileinput.hook_encoded("utf-8")): f.add(row) if opts.number_of_grams is None: res = f.get_repr(opts.number_of_grams) else: res = f.get_repr(int(opts.number_of_grams)) if opts.pretty_print: fmt = ["%5s", "%9s", "%5s", "%9s", "%24s", "%7s"] for x in res.split('\n'): tmp_str = x.strip().split(",") sys.stdout.write(" ".join([j%i for i,j in zip(tmp_str,fmt)]) + "\n") else:
class Results():
    """Class for aggregating and accessing search result sets and subsets.

    Returns derived values for the query specified.
    """

    def __init__(self, user, password, stream_url, paged=False,
                 output_file_path=None, pt_filter=None, max_results=100,
                 start=None, end=None, count_bucket=None, show_query=False,
                 hard_max=None):
        """Create a result set by passing all of the required parameters
        for a query. The Results class runs an API query once when
        initialized. This allows one to make multiple calls to analytics
        methods on a single query.
        """
        # run the query
        self.query = Query(user, password, stream_url, paged,
                           output_file_path, hard_max)
        self.query.execute(pt_filter=pt_filter, max_results=max_results,
                           start=start, end=end, count_bucket=count_bucket,
                           show_query=show_query)
        self.freq = None  # set by the get_top_* analytics methods

    def get_activities(self):
        """Generator of query results."""
        for x in self.query.get_activity_set():
            yield x

    def get_time_series(self):
        """Generator of time series for query results."""
        for x in self.query.get_time_series():
            yield x

    def get_top_links(self, n=20):
        """Returns the links most shared in the data set retrieved in
        the order of how many times each was shared."""
        self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
        for x in self.query.get_list_set():
            link_str = x[LINKS_INDEX]
            if link_str != "GNIPEMPTYFIELD" and link_str != "None":
                self.freq.add(link_str)
            else:
                self.freq.add("NoLinks")
        return self.freq.get_tokens(n)

    def get_top_users(self, n=50):
        """Returns the users tweeting the most in the data set retrieved.
        Users are returned in descending order of how many times they
        were tweeted."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        for x in self.query.get_list_set():
            self.freq.add(x[USER_NAME_INDEX])
        return self.freq.get_tokens(n)

    def get_users(self, n=None):
        """Returns the user ids for the tweets collected"""
        uniq_users = set()
        for x in self.query.get_list_set():
            uniq_users.add(x[USER_ID_INDEX])
        return uniq_users

    def get_top_grams(self, n=20):
        """Returns the n most frequent terms from the tweet-text column,
        excluding common URL fragments via a session stop list."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        self.freq.sl.add_session_stop_list(["http", "https", "amp", "htt"])
        for x in self.query.get_list_set():
            self.freq.add(x[TEXT_INDEX])
        return self.freq.get_tokens(n)

    def get_geo(self):
        """Generator of {id, postedTime, latitude, longitude} dicts;
        coordinates are None when the record has no geo payload."""
        for rec in self.query.get_activity_set():
            lat, lng = None, None
            if "geo" in rec:
                if "coordinates" in rec["geo"]:
                    [lat, lng] = rec["geo"]["coordinates"]
            activity = {
                "id": rec["id"].split(":")[2],
                # BUG FIX: str.strip(".000Z") strips any of the characters
                # '.','0','Z' from both ends, which can eat trailing
                # seconds digits; remove the exact suffix instead.
                "postedTime": rec["postedTime"].replace(".000Z", ""),
                "latitude": lat,
                "longitude": lng
            }
            yield activity

    def get_frequency_items(self, size=20):
        """Retrieve the token list structure from the last query"""
        if self.freq is None:
            # BUG FIX: was misspelled "VallueError", which would raise
            # NameError instead of the intended exception.
            raise ValueError("No frequency available for use case")
        return self.freq.get_tokens(size)

    def __len__(self):
        return len(self.query)

    def __repr__(self):
        # NOTE(review): self.last_query_params is never assigned in this
        # class -- presumably set elsewhere or intended to live on
        # self.query; confirm before relying on __repr__.
        if self.last_query_params["count_bucket"] is None:
            res = [u"-" * OUTPUT_PAGE_WIDTH]
            rate = self.query.get_rate()
            unit = "Tweets/Minute"
            if rate < 0.01:
                rate *= 60.
                unit = "Tweets/Hour"
            res.append(" PowerTrack Rule: \"%s\"" %
                       self.last_query_params["pt_filter"])
            res.append(" Oldest Tweet (UTC): %s" % str(self.query.oldest_t))
            res.append(" Newest Tweet (UTC): %s" % str(self.query.newest_t))
            res.append(
                " Now (UTC): %s" %
                str(datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")))
            res.append(" %5d Tweets: %6.3f %s" %
                       (self.query.res_cnt, rate, unit))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_users()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(BIG_COLUMN_WIDTH)
            # BUG FIX: self.res_cnt is never set on Results; the count
            # lives on the query object (used above for the Tweets line).
            res.append(fmt_str % ("users", "tweets", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %5.2f%%".format(
                BIG_COLUMN_WIDTH)
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_links()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(
                int(2.5 * BIG_COLUMN_WIDTH))
            res.append(fmt_str % ("links", "mentions", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %5.2f%%".format(
                int(2.5 * BIG_COLUMN_WIDTH))
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_grams()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(BIG_COLUMN_WIDTH)
            res.append(fmt_str % ("terms", "mentions", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %6.2f%%".format(
                BIG_COLUMN_WIDTH)
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
        else:
            res = [
                "{:%Y-%m-%dT%H:%M:%S},{}".format(x[2], x[1])
                for x in self.get_time_series()
            ]
        return u"\n".join(res)
def get_top_grams(self, n=20):
    """Return the n most frequent terms from the tweet-text column,
    excluding common URL fragments."""
    url_noise = ["http", "https", "amp", "htt"]
    self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
    self.freq.sl.add_session_stop_list(url_noise)
    for record in self.query.get_list_set():
        self.freq.add(record[TEXT_INDEX])
    return self.freq.get_tokens(n)
def get_top_grams(self, n=20):
    """Return the n most frequent terms from the tweet-text column."""
    self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
    for record in self.query.get_list_set():
        text = record[TEXT_INDEX]
        self.freq.add(text)
    return self.freq.get_tokens(n)
class Results():
    """Class for aggregating and accessing search result sets and subsets.

    Returns derived values for the query specified.
    """

    def __init__(self, user, password, stream_url, paged=False,
                 output_file_path=None, pt_filter=None, max_results=100,
                 start=None, end=None, count_bucket=None, show_query=False,
                 search_v2=False):
        """Create a result set by passing all of the required parameters
        for a query. The Results class runs an API query once when
        initialized. This allows one to make multiple calls to analytics
        methods on a single query.
        """
        # run the query
        self.query = Query(user, password, stream_url, paged,
                           output_file_path, search_v2)
        self.query.execute(pt_filter=pt_filter, max_results=max_results,
                           start=start, end=end, count_bucket=count_bucket,
                           show_query=show_query)
        self.freq = None  # set by the get_top_* analytics methods

    def get_activities(self):
        """Generator of query results."""
        for x in self.query.get_activity_set():
            yield x

    def get_time_series(self):
        """Generator of time series for query results. If count_bucket
        is set to a valid string, then the returned values are from the
        counts endpoint. In the case of the data endpoint, the generator
        returns the createdDate for the activities retrieved."""
        for x in self.query.time_series:
            yield x

    def get_top_links(self, n=20):
        """Returns the links most shared in the data set retrieved in
        the order of how many times each was shared."""
        import ast
        self.freq = SimpleNGrams(char_upper_cutoff=100, tokenizer="space")
        for x in self.query.get_list_set():
            link_str = x[LINKS_INDEX]
            if link_str != "GNIPEMPTYFIELD" and link_str != "None":
                # SECURITY FIX: the field was exec'd, which runs arbitrary
                # code from the data stream; literal_eval safely parses
                # the Python-list literal instead.
                link_list = ast.literal_eval(link_str)
                for l in link_list:
                    self.freq.add(l)
            else:
                self.freq.add("NoLinks")
        return self.freq.get_tokens(n)

    def get_top_users(self, n=50):
        """Returns the users tweeting the most in the data set retrieved.
        Users are returned in descending order of how many times they
        were tweeted."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        for x in self.query.get_list_set():
            self.freq.add(x[USER_NAME_INDEX])
        return self.freq.get_tokens(n)

    def get_top_grams(self, n=20):
        """Returns the n most frequent terms from the tweet-text column."""
        self.freq = SimpleNGrams(char_upper_cutoff=20, tokenizer="twitter")
        for x in self.query.get_list_set():
            self.freq.add(x[TEXT_INDEX])
        return self.freq.get_tokens(n)

    def get_geo(self):
        """Generator of {id, postedTime, latitude, longitude} dicts;
        coordinates are None when the record has no geo payload."""
        for rec in self.query.get_activity_set():
            lat, lng = None, None
            if "geo" in rec:
                if "coordinates" in rec["geo"]:
                    [lat, lng] = rec["geo"]["coordinates"]
            activity = {
                "id": rec["id"].split(":")[2],
                # BUG FIX: str.strip(".000Z") strips any of the characters
                # '.','0','Z' from both ends, which can eat trailing
                # seconds digits; remove the exact suffix instead.
                "postedTime": rec["postedTime"].replace(".000Z", ""),
                "latitude": lat,
                "longitude": lng
            }
            yield activity

    def get_frequency_items(self, size=20):
        """Retrieve the token list structure from the last query"""
        if self.freq is None:
            # BUG FIX: was misspelled "VallueError", which would raise
            # NameError instead of the intended exception.
            raise ValueError("No frequency available for use case")
        return self.freq.get_tokens(size)

    def __len__(self):
        return len(self.query)

    def __repr__(self):
        # NOTE(review): self.last_query_params is never assigned in this
        # class -- presumably set elsewhere or intended to live on
        # self.query; confirm before relying on __repr__.
        if self.last_query_params["count_bucket"] is None:
            res = [u"-" * OUTPUT_PAGE_WIDTH]
            rate = self.query.get_rate()
            unit = "Tweets/Minute"
            if rate < 0.01:
                rate *= 60.
                unit = "Tweets/Hour"
            res.append(" PowerTrack Rule: \"%s\"" %
                       self.last_query_params["pt_filter"])
            res.append(" Oldest Tweet (UTC): %s" % str(self.query.oldest_t))
            res.append(" Newest Tweet (UTC): %s" % str(self.query.newest_t))
            res.append(
                " Now (UTC): %s" %
                str(datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")))
            res.append(" %5d Tweets: %6.3f %s" %
                       (self.query.res_cnt, rate, unit))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_users()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(BIG_COLUMN_WIDTH)
            # BUG FIX: self.res_cnt is never set on Results; the count
            # lives on the query object (used above for the Tweets line).
            res.append(fmt_str % ("users", "tweets", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %5.2f%%".format(
                BIG_COLUMN_WIDTH)
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_links()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(
                int(2.5 * BIG_COLUMN_WIDTH))
            res.append(fmt_str % ("links", "mentions", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %5.2f%%".format(
                int(2.5 * BIG_COLUMN_WIDTH))
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            # self.query.get_top_grams()
            fmt_str = u"%{}s -- %10s %8s (%d)".format(BIG_COLUMN_WIDTH)
            res.append(fmt_str % ("terms", "mentions", "activities",
                                  self.query.res_cnt))
            res.append("-" * OUTPUT_PAGE_WIDTH)
            fmt_str = u"%{}s -- %4d %5.2f%% %4d %6.2f%%".format(
                BIG_COLUMN_WIDTH)
            for x in self.freq.get_tokens(20):
                res.append(fmt_str % (x[4], x[0], x[1] * 100., x[2], x[3] * 100.))
            res.append("-" * OUTPUT_PAGE_WIDTH)
        else:
            # BUG FIX: self.time_series does not exist on Results; route
            # through the accessor (which reads self.query.time_series).
            res = [
                "{:%Y-%m-%dT%H:%M:%S},{}".format(x[2], x[1])
                for x in self.get_time_series()
            ]
        return u"\n".join(res)
def __init__(self, token_list_size=20):
    """Parse command-line options and configure the search session.

    Sets up the argparse interface, the CSV record parser, the n-gram
    frequency counter, the record index for the chosen use case, and
    the normalized from/to date strings. Exits the process on invalid
    count-bucket or date options.
    """
    self.token_list_size = int(token_list_size)
    twitter_parser = argparse.ArgumentParser(
        description="GnipSearch supports the following use cases: %s" %
        str(self.USE_CASES))
    twitter_parser.add_argument("use_case", metavar="USE_CASE",
                                choices=self.USE_CASES,
                                help="Use case for this search.")
    twitter_parser.add_argument(
        "-f", "--filter", dest="filter",
        default="from:jrmontag OR from:gnip",
        help="PowerTrack filter rule (See: http://support.gnip.com/customer/portal/articles/901152-powertrack-operators)")
    twitter_parser.add_argument(
        "-l", "--stream-url", dest="stream_url",
        default="https://search.gnip.com/accounts/shendrickson/search/wayback.json",
        help="Url of search endpoint. (See your Gnip console.)")
    twitter_parser.add_argument(
        "-c", "--count", dest="csv_count", action="store_true",
        default=False,
        help="Return comma-separated 'date,counts' when using a counts.json endpoint.")
    twitter_parser.add_argument(
        "-b", "--bucket", dest="count_bucket", default="day",
        help="Bucket size for counts query. Options are day, hour, minute (default is 'day').")
    twitter_parser.add_argument(
        "-s", "--start-date", dest="start", default=None,
        help="Start of datetime window, format 'YYYY-mm-DDTHH:MM' (default: 30 days ago)")
    twitter_parser.add_argument(
        "-e", "--end-date", dest="end", default=None,
        help="End of datetime window, format 'YYYY-mm-DDTHH:MM' [Omit for most recent activities] (default: none)")
    twitter_parser.add_argument("-q", "--query", dest="query",
                                action="store_true", default=False,
                                help="View API query (no data)")
    twitter_parser.add_argument("-u", "--user-name", dest="user",
                                default="*****@*****.**",
                                help="User name")
    twitter_parser.add_argument("-p", "--password", dest="pwd",
                                help="Password")
    twitter_parser.add_argument(
        "-n", "--results-max", dest="max", default=100,
        help="Maximum results to return (default 100)")
    self.options = twitter_parser.parse_args()
    self.twitter_parser = TwacsCSV(",", False, False, True, False, True,
                                   False, False, False)
    # Column positions in the TwacsCSV record layout.
    DATE_INDEX = 1
    TEXT_INDEX = 2
    LINKS_INDEX = 3
    USER_NAME_INDEX = 7
    space_tokenizer = False
    char_upper_cutoff = 11  # links need more room and space tokenization
    if self.options.use_case.startswith("links"):
        char_upper_cutoff = 100
        space_tokenizer = True
    self.freq = SimpleNGrams(charUpperCutoff=char_upper_cutoff,
                             space_tokenizer=space_tokenizer)
    if self.options.use_case.startswith("user"):
        self.index = USER_NAME_INDEX
    elif self.options.use_case.startswith("wordc"):
        self.index = TEXT_INDEX
    elif self.options.use_case.startswith("rate"):
        self.index = DATE_INDEX
    elif self.options.use_case.startswith("link"):
        self.index = LINKS_INDEX
    elif self.options.use_case.startswith("time"):
        # Counts queries go to the counts endpoint; swap ".json" suffix.
        if not self.options.stream_url.endswith("counts.json"):
            self.options.stream_url = self.options.stream_url[:-5] + "/counts.json"
        if self.options.count_bucket not in ['day', 'minute', 'hour']:
            # Py2 "print >>" replaced with a portable stderr write; the
            # message string was split across lines in the original.
            sys.stderr.write("Error. Invalid count bucket: %s \n" %
                             str(self.options.count_bucket))
            sys.exit()
    # Accept any single-character separators in 'YYYY-mm-DDTHH:MM'.
    timeRE = re.compile(
        "([0-9]{4}).([0-9]{2}).([0-9]{2}).([0-9]{2}):([0-9]{2})")
    if self.options.start:
        dt = re.search(timeRE, self.options.start)
        if not dt:
            sys.stderr.write("Error. Invalid start-date format: %s \n" %
                             str(self.options.start))
            sys.exit()
        else:
            f = ''
            # timeRE is already compiled; re.compile(timeRE) was redundant.
            for i in range(timeRE.groups):
                f += dt.group(i + 1)
            self.fromDate = f
    if self.options.end:
        dt = re.search(timeRE, self.options.end)
        if not dt:
            sys.stderr.write("Error. Invalid end-date format: %s \n" %
                             str(self.options.end))
            sys.exit()
        else:
            e = ''
            for i in range(timeRE.groups):
                e += dt.group(i + 1)
            self.toDate = e