Example #1
# Standard-library imports used below; module-level constants (DEFAULT_PATH,
# TWEETS_DIR, ALL_USER_TOKEN_COUNTS_FILE, ...) and the debug(), TweetTokenizer
# and write_token_counts helpers are defined elsewhere in this module.
import codecs
import sys
from collections import defaultdict
from optparse import OptionParser


def main():
    parser = OptionParser()
    parser.add_option("-i", "--input", help="input json file to parse (.gz or not)")
    parser.add_option("-a", "--all", help="tokenize all json files from the base dir",action="store_true", default=False)
    parser.add_option("-d", "--basedir", help="base dir to use, defaults to $git_root/" + DEFAULT_PATH)
    parser.add_option("-M","--max", help="set maximum number of rows to parse - for testing", type="int")

    (options, args) = parser.parse_args()

    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit()

    tweets_dir = options.basedir if options.basedir else TWEETS_DIR
    tokenizer = TweetTokenizer(tweets_dir)

    if options.input:
        # count overall token occurrences for a single input file
        overall_counter = defaultdict(int)
        for (username, token) in tokenizer.get_tokens_from_file(options.input):
            overall_counter[token] += 1
            
    if options.all:
        overall_token_counts = defaultdict(int)
        token_user_counts = defaultdict(int)
        user_totals = defaultdict(int)
        debug('opening ' + ALL_USER_TOKEN_COUNTS_FILE)
        user_token_file = codecs.open(ALL_USER_TOKEN_COUNTS_FILE, encoding="utf-8", mode="w")
        last_username = None
        row = 0
        for (username, token) in tokenizer.get_tokens_from_all_files():
            if last_username and last_username != username:
                # tokens arrive grouped by user, so a change of username means
                # the previous user's counts are complete and can be written out
                write_token_counts(user_token_file, last_username, token_user_counts)
                token_user_counts = defaultdict(int)  # reset the per-user counter
            last_username = username
            token_user_counts[token] += 1
            overall_token_counts[token] += 1
            user_totals[username] += 1
            row += 1
            if options.max and row >= options.max:
                break
        # don't forget to write out the counts for the last user
        if last_username:
            write_token_counts(user_token_file, last_username, token_user_counts)
        debug("closing " + ALL_USER_TOKEN_COUNTS_FILE)
        user_token_file.close()

        write_token_totals(overall_token_counts)
        write_user_totals(user_totals)
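
The write_token_counts() helper called above isn't shown on this page. Given the three-column "'user','token',count" rows that load_from_csv (Example #4) reads back, a minimal sketch could look like this (the exact implementation is an assumption):

from operator import itemgetter

def write_token_counts(outfile, username, token_counts):
    # assumed helper: one "'user','token',count" row per token, matching the
    # three-column format that load_from_csv parses with quotechar="'"
    for token, count in sorted(token_counts.iteritems(), key=itemgetter(1), reverse=True):
        outfile.write("'%s','%s',%i\n" % (unicode(username), unicode(token), count))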
Example #2
def write_token_totals(token_totals):
    debug('writing ' + ALL_TOKEN_COUNTS_FILE)
    token_counts_file = codecs.open(ALL_TOKEN_COUNTS_FILE, encoding="utf-8", mode="w")
    # one "'token',count" row per token, in descending order of count
    for token, count in sorted(token_totals.iteritems(), key=itemgetter(1), reverse=True):
        token_counts_file.write("'%s',%i\n" % (unicode(token), count))
    token_counts_file.close()
Example #3
def write_user_totals(user_totals):
    debug('writing ' + ALL_USER_TOTALS_FILE)
    user_totals_file = codecs.open(ALL_USER_TOTALS_FILE, encoding="utf-8", mode="w")
    # one "'user',count" row per user, in descending order of total tokens
    for username, count in sorted(user_totals.iteritems(), key=itemgetter(1), reverse=True):
        user_totals_file.write("'%s',%i\n" % (unicode(username), count))
    user_totals_file.close()
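
Examples #4-#6 read these CSV files back with a UnicodeReader, which is not defined in the snippets. It is presumably a variant of the UTF-8 reader recipe from the Python 2 csv module documentation; a simplified sketch under that assumption:

import csv

class UnicodeReader:
    # assumed: wraps csv.reader over a UTF-8 byte stream and decodes each
    # cell back to unicode after parsing (a simplified form of the recipe
    # in the Python 2 csv docs); extra kwds such as quotechar="'" are
    # passed straight through to csv.reader
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        self.reader = csv.reader(f, dialect=dialect, **kwds)
        self.encoding = encoding

    def next(self):
        return [unicode(cell, self.encoding) for cell in self.reader.next()]

    def __iter__(self):
        return self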
Example #4
    def load_from_csv(
        self, file_prefix=None, data_dir=None, min_token_count=None, skip_common_tokens_cutoff=None, min_user_total=None
    ):

        # FIXME: need to support filtering on load as well
        self.init_data()
        self.set_data_dir(data_dir)
        self.set_prefix(file_prefix)

        debug("loading user totals from %s.." % (self.user_totals_file))
        reader = UnicodeReader(open(self.user_totals_file), quotechar="'")
        users = []
        user_ids = {}
        user_totals = {}
        idx = 0
        for [user, countstr] in reader:
            count = int(countstr)
            if min_user_total and count < min_user_total:
                # debug("skipping %s"%(user))
                continue
            user_totals[user] = count
            if user not in user_ids:
                users.append(user)
                user_ids[user] = len(users) - 1
            idx += 1
            if idx % 10000 == 0:
                debug(".. loaded %i users" % idx)

        debug(".. %i users" % idx)
        debug("..done")
        self.users = users
        self.user_ids = user_ids
        self.user_totals = user_totals
        self.num_users = len(users)

        # to work out the maximum count implied by skip_common_tokens_cutoff
        # (i.e. what fraction of the most common tokens to skip), the token
        # totals file has to be read twice
        max_token_count = None
        if skip_common_tokens_cutoff:
            debug("pre-loading token totals from %s.." % (self.token_totals_file))
            reader = UnicodeReader(open(self.token_totals_file), quotechar="'")
            token_totals = {}
            idx = 0
            for row in reader:
                if len(row) == 2:
                    [token, countstr] = row
                    count = int(countstr)
                    token_totals[token] = count
                    idx += 1
                    if idx % 10000 == 0:
                        debug(".. loaded %i tokens" % idx)
                else:
                    debug("skipping row %s: incorrect format" % (row,))

            # turn the fractional cutoff into an absolute count: sort tokens by
            # count (descending) and take the count at the cutoff position as
            # the maximum allowed count
            cut_off = int(len(token_totals) * skip_common_tokens_cutoff)
            max_token_count = token_totals[sorted(token_totals, key=token_totals.get, reverse=True)[cut_off]]

            debug("max token count %s" % (max_token_count))

        debug("loading token totals from %s.." % (self.token_totals_file))
        reader = UnicodeReader(open(self.token_totals_file), quotechar="'")
        tokens = []
        token_ids = {}
        token_totals = {}
        idx = 0
        for row in reader:
            if len(row) == 2:
                [token, countstr] = row
                count = int(countstr)
                if min_token_count and count < min_token_count:
                    # debug("skipping '%s': count below min - %i"%(token, min_token_count))
                    continue

                if max_token_count and count > max_token_count:
                    debug("skipping '%s': count (%i) above max count (%i)" % (token, count, max_token_count))
                    continue

                token_totals[token] = count
                if token not in token_ids:
                    tokens.append(token)
                    token_ids[token] = len(tokens) - 1
                idx += 1
                if idx % 10000 == 0:
                    debug(".. loaded %i tokens" % idx)

            else:
                debug("skipping row %s: incorrect format" % (row,))
        debug(".. loaded %i tokens" % idx)
        debug("..done")
        self.tokens = tokens
        self.token_ids = token_ids
        self.token_totals = token_totals
        self.num_tokens = len(tokens)

        # load user_token_counts.csv
        debug("loading user token counts from %s.." % (self.user_token_counts_file))
        reader = UnicodeReader(open(self.user_token_counts_file), quotechar="'")
        idx = 0
        distinct_users_per_token = defaultdict(int)
        user_token_counts = []
        for row in reader:
            if len(row) == 3:
                [user, token, countstr] = row
                count = int(countstr)
                # only keep rows whose user and token survived the filters above
                if token in token_ids and user in user_ids:
                    # fraction of this user's total tokens taken up by this token
                    token_user_prob = float(count) / self.user_totals[user]
                    user_token_counts.append((user_ids[user], token_ids[token], count, token_user_prob))
                    distinct_users_per_token[token] += 1
                    idx += 1
                    if idx % 100000 == 0:
                        debug(".. loaded %i user token counts" % idx)
            # else: just skip malformed rows, for now

        self.user_token_counts = np.array(user_token_counts, dtype=np.float32)
        self.distinct_users_per_token = distinct_users_per_token
        debug(".. loaded %i user token counts" % idx)
        debug("..done")
Example #5
    def write_to_training_csv(self):
        """Write to CSV using only the numeric user_id / token_id form."""
        self.set_prefix("train_")

        debug("writing %s .." % (self.user_token_counts_file))
        idx = 0
        outfile = codecs.open(self.user_token_counts_file, encoding="utf-8", mode="w")
        outfile.write("user_id,token_id,count\n")
        # rows come from a float32 numpy array, so cast fields back to int
        for [user_id, token_id, count, prob] in self.user_token_counts:
            outfile.write("%i,%i,%i\n" % (int(user_id), int(token_id), int(count)))
            idx += 1
            if idx % 100000 == 0:
                debug("..  %i token counts" % idx)

        debug(".. wrote %i token counts" % idx)
        outfile.close()
        debug("done")

        debug("writing %s .." % (self.users_file))
        users_file = codecs.open(self.users_file, encoding="utf-8", mode="w")
        users_file.write("user_id,user\n")
        for user, user_id in sorted(self.user_ids.iteritems(), key=itemgetter(1)):
            users_file.write("%i,'%s'\n" % (int(user_id), unicode(user)))
        users_file.close()
        debug("done")

        debug("writing %s .." % (self.tokens_file))
        tokens_file = codecs.open(self.tokens_file, encoding="utf-8", mode="w")
        tokens_file.write("token_id,token\n")
        for token, token_id in sorted(self.token_ids.iteritems(), key=itemgetter(1)):
            tokens_file.write("%i,'%s'\n" % (int(token_id), unicode(token)))
        tokens_file.close()
        debug("done")
Example #6
    def write_to_csv(self, file_prefix=None, data_dir=None, user_filter_fun=None, token_filter_fun=None):
        self.set_data_dir(data_dir)
        self.set_prefix(file_prefix)
        user_filter_fun = user_filter_fun if user_filter_fun else self.pass_anything_fun
        token_filter_fun = token_filter_fun if token_filter_fun else self.pass_anything_fun

        debug("writing  %s .." % (self.user_token_counts_file))
        included_user_totals = defaultdict(int)
        included_token_totals = defaultdict(int)
        idx = 0
        outfile = codecs.open(self.user_token_counts_file, encoding="utf-8", mode="w")
        # the numeric ids index back into the users/tokens lists built at load time
        for [user_id, token_id, count, prob] in self.user_token_counts:
            user = self.users[int(user_id)]
            token = self.tokens[int(token_id)]
            if user_filter_fun(user) and token_filter_fun(token):
                outfile.write("'%s','%s',%i\n" % (unicode(user), unicode(token), int(count)))
                included_user_totals[user] += count
                included_token_totals[token] += count
                idx += 1
                if idx % 100000 == 0:
                    debug(".. %i rows" % idx)

        outfile.close()
        debug("done")

        debug("writing %s .." % (self.user_totals_file))
        user_totals_file = codecs.open(self.user_totals_file, encoding="utf-8", mode="w")
        for username, count in sorted(included_user_totals.iteritems(), key=itemgetter(1), reverse=True):
            user_totals_file.write("'%s',%i\n" % (unicode(username), count))
        user_totals_file.close()
        debug("done")

        debug("writing %s .." % (self.token_totals_file))
        token_totals_file = codecs.open(self.token_totals_file, encoding="utf-8", mode="w")
        for token, count in sorted(included_token_totals.iteritems(), key=itemgetter(1), reverse=True):
            token_totals_file.write("'%s',%i\n" % (unicode(token), count))
        token_totals_file.close()
        debug("done")