import json
import os

# Tokenizer is assumed to be provided by the project's tokenizer module.


def TokenizeTweets(data_path):
    learner_tweet_map = {}
    tok = Tokenizer(preserve_case=False)
    learners = os.listdir(data_path)
    for learner in learners:
        tweet_path = os.path.join(data_path, learner, "tweet")
        if os.path.isfile(tweet_path):
            with open(tweet_path, "r") as tweet_file:
                lines = tweet_file.readlines()
            individual_word_count_map = {}
            num_english_tweet = 0
            for line in lines:
                try:
                    jsonObject = json.loads(line)
                    if jsonObject["lang"] == "en":
                        tweet = jsonObject["text"]
                        tokenized_tweet = tok.tokenize(tweet)
                        num_english_tweet += 1
                        for word in tokenized_tweet:
                            individual_word_count_map[word] = \
                                individual_word_count_map.get(word, 0) + 1
                except Exception:
                    print line  # malformed JSON line; log it and keep going
            learner_tweet_map[learner] = {}
            learner_tweet_map[learner]["tweet"] = individual_word_count_map
            learner_tweet_map[learner]["num_english_tweet"] = num_english_tweet
            #if num > 30:
            #    break
            if len(learner_tweet_map) % 100 == 0:
                print len(learner_tweet_map)  # progress report every 100 learners
    output_path = os.path.join(os.path.dirname(os.path.dirname(data_path)),
                               "all_tokenized_tweets")
    with open(output_path, "w") as output_file:
        output_file.write(json.dumps(learner_tweet_map))
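# Usage sketch (the path is hypothetical, not from the original script): data_path is
# expected to hold one sub-directory per learner, each containing a "tweet" file with
# one tweet JSON object per line; the output file is written two directory levels up.
TokenizeTweets("/data/learners/")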
from collections import Counter

# TextWorker and Tokenizer are assumed to be defined/imported elsewhere in the project.


class FeatureWorker(TextWorker):

    def __init__(self):
        super(FeatureWorker, self).__init__()
        self.tok = Tokenizer()

    def extractNgramPerTweet(self, tweet, n=1):
        """Extract n-grams from a tweet after standardizing it."""
        tweet = self.shrinkSpace(tweet)
        tweet = self.remove_handles(tweet)
        tweet = self.remove_urls(tweet)
        tokens = self.tok.tokenize(tweet)
        # build n-grams by zipping n successively shifted copies of the token list
        ngrams = Counter([" ".join(x) for x in zip(*[tokens[i:] for i in range(n)])])
        return ngrams

    def fullNGramExtract(self, tweet_list, n=1):
        """Relative frequencies of all 1-grams through n-grams over a list of tweets."""
        all_ngrams = Counter()
        for i in range(n):
            this_ngrams = Counter()
            for tweet in tweet_list:
                this_ngrams.update(self.extractNgramPerTweet(tweet, i + 1))
            total_ngrams = float(sum(this_ngrams.values()))
            if total_ngrams == 0:
                continue  # no tokens at this order; avoid division by zero
            all_ngrams.update({
                gram: value / total_ngrams
                for gram, value in this_ngrams.items()
            })
        return all_ngrams

    def extractLexicon(self, ngrams, lex, intercepts=None):
        """Score a user's n-gram frequencies against a {term: {category: weight}} lexicon."""
        pLex = {}  # prob of lex category given user
        for term, cats in lex.iteritems():
            try:
                gn = ngrams[term]
                for cat, weight in cats.iteritems():
                    try:
                        pLex[cat] += float(gn) * weight
                    except KeyError:
                        pLex[cat] = float(gn) * weight
            except KeyError:
                pass  # term not used by this user
        if intercepts:
            for cat in intercepts:
                pLex[cat] = pLex.get(cat, 0.0) + intercepts[cat]
        return pLex
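# Usage sketch (assumed, not part of the original class; relies on TextWorker and
# Tokenizer being available as above): build relative 1- and 2-gram frequencies for
# one user's tweets, then score them against a toy {term: {category: weight}} lexicon.
# The tweets and lexicon below are made up for illustration.
fw = FeatureWorker()
example_tweets = ["I love this sunny day", "so happy about the sunny weather"]
user_ngrams = fw.fullNGramExtract(example_tweets, n=2)
toy_lex = {"sunny": {"POS_EMO": 1.0}, "happy": {"POS_EMO": 2.0}}
print fw.extractLexicon(user_ngrams, toy_lex)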
def tokenize_tweet(tweet, tok=None):
    """Tokenize the tweet; tweets with 3 or fewer tokens are discarded (returned as "")."""
    if not tok:
        tok = Tokenizer()
    tweet = tweet.strip()
    tokens = tok.tokenize(tweet)
    if len(tokens) > 3:
        tweet = " ".join(tokens)
    else:
        tweet = ""  # too short; callers can drop the empty string
    return tweet
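# Quick illustration with made-up tweets: anything with 3 or fewer tokens comes back
# as the empty string, longer tweets come back space-joined.
print tokenize_tweet("just one example tweet here")
print repr(tokenize_tweet("too short"))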
user_gender = 0 if user_gender_re.findall(file)[0] == 'female' else 1
user_industry = user_industry_re.findall(file)[0]
if user_industry in industries:
    industries[user_industry] += 1
else:
    industries[user_industry] = 1
user_word_count[user_id] = {}
ages[user_id] = user_age
genders[user_id] = user_gender
user_industry_map[user_id] = user_industry
sample_size += 1
f = open(file, 'r')
posts = post_re.findall(f.read())
for p in posts:
    posts_count += 1
    tokenized = tok.tokenize(p)
    for token in tokenized:
        words_count += 1
        if token in user_word_count[user_id]:
            user_word_count[user_id][token] += 1
        else:
            user_word_count[user_id][token] = 1

# # i = 0;
# q = Queue(maxsize = 0)
# for i in range(0, len(files_list), 4):
#     process1 = Process(target=run_proc, args=(files_list[i], q))
#     process1.start()
#
#     if i+1 < len(files_list):
print "[LOADING CORPUS (%d files) AND CALCULATING TOPIC USAGE]" % len(dirFiles) tkzer = Tokenizer() postsRe = re.compile(r'<post>(.*?)</post>', re.DOTALL + re.I) #.*? = non-geedy match userData = dict() #dictionary of user_id => {age, gender, industry, topics} filesRead = 0 (numPosts, numWords, industries) = (0, 0, dict()) #for answering question one. for file in dirFiles: if fnmatch.fnmatch(file, '*.xml'): user_id, gender, age, industry, zodiac = file.split('.')[:5] industry = industry.lower() wordCounts = dict() totalWords = 0 currentFile = open(blogsDir+'/'+file).read() posts = postsRe.findall(currentFile) for post in posts: words = tkzer.tokenize(post) for word in words: try: wordCounts[word] += 1 except KeyError: wordCounts[word] = 1 totalWords += len(words) numPosts+=1 numWords+=totalWords ############################################# #2. CALCULATE USERS PROBABILITY OF MENTIONING A TOPIC fTotalWords = float(totalWords)#for floating point division pTopicGivenUser = [0] * num_topics #initial probabilities for word, count in wordCounts.iteritems(): pWordGivenUser = count / fTotalWords
import os
import re

import pandas as pd

# Tokenizer is assumed to be provided by the project's tokenizer module.


def parse_blogs(path):
    tokenizer = Tokenizer()
    users = {}
    global_words_dict = {}
    industry_map = {}
    total_users = 0
    total_blog_posts = 0
    iterations = 0
    topics = pd.read_csv('wwbpFBtopics_condProb.csv')
    regex = r'<post>(.*?)</post>'
    for filename in os.listdir(path):
        iterations += 1
        print "user %d" % iterations
        if iterations > 50:
            break
        if filename.startswith("."):
            continue
        parts = filename.split(".")
        user_attributes_map = {}
        word_count_map = {}
        topic_prob_map = {}
        user_total_words_count = 0
        user_id = int(parts[0])
        gender = parts[1]
        if gender == "male":
            gender = 0
        else:
            gender = 1
        age = int(parts[2])
        industry = parts[3]
        star_sign = parts[4]
        if user_id in users:
            user_attributes_map = users[user_id]
        if industry in industry_map:
            industry_map[industry] += 1
        else:
            industry_map[industry] = 1
        with open(path + filename, 'r') as user_blog:
            user_blogs = user_blog.read().replace('\n', '').replace('\r', '').replace('\t', '')
        all_blog_posts = re.findall(regex, user_blogs, re.DOTALL)
        total_blog_posts += len(all_blog_posts)
        for blog in all_blog_posts:
            words = tokenizer.tokenize(blog.strip())
            user_total_words_count += len(words)
            if 'wc_map' in user_attributes_map:
                word_count_map = user_attributes_map['wc_map']
            for word in words:
                if word in word_count_map:
                    word_count_map[word] += 1
                else:
                    word_count_map[word] = 1
                if word in global_words_dict:
                    global_words_dict[word] += 1
                else:
                    global_words_dict[word] = 1
        for topic in range(2000):
            prob_topic_given_user = 0.0
            topic_dict = topics[topics['category'] == topic]
            for row in topic_dict.itertuples():
                word = row[1]
                prob_topic_given_word = row[3]
                if word in word_count_map:
                    count_user_word = word_count_map[word]
                    # float() avoids Python 2 integer division, which would always yield 0
                    prob_word_given_user = float(count_user_word) / user_total_words_count
                    prob_topic_given_user += prob_topic_given_word * prob_word_given_user
            topic_prob_map[topic] = prob_topic_given_user
        user_attributes_map['wc_map'] = word_count_map
        user_attributes_map['age'] = age
        user_attributes_map['industry'] = industry
        user_attributes_map['star_sign'] = star_sign
        user_attributes_map['user_id'] = user_id
        user_attributes_map['topic_prob_map'] = topic_prob_map
        user_attributes_map['total_count'] = user_total_words_count
        user_attributes_map['gender'] = gender
        users[user_id] = user_attributes_map
    return (users, global_words_dict, industry_map, total_blog_posts)
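# The topic loop above computes, for each user:
#     p(topic | user) = sum over words w of p(topic | w) * p(w | user)
# with p(w | user) = count(w, user) / total_words(user) and p(topic | w) taken from
# wwbpFBtopics_condProb.csv. A tiny worked example with made-up numbers:
example_counts = {"music": 3, "guitar": 1}                    # hypothetical user word counts
example_total = float(sum(example_counts.values()))           # 4 words in total
example_p_topic_given_word = {"music": 0.20, "guitar": 0.50}  # hypothetical lexicon entries
# 0.20 * (3/4) + 0.50 * (1/4) = 0.275
example_p_topic_given_user = sum(
    example_p_topic_given_word[w] * (c / example_total)
    for w, c in example_counts.items())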
class Preprocessor:

    def __init__(self, delimiter=" "):
        # delimiter to split tweets into tokens
        self.DELIM = delimiter
        self.tokenizer = Tokenizer()

    def tokenize(self, tweet):
        return " ".join(self.tokenizer.tokenize(tweet))

    def replace_user_tags(self, tweet, remove=False):
        """
        Replace mentions of usernames with "@USER";
        if remove=True, remove the user mentions instead

        >>> p=Preprocessor()
        >>> p.replace_user_tags("@maya yes this is cool1@ did b@ @augyyz")
        '@USER yes this is cool1@ did b@ @USER'
        >>> p.replace_user_tags("@maya yes this is cool1@ did b@ @augyyz",remove=True)
        'yes this is cool1@ did b@'
        """
        if remove:
            return self.DELIM.join(
                [w for w in tweet.split(self.DELIM) if not w.startswith("@")])
        else:
            return self.DELIM.join([
                "@USER" if w.startswith("@") else w
                for w in tweet.split(self.DELIM)
            ])

    def replace_urls(self, tweet, remove=False):
        """
        Replace URLs with "@URL"; if remove=True, remove them instead

        >>> p=Preprocessor()
        >>> p.replace_urls("@maya yes this is cool1@ did b@ @augyyz http://www.bitly")
        '@maya yes this is cool1@ did b@ @augyyz @URL'
        >>> p.replace_urls("@maya yes this is cool1@ did b@ @augyyz http://www.bitly",remove=True)
        '@maya yes this is cool1@ did b@ @augyyz'
        """
        if remove:
            return self.DELIM.join([
                w for w in tweet.split(self.DELIM) if not w.startswith("http")
            ])
        else:
            return self.DELIM.join([
                "@URL" if w.startswith("http") else w
                for w in tweet.split(self.DELIM)
            ])

    def replace_hashtags(self, tweet, remove=False):
        """
        Replace hashtags with "@HASHTAG"; if remove=True, remove them instead
        (any number of # at token start)

        >>> p=Preprocessor()
        >>> p.replace_hashtags("yes #cool we are in #miami ###yes")
        'yes @HASHTAG we are in @HASHTAG @HASHTAG'
        >>> p.replace_hashtags("yes #cool we# are in #miami ###yes",remove=True)
        'yes we# are in'
        >>> p.replace_hashtags("yes #cool we# are in #miami ###yes bar . #wishiwere in italy .")
        'yes @HASHTAG we# are in @HASHTAG @HASHTAG bar . @HASHTAG in italy .'
        """
        if remove:
            return self.DELIM.join(
                [w for w in tweet.split(self.DELIM) if not w.startswith("#")])
        else:
            return self.DELIM.join([
                "@HASHTAG" if w.startswith("#") else w
                for w in tweet.split(self.DELIM)
            ])
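# Chained usage sketch (assumed, not part of the original class): tokenize first so
# mentions, URLs and hashtags become space-delimited tokens, then normalize each.
# With a Twitter-aware tokenizer this should yield something like
# '@USER loving @HASHTAG today @URL'. The tweet below is made up.
p = Preprocessor()
t = p.tokenize("@maya loving #miami today http://example.com")
t = p.replace_user_tags(t)
t = p.replace_urls(t)
t = p.replace_hashtags(t)
print t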