def TokenizeTweets(data_path):
    
    learner_tweet_map = {}
        
    tok = Tokenizer(preserve_case=False)
    
    learners = os.listdir(data_path)
    for learner in learners:
        
        tweet_path = data_path + learner + "/tweet"
        
        if os.path.isfile(tweet_path):
            
            tweet_file = open(tweet_path, "r")
            lines = tweet_file.readlines()          
                            
            individual_word_count_map = {}
            individual_word_count_set = set()
            
            num_english_tweet = 0
                
            for line in lines:
                try:
                    jsonObject = json.loads(line)
                    if jsonObject["lang"] == "en":
                        tweet = jsonObject["text"]
                        tokenized_tweet = tok.tokenize(tweet)
                        
                        num_english_tweet += 1
                                                       
                        for word in tokenized_tweet:
                                
                            if word not in individual_word_count_set:
                                individual_word_count_set.add(word)
                                individual_word_count_map[word] = 0
                                    
                            individual_word_count_map[word] += 1
                except Exception as e:
                    print line
            
            learner_tweet_map[learner] = {}
            learner_tweet_map[learner]["tweet"] = individual_word_count_map
            learner_tweet_map[learner]["num_english_tweet"] = num_english_tweet
            
            #if num > 30:
            #    break
            
            if len(learner_tweet_map) % 100 == 0:
                print len(learner_tweet_map)
            
    output_path = os.path.join(os.path.dirname(os.path.dirname(data_path)), "all_tokenized_tweets")
    output_file = open(output_path, "w")
    output_file.write(json.dumps(learner_tweet_map))
    output_file.close()
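
A minimal, hypothetical invocation; the directory layout and the JSON shape are inferred from the function above, and the trailing slash matters because the paths are built with string concatenation:

# Each learner directory is expected to contain a file named "tweet" holding
# one JSON object per line, e.g. {"lang": "en", "text": "hello twitter"}
TokenizeTweets("/path/to/learner_dirs/")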
Example #2
def tokenize_tweet(tweet, tok=None):
    """
    Tokenize the tweet and discard any with three or fewer tokens
    """
    if not tok:
        tok = Tokenizer()

    tweet = tweet.strip()
    tokens = tok.tokenize(tweet)
    if len(tokens) > 3:
        tweet = " ".join(tokens)
    else:
        tweet = ""
    return tweet
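
A quick illustrative check of the length filter (the exact tokens depend on the Tokenizer, so the outputs shown assume a simple whitespace-style split):

tok = Tokenizer(preserve_case=False)
tokenize_tweet("too short", tok)                      # -> "" (3 or fewer tokens, discarded)
tokenize_tweet("this tweet has enough tokens", tok)   # -> tokens re-joined with spaces, kept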
Example #3
def process_tweets(tweets, target):
    tok = Tokenizer()

    processed_tweets = []
    for tweet in tweets:
        tweet = clean_tweet(tweet, target)
        tweet = remove_self_refs(tweet, target)
        tweet = tokenize_tweet(tweet, tok)
        if tweet:
            processed_tweets.append(tweet)
    print "Remaining Tweets:", len(processed_tweets)
    return processed_tweets
Example #4
class FeatureWorker(TextWorker):
    def __init__(self):
        super(FeatureWorker, self).__init__()
        self.tok = Tokenizer()

    def extractNgramPerTweet(self, tweet, n=1):
        """
        Extract n-grams from tweet after standardizing
        """
        tweet = self.shrinkSpace(tweet)
        tweet = self.remove_handles(tweet)
        tweet = self.remove_urls(tweet)
        tokens = self.tok.tokenize(tweet)

        ngrams = Counter([" ".join(x) for x in zip(*[tokens[i:] for i in range(n)])])
        return ngrams

    def fullNGramExtract(self, tweet_list, n=1):
        """
        """
        all_ngrams = Counter()
        for i in range(n):
            this_ngrams = Counter()
            for tweet in tweet_list:
                this_ngrams.update(self.extractNgramPerTweet(tweet, i + 1))
            total_ngrams = float(sum(this_ngrams.values()))
            all_ngrams.update({
                gram: value / total_ngrams
                for gram, value in this_ngrams.items()
            })
        return all_ngrams

    def extractLexicon(self, ngrams, lex, intercepts=None):
        """
        """
        pLex = {}  # prob of lex given user
        for term, cats in lex.iteritems():
            try:
                gn = ngrams[term]
                for cat, weight in cats.iteritems():
                    try:
                        pLex[cat] += float(gn) * weight
                    except KeyError:
                        pLex[cat] = float(gn) * weight
            except KeyError:
                pass  #not in lex

        if intercepts:
            for cat in intercepts:
                pLex[cat] = pLex.get(cat, 0.0) + intercepts[cat]
        return pLex
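
The zip construction restored in extractNgramPerTweet above is the standard n-gram trick; a self-contained sketch with no tokenizer dependency, for reference:

from collections import Counter

def ngram_counts(tokens, n=2):
    # n staggered slices of the token list, zipped together, yield the n-grams
    return Counter(" ".join(gram) for gram in zip(*[tokens[i:] for i in range(n)]))

# ngram_counts(["a", "b", "c", "d"], 2) -> Counter({"a b": 1, "b c": 1, "c d": 1})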
Example #5
def extract_features(tweet, uid):
    if not config.tokenizer:
        config.tokenizer = Tokenizer()

    tokens = config.tokenizer.tokenize(tweet)

    feats = []

    # Character ngram features
    for n in range(2, 6):
        feats += get_char_ngram_feats(tweet, n)

    # Word ngram features
    for n in range(1, 4):
        feats += get_word_ngram_feats(tokens, n)

    feats += apply_user_factor_adaptation(feats, uid)

    return feats
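
get_char_ngram_feats and get_word_ngram_feats are not shown in this listing; a purely illustrative sketch of the character variant (the name, signature, and "charNG:" feature format are assumptions):

def get_char_ngram_feats(text, n):
    # hypothetical helper: one string feature per character n-gram in the tweet
    return ["charNG:" + text[i:i + n] for i in range(len(text) - n + 1)]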
Example #6
    def __init__(self):
        super(FeatureWorker, self).__init__()
        self.tok = Tokenizer()
Example #7
#     # print file
#     #print file, len(dict[user_id])
#
# def run_proc2(file, q):
#     soup = BeautifulSoup(open(file),'xml')
#     posts = soup.find_all('post')
#     for post in posts:
#         tokenized = tok.tokenize(post)
#         for token in tokenized:
#             if token in user_word_count[user_id]:
#                 dict[user_id][token] += 1
#             else:
#                 dict[user_id][token] = 1
#     #print file

tok = Tokenizer(preserve_case=False)
files_list = glob.glob(sys.argv[1] + '/*.xml')

user_word_count = {}
posts_count = 0
words_count = 0
industries = {}
ages = {}
genders = {}
user_industry_map = {}
sample_size = 0

# numeric user id: the digits following the last "/" in a blog file path whose
# name fields are <user_id>.<gender>.<age>.<industry>.<sign>.xml
user_id_re = re.compile(r'(?<=/)(\d{3,8})(?=.)')

# two-digit age field: the digits enclosed by dots in the same filename
user_age_re = re.compile(r'(?<=\.)(\d{2})(?=\.)')
Example #8
wordsForTopic = dict()
with open(topic_count_file, 'rb') as csvfile:
    topicreader = csv.reader(csvfile)
    topicreader.next()  #throw out header
    for row in topicreader:
        topic = int(row[0])
        words = row[1:][::2][:4]
        wordsForTopic[topic] = words
print "[DONE]"

################################
#1. READ AND TOKENIZE THE CORPUS
#READ CORPUS
dirFiles = os.listdir(blogsDir)
print "[LOADING CORPUS (%d files) AND CALCULATING TOPIC USAGE]" % len(dirFiles)
tkzer = Tokenizer()
postsRe = re.compile(r'<post>(.*?)</post>',
                     re.DOTALL | re.I)  #.*? = non-greedy match
userData = dict()  #dictionary of user_id => {age, gender, industry, topics}
filesRead = 0
(numPosts, numWords, industries) = (0, 0, dict())  #for answering question one.
for file in dirFiles:
    if fnmatch.fnmatch(file, '*.xml'):
        user_id, gender, age, industry, zodiac = file.split('.')[:5]
        industry = industry.lower()
        wordCounts = dict()
        totalWords = 0
        currentFile = open(blogsDir + '/' + file).read()
        posts = postsRe.findall(currentFile)
        for post in posts:
            words = tkzer.tokenize(post)
Example #9
with open(topic_count_file, 'rb') as csvfile:
    topicreader = csv.reader(csvfile)
    topicreader.next()#throw out header
    for row in topicreader:
        topic = int(row[0])
        words = row[1:][::2][:4]
        wordsForTopic[topic] = words
print "[DONE]"


################################
#1. READ AND TOKENIZE THE CORPUS
#READ CORPUS
dirFiles = os.listdir(blogsDir)
print "[LOADING CORPUS (%d files) AND CALCULATING TOPIC USAGE]" % len(dirFiles)
tkzer = Tokenizer()
postsRe = re.compile(r'<post>(.*?)</post>', re.DOTALL | re.I) #.*? = non-greedy match
userData = dict() #dictionary of user_id => {age, gender, industry, topics}
filesRead = 0
(numPosts, numWords, industries) = (0, 0, dict()) #for answering question one.
for file in dirFiles:
    if fnmatch.fnmatch(file, '*.xml'):
        user_id, gender, age, industry, zodiac = file.split('.')[:5]
        industry = industry.lower()
        wordCounts = dict()
        totalWords = 0
        currentFile = open(blogsDir+'/'+file).read()
        posts = postsRe.findall(currentFile)
        for post in posts:
            words = tkzer.tokenize(post)
            for word in words:
Example #10
#     #print file, len(dict[user_id])
#
# def run_proc2(file, q):
#     soup = BeautifulSoup(open(file),'xml')
#     posts = soup.find_all('post')
#     for post in posts:
#         tokenized = tok.tokenize(post)
#         for token in tokenized:
#             if token in user_word_count[user_id]:
#                 dict[user_id][token] += 1
#             else:
#                 dict[user_id][token] = 1
#     #print file


tok = Tokenizer(preserve_case=False)
files_list = glob.glob(sys.argv[1]+'/*.xml')


user_word_count = {}
posts_count = 0
words_count = 0
industries = {}
ages = {}
genders = {}
user_industry_map = {}
sample_size = 0

user_id_re = re.compile(r'(?<=/)(\d{3,8})(?=.)')

user_age_re = re.compile(r'(?<=\.)(\d{2})(?=\.)')
def parse_blogs(path):
    
    tokenizer = Tokenizer()
    users = {}
    global_words_dict = {}
    industry_map = {}
    total_users = 0
    total_blog_posts = 0
    iterations = 0
    topics = pd.read_csv('wwbpFBtopics_condProb.csv')
    
    regex = r'<post>(.*?)</post>'

    for filename in os.listdir(path):
        iterations += 1
        print "user %d" %iterations
        if iterations > 50:
            break
            
        if filename.startswith("."):
            continue
            
        parts = filename.split(".")

        user_attributes_map = {}
        word_count_map = {}
        topic_prob_map = {}

        user_total_words_count = 0
        
        user_id = int(parts[0])
        gender = parts[1]
        
        if gender == "male":
            gender = 0
        else:
            gender = 1
            
        age = int(parts[2])
        industry = parts[3]
        star_sign = parts[4]
        
        if user_id in users:
            user_attributes_map = users[user_id]
        
        if industry in industry_map:
            industry_map[industry] = industry_map[industry] + 1
        else:
            industry_map[industry] = 1
                
        with open(path + filename, 'r') as user_blog:
            user_blogs = user_blog.read().replace('\n', '').replace('\r', '').replace('\t', '')
    
        all_blog_posts = re.findall(regex, user_blogs, re.DOTALL)

        total_blog_posts = total_blog_posts + len(all_blog_posts)

        for blog in all_blog_posts:  
            words = tokenizer.tokenize(blog.strip())
            user_total_words_count = user_total_words_count + len(words)
            
            if 'wc_map' in user_attributes_map:
                word_count_map = user_attributes_map['wc_map']

            for word in words:
                if word in word_count_map:
                    count = word_count_map[word]
                    count = count + 1
                    word_count_map[word] = count
                else:
                    word_count_map[word] = 1
                    
                if word in global_words_dict:
                    count = global_words_dict[word]
                    count = count + 1
                    global_words_dict[word] = count
                else:
                    global_words_dict[word] = 1
                    

        for topic in range(2000):
            prob_topic_given_user = 0.0

            topic_dict = topics[topics['category'] == topic]

            for row in topic_dict.itertuples():
                word = row[1]
                prob_topic_given_word = row[3]
                if word in word_count_map:
                    count_user_word = word_count_map[word]
                    # float() avoids Python 2 integer-division truncation to 0
                    prob_word_given_user = float(count_user_word) / user_total_words_count

                    cur = prob_topic_given_word * prob_word_given_user

                    prob_topic_given_user = prob_topic_given_user + cur
            
            topic_prob_map[topic] = prob_topic_given_user
        
        user_attributes_map['wc_map'] = word_count_map
        user_attributes_map['age'] = age
        user_attributes_map['industry'] = industry
        user_attributes_map['star_sign'] = star_sign
        user_attributes_map['user_id'] = user_id
        user_attributes_map['topic_prob_map'] = topic_prob_map
        user_attributes_map['total_count'] = user_total_words_count
        user_attributes_map['gender'] = gender
        
        users[user_id] = user_attributes_map
    return (users, global_words_dict, industry_map, total_blog_posts)
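
The topic loop above computes p(topic | user) = sum over words of p(topic | word) * p(word | user); the same calculation isolated as a small helper, assuming topic_words maps each word to its conditional probability for a single topic:

def prob_topic_given_user(word_count_map, user_total_words_count, topic_words):
    # topic_words: {word: p(topic|word)} for one topic (assumed structure)
    if user_total_words_count == 0:
        return 0.0
    total = float(user_total_words_count)
    return sum(p_tw * (word_count_map[word] / total)
               for word, p_tw in topic_words.items()
               if word in word_count_map)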
Example #12
    def __init__(self, delimiter=" "):
        # delimiter to split tweets into tokens
        self.DELIM = delimiter
        self.tokenizer = Tokenizer()
Example #13
class Preprocessor:
    def __init__(self, delimiter=" "):
        # delimiter to split tweets into tokens
        self.DELIM = delimiter
        self.tokenizer = Tokenizer()

    def tokenize(self, tweet):
        return " ".join(self.tokenizer.tokenize(tweet))

    def replace_user_tags(self, tweet, remove=False):
        """
        Replace mentions to usernames with "@USER"
        if remove=True removes the user mentions
        
        >>> p=Preprocessor()
        >>> p.replace_user_tags("@maya yes this is cool1@ did b@ @augyyz")
        '@USER yes this is cool1@ did b@ @USER'
        >>> p.replace_user_tags("@maya yes this is cool1@ did b@ @augyyz",remove=True)
        'yes this is cool1@ did b@'
        """
        if remove:
            return self.DELIM.join(
                [w for w in tweet.split(self.DELIM) if not w.startswith("@")])
        else:
            return self.DELIM.join([
                "@USER" if w.startswith("@") else w
                for w in tweet.split(self.DELIM)
            ])

    def replace_urls(self, tweet, remove=False):
        """
        Replace urls with @URL
        if remove=True removes them
    
        >>> p=Preprocessor()
        >>> p.replace_urls("@maya yes this is cool1@ did b@ @augyyz http://www.bitly")
        '@maya yes this is cool1@ did b@ @augyyz @URL'
        >>> p.replace_urls("@maya yes this is cool1@ did b@ @augyyz http://www.bitly",remove=True)
        '@maya yes this is cool1@ did b@ @augyyz'
        
        """
        if remove:
            return self.DELIM.join([
                w for w in tweet.split(self.DELIM) if not w.startswith("http")
            ])
        else:
            return self.DELIM.join([
                "@URL" if w.startswith("http") else w
                for w in tweet.split(self.DELIM)
            ])

    def replace_hashtags(self, tweet, remove=False):
        """
        Replace hashtags with @HASHTAG
        if remove=True removes them (any number of # at token start)

        >>> p=Preprocessor()
        >>> p.replace_hashtags("yes #cool we are in #miami ###yes")
        'yes @HASHTAG we are in @HASHTAG @HASHTAG'
        >>> p.replace_hashtags("yes #cool we# are in #miami ###yes",remove=True)
        'yes we# are in'
        >>> p.replace_hashtags("yes #cool we# are in #miami ###yes bar . #wishiwere in italy .")
        'yes @HASHTAG we# are in @HASHTAG @HASHTAG bar . @HASHTAG in italy .'
        """
        if remove:
            return self.DELIM.join(
                [w for w in tweet.split(self.DELIM) if not w.startswith("#")])
        else:
            return self.DELIM.join([
                "@HASHTAG" if w.startswith("#") else w
                for w in tweet.split(self.DELIM)
            ])
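
A short usage sketch chaining the replace helpers (the outputs follow directly from the doctests above; the input is already space-delimited):

p = Preprocessor()
t = "@maya yes #cool http://bit.ly/x"
t = p.replace_user_tags(t)   # '@USER yes #cool http://bit.ly/x'
t = p.replace_urls(t)        # '@USER yes #cool @URL'
t = p.replace_hashtags(t)    # '@USER yes @HASHTAG @URL'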