def find_author_id(author_name):
    """Return the index of the first tweet whose author matches author_name.

    Returns None when no tweet matches (same fall-through behaviour as the
    original; its ``else: next`` branch was a no-op).

    Improvement: the original re-queried AllT.collect_tweets() on every
    loop iteration (in ``range(len(...))`` and again per comparison);
    fetch the list once instead.
    """
    tweets = AllT.collect_tweets()
    # each record is indexed [0] for the author name -- assumes
    # collect_tweets() yields (author, ...) tuples; confirm in AllT
    for index, tweet in enumerate(tweets):
        if author_name == tweet[0]:
            return index
    return None
def get_unigrams_bigrams_count(self):
    """Return (unigrams, bigrams): dense count vectors of self.tweet
    against the corpus-wide unigram and bigram vocabularies.

    Improvement: the original called BOW.Get_unigrams_bigrams(
    AllT.collect_text()) twice -- each call vectorizes the entire
    corpus -- just to pick elements [0] and [1]; compute it once.
    """
    all_unigrams, all_bigrams = BOW.Get_unigrams_bigrams(AllT.collect_text())
    # count self.tweet's tokens against each fixed vocabulary
    vect1 = CountVectorizer(vocabulary=all_unigrams)
    unigrams = vect1.fit_transform(self.tweet).toarray()
    vect2 = CountVectorizer(vocabulary=all_bigrams)
    bigrams = vect2.fit_transform(self.tweet).toarray()
    return (unigrams, bigrams)
def __init__(self, author_name):
    """Initialize with the topic-context tweet corpus and the numeric
    id of author_name (index into AllT.collect_tweets()).

    Improvement: the original called BOW.Get_BOW(self.tweets) twice --
    once for the vocabulary, once for the data matrix; a single call
    with tuple unpacking does the same work once.
    """
    self.tweets = AllT.collect_topic_context()
    self.id = find_author_id(author_name)
    # build the data matrix and vocabulary by
    # tokenizing and cleaning stop-words
    self.vocab, self.data = BOW.Get_BOW(self.tweets)
def original_message_unigram_fature(tweet_id):
    """Return the unigram features of the message this tweet replied to,
    or None when the tweet is not a reply.

    NOTE(review): "fature" is a typo for "feature"; the name is kept
    because external callers may reference it.

    Fixes: ``== None`` replaced with the idiomatic ``is None``; the
    bare ``return`` made an explicit ``return None`` (same behaviour).
    """
    author_tweet = dbtweets.find_one({'tweet_id': tweet_id})
    original_text = author_tweet['in_reply_to_status_id_text']
    if original_text is None:
        # this tweet was not replying to other tweet
        return None
    return BOW.Get_unigrams(AllT.collect_original_messages(), original_text)
def original_message_unigram_fature(tweet_id):
    """Return the unigram features of the message this tweet replied to,
    or None when the tweet is not a reply.

    NOTE(review): duplicate definition -- an identical function appears
    earlier in this file; at import time this later one wins. Consider
    removing one copy. The misspelt name ("fature") is preserved for
    callers.

    Fixes: ``== None`` -> ``is None``; explicit ``return None``.
    """
    author_tweet = dbtweets.find_one({'tweet_id': tweet_id})
    original_text = author_tweet['in_reply_to_status_id_text']
    if original_text is None:
        # this tweet was not replying to other tweet
        return None
    return BOW.Get_unigrams(
        AllT.collect_original_messages(), original_text)
def update_profile_unigram(): all_profiles = AT.collect_profiles() # Import countvectorzier to generate unigrams unicount_vect = CountVectorizer(ngram_range=(1,1), lowercase = False, stop_words='english', token_pattern=r'\b\w+\b', min_df=1) unicount = unicount_vect.fit_transform(all_profiles).toarray() unigrams = unicount_vect.get_feature_names() print unicount_vect x = nltk.cluster.api.ClusterI() y = x.cluster(unicount, assign_clusters=False) # Load profile_unigram into MongoDB """ for n in range(test.find().count()): tweetAuthor = test.find()[n]["author"] profile_unigram = scipy.sparse.coo_matrix(unicount_vect.transform([test.find()[n]["profile"]]).toarray()) print type(profile_unigram) print "-"*20 print profile_unigram print "-"*20 pickle_profile_unigram = Binary(pickle.dumps(profile_unigram, protocol=2), subtype=128 ) result = test.update_one({"author": tweetAuthor}, {"$set": {"profile_unigram": pickle_profile_unigram}}) """ """
    # Tail of the enclosing intensifier-detection function (its `def`
    # line lies above this chunk): scan each intensifier word from
    # inten_file and flag the text on the first whole-word match.
    for line in inten_file.readlines():
        # \b anchors restrict to whole-word matches; re.escape guards
        # against regex metacharacters in the word-list entry.
        # NOTE(review): readlines() keeps the trailing '\n' on each
        # line, which would end up inside the pattern -- confirm the
        # word list is newline-free or that a strip happens upstream.
        regex = r"\b" + re.escape(line) + r"\b"
        if re.findall(regex, text, re.IGNORECASE):
            intensifier = 1
            break
        else:
            continue
    return intensifier

#--------
# Module-level setup: fixed-vocabulary vectorizers over the full corpus.
# unigrams
# all_unigrams is the bag of word in unigrams
all_unigrams = BOW.Get_unigrams_bigrams(AllT.collect_text())[0]
vect1 = CountVectorizer(vocabulary=all_unigrams)
# bigrams
# all_bigrams is the bag of word in bigrams
all_bigrams = BOW.Get_unigrams_bigrams(AllT.collect_text())[1]
vect2 = CountVectorizer(vocabulary=all_bigrams)

# Batch pass over every tweet stored in MongoDB, computing per-tweet
# features. NOTE(review): each indexed dbtweets.find()[i] access opens
# a fresh cursor -- O(n) round-trips per iteration; presumably
# acceptable for an offline job, but worth confirming.
start_time = datetime.datetime.now()
for i in range(dbtweets.find().count()):
    cur_time = datetime.datetime.now()
    delta = cur_time - start_time
    #print 'this is the ', i+1, 'tweet', 'total time is: ', delta
    tweet_id = dbtweets.find()[i]['tweet_id']
    tweet_text = dbtweets.find()[i]['tweet_text']
    number_no_vowels = Pron.count_number_no_vowels(tweet_text)
# import countvectorzier to do process and # a built-in stop word list for English is used count_vect = CountVectorizer(stop_words='english') train_counts = count_vect.fit_transform(tweets) vocab = count_vect.vocabulary_.keys() train_counts = (train_counts).toarray() return (vocab, train_counts) def Get_unigrams_bigrams(tweets): # import countvectorzier to generate unigrams and bigrams unicount_vect = CountVectorizer(ngram_range=(1,1), lowercase = False, stop_words='english', token_pattern=r'\b\w+\b', min_df=1) unicount = unicount_vect.fit_transform(tweets).toarray() unigrams = unicount_vect.get_feature_names() bicount_vect = CountVectorizer(ngram_range=(2,2), lowercase = False, stop_words='english', token_pattern=r'\b\w+\b', min_df=1) bicount = bicount_vect.fit_transform(tweets).toarray() bigrams = bicount_vect.get_feature_names() return (unigrams, bigrams) all_tweets_grams = Get_unigrams_bigrams(AllT.collect_text()) if __name__ == '__main__': print "Running as a file, not as imported" print all_tweets_grams[0][0:15] print all_tweets_grams[1][0:15]
        # Interior of the intensifier scan loop (the `for line in ...`
        # header lies above this chunk): build a whole-word pattern for
        # the current word-list entry and flag the text on a match.
        # NOTE(review): `line` presumably still carries its trailing
        # '\n' from readlines() -- confirm it is stripped upstream.
        regex = r"\b" + re.escape(line) + r"\b"
        if re.findall(regex, text, re.IGNORECASE):
            intensifier = 1
            break
        else:
            continue
    return intensifier

#--------
# Module-level setup: fixed-vocabulary vectorizers over the full corpus.
# NOTE(review): this chunk duplicates an earlier section of the file
# almost verbatim -- likely a paste artifact; worth deduplicating.
# unigrams
# all_unigrams is the bag of word in unigrams
all_unigrams = BOW.Get_unigrams_bigrams(AllT.collect_text())[0]
vect1 = CountVectorizer(vocabulary=all_unigrams)
# bigrams
# all_bigrams is the bag of word in bigrams
all_bigrams = BOW.Get_unigrams_bigrams(AllT.collect_text())[1]
vect2 = CountVectorizer(vocabulary=all_bigrams)

# Batch pass over every tweet in MongoDB; each dbtweets.find()[i]
# access opens a fresh cursor (expensive but functional).
start_time = datetime.datetime.now()
for i in range(dbtweets.find().count()):
    cur_time = datetime.datetime.now()
    delta = cur_time - start_time
    #print 'this is the ', i+1, 'tweet', 'total time is: ', delta
    tweet_id = dbtweets.find()[i]['tweet_id']
    tweet_text = dbtweets.find()[i]['tweet_text']