def preprocess(key, tweet):
    id_tweet_map, tweet_id_map = get_tweet_map()
    id_tokenised_tweet_map = get_id_tokenised_tweet_map()
    tokenized_tweet = id_tokenised_tweet_map[key]

    # Get emoticons, hashtags, mentions and URLs for a given tweet.
    emoticons = getemoticons(tweet)
    hashtags = gethashtags(tokenized_tweet)
    mentions = getmentions(tokenized_tweet)
    urls = geturls(tweet)

    # Get character n-grams (n=1-3) for a given tweet.
    char_n_grams = getcharngrams(tweet)

    stop_words = get_stop_words()

    # Replace emoticons, hashtags, mentions and URLs in a tweet.
    processed_tokenized_tweet = processtweet(tokenized_tweet, stop_words)
    processed_tweet = " ".join(processed_tokenized_tweet)

    # # Get count of each punctuation and then remove them from the tweet.
    # punctuations_count = getpunctuations(processed_tweet)
    # processed_tweet = removepunctuations(processed_tweet)
    # processed_tokenized_tweet = tokenize_tweet(processed_tweet)

    # Remove stop words from the tweet.
    # processed_tokenized_tweet = removestopwords(processed_tokenized_tweet)
    # processed_tweet = " ".join(processed_tokenized_tweet)

    # Get word n-grams (n=1-3) for the tweet.
    word_n_grams = getwordngrams(processed_tokenized_tweet)

    return emoticons, hashtags, mentions, urls, char_n_grams, word_n_grams
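# The n-gram helpers called above are defined elsewhere in the repository. Below is a
# minimal sketch of what getcharngrams (n=1-3) and getwordngrams (n=1-3) might look
# like; the *_sketch names and default arguments are assumptions for illustration,
# not the actual implementations.
def getcharngrams_sketch(tweet, n_min=1, n_max=3):
    # Slide a window of length n over the raw tweet string for each n in [n_min, n_max].
    grams = []
    for n in range(n_min, n_max + 1):
        grams.extend(tweet[i:i + n] for i in range(len(tweet) - n + 1))
    return grams


def getwordngrams_sketch(tokens, n_min=1, n_max=3):
    # Slide a window of n tokens over the token list and join each window with spaces.
    grams = []
    for n in range(n_min, n_max + 1):
        grams.extend(" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1))
    return grams


# Example: getwordngrams_sketch(["this", "is", "fine"])
# -> ["this", "is", "fine", "this is", "is fine", "this is fine"]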
import pickle


def findfeatureproperties():
    id_tweet_map, tweet_id_map = get_tweet_map()
    id_tokenised_tweet_map = get_id_tokenised_tweet_map()
    id_truth_map = get_id_truth_map()

    # Get all char n-grams (n=1-5) from the training set and create an index for each of them.
    char_n_grams = getallcharngrams(id_tweet_map)
    getcharngramsindex(char_n_grams)

    processed_id_tweet_map, processed_id_tokenised_tweet_map = processtweetforwordngrams(
        id_tweet_map, id_tokenised_tweet_map)

    # Get all word n-grams (n=1-3) from the training set and create an index for each of them.
    word_n_grams = getallwordngrams(processed_id_tweet_map, processed_id_tokenised_tweet_map)
    getwordngramsindex(word_n_grams)

    token_count, hashtag_count = gettargetwords(
        id_tweet_map, processed_id_tweet_map, id_tokenised_tweet_map,
        processed_id_tokenised_tweet_map, id_truth_map)
    truth_top_hashtags = gettophashtags(hashtag_count)
    truth_top_hi_tokens, truth_top_en_tokens, truth_top_rest_tokens = gettoptokens(token_count)

    # Persist the learned feature properties; open in binary mode so pickle works under Python 3.
    # char_n_grams_index and word_n_grams_index are module-level maps filled in by
    # getcharngramsindex / getwordngramsindex above.
    fp = open('data.txt', 'wb')
    pickle.dump(6, fp)
    pickle.dump(char_n_grams_index, fp)
    pickle.dump(word_n_grams_index, fp)
    pickle.dump(truth_top_hashtags, fp)
    pickle.dump(truth_top_hi_tokens, fp)
    pickle.dump(truth_top_en_tokens, fp)
    pickle.dump(truth_top_rest_tokens, fp)
    fp.close()
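# findfeatureproperties writes the object count (6) followed by six pickled objects to
# 'data.txt'. A read-side counterpart might look like the sketch below; the function
# name and the local variable names are assumptions, only the dump order is taken from
# the code above.
import pickle


def load_feature_properties_sketch(path='data.txt'):
    with open(path, 'rb') as fp:
        count = pickle.load(fp)  # number of pickled objects that follow (6)
        objects = [pickle.load(fp) for _ in range(count)]
    # Unpack in the same order the objects were dumped.
    (char_n_grams_index, word_n_grams_index, truth_top_hashtags,
     truth_top_hi_tokens, truth_top_en_tokens, truth_top_rest_tokens) = objects
    return (char_n_grams_index, word_n_grams_index, truth_top_hashtags,
            truth_top_hi_tokens, truth_top_en_tokens, truth_top_rest_tokens)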
def preprocess(key, tweet):
    id_tweet_map, tweet_id_map = get_tweet_map()
    id_tokenised_tweet_map = get_id_tokenised_tweet_map()
    tokenized_tweet = id_tokenised_tweet_map[key]

    # Get emoticons, hashtags, mentions, URLs and capitalisation count for a given tweet.
    emoticons = getemoticons(tweet)
    hashtags = gethashtags(tokenized_tweet)
    mentions = getmentions(tokenized_tweet)
    urls = geturls(tweet)
    capitalcount = getCapitalcount(tweet, tokenized_tweet)

    # Get idiom, character n-grams (n=1-3) and punctuation features for a given tweet.
    idiom = getidiom(tweet)
    char_n_grams = getcharngrams(tweet)
    puncts = getpuncts(tweet)
    # length = len(tweet)

    stop_words = get_stop_words()

    # Replace emoticons, hashtags, mentions and URLs in a tweet.
    processed_tokenized_tweet = processtweet(tokenized_tweet, stop_words)
    processed_tweet = " ".join(processed_tokenized_tweet)

    # Get word n-grams (n=1-5) for the tweet.
    word_n_grams = getwordngrams(processed_tokenized_tweet)

    # print("Done execution")
    return (emoticons, hashtags, mentions, urls, char_n_grams, word_n_grams,
            idiom, capitalcount, puncts)


# Example input:
# tweet = "En el día @shyamli de hoy #27óS sólo me @sahil sale del alma gritar ¡¡VIVA ESPAÑA! ! http://t.co/w9Bmsf4TUK :) (: #NLP"
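# getCapitalcount and getpuncts capture surface-level irony cues (shouting in capitals,
# heavy punctuation). Plausible minimal versions are sketched below; the *_sketch names,
# signatures and exact counting rules are assumptions, not the repository's code.
def getCapitalcount_sketch(tweet, tokenized_tweet):
    # Count tokens written entirely in upper case (e.g. "VIVA", "ESPAÑA").
    return sum(1 for token in tokenized_tweet if token.isalpha() and token.isupper())


def getpuncts_sketch(tweet):
    # Count occurrences of the punctuation marks most associated with emphasis.
    return sum(tweet.count(ch) for ch in "!?¡¿")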
def train_and_test():
    findfeatureproperties()
    id_truth_map = get_id_truth_map()
    train_truth_feature_map = getfeaturevectorforalltweets()
    # Bidirectional map between truth labels and class ids.
    truth_index = {'YES': 0, 'NO': 1, 0: 'YES', 1: 'NO'}
    id_tweet_map, tweet_id_map = get_tweet_map()
    tenfoldcrossvalidation(train_truth_feature_map, id_truth_map, truth_index, id_tweet_map)
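# tenfoldcrossvalidation is implemented elsewhere. A minimal sketch of a 10-fold loop
# over the feature map is shown below using scikit-learn; the choice of LinearSVC, the
# macro-averaged F1 metric and the simplified signature are assumptions for illustration.
import numpy as np
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score


def tenfoldcrossvalidation_sketch(feature_map, label_map, label_index):
    ids = sorted(feature_map.keys())
    X = np.array([feature_map[i] for i in ids])
    y = np.array([label_index[label_map[i]] for i in ids])
    scores = []
    for train_idx, test_idx in KFold(n_splits=10, shuffle=True, random_state=0).split(X):
        clf = LinearSVC().fit(X[train_idx], y[train_idx])
        scores.append(f1_score(y[test_idx], clf.predict(X[test_idx]), average='macro'))
    return sum(scores) / len(scores)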
import operator


def getfeaturevectorforalltweets():
    id_tweet_map, tweet_id_map = get_tweet_map()
    # print(len(id_tweet_map))

    # Process the tweets in a deterministic order, sorted by tweet id.
    id_tweet_map = dict(sorted(id_tweet_map.items(), key=operator.itemgetter(0)))
    train_truth_feature_map = {}
    count = 1
    for key, tweet in id_tweet_map.items():
        truth_feature_vector = getfeaturevector(key, tweet)
        train_truth_feature_map[key] = truth_feature_vector
        # print(count)
        count += 1
    return train_truth_feature_map
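# getfeaturevector is defined elsewhere; given the outputs of preprocess and the n-gram
# indices built by findfeatureproperties, it presumably maps each tweet to a fixed-length
# numeric vector. The bag-of-n-grams layout and the trailing count features below are
# assumptions, sketched only to show how the pieces could fit together.
def getfeaturevector_sketch(key, tweet, char_n_grams_index, word_n_grams_index):
    emoticons, hashtags, mentions, urls, char_n_grams, word_n_grams = preprocess(key, tweet)
    vector = [0] * (len(char_n_grams_index) + len(word_n_grams_index) + 4)
    # Bag-of-character-n-grams counts.
    for gram in char_n_grams:
        if gram in char_n_grams_index:
            vector[char_n_grams_index[gram]] += 1
    # Bag-of-word-n-grams counts, offset past the character n-gram block.
    offset = len(char_n_grams_index)
    for gram in word_n_grams:
        if gram in word_n_grams_index:
            vector[offset + word_n_grams_index[gram]] += 1
    # Simple count features for the remaining preprocess outputs.
    vector[-4:] = [len(emoticons), len(hashtags), len(mentions), len(urls)]
    return vector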
def train_and_test():
    findfeatureproperties()
    id_stance_map = get_id_stance_map()
    train_stance_feature_map = getfeaturevectorforalltweets()
    # Bidirectional map between stance labels and class ids, used both to encode the
    # gold labels for training and to decode integer predictions back to label strings.
    stance_index = {
        'FAVOR': 0, 'AGAINST': 1, 'NONE': 2,
        0: 'FAVOR', 1: 'AGAINST', 2: 'NONE'
    }
    id_tweet_map, tweet_id_map = get_tweet_map()
    tenfoldcrossvalidation(train_stance_feature_map, id_stance_map, stance_index, id_tweet_map)
def preprocess(key, tweet):
    id_tweet_map, tweet_id_map = get_tweet_map()
    id_tokenised_tweet_map = get_id_tokenised_tweet_map()
    tokenized_tweet = id_tokenised_tweet_map[key]

    # Get emoticons, hashtags, mentions and URLs for a given tweet.
    emoticons = getemoticons(tweet)
    hashtags = gethashtags(tokenized_tweet)
    mentions = getmentions(tokenized_tweet)
    urls = geturls(tweet)

    # Get character n-grams (n=1-3) for a given tweet.
    char_n_grams = getcharngrams(tweet)

    stop_words = get_stop_words()

    # Replace emoticons, hashtags, mentions and URLs in a tweet.
    processed_tokenized_tweet = processtweet(tokenized_tweet, stop_words)
    processed_tweet = " ".join(processed_tokenized_tweet)

    # Get word n-grams (n=1-5) for the tweet.
    word_n_grams = getwordngrams(processed_tokenized_tweet)

    return emoticons, hashtags, mentions, urls, char_n_grams, word_n_grams