def preprocess(key, tweet):
    id_tweet_map, tweet_id_map = get_tweet_map()
    id_tokenised_tweet_map = get_id_tokenised_tweet_map()

    tokenized_tweet = id_tokenised_tweet_map[key]

    # Get emoticons, hashtags, mentions and URLs for a given tweet.
    emoticons = getemoticons(tweet)
    hashtags = gethashtags(tokenized_tweet)
    mentions = getmentions(tokenized_tweet)
    urls = geturls(tweet)

    # Get character n-grams (n=1-3) for a given tweet.
    char_n_grams = getcharngrams(tweet)

    stop_words = get_stop_words()
    # Replace emoticons, hashtags, mentions and URLs in a tweet.
    processed_tokenized_tweet = processtweet(tokenized_tweet, stop_words)
    processed_tweet = " ".join(processed_tokenized_tweet[0:])

    # # Get the count of each punctuation mark, then remove them from the tweet.
    # punctuations_count = getpunctuations(processed_tweet)
    # processed_tweet = removepunctuations(processed_tweet)
    # processed_tokenized_tweet = tokenize_tweet(processed_tweet)

    # Remove stop words from the tweet.
    # processed_tokenized_tweet = removestopwords(processed_tokenized_tweet)
    # processed_tweet = " ".join(processed_tokenized_tweet)

    # Get word n-grams (n=1-3) for the tweet.
    word_n_grams = getwordngrams(processed_tokenized_tweet)

    return emoticons, hashtags, mentions, urls, char_n_grams, word_n_grams
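The helpers above (getemoticons, gethashtags, getcharngrams, and so on) come from the surrounding project and are not shown. As a rough illustration only, a character n-gram extractor matching the n=1-3 comment might look like the sketch below; the function body is an assumption, not the project's code.

def getcharngrams(tweet):
    # Hypothetical sketch: collect every character n-gram (n = 1..3)
    # from the raw tweet text.
    n_grams = []
    for n in range(1, 4):
        for i in range(len(tweet) - n + 1):
            n_grams.append(tweet[i:i + n])
    return n_grams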
import pickle


def findfeatureproperties():
    id_tweet_map, tweet_id_map = get_tweet_map()
    id_tokenised_tweet_map = get_id_tokenised_tweet_map()
    id_truth_map = get_id_truth_map()

    # Get all char n-grams (n=1-5) from the training set and create an index for each of them.
    char_n_grams = getallcharngrams(id_tweet_map)
    # Assumes getcharngramsindex returns the index it builds; it is pickled below.
    char_n_grams_index = getcharngramsindex(char_n_grams)

    processed_id_tweet_map, processed_id_tokenised_tweet_map = processtweetforwordngrams(
        id_tweet_map, id_tokenised_tweet_map)

    # Get all word n-grams (n=1-3) from the training set and create an index for each of them.
    word_n_grams = getallwordngrams(processed_id_tweet_map,
                                    processed_id_tokenised_tweet_map)
    # Assumes getwordngramsindex returns the index it builds; it is pickled below.
    word_n_grams_index = getwordngramsindex(word_n_grams)

    token_count, hashtag_count = gettargetwords(
        id_tweet_map, processed_id_tweet_map, id_tokenised_tweet_map,
        processed_id_tokenised_tweet_map, id_truth_map)

    truth_top_hashtags = gettophashtags(hashtag_count)
    truth_top_hi_tokens, truth_top_en_tokens, truth_top_rest_tokens = gettoptokens(
        token_count)

    # Pickle needs a binary-mode file handle.
    with open('data.txt', 'wb') as fp:
        pickle.dump(6, fp)  # number of objects that follow
        pickle.dump(char_n_grams_index, fp)
        pickle.dump(word_n_grams_index, fp)
        pickle.dump(truth_top_hashtags, fp)
        pickle.dump(truth_top_hi_tokens, fp)
        pickle.dump(truth_top_en_tokens, fp)
        pickle.dump(truth_top_rest_tokens, fp)
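findfeatureproperties writes seven pickled objects in sequence (a leading count, then six feature structures), so a consumer reads them back with pickle.load in the same order:

import pickle

with open('data.txt', 'rb') as fp:
    num_objects = pickle.load(fp)  # the leading count, 6
    char_n_grams_index = pickle.load(fp)
    word_n_grams_index = pickle.load(fp)
    truth_top_hashtags = pickle.load(fp)
    truth_top_hi_tokens = pickle.load(fp)
    truth_top_en_tokens = pickle.load(fp)
    truth_top_rest_tokens = pickle.load(fp)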
Example #3
def preprocess(key, tweet):
    id_tweet_map, tweet_id_map = get_tweet_map()
    id_tokenised_tweet_map = get_id_tokenised_tweet_map()

    tokenized_tweet = id_tokenised_tweet_map[key]

    # Get emoticons, hashtags, mentions and URLs for a given tweet.
    emoticons = getemoticons(tweet)
    hashtags = gethashtags(tokenized_tweet)
    mentions = getmentions(tokenized_tweet)
    urls = geturls(tweet)
    capitalcount = getCapitalcount(tweet, tokenized_tweet)
    idiom = getidiom(tweet)
    # Get character n-grams (n=1-3) for a given tweet.
    char_n_grams = getcharngrams(tweet)
    puncts = getpuncts(tweet)
    #length=len(tweet)
    stop_words = get_stop_words()
    # Replace emoticons, hashtags, mentions and URLs in a tweet.
    processed_tokenized_tweet = processtweet(tokenized_tweet, stop_words)
    processed_tweet = " ".join(processed_tokenized_tweet[0:])

    # Get word n-grams (n=1-5) for the tweet.
    word_n_grams = getwordngrams(processed_tokenized_tweet)
    #print("Done execution")
    return emoticons, hashtags, mentions, urls, char_n_grams, word_n_grams, idiom, capitalcount, puncts


# tweet = "En el día @shyamli de hoy #27óS sólo me @sahil sale del alma gritar ¡¡VIVA ESPAÑA! ! http://t.co/w9Bmsf4TUK :) (: #NLP"
Example #4
def train_and_test():
    findfeatureproperties()
    id_truth_map = get_id_truth_map()

    train_truth_feature_map = getfeaturevectorforalltweets()

    # Bidirectional map: label -> class id and class id -> label.
    truth_index = {'YES': 0, 'NO': 1, 0: 'YES', 1: 'NO'}

    # get_tweet_map() returns two maps elsewhere in these examples, so unpack both.
    id_tweet_map, tweet_id_map = get_tweet_map()

    tenfoldcrossvalidation(train_truth_feature_map, id_truth_map, truth_index, id_tweet_map)
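Because truth_index maps in both directions, the same dict converts labels to class ids for training and ids back to labels when reading predictions, for example:

truth_index = {'YES': 0, 'NO': 1, 0: 'YES', 1: 'NO'}

y = [truth_index[label] for label in ['YES', 'NO', 'YES']]  # [0, 1, 0]
labels = [truth_index[class_id] for class_id in y]          # ['YES', 'NO', 'YES']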
Example #5
import operator


def getfeaturevectorforalltweets():
    id_tweet_map, tweet_id_map = get_tweet_map()
    # print len(id_tweet_map)
    # Process tweets in a stable order, sorted by tweet id.
    id_tweet_map = dict(sorted(id_tweet_map.items(), key=operator.itemgetter(0)))

    train_truth_feature_map = {}

    count = 1
    for key, tweet in id_tweet_map.items():  # iteritems() is Python 2 only
        truth_feature_vector = getfeaturevector(key, tweet)

        train_truth_feature_map[key] = truth_feature_vector
        # print count
        count += 1

    return train_truth_feature_map
def train_and_test():
    findfeatureproperties()
    id_stance_map = get_id_stance_map()

    train_stance_feature_map = getfeaturevectorforalltweets()

    stance_index = {
        'FAVOR': 0,
        'AGAINST': 1,
        'NONE': 2,
        0: 'FAVOR',
        1: 'AGAINST',
        2: 'NONE'
    }

    id_tweet_map, tweet_id_map = get_tweet_map()

    tenfoldcrossvalidation(train_stance_feature_map, id_stance_map,
                           stance_index, id_tweet_map)
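tenfoldcrossvalidation itself is not shown in these examples. Below is a minimal sketch of the standard procedure, assuming feature_map holds numeric feature vectors keyed by tweet id and stance_map holds the labels; the LinearSVC classifier is an illustrative stand-in, not necessarily what the project uses.

import numpy as np
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC


def tenfoldcrossvalidation_sketch(feature_map, stance_map, stance_index):
    # Align feature vectors and labels by tweet id.
    ids = sorted(feature_map)
    X = np.array([feature_map[i] for i in ids])
    y = np.array([stance_index[stance_map[i]] for i in ids])

    # Ten folds; each fold trains on 9/10 of the data and tests on the rest.
    scores = []
    for train_idx, test_idx in KFold(n_splits=10, shuffle=True).split(X):
        clf = LinearSVC().fit(X[train_idx], y[train_idx])
        scores.append(clf.score(X[test_idx], y[test_idx]))
    return sum(scores) / len(scores)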
def preprocess(key, tweet):
    id_tweet_map, tweet_id_map = get_tweet_map()
    id_tokenised_tweet_map = get_id_tokenised_tweet_map()

    tokenized_tweet = id_tokenised_tweet_map[key]

    # Get emoticons, hashtags, mentions and URLs for a given tweet.
    emoticons = getemoticons(tweet)
    hashtags = gethashtags(tokenized_tweet)
    mentions = getmentions(tokenized_tweet)
    urls = geturls(tweet)

    # Get character n-grams (n=1-3) for a given tweet.
    char_n_grams = getcharngrams(tweet)

    stop_words = get_stop_words()
    # Replace emoticons, hashtags, mentions and URLs in a tweet.
    processed_tokenized_tweet = processtweet(tokenized_tweet, stop_words)
    processed_tweet = " ".join(processed_tokenized_tweet[0:])

    # Get word n-grams (n=1-5) for the tweet.
    word_n_grams = getwordngrams(processed_tokenized_tweet)

    return emoticons, hashtags, mentions, urls, char_n_grams, word_n_grams
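Driving preprocess over the whole corpus follows the same iteration pattern as getfeaturevectorforalltweets above:

id_tweet_map, tweet_id_map = get_tweet_map()
for key, tweet in id_tweet_map.items():
    emoticons, hashtags, mentions, urls, char_n_grams, word_n_grams = preprocess(key, tweet)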