# Imports assumed by the snippets below; helper functions such as
# getBingLiusCounters, getClusters, get_sentiwordnet, sent140aff_bigrams,
# get_pos_tags_and_hashtags and the hand-made feature functions (exclamations,
# questions, allCaps, elongated, ...) are defined elsewhere in this repository.
import codecs
import sys
from collections import Counter, OrderedDict

import numpy as np
import scipy.sparse
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, normalize
from twitterTokenizer import Tokenizer


def embeddings(tweetText, path2voca):
    """Word-embedding features: mean, max and min of the 50-d word vectors per tweet."""
    with codecs.open(path2voca) as inFile:
        f = inFile.read().splitlines()
    # word -> 50-d vector, parsed from "word v1 v2 ... v50" lines
    dico = {
        i.split()[0]: np.array([float(x) for x in i.split()[1:]])
        for i in f
    }
    tokenizer = Tokenizer(preserve_case=False)
    feat = []
    for key, tweet in enumerate(tweetText):
        words = tokenizer.tokenize(tweet)
        my_vec, cnt, min_max = np.zeros(50), 0, []
        for i in words:
            # drop the "_neg" negation-context suffix before the lookup
            j = i[:-len("_neg")] if i.endswith("_neg") else i
            if j in dico:
                my_vec += dico[j]
                cnt += 1
                min_max.append(dico[j])
        if len(min_max) > 1:
            min_max = np.array(min_max)
            my_min = np.amin(min_max, axis=0)
            my_max = np.amax(min_max, axis=0)
        else:
            my_min, my_max = np.zeros(50), np.zeros(50)
        if cnt > 1:
            my_vec /= cnt  # mean of the summed word vectors
        feat.append(np.hstack((my_vec, my_max, my_min)))
    return np.array(feat)
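
# Usage sketch (illustrative only): `embeddings` expects `path2voca` to point to
# a plain-text vocabulary with one word per line followed by its 50 embedding
# values (a GloVe-style 50-d file is assumed here; the file name below is a
# hypothetical example, not a path from this repo). The result has one
# 150-dimensional row per tweet: mean, max and min of the matched word vectors.
#
#   emb_feat = embeddings(tweetText, '../embeddings/glove.twitter.50d.txt')
#   print(emb_feat.shape)   # (len(tweetText), 150)
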
def bing_lius(tweetText, pos, different_pos_tags, pos_text):
    """Counts of Bing Liu lexicon hits, overall and broken down by POS tag."""
    with codecs.open('../lexicons/positive-words_bing_liu.txt', 'r') as inFile:
        positive = set(inFile.read().splitlines())
    with codecs.open('../lexicons/negative-words_bing_liu.txt', 'r') as inFile:
        negative = set(inFile.read().splitlines())
    feat = []
    tokenizer = Tokenizer(preserve_case=True)
    for key, tweet in enumerate(tweetText):
        words = tokenizer.tokenize(tweet)
        counters, counters_cap = np.zeros(8), np.zeros(8)
        for j in words:
            if j.isupper():
                counters_cap += np.array(getBingLiusCounters(positive, negative, j.lower()))
            else:
                counters += np.array(getBingLiusCounters(positive, negative, j.lower()))
        # per-POS-tag counts: [positive, negated positive, negative, negated negative]
        pos_sen = OrderedDict({x: [0, 0, 0, 0] for x in different_pos_tags})
        for k_key, k in enumerate(pos_text[key]):
            if k in positive:
                pos_sen[pos[key][k_key]][0] += 1
            if k in negative:
                pos_sen[pos[key][k_key]][2] += 1
            if k.endswith("_NEG"):
                # drop the "_NEG" negation-context suffix before the lookup
                kk = k[:-len("_NEG")]
                if kk in positive:
                    pos_sen[pos[key][k_key]][1] += 1
                if kk in negative:
                    pos_sen[pos[key][k_key]][3] += 1
#        my_feat = list(counters)+list(counters_cap)+[g for gg in pos_sen.values() for g in gg]
        my_feat = list(counters + counters_cap) + [g for gg in pos_sen.values() for g in gg]
        feat.append(np.array(my_feat))
    return np.array(feat)
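
# `getBingLiusCounters` is defined elsewhere in this repository; from its use
# above it is assumed to return 8 per-word counters of lexicon hits, which are
# accumulated separately for all-caps and other tokens. The exact breakdown of
# the 8 counters is not shown in this excerpt.
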
def createDataMatrix(ngram_features, character_gram_features, tweetText, pos, pos_features, different_pos_tags, pos_text, voca_clusters, categories):
    """Stacks n-gram, char-gram, lexicon, cluster and hand-made features into one normalized sparse matrix and builds the label vector."""
    tokenizer_case_preserve = Tokenizer(preserve_case=True)
    tokenizer = Tokenizer(preserve_case=False)
    handmade_features, cll, cll2 = [], [], []
    for tweet in tweetText:
        feat = []
        feat.append(exclamations(tweet))
        feat.append(questions(tweet))
        feat.append(questions_and_exclamation(tweet))
        feat.append(emoticon_negative(tweet))
        feat.append(emoticon_positive(tweet))
        words = tokenizer_case_preserve.tokenize(tweet) #preserving casing
        feat.append(allCaps(words))
        feat.append(elongated(words))
        feat.append(questions_and_exclamation(words[-1]))
        handmade_features.append(np.array(feat))
        words = tokenizer.tokenize(tweet)
        # drop the negation-context suffix before the cluster lookup
        words = [word[:-len("_neg")] if word.lower().endswith("_neg") else word for word in words]
        cll.append(getClusters(voca_clusters, words))
        #cll2.append(getClusters(voca_handmade, words))


    bl = csr_matrix(bing_lius(tweetText, pos, different_pos_tags, pos_text))
    nrc_emo = csr_matrix(nrc_emotion(tweetText, pos, different_pos_tags, pos_text ))
    mpqa_feat = csr_matrix(mpqa(tweetText,pos, different_pos_tags, pos_text))
    handmade_features = np.array(handmade_features)
    mlb = MultiLabelBinarizer(sparse_output=True, classes = list(set(voca_clusters.values())))
    cluster_memberships_binarized = csr_matrix(mlb.fit_transform(cll))
    #mlb = MultiLabelBinarizer(sparse_output=True, classes = list(set(voca_handmade.values())))
    #cluster_memberships_binarized_2 = csr_matrix(mlb.fit_transform(cll2))
    
    hasht = csr_matrix(sent140aff(tweetText, pos, different_pos_tags, pos_text, '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-unigrams.txt'))
#    sent140aff_data = csr_matrix(sent140aff(tweetText, pos, different_pos_tags, pos_text, '../../lexicons/Sentiment140AffLexNegLex/S140-AFFLEX-NEGLEX-unigrams.txt'))
    hasht_bigrams=csr_matrix(sent140aff_bigrams(tweetText, pos, different_pos_tags, pos_text, '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-bigrams.txt'))
#    sent140affBigrams=csr_matrix(sent140aff_bigrams(tweetText, pos, different_pos_tags, pos_text, '../../lexicons/Sentiment140AffLexNegLex/S140-AFFLEX-NEGLEX-bigrams.txt'))
    sentQ = csr_matrix(get_sentiwordnet(pos_text, pos))
    pos_features = csr_matrix(pos_features)
    handmade_features = csr_matrix(handmade_features)
    # ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, cluster_memberships_binarized, handmade_features, pos_features, 
#                             sent140affBigrams, hasht_bigrams, hasht, sent140aff_data, bl, mpqa_feat, nrc_emo), dtype=float)
#    ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, cluster_memberships_binarized, handmade_features, pos_features, sent140affBigrams, hasht_bigrams, hasht, sent140aff_data, bl, mpqa_feat, nrc_emo), dtype=float)
    ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, sentQ, handmade_features, pos_features, cluster_memberships_binarized, bl, mpqa_feat, nrc_emo, hasht, hasht_bigrams ), dtype=float)

#     print ngram_features.shape, character_gram_features.shape, cluster_memberships_binarized.shape, handmade_features.shape, pos_features.shape, 
#     sent140affBigrams.shape, hasht_bigrams, hasht.shape, sent140aff_data.shape, bl.shape, mpqa_feat.shape, nrc_emo.shape
    y=[]
    for i in categories:
        if i=='positive':
            y.append(1)
        elif i == 'negative':
            y.append(-1)
        elif i == 'UNKNOWN':
            y.append(0)
        else:
            print(i)  # unexpected category label
    ffeatures = normalize(ffeatures)
#     ffeatures, y = shuffle(ffeatures,y)
    return ffeatures, y
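
# Usage sketch (illustrative; assumes the lexicon files above and a
# word-to-cluster dictionary `voca_clusters` loaded elsewhere in the repo):
#
#   X_train, y_train = createDataMatrix(ngram_features, character_gram_features,
#                                       tweetText, pos, pos_features,
#                                       different_pos_tags, pos_text,
#                                       voca_clusters, categories)
#
# X_train is an L2-normalized scipy sparse matrix and y_train holds 1/-1/0 for
# positive/negative/UNKNOWN tweets.
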
def sent140aff(tweetText, pos, different_pos_tags, pos_text, path2lexicon):
    """Twelve score features (counts, sums, maxima and last scores) from an NRC AffLex/NegLex-style lexicon."""
    with codecs.open(path2lexicon, 'r') as inFile:
        wds = inFile.read().splitlines()
    # separate score tables for plain terms, "_NEG" terms and "_NEGFIRST" terms
    pos_cont, nega_cont, nega_cont_first = {}, {}, {}
    for i in wds:
        i = i.split("\t")
        if i[0].endswith("_NEG"):
            name = "".join(i[0].split('_')[:-1])
            nega_cont[name] = float(i[1])
        elif i[0].endswith('_NEGFIRST'):
            name = "".join(i[0].split('_')[:-1])
            nega_cont_first[name] = float(i[1])
        else:
            pos_cont[i[0]] = float(i[1])
    feat = []
    tokenizer = Tokenizer(preserve_case=False)
    for key, tweet in enumerate(tweetText):
        cnt, scor = 0, []
        words = tokenizer.tokenize(tweet)
        for my_key, i in enumerate(words):
            if i in pos_cont:
                scor.append(pos_cont[i])
            if i.endswith('_neg'):
                # drop the "_neg" negation-context suffix before the lookup
                j = i[:-len('_neg')]
                flag = 0
                # "_NEGFIRST" entries apply to the first token of a negated span,
                # i.e. when the preceding token is not itself negated
                if my_key == 0 or not words[my_key - 1].endswith('_neg'):
                    if j in nega_cont_first:
                        scor.append(nega_cont_first[j])
                        flag = 1
                    elif j in nega_cont:
                        scor.append(nega_cont[j])
                        flag = 1
                if j in nega_cont and flag == 0:
                    scor.append(nega_cont[j])
        if len(scor) > 0:
            pos_scores, neg_scores = [x for x in scor
                                      if x > 0], [x for x in scor if x < 0]
            if len(pos_scores) == 0:
                pos_scores = [0]
            if len(neg_scores) == 0:
                neg_scores = [0]
            feat.append([
                len(scor),
                len(pos_scores),
                len(neg_scores),
                sum(scor),
                sum(pos_scores),
                sum(neg_scores),
                max(scor),
                max(pos_scores),
                max(neg_scores), scor[-1], pos_scores[-1], neg_scores[-1]
            ])
        else:
            feat.append(list(np.zeros(12)))
    return np.array(feat)
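
# Note on the lexicon format assumed by `sent140aff`: each line of the NRC
# AffLexNegLex files is tab-separated with the term first and its real-valued
# sentiment score second (any further columns are ignored here); negated terms
# carry a "_NEG" or "_NEGFIRST" suffix. Illustrative lines (scores made up):
#
#   good            1.123
#   good_NEG       -0.847
#   good_NEGFIRST  -1.002
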
def nrc_emotion(tweetText, pos, different_pos_tags, pos_text):
    """Positive/negative counts from the NRC Emotion lexicon, overall and per POS tag."""
    with codecs.open(
            '../lexicons/NRC-Emotion-Lexicon-v0.92/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt',
            'r') as inFile:
        wds = inFile.read().splitlines()
    positive, negative = [], []
    for i in wds:
        my_i = i.split('\t')
        if my_i[1] == 'positive' and my_i[2] == '1':
            positive.append(my_i[0])
        if my_i[1] == 'negative' and my_i[2] == '1':
            negative.append(my_i[0])
    feat = []
    positive, negative = set(positive), set(negative)
    #    for key, tweet in enumerate(pos_text):
    tokenizer = Tokenizer(preserve_case=True)
    for key, tweet in enumerate(tweetText):
        words = tokenizer.tokenize(tweet)
        counters, counters_caps = np.zeros(8), np.zeros(8)
        for i in words:
            if i.isupper():
                counters_caps += np.array(
                    getBingLiusCounters(positive, negative, i.lower()))
            else:
                counters += np.array(
                    getBingLiusCounters(positive, negative, i.lower()))
        pos_sen = OrderedDict({x: [0, 0, 0, 0] for x in different_pos_tags})
        for k_key, k in enumerate(pos_text[key]):
            if k in positive:
                pos_sen[pos[key][k_key]][0] += 1
            if k in negative:
                pos_sen[pos[key][k_key]][2] += 1
            if k.endswith("_NEG"):
                # drop the "_NEG" negation-context suffix before the lookup
                kk = k[:-len("_NEG")]
                if kk in positive:
                    pos_sen[pos[key][k_key]][1] += 1
                if kk in negative:
                    pos_sen[pos[key][k_key]][3] += 1


#        my_feat = list(counters)+list(counters_caps)+[g for gg in pos_sen.values() for g in gg]
        my_feat = list(counters + counters_caps) + [
            g for gg in pos_sen.values() for g in gg
        ]
        feat.append(np.array(my_feat))
    return np.array(feat)
# Method from the repository's feature-extraction class; the enclosing class
# definition is not part of this excerpt.
    def construct_features(self, tokenized_tweet, nrc_lexicons, bing_liu, mpqa,
                           clusters, negations):
        #print "Tweet : ",tokenized_tweet
        f = []
        #NRC Lexicon
        #tokenized_tweet=['hello','world','great','worst']

        #[min, max, avg of lexicon]
        #print nrc_lexicons.get_features(tokenized_tweet)
        f += nrc_lexicons.get_features(tokenized_tweet)

        #Bing_Liu Lexicon

        #[no_of_positive_words, no_of_negative_words]
        #print bing_liu.get_features(tokenized_tweet)
        #f += bing_liu.get_features(tokenized_tweet)

        #MPQA_SUb_Lexicon

        #print mpqa.get_features(tokenized_tweet)
        f += mpqa.get_features(tokenized_tweet)
        #print f

        #Find 1000 clusters
        #f += clusters.get_features(tokenized_tweet)

        #Negation words
        f += negations.get_features(tokenized_tweet)

        from twitterTokenizer import Tokenizer
        tokenizer = Tokenizer()

        #Char Grams
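        # (Note: with analyzer='char', scikit-learn's HashingVectorizer ignores
        # the tokenizer argument, so tokenizer.tokenize has no effect here.)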
        char_gram = HashingVectorizer(strip_accents='unicode',
                                      binary=True,
                                      ngram_range=(4, 5),
                                      stop_words=None,
                                      lowercase=True,
                                      analyzer='char',
                                      tokenizer=tokenizer.tokenize,
                                      n_features=22000)
        char_gram_features = char_gram.fit_transform(
            [' '.join(tokenized_tweet)])
        char_grams = char_gram_features.toarray()
        print(len(char_grams[0]))
        #print len(f)
        return f
g = codecs.open('../SemEval2016-task4-test.subtask-BD.txt', encoding='utf8').read().splitlines() #Test data to generate final predictions
g = [i.split("\t") for i in g if i.split("\t")[-1] != 'Not Available']
tweetTest, categories_test = [i[-1] for i in g], [i[2] for i in g]


l = [i[1] for i in g] #Topic of each tweet; used to group tweets by topic. Can be improved!
cnt = Counter(l)
# Assumes tweets of the same topic are contiguous: `yo` collects the start index
# of each topic block and `test_cats` the topic name of each block.
yo = [0]
test_cats = []
for i in range(len(set(l))):
    num = cnt[l[yo[i]]]
    test_cats.append(l[num+yo[i]-1])
    yo.append(num+yo[i])


tokenizer = Tokenizer()
ngram = HashingVectorizer(strip_accents='unicode', binary=True, ngram_range=(1,4), stop_words=None, lowercase=True,  tokenizer=tokenizer.tokenize, n_features=10000) #N-gram feature vectorizer
character_gram = HashingVectorizer(strip_accents='unicode', binary=True, ngram_range=(4,5), stop_words=None, lowercase=True, analyzer='char', tokenizer=tokenizer.tokenize, n_features=22000) #Char-gram feature vectorizer

n_power = float(sys.argv[1]) #parameter of the n_power transformation, I used 0.9 for submission

#Linguistic, POS, sentiment dictionaries etc.
pos1, pos_features1, different_pos_tags1, pos_text1 = get_pos_tags_and_hashtags(tweetText+tweetTest) #Get POS of everything
pos, pos_features, different_pos_tags, pos_text =  pos1[:len(categories)], pos_features1[:len(categories)], different_pos_tags1, pos_text1[:len(categories)] #Split train-test again
pos_test, pos_features_test, different_pos_tags_test, pos_text_test = pos1[len(categories):], pos_features1[len(categories):], different_pos_tags1, pos_text1[len(categories):] #Split train-test again

ngram_features = ngram.fit_transform(tweetText) #Get n-gram features
character_gram_features = character_gram.fit_transform(tweetText) #Get char-gram features
ngram_features.data **= n_power #a-power transformation
character_gram_features.data **= n_power #a-power transformation
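
# Minimal illustration of the power transformation above (assumption: this only
# shows what `.data **= n_power` does, it is not part of the pipeline): raising
# the stored non-zero entries of a sparse matrix to a power < 1 dampens large
# values while leaving the zeros untouched.
#
#   from scipy.sparse import csr_matrix
#   m = csr_matrix([[0.0, 4.0], [9.0, 0.0]])
#   m.data **= 0.9           # only the stored (non-zero) values change
#   print(m.toarray())       # [[0.    3.48...] [7.22... 0.   ]]
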
def mpqa(tweetText, pos, different_pos_tags, pos_text):
    """Subjectivity-weighted polarity features from the MPQA subjectivity lexicon."""
    with codecs.open(
            '../lexicons/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff',
            'r') as inFile:
        voca = inFile.read().splitlines()
    # wds maps word -> (subjectivity type, prior polarity); only the first
    # positive/negative entry seen for each word is kept.
    wds1, wds = {}, {}
    for i in voca:
        i = i.split()
        word, polarity = i[2].split('=')[1], i[5].split('=')[1]
        if word not in wds1 and polarity in ['positive', 'negative']:
            wds1[word] = polarity
            wds[word] = (i[0].split('=')[1], polarity)
    feat = []
    tokenizer = Tokenizer(preserve_case=False)
    for key, tweet in enumerate(tweetText):
        direction = {
            'negative': -1,
            'positive': 1,
            'neutral': 0,
            'both': 0,
            'weaksubj': 1,
            'strongsubj': 2
        }
        pp, pn, npp, nn, pp_hash, pn_hash, npp_hash, nn_hash = 0, 0, 0, 0, 0, 0, 0, 0
        words = tokenizer.tokenize(tweet)
        for i in words:
            if i in wds:
                if direction[wds[i][1]] > 0:
                    pp += direction[wds[i][0]] * direction[wds[i][1]]
                if direction[wds[i][1]] < 0:
                    pn += direction[wds[i][0]] * direction[wds[i][1]]
            if i.endswith("_neg"):
                # drop the "_neg" negation-context suffix before the lookup
                my_i = i[:-len("_neg")]
                if my_i in wds:
                    if direction[wds[my_i][1]] > 0:
                        npp += direction[wds[my_i][0]] * direction[wds[my_i]
                                                                   [1]]
                    if direction[wds[my_i][1]] < 0:
                        nn += direction[wds[my_i][0]] * direction[wds[my_i][1]]
            if i[0] == "#":
                if i[1:] in wds:
                    if direction[wds[i[1:]][1]] > 0:
                        pp_hash += direction[wds[i[1:]][0]] * direction[wds[
                            i[1:]][1]]
                    if direction[wds[i[1:]][1]] < 0:
                        pn_hash += direction[wds[i[1:]][0]] * direction[wds[
                            i[1:]][1]]
                if i.endswith("_neg"):
                    # drop the leading "#" and the "_neg" suffix
                    my_i = i[1:-len("_neg")]
                    if my_i in wds:
                        if direction[wds[my_i][1]] > 0:
                            npp_hash += direction[wds[my_i][0]] * direction[
                                wds[my_i][1]]
                        if direction[wds[my_i][1]] < 0:
                            nn_hash += direction[wds[my_i][0]] * direction[
                                wds[my_i][1]]
        pos_sen = OrderedDict({x: [0, 0, 0, 0] for x in different_pos_tags})
        for k_key, i in enumerate(pos_text[key]):
            if i in wds:
                if direction[wds[i][1]] > 0:
                    pos_sen[pos[key][k_key]][0] += 1
                if direction[wds[i][1]] < 0:
                    pos_sen[pos[key][k_key]][1] += 1
            if i.endswith("_NEG"):
                ii = i[:-len("_NEG")]  # drop the "_NEG" negation-context suffix
                if ii in wds:
                    if direction[wds[ii][1]] > 0:
                        pos_sen[pos[key][k_key]][2] += 1
                    if direction[wds[ii][1]] < 0:
                        pos_sen[pos[key][k_key]][3] += 1
        my_feat = [pp, pn, npp, nn, pp_hash, pn_hash, npp_hash, nn_hash
                   ] + [g for gg in pos_sen.values() for g in gg]
        feat.append(np.array(my_feat))
    return np.array(feat)
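
# Note on the MPQA lexicon format assumed by `mpqa`: each line of
# subjclueslen1-HLTEMNLP05.tff is a space-separated list of key=value fields;
# the parser above reads field 0 (type=weaksubj|strongsubj), field 2 (word1=...)
# and field 5 (priorpolarity=...). Illustrative line:
#
#   type=weaksubj len=1 word1=abandoned pos1=adj stemmed1=n priorpolarity=negative
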