def generateCentroidSimilarityScore(sent):
    # Average the word vectors of the tweet and compare the centroid against
    # each precomputed negative-class feature vector.
    terms = twokenize.tokenizeRawTweetText(sent)
    averagevec = np.zeros((300,), dtype="float32")
    for t in terms:
        try:
            averagevec = np.add(
                averagevec, FeatureExtractionUtilities.neg_model[t.lower()])
        except KeyError:
            pass  # out-of-vocabulary token contributes nothing
    if len(terms) > 0:  # guard replaces the original bare except
        averagevec = np.divide(averagevec, len(terms))
    sims = []
    for nfv in FeatureExtractionUtilities.neg_feature_vecs:
        sims.append(cosine(nfv, averagevec))
    # Second pass: the name `averagepvec` suggests a positive-class model was
    # intended, but the original reuses neg_model and neg_feature_vecs; that
    # behaviour is preserved here.
    averagepvec = np.zeros((300,), dtype="float32")
    for t in terms:
        try:
            averagepvec = np.add(
                averagepvec, FeatureExtractionUtilities.neg_model[t.lower()])
        except KeyError:
            pass
    if len(terms) > 0:
        # fixed: the original assigned the quotient to averagevec, leaving
        # averagepvec unnormalised before the cosine comparisons below
        averagepvec = np.divide(averagepvec, len(terms))
    for nfv in FeatureExtractionUtilities.neg_feature_vecs:
        sims.append(cosine(nfv, averagepvec))
    return np.nan_to_num(sims)
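# Usage sketch (hedged): assumes the class word2vec model `neg_model` and the
# centroid list `neg_feature_vecs` were loaded elsewhere in this module, and
# that scipy's `cosine` is imported at the top of the file; the tweet text is
# a made-up example.
#
#   sims = generateCentroidSimilarityScore('felt dizzy after the new meds')
#   # -> one cosine distance per stored centroid, with NaNs mapped to 0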
def generate_ade_feature(text):
    # Mark token positions that fall inside an ADR phrase (weight 10) or that
    # individually match the ADE lexicon (weight 1), then scale the whole
    # vector by the number of hits.
    terms = twokenize.tokenizeRawTweetText(text)
    tags = nltk.pos_tag(terms, 'universal')
    sentence_arr = np.zeros(32)
    ade_count = 0
    min_len = min(32, len(tags))
    for i in range(0, min_len):
        found_adr = False
        sentence_list = find_words(tags, i)
        for sentence in sentence_list:
            if find_adr(sentence):
                ln = len(nltk.word_tokenize(sentence))
                ade_count += 1
                sentence_arr[i:i + ln] = 10
                found_adr = True
        if (tags[i][0] not in stop_words_en and not found_adr
                and tags[i][1] in ('ADJ', 'ADV', 'NOUN', 'VERB')):
            # re.escape added: tokens containing regex metacharacters would
            # otherwise crash the search
            if re.search(r'\b{0}\b'.format(re.escape(tags[i][0])), ades):
                ade_count += 1
                sentence_arr[i] = 1
    sentence_arr = sentence_arr * ade_count
    sentence_ade.append(sentence_arr)  # module-level accumulator
    return sentence_arr
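# Usage sketch: `find_words`, `find_adr`, `stop_words_en`, `ades` and the
# `sentence_ade` accumulator are module-level names assumed to be defined
# above this function.
#
#   vec = generate_ade_feature('this drug gave me a terrible headache')
#   # 32-entry array; ADR phrase spans weighted 10, lexicon hits 1, and the
#   # whole vector scaled by the number of hits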
def getsentiwordscores(processed_data):
    # Sum SentiWordNet positive/negative scores over the adjectives, verbs
    # and nouns of each document; returns one [score] list per document.
    pos_map = {'adj': 'a', 'verb': 'v', 'noun': 'n'}
    negscores = []
    posscores = []
    for d in processed_data:
        negscore = 0.0
        posscore = 0.0
        terms = twokenize.tokenizeRawTweetText(d)
        pos_tags = nltk.pos_tag(terms, 'universal')
        for word, tag in pos_tags:
            try:
                wn_pos = pos_map.get(str(tag).lower())
                if wn_pos is None:
                    continue
                key = (str(word).lower(), wn_pos)
                if key in FeatureExtractionUtilities.sentiposscores:
                    posscore += float(FeatureExtractionUtilities.sentiposscores[key])
                if key in FeatureExtractionUtilities.sentinegscores:
                    negscore += float(FeatureExtractionUtilities.sentinegscores[key])
            except Exception:
                pass
        negscores.append([negscore])
        posscores.append([posscore])
    return negscores, posscores
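# Usage sketch: with the SentiWordNet dictionaries loaded,
#
#   neg, pos = getsentiwordscores(['the pills work great', 'awful cramps'])
#   # neg == [[n1], [n2]], pos == [[p1], [p2]] -- one summed score per document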
def getsubjectivityscores_seq(processed_data, sequence_length):
    # Per-token subjectivity sequences padded to sequence_length; renamed from
    # getsubjectivityscores so it no longer collides with the scalar version
    # defined further down.
    subjectivity_scores = []
    for d in processed_data:
        subjectivity_score_lst = []
        # filler vector for non-subjective tokens (the original called this
        # `zeros` even though it is random)
        rand_fill = np.random.uniform(low=0.0, high=1, size=6)
        ones = np.ones(6, dtype=np.float32)
        terms = twokenize.tokenizeRawTweetText(d)
        pos_tags = nltk.pos_tag(terms, 'universal')
        for word, tag in pos_tags:
            try:
                # the original keyed polarity_dict on the (word, tag) tuple and
                # then searched `terms` for that tuple, which always failed;
                # the token itself is used here
                if word in FeatureExtractionUtilities.polarity_dict:
                    subjectivity_score = FeatureExtractionUtilities.polarity_dict[word]
                    subjectivity_score_lst.append(100 * subjectivity_score * ones)
                else:
                    subjectivity_score_lst.append(rand_fill)
            except Exception:
                subjectivity_score_lst.append(rand_fill)
        diff = sequence_length - len(subjectivity_score_lst)
        for _ in range(0, diff):
            subjectivity_score_lst.append(rand_fill)
        subjectivity_scores.append(subjectivity_score_lst)
    return subjectivity_scores
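# Usage sketch: returns, per document, sequence_length entries of either a
# scaled subjectivity vector or the random filler.
#
#   seqs = getsubjectivityscores_seq(['I hate this medication'], 32)
#   # len(seqs[0]) == 32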
def generate_Document_word2vec(d, embedding_model):
    # Build a 32 x vector_size matrix for the document: embedding vectors for
    # in-vocabulary tokens, a cached random vector for each OOV token, and
    # random padding rows up to sequence_length.
    sentence_vec = []
    sequence_length = 32
    terms = twokenize.tokenizeRawTweetText(d.lower())
    min_len = min(sequence_length, len(terms))
    oov_cache = {}  # renamed from `dict`, which shadowed the builtin
    for i in range(0, min_len):
        if terms[i] in embedding_model:
            sentence_vec.append(np.array(embedding_model[terms[i]]))
        else:
            if terms[i] not in oov_cache:
                oov_cache[terms[i]] = np.array(
                    np.random.uniform(-0.25, 0.25, embedding_model.vector_size))
            sentence_vec.append(oov_cache[terms[i]])
    diff = sequence_length - len(sentence_vec)
    if diff < 0:
        print(diff)  # document longer than sequence_length; truncated above
    for i in range(0, diff):
        sentence_vec.append(
            np.random.uniform(-0.25, 0.25, embedding_model.vector_size))
    return sentence_vec
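# Usage sketch (hypothetical path; assumes gensim's KeyedVectors API, which
# supports `in`, item lookup and `.vector_size`):
#
#   from gensim.models import KeyedVectors
#   w2v = KeyedVectors.load_word2vec_format('embeddings.bin', binary=True)
#   mat = generate_Document_word2vec('my head hurts after the new meds', w2v)
#   # len(mat) == 32, each entry a vector of size w2v.vector_size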
def getclusterfeatures(sent):
    # Map each token to its word-cluster id and emit 'clust_<id>_clust'
    # marker tokens for the bag-of-words featuriser.
    terms = twokenize.tokenizeRawTweetText(sent)
    cluster_string = ''
    for t in terms:
        for k, members in FeatureExtractionUtilities.word_clusters.items():
            if t in members:
                cluster_string += ' clust_' + k + '_clust '
    return cluster_string
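# Usage sketch: with word_clusters mapping cluster ids to member word lists,
#
#   getclusterfeatures('feeling sleepy today')
#   # -> e.g. ' clust_0111_clust  clust_1001_clust ' (ids depend on the data)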
def getsubjectivityscores(processed_data):
    # Average subjectivity polarity per document (one scalar per document).
    subjectivity_scores = []
    for d in processed_data:
        subjectivity_score = 0.0
        terms = twokenize.tokenizeRawTweetText(d)
        pos_tags = nltk.pos_tag(terms, 'universal')
        for word, tag in pos_tags:
            try:
                # look tokens up directly; the original keyed on the
                # (word, tag) tuple, which never matched
                if word in FeatureExtractionUtilities.polarity_dict:
                    subjectivity_score += FeatureExtractionUtilities.polarity_dict[word]
            except Exception:
                pass
        if len(terms) > 0:  # guard against empty documents
            subjectivity_score = subjectivity_score / len(terms)
        subjectivity_scores.append([subjectivity_score])
    return subjectivity_scores
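# Usage sketch:
#
#   scores = getsubjectivityscores(['this drug is terrible'])
#   # one [average_polarity] entry per document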
def generate_vad_features(text):
    # Valence/arousal/dominance scores per token position (first 32 tokens);
    # renamed from generate_emotion_features to avoid clashing with the NRC
    # emotion version defined below.
    terms = twokenize.tokenizeRawTweetText(text)
    tags = nltk.pos_tag(terms, 'universal')
    valence_list = np.zeros(32)
    arousal_list = np.zeros(32)
    dominance_list = np.zeros(32)
    min_len = min(32, len(tags))
    for i in range(0, min_len):
        word = tags[i][0]
        # membership test replaces the original direct indexing, which raised
        # KeyError on out-of-lexicon tokens
        if word in valenceList:
            valence_list[i] = valenceList[word][0]
            arousal_list[i] = arousalList[word][0]
            dominance_list[i] = dominanceList[word][0]
    return (valence_list, arousal_list, dominance_list)
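# Usage sketch: `valenceList`, `arousalList` and `dominanceList` are assumed
# to be module-level dicts mapping words to score lists (e.g. from an
# ANEW-style lexicon loaded elsewhere).
#
#   v, a, dom = generate_vad_features('calm but exhausted')
#   # three 32-entry arrays, zero where a token is not in the lexicon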
def getSynsetString(sent, negations):
    # Expand adjectives, verbs and nouns into their WordNet lemma sets and
    # emit 'syn_<stem>_syn' marker tokens. `negations` is accepted but unused
    # in the original implementation.
    terms = twokenize.tokenizeRawTweetText(sent)
    pos_tags = nltk.pos_tag(terms, 'universal')
    wn_pos_map = {'adj': wn.ADJ, 'verb': wn.VERB, 'noun': wn.NOUN}
    sent_terms = []
    for word, tag in pos_tags:
        pos = wn_pos_map.get(str(tag).lower())
        if pos is None:
            continue
        for syn in wn.synsets(word.lower(), pos=pos):
            sent_terms += [lemma.lower() for lemma in syn.lemma_names()]
    sent_terms = list(set(sent_terms))
    senttermsstring = ''
    for term in sent_terms:
        senttermsstring += ' ' + 'syn_' + stemmer.stem(term) + '_syn'
    return senttermsstring
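# Usage sketch: requires NLTK's WordNet corpus (wn) and the module stemmer.
#
#   getSynsetString('severe pain', negations=[])
#   # -> ' syn_hurt_syn syn_pain_syn ...' (one marker per unique lemma stem)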
def generate_emotion_features(text):
    # NRC-style emotion features: mark each of the first 32 token positions
    # with a per-emotion code (13..22), then scale each emotion track by its
    # hit count when a document triggers that emotion more than once.
    EMOTIONS = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
                'negative', 'positive', 'sadness', 'surprise', 'trust']
    terms = twokenize.tokenizeRawTweetText(text)
    tags = nltk.pos_tag(terms, 'universal')
    tracks = dict((e, np.zeros(32)) for e in EMOTIONS)
    counts = dict((e, 0) for e in EMOTIONS)
    min_len = min(32, len(tags))
    for i in range(0, min_len):
        # .get avoids the KeyError the original raised on out-of-lexicon
        # tokens (debug prints removed)
        word_emotions = wordList.get(tags[i][0], [])
        for j, e in enumerate(EMOTIONS):
            if e in word_emotions:
                tracks[e][i] = 13 + j  # marker values 13..22, as in the original
                counts[e] += 1
    for e in EMOTIONS:
        if counts[e] > 1:
            tracks[e] = tracks[e] * counts[e]
    return tuple(tracks[e] for e in EMOTIONS)
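# Usage sketch: `wordList` is assumed to map words to their NRC emotion
# labels (e.g. wordList['abandon'] -> ['fear', 'negative', 'sadness']).
#
#   tracks = generate_emotion_features('I fear these side effects')
#   # 10-tuple of 32-entry arrays in the EMOTIONS order above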
def generate_senti_features(text):
    # SentiWordNet scores per token position: positive scores scaled by 200,
    # negative by 100, for the first 32 tokens. (The unused stemming step in
    # the original has been dropped.)
    terms = twokenize.tokenizeRawTweetText(text)
    pos_tags = nltk.pos_tag(terms, 'universal')
    pos_map = {'adj': 'a', 'verb': 'v', 'noun': 'n'}
    sentence_pos = np.zeros(32)
    sentence_neg = np.zeros(32)
    min_len = min(32, len(pos_tags))
    for i in range(0, min_len):
        wn_pos = pos_map.get(str(pos_tags[i][1]).lower())
        if wn_pos is None:
            continue
        key = (str(pos_tags[i][0]).lower(), wn_pos)
        if key in FeatureExtractionUtilities.sentiposscores:
            sentence_pos[i] = 200 * float(
                FeatureExtractionUtilities.sentiposscores[key])
        if key in FeatureExtractionUtilities.sentinegscores:
            sentence_neg[i] = 100 * float(
                FeatureExtractionUtilities.sentinegscores[key])
    return (sentence_pos, sentence_neg)
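# Usage sketch:
#
#   pos_vec, neg_vec = generate_senti_features('great drug, awful taste')
#   # two 32-entry arrays of scaled SentiWordNet scores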
def getsentiwordscores_seq(processed_data, sequence_length):
    # Per-token SentiWordNet feature sequences, padded to sequence_length;
    # renamed from getsentiwordscores to avoid clashing with the scalar
    # version above.
    pos_map = {'adj': 'a', 'verb': 'v', 'noun': 'n'}
    negscores = []
    posscores = []
    # filler/padding vector (random, despite the original name `zeros`)
    rand_fill = np.random.uniform(low=0.0, high=1, size=6)
    for d in processed_data:
        sentencenegscore = []
        sentenceposscore = []
        terms = twokenize.tokenizeRawTweetText(d)
        pos_tags = nltk.pos_tag(terms, 'universal')
        for word, tag in pos_tags:
            try:
                wn_pos = pos_map.get(str(tag).lower())
                if wn_pos is None:
                    sentencenegscore.append(rand_fill)
                    sentenceposscore.append(rand_fill)
                    continue
                key = (str(word).lower(), wn_pos)
                if key in FeatureExtractionUtilities.sentiposscores:
                    posscore = float(FeatureExtractionUtilities.sentiposscores[key])
                    sentenceposscore.append(100 * posscore * np.ones(6, dtype=np.float64))
                else:
                    sentenceposscore.append(rand_fill)
                if key in FeatureExtractionUtilities.sentinegscores:
                    negscore = float(FeatureExtractionUtilities.sentinegscores[key])
                    sentencenegscore.append(100 * negscore * np.ones(6, dtype=np.float64))
                else:
                    sentencenegscore.append(rand_fill)
            except Exception:
                pass
        # pad against the list lengths rather than len(terms): an exception
        # above can leave the lists shorter than the token count
        for _ in range(0, sequence_length - len(sentencenegscore)):
            sentencenegscore.append(rand_fill)
        for _ in range(0, sequence_length - len(sentenceposscore)):
            sentenceposscore.append(rand_fill)
        negscores.append(sentencenegscore)
        posscores.append(sentenceposscore)
    return negscores, posscores
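# Usage sketch: per-token counterpart of getsentiwordscores above.
#
#   neg_seqs, pos_seqs = getsentiwordscores_seq(['awful cramps'], 32)
#   # each inner list has exactly 32 six-dimensional entries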
def generate_sentence_matrix(d):
    # Build per-token lexicon feature tracks for the first 32 tokens of a
    # document, plus a concatenated 10-dimensional feature vector per token
    # (sentence_vec). Unused flags (`added`, `two`) from the original are
    # dropped; disabled blocks are noted inline.
    sentence_vec = []
    sentence_pos = []
    sentence_neg = []
    sentence_ade = []
    sentence_subj = []
    sentence_ppos = []
    sentence_nneg = []
    sentence_mgood = []
    sentence_lgood = []
    sentence_lbad = []
    sentence_mbad = []
    sentence_cluster = []
    sentence_cluster2 = []
    letters_count = []
    word_order = []
    sequence_length = 32
    embed_size = 1
    zeros = -1 * np.ones(embed_size, dtype=np.float64)  # "absent" marker (-1), despite the name
    ones = np.ones(embed_size, dtype=np.float64)
    one = np.ones(1, dtype=np.float64)
    ades = '|'.join(FeatureExtractionUtilities.ade_list)  # used only by the disabled ADE check below
    pos_map = {'adj': 'a', 'verb': 'v', 'noun': 'n'}
    terms = twokenize.tokenizeRawTweetText(d.lower())
    pos_tags = nltk.pos_tag(terms, 'universal')
    word_tokens = nltk.word_tokenize(d.lower())
    min_len = min(sequence_length, len(pos_tags))
    for i in range(0, min_len):
        letters_count.append(len(pos_tags[i][0]) * one)
        word_order.append(i * one)
        st = stemmer.stem(pos_tags[i][0])
        word_arr = np.array([])
        # NOTE: the pos/neg list names appear swapped relative to the lexicons
        # (bingnegs feeds sentence_pos and vice versa); the original behaviour
        # is preserved.
        if st in FeatureExtractionUtilities.bingnegs:
            word_arr = np.concatenate((word_arr, 1 * ones), axis=0)
            sentence_pos.append(one)
        else:
            word_arr = np.concatenate((word_arr, zeros), axis=0)
            sentence_pos.append(0 * one)
        if st in FeatureExtractionUtilities.bingposs:
            word_arr = np.concatenate((word_arr, 20 * ones), axis=0)
            sentence_neg.append(2 * one)
        else:
            word_arr = np.concatenate((word_arr, zeros), axis=0)
            sentence_neg.append(0 * one)
        # ADE regex check disabled in the original; the track stays at zero
        word_arr = np.concatenate((word_arr, zeros), axis=0)
        sentence_ade.append(0 * one)
        if st in FeatureExtractionUtilities.polarity_dict:
            # the original reassigned the loop index here and keyed the dict
            # on the raw token; the stem lookup is used consistently instead
            subjectivity_score = FeatureExtractionUtilities.polarity_dict[st]
            score = 40 * subjectivity_score * ones
            word_arr = np.concatenate((word_arr, score), axis=0)
            sentence_subj.append(score)
        else:
            sentence_subj.append(0 * one)
            word_arr = np.concatenate((word_arr, zeros), axis=0)
        wn_pos = pos_map.get(str(pos_tags[i][1]).lower())
        if wn_pos is not None:
            if (st, wn_pos) in FeatureExtractionUtilities.sentiposscores:
                posscore = float(FeatureExtractionUtilities.sentiposscores[(st, wn_pos)])
                score = 50 * posscore * ones
                word_arr = np.concatenate((word_arr, score), axis=0)
                sentence_ppos.append(score)
            else:
                word_arr = np.concatenate((word_arr, zeros), axis=0)
                sentence_ppos.append(0 * one)
            if (st, wn_pos) in FeatureExtractionUtilities.sentinegscores:
                negscore = float(FeatureExtractionUtilities.sentinegscores[(st, wn_pos)])
                score = 6 * negscore * ones
                sentence_nneg.append(score)
                word_arr = np.concatenate((word_arr, score), axis=0)
            else:
                sentence_nneg.append(0 * one)
                word_arr = np.concatenate((word_arr, zeros), axis=0)
        else:
            sentence_ppos.append(0 * one)
            sentence_nneg.append(0 * one)
            word_arr = np.concatenate((word_arr, zeros), axis=0)
            word_arr = np.concatenate((word_arr, zeros), axis=0)
        # intensifier/diminisher context: scan a +/-4 token window around
        # 'more'/'less' words for good/bad terms (the original wrapped this in
        # a redundant loop over word_tokens that never changed the result)
        moreGood = moreBad = lessGood = lessBad = 0
        if st in FeatureExtractionUtilities.morewords:
            for j in range(max(i - 4, 0), min(i + 4, len(pos_tags) - 1) + 1):
                t = stemmer.stem(pos_tags[j][0])
                if t in FeatureExtractionUtilities.goodwords:
                    moreGood = 70
                elif t in FeatureExtractionUtilities.badwords:
                    moreBad = 8
        if st in FeatureExtractionUtilities.lesswords:
            for j in range(max(i - 4, 0), min(i + 4, len(pos_tags) - 1) + 1):
                t = stemmer.stem(pos_tags[j][0])
                if t in FeatureExtractionUtilities.goodwords:
                    lessGood = 90
                elif t in FeatureExtractionUtilities.badwords:
                    lessBad = 10
        sentence_mgood.append(moreGood * one)
        sentence_mbad.append(moreBad * one)
        sentence_lgood.append(lessGood * one)
        sentence_lbad.append(lessBad * one)
        # word-cluster features disabled in the original; emit zero vectors
        sentence_cluster.append(np.zeros(10, dtype=np.float64))
        sentence_cluster2.append(np.zeros(10, dtype=np.float64))
        word_arr = np.concatenate(
            (word_arr, [moreGood, moreBad, lessGood, lessBad]), axis=0)
        sentence_vec.append(word_arr)
    # pad every track out to sequence_length
    diff = sequence_length - len(sentence_vec)
    if diff < 0:
        print(diff)
    for i in range(0, diff):
        z = np.concatenate((zeros, zeros, zeros, zeros, zeros,
                            zeros, zeros, zeros, zeros, zeros), axis=0)
        letters_count.append(0 * one)
        sentence_vec.append(z)
        sentence_neg.append(0 * one)
        sentence_pos.append(0 * one)
        sentence_ade.append(0 * one)
        sentence_subj.append(0 * one)
        sentence_ppos.append(0 * one)
        sentence_nneg.append(0 * one)
        sentence_lgood.append(0 * one)
        sentence_mbad.append(0 * one)
        sentence_lbad.append(0 * one)
        sentence_mgood.append(0 * one)
        sentence_cluster.append(np.zeros(10, dtype=np.float64))
        sentence_cluster2.append(np.zeros(10, dtype=np.float64))
        word_order.append(0 * one)
    return (sentence_ade, sentence_cluster, sentence_cluster2, sentence_lbad,
            sentence_lgood, sentence_mbad, sentence_mgood, sentence_neg,
            sentence_nneg, sentence_pos, sentence_ppos, sentence_subj,
            sentence_vec, letters_count, word_order)
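# Usage sketch: the lexicons (bingnegs, bingposs, polarity_dict, the
# SentiWordNet dicts, morewords/lesswords, goodwords/badwords) must all be
# loaded on FeatureExtractionUtilities first.
#
#   tracks = generate_sentence_matrix('this drug made me feel much worse')
#   # 15-tuple of per-token feature tracks; sentence_vec holds the
#   # 10-dimensional concatenated vector for each of the 32 positions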