words = [] # combine wordlists of one doc in order to get a list of words for wordlist in doc: words += wordlist words = Preprocessing.Wordlist_cleaner(words, remove_puncts=True) # words = ['i','am','good'] or words = [] Use_this_words = False for word in words: if word in index2word_set: Use_this_words = True break if Use_this_words == False: empty_docs.add(j) X = BuildFeature.getAvgFeatureVecs([words], model) y = np.array([emotion]) FeatureVec = {'X': X, 'y': y} new_docs_dict[j] = FeatureVec if (j + 1) % 1000. == 0.: print ">> %s - %d/%d doc finished! " % (emotion, j + 1, len(docs)) empty_docs_index[emotion] = empty_docs pickle.dump( new_docs_dict, open(save_path + '/' + feature_name + '.' + emotion + '.Xy.pkl', 'wb')) print ">> %s. %s emotion finishing!" % (i + 1, emotion) del new_docs_dict del empty_docs pickle.dump(empty_docs_index,
# --- Reconstructed formatting: this chunk was collapsed onto one physical line;
# --- newlines/indentation below are inferred (TODO confirm against original file).
# Fragment of a per-document loop: splits a cleaned document's sentences into
# `number_of_parts` roughly equal consecutive parts, averages a word2vec vector
# per part, and concatenates the part vectors into one flat document feature.
# Python 2 `/` is integer division here, so this is floor division (intended).
avg_number_of_sentences = n_sentences / number_of_parts
number_of_sentences_remain = n_sentences % number_of_parts
start_row = 0
for s in xrange(number_of_parts):
    # The first `number_of_sentences_remain` parts take one extra sentence so
    # all sentences are distributed.
    if number_of_sentences_remain <= 0:
        Part_of_clean_doc = clean_doc[start_row:start_row + avg_number_of_sentences]
    else:
        Part_of_clean_doc = clean_doc[start_row:start_row + avg_number_of_sentences + 1]
    start_row = start_row + len(Part_of_clean_doc)
    # Flatten list-of-wordlists into one word list; `sum(lists, [])` is
    # quadratic — itertools.chain would be linear (left unchanged here).
    Part_of_clean_doc = sum(Part_of_clean_doc, [])
    new_doc.append(Part_of_clean_doc)
    number_of_sentences_remain = number_of_sentences_remain - 1
# One averaged vector per part, then flattened to a single 1-D feature vector.
# NOTE: np.product is deprecated in modern NumPy (use np.prod).
DocVec = BuildFeature.getAvgFeatureVecs(new_doc, model)
DocVec = np.reshape(DocVec, np.product(DocVec.shape))
X[idx] = DocVec
idx = idx + 1
# Progress report every 100 docs (Python 2 float-modulo idiom).
if (j + 1) % 100. == 0.:
    print ">> %s - %d/%d doc finished! " % (emotion, j + 1, len(docs))
del new_doc, clean_doc, DocVec
# --- Presumably dedented to the per-emotion loop level from here (TODO confirm):
label = np.array([emotion] * len(docs))
# NOTE(review): concatenating 1-D arrays with axis=1 raises in modern NumPy;
# only worked on very old versions — should be axis=0 (or axis=None). Verify.
y = np.concatenate((y, label), axis=1)
del docs
print ">> %s. %s emotion finishing!" % (i + 1, emotion)
# --- Presumably after the per-emotion loop (TODO confirm):
np.savez_compressed(save_path, X=X, y=y)
print 'Start to load ' + emotion + '_wordlists.pkl' docs = pickle.load( open(load_path + '/' + emotion + '_wordlists.pkl', "rb")) # docs = docs[0:5] for j, doc in enumerate(docs): new_doc_keyword = [] new_doc = sum(doc, []) for word in new_doc: if word in basic_keyword_list and word in index2word_set: new_doc_keyword.append(word) # print new_doc_keyword if len(new_doc_keyword) == 0: empty_docs.add((emotion, j)) new_docs.append(new_doc) else: new_docs.append(new_doc_keyword) if (j + 1) % 100. == 0.: print ">> %s - %d/%d doc finished! " % (emotion, j + 1, len(docs)) label = np.array([emotion] * len(docs)) y = np.concatenate((y, label), axis=1) print ">> %s. %s emotion finishing!" % (i + 1, emotion) print empty_docs X = BuildFeature.getAvgFeatureVecs(new_docs, model) np.savez_compressed(save_path, X=X, y=y)
# --- Reconstructed formatting: this chunk was collapsed onto one physical line;
# --- newlines/indentation below are inferred (TODO confirm against original file).
# Fragment of a per-sentence loop (enclosing `for wordlist in doc:` and the
# per-doc / per-emotion loops are outside this view). Keeps each cleaned
# sentence that has at least one in-vocabulary word, builds one vector per
# kept sentence, and pickles a per-emotion {doc_index: {'X','y'}} dict.
Use_this_wordlist = False
wordlist = Preprocessing.Wordlist_cleaner(wordlist, remove_puncts=True)
if len(wordlist) > 0:
    # Sentence is usable only if some token is in the word2vec vocabulary.
    for word in wordlist:
        if word in index2word_set:
            Use_this_wordlist = True
            break
if Use_this_wordlist == True:
    new_doc.append(wordlist)
# --- Presumably dedented to the per-doc loop level from here (TODO confirm):
if len(new_doc) == 0:
    # No usable sentence: record the doc index and wrap so the doc still
    # yields one (empty) sentence for vectorization.
    # NOTE(review): placement of `new_doc = [new_doc]` inside this branch is
    # inferred from a sibling chunk — confirm.
    empty_docs.add(j)
    new_doc = [new_doc]
# One vector per kept sentence; one label per sentence vector.
X = BuildFeature.getAvgFeatureVecs(new_doc, model)
y = np.array([emotion] * len(new_doc))
FeatureVec = {'X': X, 'y': y}
new_docs_dict[j] = FeatureVec
# Progress report every 100 docs (Python 2 float-modulo idiom).
if (j + 1) % 100. == 0.:
    print ">> %s - %d/%d doc finished! " % (emotion, j + 1, len(docs))
del new_doc
# --- Presumably dedented to the per-emotion loop level from here (TODO confirm):
empty_docs_index[emotion] = empty_docs
pickle.dump(new_docs_dict,
            open(save_path + '/' + feature_name + '.' + emotion + '.Xy.pkl', 'wb'))
print ">> %s. %s emotion finishing!" % (i + 1, emotion)
del new_docs_dict
# print emotion," ",empty_docs
del empty_docs
# --- Presumably after the per-emotion loop (TODO confirm):
pickle.dump(empty_docs_index,
            open(save_path + '/empty_docs_index.pkl', 'wb'))
# --- Reconstructed formatting: this chunk was collapsed onto one physical line;
# --- newlines/indentation below are inferred (TODO confirm against original file).
# Fragment of a per-sentence loop (enclosing `for wordlist in doc:` plus the
# per-doc loop are outside this view; `Use_this_wordlist`/`keyword_inside`
# are presumably initialized just before this fragment). Prefers sentences
# containing extended keywords when averaging a document vector.
for word in wordlist:
    # if word in basic_keyword_list and word in index2word_set:
    if word in extend_keyword_list and word in index2word_set:
        keyword_inside = True
        break
if Use_this_wordlist == True:
    new_doc.append(wordlist)
if keyword_inside == True:
    new_doc_with_keywords.append(wordlist)
# --- Presumably dedented to the per-doc loop level from here (TODO confirm):
if len(new_doc) == 0:
    # No usable sentence at all: record the doc index and wrap so the doc
    # still yields one (empty) sentence for vectorization.
    empty_docs.add(j)
    new_doc = [new_doc]
# Vectorize only the keyword-bearing sentences when any exist; otherwise
# fall back to all usable sentences.
if len(new_doc_with_keywords) == 0:
    SentenceVecs = BuildFeature.getAvgFeatureVecs(new_doc, model)
else:
    SentenceVecs = BuildFeature.getAvgFeatureVecs(new_doc_with_keywords, model)
# Manual mean over sentence vectors -> one document vector of n_features dims.
# (np.mean(SentenceVecs, axis=0) would be equivalent — left unchanged here.)
n_sentences = SentenceVecs.shape[0]
DocVec = np.zeros((n_features,), dtype="float32")
# print 'SentenceVecs : ', SentenceVecs.shape
for row in xrange(n_sentences):
    DocVec = np.add(DocVec, SentenceVecs[row])
DocVec = np.divide(DocVec, n_sentences)
del SentenceVecs
X[idx] = DocVec
idx = idx + 1
# --- Reconstructed formatting: this chunk was collapsed onto one physical line;
# --- newlines/indentation below are inferred (TODO confirm against original file).
# NOTE(review): this chunk is a near-duplicate of the previous one (only the
# original source formatting differed) — a candidate for deduplication into a
# shared helper once the enclosing scopes are visible.
# Fragment of a per-sentence loop; prefers sentences containing extended
# keywords when averaging a document vector.
for word in wordlist:
    # if word in basic_keyword_list and word in index2word_set:
    if word in extend_keyword_list and word in index2word_set:
        keyword_inside = True
        break
if Use_this_wordlist == True:
    new_doc.append(wordlist)
if keyword_inside == True:
    new_doc_with_keywords.append(wordlist)
# --- Presumably dedented to the per-doc loop level from here (TODO confirm):
if len(new_doc) == 0:
    # No usable sentence at all: record the doc index and wrap so the doc
    # still yields one (empty) sentence for vectorization.
    empty_docs.add(j)
    new_doc = [new_doc]
# Vectorize only the keyword-bearing sentences when any exist; otherwise
# fall back to all usable sentences.
if len(new_doc_with_keywords) == 0:
    SentenceVecs = BuildFeature.getAvgFeatureVecs(new_doc, model)
else:
    SentenceVecs = BuildFeature.getAvgFeatureVecs(
        new_doc_with_keywords, model)
# Manual mean over sentence vectors -> one document vector of n_features dims.
n_sentences = SentenceVecs.shape[0]
DocVec = np.zeros((n_features, ), dtype="float32")
# print 'SentenceVecs : ', SentenceVecs.shape
for row in xrange(n_sentences):
    DocVec = np.add(DocVec, SentenceVecs[row])
DocVec = np.divide(DocVec, n_sentences)
del SentenceVecs
X[idx] = DocVec
idx = idx + 1