words = []
            # combine wordlists of one doc in order to get a list of words
            for wordlist in doc:
                words += wordlist
            words = Preprocessing.Wordlist_cleaner(words, remove_puncts=True)

            # words = ['i','am','good'] or words = []
            Use_this_words = False
            for word in words:
                if word in index2word_set:
                    Use_this_words = True
                    break
            if Use_this_words == False:
                empty_docs.add(j)

            X = BuildFeature.getAvgFeatureVecs([words], model)
            y = np.array([emotion])
            FeatureVec = {'X': X, 'y': y}
            new_docs_dict[j] = FeatureVec
            if (j + 1) % 1000. == 0.:
                print ">>  %s - %d/%d doc finished! " % (emotion, j + 1,
                                                         len(docs))
        empty_docs_index[emotion] = empty_docs
        pickle.dump(
            new_docs_dict,
            open(save_path + '/' + feature_name + '.' + emotion + '.Xy.pkl',
                 'wb'))
        print ">> %s. %s emotion finishing!" % (i + 1, emotion)
        del new_docs_dict
        del empty_docs
    pickle.dump(empty_docs_index,
                avg_number_of_sentences = n_sentences / number_of_parts
                number_of_sentences_remain = n_sentences % number_of_parts
                start_row = 0
                for s in xrange(number_of_parts):
                    if number_of_sentences_remain <= 0:
                        Part_of_clean_doc = clean_doc[start_row:start_row +
                                                      avg_number_of_sentences]
                    else:
                        Part_of_clean_doc = clean_doc[start_row:start_row +
                                                      avg_number_of_sentences +
                                                      1]
                    start_row = start_row + len(Part_of_clean_doc)
                    Part_of_clean_doc = sum(Part_of_clean_doc, [])
                    new_doc.append(Part_of_clean_doc)
                    number_of_sentences_remain = number_of_sentences_remain - 1

            DocVec = BuildFeature.getAvgFeatureVecs(new_doc, model)
            DocVec = np.reshape(DocVec, np.product(DocVec.shape))
            X[idx] = DocVec
            idx = idx + 1

            if (j + 1) % 100. == 0.:
                print ">>  %s - %d/%d doc finished! " % (emotion, j + 1,
                                                         len(docs))
            del new_doc, clean_doc, DocVec

        label = np.array([emotion] * len(docs))
        y = np.concatenate((y, label), axis=1)
        del docs
        print ">> %s. %s emotion finishing!" % (i + 1, emotion)
    np.savez_compressed(save_path, X=X, y=y)
        print 'Start to load ' + emotion + '_wordlists.pkl'
        docs = pickle.load(
            open(load_path + '/' + emotion + '_wordlists.pkl', "rb"))
        # docs = docs[0:5]
        for j, doc in enumerate(docs):
            new_doc_keyword = []
            new_doc = sum(doc, [])
            for word in new_doc:
                if word in basic_keyword_list and word in index2word_set:
                    new_doc_keyword.append(word)

            # print new_doc_keyword
            if len(new_doc_keyword) == 0:
                empty_docs.add((emotion, j))
                new_docs.append(new_doc)
            else:
                new_docs.append(new_doc_keyword)

            if (j + 1) % 100. == 0.:
                print ">>  %s - %d/%d doc finished! " % (emotion, j + 1,
                                                         len(docs))

        label = np.array([emotion] * len(docs))
        y = np.concatenate((y, label), axis=1)
        print ">> %s. %s emotion finishing!" % (i + 1, emotion)

    print empty_docs

    X = BuildFeature.getAvgFeatureVecs(new_docs, model)
    np.savez_compressed(save_path, X=X, y=y)
                Use_this_wordlist = False
                wordlist = Preprocessing.Wordlist_cleaner(wordlist,remove_puncts=True)

                if len(wordlist) > 0 :
                    for word in wordlist:
                        if word in index2word_set:
                            Use_this_wordlist = True
                            break
                    if Use_this_wordlist == True:
                        new_doc.append(wordlist)

            if len(new_doc) == 0:
                empty_docs.add(j)
                new_doc = [new_doc]

            X = BuildFeature.getAvgFeatureVecs(new_doc, model)
            y = np.array([emotion]*len(new_doc))
            FeatureVec = {'X': X, 'y': y}
            new_docs_dict[j] = FeatureVec
            if (j+1)%100. == 0.:
                print ">>  %s - %d/%d doc finished! " % (emotion,j+1,len(docs))
            del new_doc
            
        empty_docs_index[emotion] = empty_docs
        pickle.dump(new_docs_dict, open(save_path+'/'+feature_name+'.'+emotion+'.Xy.pkl', 'wb'))
        print ">> %s. %s emotion finishing!" % (i+1,emotion)
        del new_docs_dict
        # print emotion," ",empty_docs
        del empty_docs
    pickle.dump(empty_docs_index, open(save_path+'/empty_docs_index.pkl', 'wb'))
                    for word in wordlist:
                        # if word in basic_keyword_list and word in index2word_set:
                        if word in extend_keyword_list and word in index2word_set:
                            keyword_inside = True
                            break
                    if Use_this_wordlist == True:
                        new_doc.append(wordlist)
                    if keyword_inside == True:
                        new_doc_with_keywords.append(wordlist)

            if len(new_doc) == 0:
                empty_docs.add(j)
                new_doc = [new_doc]

            if len(new_doc_with_keywords) == 0:
                SentenceVecs = BuildFeature.getAvgFeatureVecs(new_doc, model)
            else:
                SentenceVecs = BuildFeature.getAvgFeatureVecs(new_doc_with_keywords, model)
            
            n_sentences = SentenceVecs.shape[0]
            DocVec = np.zeros((n_features,),dtype="float32")

            # print 'SentenceVecs : ', SentenceVecs.shape
            for row in xrange(n_sentences):
                DocVec = np.add(DocVec,SentenceVecs[row])
            DocVec = np.divide(DocVec,n_sentences)
            del SentenceVecs

            X[idx] = DocVec
            idx = idx + 1
# --- Example #6 (scraper artifact: "Ejemplo n.º 6" separator; score: 0) ---
# NOTE(review): the lines above/below this marker come from different source
# snippets fused together — the surrounding fragments are not one function.
                    for word in wordlist:
                        # if word in basic_keyword_list and word in index2word_set:
                        if word in extend_keyword_list and word in index2word_set:
                            keyword_inside = True
                            break
                    if Use_this_wordlist == True:
                        new_doc.append(wordlist)
                    if keyword_inside == True:
                        new_doc_with_keywords.append(wordlist)

            if len(new_doc) == 0:
                empty_docs.add(j)
                new_doc = [new_doc]

            if len(new_doc_with_keywords) == 0:
                SentenceVecs = BuildFeature.getAvgFeatureVecs(new_doc, model)
            else:
                SentenceVecs = BuildFeature.getAvgFeatureVecs(
                    new_doc_with_keywords, model)

            n_sentences = SentenceVecs.shape[0]
            DocVec = np.zeros((n_features, ), dtype="float32")

            # print 'SentenceVecs : ', SentenceVecs.shape
            for row in xrange(n_sentences):
                DocVec = np.add(DocVec, SentenceVecs[row])
            DocVec = np.divide(DocVec, n_sentences)
            del SentenceVecs

            X[idx] = DocVec
            idx = idx + 1