예제 #1
0
def makedocvec(Folda = "businesstexts",clusternumber = DimentionN,word2vecdic = word2vecdic):
    businesspreprocessed_docs = maketextdoc(Folda)
    docvec = {}
    for k in businesspreprocessed_docs.keys():
        sentence = businesspreprocessed_docs[k]
        if len(sentence) < 2:
            continue
        docvec[k] = np.zeros(clusternumber)
        for m in sentence:
            try:
                labelnum = word2vecdic[m]
                docvec[k][labelnum] = (docvec[k][labelnum] + 1)
            except:
                #print k
                continue
    for key in docvec.keys():
        if all(docvec[key] == np.zeros(clusternumber)):
            print Folda,key
            #print docvec[key]
        else:
            docvec[key] = (docvec[key]/np.linalg.norm(docvec[key]))
    return docvec
    # NOTE(review): these statements sit after `return docvec` at the same
    # indentation, so as the file stands they are unreachable.  They look like
    # the tail of a separate k-fold evaluation routine whose `def` line is
    # missing (the commented-out call just below names
    # `evaluate_with_SVM_3_k_fold`) -- TODO confirm and restore the header.
    #
    # Run evaluate_with_SVM_3 once per fold index and accumulate the per-key
    # results across folds.
    for mod_value in range(mod_number):
        if mod_value == 0:
            # First fold: its results become the accumulators.
            all_target_pred_list_dic, all_target_test_list_dic = evaluate_with_SVM_3(topdocvec,toptarget_dic, newl_dic,func2,func4,func1, date = '07302015',mod_number = mod_number, mod_value = mod_value)
        else:
            target_pred_list_dic, target_test_list_dic = evaluate_with_SVM_3(topdocvec,toptarget_dic, newl_dic,func2,func4,func1, date = '07302015',mod_number = mod_number, mod_value = mod_value) 
            # Extend the accumulated values key by key with this fold's
            # results (presumably lists of labels -- verify against
            # evaluate_with_SVM_3).
            for dic_key in all_target_test_list_dic:
                all_target_test_list_dic[dic_key] += target_test_list_dic[dic_key]
                all_target_pred_list_dic[dic_key] += target_pred_list_dic[dic_key]
    # NOTE(review): if mod_number is 0 the loop above never runs and the
    # accumulators are never assigned here.
    for dic_key in all_target_test_list_dic:
        print dic_key
        # classification_report is presumably sklearn.metrics' -- confirm
        # against the file's imports.
        print classification_report(all_target_test_list_dic[dic_key],all_target_pred_list_dic[dic_key],digits=4)
    # Returned as (test, pred): the reverse of the (pred, test) order in
    # which evaluate_with_SVM_3's results are unpacked above.
    return all_target_test_list_dic, all_target_pred_list_dic

#all_target_test_list_dic, all_target_pred_list_dic = evaluate_with_SVM_3_k_fold(topdocvec,toptarget_dic, newl_dic)
# Bag-of-words setup.
# Load the per-year "top" texts; the generator keeps the original load order
# (2014, 2015, 2013) and leaves no helper name behind at module level.
topdoc2014, topdoc2015, topdoc2013 = (
    maketextdoc(Folda=folder)
    for folder in (
        "toptexts_kaigyou_kihon2",
        "toptexts_kaigyou_kihon2_2015",
        "toptexts_kaigyou_kihon2_2013",
    )
)

# Merge the three years into one mapping, starting from a shallow copy of
# 2013 so the per-year objects are not modified.  On duplicate keys the
# later update wins (2015 over 2014 over 2013).
topdoc = copy.copy(topdoc2013)
topdoc.update(topdoc2014)
topdoc.update(topdoc2015)

# Corpus used to build the vocabulary (alternative: preprocessed_docs = doc).
preprocessed_docs = topdoc
dct = gensim.corpora.Dictionary(preprocessed_docs.values())

# Vocabulary before and after pruning.  filter_extremes(no_below=5) drops
# tokens found in fewer than 5 documents; gensim's defaults for its other
# thresholds apply as well.
# To inspect what was removed: filtered_out = set(unfiltered) - set(filtered)
unfiltered = dct.token2id.keys()
dct.filter_extremes(no_below=5)
filtered = dct.token2id.keys()

# Result containers, created empty here.
bow_docs, bow_docs_all_zeros = {}, {}