# Fill the inverse lookup table (ID -> word) from the forward table
# (word -> ID).  If two words share an ID, the one iterated last wins.
for key, word_id in wordIDdic.items():
    IDwordIDdic[word_id] = key

# Translate each entry of voclist_ID back into its word.  The "</s>" marker
# has no integer ID, so it is passed through as-is instead of being looked up.
voclist = []
for ID in voclist_ID:
    word = "</s>" if ID == u"</s>" else IDwordIDdic[int(ID)]
    voclist.append(word)

# Pair every vocabulary word with the label at the same position.
# zip() stops at the shorter of the two sequences.
word2vecdic = dict(zip(voclist, labels))

# The same mapping is saved under two file names (with and without the
# "_kai" infix), each suffixed with the dimension count.
# Files are opened in binary mode, which pickle requires for every protocol
# other than the ASCII protocol 0, and inside a `with` block so each handle
# is flushed and closed deterministically instead of being left to the
# garbage collector.
for _pickle_prefix in ("AntSynword2vec_kai_dic_", "AntSynword2vec_dic_"):
    with open(_pickle_prefix + str(DimentionN) + ".pkl", "wb") as _pickle_file:
        pickle.dump(word2vecdic, _pickle_file)


# Build one document-vector table per text folder.  The calls stay in their
# original order (2014, 2015, 2013) so that any output makedocvec produces
# appears in the same sequence.  The folder without a year suffix is the one
# bound to the 2014 name.
topdocvec2014 = makedocvec(
    word2vecdic,
    clusternumber=DimentionN,
    Folda="toptexts_kaigyou_kihon2",
)
topdocvec2015 = makedocvec(
    word2vecdic,
    clusternumber=DimentionN,
    Folda="toptexts_kaigyou_kihon2_2015",
)
topdocvec2013 = makedocvec(
    word2vecdic,
    clusternumber=DimentionN,
    Folda="toptexts_kaigyou_kihon2_2013",
)

# Merge into a single table starting from a shallow copy of the 2013 table,
# so the per-folder tables themselves are left unmodified.  Entries from a
# later update replace earlier ones that share a key.
topdocvec = copy.copy(topdocvec2013)
for later_docvec in (topdocvec2014, topdocvec2015):
    topdocvec.update(later_docvec)


from sklearn.metrics import confusion_matrix

def devide_train_test_with_date(newl,toptarget,date = '07302015', mod_number = 5, mod_value = 0):
    for text_index, text_id in enumerate(reversed(newl)):
        if text_id[0:8] == date:
            train_test_index = text_index
            break
        else:
Beispiel #2
0
                docvec[k][labelnum] = (docvec[k][labelnum] + 1)
            except:
                #print k
                continue
    for key in docvec.keys():
        if all(docvec[key] == np.zeros(clusternumber)):
            print Folda,key
            #print docvec[key]
        else:
            docvec[key] = (docvec[key]/np.linalg.norm(docvec[key]))
    return docvec

#topdocvec = makedocvec(Folda = "toptexts",clusternumber = DimentionN)
#topdocvec = makedocvec(Folda = "toptexts_kaigyou_kihon",clusternumber = DimentionN)
#topdocvec = makedocvec(Folda = "toptexts_kaigyou_kihon2",clusternumber = DimentionN)
# Build one document-vector table per text folder, in the order 2013, 2014,
# 2015.  The folder without a year suffix is the one bound to the 2014 name.
topdocvec2013, topdocvec2014, topdocvec2015 = [
    makedocvec(folder_name, DimentionN, word2vecdic)
    for folder_name in (
        "toptexts_kaigyou_kihon2_2013",
        "toptexts_kaigyou_kihon2",
        "toptexts_kaigyou_kihon2_2015",
    )
]

# Merge into a single table starting from a shallow copy of the 2013 table,
# so the per-folder tables themselves are left unmodified.  Entries from a
# later update replace earlier ones that share a key.
topdocvec = copy.copy(topdocvec2013)
for newer_docvec in (topdocvec2014, topdocvec2015):
    topdocvec.update(newer_docvec)


# All merged document vectors, without their keys.
# NOTE(review): under Python 2 (which the `print` statement elsewhere in this
# file implies) `.values()` returns a list whose order is the dict's
# arbitrary iteration order -- confirm that callers do not rely on a
# particular row order.
topMat = topdocvec.values()


#files = os.listdir('topnewstextswithtag/')
# Starts out empty.  The commented-out line below suggests it is meant to be
# keyed by an ID with a dict per entry -- confirm against the code that
# fills it.
thread = {}
#thread[ID] = {}
# The ASCII characters '!' through '~' (punctuation, digits and letters),
# with the backtick left out.
kigou = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_abcdefghijklmnopqrstuvwxyz{|}~"
# Matches either an http/https URL or a single `<...>` span (such as a markup
# tag).  The name suggests matches are meant to be removed from text; the
# code that uses the pattern is not shown here.
_re_remove = re.compile(r"(http(s)?://[\w.\-/:#?=&;\\%~\+]+|<[^>]*?>)")