for key in wordIDdic.keys(): IDwordIDdic[wordIDdic[key]] = key voclist = [] for ID in voclist_ID: if ID != u"</s>": voclist.append(IDwordIDdic[int(ID)]) else: voclist.append("</s>") word2vecdic = dict(zip(voclist, labels)) pickle.dump(word2vecdic,open(("AntSynword2vec_kai_dic_" + str(DimentionN) + ".pkl"),"w")) pickle.dump(word2vecdic,open(("AntSynword2vec_dic_" + str(DimentionN) + ".pkl"),"w")) topdocvec2014 = makedocvec(word2vecdic, Folda = "toptexts_kaigyou_kihon2", clusternumber = DimentionN) topdocvec2015 = makedocvec(word2vecdic, Folda = "toptexts_kaigyou_kihon2_2015", clusternumber = DimentionN) topdocvec2013 = makedocvec(word2vecdic, Folda = "toptexts_kaigyou_kihon2_2013", clusternumber = DimentionN) topdocvec = copy.copy(topdocvec2013) topdocvec.update(topdocvec2014) topdocvec.update(topdocvec2015) from sklearn.metrics import confusion_matrix def devide_train_test_with_date(newl,toptarget,date = '07302015', mod_number = 5, mod_value = 0): for text_index, text_id in enumerate(reversed(newl)): if text_id[0:8] == date: train_test_index = text_index break else:
docvec[k][labelnum] = (docvec[k][labelnum] + 1) except: #print k continue for key in docvec.keys(): if all(docvec[key] == np.zeros(clusternumber)): print Folda,key #print docvec[key] else: docvec[key] = (docvec[key]/np.linalg.norm(docvec[key])) return docvec #topdocvec = makedocvec(Folda = "toptexts",clusternumber = DimentionN) #topdocvec = makedocvec(Folda = "toptexts_kaigyou_kihon",clusternumber = DimentionN) #topdocvec = makedocvec(Folda = "toptexts_kaigyou_kihon2",clusternumber = DimentionN) topdocvec2013 = makedocvec("toptexts_kaigyou_kihon2_2013", DimentionN, word2vecdic) topdocvec2014 = makedocvec("toptexts_kaigyou_kihon2", DimentionN, word2vecdic) topdocvec2015 = makedocvec("toptexts_kaigyou_kihon2_2015", DimentionN, word2vecdic) topdocvec = copy.copy(topdocvec2013) topdocvec.update(topdocvec2014) topdocvec.update(topdocvec2015) topMat = topdocvec.values() #files = os.listdir('topnewstextswithtag/') thread = {} #thread[ID] = {} kigou = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_abcdefghijklmnopqrstuvwxyz{|}~" _re_remove = re.compile(r"(http(s)?://[\w.\-/:#?=&;\\%~\+]+|<[^>]*?>)")