def add_label_to_dict(src_path, dl_pair_path, dictionary, r_dictionary):
    """Append every label token to the word dictionary and emit one-hot training pairs.

    Loads (or builds and caches) the label dictionary, assigns each label the
    next free index in ``dictionary`` (with the reverse mapping kept in
    ``r_dictionary``), then writes one ``[word_onehot, label_onehot]`` pair per
    label into ``dl_pair_path`` as sequentially numbered pickles.

    Args:
        src_path: source directory handed to ``create_label_dict`` on a cache miss.
        dl_pair_path: directory receiving the numbered ``<n>.pickle`` pair files.
        dictionary: word -> index mapping, mutated in place.
        r_dictionary: index -> word reverse mapping, mutated in place.

    Returns:
        The (mutated) ``dictionary`` and ``r_dictionary``.
    """
    # NOTE(review): hard-coded cache path inherited from the original code.
    label_path = '/home/ubuntu/workspace/text_summary_data/task_w2v/data_label_pairs/toytmp_label_dict.pickle'
    if os.path.isfile(label_path):
        label_dict_ori = statics.loadfrompickle(label_path)
    else:
        label_dict_ori = create_label_dict(src_path)
        statics.savetopickle(label_path, label_dict_ori)
    # Element [3] holds the label->index dict — presumably create_label_dict
    # returns a tuple; TODO confirm against its definition.
    label_dict = label_dict_ori[3]

    # First pass: extend the word dictionary with every label token.
    for l in label_dict:
        labelidx = len(dictionary)
        dictionary[l] = labelidx
        r_dictionary[labelidx] = l

    # Second pass: emit one (word one-hot, label one-hot) pair per label.
    # Bug fix: the original re-ran len(os.listdir(dl_pair_path)) on every
    # iteration (a full directory scan per file written). Each iteration
    # writes exactly one new file, so a counter hoisted out of the loop is
    # equivalent and O(1) per iteration.
    index = len(os.listdir(dl_pair_path))
    for l in label_dict:
        word_onehot = np.zeros(len(dictionary))
        word_onehot[dictionary[l]] = 1
        label = np.zeros(len(label_dict))
        label[label_dict[l]] = 1
        dl_pair = [word_onehot, label]
        statics.savetopickle(
            os.path.join(dl_pair_path, str(index + 1) + '.pickle'), dl_pair)
        index = index + 1
    return dictionary, r_dictionary
def collect_dict(data, dict_path, wdicts=None):
    """Accumulate label indices or word counts into ``wdicts`` and pickle it.

    If ``data`` is a dict it is treated as a label record: the first entry of
    ``data['model']`` / ``data['OS']`` / ``data['category']`` is assigned the
    next free index in the corresponding sub-dict (if not already present).
    Otherwise ``data`` is treated as an iterable of words and per-word
    occurrence counts are accumulated at the top level of ``wdicts``.

    The updated dict is pickled to ``dict_path`` and returned.

    Args:
        data: label record (dict) or iterable of words.
        dict_path: pickle file the accumulated dict is saved to.
        wdicts: existing accumulator to extend; a fresh
            ``{'model': {}, 'OS': {}, 'category': {}}`` is created when None.

    Returns:
        The (mutated) ``wdicts`` accumulator.
    """
    # Bug fix: the original signature used a mutable default argument
    # (wdicts={'model': {}, ...}), which is shared and mutated across every
    # call that omits wdicts. Use the None-sentinel idiom instead; callers
    # that want accumulation should pass the returned dict back in.
    if wdicts is None:
        wdicts = {'model': {}, 'OS': {}, 'category': {}}
    if isinstance(data, dict):
        lmodel = data['model'][0]
        los = data['OS'][0]
        lcat = data['category'][0]
        if lmodel not in wdicts['model']:
            wdicts['model'][lmodel] = len(wdicts['model'])
        if los not in wdicts['OS']:
            wdicts['OS'][los] = len(wdicts['OS'])
        if lcat not in wdicts['category']:
            wdicts['category'][lcat] = len(wdicts['category'])
    else:
        # Word-count mode: counts live beside the three label sub-dicts at
        # the top level of wdicts (original behavior, preserved).
        for w in data:
            if w in wdicts:
                wdicts[w] = wdicts[w] + 1
            else:
                wdicts[w] = 1
    statics.savetopickle(dict_path, wdicts)
    return wdicts
# NOTE(review): fragment of a larger loop — `i`, `src_list`, `process_data_root`,
# `words`, and `word_pool_path` are defined by enclosing code outside this view.
# Per visible iteration: skip non-directories, concatenate the pickled word lists
# of every file except 'label.pickle' into `words`, and checkpoint `words` to
# `word_pool_path` every 1000 iterations. The commented-out tail is the start of
# a word2vec batch generator (dead code, kept as-is).
print("Process: {}/{}".format(i, len(src_list))) dir_path = os.path.join(process_data_root, src_list[i]) if os.path.isdir(dir_path) : file_path = os.listdir(dir_path) else: continue for f in file_path: if f != 'label.pickle': path = os.path.join(dir_path, f) words = words + statics.loadfrompickle(path) if i%1000 == 0: statics.savetopickle(word_pool_path, words) #-----------------Encode words by dict------------------------------- #code_words = [] #for i in words: # code_words.append(wdict[i]) #data = code_words #data_index = 0 # #def generate_batch(batch_size, num_skips, skip_window): # global data_index # assert batch_size % num_skips == 0 # assert num_skips <= 2 * skip_window # batch = np.ndarray(shape=(batch_size), dtype=np.int32)
# NOTE(review): for every dictionary word, scan every sample folder under
# src_path: build the folder's multi-hot label vector from its label.pickle
# (model/OS/category values, spaces stripped; TODO confirm that multi-hot —
# rather than one label per pair — is intended), then for each non-label file
# write a [word_onehot, label] pair when the word occurs in the file, and an
# [unk_onehot, label] pair before `break`ing out of the file loop.
# The exact nesting of the unk-pair write and the `break` relative to
# `if w in words:` is ambiguous in this whitespace-mangled source, so the code
# is deliberately left byte-identical — verify against version control before
# reformatting. `all_words` is accepted but never used. The repeated
# len(os.listdir(dl_pair_path)) makes pair numbering O(n) per file written.
def create_train_pair_by_dict(src_path, all_words, dl_pair_path, dictionary): label_path = '/home/ubuntu/workspace/text_summary_data/task_w2v/data_label_pairs/tmp_label_dict.pickle' if os.path.isfile(label_path): label_dict_ori = statics.loadfrompickle(label_path) else: label_dict_ori = create_label_dict(src_path) statics.savetopickle(label_path, label_dict_ori) label_dict = label_dict_ori[3] files = os.listdir(src_path) total_file = len(files) unk_onehot = np.zeros(len(dictionary)) unk_onehot[0] = 1 count = 0 for w in dictionary: count = count + 1 print("create_train_pair_by_dict:{}/{}".format(count, len(dictionary))) word_onehot = np.zeros(len(dictionary)) word_onehot[dictionary[w]] = 1 for idx in range(total_file): f = files[idx] subfold = os.path.join(src_path, f) subfiles = os.listdir(subfold) l = statics.loadfrompickle(os.path.join(subfold, 'label.pickle')) label = np.zeros(len(label_dict)) if l['model'][0].replace(" ", "") in label_dict: label[label_dict[l['model'][0].replace(" ", "")]] = 1 if l['OS'][0].replace(" ", "") in label_dict: label[label_dict[l['OS'][0].replace(" ", "")]] = 1 if l['category'][0].replace(" ", "") in label_dict: label[label_dict[l['category'][0].replace(" ", "")]] = 1 for sf in subfiles: if sf != 'label.pickle': file = os.path.join(subfold, sf) words = statics.loadfrompickle(file) if w in words: index = len(os.listdir(dl_pair_path)) dl_pair = [word_onehot, label] statics.savetopickle( os.path.join(dl_pair_path, str(index + 1) + '.pickle'), dl_pair) index = len(os.listdir(dl_pair_path)) dl_pair = [unk_onehot, label] statics.savetopickle( os.path.join(dl_pair_path, str(index + 1) + '.pickle'), dl_pair) break
def create_data_label_path(dataset_path_list, dl_pair_path):
    """Build (and cache) a dict mapping each sample folder to its context/label file paths.

    For every folder under every dataset root: record the paths of its content
    files (everything except ``labels.txt``) under ``'context'`` and the path
    of ``labels.txt`` under ``'label'``. Folders that contain only a label
    file are either deleted from disk (empty label) or given a synthesized
    ``context.txt`` built from the label's description section. The finished
    dict is pickled to ``dl_pair_path``; on later calls the pickle is loaded
    and returned directly.

    Args:
        dataset_path_list: list of dataset root directories to walk.
        dl_pair_path: pickle cache file for the resulting dict.

    Returns:
        dict: ``{folder: {'context': ..., 'label': ...}}``.
    """
    if os.path.isfile(dl_pair_path):
        return statics.loadfrompickle(dl_pair_path)
    data_dict = {}
    count = 0
    for d in dataset_path_list:
        lsdir = os.listdir(d)
        for folder in lsdir:
            count = count + 1
            # NOTE: progress denominator is the current root's folder count
            # only (original behavior, preserved).
            sys.stdout.write("Create path dict:{}/{}\n".format(
                count, len(lsdir)))
            sys.stdout.flush()
            folderpath = os.path.join(d, folder)
            filelist = os.listdir(folderpath)
            data_dict[folder] = {}
            if len(filelist) > 1:
                filelist.remove('labels.txt')
                # One {filename: fullpath} entry per content file. (The
                # original also re-checked this was a list — a comprehension
                # always is, so that dead check is dropped.)
                filepath = [{x: os.path.join(folderpath, x)}
                            for x in filelist]
                data_dict[folder]['context'] = filepath
            else:
                data_dict[folder]['context'] = {}
            label_path = os.path.join(folderpath, 'labels.txt')
            # Bug fix: the original leaked the file handle via
            # open(label_path, "r").read().
            with open(label_path, "r") as label_file:
                text = label_file.read()
            if data_dict[folder]['context'] == {}:
                if len(text) == 0:
                    # Label file empty and no content: drop the folder from
                    # disk (dict entry retention preserved from original).
                    shutil.rmtree(folderpath)
                else:
                    sl = slipt_label(label_path)
                    if len(sl['description']) != 0:
                        # Synthesize a context file from the description lines.
                        context_path = os.path.join(folderpath, 'context.txt')
                        with open(context_path, 'w') as f:
                            for line in sl['description']:
                                f.write(line + " ")
                        # (Original had a dead `f.close` — no call parens, and
                        # redundant inside `with`; removed.)
                        data_dict[folder]['label'] = label_path
                        data_dict[folder]['context'] = context_path
                    else:
                        del data_dict[folder]
            else:
                data_dict[folder]['label'] = label_path
    # Bug fix: the original did `data_dict = statics.savetopickle(...)`,
    # overwriting the dict with savetopickle's return value before returning.
    # Elsewhere in this file savetopickle's return is never used, so save for
    # its side effect and return the dict itself.
    statics.savetopickle(dl_pair_path, data_dict)
    return data_dict
# NOTE(review): builds the final word and label dictionaries. Labels: per
# label type (OS/category/model) a 'UNK_<type>' slot is reserved, then each
# label of length > 1 is added; when the raw label string is already taken it
# is disambiguated by prefixing the type name (the `elif` branch). Words: the
# top Nword entries of the frequency-sorted word dict that contain no Chinese
# characters are kept (index 0 reserved for 'UNK'); the while loop indexes
# sorted_wdict[-count] and will raise IndexError if fewer than Nword
# non-Chinese words exist — TODO confirm inputs guarantee enough words.
# Labels and the question words are then appended to the word dict. The
# qwords entry " model" carries a leading space — presumably a typo; verify
# against consumers before changing. The collapsed whitespace makes the exact
# nesting risky to reconstruct, so the code is left byte-identical.
def create_wdict_ldict_general(Nword, wdict_path, ldict_path, final_wdict_path, final_ldict_path): qwords = ["what", "is", " model", "OS", "category"] wdict = statics.loadfrompickle(wdict_path) ldict = statics.loadfrompickle(ldict_path) sorted_wdict = sorted(wdict.items(), key=operator.itemgetter(1)) print(len(sorted_wdict)) r_wdict = {} #=============================================================== pure_dict = {} label_type = ['OS', 'category', 'model'] r_ldict = {} pure_ldict = {} count = 1 for lt in label_type: pure_ldict['UNK_' + lt] = len(pure_ldict) r_ldict[len(r_ldict)] = 'UNK_' + lt for l in ldict[lt]: if len(l) > 1 and l not in pure_ldict: r_ldict[count] = l pure_ldict[l] = len(pure_ldict) count = count + 1 elif len(l) > 1: l = lt + l r_ldict[count] = l pure_ldict[l] = len(pure_ldict) count = count + 1 print(l) #============================================================ r_wdict[0] = 'UNK' pure_dict['UNK'] = 0 idx = 1 count = 1 while idx < Nword + 1: # print("Create_dict:{}/{}".format(count, len(sorted_wdict))) if len(getChinese(sorted_wdict[-count][0])) == 0: r_wdict[idx] = sorted_wdict[-count][0] pure_dict[r_wdict[idx]] = idx idx = idx + 1 count = count + 1 for i in range(1, len(r_ldict)): if r_ldict[i] not in pure_dict: pure_dict[r_ldict[i]] = len(r_wdict) r_wdict[len(r_wdict)] = r_ldict[i] for i in range(len(qwords)): if qwords[i] not in pure_dict: pure_dict[qwords[i]] = len(r_wdict) r_wdict[len(r_wdict)] = qwords[i] # statics.savetopickle(final_wdict_path, pure_dict) statics.savetopickle(final_ldict_path, pure_ldict) # statics.savetopickle(final_rwdict_path, r_wdict) return pure_dict, pure_ldict, r_wdict
import statics

# Input: pretrained word vectors and precomputed tf-idf scores; output:
# tf-idf-weighted word vectors.
w2vfile = '/home/ubuntu/workspace/text_summary_data/w2v.pickle'
tfidf_path = '/home/ubuntu/workspace/text_summary_data/tfidf_score/tfidfscore.pickle'
save_path = '/home/ubuntu/workspace/text_summary_data/tfidf_score/w2v_tfidf.pickle'

w2v = statics.loadfrompickle(w2vfile)
tiidf = statics.loadfrompickle(tfidf_path)

# Scale each word's vector by its tf-idf weight, stored at tiidf[1][w][2]
# (presumably (tf, df, tf-idf) triples keyed by word — confirm against the
# tf-idf producer). Raises KeyError for words missing from the score table,
# matching the original loop's behavior.
tfidf_w2v = {w: w2v[w] * tiidf[1][w][2] for w in w2v}

statics.savetopickle(save_path, tfidf_w2v)