def merge_loop(double_set, root_name, file=None):
    """
    Merge cliques repeatedly, looping until no further merging is possible.
    :param double_set: strongly correlated pairs (two-member cliques)
    :param root_name: list of word roots
    :param file: target path for dumping the clustering result
    :return: size of the largest clique, and the final set of cliques
    """
    best_set = set()
    old_set = double_set
    num_list = []
    count_list = []
    group_list = []
    while len(old_set) > 0:
        # old_set holds the cliques that still need to be merged
        print('clique size:', len(list(old_set)[0]))  # members per clique in old_set
        print('clique count:', len(old_set))          # number of cliques in old_set
        print(old_set)
        num_list.append(len(list(old_set)[0]))
        count_list.append(len(old_set))
        group_list.append(old_set)
        best_set = old_set
        # merge_group returns the newly formed cliques; keep merging those
        old_set = merge_group(old_set, double_set)
        # once old_set is empty the clustering has converged: the largest
        # cliques have been reached and nothing more can be merged
    if file is not None:
        group_list = index_2_word(root_name, group_list)
        write_csv(['clique size', 'clique count', 'cliques'], file,
                  [num_list, count_list, group_list])
        save_pickle(file + '.pkl', group_list)
    print(best_set)
    return len(list(best_set)[0]), best_set
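# merge_group is called above but not defined in this section. The sketch
# below is a hypothetical reconstruction, assuming cliques are frozensets and
# double_set holds the strongly correlated pairs as two-member frozensets:
# two size-k cliques merge into a size-(k+1) clique only when every pair
# inside the result is itself strongly correlated.
from itertools import combinations

def merge_group(old_set, double_set):
    k = len(next(iter(old_set)))
    new_set = set()
    for a, b in combinations(old_set, 2):
        merged = frozenset(a) | frozenset(b)
        if len(merged) != k + 1:
            continue  # the two cliques must overlap in exactly k - 1 members
        # keep the merged clique only if all internal pairs are strong links
        if all(frozenset(p) in double_set for p in combinations(merged, 2)):
            new_set.add(merged)
    return new_set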
def prepare_sif():
    # Build the artifacts needed for SIF sentence embeddings:
    # word frequencies -> per-word weights -> id-indexed embeddings and weights.
    corpus, vocab = load_corpus()
    word_freq = calcu_word_freq(corpus, min_freq=1)
    word_weight = calcu_word_weight(word_freq)
    word2id, id2emb = load_emb_matrix(vocab)
    id2weight = calcu_id_weight(word2id, word_weight)
    dump_json(word2id, config.word_id_path)
    save_pickle(id2emb, config.id_emb_path)
    save_pickle(id2weight, config.id_weight_path)
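# calcu_word_weight is not defined in this section. Since prepare_sif builds
# SIF (smooth inverse frequency) artifacts, a minimal sketch under the usual
# SIF formula weight(w) = a / (a + p(w)) (Arora et al., 2017) follows; the
# smoothing constant `a` is an assumed default, not taken from the source.
def calcu_word_weight(word_freq, a=1e-3):
    total = sum(word_freq.values())
    # p(w) is the word's relative frequency; rarer words get weights near 1
    return {word: a / (a + freq / total) for word, freq in word_freq.items()}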
#%% Add dummy variables
import numpy as np
from keras.preprocessing.image import ImageDataGenerator

import data_utils

# Append a fifth "none of the above" column: rows whose first four labels
# are all zero get the dummy class set to 1.
y_val_new = np.append(y_val[:, 0:4], np.zeros((len(y_val), 1)), axis=1)
for i in range(len(y_val_new)):
    if sum(y_val_new[i]) == 0:
        y_val_new[i, 4] = 1

y_train_new = np.append(y_train[:, 0:4], np.zeros((len(y_train), 1)), axis=1)
for i in range(len(y_train_new)):
    if sum(y_train_new[i]) == 0:
        y_train_new[i, 4] = 1

data_utils.save_pickle("/Users/ignacio/Documents/Universidad/Master/Segundo/SegundoSemestre/Computer vision/Project/x_train_dummy.pkl", x_train)
data_utils.save_pickle("/Users/ignacio/Documents/Universidad/Master/Segundo/SegundoSemestre/Computer vision/Project/y_train_dummy.pkl", y_train_new)
data_utils.save_pickle("/Users/ignacio/Documents/Universidad/Master/Segundo/SegundoSemestre/Computer vision/Project/x_val_dummy.pkl", x_val)
data_utils.save_pickle("/Users/ignacio/Documents/Universidad/Master/Segundo/SegundoSemestre/Computer vision/Project/y_val_dummy.pkl", y_val_new)

#%% Generate new data
datagen = ImageDataGenerator(rotation_range=40,
                             width_shift_range=0.2,
                             height_shift_range=0.2,
                             rescale=1. / 255,
                             shear_range=0.2,
                             zoom_range=0.2,
                             horizontal_flip=True,
                             fill_mode='nearest')

# Take a single image and reshape it to (1, height, width, channels)
# so it can be fed to datagen.flow.
x_train_big = x_train.reshape((1,) + x_train.shape)
x = x_train_big[0, 1]
x = x.reshape((1,) + x.shape)
i = 0
for batch in datagen.flow(x, batch_size=1, save_to_dir='preview',
                          save_prefix='cat', save_format='jpeg'):
    i += 1
    if i > 20:
        break  # the flow generator loops indefinitely; stop after 20 images
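# The two per-row loops above can be collapsed into a single vectorized step.
# An equivalent sketch (add_dummy_column is a name introduced here, not from
# the original code):
def add_dummy_column(y):
    # 1 exactly when the first four labels are all zero
    dummy = (y[:, 0:4].sum(axis=1) == 0).astype(y.dtype).reshape(-1, 1)
    return np.append(y[:, 0:4], dummy, axis=1)

# y_val_new = add_dummy_column(y_val)
# y_train_new = add_dummy_column(y_train)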
# NOTE: this fragment is the tail of a dataset-caching block; the original
# `if` guard (and the code building X_train_imgs/X_train_embs/y_train) is not
# shown. A file-existence check stands in for it here as a hypothetical guard.
if not os.path.exists('dataset.pickle'):
    X_test_imgs, X_test_embs = get_metadata('data/test/')
    y_test = np.array([2, 3, 2, 2, 2, 3, 3, 3, 2, 2, 0])
    y_test_names = np.array([
        'Nithin', 'Kunal', 'Nithin', 'Nithin', 'Nithin', 'Kunal',
        'Kunal', 'Kunal', 'Nithin', 'Nithin', 'Shreyas'
    ])
    data = {
        'X_train_embs': X_train_embs,
        'X_train_imgs': X_train_imgs,
        'y_train': y_train,
        'X_test_embs': X_test_embs,
        'X_test_imgs': X_test_imgs,
        'y_test': y_test
    }
    d.save_pickle(data, 'dataset.pickle')
else:
    # Load the cached dataset instead of rebuilding it.
    data = d.load_serial('dataset.pickle')
    X_train_embs = data['X_train_embs']
    X_train_imgs = data['X_train_imgs']
    y_train = data['y_train']
    y_train_names = np.array([
        'Mazin', 'Mazin', 'Mazin', 'Nithin', 'Nithin', 'Nithin',
        'Kunal', 'Kunal', 'Kunal', 'Shreyas', 'Shreyas', 'Shreyas', 'Shreyas'
    ])
    X_test_embs = data['X_test_embs']
    X_test_imgs = data['X_test_imgs']
    y_test = data['y_test']
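# The helpers d.save_pickle and d.load_serial are not shown in this section.
# Minimal sketches matching the call signatures used above (the module name
# `d` and the exact behavior are assumptions):
import pickle

def save_pickle(data, path):
    # serialize an object to disk with pickle
    with open(path, 'wb') as f:
        pickle.dump(data, f)

def load_serial(path):
    # load a pickled object back from disk
    with open(path, 'rb') as f:
        return pickle.load(f)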
def split_files(args):
    assert os.path.isfile(args.label_file), \
        'there is no label file, --label_file [{}]'.format(args.label_file)
    dirname, filename = os.path.split(args.label_file)
    data = load_pickle(args.label_file)

    ## SPLIT data: a stratified labeled/leftover split first, then carve the
    ## validation set out of the leftover portion.
    train_idx, leftover_idx, _, leftover_label = train_test_split(
        list(range(len(data['label']))),
        data['label'],
        train_size=args.labeled_data_size,
        stratify=data['label'])
    if len(leftover_idx) > args.valid_data_size:
        valid_idx, unlabel_idx, _, _ = train_test_split(
            leftover_idx,
            leftover_label,
            train_size=args.valid_data_size,
            stratify=leftover_label)
    else:
        valid_idx = leftover_idx
        unlabel_idx = []

    train_data = {key: np.array(item)[train_idx].tolist() for key, item in data.items()}
    valid_data = {key: np.array(item)[valid_idx].tolist() for key, item in data.items()}
    unlabel_data = {key: np.array(item)[unlabel_idx].tolist() for key, item in data.items()}

    # Optionally extend the unlabeled pool with an extra file.
    if args.unlabel_file is not None and os.path.isfile(args.unlabel_file):
        additional_data = load_pickle(args.unlabel_file)
        for key in unlabel_data.keys():
            unlabel_data[key] += additional_data[key]

    def dump_split(filename, split):
        # Write the split and drop any stale cache file left from an earlier run.
        path = os.path.join(args.output_dir, filename)
        save_pickle(path, split)
        try:
            os.remove(os.path.join(args.output_dir, 'cache_' + filename))
        except OSError:
            pass
        return path

    if args.train_file is None:
        args.train_file = TRAIN_NAME.format(args.labeled_data_size, args.valid_data_size)
    train_path = dump_split(args.train_file, train_data)

    if args.valid_file is None:
        args.valid_file = VALID_NAME.format(args.labeled_data_size, args.valid_data_size)
    valid_path = dump_split(args.valid_file, valid_data)

    if args.augment_file is None:
        args.augment_file = AUGMENT_NAME.format(args.labeled_data_size, args.valid_data_size)
    augment_path = dump_split(args.augment_file, unlabel_data)

    args.train_file = train_path
    args.valid_file = valid_path
    args.augment_file = augment_path
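# split_files expects an argparse-style namespace. A hypothetical invocation
# (all paths and sizes below are made up for illustration):
from types import SimpleNamespace

args = SimpleNamespace(
    label_file='data/labels.pkl',  # pickled dict with a 'label' key
    unlabel_file=None,             # optional extra unlabeled pool
    labeled_data_size=1000,        # stratified labeled-set size
    valid_data_size=500,           # stratified validation-set size
    output_dir='data/splits',
    train_file=None,               # None -> derive from TRAIN_NAME
    valid_file=None,               # None -> derive from VALID_NAME
    augment_file=None,             # None -> derive from AUGMENT_NAME
)
split_files(args)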