Code Example #1
def merge_loop(double_set, root_name, file=None):
    """
    进行团合并操作,循环直到不能合并
    :param double_set:强相关的两两组合
    :param root_name:词根列表
    :param file:对聚类结果进行dump的目标路径
    :return:团成员最大数,最终的团
    """
    best_set = set()
    old_set = double_set
    num_list = []
    count_list = []
    group_list = []
    while len(old_set) > 0:
        # old_set holds the cliques that still need to be merged
        print('members:', len(list(old_set)[0]))  # number of members per clique in old_set
        print('count:', len(old_set))  # number of cliques in old_set
        print(old_set)
        num_list.append(len(list(old_set)[0]))
        count_list.append(len(old_set))
        group_list.append(old_set)
        best_set = old_set
        old_set = merge_group(old_set, double_set)  # newly merged cliques; keep merging these
    # When old_set is empty, the clustering has converged: the largest cliques have been
    # formed and no further merging is possible.
    if file is not None:
        group_list = index_2_word(root_name, group_list)
        write_csv(['members', 'count', 'clique'], file, [num_list, count_list, group_list])
        save_pickle(file + '.pkl', group_list)
    print(best_set)
    return len(list(best_set)[0]), best_set
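Each example on this page calls a small project-local save_pickle helper (and a matching loader such as load_pickle or load_serial). The helpers themselves are not shown, and their argument order differs between projects (path-first in examples #1, #3 and #5; object-first in examples #2 and #4). A minimal sketch of such a pair, assuming a path-first signature, is:

# Minimal sketch of the pickle helpers assumed by these examples; the projects' own
# implementations may differ (in particular, some take (obj, path) instead of (path, obj)).
import pickle

def save_pickle(path, obj):
    # Serialize obj to the given path.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_pickle(path):
    # Load and return the object stored at the given path.
    with open(path, 'rb') as f:
        return pickle.load(f)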
Code Example #2
def prepare_sif():
    # Load the corpus and its vocabulary.
    corpus, vocab = load_corpus()

    # Word frequencies over the corpus, and SIF-style word weights derived from them.
    word_freq = calcu_word_freq(corpus, min_freq=1)
    word_weight = calcu_word_weight(word_freq)

    # Word-to-id mapping, id-to-embedding matrix, and the per-id weight table.
    word2id, id2emb = load_emb_matrix(vocab)
    id2weight = calcu_id_weight(word2id, word_weight)

    # Persist the artifacts for later embedding computation.
    dump_json(word2id, config.word_id_path)
    save_pickle(id2emb, config.id_emb_path)
    save_pickle(id2weight, config.id_weight_path)
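calcu_word_weight is not shown in example #2. In SIF (smooth inverse frequency) sentence embeddings, a word w is typically weighted as a / (a + p(w)), where p(w) is its relative frequency and a is a small constant (around 1e-3). A hypothetical sketch under that assumption:

# Hypothetical SIF weighting step; the project's actual calcu_word_weight may differ.
def calcu_word_weight(word_freq, a=1e-3):
    # word_freq: dict mapping word -> raw count; returns word -> a / (a + p(w)).
    total = sum(word_freq.values())
    return {word: a / (a + count / total) for word, count in word_freq.items()}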
Code Example #3
#%% Add dummy variables
import numpy as np

import data_utils

# x_train, y_train, x_val, y_val are assumed to have been loaded earlier in the script.
# Append a fifth "none of the above" column that is set to 1 whenever the first four
# label columns of a row are all zero.
y_val_new = np.append(y_val[:,0:4], np.zeros((len(y_val),1)), axis = 1)

for i in range(len(y_val_new)):
    if sum(y_val_new[i]) == 0:
        y_val_new[i,4] = 1

y_train_new = np.append(y_train[:,0:4], np.zeros((len(y_train),1)), axis = 1)

for i in range(len(y_train_new)):
    if sum(y_train_new[i]) == 0:
        y_train_new[i,4] = 1
        
data_utils.save_pickle("/Users/ignacio/Documents/Universidad/Master/Segundo/SegundoSemestre/Computer vision/Project/x_train_dummy.pkl",x_train)
data_utils.save_pickle("/Users/ignacio/Documents/Universidad/Master/Segundo/SegundoSemestre/Computer vision/Project/y_train_dummy.pkl",y_train_new)
data_utils.save_pickle("/Users/ignacio/Documents/Universidad/Master/Segundo/SegundoSemestre/Computer vision/Project/x_val_dummy.pkl",x_val)
data_utils.save_pickle("/Users/ignacio/Documents/Universidad/Master/Segundo/SegundoSemestre/Computer vision/Project/y_val_dummy.pkl",y_val_new)        


#%% Generate new data
from keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(rotation_range=40, width_shift_range=0.2, height_shift_range=0.2,
                             rescale=1./255, shear_range=0.2, zoom_range=0.2,
                             horizontal_flip=True, fill_mode='nearest')
x_train_big = x_train.reshape((1,) + x_train.shape)

# Take a single image and add a batch dimension so it can be fed to datagen.flow().
x = x_train_big[0,1]
x = x.reshape((1,) + x.shape)
i = 0
for batch in datagen.flow(x, batch_size=1,
                          save_to_dir='preview', save_prefix='cat', save_format='jpeg'):
    i += 1
    if i > 20:
        break  # stop after 20 augmented previews; otherwise flow() loops forever
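The row-by-row loops in the first cell above can also be expressed as a single vectorized NumPy operation; a small equivalent sketch (the helper name is illustrative, not from the original script):

# Vectorized equivalent of the dummy-column loops: append a fifth column that is 1
# exactly where the first four label columns are all zero.
def add_dummy_column(y):
    dummy = (y[:, 0:4].sum(axis=1) == 0).astype(y.dtype)
    return np.append(y[:, 0:4], dummy[:, None], axis=1)

# y_val_new = add_dummy_column(y_val)
# y_train_new = add_dummy_column(y_train)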
Code Example #4
# This excerpt begins inside an if/else block that either rebuilds the dataset or loads a
# cached one; the condition name below is hypothetical (the original statement is not shown).
if rebuild_dataset:
    X_test_imgs, X_test_embs = get_metadata('data/test/')
    y_test = np.array([2, 3, 2, 2, 2, 3, 3, 3, 2, 2, 0])
    y_test_names = np.array([
        'Nithin', 'Kunal', 'Nithin', 'Nithin', 'Nithin', 'Kunal', 'Kunal',
        'Kunal', 'Nithin', 'Nithin', 'Shreyas'
    ])
    data = {
        'X_train_embs': X_train_embs,
        'X_train_imgs': X_train_imgs,
        'y_train': y_train,
        'X_test_embs': X_test_embs,
        'X_test_imgs': X_test_imgs,
        'y_test': y_test
    }

    d.save_pickle(data, 'dataset.pickle')

else:
    data = d.load_serial('dataset.pickle')

    X_train_embs = data['X_train_embs']
    X_train_imgs = data['X_train_imgs']
    y_train = data['y_train']
    y_train_names = np.array([
        'Mazin', 'Mazin', 'Mazin', 'Nithin', 'Nithin', 'Nithin', 'Kunal',
        'Kunal', 'Kunal', 'Shreyas', 'Shreyas', 'Shreyas', 'Shreyas'
    ])

    X_test_embs = data['X_test_embs']
    X_test_imgs = data['X_test_imgs']
    y_test = data['y_test']
Code Example #5
File: divide.py  Project: JoungheeKim/uda_pytorch
def split_files(args):
    assert os.path.isfile(
        args.label_file), 'there is no label file, --label_file [{}]'.format(
            args.label_file)
    dirname, filename = os.path.split(args.label_file)
    data = load_pickle(args.label_file)

    ## SPLIT data: first carve out a stratified labeled training set, then split the
    ## remainder into a validation set and an unlabeled pool.
    train_idx, leftover_idx, _, leftover_label = train_test_split(
        list(range(len(data['label']))),
        data['label'],
        train_size=args.labeled_data_size,
        stratify=data['label'])
    if len(leftover_idx) > args.valid_data_size:
        valid_idx, unlabel_idx, _, _ = train_test_split(
            leftover_idx,
            leftover_label,
            train_size=args.valid_data_size,
            stratify=leftover_label)
    else:
        valid_idx = leftover_idx
        unlabel_idx = []

    train_data = dict((key, np.array(item)[train_idx].tolist())
                      for key, item in data.items())
    valid_data = dict((key, np.array(item)[valid_idx].tolist())
                      for key, item in data.items())
    unlabel_data = dict((key, np.array(item)[unlabel_idx].tolist())
                        for key, item in data.items())

    if args.unlabel_file is not None and os.path.isfile(args.unlabel_file):
        additional_data = load_pickle(args.unlabel_file)
        for key in unlabel_data.keys():
            unlabel_data[key] += additional_data[key]

    if args.train_file is None:
        args.train_file = TRAIN_NAME.format(args.labeled_data_size,
                                            args.valid_data_size)
    train_path = os.path.join(args.output_dir, args.train_file)
    save_pickle(train_path, train_data)
    try:
        os.remove(os.path.join(args.output_dir, "cache_" + args.train_file))
    except OSError:
        pass  # no stale cache file to remove

    if args.valid_file is None:
        args.valid_file = VALID_NAME.format(args.labeled_data_size,
                                            args.valid_data_size)
    valid_path = os.path.join(args.output_dir, args.valid_file)
    save_pickle(valid_path, valid_data)
    try:
        os.remove(os.path.join(args.output_dir, "cache_" + args.valid_file))
    except OSError:
        pass  # no stale cache file to remove

    if args.augment_file is None:
        args.augment_file = AUGMENT_NAME.format(args.labeled_data_size,
                                                args.valid_data_size)
    augment_path = os.path.join(args.output_dir, args.augment_file)
    save_pickle(augment_path, unlabel_data)
    try:
        os.remove(os.path.join(args.output_dir, "cache_" + args.augment_file))
    except OSError:
        pass  # no stale cache file to remove

    args.train_file = train_path
    args.valid_file = valid_path
    args.augment_file = augment_path
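split_files expects an argparse-style namespace. A hypothetical driver is sketched below; the attribute names mirror those accessed in the function, but the defaults and the actual command-line interface of divide.py are assumptions:

# Hypothetical driver for split_files; the real CLI of divide.py may differ.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--label_file', required=True)
parser.add_argument('--unlabel_file', default=None)
parser.add_argument('--output_dir', default='.')
parser.add_argument('--labeled_data_size', type=int, default=1000)
parser.add_argument('--valid_data_size', type=int, default=1000)
parser.add_argument('--train_file', default=None)
parser.add_argument('--valid_file', default=None)
parser.add_argument('--augment_file', default=None)
args = parser.parse_args()
split_files(args)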