def load_data(loc='./data/'):
    """Load quora data."""
    df = du.load_csv(os.path.join(loc, 'train.csv'))
    trainA = df['question1']
    trainB = df['question2']
    trainS = df['is_duplicate']

    df = du.load_csv(os.path.join(loc, 'dev.csv'))
    devA = df['question1']
    devB = df['question2']
    devS = df['is_duplicate']

    df = du.load_csv(os.path.join(loc, 'valid.csv'))
    testA = df['question1']
    testB = df['question2']
    testS = df['is_duplicate']

    trainS = [int(s) for s in trainS]
    devS = [int(s) for s in devS]
    testS = [int(s) for s in testS]

    return [trainA, trainB], [devA, devB], [testA, testB], [trainS, devS, testS]
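# Illustrative usage of load_data above (not from the original source; assumes `du`
# is the project's data_utils module and train/dev/valid CSVs exist under ./data/):
train_pairs, dev_pairs, test_pairs, (trainS, devS, testS) = load_data('./data/')
trainA, trainB = train_pairs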
def load_quora_data(data_dir):
    train_file = os.path.join(data_dir, 'train.csv')
    test_file = os.path.join(data_dir, 'test.csv')
    train_table = du.load_csv(train_file)
    test_table = du.load_csv(test_file)
    return [train_table, test_table]
def __init__(self, data_queue, args, split='train', repeat=True):
    """Shapenet dataloader.

    Args:
        data_queue: multiprocessing queue where loaded data is stored.
        split: str in ('train', 'val', 'test'). Loads the corresponding dataset.
        repeat: repeats the epoch if true. Terminates after one epoch otherwise.
    """
    self.args = args
    self.split = split
    args.DATA_PATH = 'data/%s' % (args.dataset)
    classmap = load_csv(args.DATA_PATH + '/synsetoffset2category.txt')
    args.classmap = {}
    for i in range(classmap.shape[0]):
        args.classmap[str(classmap[i][1]).zfill(8)] = classmap[i][0]

    self.data_paths = sorted([
        os.path.join(args.DATA_PATH, split, 'partial', k.rstrip() + '.h5')
        for k in open(args.DATA_PATH + '/%s.list' % (split)).readlines()
    ])
    # Truncate to a whole number of batches.
    N = int(len(self.data_paths) / args.batch_size) * args.batch_size
    self.data_paths = self.data_paths[0:N]
    super().__init__(data_queue, self.data_paths, None, args.batch_size, repeat=repeat)
def run_eval(vector_dir, data_dir, output_dir, show_fig=False):
    print "Reading encodings..."
    trainA_vec = np.load(os.path.join(vector_dir, 'trainA_encodings.npy'))
    trainB_vec = np.load(os.path.join(vector_dir, 'trainB_encodings.npy'))

    print "Reading train.csv..."
    train = du.load_csv(os.path.join(data_dir, 'train.csv'))
    labels = train['is_duplicate']

    print "Calculating distances..."
    dists = np.asarray(
        [cosine(trainA_vec[i], trainB_vec[i]) for i in range(len(trainA_vec))])

    print "Finding best threshold distance..."
    threshold, log_loss = find_best_threshold(dists, labels)
    # success, loss = try_validation(data_dir, threshold)
    print "Best threshold={0:.4f}, which induces a log loss of {1:.4f}".format(
        threshold, log_loss)

    perf_file = os.path.join(output_dir, 'performance.txt')
    print "Saving performance info to " + perf_file
    write_performance(perf_file, threshold, log_loss)

    print "Generating plot and saving it to {0}".format(output_dir)
    plot_histogram(dists, labels, output_dir, threshold=threshold,
                   log_loss=log_loss, show_fig=show_fig)
def encode_skipthoughts(data_file, encoder, out_dir, prefix, verbose):
    print 'Loading quora data...'
    data = du.load_csv(data_file)

    print 'Computing skipthoughts...'
    q1_vecs = encoder.encode(data['question1'], verbose=verbose, use_eos=True)
    q2_vecs = encoder.encode(data['question2'], verbose=verbose, use_eos=True)

    # Save the encodings
    print 'Saving the encodings...'
    np.save(os.path.join(out_dir, prefix + 'q1_encodings'), q1_vecs)
    np.save(os.path.join(out_dir, prefix + 'q2_encodings'), q2_vecs)
def main():
    parser = argparse.ArgumentParser(
        description='Evaluate a model on the quora question-pair dataset.')
    parser.add_argument('--quora-data-dir', required=True,
                        help='path to the directory containing the quora data')
    parser.add_argument('--st-model-dir', required=True,
                        help='path to the directory containing the skipthoughts model')
    parser.add_argument('--output-dir', default='.',
                        help='path to the directory to write to')
    parser.add_argument('-v', '--verbose', action='store_true')
    args = parser.parse_args()

    output_file = os.path.join(args.output_dir, 'oov_stats.txt')
    with log.FileWriterStdoutPrinter(output_file) as writer:
        print "Loading skipthoughts model..."
        model = st.load_model(args.st_model_dir)
        print "Initializing skipthoughts word dict..."
        word_dict = st.init_word_dict(model)
        print "Analyzing word dict..."
        analyze_dict(word_dict, writer)

        print "Loading training set..."
        train = du.load_csv(os.path.join(args.quora_data_dir, 'train.csv'))
        writer.emit_line("Analyzing word counts in train.csv...")
        analyze_oov(word_dict, train, writer, args.output_dir, 'train')

        # Be sure to write data to disk for train before moving on to test,
        # which is much bigger.
        print "Loading test set..."
        test = du.load_csv(os.path.join(args.quora_data_dir, 'test.csv'))
        writer.emit_line("Analyzing word counts in test.csv...")
        analyze_oov(word_dict, test, writer, args.output_dir, 'test')
def __init__(self, data_queue, args, split='train', repeat=True):
    """Shapenet dataloader.

    Args:
        data_queue: multiprocessing queue where loaded data is stored.
        split: str in ('train', 'val', 'test'). Loads the corresponding dataset.
        repeat: repeats the epoch if true. Terminates after one epoch otherwise.
    """
    self.args = args
    self.split = split
    args.DATA_PATH = 'data/%s' % (args.dataset)
    classmap = load_csv(args.DATA_PATH + '/synsetoffset2category.txt')
    args.classmap = {}
    if args.dataset == 'facade':
        for i in range(classmap.shape[0]):
            args.classmap[str(classmap[i][1]).zfill(2)] = classmap[i][0]
    else:
        for i in range(classmap.shape[0]):
            args.classmap[str(classmap[i][1]).zfill(8)] = classmap[i][0]

    self.data_paths = []
    for k in open(args.DATA_PATH + '/%s.list' % (split)).readlines():
        if split == 'train' and not k.startswith(args.scene):
            self.data_paths.append(
                os.path.join(args.DATA_PATH, split, 'partial', k.rstrip() + '.h5'))
        if split == 'val' and k.startswith(args.scene):
            # Workaround to make sure we have at least `batch_size` data_paths:
            s = os.path.join(args.DATA_PATH, split, 'partial', k.rstrip() + '.h5')
            for i in range(args.batch_size):
                self.data_paths.append(s)
    self.data_paths = sorted(self.data_paths)
    print('data_paths', len(self.data_paths))

    # Truncate to a whole number of batches.
    N = int(len(self.data_paths) / args.batch_size) * args.batch_size
    self.data_paths = self.data_paths[0:N]
    print('DataProcess', split, 'scene', args.scene, len(self.data_paths))
    super().__init__(data_queue, self.data_paths, None, args.batch_size, repeat=repeat)
def generate_from_file(self, dataset, smooth=True, discretize_params=[]):
    _df = load_csv(dataset.path)
    _smooth = smooth
    _continuous_features = dataset.continuous_features
    _discrete_features = dataset.discrete_features

    if len(discretize_params) > 0:
        discrete_columns = [dp.feature_name for dp in discretize_params]
        _discrete_features = list(set(_discrete_features + discrete_columns))
        _continuous_features = [
            c for c in _continuous_features if c not in discrete_columns
        ]
        _df = discretize_continuous_features(dataset.name, _df, discretize_params)

    _discrete_features_values = {}
    for discrete_feature in _discrete_features:
        _discrete_features_values[discrete_feature] = list(
            pd.unique(_df[discrete_feature].values.ravel('K')))

    return self(_df, _discrete_features, _discrete_features_values,
                _continuous_features, smooth)
def main():
    parser = argparse.ArgumentParser(
        description='Split train.csv into train, dev, and test splits. '
        'Specify dev and validation set sizes with args; the remainder is used for training.')
    parser.add_argument(
        '--dataset-file', required=True,
        help='path to the train.csv file containing the quora training data')
    parser.add_argument('--ndev', type=int, default=10000,
                        help='size of dev set to create')
    parser.add_argument('--nvalid', type=int, default=50000,
                        help='size of validation set to create')
    parser.add_argument(
        '--output-dir', required=True,
        help='directory to which to write train.csv, dev.csv, and valid.csv')
    parser.add_argument(
        '--seed', type=int,
        help='optional random seed for reproducibility between multiple uses of this tool')
    args = parser.parse_args()

    data = du.load_csv(args.dataset_file)
    shuffled = du.shuffle(data, args.seed)
    ntrain = len(data) - args.ndev - args.nvalid
    train, dev, valid = du.split(shuffled, ntrain, args.ndev, args.nvalid)

    du.write_csv(train, os.path.join(args.output_dir, 'train.csv'))
    du.write_csv(dev, os.path.join(args.output_dir, 'dev.csv'))
    du.write_csv(valid, os.path.join(args.output_dir, 'valid.csv'))
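if __name__ == '__main__':
    # Typical entry point; the invocation below is illustrative only (the script
    # name and paths are assumptions, not taken from the original source):
    #   python split_quora_data.py --dataset-file data/train.csv \
    #       --ndev 10000 --nvalid 50000 --output-dir data/splits --seed 42
    main()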
def load_data(self):
    train_data, val_data, test_data = load_csv(self.args.data_path, self.args.data)
    self.train_y = to_categorical(train_data.valid)
    self.val_y = to_categorical(val_data.valid)
    self.test_y = to_categorical(test_data.valid)

    print('Load word embedding')
    self.word2idx, idx2word, self.word_vectors = load_word_embedding(
        self.args.embedding_path, self.args.word_embedding, self.args.data)

    print('Load graph embedding')
    cpc_embed_dict, ipc_embed_dict, uspc_embed_dict = load_code_embeddings(
        self.args.embedding_path, self.args.code_embedding, self.args.data)
    self.cpc2idx, idx2cpc, self.cpc_vectors = create_code_vocab(cpc_embed_dict)
    self.ipc2idx, idx2ipc, self.ipc_vectors = create_code_vocab(ipc_embed_dict)
    self.uspc2idx, idx2uspc, self.uspc_vectors = create_code_vocab(uspc_embed_dict)
    self.max_cpc_len, self.max_ipc_len, self.max_uspc_len = get_code_length(train_data)

    print('Preparing train data')
    train_cpcs, train_ipcs, train_uspcs = convert_code_to_idx(
        train_data, self.max_cpc_len, self.max_ipc_len, self.max_uspc_len,
        self.cpc2idx, self.ipc2idx, self.uspc2idx)
    train_abs_sequence = get_text_sequence(train_data.abstract_text,
                                           self.word2idx, self.args.max_length)
    self.train.append(train_cpcs)
    self.train.append(train_ipcs)
    self.train.append(train_uspcs)
    self.train.append(train_abs_sequence)

    print('Preparing validation data')
    val_cpcs, val_ipcs, val_uspcs = convert_code_to_idx(
        val_data, self.max_cpc_len, self.max_ipc_len, self.max_uspc_len,
        self.cpc2idx, self.ipc2idx, self.uspc2idx)
    val_abs_sequence = get_text_sequence(val_data.abstract_text,
                                         self.word2idx, self.args.max_length)
    self.val.append(val_cpcs)
    self.val.append(val_ipcs)
    self.val.append(val_uspcs)
    self.val.append(val_abs_sequence)

    print('Preparing test data')
    test_cpcs, test_ipcs, test_uspcs = convert_code_to_idx(
        test_data, self.max_cpc_len, self.max_ipc_len, self.max_uspc_len,
        self.cpc2idx, self.ipc2idx, self.uspc2idx)
    test_abs_sequence = get_text_sequence(test_data.abstract_text,
                                          self.word2idx, self.args.max_length)
    self.test.append(test_cpcs)
    self.test.append(test_ipcs)
    self.test.append(test_uspcs)
    self.test.append(test_abs_sequence)
from __future__ import print_function

import numpy as np
import tflearn

# Load CSV file, indicate that the first column represents labels
from data_utils import load_csv
data, labels = load_csv('titanic_dataset.csv', target_column=0,
                        categorical_labels=True, n_classes=2)

# Preprocessing function
def preprocess(data, columns_to_ignore):
    # Sort by descending id and delete columns
    for id in sorted(columns_to_ignore, reverse=True):
        [r.pop(id) for r in data]
    for i in range(len(data)):
        # Converting 'sex' field to float (id is 1 after removing labels column)
        data[i][1] = 1. if data[i][1] == 'female' else 0.
    return np.array(data, dtype=np.float32)

# Ignore 'name' and 'ticket' columns (id 1 & 6 of data array)
to_ignore = [1, 6]

# Preprocess data
data = preprocess(data, to_ignore)

# Build neural network
net = tflearn.input_data(shape=[None, 6])
net = tflearn.fully_connected(net, 32)
net = tflearn.fully_connected(net, 32)
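# The excerpt stops mid-network. The TFLearn titanic quickstart typically finishes
# and trains it as below; this continuation is an assumed sketch, not part of the
# original excerpt.
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net)

# Define model and start training
model = tflearn.DNN(net)
model.fit(data, labels, n_epoch=10, batch_size=16, show_metric=True)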
    counter, q1_row_idx = idx_vec(counter, q1_words)
    counter, q2_row_idx = idx_vec(counter, q2_words)
    q1_idx = np.vstack((q1_idx, q1_row_idx))
    q2_idx = np.vstack((q2_idx, q2_row_idx))
    if counter % 10000 == 0:
        print(counter)
    # input_matrix.tofile('sub_matrix2_{}.bin'.format(first_id))
    # q1_idx.tofile('sub_q1_{}.bin'.format(first_id))
    # q2_idx.tofile('sub_q2_{}.bin'.format(first_id))
    return (input_matrix, q1_idx[1:, :], q2_idx[1:, :])


# print("input_matrix shape {}".format(input_matrix.shape))
# print("input mat shape is {}".format(input_matrix.shape))
dataframe = load_csv(filename)
# print("df shape {}".format(dataframe.shape))
num_processors = mp.cpu_count()
subframe = np.array_split(dataframe, 6)[5]
chunks = np.array_split(subframe, num_processors)

# Create our pool with `num_processors` processes
pool = mp.Pool(processes=num_processors)

# Apply our function to each chunk in the list
listResults = pool.map(process_chunk, chunks)

counter = 0
mat_list = []
q1_list = []
q2_list = []
for result in listResults:
    sub_mat, sub_q1_idx, sub_q2_idx = result
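    # The excerpt ends mid-loop. A plausible continuation (an assumption, not part of
    # the original source) appends each chunk's output and stacks the pieces afterwards:
    mat_list.append(sub_mat)
    q1_list.append(sub_q1_idx)
    q2_list.append(sub_q2_idx)

input_matrix = np.vstack(mat_list)
q1_idx = np.vstack(q1_list)
q2_idx = np.vstack(q2_list)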