Example #1
def load_data(loc='./data/'):
    """
    Load quora data
    """

    df = du.load_csv(os.path.join(loc, 'train.csv'))
    trainA = df['question1']
    trainB = df['question2']
    trainS = df['is_duplicate']

    df = du.load_csv(os.path.join(loc, 'dev.csv'))
    devA = df['question1']
    devB = df['question2']
    devS = df['is_duplicate']

    df = du.load_csv(os.path.join(loc, 'valid.csv'))
    testA = df['question1']
    testB = df['question2']
    testS = df['is_duplicate']

    trainS = [int(s) for s in trainS]
    devS = [int(s) for s in devS]
    testS = [int(s) for s in testS]

    return ([trainA, trainB], [devA, devB], [testA, testB],
            [trainS, devS, testS])
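The Quora examples above and below all go through a small data_utils helper imported as du. As a hedged sketch only (the real module may do more, e.g. encoding or column handling), du.load_csv is assumed to be a thin wrapper over pandas.read_csv returning a DataFrame with question1, question2, and is_duplicate columns:

# Hedged sketch of the assumed data_utils.load_csv helper; the actual
# implementation in the source repository may differ.
import pandas as pd

def load_csv(path):
    # Return the Quora question-pairs CSV as a pandas DataFrame.
    return pd.read_csv(path)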
Example #2
def load_quora_data(data_dir):
    train_file = os.path.join(data_dir, 'train.csv')
    test_file = os.path.join(data_dir, 'test.csv')

    train_table = du.load_csv(train_file)
    test_table = du.load_csv(test_file)

    return [train_table, test_table]
Example #3
    def __init__(self, data_queue, args, split='train', repeat=True):
        """Shapenet dataloader.

        Args:
            data_queue: multiprocessing queue where data is stored at.
            split: str in ('train', 'val', 'test'). Loads corresponding dataset.
            repeat: repeats epoch if true. Terminates after one epoch otherwise.
        """
        self.args = args
        self.split = split
        args.DATA_PATH = 'data/%s' % (args.dataset)
        classmap = load_csv(args.DATA_PATH + '/synsetoffset2category.txt')
        args.classmap = {}
        for i in range(classmap.shape[0]):
            args.classmap[str(classmap[i][1]).zfill(8)] = classmap[i][0]

        self.data_paths = sorted([
            os.path.join(args.DATA_PATH, split, 'partial',
                         k.rstrip() + '.h5')
            for k in open(args.DATA_PATH + '/%s.list' % (split)).readlines()
        ])
        N = (len(self.data_paths) // args.batch_size) * args.batch_size
        self.data_paths = self.data_paths[0:N]
        super().__init__(data_queue,
                         self.data_paths,
                         None,
                         args.batch_size,
                         repeat=repeat)
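In the ShapeNet examples, load_csv is evidently expected to return a two-column, row-indexable table (category name, synset offset) read from synsetoffset2category.txt. A minimal sketch under that assumption, using numpy and assuming the standard tab-separated ShapeNet layout:

# Hedged sketch: load synsetoffset2category.txt as an (N, 2) string array so
# classmap[i][0] is the category name and classmap[i][1] the synset offset.
import numpy as np

def load_csv(path):
    return np.loadtxt(path, dtype=str, delimiter='\t')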
Example #4
def run_eval(vector_dir, data_dir, output_dir, show_fig=False):

    print "Reading encodings..."
    trainA_vec = np.load(os.path.join(vector_dir, 'trainA_encodings.npy'))
    trainB_vec = np.load(os.path.join(vector_dir, 'trainB_encodings.npy'))

    print "Reading train.csv..."
    train = du.load_csv(os.path.join(data_dir, 'train.csv'))
    labels = train['is_duplicate']

    print "Calculating distances..."
    dists = np.asarray(
        [cosine(trainA_vec[i], trainB_vec[i]) for i in range(len(trainA_vec))])

    print "Finding best threshold distance..."
    threshold, log_loss = find_best_threshold(dists, labels)
    # success, loss = try_validation(data_dir, threshold)
    print "Best threshold={0:.4f}, which induces a log loss of {1:.4f}".format(
        threshold, log_loss)

    perf_file = os.path.join(output_dir, 'performance.txt')
    print "Saving performance info to " + perf_file
    write_performance(perf_file, threshold, log_loss)

    print "Generating plot and saving it to {0}".format(output_dir)
    plot_histogram(dists,
                   labels,
                   output_dir,
                   threshold=threshold,
                   log_loss=log_loss,
                   show_fig=show_fig)
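find_best_threshold is specific to this repository and is not shown on this page. As a hedged sketch of the general idea only, one simple version sweeps candidate cosine-distance thresholds and keeps the one whose hard duplicate/non-duplicate predictions give the lowest log loss (the actual implementation may differ, e.g. by calibrating probabilities instead):

# Illustrative threshold search; not the repository's implementation.
import numpy as np
from sklearn.metrics import log_loss

def find_best_threshold(dists, labels, num_candidates=100):
    labels = np.asarray(labels, dtype=float)
    best_t, best_loss = None, float('inf')
    for t in np.linspace(dists.min(), dists.max(), num_candidates):
        # Pairs closer than the threshold are predicted to be duplicates.
        preds = np.clip((dists < t).astype(float), 1e-15, 1 - 1e-15)
        loss = log_loss(labels, preds)
        if loss < best_loss:
            best_t, best_loss = t, loss
    return best_t, best_loss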
Example #5
def encode_skipthoughts(data_file, encoder, out_dir, prefix, verbose):
    print('Loading quora data...')
    data = du.load_csv(data_file)

    print('Computing skipthoughts...')
    q1_vecs = encoder.encode(data['question1'], verbose=verbose, use_eos=True)
    q2_vecs = encoder.encode(data['question2'], verbose=verbose, use_eos=True)

    # Save the encodings
    print('Saving the encodings...')
    np.save(os.path.join(out_dir, prefix + 'q1_encodings'), q1_vecs)
    np.save(os.path.join(out_dir, prefix + 'q2_encodings'), q2_vecs)
Example #6
def main():
    parser = argparse.ArgumentParser(
        description='Evaluate a model on the quora question-pair dataset.')
    parser.add_argument('--quora-data-dir',
                        required=True,
                        help='path to the directory containing the quora data')
    parser.add_argument(
        '--st-model-dir',
        required=True,
        help='path to the directory containing the skipthoughts model')
    parser.add_argument('--output-dir',
                        default='.',
                        help='path to the directory to write to')
    parser.add_argument('-v', '--verbose', action='store_true')
    args = parser.parse_args()

    output_file = os.path.join(args.output_dir, 'oov_stats.txt')

    with log.FileWriterStdoutPrinter(output_file) as writer:
        print "Loading skipthoughts model..."
        model = st.load_model(args.st_model_dir)
        print "Initializing skipthoughts word dict..."
        word_dict = st.init_word_dict(model)
        print "Analyzing word dict..."
        analyze_dict(word_dict, writer)

        print "Loading training set..."
        train = du.load_csv(os.path.join(args.quora_data_dir, 'train.csv'))
        writer.emit_line("Analyzing word counts in train.csv...")
        analyze_oov(word_dict, train, writer, args.output_dir, 'train')

        # Be sure to write data to disk for train before moving on to test, which is much bigger

        print "Loading test set..."
        test = du.load_csv(os.path.join(args.quora_data_dir, 'test.csv'))
        writer.emit_line("Analyzing word counts in test.csv...")
        analyze_oov(word_dict, test, writer, args.output_dir, 'test')
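analyze_dict and analyze_oov are likewise repo-specific. A minimal sketch of the kind of out-of-vocabulary count analyze_oov presumably reports, assuming word_dict maps known tokens to their skip-thoughts embeddings (count_oov is a hypothetical helper name):

# Illustrative OOV counting over the Quora question columns; not the
# repository's analyze_oov implementation.
def count_oov(word_dict, df):
    oov, total = 0, 0
    for column in ('question1', 'question2'):
        for question in df[column]:
            for token in str(question).lower().split():
                total += 1
                if token not in word_dict:
                    oov += 1
    return oov, total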
Example #7
    def __init__(self, data_queue, args, split='train', repeat=True):
        """Shapenet dataloader.

        Args:
            data_queue: multiprocessing queue where data is stored at.
            split: str in ('train', 'val', 'test'). Loads corresponding dataset.
            repeat: repeats epoch if true. Terminates after one epoch otherwise.
        """
        self.args = args
        self.split = split
        args.DATA_PATH = 'data/%s' % (args.dataset)
        classmap = load_csv(args.DATA_PATH + '/synsetoffset2category.txt')
        args.classmap = {}
        if args.dataset == 'facade':
            for i in range(classmap.shape[0]):
                args.classmap[str(classmap[i][1]).zfill(2)] = classmap[i][0]
        else:
            for i in range(classmap.shape[0]):
                args.classmap[str(classmap[i][1]).zfill(8)] = classmap[i][0]

        self.data_paths = []
        for k in open(args.DATA_PATH + '/%s.list' % (split)).readlines():
            if split == 'train' and not k.startswith(args.scene):
                self.data_paths.append(
                    os.path.join(args.DATA_PATH, split, 'partial',
                                 k.rstrip() + '.h5'))
            if split == 'val' and k.startswith(args.scene):
                # workaround to make sure we have at least `batch size` number of data_paths:
                s = os.path.join(args.DATA_PATH, split, 'partial',
                                 k.rstrip() + '.h5')
                for i in range(args.batch_size):
                    self.data_paths.append(s)

        self.data_paths = sorted(self.data_paths)
        print('data_paths', len(self.data_paths))
        N = (len(self.data_paths) // args.batch_size) * args.batch_size
        self.data_paths = self.data_paths[0:N]
        print('DataProcess', split, 'scene', args.scene, len(self.data_paths))
        super().__init__(data_queue,
                         self.data_paths,
                         None,
                         args.batch_size,
                         repeat=repeat)
Example #8
    def generate_from_file(self, dataset, smooth=True, discretize_params=[]):
        _df = load_csv(dataset.path)
        _smooth = smooth
        _continuous_features = dataset.continuous_features
        _discrete_features = dataset.discrete_features
        if len(discretize_params) > 0:
            discrete_columns = [dp.feature_name for dp in discretize_params]
            _discrete_features = list(
                set(_discrete_features + discrete_columns))
            _continuous_features = [
                c for c in _continuous_features if c not in discrete_columns
            ]
            _df = discretize_continuous_features(dataset.name, _df,
                                                 discretize_params)

        _discrete_features_values = {}
        for discrete_feature in _discrete_features:
            _discrete_features_values[discrete_feature] = list(
                pd.unique(_df[discrete_feature].values.ravel('K')))
        return self(_df, _discrete_features, _discrete_features_values,
                    _continuous_features, smooth)
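discretize_continuous_features is not shown either. For illustration only, the same effect can be approximated with pandas.cut, assuming each entry of discretize_params carries a feature_name and a bin count (the bins attribute here is a hypothetical name):

# Hedged sketch of bucketing continuous columns into labelled bins.
import pandas as pd

def discretize_with_cut(df, discretize_params):
    df = df.copy()
    for dp in discretize_params:
        # dp.feature_name matches the usage above; dp.bins is assumed.
        df[dp.feature_name] = pd.cut(df[dp.feature_name], bins=dp.bins).astype(str)
    return df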
Example #9
def main():
    parser = argparse.ArgumentParser(
        description='Split train.csv into train, dev, and validation splits. '
        'Specify the dev and validation set sizes with args; the remainder is '
        'used for training.')
    parser.add_argument(
        '--dataset-file',
        required=True,
        help='path to the train.csv file containing the quora training data')
    parser.add_argument('--ndev',
                        type=int,
                        default=10000,
                        help='size of dev set to create')
    parser.add_argument('--nvalid',
                        type=int,
                        default=50000,
                        help='size of validation set to create')
    parser.add_argument(
        '--output-dir',
        required=True,
        help='directory to which to write train.csv, dev.csv, and valid.csv')
    parser.add_argument(
        '--seed',
        type=int,
        help='optional random seed for reproducible splits across runs')
    args = parser.parse_args()

    data = du.load_csv(args.dataset_file)
    shuffled = du.shuffle(data, args.seed)

    ntrain = len(data) - args.ndev - args.nvalid
    train, dev, valid = du.split(shuffled, ntrain, args.ndev, args.nvalid)

    du.write_csv(train, os.path.join(args.output_dir, 'train.csv'))
    du.write_csv(dev, os.path.join(args.output_dir, 'dev.csv'))
    du.write_csv(valid, os.path.join(args.output_dir, 'valid.csv'))
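du.shuffle and du.split come from the same data_utils helper as du.load_csv. A minimal sketch of what they are assumed to do with a pandas DataFrame: a seeded row shuffle followed by three contiguous, non-overlapping slices (the real helpers may differ):

# Hedged sketches of the assumed data_utils.shuffle / data_utils.split helpers.
def shuffle(df, seed=None):
    # Reproducible row shuffle when a seed is supplied.
    return df.sample(frac=1, random_state=seed).reset_index(drop=True)

def split(df, ntrain, ndev, nvalid):
    train = df.iloc[:ntrain]
    dev = df.iloc[ntrain:ntrain + ndev]
    valid = df.iloc[ntrain + ndev:ntrain + ndev + nvalid]
    return train, dev, valid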
Example #10
    def load_data(self):
        train_data, val_data, test_data = load_csv(self.args.data_path,
                                                   self.args.data)
        self.train_y = to_categorical(train_data.valid)
        self.val_y = to_categorical(val_data.valid)
        self.test_y = to_categorical(test_data.valid)

        print('Load word embedding')
        self.word2idx, idx2word, self.word_vectors = load_word_embedding(
            self.args.embedding_path, self.args.word_embedding, self.args.data)

        print('Load graph embedding')
        cpc_embed_dict, ipc_embed_dict, uspc_embed_dict = load_code_embeddings(
            self.args.embedding_path, self.args.code_embedding, self.args.data)

        self.cpc2idx, idx2cpc, self.cpc_vectors = create_code_vocab(
            cpc_embed_dict)
        self.ipc2idx, idx2ipc, self.ipc_vectors = create_code_vocab(
            ipc_embed_dict)
        self.uspc2idx, idx2uspc, self.uspc_vectors = create_code_vocab(
            uspc_embed_dict)

        self.max_cpc_len, self.max_ipc_len, self.max_uspc_len = get_code_length(
            train_data)

        print('Preparing train data')
        train_cpcs, train_ipcs, train_uspcs = convert_code_to_idx(
            train_data, self.max_cpc_len, self.max_ipc_len, self.max_uspc_len,
            self.cpc2idx, self.ipc2idx, self.uspc2idx)
        train_abs_sequence = get_text_sequence(train_data.abstract_text,
                                               self.word2idx,
                                               self.args.max_length)

        self.train.append(train_cpcs)
        self.train.append(train_ipcs)
        self.train.append(train_uspcs)
        self.train.append(train_abs_sequence)

        print('Preparing validation data')
        val_cpcs, val_ipcs, val_uspcs = convert_code_to_idx(
            val_data, self.max_cpc_len, self.max_ipc_len, self.max_uspc_len,
            self.cpc2idx, self.ipc2idx, self.uspc2idx)
        val_abs_sequence = get_text_sequence(val_data.abstract_text,
                                             self.word2idx,
                                             self.args.max_length)

        self.val.append(val_cpcs)
        self.val.append(val_ipcs)
        self.val.append(val_uspcs)
        self.val.append(val_abs_sequence)

        print('Preparing test data')
        test_cpcs, test_ipcs, test_uspcs = convert_code_to_idx(
            test_data, self.max_cpc_len, self.max_ipc_len, self.max_uspc_len,
            self.cpc2idx, self.ipc2idx, self.uspc2idx)
        test_abs_sequence = get_text_sequence(test_data.abstract_text,
                                              self.word2idx,
                                              self.args.max_length)

        self.test.append(test_cpcs)
        self.test.append(test_ipcs)
        self.test.append(test_uspcs)
        self.test.append(test_abs_sequence)
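In this example, load_csv is assumed to return three DataFrames that each carry a binary valid column, which to_categorical one-hot encodes. A quick illustration of that step (shown with tf.keras, though the original may import to_categorical from elsewhere):

import numpy as np
from tensorflow.keras.utils import to_categorical

labels = np.array([0, 1, 1, 0])
print(to_categorical(labels))
# [[1. 0.]
#  [0. 1.]
#  [0. 1.]
#  [1. 0.]]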
Example #11
from __future__ import print_function

import numpy as np
import tflearn

# Load CSV file, indicate that the first column represents labels
from data_utils import load_csv
data, labels = load_csv('titanic_dataset.csv', target_column=0,
                        categorical_labels=True, n_classes=2)


# Preprocessing function
def preprocess(data, columns_to_ignore):
    # Sort by descending id and delete columns
    for id in sorted(columns_to_ignore, reverse=True):
        [r.pop(id) for r in data]
    for i in range(len(data)):
        # Converting 'sex' field to float (id is 1 after removing labels column)
        data[i][1] = 1. if data[i][1] == 'female' else 0.
    return np.array(data, dtype=np.float32)

# Ignore 'name' and 'ticket' columns (id 1 & 6 of data array)
to_ignore=[1, 6]

# Preprocess data
data = preprocess(data, to_ignore)

# Build neural network
net = tflearn.input_data(shape=[None, 6])
net = tflearn.fully_connected(net, 32)
net = tflearn.fully_connected(net, 32)
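# The snippet above follows TFLearn's Titanic quickstart; a hedged completion
# of that quickstart is given here so the example runs end to end (the original
# listing may differ): add a softmax output layer, wrap the graph in a DNN,
# train it, and sanity-check predictions on two hand-built passenger records.
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net)

# Define model
model = tflearn.DNN(net)
# Start training (apply gradient descent algorithm)
model.fit(data, labels, n_epoch=10, batch_size=16, show_metric=True)

# Illustrative prediction on two made-up passengers
dicaprio = [3, 'Jack Dawson', 'male', 19, 0, 0, 'N/A', 5.0000]
winslet = [1, 'Rose DeWitt Bukater', 'female', 17, 1, 2, 'N/A', 100.0000]
dicaprio, winslet = preprocess([dicaprio, winslet], to_ignore)
pred = model.predict([dicaprio, winslet])
print("DiCaprio Surviving Rate:", pred[0][1])
print("Winslet Surviving Rate:", pred[1][1])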
Example #12
        counter, q1_row_idx = idx_vec(counter, q1_words)
        counter, q2_row_idx = idx_vec(counter, q2_words)
        q1_idx = np.vstack((q1_idx, q1_row_idx))
        q2_idx = np.vstack((q2_idx, q2_row_idx))

        if counter % 10000 == 0:
            print(counter)
    #input_matrix.tofile('sub_matrix2_{}.bin'.format(first_id))
    #q1_idx.tofile('sub_q1_{}.bin'.format(first_id))
    #q2_idx.tofile('sub_q2_{}.bin'.format(first_id))
    return (input_matrix, q1_idx[1:, :], q2_idx[1:, :])
    #print("input_matrix shape {}".format(input_matrix.shape))
    #print("input mat shape is {}".format(input_matrix.shape))


dataframe = load_csv(filename)
#print("df shape {}".format(dataframe.shape))
num_processors = mp.cpu_count()
subframe = np.array_split(dataframe, 6)[5]
chunks = np.array_split(subframe, num_processors)
# create our pool with `num_processes` processes
pool = mp.Pool(processes=num_processors)
# apply our function to each chunk in the list
listResults = pool.map(process_chunk, chunks)

counter = 0
mat_list = []
q1_list = []
q2_list = []
for result in listResults:
    sub_mat, sub_q1_idx, sub_q2_idx = result