Example #1
def load_dataset():
    print("Reading CSV file...")

    bug_fields = config.bug_fields
    nlp_fields = config.nlp_fields
    nonnlp_fields = config.nonnlp_fields
    table = pq.read_table(config.bug_data, columns=bug_fields)
    df = table.to_pandas()
    df.drop_duplicates(inplace=True)
    df_bug_nlp, df_bug_nonnlp = clean_dataset(df)
    utils.create_vocabulary(df_bug_nlp['text'].tolist())
    print("nlp feature clean_shape", df_bug_nlp.shape)
    df_bug_nlp.to_csv(config.nlp_data_clean, index=True)
    df_bug_nonnlp.to_csv(config.nonnlp_data_clean, index=True)
    return df_bug_nlp, df_bug_nonnlp
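
The examples on this page call several differently shaped create_vocabulary helpers. For the pipeline above, which passes a list of raw text strings, here is a minimal sketch of what such a helper might look like; the whitespace tokenization, the min_freq cutoff, and the <pad>/<unk> entries are assumptions, not the project's actual utils implementation.

from collections import Counter

def create_vocabulary(texts, min_freq=1):
    """Build a word -> index mapping from a list of raw text strings.
    A sketch under the assumptions stated above; the real utils module may differ."""
    counts = Counter()
    for text in texts:
        counts.update(text.lower().split())
    vocab = {"<pad>": 0, "<unk>": 1}
    for word, freq in counts.most_common():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

# Hypothetical usage mirroring the example above:
# vocab = create_vocabulary(df_bug_nlp['text'].tolist())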
Example #2
    def __init__(self, sent_size):
        super(Attention_Net, self).__init__()
        #embeds are of the dimension n * (1 x (embedding_size * no_of_kmers))
        self.embeds = nn.Embedding(len(utils.create_vocabulary(Config.window_size)), Config.embedding_size)
        #Embeds[125, 5]
        self.embeds_size = Config.embedding_size * sent_size
        """
        Experimenting Start
        """
        #self.embeds_size = sent_size # set this if we want the embeddings to be fed as vectors
        self.attn_weights = nn.Parameter(autograd.Variable(torch.randn(self.embeds_size, self.embeds_size)))

        self.attention = MultiHeadAttention(Config.n_head, Config.d_model, Config.d_k, Config.d_v, dropout=Config.attn_dropout)
        """
        Experimenting END
        """
        #self.attn_weights = nn.Parameter(autograd.Variable(torch.randn(self.embeds_size, self.embeds_size)))
        #self.attn_weights = autograd.Variable(torch.randn(self.embeds_size, self.embeds_size))
        #attn_weights = autograd.Variable(torch.randn(self.embeds_size, self.embeds_size))
        self.tanh = torch.tanh
        self.fc1 = nn.Linear(self.embeds_size, Config.hidden_layer_size)
        """
        Experimenting Start
        """
        #self.fc1 = nn.Linear(Config.embedding_size, Config.hidden_layer_size)
        """
        Experimenting END
        """
        self.relu = F.relu
        #self.context = None
        self.sigmoid = nn.Sigmoid()
        self.fc2 = nn.Linear(Config.hidden_layer_size, 1)
        self.threshold = F.threshold
        self.dropout = nn.Dropout(Config.dropout)
Example #3
def pre_process_data(raw_data, tokenizer, config, logger):
    '''
    raw_data: dir or a specific file
    '''
    vocab_file = os.path.join(config.tokenized_data_dir, 'vocab.txt')
    sample_file = os.path.join(config.tokenized_data_dir, 'samples.txt')
    if os.path.isfile(vocab_file) and os.path.isfile(sample_file):
        logger.info("vocab file and sample file already exist!")
        return Data(vocab_file, sample_file, config, logger)
    else:
        logger.info("Generate vocabulary and tokenized samples.")
        if os.path.isfile(raw_data):
            raw_data = [raw_data]
        else:
            raw_data = glob.glob(os.path.join(raw_data, '*'))
        samples = set()
        for file in raw_data:
            for qa in parse_raw_file(file):
                q = qa[0]
                a = qa[1]
                tokenized_q = tokenize_one_line(
                    sentence=q,
                    cut_fun=tokenizer.tokenize,
                    specical_symbol=config.special_symbol,
                    mode=config.source_language_type,
                    lower=config.source_language_lower)
                tokenized_a = tokenize_one_line(
                    sentence=a,
                    cut_fun=tokenizer.tokenize,
                    specical_symbol=config.special_symbol,
                    mode=config.target_language_type,
                    lower=config.target_language_lower)
                samples.add(tokenized_q + "\t" + tokenized_a)
        logger.info('sample size:{}'.format(len(samples)))
        logger.info("save samples in '{}'".format(sample_file))
        write_lines(sample_file, samples)
        source_vocab, target_vocab, special_vocab = create_vocabulary(
            samples, config.special_symbol)
        source_vocab = set(list(source_vocab.keys()))
        for s_symbol in config.vocab_remains:
            if s_symbol in source_vocab:
                source_vocab.discard(s_symbol)
            if s_symbol in target_vocab:
                target_vocab.discard(s_symbol)
            if s_symbol in special_vocab:
                special_vocab.discard(s_symbol)
        logger.info('vocab size:{}'.format(
            len(source_vocab) + len(target_vocab) + len(special_vocab) +
            len(config.vocab_remains)))
        logger.info('save vocabulary in "{}"'.format(vocab_file))
        with open(vocab_file, 'w', encoding='utf8') as f:
            for line in config.vocab_remains:
                f.write(line + '\n')
            for line in special_vocab:
                f.write(line + '\n')
            for line in source_vocab | target_vocab:
                f.write(line + '\n')
        return Data(vocab_file, sample_file, config, logger)
Example #4
    def __init__(self, sent_size):
        super(Lstm_Net, self).__init__()
        self.sent_size = sent_size
        self.embeds = nn.Embedding(
            len(utils.create_vocabulary(Config.window_size)),
            Config.embedding_size)
        self.embeds_size = Config.embedding_size * sent_size
        self.lstm = nn.LSTM(self.embeds_size, Config.hidden_layer_size,
                            Config.num_layers)
        self.tanh = torch.tanh
        # self.fc1 = nn.Linear(self.embeds_size, Config.hidden_layer_size)
        self.fc1 = nn.Linear(Config.hidden_layer_size,
                             Config.hidden_layer_size)
        self.dropout = nn.Dropout(Config.dropout_rate)
        self.relu = F.relu
        self.sigmoid = nn.Sigmoid()
        self.fc2 = nn.Linear(Config.hidden_layer_size, 1)
Example #5
def train_epoch(model, inputs, labels, optimizer, criterion):
    model.train()
    losses = []
    vocabulary = utils.create_vocabulary(Config.window_size)
    #labels_hat = []
    j = 0
    correct, wrong = 0, 0
    for data in inputs.itertuples():
        gene = data.Gene
        input_ = torch.tensor([
            vocabulary[gene[i:i + Config.window_size]]
            for i in range(0,
                           len(gene) - Config.window_size + 1)
        ],
                              dtype=torch.long)
        #data_batch = inputs[i:i + batch_size, :]
        #labels_batch = labels[i:i + batch_size, :]
        input_var = autograd.Variable(input_)  # wrap without shadowing the `inputs` DataFrame
        label = autograd.Variable(labels[j])
        j += 1
        optimizer.zero_grad()
        # (1) Forward
        label_hat = model(input_)
        # (2) Compute diff
        loss = criterion(label_hat, label)
        # (3) Compute gradients
        losses.append(loss.data.numpy())
        loss.backward(retain_graph=False)
        # (4) update weights
        optimizer.step()
        #labels_hat.append(label_hat)
        correct, wrong = utils.get_train_accuracy(label_hat, j - 1,
                                                  len(labels), correct, wrong)

    #print('labels_hat size>', len(labels_hat))
    loss = sum(losses) / len(losses)

    return loss, tuple((correct, wrong))
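
The Attention_Net, Lstm_Net and train_epoch examples above build their vocabulary from utils.create_vocabulary(Config.window_size) and index it with fixed-length substrings of a gene string (vocabulary[gene[i:i + Config.window_size]]). Below is a minimal sketch of such a k-mer vocabulary; the plain A/C/G/T alphabet and the enumeration order are assumptions.

from itertools import product

def create_vocabulary(window_size, alphabet="ACGT"):
    """Map every k-mer of length window_size over alphabet to an integer index.
    A sketch of the vocabulary these examples appear to assume; the real utils module may differ."""
    kmers = ("".join(p) for p in product(alphabet, repeat=window_size))
    return {kmer: idx for idx, kmer in enumerate(kmers)}

# Hypothetical usage: with window_size=3 there are 4**3 = 64 entries, so
# nn.Embedding(len(create_vocabulary(3)), Config.embedding_size) gets 64 rows.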
Example #6
def main(_):
    assert FLAGS.source_train_path, ("--source_train_path is required.")
    assert FLAGS.target_train_path, ("--target_train_path is required.")

    # Create vocabularies.
    source_vocab_path = os.path.join(os.path.dirname(FLAGS.source_train_path),
                                     "vocabulary.source")
    target_vocab_path = os.path.join(os.path.dirname(FLAGS.source_train_path),
                                     "vocabulary.target")
    utils.create_vocabulary(source_vocab_path, FLAGS.source_train_path, FLAGS.source_vocab_size)
    utils.create_vocabulary(target_vocab_path, FLAGS.target_train_path, FLAGS.target_vocab_size)

    # Read vocabularies.
    source_vocab, rev_source_vocab = utils.initialize_vocabulary(source_vocab_path)
    target_vocab, rev_target_vocab = utils.initialize_vocabulary(target_vocab_path)

    # Read parallel sentences.
    parallel_data = utils.read_data(FLAGS.source_train_path, FLAGS.target_train_path,
                                    source_vocab, target_vocab)

    # Read validation data set.
    if FLAGS.source_valid_path and FLAGS.target_valid_path:
        valid_data = utils.read_data(FLAGS.source_valid_path, FLAGS.target_valid_path,
                                    source_vocab, target_vocab)

    # Initialize BiRNN.
    config = Config(len(source_vocab),
                    len(target_vocab),
                    FLAGS.embedding_size,
                    FLAGS.state_size,
                    FLAGS.hidden_size,
                    FLAGS.num_layers,
                    FLAGS.learning_rate,
                    FLAGS.max_gradient_norm,
                    FLAGS.use_lstm,
                    FLAGS.use_mean_pooling,
                    FLAGS.use_max_pooling,
                    FLAGS.source_embeddings_path,
                    FLAGS.target_embeddings_path,
                    FLAGS.fix_pretrained)

    model = BiRNN(config)

    # Build graph.
    model.build_graph()

    # Train  model.
    with tf.Session() as sess:

        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        train_iterator = utils.TrainingIteratorRandom(parallel_data, FLAGS.num_negative)
        train_summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.checkpoint_dir, "train"), sess.graph)

        if FLAGS.source_valid_path and FLAGS.target_valid_path:
            valid_iterator = utils.EvalIterator(valid_data)
            valid_summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.checkpoint_dir, "valid"), sess.graph)

        epoch_loss = 0
        epoch_completed = 0
        batch_completed = 0

        num_iter = int(np.ceil(train_iterator.size / FLAGS.batch_size * FLAGS.num_epochs))
        start_time = time.time()
        print("Training model on {} sentence pairs per epoch:".
            format(train_iterator.size, valid_iterator.size))

        for step in xrange(num_iter):
            source, target, label = train_iterator.next_batch(FLAGS.batch_size)
            source_len = utils.sequence_length(source)
            target_len = utils.sequence_length(target)
            feed_dict = {model.x_source: source,
                         model.x_target: target,
                         model.labels: label,
                         model.source_seq_length: source_len,
                         model.target_seq_length: target_len,
                         model.input_dropout: FLAGS.keep_prob_input,
                         model.output_dropout: FLAGS.keep_prob_output,
                         model.decision_threshold: FLAGS.decision_threshold}

            _, loss_value, epoch_accuracy,\
            epoch_precision, epoch_recall = sess.run([model.train_op,
                                                      model.mean_loss,
                                                      model.accuracy[1],
                                                      model.precision[1],
                                                      model.recall[1]],
                                                      feed_dict=feed_dict)
            epoch_loss += loss_value
            batch_completed += 1
            # Write the model's training summaries.
            if step % FLAGS.steps_per_checkpoint == 0:
                summary = sess.run(model.summaries, feed_dict=feed_dict)
                train_summary_writer.add_summary(summary, global_step=step)
            # End of current epoch.
            if train_iterator.epoch_completed > epoch_completed:
                epoch_time = time.time() - start_time
                epoch_loss /= batch_completed
                epoch_f1 = utils.f1_score(epoch_precision, epoch_recall)
                epoch_completed += 1
                print("Epoch {} in {:.0f} sec\n"
                      "  Training: Loss = {:.6f}, Accuracy = {:.4f}, "
                      "Precision = {:.4f}, Recall = {:.4f}, F1 = {:.4f}"
                      .format(epoch_completed, epoch_time,
                              epoch_loss, epoch_accuracy,
                              epoch_precision, epoch_recall, epoch_f1))
                # Save a model checkpoint.
                checkpoint_path = os.path.join(FLAGS.checkpoint_dir, "model.ckpt")
                model.saver.save(sess, checkpoint_path, global_step=step)
                # Evaluate model on the validation set.
                if FLAGS.source_valid_path and FLAGS.target_valid_path:
                    eval_epoch(sess, model, valid_iterator, valid_summary_writer)
                # Initialize local variables for new epoch.
                batch_completed = 0
                epoch_loss = 0
                sess.run(tf.local_variables_initializer())
                start_time = time.time()

        print("Training done with {} steps.".format(num_iter))
        train_summary_writer.close()
        if FLAGS.source_valid_path and FLAGS.target_valid_path:
            valid_summary_writer.close()
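
The BiRNN training script above writes a vocabulary file once and then reads it back with utils.initialize_vocabulary. Here is a minimal sketch of that pair; the newline-separated file format, most-frequent-first ordering, and the <pad>/<unk> entries are assumptions, not the project's actual utils module.

import os
from collections import Counter

def create_vocabulary(vocab_path, data_path, max_vocab_size):
    """Write one token per line to vocab_path, most frequent first.
    Tokenization and special tokens are assumptions."""
    if os.path.exists(vocab_path):
        return
    counts = Counter()
    with open(data_path, encoding="utf8") as f:
        for line in f:
            counts.update(line.split())
    tokens = ["<pad>", "<unk>"] + [w for w, _ in counts.most_common()]
    with open(vocab_path, "w", encoding="utf8") as f:
        f.write("\n".join(tokens[:max_vocab_size]) + "\n")

def initialize_vocabulary(vocab_path):
    """Return (token -> id dict, id -> token list) read from a vocabulary file."""
    with open(vocab_path, encoding="utf8") as f:
        rev_vocab = [line.rstrip("\n") for line in f]
    return {tok: i for i, tok in enumerate(rev_vocab)}, rev_vocab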
Example #7
if sys.version_info[0] < 3:
    raise Exception("Must be using Python 3")

print('Preparing data')

train_x, train_y, trial_x, trial_y, test_x, test_y = utils.load_dataset(
    'data/train.csv', 'data/trial.csv', 'data/trial.labels',
    'data/test-text-labels.csv')

print('Preprocessing data')
train_x, trial_x, test_x, max_string_length = preprocessing.preprocessing_pipeline(
    train_x, trial_x, test_x)

print('Words to index')

vocab_length, words_to_index, index_to_words = utils.create_vocabulary(
    train_x, trial_x, test_x)

train_y_oh = utils.labels_to_indices(train_y, config.labels_to_index,
                                     config.classes)
trial_y_oh = utils.labels_to_indices(trial_y, config.labels_to_index,
                                     config.classes)

train_x_indices = utils.sentences_to_indices(train_x,
                                             words_to_index,
                                             max_len=max_string_length)
trial_x_indices = utils.sentences_to_indices(trial_x,
                                             words_to_index,
                                             max_len=max_string_length)
test_x_indices = utils.sentences_to_indices(test_x,
                                            words_to_index,
                                            max_len=max_string_length)
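
Example #7 pairs create_vocabulary with sentences_to_indices, which turns sentences into fixed-length index arrays for an embedding layer. A minimal sketch of that conversion follows; zero-padding, truncation to max_len, and mapping unknown words to index 0 are assumptions.

import numpy as np

def sentences_to_indices(sentences, words_to_index, max_len):
    """Convert sentences into a (num_sentences, max_len) array of word indices,
    zero-padded or truncated to max_len. A sketch of the helper assumed above."""
    indices = np.zeros((len(sentences), max_len), dtype=np.int64)
    for row, sentence in enumerate(sentences):
        tokens = sentence.split() if isinstance(sentence, str) else sentence
        for col, word in enumerate(tokens[:max_len]):
            indices[row, col] = words_to_index.get(word, 0)
    return indices

# Hypothetical usage mirroring the example above:
# train_x_indices = sentences_to_indices(train_x, words_to_index, max_len=max_string_length)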
Example #8
# assumption: the training split filename is 'train.txt' (the snippet starts mid-call)
train_data = read_dataset(join(train_flag['data_dir'], 'train.txt'),
                          parameters['maximum_L'],
                          split=False)

if 'dev.txt' not in os.listdir(train_flag['data_dir']):
    dev_data = read_dataset(join(train_flag['data_dir'], 'valid.txt'),
                            parameters['maximum_L'],
                            split=False)
else:
    dev_data = read_dataset(join(train_flag['data_dir'], 'dev.txt'),
                            parameters['maximum_L'],
                            split=False)

test_data = read_dataset(join(train_flag['data_dir'], 'test.txt'),
                         parameters['maximum_L'],
                         split=False)

tag_vocabulary, i2t = create_vocabulary(train_data)
parameters['labels_num'] = len(tag_vocabulary.keys())  # number of labels
parameters['tag_emb_dim'] = len(tag_vocabulary.keys())


def train(generator, param, flags):

    with tf.Session() as sess:
        # create model
        model = NEF(param, tag_vocabulary, i2t)

        # print config
        print(model.path)
        sess.run(tf.global_variables_initializer())

        # start learning
Example #9
def main(train_path, val_path, labels_path, embedding_vectors_path,
         embedding_word2idx_path, categories_def_path, uncertainty_output_path,
         batch_size, model_snapshot_prefix, pretrained_model_path,
         model_snapshot_interval):
    embedding_vectors = bcolz.open(embedding_vectors_path)[:]
    embedding_dim = len(embedding_vectors[0])
    embedding_word2idx = pickle.load(open(embedding_word2idx_path, 'rb'))
    # Maps words to embedding vectors. These are all embeddings available to us
    embeddings = {
        w: embedding_vectors[embedding_word2idx[w]]
        for w in embedding_word2idx
    }

    # Build vocabulary using training set. Maps words to indices
    vocab = create_vocabulary(train_path)
    vocab_size = len(vocab)
    print(f'Vocabulary size: {vocab_size}\nBatch size: {batch_size}')
    # TODO: take advantage of the multiple annotations
    labels = load_existing_annotations(labels_path,
                                       load_first_annotation_only=True)

    if model_snapshot_interval:
        print(f'Taking model snapshot every {model_snapshot_interval} epochs')
    else:
        print(f'Taking model snapshot ONLY at the end of training')

    humor_types = load_sentences_or_categories(categories_def_path)
    # Map label IDs to indices so that when computing cross entropy we don't operate on raw label IDs
    label_id_to_idx = {
        label_id: idx
        for idx, label_id in enumerate(humor_types)
    }
    word_weight_matrix = create_weight_matrix(vocab, embeddings, device)

    # Stores indexes of sentences provided in the original dataset
    train_labeled_idx, train_labeled_data_unpadded, train_labels, train_unlabeled_idx, train_unlabeled_data_unpadded,\
        longest_sentence_length = load_unpadded_train_val_data(train_path, vocab, labels, label_id_to_idx)
    val_labeled_idx, val_labeled_data_unpadded, val_labels, val_unlabeled_idx, val_unlabeled_data_unpadded,\
        _ = load_unpadded_train_val_data(val_path, vocab, labels, label_id_to_idx)

    # Create padded train and val dataset
    # TODO: Do not use longest length to pad input. Find mean and std
    train_labeled_data = create_padded_data(train_labeled_data_unpadded,
                                            longest_sentence_length)
    val_labeled_data = create_padded_data(val_labeled_data_unpadded,
                                          longest_sentence_length)

    print(
        f'Num of labeled training data: {train_labeled_data.shape[0]}, labeled val: {val_labeled_data.shape[0]}'
    )

    num_iterations = train_labeled_data.shape[0] // batch_size

    textCNN = DataParallel(
        TextCNN(word_weight_matrix, NUM_FILTERS, WINDOW_SIZES,
                len(humor_types))).to(device)
    if pretrained_model_path:
        textCNN.module.initialize_from_pretrained(pretrained_model_path)
    optimizer = torch.optim.Adam(textCNN.parameters(), lr=LR, eps=OPTIM_EPS)

    for i in range(NUM_EPOCHS):
        print(f'Epoch {i}')
        train_one_epoch(
            textCNN,
            create_batch_iterable(train_labeled_data, train_labels, batch_size,
                                  device), optimizer, val_labeled_data,
            val_labels, num_iterations)
        if model_snapshot_prefix:
            if (not model_snapshot_interval and i + 1 == NUM_EPOCHS) or \
                    (model_snapshot_interval and (i + 1) % model_snapshot_interval == 0):
                print('\nSaving model snapshot...')
                torch.save(textCNN.state_dict(),
                           f'{model_snapshot_prefix}_epoch{i}.mdl')
                print('Saved\n')

    if uncertainty_output_path:
        train_unlabeled_data = create_padded_data(
            train_unlabeled_data_unpadded, longest_sentence_length)
        rank_unlabeled_train(
            textCNN,
            torch.tensor(train_unlabeled_data, dtype=torch.long,
                         device=device), train_unlabeled_idx,
            uncertainty_output_path)
Example #10
# full path: ./data/ + dataset + train/test/valid
if arg.dataset is None:
    print("name of dataset cannot be None")
    exit(1)
elif arg.dataset == "snip":
    print("use snip dataset")
elif arg.dataset == "atis":
    print("use atis dataset")
else:
    print("use own dataset: ", arg.dataset)

full_train_path = os.path.join("./data", arg.dataset, arg.train_data_path)
full_test_path = os.path.join('./data', arg.dataset, arg.test_data_path)
full_valid_path = os.path.join('./data', arg.dataset, arg.valid_data_path)

create_vocabulary(os.path.join(full_train_path, arg.input_file),
                  os.path.join(arg.vocab_path, "in_vocab"))
create_vocabulary(os.path.join(full_train_path, arg.slot_file),
                  os.path.join(arg.vocab_path, "slot_vocab"))
create_vocabulary(os.path.join(full_train_path, arg.intent_file),
                  os.path.join(arg.vocab_path, "intent_vocab"))

# {word 2 id, words list}
in_vocab = load_vocabulary(os.path.join(arg.vocab_path, "in_vocab"))
slot_vocab = load_vocabulary(os.path.join(arg.vocab_path, "slot_vocab"))
intent_vocab = load_vocabulary(os.path.join(arg.vocab_path, "intent_vocab"))


def create_model(input_data,
                 input_size,
                 sequence_length,
                 slot_size,
Example #11
# RNN PARAMETERS
tf.app.flags.DEFINE_integer("BATCH_SIZE", 64, "batch size")
tf.app.flags.DEFINE_integer("NUM_EPOCHS", 1, "number of epochs for training")
tf.app.flags.DEFINE_float("LEARNING_RATE", 0.001, "learning rate for rnn")
tf.app.flags.DEFINE_float("MAX_GRAD_NORM", 5.0, "max. norm for gradient clipping")
tf.app.flags.DEFINE_string('f', '', 'tensorflow bug')

FLAGS = tf.app.flags.FLAGS
if FLAGS.EXPERIMENT == "C":
    FLAGS.STATE_DIM = 1024
tf_utils.print_flags(FLAGS, logger)

# -------------------------------------------------------------------------------------------------------------------- #
# PREPROCESSING
logger.append("PREPROCESSING STARTING.")
vocabulary, word_to_idx, idx_to_word = utils.create_vocabulary(FLAGS.DATA_DIR + FLAGS.SENTENCES_TRAIN_FILE,
                                                               FLAGS.VOCABULARY_SIZE)
X_train = utils.create_dataset(FLAGS.DATA_DIR + FLAGS.SENTENCES_TRAIN_FILE, word_to_idx)
logger.append("X_train CREATED.")
X_test = utils.create_dataset(FLAGS.DATA_DIR + FLAGS.SENTENCES_TEST_FILE, word_to_idx)
logger.append("X_test CREATED.")
X_eval = utils.create_dataset(FLAGS.DATA_DIR + FLAGS.SENTENCES_EVAL_FILE, word_to_idx)
logger.append("X_eval CREATED.")
X_cont = utils.load_continuation(FLAGS.DATA_DIR + FLAGS.SENTENCES_CONTINUATION_FILE, word_to_idx)
logger.append("X_cont CREATED.")

with open(FLAGS.RESULTS_DIR + "vocabulary.pkl", "wb") as f:
    pickle.dump((vocabulary, word_to_idx, idx_to_word), f)

with open(FLAGS.RESULTS_DIR + "X_train.ids", "w") as f:
    for i in range(X_train.shape[0]):
        f.write(" ".join([str(x) for x in X_train[i, :]]) + "\n")