Example #1
def main():
    assert os.path.exists(model_dir)
    assert os.path.exists(conf_path)
    assert os.path.exists(summary_dir)
    assert os.path.exists(FLAGS.data_prefix + '.train.txt') and \
            os.path.exists(FLAGS.data_prefix + '.valid.txt') and \
            os.path.exists(FLAGS.data_prefix + '.test.txt')
    assert FLAGS.mode in ['train', 'test']

    logger = logging.getLogger("lm_zh")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    if log_path:
        file_handler = logging.FileHandler(log_path)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    else:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = '0'

    logger.info('Parse config file ...')
    config = default_config.parse(conf_path)
    logger.info('Running with config: {}'.format(config.items))

    if FLAGS.mode == 'test':
        config.batch_size *= 2

    logger.info('Build vocab and dataset ...')
    dataset = Dataset(FLAGS.data_prefix,
                      config.num_steps,
                      config.batch_size,
                      train=(FLAGS.mode == 'train'))

    print('Use algo:', config.algo)

    if FLAGS.mode == 'train':
        train(config, dataset, model_dir, summary_dir)
    elif FLAGS.mode == 'test':
        test(config, dataset, model_dir, summary_dir)
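# Illustration (an assumed layout, not from the original script) of the files that the
# assertions at the top of main() expect, e.g. with --data_prefix=data/corpus:
#   data/corpus.train.txt
#   data/corpus.valid.txt
#   data/corpus.test.txt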
Example #2
def main():
  config = Config()
  args = add_arguments(config)
  config.parse_arg(args)
  dset = Dataset(config)
  dset.build()
  # print('debug:')
  # print(dset.id2word[1])
  config.vocab_size = len(dset.word2id)

  # read the transferred sentences
  transfer_analysis = PivotTransferAnalysis(config)

  if(config.model == 'cmu'):
    transfer_analysis.pipeline_w_cmu(dset)
  else:
    transfer_analysis.pipeline(dset)
  return 
Example #3
    def test_dataset(self):
        vocab = Vocabulary.from_file("testdata/test_vocab.txt")
        dataset = Dataset(vocab, "testdata/*")

        def generator():
            for i in range(1, 10):
                yield [0] + list(range(1, i + 1)) + [0]
        counts = [0] * 10
        for seq in generator():
            for v in seq:
                counts[v] += 1

        counts2 = [0] * 10
        for x, y in dataset._iterate(generator(), 2, 4):
            for v in x.ravel():
                counts2[v] += 1
        for i in range(1, 10):
            self.assertEqual(
                counts[i], counts2[i],
                "Mismatch at i=%d. counts[i]=%s, counts2[i]=%s" % (i, counts[i], counts2[i]))
Example #4
def inference(path, model, vocab, config, **kwargs):
    model.eval()
    test_dataset = Dataset(vocab)
    test_path = os.path.join(path, 'test_data')
    test_dataset.create_instances(test_path,
                                  config.max_seq_length,
                                  type='test')
    test_loader = DataLoader(test_dataset, batch_size=1)

    pred_results = []
    for step, batch in enumerate(test_loader):
        batch = tuple(t.to(device) for t in batch)
        batch = sort_batch(batch)
        input_ids, input_lengths, labels = batch

        outputs = model(input_ids)
        top_1_result = outputs['predicted_intents'][0].item()
        pred_results.append([step, top_1_result])

    return pred_results
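# A hedged usage sketch (not part of the original): writing the (step, intent_id) pairs
# returned by inference() to a CSV file. The output file name is an illustrative choice.
import csv

def save_predictions(preds, out_path='predictions.csv'):
    with open(out_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['step', 'intent_id'])
        writer.writerows(preds)

# e.g. save_predictions(inference(path, model, vocab, config))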
Example #5
def main(_):
    hvd.init()
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus

    vocab = Vocabulary.from_file(FLAGS.vocab)
    hps.vocab_size = vocab.num_tokens

    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())

    if FLAGS.logdir is None:
        FLAGS.logdir = os.path.join('/tmp',
                                    'lm-run-{}'.format(int(time.time())))
        print('logdir: {}'.format(FLAGS.logdir))
    hps.batch_size = 256
    dataset = Dataset(vocab, FLAGS.datadir)
    run_train(dataset,
              hps,
              FLAGS.logdir + '/train',
              ps_device='/gpu:' + str(hvd.local_rank()))
Example #6
def build_data(Config):
    """
    Procedure to build data
    Args:
        Config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
    """
    # Generators
    train = Dataset(words_filename=Config.source_path,
                    tags_filename=Config.source_tgt_path)
    # test = Dataset(words_filename=Config.test_path,
    # tags_filename=Config.test_tgt_path)

    # Build Word and Tag vocab
    # vocab_words, vocab_tags = get_vocabs([train, test])

    # vocab_words.add(UNK)

    # Save vocab
    # write_vocab(vocab_words, Config.words_vocab)
    # write_vocab(vocab_tags, Config.tags_vocab)
    vocab_build(train, Config.min_count, Config.words_vocab)
Example #7
num_cates = max(ent2idx.values()) + 1
sent_len = 64
vocab_size = 2320
emb_size = 256
sent_pad = 10
seq_len = sent_len + 2 * sent_pad

test_data_dir = '../data/chusai_xuanshou'
test_docs = Documents(data_dir=test_data_dir)
sent_extractor = SentenceExtractor(window_size=sent_len, pad_size=sent_pad)
test_sents = sent_extractor(test_docs)

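# Note: word2idx.json appears to be written with str(dict) by the companion training
# script (see Example #15 below), which is why it is read back with eval() here
# rather than json.load().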
with open('word2idx.json', 'r') as f:
    word2idx = eval(f.read())

test_data = Dataset(test_sents, word2idx=word2idx, cate2idx=ent2idx)
test_X, _ = test_data[:]

print(len(test_docs))

w2v_embeddings = np.load('w2v_embeddings.npy')

model = build_lstm_crf_model(num_cates,
                             seq_len=seq_len,
                             vocab_size=vocab_size,
                             model_opts={
                                 'emb_matrix': w2v_embeddings,
                                 'emb_size': emb_size,
                                 'emb_trainable': False
                             })
model.load_weights(
Example #8
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    ## General experimental parameters
    parser.add_argument('-exp', type=str,
                        default='')  # which experiment to run
    parser.add_argument('-reddit_path', type=str,
                        default='data/posts.npy')  # path to reddit post data

    parser.add_argument(
        '-val_interval', type=int,
        default=1000)  # how often to evaluate models during training
    parser.add_argument('-size', type=str,
                        default='med')  # maximum post history length
    parser.add_argument('-n_user', type=int,
                        default=8000)  # number of users for experiment
    parser.add_argument(
        '-no_try',
        action='store_true')  # whether or not to run code in a try-except

    ## arguments for training HBERT models
    parser.add_argument(
        '-max_tokens_batch', type=int,
        default=10000)  # maximum tokens per BERT batch (depends on GPU memory)
    parser.add_argument(
        '-lr', type=float,
        default=0.00001)  # learning rate for HBERT classification layers
    parser.add_argument('-bs', type=int, default=10)  # batch size for training
    parser.add_argument('-n_it', type=int,
                        default=8000)  # number of iterations to train for
    parser.add_argument('-seq', action='store_true')
    parser.add_argument(
        '-temp_file_path', type=str,
        default='')  # path to directory for temp files. if '' this is not used
    parser.add_argument('-preembed_size', type=int,
                        default=10)  # internal hidden size

    opt = parser.parse_args()

    #
    #
    #
    ###################################
    ###################################
    #
    #
    #
    """
    The first section loads the reddit user data,
    
    does some preprocessing, and carries out train/val/test split
    
    
    """
    exp_name = 'experiment_' + str(opt.exp) + '_' + opt.size
    if opt.n_user != 8000:
        exp_name += '_nuser{}'.format(opt.n_user)

    exp_classes = experiment_dict[opt.exp]

    print(exp_classes)

    # create data if not done already
    if (not os.path.isdir(exp_name)):

        # '/projects/bdata/datasets_peter/dataset_3/posts.npy'
        Reddit_posts = np.load(opt.reddit_path, allow_pickle=True)[0]

        Reddit_posts = order_users(Reddit_posts)[:opt.n_user]

        try:
            opt.size = int(opt.size)
            Reddit_posts = [user[:opt.size] for user in Reddit_posts]
            opt.size = 'size' + str(opt.size)
        except ValueError:
            if opt.size == 'xsmall':
                Reddit_posts = [user[:50] for user in Reddit_posts]
            elif opt.size == 'test':
                Reddit_posts = [user[:2] for user in Reddit_posts]
            elif opt.size == 'min':
                Reddit_posts = [user[:10] for user in Reddit_posts]
            elif opt.size == 'small':
                Reddit_posts = [user[:100] for user in Reddit_posts]
            elif opt.size == 'med':
                Reddit_posts = [user[:200] for user in Reddit_posts]
            elif opt.size == 'big':
                pass
            else:
                assert (False)

        print(exp_classes)
        Users, Users_full_posts, T, Y, classes = process_users_synth(
            Reddit_posts,  #user_list, #order_users(MH2SW_posts)+ order_users(MH_posts), 
            exp_classes,
            keep_class=True)

        os.mkdir(exp_name)

        np.save('{}/data.npy'.format(exp_name),
                [Users, Users_full_posts, T, Y, classes])

    Users, Users_full_posts, T, Y, classes = tuple(
        np.load('{}/data.npy'.format(exp_name), allow_pickle=True))

    #
    #
    #
    ###################################
    ###################################
    #
    #
    #
    """
    This section produces feature sets for the different models
    
    Feature sets represent some featurization of user histories
    
    e.g.
    X_chi and X_chi_counts use bag of words, with and without counts
    
    X_HBERT largely leaves user histories as text
    
    X_LDA processes X_chi_counts using LDA
    
    refer to paper for further details
    
    """
    print('starting data loading...')

    s_time = time.time()
    X_chi = get_features_chi(Users_full_posts)
    X_chi_counts = get_features_chi(Users_full_posts, counts=True)
    print('time = {}'.format(time.time() - s_time))
    X_chi_uni = get_features_chi(Users_full_posts, include_bigrams=False)
    X_HBERT = get_features_HBERT(Users, tokenizer, pretokenize=True)

    print('fitting LDA...')
    n_topics = 20
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)

    X_LDA = lda.fit_transform(X_chi_counts)
    print('fit LDA')

    X_inds = list(range(len(Users)))

    dataset = Dataset(X_inds, T, Y, train_frac=0.4, val_frac=0.1)

    inds_train = [data[0] for data in dataset.train_epoch(true_set=True)]
    inds_val = [data[0] for data in dataset.valid_epoch()]
    inds_test = [data[0] for data in dataset.test_epoch()]

    np.save('{}/{}.npy'.format(exp_name, 'inds_dict'), {
        'inds_train': inds_train,
        'inds_val': inds_val,
        'inds_test': inds_test
    })

    print('{} train examples, {} val examples, {} test examples'.format(
        len(inds_train), len(inds_val), len(inds_test)))
    time.sleep(3)

    print('done data loading')

    #
    #
    #
    ###################################
    ###################################
    #
    #
    #
    """
    The next section defines model_dict data structures, which are used to 
    organize training and evaluation of the models
    

    First, model dicts are defined, than added to a list of models to run, model_dicts
    """

    ## instantiate model dicts

    model_dict_0 = {
        'X':
        X_chi,
        'model':
        LogReg_PT_propensity_model(input_dim=len(X_chi[0]),
                                   lr=0.0001,
                                   experiment_name=exp_name + '/LR_12_' +
                                   exp_name),
        'model_name':
        'Logistic_Regression'
    }

    model_dict_1 = {
        'X':
        X_chi,
        'model':
        NN_PT_propensity_model(input_dim=len(X_chi[0]),
                               lr=0.0001,
                               experiment_name=exp_name + '/NN_12_' +
                               exp_name),
        'model_name':
        'Simple_NN'
    }

    model_dict_2 = {
        'X':
        X_chi_uni,
        'model':
        LogReg_PT_propensity_model(input_dim=len(X_chi_uni[0]),
                                   lr=0.0001,
                                   experiment_name=exp_name + '/LR_1_' +
                                   exp_name),
        'model_name':
        'Logistic_Regression_(1gram)'
    }

    model_dict_3 = {
        'X':
        X_chi_uni,
        'model':
        NN_PT_propensity_model(input_dim=len(X_chi_uni[0]),
                               lr=0.0001,
                               experiment_name=exp_name + '/NN_1_' + exp_name),
        'model_name':
        'Simple_NN_(1gram)'
    }

    # A temporary directory can be used for some precalculation, making HBERT more efficient
    # '/projects/bdata/datasets_peter/precalc/'
    d_input = None
    if len(opt.temp_file_path) > 0:
        d = tempfile.TemporaryDirectory(prefix=opt.temp_file_path)
        d_input = d.name + '/' + exp_name

    model_dict_4 = {
        'X':
        X_HBERT,
        'model':
        Hierarchical_BERT_propensity_model(
            n_it=opt.n_it,
            val_interval=opt.val_interval,
            lr=opt.lr,
            batch_size=opt.bs,
            h_size_sent=1000,
            h_size_user=1000,
            tokenize=False,
            precalc_path=d_input,
            experiment_name=exp_name + '/hbert' + exp_name,
            seq=opt.seq,
            max_tokens_batch=opt.max_tokens_batch,
            preembed_size=opt.preembed_size),
        'model_name':
        'HBERT'
    }

    model_dict_5 = {
        'X':
        X_chi_counts,
        'model':
        LogReg_PT_propensity_model(input_dim=len(X_chi[0]),
                                   lr=0.0001,
                                   experiment_name='LR_12_' + exp_name),
        'model_name':
        'Logistic_Regression_counts'
    }

    model_dict_6 = {
        'X':
        X_chi_counts,
        'model':
        NN_PT_propensity_model(input_dim=len(X_chi[0]),
                               lr=0.0001,
                               experiment_name='NN_12_' + exp_name),
        'model_name':
        'Simple_NN_counts'
    }

    model_dict_8 = {
        'X':
        X_LDA,
        'model':
        LogReg_PT_propensity_model(input_dim=n_topics,
                                   lr=0.0001,
                                   experiment_name='LR_12_' + exp_name),
        'model_name':
        'Logistic_Regression_LDA'
    }

    model_dict_9 = {
        'X':
        X_LDA,
        'model':
        NN_PT_propensity_model(input_dim=n_topics,
                               lr=0.0001,
                               experiment_name='NN_12_' + exp_name),
        'model_name':
        'Simple_NN_LDA'
    }

    # A temporary directory can be used for some precalculation, making HBERT more efficient
    d_input = None
    if len(opt.temp_file_path) > 0:
        d = tempfile.TemporaryDirectory(prefix=opt.temp_file_path)
        d_input = d.name + '/' + exp_name

    model_dict_7 = {
        'X':
        X_HBERT,
        'model':
        Average_BERT_propensity_model(n_it=opt.n_it,
                                      val_interval=opt.val_interval,
                                      lr=opt.lr,
                                      batch_size=opt.bs,
                                      h_size_sent=1000,
                                      h_size_user=768,
                                      tokenize=False,
                                      precalc_path=d_input,
                                      experiment_name='avgbert' + exp_name,
                                      seq=opt.seq,
                                      max_tokens_batch=opt.max_tokens_batch),
        #preembed_size = opt.preembed_size),
        'model_name':
        'avgBERT'
    }

    # a list of dictionaries to keep track of all models to run
    model_dicts = [
        model_dict_8, model_dict_9, model_dict_5, model_dict_6, model_dict_0,
        model_dict_1, model_dict_2, model_dict_3, model_dict_4
    ]

    #
    #
    #
    ###################################
    ###################################
    #
    #
    #
    """
    This last section runs each model for the given experiment
    
    
    
    """

    ## loop over the models
    stat_dicts = []

    if opt.no_try:
        for i, model_dict in enumerate(model_dicts):
            # only run the model if you haven't yet
            if not os.path.isfile('{}/{}.npy'.format(
                    exp_name, model_dict['model_name'])):
                dataset.update_X(model_dict['X'])
                # fit model
                model = model_dict['model']
                _, stat_dict = train_propensity_model(model,
                                                      dataset,
                                                      data_test=True)

                stat_dicts += [stat_dict]

                np.save('{}/{}.npy'.format(exp_name, model_dict['model_name']),
                        stat_dict)

            stat_dict = np.load('{}/{}.npy'.format(exp_name,
                                                   model_dict['model_name']),
                                allow_pickle=True).item()
            print(stat_dict)
            print(type(stat_dict))

            stat_dict_print = {
                key: stat_dict[key]
                for key in [
                    k for k in stat_dict.keys()
                    if 'P_' not in k and 'Z_' not in k and 'Y_' not in k
                ]
            }

            print('model {}, statdict {}'.format(model_dict['model_name'],
                                                 stat_dict_print))

        return

    for i, model_dict in enumerate(model_dicts):
        try:
            # only run the model if you haven't yet
            if not os.path.isfile('{}/{}.npy'.format(
                    exp_name, model_dict['model_name'])):
                dataset.update_X(model_dict['X'])
                # fit model
                model = model_dict['model']
                _, stat_dict = train_propensity_model(model,
                                                      dataset,
                                                      data_test=True)

                stat_dicts += [stat_dict]

                np.save('{}/{}.npy'.format(exp_name, model_dict['model_name']),
                        stat_dict)

            stat_dict = np.load('{}/{}.npy'.format(exp_name,
                                                   model_dict['model_name']),
                                allow_pickle=True).item()

            stat_dict_print = {
                key: stat_dict[key]
                for key in [k for k in stat_dict.keys() if 'P_' not in k]
            }

            print('model {}, statdict {}'.format(model_dict['model_name'],
                                                 stat_dict_print))

        except Exception:
            print('model {} FAILED'.format(model_dict['model_name']))
Example #9
def main(_):

    vocab = Vocabulary.from_file(
        os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))
    dataset = Dataset(
        vocab,
        os.path.join(FLAGS.datadir,
                     "training-monolingual.tokenized.shuffled/*"))

    single_gpu_graph = tf.Graph()
    with single_gpu_graph.as_default():
        with tf.variable_scope("model"):
            model = language_model_graph.build_model()

    def run(sess, num_workers, worker_id, num_replicas_per_worker):

        state_c = []
        state_h = []

        if len(state_c) == 0:
            state_c.extend([
                np.zeros([FLAGS.batch_size, model.state_size],
                         dtype=np.float32)
                for _ in range(num_replicas_per_worker)
            ])
            state_h.extend([
                np.zeros([FLAGS.batch_size, model.projected_size],
                         dtype=np.float32)
                for _ in range(num_replicas_per_worker)
            ])

        prev_global_step = sess.run(model.global_step)[0]
        prev_time = time.time()
        data_iterator = dataset.iterate_forever(
            FLAGS.batch_size * num_replicas_per_worker, FLAGS.num_steps,
            num_workers, worker_id)
        fetches = {
            'global_step': model.global_step,
            'loss': model.loss,
            'train_op': model.train_op,
            'final_state_c': model.final_state_c,
            'final_state_h': model.final_state_h
        }

        for local_step in range(FLAGS.max_steps):
            if FLAGS.use_synthetic:
                x = np.random.randint(
                    low=0,
                    high=model.vocab_size,
                    size=(FLAGS.batch_size * num_replicas_per_worker,
                          FLAGS.num_steps))
                y = np.random.randint(
                    low=0,
                    high=model.vocab_size,
                    size=(FLAGS.batch_size * num_replicas_per_worker,
                          FLAGS.num_steps))
                w = np.ones((FLAGS.batch_size * num_replicas_per_worker,
                             FLAGS.num_steps))
            else:
                x, y, w = next(data_iterator)
            feeds = {}
            feeds[model.x] = np.split(x, num_replicas_per_worker)
            feeds[model.y] = np.split(y, num_replicas_per_worker)
            feeds[model.w] = np.split(w, num_replicas_per_worker)
            feeds[model.initial_state_c] = state_c
            feeds[model.initial_state_h] = state_h
            fetched = sess.run(fetches, feeds)

            state_c = fetched['final_state_c']
            state_h = fetched['final_state_h']

            if local_step % FLAGS.log_frequency == 0:
                cur_time = time.time()
                elapsed_time = cur_time - prev_time
                num_words = FLAGS.batch_size * FLAGS.num_steps
                wps = (fetched['global_step'][0] -
                       prev_global_step) * num_words / elapsed_time
                prev_global_step = fetched['global_step'][0]
                parallax.log.info(
                    "Iteration %d, time = %.2fs, wps = %.0f, train loss = %.4f"
                    % (fetched['global_step'][0], cur_time - prev_time, wps,
                       fetched['loss'][0]))
                prev_time = cur_time

    sess, num_workers, worker_id, num_replicas_per_worker = \
        parallax.parallel_run(single_gpu_graph,
                              FLAGS.resource_info_file,
                              sync=FLAGS.sync,
                              parallax_config=parallax_config.build_config())
    run(sess, num_workers, worker_id, num_replicas_per_worker)
Example #10
from data_utils import read_dictionary, Dataset, vocab_tags
from general_utils import get_logger
from model import Model
from config import Config
import os
import sys

if not os.path.exists(Config.output_path):
    os.makedirs(Config.output_path)

# vocab_words = load_vocab(Config.words_vocab)
# vocab_tags = load_vocab(Config.tags_vocab)
vocab_words = read_dictionary(Config.words_vocab)

# print(vocab_words)
# print(vocab_tags)
# sys.exit(0)

test = Dataset(Config.test_path, Config.test_tgt_path, Config.max_iter)
train = Dataset(Config.source_path, Config.source_tgt_path, Config.max_iter)

logger = get_logger(Config.log_path)

model = Model(Config,
              ntags=len(vocab_tags),
              n_words=len(vocab_words),
              logger=logger)

model.build()

model.train(train, test, vocab_tags, vocab_words)
Example #11
    cost, ev = model.test(ds_test)
    accuracies = [ev['accuracy']]
    for eps in epss[1:]:
        print("Creating adversarial examples...")
        clip_max = (255 - np.max(Cifar10Loader.mean)) / np.max(
            Cifar10Loader.std)
        n_fgsm = fgsm(model.nodes.input,
                      model.nodes.probs,
                      eps=eps,
                      clip_min=-clip_max,
                      clip_max=clip_max)
        images_adv, = batch_eval(model._sess, [model.nodes.input], [n_fgsm],
                                 [ds_test.images[:model.batch_size * 64]],
                                 args={'batch_size': model.batch_size},
                                 feed={model._is_training: False})
        adv_ds_test = Dataset(images_adv, ds_test.labels, ds_test.class_count)
        cost, ev = model.test(adv_ds_test)
        accuracies.append(ev['accuracy'])
    accuracieses.append(accuracies)
    print(accuracies)


def plot(epss, curves, names):
    plt.figure()
    plt.rcParams["mathtext.fontset"] = "cm"
    #plt.yticks(np.arange(0, 1, 0.05))
    axes = plt.gca()
    axes.grid(color='0.9', linestyle='-', linewidth=1)
    axes.set_ylim([0, 1])
    axes.set_xlim([0, top / 100])
    for c, n in zip(curves, names):
Example #12
def train_pos(args):
    src_embedding = None
    target_embedding = None
    logger = get_logger(args.log)
    logger.info('Model Type: {}'.format(args.type))
    if os.path.exists(args.config) and (not args.config == 'debug.json'):
        logger.info('Loading config from {}'.format(args.config))
        config = json.load(open(args.config, 'r'))
        try:
            vocab_word = pickle.load(open(config['word'], 'rb'))
            vocab_tag = pickle.load(open(config['tag'], 'rb'))
            target_vocab_word = pickle.load(open(config['target_word'], 'rb'))

            assert len(vocab_word) == config['nword']
            assert len(vocab_tag) == config['ntag']
            assert len(target_vocab_word) == config['ntarword']

            if args.use_pretrain_src:
                _, src_embedding = load_pre_train(args.src_embedding)

            if args.use_pretrain_target:
                _, target_embedding = load_pre_train(args.target_embedding)

        except Exception as e:
            logger.error(e)
            exit(1)
    else:
        if args.use_pretrain_src:
            pre_dictionary, src_embedding = load_pre_train(args.src_embedding)
            vocab_word, vocab_tag = load_vocab(args.train_file, pre_dictionary)
        else:
            vocab_word, vocab_tag = load_vocab(args.train_file)

        if args.use_pretrain_target:
            pre_dictionary, target_embedding = load_pre_train(
                args.target_embedding)
            target_vocab_word, _ = load_vocab(args.train_file, pre_dictionary)
        else:
            target_vocab_word, _ = load_vocab(args.target_train_file)

        i = 0
        while os.path.exists('./.cache/vocab_{}.pickle'.format(
                str(i))) or os.path.exists('./.cache/tag_{}.pickle'.format(
                    str(i))):
            i += 1
        if not os.path.exists('./.cache'):
            os.makedirs('./.cache')
        with open('./.cache/vocab_{}.pickle'.format(str(i)),
                  'wb') as vocab, open('./.cache/tag_{}.pickle'.format(
                      str(i)), 'wb') as tag, open(
                          './.cache/target_vocab_{}.pickle'.format(str(i)),
                          'wb') as tar_vocab:
            pickle.dump(vocab_word, vocab)
            pickle.dump(vocab_tag, tag)
            pickle.dump(target_vocab_word, tar_vocab)

        with open(args.config, 'w+') as config:
            json.dump(
                {
                    'word': './.cache/vocab_{}.pickle'.format(str(i)),
                    'tag': './.cache/tag_{}.pickle'.format(str(i)),
                    'target_word': './.cache/target_vocab_{}.pickle'.format(
                        str(i)),
                    'nword': len(vocab_word),
                    'ntag': len(vocab_tag),
                    'ntarword': len(target_vocab_word)
                },
                config,
                indent='\t')

    nword = len(vocab_word)
    ntag = len(vocab_tag)
    ntarword = len(target_vocab_word)

    logger.info("Src:    {}  {}".format(nword, ntag))
    logger.info("Target: {}".format(ntarword))
    logger.info("Flag:   {}".format(args.flag))
    logger.info(
        "Src embed trainable: {}".format(not args.disable_src_embed_training))
    logger.info("\ntrain:{}\ndev  :{}\ntest :{}\n\n".format(
        args.train_file, args.dev_file, args.test_file))
    logger.info("\nTarget: \ntrain:{}\ndev  :{}\ntest :{}\n".format(
        args.target_train_file, args.target_dev_file, args.target_test_file))
    logger.info("MSG:   {}\n".format(args.msg))
    logger.info("lr_ratio: {}\n".format(str(args.lr_ratio)))
    logger.info("penalty_ratio: {}\n".format(str(args.penalty_ratio)))
    logger.info("penalty: {}\n".format(str(args.penalty)))

    processing_word = get_processing(vocab_word)
    processing_tag = get_processing(vocab_tag)
    processing_target_word = get_processing(target_vocab_word)

    src_train = Dataset(args.train_file, processing_word, processing_tag, None)
    src_dev = Dataset(args.dev_file, processing_word, processing_tag, None)
    src_test = Dataset(args.test_file, processing_word, processing_tag, None)

    target_train = Dataset(args.target_train_file, processing_target_word,
                           processing_tag)
    target_dev = Dataset(args.target_dev_file, processing_target_word,
                         processing_tag)
    target_test = Dataset(args.target_test_file, processing_target_word,
                          processing_tag)

    src_len = len(src_train)
    target_len = len(target_train)
    ratio = target_len / (src_len + target_len)
    logger.info("\nsrc:    {}\ntarget: {}\n".format(src_len, target_len))

    # ratio = 0.1 if ratio < 0.1 else ratio
    target_batch_size = int(ratio * args.batch_size)
    target_batch_size = 1 if target_batch_size < 1 else target_batch_size
    src_batch_size = args.batch_size - target_batch_size
    logger.info("\nsrc_batch_size: {}\ntarget_batch_size: {}".format(
        src_batch_size, target_batch_size))
    assert target_batch_size >= 0
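    # Worked example (illustrative numbers): with src_len = 9000, target_len = 1000 and
    # args.batch_size = 20, ratio = 1000 / 10000 = 0.1, so target_batch_size = int(2.0) = 2
    # and src_batch_size = 18; the guard above keeps at least one target example per batch.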

    model = Model(args,
                  ntag,
                  nword,
                  ntarwords=ntarword,
                  src_embedding=src_embedding,
                  target_embedding=target_embedding,
                  logger=logger,
                  src_batch_size=src_batch_size)

    model.build()
    try:
        print("========If !!! it's debugging!==========")
        print(args.debug)
        if args.debug:
            print("========it's debugging!==========")
            model.train(src_dev, src_dev, vocab_tag, target_dev, target_dev,
                        target_test, src_batch_size, target_batch_size)
        else:
            # model.train(src_train, src_dev, vocab_tag, target_train, target_dev, src_batch_size, target_batch_size)
            model.train(src_train, src_dev, vocab_tag, target_train,
                        target_dev, target_test, src_batch_size,
                        target_batch_size)
    except KeyboardInterrupt:
        model.evaluate(target_dev, vocab_tag, target='target')
Example #13
		}, 'checkpoints/saved_model.pth')



train_data = load_data('preprocessed_data/train_data.json', thres, max_plen)[0:100000]

print('Done loading Training data.')


train_params = {'batch_size': 32,
                'shuffle': True,
                'num_workers': 32,
                'pin_memory': True}


training_set = Dataset(train_data, max_plen, max_qlen, data_dir, glove_vec_size)
training_generator = DataLoader(training_set, **train_params)

cuda = torch.cuda.is_available()

device = torch.device('cpu')
if(cuda):
	device = torch.device('cuda')

config = Config(glove_vec_size, elmo_options, elmo_weights, elmo_emb_size, hidden_size, max_plen, max_qlen, num_para, device)
model = Model(config)

if(cuda):
	model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
Example #14
emb_matrix, word2idx, idx2word = load_emb_matrix('/vocab.txt', glove_vec_size, glove_word2vec)
print('Done loading emb_matrix.')

eval_data = load_data(data_dir + '/msmarco/dev_v2.1.json')[0:25000]

print('Done loading Eval data.')


eval_params = {'batch_size': 32,
               'shuffle': False,
               'num_workers': 32,
               'pin_memory': True}


eval_set = Dataset(eval_data, max_plen, max_qlen, max_ans_len, glove_vec_size, glove_word2vec, word2idx, qp_vocab_size, mode='predict')
eval_generator = DataLoader(eval_set, **eval_params)



config = Config(glove_vec_size,
				elmo_options,
				elmo_weights,
				elmo_emb_size,
				common_vocab_size,
				qp_vocab_size,
				hidden_size, 
				max_plen, 
				max_qlen, 
				max_ans_len,
				num_para, 
Example #15
train_docs, val_docs = docs[train_doc_ids], docs[val_doc_ids]
print(len(docs))

epochs = 10
num_cates = max(ent2idx.values()) + 1
sent_len = 64
vocab_size = 3000
emb_size = 256
sent_pad = 10
sent_extractor = SentenceExtractor(window_size=sent_len, pad_size=sent_pad)

train_sents = sent_extractor(train_docs)
val_sents = sent_extractor(val_docs)
test_sents = sent_extractor(test_docs)

train_data = Dataset(train_sents, cate2idx=ent2idx)
train_data.build_vocab_dict(vocab_size=vocab_size)

with open('word2idx.json', 'w') as f:
    f.write(str(train_data.word2idx))
val_data = Dataset(val_sents, word2idx=train_data.word2idx, cate2idx=ent2idx)
val_X, _ = val_data[:]
test_data = Dataset(test_sents, word2idx=train_data.word2idx, cate2idx=ent2idx)
test_X, _ = test_data[:]

vocab_size = len(train_data.word2idx)

w2v_train_sents = []
for doc in docs:
    w2v_train_sents.append(list(doc.text))
w2v_model = Word2Vec(w2v_train_sents, size=emb_size)
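# A hedged sketch (not part of the original excerpt, numpy assumed imported as np) of how
# the trained vectors could be arranged into the w2v_embeddings.npy matrix loaded in
# Example #7; the index layout (row 0 reserved for padding, OOV rows left as zeros) is an
# assumption.
w2v_embeddings = np.zeros((vocab_size + 1, emb_size))
for word, idx in train_data.word2idx.items():
    if word in w2v_model.wv:
        w2v_embeddings[idx] = w2v_model.wv[word]
np.save('w2v_embeddings.npy', w2v_embeddings)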
Example #16
def train():

    submission = FLAGS.submission == "True"
    mean_subtraction = FLAGS.mean_subtraction == "True"
    augmentation = FLAGS.augmentation == "True"

    # For reproducible research :)
    tf.set_random_seed(42)
    np.random.seed(42)
    if FLAGS.architecture == "vgg":
        IMG_SIZE = 224
    else:
        IMG_SIZE = 227

    dataset = Dataset(rescale_imgs=True,
                      img_shape=(IMG_SIZE, IMG_SIZE),
                      submission=submission,
                      mean_subtraction=mean_subtraction)
    if not submission:
        val_images, val_labels = dataset.val, dataset.val_labels

    with tf.variable_scope("Input"):
        input = tf.placeholder(tf.float32, [None, IMG_SIZE, IMG_SIZE, 3],
                               name='input')
        labels = tf.placeholder(tf.float32, shape=(None, 10), name='labels')
        e = tf.placeholder(tf.float32, name='epoch')

    if FLAGS.architecture == "alexnet":
        net = AnimalNet_Alex(num_classes=10,
                             dropout_rate=FLAGS.dropout,
                             regularization_scale=FLAGS.l2_reg,
                             refine_after=FLAGS.refine_after)
    elif FLAGS.architecture == "vgg":
        net = AnimalNet_VGG(num_classes=10,
                            dropout_rate=FLAGS.dropout,
                            regularization_scale=FLAGS.l2_reg,
                            refine_after=FLAGS.refine_after)
    elif FLAGS.architecture == 'ensemb':
        net = AnimalNet_Ensemb(num_classes=10,
                               dropout_rate=FLAGS.dropout,
                               regularization_scale=FLAGS.l2_reg,
                               refine_after=FLAGS.refine_after)

    stop_training_op = net._set_training_op(False)
    start_training_op = net._set_training_op(True)
    if FLAGS.architecture == 'ensemb':
        logits = net.inference(input, e)
    else:
        logits = net.inference(input, e, FLAGS.feature_layer)
    loss_op = net.loss(logits, labels)
    acc_op = net.accuracy(logits, labels)

    # Training operation
    train_op = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(loss_op)

    summary = tf.merge_all_summaries()

    if not tf.gfile.Exists(FLAGS.log_dir):
        tf.gfile.MakeDirs(FLAGS.log_dir)

    sess = tf.Session()

    # Initialize variables
    init_op = tf.initialize_all_variables()
    sess.run(init_op)
    sess.run(net.assign_ops)

    train_writer = tf.train.SummaryWriter(FLAGS.log_dir + '/train')
    test_writer = tf.train.SummaryWriter(FLAGS.log_dir + '/test')

    for epoch in range(FLAGS.max_steps):
        train_images, train_labels = dataset.get_batch(FLAGS.batch_size)
        train_feed_dict = {input: train_images, labels: train_labels, e: epoch}

        if (epoch + 1) % FLAGS.print_freq == 0:
            _, loss, acc, summary_str = sess.run(
                [stop_training_op, loss_op, acc_op, summary], train_feed_dict)
            train_writer.add_summary(summary_str, epoch)
            train_writer.flush()

            print("********** Step :", epoch, "of", FLAGS.max_steps,
                  "**********")

            print("Train set accuracy is: ", acc)
            print("Train set loss is: ", loss)
            print("--------------------------------------------------")

            if not submission:
                # ENSEMB WON'T FIT INTO MEMORY :/ - if this is uncommented then there is
                # also a problem with the confusion matrix.
                # ------------------------
                # predictions = []
                # for i in range(4):
                #     val_feed_dict = {input: val_images[i * 50:(i + 1) * 50],
                #                      e: epoch}
                #     predictions.append(sess.run(logits, val_feed_dict))
                # predictions = np.concatenate(predictions)
                # predictions = np.argmax(predictions, 1)
                #
                # correct_prediction = (predictions == np.argmax(val_labels, 1)) * 1.0
                # acc = np.mean(correct_prediction)
                val_feed_dict = {
                    input: val_images,
                    labels: val_labels,
                    e: epoch
                }
                loss, acc, summary_str = sess.run([loss_op, acc_op, summary],
                                                  val_feed_dict)

                test_writer.add_summary(summary_str, epoch)
                test_writer.flush()
                print("Validation set accuracy is: ", acc)
                print("--------------------------------------------------")

        else:
            sess.run([start_training_op, train_op], train_feed_dict)

        # TODO model saving/loading

    if not submission:
        cnf_matrix = net.get_confusion_matrix(logits, labels, sess,
                                              val_feed_dict)
        plot_confusion_matrix(cnf_matrix,
                              title='Confusion matrix, without normalization')

        imgs = net.get_problematic_photos(logits, labels, sess, val_feed_dict,
                                          val_images, 5)
        plot_imgs(imgs)

    # We save the predictions to file
    if submission:
        test_p_file = open(FLAGS.submission_filename, 'w')
        test_p_file.write('ImageName,Prediction\n')
        # VGG problem -> not enough memory
        # we need to split test data to batches
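        # (the 5 * 100 slicing below assumes the test split holds exactly 500 images)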
        predictions = []
        for i in range(5):
            test_feed_dict = {input: dataset.test[i * 100:(i + 1) * 100], e: 0}
            predictions.append(sess.run(logits, test_feed_dict))
        predictions = np.concatenate(predictions)
        test_labels = np.argmax(predictions, 1)

        for i, image in enumerate(dataset.testimages):
            test_p_file.write(image + ',' + str(test_labels[i]) + '\n')
        test_p_file.close()
Example #17
    print("INDEX: %s" % task_index)

cluster = tf.train.ClusterSpec(cluster_spec)
server = tf.train.Server(cluster, job_name=role, task_index=task_index)
if role == "ps":
    server.join()
else:
    ps_device = '/job:ps/task:0'
    """
    Start either train or eval. Note the hardcoded path components for the training and eval data.
    """
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps._set("num_gpus", FLAGS.num_gpus)
    print('*****HYPER PARAMETERS*****')
    print(hps)
    print('**************************')

    vocab = Vocabulary.from_file(
        os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))

    if FLAGS.mode == "train":
        #hps.batch_size = 256
        dataset = Dataset(
            vocab,
            os.path.join(FLAGS.datadir,
                         "training-monolingual.tokenized.shuffled/*"))
        run_train(dataset,
                  hps,
                  os.path.join(FLAGS.logdir, "train"),
                  ps_device=ps_device)
Example #18
# encoding=utf-8

from contact_bi_model import BiContactRepr, ReprClf, LstmclfModel
from data_utils import Dataset, MultiDataset, SampleDataset, sequence_label_line_processing
import config
from work_utils import train as train_func


# line_processing_function
en_process = sequence_label_line_processing(config.vocab_path_en, config.tag_vocab_path)
zh_process = sequence_label_line_processing(config.vocab_path_zh, config.tag_vocab_path)

# data
train_en = Dataset(config.train_path_en, en_process, len=config.batches)
train_zh = Dataset(config.train_path_zh, zh_process, len=0)
train = SampleDataset([train_en, train_zh], config.sampling_ratio)
# return train_en_sequence, en_sequence_lenth, tag_en, 
#        train_zh_sequence, zh_sequence_lenth, tag_zh,
#        mask([0,0],[0,1],[1,0],[1,1])

dev_en = Dataset(config.dev_path_en, en_process, len=config.dev_batches)
dev_zh = Dataset(config.dev_path_zh, zh_process, len=0)
dev = SampleDataset([dev_en, dev_zh], config.sampling_ratio)
k=0
#raw_input('wtf')
'''
for k,input_data in enumerate(train.epoch_data()):
    print k
    print k,'-'*30
    print input_data[2]
    raw_input('TARGET')
Example #19
        return results_sel_para, results_pred_start, results_pred_end


dev_data = load_data('preprocessed_data/dev_data.json', thres,
                     max_plen)[0:25000]

print('Done loading dev data.')

params = {
    'batch_size': 32,
    'shuffle': False,
    'num_workers': 32,
    'pin_memory': True
}

dev_set = Dataset(dev_data, max_plen, max_qlen, data_dir, glove_vec_size)
dev_generator = DataLoader(dev_set, **params)

config = Config(glove_vec_size, elmo_options, elmo_weights, elmo_emb_size,
                hidden_size, max_plen, max_qlen, num_para, device)
model = Model(config)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

if (cuda):
    model = model.to(device)

checkpoint = torch.load('checkpoints/saved_model.pth')

model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
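# Note: torch.load above assumes the checkpoint is loadable on the current device; loading
# a GPU-saved checkpoint on a CPU-only machine would need
# torch.load('checkpoints/saved_model.pth', map_location=device).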
Example #20
os.makedirs(directoryOut)
os.makedirs(directoryData)
os.makedirs(directoryCkpt)
os.makedirs(directoryOutLogs)

num_words = None

seq_len = 25
batch_size = 16
valid_batch_size = 16  ## Needs to be smaller due to memory issues
embed_size = 64
num_epochs = 20
hidden_size = 64
num_layers = 1

dataset = Dataset(data_dir, num_words)
dataset.set_batch_size(batch_size)
dataset.set_seq_len(seq_len)
dataset.save(dataset_specific_info)

params = {}

#take account of the 0 token for padding
params['vocab_size'] = dataset.vocab_size + 1
params['num_classes'] = dataset.vocab_size
params['batch_size'] = batch_size
params['valid_batch_size'] = valid_batch_size
params['seq_len'] = seq_len
params['hidden_dim'] = hidden_size
params['num_layers'] = num_layers
params['embed_size'] = embed_size
Example #21
    plt.axis('off')


netG = Generator(name="dcgan_g_html")
netD = Discriminator(name="dcgan_d_html")

loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss()

real_label = nd.ones((batch_size, ), ctx=ctx)
fake_label = nd.zeros((batch_size, ), ctx=ctx)

img_list = [
    os.path.join(data_path, x) for x in os.listdir(data_path)
    if x.endswith('png')
]
train_data = Dataset(img_list, img_dims, batch_size=batch_size)


def init_params():
    netG.initialize(mx.init.Normal(0.02), ctx=ctx)
    netD.initialize(mx.init.Normal(0.02), ctx=ctx)


def load_weights():
    netG.load_params(ctx=ctx)
    netD.load_params(ctx=ctx)


def init_optimizers():
    trainerG = mx.gluon.Trainer(netG.collect_params(), 'adam', {
        'learning_rate': lr,
Example #22
def main():
    path_embedding_glove = './glove.6B.100d.txt'
    path_dataset_train = './datasets/restaurants_train.json'
    #path_dataset_train = './datasets/mini.json'
    path_dataset_trial = './datasets/restaurants_trial.json'
    path_study_cases = './datasets/study_cases.json'
    path_saved = './saved_at/'
    path_log = './log_at.txt'

    embedding = EmbeddingGlove(path_embedding_glove)
    dataset_train = Dataset(path_dataset_train, embedding)
    dataset_trial = Dataset(path_dataset_trial, embedding)
    study_cases = Dataset(path_study_cases, embedding)

    max_sentence_len_train = dataset_train.metadata.max_sentence_len
    max_sentence_len_trial = dataset_trial.metadata.max_sentence_len

    max_aspect_len_train = dataset_train.metadata.max_aspect_len
    max_aspect_len_trial = dataset_trial.metadata.max_aspect_len

    # ======================================================================

    embedding_matrix = torch.tensor(embedding.matrix, dtype=torch.float)
    embedding_dim = embedding.embedding_dim
    hidden_dim = 150
    polarity_dim = 3
    batch_size = 40
    max_sentence_len = max(max_sentence_len_train, max_sentence_len_trial)
    max_aspect_len = max(max_aspect_len_train, max_aspect_len_trial)
    epochs = 40
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print('embedding_dim: ' + str(embedding_dim))
    print('hidden_dim: ' + str(hidden_dim))
    print('polarity_dim: ' + str(polarity_dim))
    print('batch_size: ' + str(batch_size))
    print('max_sentence_len: ' + str(max_sentence_len))
    print('max_aspect_len: ' + str(max_aspect_len))
    print('epochs: ' + str(epochs))
    print('device: ' + str(device))

    # ======================================================================

    batches_train_sentences, batches_train_aspects, batches_train_polarities = dataset_train.GenerateBatches(
        batch_size, max_sentence_len, max_aspect_len)

    batches_trial_sentences, batches_trial_aspects, batches_trial_polarities = dataset_trial.GenerateBatches(
        batch_size, max_sentence_len, max_aspect_len)

    study_cases_sentences, study_cases_aspects, study_cases_polarities = study_cases.GenerateBatches(
        batch_size, max_sentence_len, max_aspect_len)

    num_batches = len(batches_train_sentences)

    # ======================================================================

    model = AT(embedding_matrix, embedding_dim, hidden_dim, polarity_dim,
               max_sentence_len, max_aspect_len)
    model.to(device)
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # ======================================================================

    train = False
    file_name_saved = 'aoa_epoch38_acuracy0.951239224137931'

    if train:
        file_log = open(path_log, 'w')
        max_acuracy = 0.0

        for epoch in range(epochs):
            print('\n========== Epoch ' + str(epoch) + ' ==========')

            model.train()

            for i in range(num_batches):
                optimizer.zero_grad()

                batch_sentences = batches_train_sentences[i]
                batch_sentences = torch.tensor(batch_sentences,
                                               dtype=torch.long).to(device)

                batch_aspects = batches_train_aspects[i]
                batch_aspects = torch.tensor(batch_aspects,
                                             dtype=torch.long).to(device)

                batch_polarities = batches_train_polarities[i]
                batch_polarities = torch.tensor(batch_polarities,
                                                dtype=torch.long).to(device)

                prediction, _ = model(batch_sentences, batch_aspects)

                loss = loss_function(prediction, batch_polarities)
                loss.backward()
                optimizer.step()

            acuracy, f1, _ = CalculateAcuracyF1(model, device,
                                                batches_train_sentences,
                                                batches_train_aspects,
                                                batches_train_polarities)

            print('acuracy train: ' + str(acuracy))
            print('f1 train: ' + str(f1))

            file_log.write('epoch: ' + str(epoch) + '\n')
            file_log.write('acuracy_train: ' + str(acuracy) + ' f1_train: ' +
                           str(f1) + '\n')

            if acuracy >= max_acuracy:
                max_acuracy = acuracy
                file_name_saved = 'at_epoch' + str(epoch) + '_acuracy' + str(
                    acuracy)
                torch.save(model.state_dict(), path_saved + file_name_saved)
                print('saved: ' + path_saved + file_name_saved)

        file_log.close()

    else:
        print('\n========== Load saved ==========')

        model.load_state_dict(torch.load(path_saved + file_name_saved))
        print('load: ' + path_saved + file_name_saved)

        acuracy, f1, _ = CalculateAcuracyF1(model, device,
                                            batches_train_sentences,
                                            batches_train_aspects,
                                            batches_train_polarities)
        print('acuracy train: ' + str(acuracy))
        print('f1 train: ' + str(f1))

# ======================================================================

    print('\n********** Trial dataset **********')

    acuracy, f1, indices_failures = CalculateAcuracyF1(
        model, device, batches_trial_sentences, batches_trial_aspects,
        batches_trial_polarities)
    print('acuracy trial: ' + str(acuracy))
    print('f1 trial: ' + str(f1))
    print('indices failures:')
    print(indices_failures)

    for index in indices_failures:
        print(dataset_trial.opinions[index])

# ======================================================================

    print('\n********** Study cases **********')

    with torch.no_grad():
        for i in range(len(study_cases_sentences)):
            batch_sentences = study_cases_sentences[i]
            batch_sentences = torch.tensor(batch_sentences,
                                           dtype=torch.long).to(device)

            batch_aspects = study_cases_aspects[i]
            batch_aspects = torch.tensor(batch_aspects,
                                         dtype=torch.long).to(device)

            batch_polarities = study_cases_polarities[i]

            prediction, attention = model(batch_sentences, batch_aspects)

            print('Sentences: ')
            print(batch_sentences)
            print('Aspects: ')
            print(batch_aspects)
            print('Polarities: ')
            print(batch_polarities)
            print('Prediction: ')
            print(prediction)
            print('Attention: ')
            print(attention.squeeze(-1))
Example #23
				f.write('\n')
		
		avg_loss = dev_loss/len(data)
		print(avg_loss)
		f.close()
		return invalid



data = load_data(data_dir + '/msmarco/' + args.data +'_v2.1.json')

params = {'batch_size': 256,
          'shuffle': False,
          'num_workers': 16}

dev_set = Dataset(data, max_plen, max_qlen, glove_vec_size, data_dir)
dev_generator = utils.data.DataLoader(dev_set, **params)

device = torch.device('cpu')

cuda = torch.cuda.is_available()
if(cuda):
	device = torch.device('cuda')

config = Config(glove_vec_size, hidden_size, max_plen, max_qlen, num_para, device)


model = Model(config)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

if(cuda):
Example #24
def main():
    config = json.load(open("config.json"))
    dset = Dataset(config)
    dset.build()
    return
Example #25
emb_matrix, word2idx, idx2word = load_emb_matrix('/vocab.txt', glove_vec_size, glove_word2vec)
print('Done loading emb_matrix.')

train_data = load_data(data_dir + '/msmarco/train_v2.1.json', 35000, 15000)

print('Done loading Training data.')


train_params = {'batch_size': 32,
                'shuffle': True,
                'num_workers': 32,
                'pin_memory': True}


training_set = Dataset(train_data, max_plen, max_qlen, max_ans_len, glove_vec_size, glove_word2vec, word2idx, qp_vocab_size, mode='train')
training_generator = DataLoader(training_set, **train_params)


cuda = torch.cuda.is_available()

device = torch.device('cpu')
if(cuda):
	device = torch.device('cuda')


config = Config(glove_vec_size,
				elmo_options,
				elmo_weights,
				elmo_emb_size,
				common_vocab_size,
Example #26
vocab_words = load_vocab(config.words_filename)
vocab_tags = load_vocab(config.tags_filename)
vocab_chars = load_vocab(config.chars_filename)

# get processing functions
processing_word = get_processing_word(vocab_words,
                                      vocab_chars,
                                      lowercase=True,
                                      chars=config.chars)
processing_tag = get_processing_word(vocab_tags, lowercase=False)

# get pre trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create dataset
dev = Dataset(clear_data_path(config.dev_filename), processing_word,
              processing_tag, config.max_iter)
test = Dataset(clear_data_path(config.test_filename), processing_word,
               processing_tag, config.max_iter)
train = Dataset(clear_data_path(config.train_filename), processing_word,
                processing_tag, config.max_iter)

# build model
model = NERModel(config,
                 embeddings,
                 ntags=len(vocab_tags),
                 nchars=len(vocab_chars))
# build graph
model.build()

# train, evaluate and interact
model.train(train, dev, vocab_tags)
Example #27
                yield (s, sl, q, a, al)
                del (batch_s[:], batch_sl[:], batch_q[:], batch_a[:],
                     batch_al[:])

        #batch = [(s,sl,q,a,al) for s,sl,q,a,al in zip(batch_s,batch_sl,batch_q,batch_a,batch_al)]
        #batch = sorted(batch, key=lambda tup:len(tup[0]),reverse=True)
        #s,sl,q,a,al = zip(*batch)
        #if len(batch_s) == batch_size:
        #        yield (s,sl,q,a,al)


if __name__ == '__main__':

    from data_utils import Dataset

    for i in np.arange(14, 20):
        print('start')
        data = Dataset(i + 1)
        data.preprocess('train')
        data.preprocess('valid')
        data.preprocess('test')
        pickle.dump(data, open('data/qa' + str(i + 1) + '.pickle', 'wb'))
        print(i)
#dataset = pickle.load(open('data/qa2.pickle','rb'))

#for idx, (s, sl, q, a, al) in enumerate(dataset.data_loader('train')):
#print(s[0].shape)
#print(sl[0])
#print(q[1])
#print(dataset.idx2word(a))
Example #28
        buffer = []
        for line in tqdm(f, total=L):
            buffer.append(line.strip())
            if len(buffer) > 40000:
                buffer = ' '.join(buffer)
                tokens = list(en.tokenizer(buffer.lower()))
                buffer = []
                for w in tokens:
                    vocab[w.text] += 1

        # One last time to clean the buffer
        buffer = ' '.join(buffer)
        tokens = list(en.tokenizer(buffer.lower()))
        buffer = []
        for w in tokens:
            vocab[w.text] += 1


#download_data()

lang_file = "./models/wiki103.large.lang"
if not os.path.isfile(lang_file):
    print("Creating vocab file...")
    en_lang = Dataset('wiki')
    en_lang.buildLang(open(data_path + f'wiki.{split}.tokens'),
                      num_lines=train_lines)
    with open(lang_file, 'wb') as f:
        pickle.dump(en_lang, f)
else:
    print("Loading vocab file...")
    en_lang = pickle.load(open('./models/wiki103.large.lang', 'rb'))
Example #29
def main():

    # Read datasets
    data = Dataset(args.DATA_DIR)
    sents, tags = data.get_all_data()

    # Construct the model
    MyModel = BiLSTMModel(args.MAX_SEQ_LEN, args.EMBEDDING,
                          args.LSTM_HIDDEN_UNITS, args.LSTM_DENSE_DIM,
                          data.get_nwords(), data.get_ntags())
    model = MyModel.define_model()

    num_train_sents = len(data.train_sents)
    num_val_sents = len(data.val_sents)
    num_test_sents = len(data.test_sents)

    print(
        "# of train sents = {0}\n# of val sents = {1}\n# of test sents = {2}".format(
            num_train_sents, num_val_sents, num_test_sents),
        flush=True)

    # indexes to train, val and test data
    partition = {
        "train": list(range(num_train_sents)),
        "val": list(range(num_val_sents)),
        "test": list(range(num_test_sents))
    }

    # Parameters
    params = {
        'dim': args.MAX_SEQ_LEN,
        'batch_size': args.BATCH_SIZE,
        'n_classes': data.get_ntags(),
        'shuffle': True,
        'word2idx': data.get_word2idx(),
        'tag2idx': data.get_tag2idx()
    }

    # Generators
    training_generator = DG.DataGenerator(partition['train'], data.train_sents,
                                          data.train_tags, **params)
    validation_generator = DG.DataGenerator(partition['val'], data.val_sents,
                                            data.val_tags, **params)

    # Train model on dataset
    history = model.fit_generator(generator=training_generator,
                                  validation_data=validation_generator,
                                  use_multiprocessing=True,
                                  epochs=args.NUM_EPOCHS,
                                  verbose=1)

    # Parameters
    params_test = {
        'dim': args.MAX_SEQ_LEN,
        'batch_size': 1,
        'n_classes': data.get_ntags(),
        'shuffle': False,
        'word2idx': data.get_word2idx(),
        'tag2idx': data.get_tag2idx()
    }

    # Make predictions
    testing_generator = DG.DataGenerator(partition['test'], data.test_sents,
                                         data.test_tags, **params_test)

    pred_test = model.predict_generator(generator=testing_generator,
                                        steps=num_test_sents)
    pred_test = np.argmax(pred_test, axis=-1)

    # print(pred_test.shape)

    def pad(x):
        x1 = [
            tgs + ([data.get_tag2idx()["PAD"]] * (args.MAX_SEQ_LEN - len(tgs)))
            for tgs in x
        ]
        x2 = [tgs[:args.MAX_SEQ_LEN] for tgs in x1]
        return np.array(x2)

    test_tags_padded = pad(data.test_tags)

    # print(test_tags_padded.shape)

    def get_measures(yTrue, yPred):
        y1 = yTrue.reshape(1, -1).squeeze()
        y2 = yPred.reshape(1, -1).squeeze()

        P = precision_score(y1, y2, average=None)
        R = recall_score(y1, y2, average=None)
        F1 = f1_score(y1, y2, average=None)

        print("Precision=", flush=True)
        print(P, flush=True)
        print("Recall=", flush=True)
        print(R, flush=True)
        print("F1 score=", flush=True)
        print(F1, flush=True)

    print("Test...", flush=True)
    get_measures(test_tags_padded, pred_test)
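# A standalone sketch (not part of the original) of the pad() helper above, with explicit
# arguments so its truncate-or-pad behaviour can be checked on toy data; numpy is assumed
# imported as np.
def pad_example(tag_seqs, max_len, pad_idx):
    padded = [tgs + [pad_idx] * (max_len - len(tgs)) for tgs in tag_seqs]
    return np.array([tgs[:max_len] for tgs in padded])

# pad_example([[3, 1], [2, 2, 2, 2, 2, 2]], max_len=5, pad_idx=0)
# -> [[3, 1, 0, 0, 0], [2, 2, 2, 2, 2]]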
Example #30
import segmentation_models as sm

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

x_train_path = '/home/mat/ug/16123001/data/x_train/folder/'
y_train_path = '/home/mat/ug/16123001/data/y_train/folder/'
x_val_path = '/home/mat/ug/16123001/data/x_val/folder/'
y_val_path = '/home/mat/ug/16123001/data/y_val/folder/'

# classes for data loading and preprocessing
with tf.device('/gpu:3'):

    dataset = Dataset(x_train_path, y_train_path, classes=['non-polyp', 'polyp'], augmentation=get_training_augmentation())
    
    BATCH_SIZE = 8
    CLASSES = ['non-polyp', 'polyp']
    LR = 0.0001
    EPOCHS = 25
    IMAGE_ORDERING = 'channels_last'
    n_classes = 2
    
    # SOTA
    BACKBONE = 'resnet34'
    # define model
    model = sm.Unet(BACKBONE, encoder_weights='imagenet')
    model = sm.Linknet(BACKBONE, encoder_weights='imagenet')
    model = sm.FPN(BACKBONE, encoder_weights='imagenet')
    model = sm.PSPNet(BACKBONE, encoder_weights='imagenet')
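    # Note: the four model assignments above overwrite each other, so only the last one
    # (sm.PSPNet) is kept; the earlier lines serve as alternatives to switch between by
    # commenting out.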