Example #1
def train(opts, **kwargs):
    """
    Train a model
    Parameters
    ----------
    opts
        a config - nested dictionary with options
    Returns
    -------
    """
    m = Model(name=opts['model_name'], **opts['model_params'], **kwargs)
    connector = S3InMemoryConnector(bucket=BUCKET, prefix=MODEL_PREFIX)

    # instantiate dataset and data iterator
    dataset = Dataset(**opts['input_data_params'])
    records = dataset.get_data()

    # check data
    if check_data(records, getattr(dataset, 'validate_itm', None)):
        log.info("DATA OK")
    else:
        log.info("BAD DATA")
        return

    if opts.get('--update', False):
        m.load(connector=connector)
        m.update(records)
    else:
        m.train(records)

    log.info("DONE")

    # save model
    m.save(connector=connector)
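A hypothetical sketch of the nested opts dictionary this train() expects; only the key names referenced above ('model_name', 'model_params', 'input_data_params', '--update') come from the code, every value is a placeholder.

# Hypothetical usage sketch for train(); values are illustrative only.
opts = {
    'model_name': 'example_model',                 # passed to Model(name=...)
    'model_params': {'some_param': 1},             # unpacked into Model(**...)
    'input_data_params': {'source': 'raw-table'},  # unpacked into Dataset(**...)
    '--update': False,                             # True: load an existing model and update it
}
train(opts)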
Example #2
def upload_dataset():
    # check if the post request has the file part
    if 'file' not in request.files or not request.files['file'].filename:
        flash('No file specified')
        return redirect('/training')

    file = request.files['file']
    if not allowed_dataset_filename(file.filename):
        flash("Only .zip files are allowed")
        return redirect('/training')

    name = request.form['name']
    if not name:
        flash('Name of project cannot be empty')
        return redirect('/training')

    new_dataset = Dataset(user_id=session['user_id'], name=name)
    db.session.add(new_dataset)
    db.session.flush()
    filename = f'{new_dataset.dataset_id}_{secure_filename(file.filename)}'
    file.save(os.path.join(UPLOAD_FOLDER, filename))

    new_dataset.dataset_filename = filename
    new_dataset.state = Dataset.UPLOADED
    db.session.commit()

    return redirect('/training')
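Example #2 calls an allowed_dataset_filename() helper that is not shown; a minimal sketch, assuming it only enforces the .zip extension mentioned in the flash message:

# Hypothetical helper assumed by upload_dataset(): accept only .zip uploads.
def allowed_dataset_filename(filename):
    return '.' in filename and filename.rsplit('.', 1)[1].lower() == 'zip'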
Example #3
    def regenerate(self):
        import os, shutil
        if os.path.isdir(self.dest):
            shutil.rmtree(self.dest)
        os.makedirs(self.dest)

        from generators import PageGenerator
        from model import Dataset
        from output import WebFiles
        items = []
        data = Dataset('data')
        dest = WebFiles(self.dest, self.web_dest)
        for post in data.get_all_posts():
            dest_name = post.get_dest_name()
            page_gen = PageGenerator(post)
            html_content = page_gen.get_html()
            link = dest.publish(dest_name, html_content)
            items.append((post.get_title(), post.get_desc(), link))
        self.generate_index(items)
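The loop above only relies on a small WebFiles surface: publish(dest_name, html_content) writes the HTML under the destination directory and returns a web link. A minimal stand-in under that assumption (the real output.WebFiles may differ):

import os

# Hedged sketch of the publish() interface used by regenerate().
class WebFiles:
    def __init__(self, dest, web_dest):
        self.dest = dest          # filesystem directory to write into
        self.web_dest = web_dest  # base URL the files are served from

    def publish(self, dest_name, html_content):
        path = os.path.join(self.dest, dest_name)
        with open(path, 'w', encoding='utf-8') as f:
            f.write(html_content)
        return self.web_dest.rstrip('/') + '/' + dest_name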
Example #5
def main(args):

	torch.manual_seed(0)
	torch.backends.cudnn.deterministic = True
	torch.backends.cudnn.benchmark = False
	
	# Uncomment this if an HTTP error occurs while downloading MNIST

	# new_mirror = 'https://ossci-datasets.s3.amazonaws.com/mnist'
	# torchvision.datasets.MNIST.resources = [
	#    ('/'.join([new_mirror, url.split('/')[-1]]), md5)
	#    for url, md5 in torchvision.datasets.MNIST.resources
	# ]
	
	train_raw_dataset = torchvision.datasets.MNIST(root='./mnist', 
									train=True, 
									download=True,
									transform=tf.Compose([tf.ToTensor(), tf.Normalize((0.1307,), (0.3081,))]))
	
	valid_raw_dataset = torchvision.datasets.MNIST(root='./mnist', 
									train=False, 
									download=True, 
									transform=tf.Compose([tf.ToTensor(), tf.Normalize((0.1307,), (0.3081,))]))
	# Train and validate only on pictures of 1
	train_dataset = Dataset(train_raw_dataset, [1])
	valid_dataset = Dataset(valid_raw_dataset, [1])
	
	if args.gpu and torch.cuda.is_available():
		device = torch.device('cuda:0')
		print(f'Using GPU {torch.cuda.get_device_name()}')
		print(torch.cuda.get_device_properties(device))
	else:
		device = torch.device('cpu')
		print('Using CPU')
	
	if args.load_path:
		r_net_path = os.path.join(args.load_path, args.r_load_path)
		d_net_path = os.path.join(args.load_path, args.d_load_path)
		r_net = torch.load(r_net_path).to(device)
		print(f'Loaded R_Net from {r_net_path}')
		d_net = torch.load(d_net_path).to(device)
		print(f'Loaded D_Net from {d_net_path}')
	else:
		r_net = R_Net(in_channels = 1, std = args.std, skip = args.res, cat = args.cat).to(device)
		d_net = D_Net(in_resolution = (28, 28), in_channels = 1).to(device)
		print('Created models')
	
	# TRAINING PARAMETERS

	save_path = (args.save_path, args.r_save_path, args.d_save_path)
	optim_r_params = {'alpha' : 0.9, 'weight_decay' : 1e-9}
	optim_d_params = {'alpha' : 0.9, 'weight_decay' : 1e-9}

	model = train_model(r_net, d_net, train_dataset, valid_dataset, R_Loss, D_Loss, optimizer_class=torch.optim.RMSprop,
					device=device, batch_size=args.batch_size, optim_r_params=optim_r_params, optim_d_params=optim_d_params,
					learning_rate=args.lr, rec_loss_bound=args.rec_bound,
					save_step=args.sstep, num_workers=args.nw, save_path=save_path, lambd=args.lambd)
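Dataset(train_raw_dataset, [1]) above wraps a torchvision dataset and keeps only the listed digit labels; a hedged sketch of such a filtering wrapper, assuming that is all it does:

import torch

# Hypothetical label-filtering wrapper matching Dataset(train_raw_dataset, [1]).
class FilteredDataset(torch.utils.data.Dataset):
    def __init__(self, raw_dataset, keep_labels):
        self.raw_dataset = raw_dataset
        # torchvision MNIST exposes its labels as the `targets` tensor
        mask = torch.isin(raw_dataset.targets, torch.tensor(keep_labels))
        self.indices = torch.nonzero(mask).flatten().tolist()

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, i):
        return self.raw_dataset[self.indices[i]]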
Example #6
def main():
    # read the configuration file
    config = Configure()
    # set up the logger
    logger = get_logger(config['path_log'])

    # load the vocabularies
    vocab_util = Vocab_Util()
    # dict[word] = idx
    vocab_words = vocab_util.load_vocab(config['word_vocab_file'])
    # dict[char] = idx
    vocab_chars = vocab_util.load_vocab(config['char_vocab_file'])
    # dict[tag] = idx
    vocab_tags = vocab_util.load_vocab(config['tag_vocab_file'])
    # bundle the vocabularies for the model
    vocabs = [vocab_words, vocab_chars, vocab_tags]

    embeddings = vocab_util.get_trimmed_glove_vectors(config['trimmed_file'])

    # build the data processing functions
    processing_word = get_processing_word(vocab_words=vocab_words,
                                          vocab_chars=vocab_chars,
                                          lowercase=True,
                                          chars=config['use_chars'],
                                          allow_unk=True)
    processing_tag = get_processing_word(vocab_words=vocab_tags,
                                         lowercase=False,
                                         allow_unk=False)

    # build the training dataset
    train_dataset = Dataset(filename=config['train_data'],
                            max_iter=None,
                            processing_word=processing_word,
                            processing_tag=processing_tag)
    # build the dev dataset
    dev_dataset = Dataset(filename=config['dev_data'],
                          max_iter=None,
                          processing_word=processing_word,
                          processing_tag=processing_tag)

    # for data in train_dataset:
    #     print(data)
    for x_batch, y_batch in train_dataset.get_minibatch(4):
        print(x_batch)
        print(y_batch)

    # build the model for training
    model = ner_model(config, logger, vocabs, embeddings)
    # build the model graph
    model.build()
    # train
    model.train(train_dataset, dev_dataset)
Example #7
def knn_fetching_zo_1(query_nd, k, N_max, sort=True, verbose=2):
    assert (Dataset.root.n_dimensions() == query_nd.n_dimensions())
    tree = Dataset.root.tree()

    t_0 = time.time()

    debug_time = time.time()

    indices = np.zeros((query_nd.n_points(), k), dtype=int)
    for i, root_id in enumerate(query_nd.indices()):
        res = tree.get_nns_by_item(root_id, k, include_distances=False)
        indices[i, :] = res

    if verbose > 1:
        print('\tQuerying tree took {:.2f} s'.format(time.time() - debug_time))

    # Get the unique indices, and where they are in the array
    unique_idcs = np.unique(indices.flatten())

    if verbose > 1:
        print('\tSearched for {} neighbours of {} observations.'.format(
            k, query_nd.n_points()))
        print('\tFound {} observations ({} unique)'.format(
            indices.size, unique_idcs.size))

    if sort:
        unique_idcs.sort()

    query_result_data = Dataset.root.data()[unique_idcs, :]
    query_result = Dataset(query_result_data,
                           unique_idcs,
                           name='Query result.')

    if verbose > 1:
        print('\tFound {} unique observations for zoom-out.'.format(
            unique_idcs.size))

    if unique_idcs.size > N_max:
        if verbose > 1:
            print('\tSubsampling {} observations to {}.'.format(
                unique_idcs.size, N_max))
        dataset = query_result.random_sampling(N_max)
    else:
        dataset = query_result

    if verbose:
        print('knn_fetching_zo took {:.2f} seconds.\n'.format(time.time() -
                                                              t_0))

    return dataset
Example #8
def illumina_metadata_batch1(session,
                             p='../nov3/ILLUMINA_Metadata_Batch1.tsv'):
    count = 0
    for l in open(p):
        _, project, sample, _, _, run_id, project_title, *rest = l.strip(
        ).split('\t')
        if project == 'sample_id':
            continue

        prjs = list(session.query(Dataset).filter(Dataset.ena_id == project))
        if len(prjs) == 0:
            dataset = Dataset(ena_id=project, project_title=project_title)
            prj = dataset
            session.add(dataset)
            session.commit()
        prjs = list(session.query(Dataset).filter(Dataset.ena_id == project))

        assert len(prjs) == 1
        prj = prjs[0]

        runq = session.query(Run).filter_by(ena_id=run_id).first()
        if not runq:
            run = Run(ena_id=run_id, dataset_id=prj)
            prj.runs.append(run)
            count += 1
            session.add(run)

    print(f"adding {count} illumina runs")
    session.commit()
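The query / create / re-query / assert sequence above is effectively a get-or-create; a hedged refactor sketch using the same SQLAlchemy session API (the helper name is illustrative):

# Illustrative get-or-create for the pattern in Examples #8 and #11.
def get_or_create_dataset(session, ena_id, project_title):
    prj = session.query(Dataset).filter(Dataset.ena_id == ena_id).one_or_none()
    if prj is None:
        prj = Dataset(ena_id=ena_id, project_title=project_title)
        session.add(prj)
        session.commit()
    return prj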
Example #9
def main():
	'''
	Build the test set and evaluate it with a model restored from a previous session.
	'''
	# read the configuration file
	config = Configure()
	# set up the logger
	logger = get_logger(config['path_log'])
	# load the vocabularies
	vocab_util = Vocab_Util()
	# dict[word] = idx
	vocab_words = vocab_util.load_vocab(config['word_vocab_file'])
	# dict[char] = idx
	vocab_chars = vocab_util.load_vocab(config['char_vocab_file'])
	# dict[tag] = idx
	vocab_tags = vocab_util.load_vocab(config['tag_vocab_file'])
	# bundle the vocabularies for the model
	vocabs = [vocab_words, vocab_chars, vocab_tags]

	embeddings = vocab_util.get_trimmed_glove_vectors(config['trimmed_file'])

	# build the data processing functions
	processing_word = get_processing_word(vocab_words = vocab_words, vocab_chars = vocab_chars, 
		lowercase = True, chars = config['use_chars'], allow_unk = True)
	processing_tag = get_processing_word(vocab_words = vocab_tags, lowercase = False, allow_unk = False)

	# build the test dataset
	test_dataset = Dataset(filename = config['test_data'], 
		max_iter = None, processing_word = processing_word, processing_tag = processing_tag)	

	# build the model, restore the session and evaluate
	model = ner_model(config,logger,vocabs,embeddings)
	model.build()
	model.restore_session()
	model.evaluate(test_dataset)
Example #10
def backfill(opts, **kwargs):
    """
    processes data by model
    Parameters
    ----------
    opts
        options dictionary, should have
            'input_data_params' - parameters to instatiate a dataset taht creates a stream of raw data to process,
            'model_params' - model parameters needed to instantiate it
            'output_data_params' - parameters of a consumer of processed data, things like params for additional
                                    post-model tranformations, parameters of destination for outputing results, etc...
    """

    # load model, init dataset and consumer
    m = Model(name=opts['model_name'], **opts['model_params'], **kwargs)
    connector = S3InMemoryConnector(bucket=BUCKET, prefix=MODEL_PREFIX)
    m.load(connector=connector)

    records = Dataset(**opts['input_data_params']).get_data()
    c = Consumer(name=opts['model_name'], **opts['output_data_params'])

    # define model output
    # m.proc_type = 'proc'  # cannot do that because of spacy, even dill does not pickle spacy instances :(
    # m.n_proc = 4          # but it would have worked if no cython functions
    outputs = m.itransform(records)

    # consume results
    c.consume(outputs)
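Following the docstring, a hypothetical opts dictionary for backfill(); the key names come from the docstring and code, every value is a placeholder:

# Hypothetical options for backfill(); values are illustrative only.
opts = {
    'model_name': 'example_model',
    'model_params': {},                                       # whatever Model needs
    'input_data_params': {'source': 'raw-table'},             # stream of raw data to process
    'output_data_params': {'destination': 'vectors-table'},   # where the Consumer writes results
}
backfill(opts)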
Example #11
def nanopore_metadata(session, p='../oct5/OXFORD_NANOPORE_Metadata.tsv'):
    count = 0
    for l in open(p):
        project, sample, _, _, run_id, project_title, *rest = l.strip().split(
            '\t')
        if project == 'project_id':
            continue

        prjs = list(session.query(Dataset).filter(Dataset.ena_id == project))
        if len(prjs) == 0:
            dataset = Dataset(ena_id=project, project_title=project_title)
            session.add(dataset)
            session.commit()
        prjs = list(session.query(Dataset).filter(Dataset.ena_id == project))
        assert len(prjs) == 1
        prj = prjs[0]

        runq = session.query(Run).filter_by(ena_id=run_id).first()
        if not runq:
            run = Run(ena_id=run_id, dataset_id=prj)
            prj.runs.append(run)
            count += 1
            session.add(run)

    print(f"adding {count} nanopore runs")
    session.commit()
Example #12
def main():
    '''
    Run the data preprocessing.
    '''
    configure = Configure('./config.cfg')

    processing_word = get_processing_word(lowercase=True)

    # build the datasets
    train_dataset = Dataset(configure['train_data'],
                            processing_word=processing_word)
    dev_dataset = Dataset(configure['dev_data'],
                          processing_word=processing_word)
    test_dataset = Dataset(configure['test_data'],
                           processing_word=processing_word)

    # build the word and tag vocabs
    vocab_util = Vocab_Util()
    vocab_words, vocab_tags = vocab_util.get_vocabs_from_datasets(
        [train_dataset, dev_dataset, test_dataset])
    # build the vocabulary of words in the word vectors
    vocab_glove = vocab_util.get_vocabs_from_glove(configure['glove_file'])

    # take the intersection: words that appear in both the embedding vocab and the datasets
    vocab_words = vocab_words & vocab_glove
    # add the UNK and NUM tokens
    vocab_words.add(UNK)
    vocab_words.add(NUM)

    # save the word and tag vocab files
    vocab_util.write_vocab(vocab_words, configure['word_vocab_file'])
    vocab_util.write_vocab(vocab_tags, configure['tag_vocab_file'])

    # build the trimmed GloVe vectors and store them
    vocab = vocab_util.load_vocab(configure['word_vocab_file'])
    vocab_util.export_trimmed_glove_vectors(vocab, configure['glove_file'],
                                            configure['trimmed_file'],
                                            configure['word_embedding_dim'])

    # build and store the char vocab
    train_dataset = Dataset(configure['train_data'])
    vocab_chars = vocab_util.get_char_vocab_from_datasets(train_dataset)
    vocab_util.write_vocab(vocab_chars, configure['char_vocab_file'])
Example #13
def train_adversarial(model,
                      train_dataset,
                      epochs,
                      layers,
                      target_attack=False):
    """Train the model.
    train_dataset, val_dataset: Training and validation Dataset objects.
    learning_rate: The learning rate to train with
    epochs: Number of training epochs. Note that previous training epochs
            are considered to be done alreay, so this actually determines
            the epochs to train in total rather than in this particaular
            call.
    layers: Allows selecting wich layers to train. It can be:
        - A regular expression to match layer names to train
        - One of these predefined values:
          heaads: The RPN, classifier and mask heads of the network
          all: All the layers
          3+: Train Resnet stage 3 and up
          4+: Train Resnet stage 4 and up
          5+: Train Resnet stage 5 and up
    """

    # Pre-defined layer regular expressions
    layer_regex = {
        # all layers but the backbone
        "heads":
        r"(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
        # From a specific Resnet stage and up
        "3+":
        r"(fpn.C3.*)|(fpn.C4.*)|(fpn.C5.*)|(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
        "4+":
        r"(fpn.C4.*)|(fpn.C5.*)|(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
        "5+":
        r"(fpn.C5.*)|(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
        # All layers
        "all": ".*",
    }
    if layers in layer_regex.keys():
        layers = layer_regex[layers]

    # Data generators
    train_set = Dataset(train_dataset, model.config, augment=False)
    train_generator = torch.utils.data.DataLoader(train_set,
                                                  batch_size=1,
                                                  shuffle=False,
                                                  num_workers=4)

    model.set_trainable(layers)

    for epoch in range(model.epoch + 1, epochs + 1):
        # Training
        train_adversarial_batch(model,
                                train_generator,
                                target_attack=target_attack)
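The layers argument ends up as a regular expression matched against parameter names; a minimal sketch of a set_trainable() built on that convention (the actual Mask R-CNN implementation may differ):

import re

# Hedged sketch: unfreeze only the parameters whose name matches layer_regex.
def set_trainable(model, layer_regex):
    pattern = re.compile(layer_regex)
    for name, param in model.named_parameters():
        param.requires_grad = bool(pattern.fullmatch(name))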
Example #14
def knn_fetching_zo_4(query_nd,
                      N_max,
                      M,
                      new_fraction=0.5,
                      k=1,
                      sort=True,
                      verbose=2):
    assert (Dataset.root.n_dimensions() == query_nd.n_dimensions())

    t_0 = time.time()

    sphere = query_nd.bounding_hypersphere(smooth=False)

    # Number of points from query_nd we want to keep.
    N_keep = round(N_max * (1 - new_fraction))

    query_nd_subsampled = query_nd.random_sampling(N_keep)

    # Number of points we need to add.
    N_fetch = N_max - N_keep

    fetched_idcs = np.zeros(N_fetch, dtype=int)
    for i in range(N_fetch):
        new_idx = None
        while new_idx is None or new_idx in fetched_idcs[:i]:
            D_s = Dataset.root.random_sampling(M)
            D_s_outside_sphere = D_s.select_logical(
                ~sphere.contains(D_s.data()))
            # Just to be sure, remove any points from query_nd_subsampled.
            D_s_outside_sphere = D_s_outside_sphere.select_logical(~np.in1d(
                D_s_outside_sphere.indices(), query_nd_subsampled.indices()))
            singleton_dataset = D_s_outside_sphere.knn_pointset(
                k, query_dataset=query_nd, method='bruteforce', verbose=False)
            new_idx = singleton_dataset.indices()[0]

        if verbose > 1:
            print('{}/{}'.format(i, N_fetch))
        fetched_idcs[i] = new_idx

    fetched_data = Dataset.root.data()[
        fetched_idcs, :]  # Is it faster to also fill this in the loop?
    fetched_dataset = Dataset(fetched_data, fetched_idcs)

    # Take union of the subsampled query and the newly fetched dataset.
    result = query_nd_subsampled + fetched_dataset

    if verbose:
        print('ZO4 took {:.2f} seconds.'.format(time.time() - t_0))

    return result
Example #15
    def get_permissions(self, id, token):
        """Returns tuple like (dataset, http_error_method_name, error_msg)."""
        if token:
            # This URL has jwt authentication embedded in the query string so
            # that it is shareable. Ignore other permission rules as long as
            # the jwt is valid.
            payload, error = jwt_helper.decode(token)
            if not payload or error:
                return (None, 'http_unauthorized', error)

            allowed_endpoints = payload.get('allowed_endpoints', [])
            if self.get_endpoint_str() not in allowed_endpoints:
                return (None, 'http_forbidden', "Endpoint not allowed.")

            ds = Dataset.get_by_id(id)
            if not ds:
                return (None, 'http_not_found', None)

        else:
            # Without a token, do normal user-based authentication.
            user = self.get_current_user()
            if user.user_type == 'public':
                return (None, 'http_unauthorized', '')

            ds = Dataset.get_by_id(id)
            if not ds:
                return (None, 'http_not_found', None)

            if not ds.parent_id:
                if not user.super_admin:
                    return (None, 'http_forbidden',
                            "Only supers can get parentless datasets.")
            elif not owns(self.get_current_user(), ds.parent_id):
                return (None, 'http_forbidden', "Must own parent.")

        return (ds, None, None)
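get_permissions() returns the name of an error handler instead of calling it, so the caller decides how to respond; a hedged sketch of a handler method that might consume the tuple (the handler method names follow Example #18):

    # Hypothetical caller of get_permissions(); dispatches on the returned method name.
    def get(self, id):
        token = self.request.get('token', None)
        ds, error_method_name, error_msg = self.get_permissions(id, token)
        if error_method_name:
            # e.g. 'http_unauthorized', 'http_forbidden', 'http_not_found'
            return getattr(self, error_method_name)(error_msg)
        self.write(ds)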
Example #16
def knn_selection(source, n_samples, pos, sort=True):
    knn = NearestNeighbors(n_neighbors=n_samples)

    X = source.data()
    knn.fit(X)

    idcs_in_source = knn.kneighbors(pos.reshape(1, -1),
                                    return_distance=False).flatten()

    if sort:
        idcs_in_source.sort()

    data = source.data()[idcs_in_source, :]
    idcs_in_root = source.indices()[idcs_in_source]

    dataset = Dataset(data, idcs_in_root, name='KNN selection')

    return dataset
Example #17
def process_key(model, consumer, opts, s3_key: str) -> None:
    """
    This is the function is run by app.py which is a message consumer, message is send from
    content ingest job and contains a new s3 key.

    Download s3 key, vectorize new content, upload vectors to scylla

    Parameters
    ----------
    s3_key
        received s3 key
    """
    try:
        opts['input_data_params']['no_parallel'] = True
        opts['input_data_params']['keys_list'] = s3_key
        records = Dataset(**opts['input_data_params']).get_data()
        outputs = model.itransform(records)
        consumer.consume(outputs)
    except Exception:
        log.error("Something failed...")
        log.error(traceback.format_exc())
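The docstring describes app.py as a message consumer that calls process_key() for each incoming s3 key; a hedged sketch of such a loop, where poll_messages and the message shape are purely illustrative assumptions:

# Illustrative consumer loop around process_key(); nothing here is from the real app.py.
def run_consumer(model, consumer, opts, poll_messages):
    for message in poll_messages():      # assumed to yield dicts like {'s3_key': '...'}
        process_key(model, consumer, opts, message['s3_key'])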
Example #18
    def post(self):
        rserve_user, error = authenticate_rserve(self)
        if error == jwt_helper.NOT_FOUND:
            return self.http_unauthorized(error)
        elif error:
            return self.http_forbidden(error)

        if not rserve_user.super_admin:
            return self.http_forbidden("Must be super admin.")

        params = self.get_params({
            'content_type': str,
            'data': 'json',
            'filename': str,
            'parent_id': str,
        })
        # Parent id may be in the query string.
        parent_id = (self.request.get('parent_id', None)
                     or params.pop('parent_id', None))

        ds = Dataset.create(parent_id=parent_id, **params)
        ds.put()

        self.write(ds)
Example #19
    weighted_loss = raw_loss * loss_weight
    return weighted_loss, raw_loss, KL, bits, MAE, l1


if __name__ == '__main__':
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    torch.backends.cudnn.benchmark = True

    print('device = {}'.format(device))

    params = {'batch_size': 32, 'shuffle': True, 'num_workers': 0}

    # Define datasets and loaders
    train_path = r'X:\DS Training Data\samples\train.npy'
    train_data = Dataset(train_path)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               drop_last=True,
                                               **params)

    val_path = r'X:\DS Training Data\samples\val.npy'
    val_data = Dataset(val_path)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             drop_last=True,
                                             **params)

    test_path = r'X:\DS Training Data\samples\test.npy'
    test_data = Dataset(test_path)
    test_loader = torch.utils.data.DataLoader(test_data,
                                              drop_last=True,
                                              **params)
Example #20
def main():
    path_to_dataset = "C:\\GIT\\ZIWM\\data.csv"
    file_writer = FileWriter("")
    total_number_of_classes = 8
    total_number_of_features = 30

    num_of_neighbours = [1, 5, 10]
    type_of_metric = ["euclidean", "manhattan"]

    #loading raw values from file
    datasetLoader = load_datasets.Load_datasets(path_to_dataset)
    dataset_raw_values = datasetLoader.loadDataset()

    #constructing main dataset with division for features and classes
    dataset = Dataset(dataset_raw_values)

    best_fit = 0.0
    best_average_fit = 0.0

    # selecting number of features and running tests
    for number_of_features_selected in range(1, 31):
        #print(number_of_features_selected)
        trimmed_feature_list = FeatureSelector.selectKBestFeatures(
            number_of_features_selected, dataset.dataset_features_array,
            dataset.dataset_class_array)
        #   dividing data sets into patients
        patients = []
        for i in range(len(dataset.dataset_class_array)):
            patient = Patient(i, dataset.dataset_class_array[i],
                              trimmed_feature_list[i])
            # print(patient.getId(), patient.getDisease_class(), patient.get_features())
            patients.append(patient)

        #   testing for each metric type and number of neighbours
        for metric in type_of_metric:
            for n_neighbours in num_of_neighbours:

                test_result_arr = []
                for i in range(5):
                    print("metric: ", metric, " n_neighbours", n_neighbours,
                          " run: ", i)
                    #   creating learn and test data sets
                    learning_set, testing_set = SplitSets.splitSets(patients)
                    #   creating algorithm and training
                    kn = KNearestNeighbour(n_neighbours, metric, learning_set,
                                           testing_set)
                    kn.train()
                    res1 = kn.test()
                    #   swapping training and learning sets
                    temp_set = learning_set
                    learning_set = testing_set
                    testing_set = temp_set

                    kn.setTestSet(testing_set)
                    kn.setTrainingSet(learning_set)

                    #   training once again
                    kn.train()

                    res2 = kn.test()

                    print("test result 1: ", res1)
                    print("test result 2: ", res2)

                    test_result_arr.append(res1)
                    test_result_arr.append(res2)

                    if (res1 > best_fit):
                        best_fit = res1
                    if (res2 > best_fit):
                        best_fit = res2

                test_average = sum(test_result_arr) / len(test_result_arr)
                print("average of tests: ", test_average)

                result_str = str(number_of_features_selected
                                 ) + " | " + metric + " | " + str(
                                     n_neighbours) + " | " + str(
                                         test_average) + " \n"
                file_writer.write(result_str)
                if (test_average > best_average_fit):
                    best_average_fit = test_average

        #   comparing results of test data set
        #   calculating hit rate

    print("best fit: ", best_fit)
    print("best fit average: ", best_average_fit)
    file_writer.close()
Example #21
def main():
    pred_file_path = 'test.csv'
    load_save_model = True
    lr = 1e-5
    batch_size = 8
    gpu = True
    torch.manual_seed(0)
    device = torch.device('cpu')
    if gpu:
        device = torch.device('cuda')

    tokenizer = BertTokenizer(vocab_file='publish/vocab.txt', max_len=512)
    _, known_token = load_dataset('TRAIN/Train_reviews.csv',
                                  'TRAIN/Train_labels.csv', tokenizer)
    dataset = load_review_dataset('TRAIN/TEST/Test_reviews.csv')
    dataset = Dataset(list(dataset.items()))
    dataloader = torch_data.DataLoader(dataset=dataset,
                                       batch_size=batch_size,
                                       shuffle=False,
                                       collate_fn=test_collate_fn(
                                           tokenizer, known_token))
    bert_pretraining = convert_tf_checkpoint_to_pytorch(
        './publish/bert_model.ckpt', './publish/bert_config.json')
    model = Model(bert_pretraining.bert)

    model = model.to(device)
    if load_save_model:
        model.load_state_dict(torch.load('./save_model/best.model'))

    pred_file = open(pred_file_path, mode='w', encoding='utf-8')

    pbar = tqdm()
    model.eval()
    for step, (batch_X, len_X, mask, batch_idx,
               origin_batch_X) in enumerate(dataloader):
        batch_X = batch_X.to(device)
        mask = mask.to(device)

        scores, gather_idx = model(batch_X, len_X, mask, None)
        (pred_seq_target, pred_match_target, pred_single_aspect_category_target, pred_single_opinion_category_target,\
            pred_cross_category_target, pred_single_aspect_polarity_target, pred_single_opinion_polarity_target,\
                pred_cross_polarity_target) = model.infer(scores, mask)

        label = []

        aspect_idx, opinion_idx = gather_idx
        for b in range(batch_X.shape[0]):
            _aspect_idx, _opinion_idx = aspect_idx[b], opinion_idx[b]
            if len(_aspect_idx) == 0 and len(_opinion_idx) == 0:
                label.append((batch_idx[b], '_', '_', '_', '_'))

            _aspect_cross, _opinion_cross = [
                False for i in range(len(_aspect_idx))
            ], [False for i in range(len(_opinion_idx))]
            for i in range(len(_aspect_idx)):
                for j in range(len(_opinion_idx)):
                    if pred_match_target[b][i, j] == 1:
                        _aspect_cross[i] = True
                        _opinion_cross[j] = True
                        category = ID2CATEGORY[pred_cross_category_target[b][
                            i, j]]
                        polarity = ID2POLARITY[pred_cross_polarity_target[b][
                            i, j]]
                        aspect = tokenizer.decode(
                            list(origin_batch_X[b, _aspect_idx[i]].cpu().
                                 detach().numpy())).replace(' ', '')
                        opinion = tokenizer.decode(
                            list(origin_batch_X[b,
                                                _opinion_idx[j]].cpu().detach(
                                                ).numpy())).replace(' ', '')
                        # aspect = tokenizer.decode(list(batch_X[b, _aspect_idx[i]].cpu().detach().numpy())).replace(' ', '')
                        # opinion = tokenizer.decode(list(batch_X[b, _opinion_idx[j]].cpu().detach().numpy())).replace(' ', '')
                        aspect_beg = len(
                            tokenizer.decode(
                                list(batch_X[b,
                                             1:_aspect_idx[i][0]].cpu().detach(
                                             ).numpy())).replace(' ', ''))
                        aspect_end = aspect_beg + len(aspect)
                        opinion_beg = len(
                            tokenizer.decode(
                                list(batch_X[b, 1:_opinion_idx[j][0]].cpu().
                                     detach().numpy())).replace(' ', ''))
                        opinion_end = opinion_beg + len(opinion)
                        label.append((batch_idx[b], aspect, opinion, category,
                                      polarity))
            for i in range(len(_aspect_idx)):
                if _aspect_cross[i] == False:
                    category = ID2CATEGORY[
                        pred_single_aspect_category_target[b][i]]
                    polarity = ID2POLARITY[
                        pred_single_aspect_polarity_target[b][i]]
                    aspect = tokenizer.decode(
                        list(origin_batch_X[
                            b,
                            _aspect_idx[i]].cpu().detach().numpy())).replace(
                                ' ', '')
                    # aspect = tokenizer.decode(list(batch_X[b, _aspect_idx[i]].cpu().detach().numpy())).replace(' ', '')
                    aspect_beg = len(
                        tokenizer.decode(
                            list(batch_X[b, 1:_aspect_idx[i][0]].cpu().detach(
                            ).numpy())).replace(' ', ''))
                    aspect_end = aspect_beg + len(aspect)
                    label.append(
                        (batch_idx[b], aspect, '_', category, polarity))
            for i in range(len(_opinion_idx)):
                if _opinion_cross[i] == False:
                    category = ID2CATEGORY[
                        pred_single_opinion_category_target[b][i]]
                    polarity = ID2POLARITY[
                        pred_single_opinion_polarity_target[b][i]]
                    opinion = tokenizer.decode(
                        list(origin_batch_X[
                            b,
                            _opinion_idx[i]].cpu().detach().numpy())).replace(
                                ' ', '')
                    # opinion = tokenizer.decode(list(batch_X[b, _opinion_idx[i]].cpu().detach().numpy())).replace(' ', '')
                    opinion_beg = len(
                        tokenizer.decode(
                            list(batch_X[b, 1:_opinion_idx[i][0]].cpu().detach(
                            ).numpy())).replace(' ', ''))
                    opinion_end = opinion_beg + len(opinion)
                    label.append(
                        (batch_idx[b], '_', opinion, category, polarity))

        for _label in label:
            _label = ','.join(list(map(lambda x: str(x), _label)))
            pred_file.write(_label + '\n')
        pbar.update(batch_size)
        pbar.set_description('step: %d' % step)
    pred_file.close()
    pbar.close()
Example #22
def train_model(batch_size, lr, max_epochs, num_unmasked_weights, loss_weight,
                run_validation, print_grads, training_set, LAT):
    '''
    Trains the model and returns it together with a dictionary of loss statistics.

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > batch_size
        > lr - learning rate
        > max_epochs
        > num_unmasked_weights - The number of weights to be allowed through the linear combination
            e.g. num_unmasked_weights = 1 means only one sample will be chosen,
                = 2, 2 samples will be chosen. etc.
            output vector -> softmax -> mask -> ensure weights sum to 1 (not softmax because 0's stay 0).
        > loss_weight - a function that weighs the loss over time.
        > run_validation - if True, will run through validation set after each epoch
        > print_grads - will display gradient flow every print_grads epochs
            Set to False or 0 to avoid printing gradients
        > training_set - descriptive string describing the training data
        > LAT - boolean, whether to include log attack time or not
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > model
        > losses - dictionary with the following keys
            'raw_losses' - losses unweighted by loss_weight
            'train_losses' - losses weighted by loss_weight
            'epoch_loss' - loss per epoch
            'val_losses' - validation loss
            'weights' - appends the weight vector of a whole batch once every few epochs (see 'idxs')
    '''
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    torch.backends.cudnn.benchmark = True

    print('device = {}'.format(device))

    params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 0}

    # Define datasets and loaders
    samples_path = r'X:\DS Training Data\samples'
    train_data = Dataset(samples_path, 'train.npy', LAT=LAT)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               drop_last=True,
                                               **params)

    val_data = Dataset(samples_path, 'val.npy', LAT=LAT)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             drop_last=True,
                                             **params)

    # test_data = Dataset(samples_path, 'test.npy')
    # test_loader = torch.utils.data.DataLoader(test_data, drop_last=True, **params)

    src_path = test_path = r'X:\DS Training Data\samples\base\src.npy'

    model = simpleVAE(d=32,
                      src_path=src_path,
                      batch_size=params['batch_size'],
                      device=device,
                      num_to_keep=num_unmasked_weights,
                      LAT=LAT).cuda()
    # print(model)

    criterion = simple_loss

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    balance = [100, 0, 1]  # [MAE, KL, l1]

    losses = {
        'raw_losses': [],  # unweighted losses
        'train_losses': [],  # weighted losses
        'epoch_loss': [],  # cumulative loss per epoch
        'val_losses': [],  # validation loss
        'loss_weight': [],  # overarching weight of loss
        'step': [],  # global step
        'weights':
        [],  # contains the output weights of the model, recorded every X epochs
        'idxs': [],  # indices corresponding to weight vectors
        'KL': [],  # KL divergence
        'bits': [],  # KL divergence free bits
        'l1': [],  # l1 loss
        'MSE': [],  # MSE loss
        'MAE': [],  # MAE loss
        'balance': balance,  # Relative MAE, KL, l1 loss
        'max_epochs': max_epochs,  # maximum epochs
        'num_unmasked_weights':
        num_unmasked_weights,  # number of unmasked weights (defined in model)
        'training_set':
        training_set,  # descriptive string describing training set
        'lr': lr  # learning rate
    }

    with torch.autograd.set_detect_anomaly(True):
        for epoch in range(1, max_epochs + 1):
            model.train()
            epoch_loss = 0
            displayed = False
            weights_recorded = False

            # Training
            for batch_idx, (local_batch, idx) in enumerate(train_loader):
                # Transfer to GPU
                local_batch = local_batch.to(device, dtype=torch.float32)

                # ================ forward ================
                candidate, mu, logvar, q_z, weight_vec = model(local_batch)
                loss = criterion(
                    create_torch_stft(candidate).double(),
                    local_batch[:, :1025].double(), loss_weight(epoch))
                epoch_loss += loss.item()

                # Keep track of some loss statistics
                losses['raw_losses'].append(loss.item() / loss_weight(epoch))
                losses['MSE'].append(losses['raw_losses'][-1])
                losses['train_losses'].append(loss.item())
                losses['loss_weight'].append(loss_weight(epoch))
                losses['step'].append(model.global_step)
                losses['KL'].append(0)
                losses['bits'].append(0)
                losses['l1'].append(0)
                losses['MAE'].append(0)

                # ================ backward ================
                loss.backward()
                if print_grads:
                    if epoch % print_grads == 0 and not displayed:
                        displayed = True
                        plot_grad_flow(model.named_parameters())
                optimizer.step()
                model.global_step += 1
                model.zero_grad()

                if batch_idx % 5 == 0:
                    # Record time every 5 batches
                    if batch_idx != 0:
                        end = time.time()
                        elapsed = end - start
                    else:
                        elapsed = 0
                    start = time.time()

                    # Record weights every few epochs, depending on max_epochs
                    if not weights_recorded:
                        if max_epochs <= 5:
                            losses['weights'].append(weight_vec)
                            losses['idxs'].append(idx)
                            weights_recorded = True
                        elif 5 < max_epochs <= 20:
                            if epoch % 2 == 0:
                                losses['weights'].append(weight_vec)
                                losses['idxs'].append(idx)
                                weights_recorded = True
                        elif 20 < max_epochs <= 100:
                            if epoch % 10 == 0:
                                losses['weights'].append(weight_vec)
                                losses['idxs'].append(idx)
                                weights_recorded = True
                        elif 100 < max_epochs <= 1000:
                            if epoch % 20 == 0:
                                losses['weights'].append(weight_vec)
                                losses['idxs'].append(idx)
                                weights_recorded = True
                        else:
                            if epoch % 50 == 0:
                                losses['weights'].append(weight_vec)
                                losses['idxs'].append(idx)
                                weights_recorded = True

                    # print training update
                    print('* * * * * * * * * * * * * * * * * *\n')
                    print('   epoch : {}/{}'.format(epoch,
                                                    losses['max_epochs']))
                    print('   batch : ({}/{}) {:.0f}%'.format(
                        len(local_batch) * batch_idx,
                        len(train_loader.dataset),
                        100. * batch_idx / len(train_loader)))
                    print('    step : {}'.format(model.global_step))
                    print('    time : {:.2f}\n'.format(elapsed))

                    print('    loss : ({}) * {:.3f}'.format(
                        loss_weight(epoch), losses['raw_losses'][-1]))
                    print('            = {}\n'.format(
                        losses['train_losses'][-1]))

                    print('     max : {:.6f}'.format(torch.max(
                        weight_vec[-1])))
                    print(' max idx : {:.0f}'.format(
                        torch.argmax(weight_vec[-1])))
                    print('     min : {:.6f}\n'.format(
                        torch.min(weight_vec[-1])))
                    print('*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*')

            losses['epoch_loss'].append(epoch_loss)

            if run_validation:
                # ================ validation ================
                with torch.no_grad():
                    model.eval()  # Put model in evaluation mode

                    for batch_idx, (local_batch, idx) in enumerate(val_loader):
                        # Transfer to GPU
                        local_batch = local_batch.to(device,
                                                     dtype=torch.float32)

                        # Model computations
                        candidate, mu, logvar, q_z, _ = model(local_batch)
                        loss = criterion(
                            create_torch_stft(candidate).double(),
                            local_batch[:, :1025].double(), loss_weight(epoch))

                    print('loading validation loss...\n')
                    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
                    print('Validation Loss: {}'.format(loss.item()))
                    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n')
                    losses['val_losses'].append(loss.item())

            else:
                losses['val_losses'].append(0)

    return model, losses
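The num_unmasked_weights behaviour described in the docstring (softmax, mask all but the top entries, renormalize so the survivors sum to 1) can be sketched in a few lines; this is a hedged illustration, not the model's actual code:

import torch

# Hedged sketch of the masking step train_model's docstring describes:
# softmax, keep only the top-k weights, renormalize so they sum to 1.
def masked_weights(logits, num_to_keep):
    weights = torch.softmax(logits, dim=-1)
    topk = torch.topk(weights, num_to_keep, dim=-1)
    mask = torch.zeros_like(weights).scatter_(-1, topk.indices, 1.0)
    weights = weights * mask
    return weights / weights.sum(dim=-1, keepdim=True)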
Example #23
def test(args):

    print('[*] TEST')
    device = torch.device('cuda:' + str(args.device))
    dataset_path = args.data
    onmt_path = args.pretrain
    unref_path = args.unref
    nbatch = int(args.batch)
    output = args.output

    #data load
    gold_reply_path = os.path.join(dataset_path, 'test_reply.txt')
    gene_reply_path = os.path.join(dataset_path, 'gene_reply.txt')
    test_query_path = os.path.join(dataset_path, 'test_query.txt')

    refer_test_data_path_list = [gene_reply_path, gold_reply_path]
    unref_test_data_path_list = [test_query_path, gold_reply_path]
    vocab_path = os.path.join(onmt_path, 'sample.vocab.pt')
    onmt_model_path = os.path.join(onmt_path, 'sample.model.pt')

    refer_test_dataset = Dataset(vocab_path=vocab_path,
                                 data_path_list=refer_test_data_path_list,
                                 max_length=50)
    refer_test_loader = DataLoader(dataset=refer_test_dataset,
                                   batch_size=nbatch,
                                   collate_fn=collate_fn,
                                   num_workers=8)

    unref_test_dataset = Dataset(vocab_path=vocab_path,
                                 data_path_list=unref_test_data_path_list,
                                 max_length=50)
    unref_test_loader = DataLoader(dataset=unref_test_dataset,
                                   batch_size=nbatch,
                                   collate_fn=collate_fn,
                                   num_workers=8)

    #unref model load
    unrefer_pos_model = torch.load(unref_path + '_pos.th')
    unrefer_neg_model = torch.load(unref_path + '_neg.th')

    #test
    positive_test = open(output + '/positive_result.txt',
                         'w',
                         encoding='utf-8')
    negative_test = open(output + '/negative_result.txt',
                         'w',
                         encoding='utf-8')

    for query, q_len, reply, r_len in unref_test_loader:
        prediction1 = unrefer_pos_model(query, q_len, reply, r_len)
        prediction2 = unrefer_neg_model(query, q_len, reply, r_len)
        #print(query,'/', q_len, '/', reply, '/', r_len)
        print(prediction1)
        print(prediction2)
        positive_test.write(str(prediction1.data))
        negative_test.write(str(prediction2.data))
        print('break')
        break

    positive_test.close()
    negative_test.close()

    ##reference score
    encoder = build_pretrained_model(onmt_model_path, refer_test_dataset.vocab)
    ref_model = RefScorer(encoder, device)
    sim_output = open(output + '/similarity.txt', 'w', encoding='utf-8')

    for gold_indices, gold_lens, gen_indices, gen_lens in refer_test_loader:
        similarity = ref_model.get_ref_score(gold_indices, gold_lens,
                                             gen_indices, gen_lens)
        print(similarity.data)
        sim_output.write(str(similarity.data))
        break
    sim_output.close()
Example #24
def train(args):
    print('[*] TRAIN')

    device = torch.device('cuda:' + str(args.device))
    ninput = int(args.dim)
    nlayer = int(args.layer)
    nbatch = int(args.batch)
    nhidden = int(args.hidden)
    margin = int(args.margin)
    epoch = int(args.epoch)
    learningrate = float(args.lr)
    dataset_path = args.data
    onmt_path = args.pretrain

    train_src_path = os.path.join(dataset_path, 'src_train.txt')
    train_tar_path = os.path.join(dataset_path, 'tar_train.txt')

    unref_train_data_path_list = [train_src_path, train_tar_path]
    vocab_path = os.path.join(onmt_path, 'sample.vocab.pt')

    #pre-trained model
    #onmt_vocab_path = os.path.join(onmt_path, 'sample.vocab.pt')
    #onmt_model_path = os.path.join(onmt_path, 'sample.model.pt')

    #data load
    unref_train_dataset = Dataset(vocab_path=vocab_path,
                                  data_path_list=unref_train_data_path_list,
                                  max_length=50)
    unref_nega_dataset = NegativeDataset(unref_train_dataset.data,
                                         unref_train_dataset.vocab)
    #print('positive ', unref_train_dataset[0], unref_train_dataset.data[0])
    #print('negative ', negative_dataset[0], negative_dataset.data[0])

    positive_loader = DataLoader(dataset=unref_train_dataset,
                                 batch_size=nbatch,
                                 collate_fn=collate_fn,
                                 num_workers=8)
    negative_loader = DataLoader(dataset=unref_nega_dataset,
                                 batch_size=nbatch,
                                 collate_fn=collate_fn,
                                 num_workers=8)

    vocab_size = unref_train_dataset.getVocabSize()
    batch_num = math.ceil(unref_train_dataset.getInstanceSize() / nbatch)
    print('[*] # of batch: ', batch_num, ' pos, neg :', len(positive_loader),
          len(negative_loader))

    #ninput, nhidden, nlayer, ntoken, nbatch
    unrefer_pos_model = UnrefScorer(ninput, nhidden, nlayer, vocab_size,
                                    nbatch, device)
    unrefer_pos_model = unrefer_pos_model.to(device)

    unrefer_neg_model = UnrefScorer(ninput, nhidden, nlayer, vocab_size,
                                    nbatch, device)
    unrefer_neg_model = unrefer_neg_model.to(device)

    loss_f = torch.nn.MarginRankingLoss(margin)
    optimizer1 = torch.optim.Adam(unrefer_pos_model.parameters(),
                                  lr=learningrate)
    optimizer2 = torch.optim.Adam(unrefer_neg_model.parameters(),
                                  lr=learningrate)
    total_loss = 0

    for i in range(epoch):  #epoch
        iter_positive = iter(positive_loader)
        iter_negative = iter(negative_loader)
        for mini in range(batch_num):
            #positive training
            pos_src, pos_src_len, pos_tar, pos_tar_len = next(iter_positive)
            neg_src, neg_src_len, neg_tar, neg_tar_len = next(iter_negative)
            #print('pos', pos_src, '/', pos_src_len, '/', pos_tar, '/', pos_tar_len)

            encd_pos = unrefer_pos_model(pos_src, pos_src_len, pos_tar,
                                         pos_tar_len)
            encd_neg = unrefer_neg_model(neg_src, neg_src_len, neg_tar,
                                         neg_tar_len)
            #print('next ', mini, encd_pos.size(), encd_neg.size())

            target = torch.ones(encd_pos.size(0), 1).to(device)  #batch

            loss = loss_f(encd_pos, encd_neg, target)
            total_loss = total_loss + loss.item()
            unrefer_pos_model.zero_grad()
            unrefer_neg_model.zero_grad()
            loss.backward()
            optimizer1.step()
            optimizer2.step()

        print('[-] epoch: ', i, ', total_loss :', total_loss)
        total_loss = 0

    torch.save(unrefer_pos_model, args.output + '_pos.th')
    torch.save(unrefer_neg_model, args.output + '_neg.th')
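With a target of ones, MarginRankingLoss computes mean(max(0, -(x1 - x2) + margin)), i.e. it pushes the positive score above the negative score by at least the margin; a tiny standalone illustration:

import torch

# Minimal illustration of MarginRankingLoss with target = 1.
x1 = torch.tensor([0.9, 0.2])   # "positive" scores
x2 = torch.tensor([0.1, 0.4])   # "negative" scores
target = torch.ones_like(x1)
loss_f = torch.nn.MarginRankingLoss(margin=1.0)
print(loss_f(x1, x2, target))   # mean([max(0, 1 - 0.8), max(0, 1 + 0.2)]) = 0.7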
Example #25
        FILE.write('%f\n' % total_stats[i][2][-1])
    FILE.close()


# -----------------------------------------------------------------------------------------

###########################
###	 START HERE	###
###########################
if __name__ == '__main__':

    # Get the command line arguments in a dictionary
    args = Arguments()

    # Create a Dataset object based on the <genome_file>
    refGenome = Dataset(args['<genome_file>'])
    refGenome.calcGenomeCoords()
    if args['-v'] in ['all', 'refG']: refGenome.printDataset('refGenome')

    # Create a Dataset object based on the <A_file>
    fileA = Dataset(args['<A_file>'])
    fileA.calcDataCoords(refGenome)
    if args['-v'] in ['all', 'fileA']: fileA.printDataset('fileA')

    # Print output header for the <A_file>
    output_header(args, fileA)

    # Foreach <B_files>
    if args['-c']: FH = open('testFiles/summary', 'w')
    mnRandom = 0
    for fileB_name in args['<B_files>']:
Example #26
from model import Dataset, Classifier, GRADIENT_BOOSTING

if __name__ == "__main__":
    classifier = Classifier(GRADIENT_BOOSTING)
    classifier.deserialize_model("model.out")
    predict_dataset = Dataset()
    predict_dataset.append_candidates("candidates_temp.csv")
    proba = classifier.predict_proba(predict_dataset.features)
    predict_dataset.store_proba("probabilities.csv", proba)
Example #27
def root_selection(source):
    data = Dataset.root.data()[source.indices(), :]
    dataset = Dataset(data, source.indices(), name='Root selection')
    return dataset
Example #28
args.cuda = torch.cuda.is_available()

args.device = torch.device("cuda" if args.cuda else "cpu")

print("Using CUDA: {}".format(args.cuda))

set_seed_everywhere(args.seed, args.cuda)

handle_dirs(args.save_dir)

if args.reload_from_files and Path(args.vectorizer_file).exists():
    dataset = Dataset.load_dataset_and_load_vectorizer(args.dataset_csv,
                                                       args.vectorizer_file)
else:
    dataset = Dataset.load_dataset_and_make_vectorizer(args.dataset_csv)
    dataset.save_vectorizer(args.vectorizer_file)

vectorizer = dataset.get_vectorizer()

model = Model(
    source_vocab_size=len(vectorizer.source_vocab),
    source_embedding_size=args.source_embedding_size,
    target_vocab_size=len(vectorizer.target_vocab),
    target_embedding_size=args.target_embedding_size,
    encoding_size=args.encoding_size,
    target_bos_index=vectorizer.target_vocab.begin_seq_index,
)