def train(opts, **kwargs):
    """
    Train a model

    Parameters
    ----------
    opts
        a config - nested dictionary with options

    Returns
    -------
    """
    m = Model(name=opts['model_name'], **opts['model_params'], **kwargs)
    connector = S3InMemoryConnector(bucket=BUCKET, prefix=MODEL_PREFIX)

    # instantiate dataset and data iterator
    dataset = Dataset(**opts['input_data_params'])
    records = dataset.get_data()

    # check data
    if check_data(records, getattr(dataset, 'validate_itm', None)):
        log.info("DATA OK")
    else:
        log.info("BAD DATA")
        return

    if opts.get('--update', False):
        m.load(connector=connector)
        m.update(records)
    else:
        m.train(records)
    log.info("DONE")

    # save model
    m.save(connector=connector)
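# A minimal sketch of the nested `opts` config that train() reads, inferred from the
# keys accessed above; every concrete value here is a hypothetical placeholder.
example_opts = {
    'model_name': 'my_model',     # name passed to Model(...)
    'model_params': {},           # unpacked into Model(...)
    'input_data_params': {},      # unpacked into Dataset(...)
    '--update': False,            # True -> load the saved model and update it instead of retraining
}
# train(example_opts)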
def upload_dataset():
    # check if the post request has the file part
    if 'file' not in request.files or not request.files['file'].filename:
        flash('No file specified')
        return redirect('/training')
    file = request.files['file']
    if not allowed_dataset_filename(file.filename):
        flash("Only .zip files are allowed")
        return redirect('/training')
    name = request.form['name']
    if not name:
        flash('Name of project cannot be empty')
        return redirect('/training')

    new_dataset = Dataset(user_id=session['user_id'], name=name)
    db.session.add(new_dataset)
    db.session.flush()
    filename = f'{new_dataset.dataset_id}_{secure_filename(file.filename)}'
    file.save(os.path.join(UPLOAD_FOLDER, filename))
    new_dataset.dataset_filename = filename
    new_dataset.state = Dataset.UPLOADED
    db.session.commit()
    return redirect('/training')
def regenerate(self):
    import os, shutil
    if os.path.isdir(self.dest):
        shutil.rmtree(self.dest)
    os.makedirs(self.dest)

    from generators import PageGenerator
    from model import Dataset
    from output import WebFiles

    items = []
    data = Dataset('data')
    dest = WebFiles(self.dest, self.web_dest)
    for post in data.get_all_posts():
        #src = post.get_src_path()
        #src = os.path.join(self.src, name)
        #link = self.make_dest_name(name)
        dest_name = post.get_dest_name()
        #dest = os.path.join(self.dest, link)
        #raise RuntimeError(name, src, dest)
        #page_gen = PageGenerator(src, dest)
        #link = post.publish_to(dest)
        page_gen = PageGenerator(post)
        html_content = page_gen.get_html()
        link = dest.publish(dest_name, html_content)
        #page_gen.regenerate()
        items.append((post.get_title(), post.get_desc(), link))
    self.generate_index(items)
def main(args):
    torch.manual_seed(0)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Uncomment this if an HTTP error occurs while downloading MNIST
    # new_mirror = 'https://ossci-datasets.s3.amazonaws.com/mnist'
    # torchvision.datasets.MNIST.resources = [
    #     ('/'.join([new_mirror, url.split('/')[-1]]), md5)
    #     for url, md5 in torchvision.datasets.MNIST.resources
    # ]

    train_raw_dataset = torchvision.datasets.MNIST(
        root='./mnist', train=True, download=True,
        transform=tf.Compose([tf.ToTensor(), tf.Normalize((0.1307,), (0.3081,))]))
    valid_raw_dataset = torchvision.datasets.MNIST(
        root='./mnist', train=False, download=True,
        transform=tf.Compose([tf.ToTensor(), tf.Normalize((0.1307,), (0.3081,))]))

    # Train and validate only on pictures of 1
    train_dataset = Dataset(train_raw_dataset, [1])
    valid_dataset = Dataset(valid_raw_dataset, [1])

    if args.gpu and torch.cuda.is_available():
        device = torch.device('cuda:0')
        print(f'Using GPU {torch.cuda.get_device_name()}')
        print(torch.cuda.get_device_properties(device))
    else:
        device = torch.device('cpu')
        print('Using CPU')

    if args.load_path:
        r_net_path = os.path.join(args.load_path, args.r_load_path)
        d_net_path = os.path.join(args.load_path, args.d_load_path)
        r_net = torch.load(r_net_path).to(device)
        print(f'Loaded R_Net from {r_net_path}')
        d_net = torch.load(d_net_path).to(device)
        print(f'Loaded D_Net from {d_net_path}')
    else:
        r_net = R_Net(in_channels=1, std=args.std, skip=args.res, cat=args.cat).to(device)
        d_net = D_Net(in_resolution=(28, 28), in_channels=1).to(device)
        print('Created models')

    # TRAINING PARAMETERS
    save_path = (args.save_path, args.r_save_path, args.d_save_path)
    optim_r_params = {'alpha': 0.9, 'weight_decay': 1e-9}
    optim_d_params = {'alpha': 0.9, 'weight_decay': 1e-9}

    model = train_model(r_net, d_net, train_dataset, valid_dataset, R_Loss, D_Loss,
                        optimizer_class=torch.optim.RMSprop, device=device,
                        batch_size=args.batch_size,
                        optim_r_params=optim_r_params, optim_d_params=optim_d_params,
                        learning_rate=args.lr, rec_loss_bound=args.rec_bound,
                        save_step=args.sstep, num_workers=args.nw,
                        save_path=save_path, lambd=args.lambd)
def main():
    # read the config file
    config = Configure()
    # set up the logger
    logger = get_logger(config['path_log'])
    # load the vocabularies
    vocab_util = Vocab_Util()
    # dict[word] = idx
    vocab_words = vocab_util.load_vocab(config['word_vocab_file'])
    # dict[char] = idx
    vocab_chars = vocab_util.load_vocab(config['char_vocab_file'])
    # dict[tag] = idx
    vocab_tags = vocab_util.load_vocab(config['tag_vocab_file'])
    # bundle the vocabularies for the model
    vocabs = [vocab_words, vocab_chars, vocab_tags]

    embeddings = vocab_util.get_trimmed_glove_vectors(config['trimmed_file'])

    # processing functions for words and tags
    processing_word = get_processing_word(vocab_words=vocab_words,
                                          vocab_chars=vocab_chars,
                                          lowercase=True,
                                          chars=config['use_chars'],
                                          allow_unk=True)
    processing_tag = get_processing_word(vocab_words=vocab_tags,
                                         lowercase=False,
                                         allow_unk=False)

    # build the training data
    train_dataset = Dataset(filename=config['train_data'], max_iter=None,
                            processing_word=processing_word,
                            processing_tag=processing_tag)
    # build the dev data
    dev_dataset = Dataset(filename=config['dev_data'], max_iter=None,
                          processing_word=processing_word,
                          processing_tag=processing_tag)

    # for data in train_dataset:
    #     print(data)
    for x_batch, y_batch in train_dataset.get_minibatch(4):
        print(x_batch)
        print(y_batch)

    # build the model and train it
    model = ner_model(config, logger, vocabs, embeddings)
    # build the computation graph
    model.build()
    # train
    model.train(train_dataset, dev_dataset)
def knn_fetching_zo_1(query_nd, k, N_max, sort=True, verbose=2):
    assert (Dataset.root.n_dimensions() == query_nd.n_dimensions())
    tree = Dataset.root.tree()

    t_0 = time.time()
    debug_time = time.time()

    indices = np.zeros((query_nd.n_points(), k), dtype=int)
    for i, root_id in enumerate(query_nd.indices()):
        res = tree.get_nns_by_item(root_id, k, include_distances=False)
        indices[i, :] = res

    if verbose > 1:
        print('\tQuerying tree took {:.2f} s'.format(time.time() - debug_time))

    # Get the unique indices, and where they are in the array
    unique_idcs = np.unique(indices.flatten())

    if verbose > 1:
        print('\tSearched for {} neighbours of {} observations.'.format(
            k, query_nd.n_points()))
        print('\tFound {} observations ({} unique)'.format(
            indices.size, unique_idcs.size))

    if sort:
        unique_idcs.sort()

    query_result_data = Dataset.root.data()[unique_idcs, :]
    query_result = Dataset(query_result_data, unique_idcs, name='Query result.')

    if verbose > 1:
        print('\tFound {} unique observations for zoom-out.'.format(
            unique_idcs.size))

    if unique_idcs.size > N_max:
        if verbose > 1:
            print('\tSubsampling {} observations to {}.'.format(
                unique_idcs.size, N_max))
        dataset = query_result.random_sampling(N_max)
    else:
        dataset = query_result

    if verbose:
        print('knn_fetching_zo took {:.2f} seconds.\n'.format(time.time() - t_0))

    return dataset
def illumina_metadata_batch1(session, p='../nov3/ILLUMINA_Metadata_Batch1.tsv'):
    count = 0
    for l in open(p):
        _, project, sample, _, _, run_id, project_title, *rest = l.strip().split('\t')
        if project == 'sample_id':
            continue
        prjs = list(session.query(Dataset).filter(Dataset.ena_id == project))
        if len(prjs) == 0:
            dataset = Dataset(ena_id=project, project_title=project_title)
            prj = dataset
            session.add(dataset)
            session.commit()
            prjs = list(session.query(Dataset).filter(Dataset.ena_id == project))
        assert len(prjs) == 1
        prj = prjs[0]
        runq = session.query(Run).filter_by(ena_id=run_id).first()
        if not runq:
            run = Run(ena_id=run_id, dataset_id=prj)
            prj.runs.append(run)
            count += 1
            session.add(run)
    print(f"adding {count} illumina runs")
    session.commit()
def main():
    '''
    Build the test set and evaluate a model restored from a previous run.
    '''
    # read the config file
    config = Configure()
    # set up the logger
    logger = get_logger(config['path_log'])
    # load the vocabularies
    vocab_util = Vocab_Util()
    # dict[word] = idx
    vocab_words = vocab_util.load_vocab(config['word_vocab_file'])
    # dict[char] = idx
    vocab_chars = vocab_util.load_vocab(config['char_vocab_file'])
    # dict[tag] = idx
    vocab_tags = vocab_util.load_vocab(config['tag_vocab_file'])
    # bundle the vocabularies for the model
    vocabs = [vocab_words, vocab_chars, vocab_tags]

    embeddings = vocab_util.get_trimmed_glove_vectors(config['trimmed_file'])

    # processing functions for words and tags
    processing_word = get_processing_word(vocab_words=vocab_words,
                                          vocab_chars=vocab_chars,
                                          lowercase=True,
                                          chars=config['use_chars'],
                                          allow_unk=True)
    processing_tag = get_processing_word(vocab_words=vocab_tags,
                                         lowercase=False,
                                         allow_unk=False)

    # build the test data
    test_dataset = Dataset(filename=config['test_data'], max_iter=None,
                           processing_word=processing_word,
                           processing_tag=processing_tag)

    # build the model, restore the saved session and evaluate
    model = ner_model(config, logger, vocabs, embeddings)
    model.build()
    model.restore_session()
    model.evaluate(test_dataset)
def backfill(opts, **kwargs):
    """
    Processes data with the model.

    Parameters
    ----------
    opts
        options dictionary; should have
        'input_data_params' - parameters to instantiate a dataset that creates
            a stream of raw data to process,
        'model_params' - model parameters needed to instantiate it,
        'output_data_params' - parameters of a consumer of processed data:
            params for additional post-model transformations, parameters of the
            destination for outputting results, etc.
    """
    # load model, init dataset and consumer
    m = Model(name=opts['model_name'], **opts['model_params'], **kwargs)
    connector = S3InMemoryConnector(bucket=BUCKET, prefix=MODEL_PREFIX)
    m.load(connector=connector)
    records = Dataset(**opts['input_data_params']).get_data()
    c = Consumer(name=opts['model_name'], **opts['output_data_params'])

    # define model output
    # m.proc_type = 'proc'  # cannot do that because of spacy, even dill does not pickle spacy instances :(
    # m.n_proc = 4          # but it would have worked if no cython functions
    outputs = m.itransform(records)

    # consume results
    c.consume(outputs)
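# A minimal sketch of an `opts` dict for backfill(), inferred from the keys it reads
# and from its docstring; the values are hypothetical placeholders only.
backfill_opts = {
    'model_name': 'my_model',
    'model_params': {},          # unpacked into Model(...)
    'input_data_params': {},     # unpacked into Dataset(...) to stream raw records
    'output_data_params': {},    # unpacked into Consumer(...)
}
# backfill(backfill_opts)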
def nanopore_metadata(session, p='../oct5/OXFORD_NANOPORE_Metadata.tsv'):
    count = 0
    for l in open(p):
        project, sample, _, _, run_id, project_title, *rest = l.strip().split('\t')
        if project == 'project_id':
            continue
        prjs = list(session.query(Dataset).filter(Dataset.ena_id == project))
        if len(prjs) == 0:
            dataset = Dataset(ena_id=project, project_title=project_title)
            session.add(dataset)
            session.commit()
            prjs = list(session.query(Dataset).filter(Dataset.ena_id == project))
        assert len(prjs) == 1
        prj = prjs[0]
        runq = session.query(Run).filter_by(ena_id=run_id).first()
        if not runq:
            run = Run(ena_id=run_id, dataset_id=prj)
            prj.runs.append(run)
            count += 1
            session.add(run)
    print(f"adding {count} nanopore runs")
    session.commit()
def main():
    '''
    Preprocess the data.
    '''
    configure = Configure('./config.cfg')
    processing_word = get_processing_word(lowercase=True)

    # build the datasets
    train_dataset = Dataset(configure['train_data'], processing_word=processing_word)
    dev_dataset = Dataset(configure['dev_data'], processing_word=processing_word)
    test_dataset = Dataset(configure['test_data'], processing_word=processing_word)

    # build the word and tag vocabularies
    vocab_util = Vocab_Util()
    vocab_words, vocab_tags = vocab_util.get_vocabs_from_datasets(
        [train_dataset, dev_dataset, test_dataset])
    # words present in the pre-trained embeddings
    vocab_glove = vocab_util.get_vocabs_from_glove(configure['glove_file'])

    # keep the intersection: words that appear both in the embedding vocabulary and in the datasets
    vocab_words = vocab_words & vocab_glove
    # add the UNK token and the NUM token for numbers
    vocab_words.add(UNK)
    vocab_words.add(NUM)

    # write the word and tag vocab files
    vocab_util.write_vocab(vocab_words, configure['word_vocab_file'])
    vocab_util.write_vocab(vocab_tags, configure['tag_vocab_file'])

    # trim the GloVe vectors to the vocabulary and store them
    vocab = vocab_util.load_vocab(configure['word_vocab_file'])
    vocab_util.export_trimmed_glove_vectors(vocab, configure['glove_file'],
                                            configure['trimmed_file'],
                                            configure['word_embedding_dim'])

    # build and store the char vocabulary
    train_dataset = Dataset(configure['train_data'])
    vocab_chars = vocab_util.get_char_vocab_from_datasets(train_dataset)
    vocab_util.write_vocab(vocab_chars, configure['char_vocab_file'])
def train_adversarial(model, train_dataset, epochs, layers, target_attack=False):
    """Train the model.
    train_dataset: Training Dataset object.
    epochs: Number of training epochs. Note that previous training epochs are
        considered to be done already, so this actually determines the epochs to
        train in total rather than in this particular call.
    layers: Allows selecting which layers to train. It can be:
        - A regular expression to match layer names to train
        - One of these predefined values:
          heads: The RPN, classifier and mask heads of the network
          all: All the layers
          3+: Train Resnet stage 3 and up
          4+: Train Resnet stage 4 and up
          5+: Train Resnet stage 5 and up
    """
    # Pre-defined layer regular expressions
    layer_regex = {
        # all layers but the backbone
        "heads": r"(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
        # From a specific Resnet stage and up
        "3+": r"(fpn.C3.*)|(fpn.C4.*)|(fpn.C5.*)|(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
        "4+": r"(fpn.C4.*)|(fpn.C5.*)|(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
        "5+": r"(fpn.C5.*)|(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
        # All layers
        "all": ".*",
    }
    if layers in layer_regex.keys():
        layers = layer_regex[layers]

    # Data generators
    train_set = Dataset(train_dataset, model.config, augment=False)
    train_generator = torch.utils.data.DataLoader(train_set, batch_size=1,
                                                  shuffle=False, num_workers=4)

    model.set_trainable(layers)

    for epoch in range(model.epoch + 1, epochs + 1):
        # Training
        train_adversarial_batch(model, train_generator, target_attack=target_attack)
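# Hypothetical usage sketch for train_adversarial(); `mask_rcnn` and `coco_train`
# are placeholder objects assumed to exist in the caller's scope, and the epoch
# count and layer selections are illustrative values only.
# Train only the head layers with an untargeted attack:
# train_adversarial(mask_rcnn, coco_train, epochs=20, layers="heads")
# Or pass a custom regex to select layers directly:
# train_adversarial(mask_rcnn, coco_train, epochs=20, layers=r"(rpn.*)|(mask.*)", target_attack=True)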
def knn_fetching_zo_4(query_nd, N_max, M, new_fraction=0.5, k=1, sort=True, verbose=2):
    assert (Dataset.root.n_dimensions() == query_nd.n_dimensions())

    t_0 = time.time()

    sphere = query_nd.bounding_hypersphere(smooth=False)

    # Number of points from query_nd we want to keep.
    N_keep = round(N_max * (1 - new_fraction))
    query_nd_subsampled = query_nd.random_sampling(N_keep)

    # Number of points we need to add.
    N_fetch = N_max - N_keep

    fetched_idcs = np.zeros(N_fetch, dtype=int)
    for i in range(N_fetch):
        new_idx = None
        while new_idx is None or new_idx in fetched_idcs[:i]:
            D_s = Dataset.root.random_sampling(M)
            D_s_outside_sphere = D_s.select_logical(~sphere.contains(D_s.data()))
            # Just to be sure, remove any points from query_nd_subsampled.
            D_s_outside_sphere = D_s_outside_sphere.select_logical(~np.in1d(
                D_s_outside_sphere.indices(), query_nd_subsampled.indices()))
            singleton_dataset = D_s_outside_sphere.knn_pointset(
                k, query_dataset=query_nd, method='bruteforce', verbose=False)
            new_idx = singleton_dataset.indices()[0]

        if verbose > 1:
            print('{}/{}'.format(i, N_fetch))

        fetched_idcs[i] = new_idx

    fetched_data = Dataset.root.data()[fetched_idcs, :]  # Is it faster to also fill this in the loop?
    fetched_dataset = Dataset(fetched_data, fetched_idcs)

    # Take the union of the subsampled query and the newly fetched dataset.
    result = query_nd_subsampled + fetched_dataset

    if verbose:
        print('ZO4 took {:.2f} seconds.'.format(time.time() - t_0))

    return result
def get_permissions(self, id, token):
    """Returns tuple like (dataset, http_error_method_name, error_msg)."""
    if token:
        # This URL has jwt authentication embedded in the query string so
        # that it is shareable. Ignore other permission rules as long as
        # the jwt is valid.
        payload, error = jwt_helper.decode(token)
        if not payload or error:
            return (None, 'http_unauthorized', error)
        allowed_endpoints = payload.get('allowed_endpoints', [])
        if self.get_endpoint_str() not in allowed_endpoints:
            return (None, 'http_forbidden', "Endpoint not allowed.")
        ds = Dataset.get_by_id(id)
        if not ds:
            return (None, 'http_not_found', None)
    else:
        # Without a token, do normal user-based authentication.
        user = self.get_current_user()
        if user.user_type == 'public':
            return (None, 'http_unauthorized', '')
        ds = Dataset.get_by_id(id)
        if not ds:
            return (None, 'http_not_found', None)
        if not ds.parent_id:
            if not user.super_admin:
                return (None, 'http_forbidden',
                        "Only supers can get parentless datasets.")
        elif not owns(self.get_current_user(), ds.parent_id):
            return (None, 'http_forbidden', "Must own parent.")

    return (ds, None, None)
def knn_selection(source, n_samples, pos, sort=True):
    knn = NearestNeighbors(n_neighbors=n_samples)

    X = source.data()
    knn.fit(X)
    idcs_in_source = knn.kneighbors(pos.reshape(1, -1),
                                    return_distance=False).flatten()

    if sort:
        idcs_in_source.sort()

    data = source.data()[idcs_in_source, :]
    idcs_in_root = source.indices()[idcs_in_source]

    dataset = Dataset(data, idcs_in_root, name='KNN selection')

    return dataset
def process_key(model, consumer, opts, s3_key: str) -> None:
    """
    Run by app.py, which is a message consumer; each message is sent by the
    content-ingest job and contains a new s3 key.
    Download the s3 key, vectorize the new content, upload the vectors to scylla.

    Parameters
    ----------
    s3_key
        received s3 key
    """
    try:
        opts['input_data_params']['no_parallel'] = True
        opts['input_data_params']['keys_list'] = s3_key
        records = Dataset(**opts['input_data_params']).get_data()
        outputs = model.itransform(records)
        consumer.consume(outputs)
    except Exception:
        log.error("Something failed...")
        log.error(traceback.format_exc())
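# A minimal sketch (assumptions only) of how process_key() could be wired into a
# message-consumer callback in app.py; `model`, `consumer`, `opts` and the message
# schema below are hypothetical placeholders, not part of the original code.
def on_message(message):
    s3_key = message['s3_key']  # assumed message field carrying the new key
    process_key(model, consumer, opts, s3_key)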
def post(self):
    rserve_user, error = authenticate_rserve(self)
    if error == jwt_helper.NOT_FOUND:
        return self.http_unauthorized(error)
    elif error:
        return self.http_forbidden(error)
    if not rserve_user.super_admin:
        return self.http_forbidden("Must be super admin.")

    params = self.get_params({
        'content_type': str,
        'data': 'json',
        'filename': str,
        'parent_id': str,
    })

    # Parent id may be in the query string.
    parent_id = (self.request.get('parent_id', None) or
                 params.pop('parent_id', None))

    ds = Dataset.create(parent_id=parent_id, **params)
    ds.put()

    self.write(ds)
    weighted_loss = raw_loss * loss_weight
    return weighted_loss, raw_loss, KL, bits, MAE, l1


if __name__ == '__main__':
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    torch.backends.cudnn.benchmark = True
    print('device = {}'.format(device))

    params = {'batch_size': 32, 'shuffle': True, 'num_workers': 0}

    # Define datasets and loaders
    train_path = r'X:\DS Training Data\samples\train.npy'
    train_data = Dataset(train_path)
    train_loader = torch.utils.data.DataLoader(train_data, drop_last=True, **params)

    val_path = r'X:\DS Training Data\samples\val.npy'
    val_data = Dataset(val_path)
    val_loader = torch.utils.data.DataLoader(val_data, drop_last=True, **params)

    test_path = r'X:\DS Training Data\samples\test.npy'
    test_data = Dataset(test_path)
    test_loader = torch.utils.data.DataLoader(test_data, drop_last=True, **params)
def main():
    path_to_dataset = "C:\\GIT\\ZIWM\\data.csv"
    file_writer = FileWriter("")
    total_number_of_classes = 8
    total_number_of_features = 30
    num_of_neighbours = [1, 5, 10]
    type_of_metric = ["euclidean", "manhattan"]

    # loading raw values from file
    datasetLoader = load_datasets.Load_datasets(path_to_dataset)
    dataset_raw_values = datasetLoader.loadDataset()

    # constructing the main dataset with division into features and classes
    dataset = Dataset(dataset_raw_values)

    best_fit = 0.0
    best_average_fit = 0.0

    # selecting the number of features and running tests
    for number_of_features_selected in range(1, 31):
        # print(number_of_features_selected)
        trimmed_feature_list = FeatureSelector.selectKBestFeatures(
            number_of_features_selected, dataset.dataset_features_array,
            dataset.dataset_class_array)

        # dividing data sets into patients
        patients = []
        for i in range(len(dataset.dataset_class_array)):
            patient = Patient(i, dataset.dataset_class_array[i],
                              trimmed_feature_list[i])
            # print(patient.getId(), patient.getDisease_class(), patient.get_features())
            patients.append(patient)

        # testing for each metric type and number of neighbours
        for metric in type_of_metric:
            for n_neighbours in num_of_neighbours:
                test_result_arr = []
                for i in range(5):
                    print("metric: ", metric, " n_neighbours", n_neighbours,
                          " run: ", i)
                    # creating learning and test data sets
                    learning_set, testing_set = SplitSets.splitSets(patients)

                    # creating the algorithm and training
                    kn = KNearestNeighbour(n_neighbours, metric, learning_set,
                                           testing_set)
                    kn.train()
                    res1 = kn.test()

                    # swapping training and testing sets
                    temp_set = learning_set
                    learning_set = testing_set
                    testing_set = temp_set
                    kn.setTestSet(testing_set)
                    kn.setTrainingSet(learning_set)

                    # training once again
                    kn.train()
                    res2 = kn.test()

                    print("test result 1: ", res1)
                    print("test result 2: ", res2)
                    test_result_arr.append(res1)
                    test_result_arr.append(res2)

                    if res1 > best_fit:
                        best_fit = res1
                    if res2 > best_fit:
                        best_fit = res2

                test_average = sum(test_result_arr) / len(test_result_arr)
                print("average of tests: ", test_average)
                result_str = (str(number_of_features_selected) + " | " + metric +
                              " | " + str(n_neighbours) + " | " +
                              str(test_average) + " \n")
                file_writer.write(result_str)
                if test_average > best_average_fit:
                    best_average_fit = test_average

    # comparing results of the test data set
    # calculating hit rate
    print("best fit: ", best_fit)
    print("best fit average: ", best_average_fit)
    file_writer.close()
def main():
    pred_file_path = 'test.csv'
    load_save_model = True
    lr = 1e-5
    batch_size = 8
    gpu = True

    torch.manual_seed(0)
    device = torch.device('cpu')
    if gpu:
        device = torch.device('cuda')

    tokenizer = BertTokenizer(vocab_file='publish/vocab.txt', max_len=512)
    _, known_token = load_dataset('TRAIN/Train_reviews.csv',
                                  'TRAIN/Train_labels.csv', tokenizer)
    dataset = load_review_dataset('TRAIN/TEST/Test_reviews.csv')
    dataset = Dataset(list(dataset.items()))
    dataloader = torch_data.DataLoader(dataset=dataset,
                                       batch_size=batch_size,
                                       shuffle=False,
                                       collate_fn=test_collate_fn(tokenizer, known_token))

    bert_pretraining = convert_tf_checkpoint_to_pytorch(
        './publish/bert_model.ckpt', './publish/bert_config.json')
    model = Model(bert_pretraining.bert)
    model = model.cuda()

    if load_save_model:
        model.load_state_dict(torch.load('./save_model/best.model'))

    pred_file = open(pred_file_path, mode='w', encoding='utf-8')
    pbar = tqdm()
    model.eval()
    for step, (batch_X, len_X, mask, batch_idx, origin_batch_X) in enumerate(dataloader):
        batch_X = batch_X.to(device)
        mask = mask.to(device)
        scores, gather_idx = model(batch_X, len_X, mask, None)
        (pred_seq_target, pred_match_target, pred_single_aspect_category_target,
         pred_single_opinion_category_target, pred_cross_category_target,
         pred_single_aspect_polarity_target, pred_single_opinion_polarity_target,
         pred_cross_polarity_target) = model.infer(scores, mask)

        label = []
        aspect_idx, opinion_idx = gather_idx
        for b in range(batch_X.shape[0]):
            _aspect_idx, _opinion_idx = aspect_idx[b], opinion_idx[b]
            if len(_aspect_idx) == 0 and len(_opinion_idx) == 0:
                label.append((batch_idx[b], '_', '_', '_', '_'))
            _aspect_cross = [False for i in range(len(_aspect_idx))]
            _opinion_cross = [False for i in range(len(_opinion_idx))]
            for i in range(len(_aspect_idx)):
                for j in range(len(_opinion_idx)):
                    if pred_match_target[b][i, j] == 1:
                        _aspect_cross[i] = True
                        _opinion_cross[j] = True
                        category = ID2CATEGORY[pred_cross_category_target[b][i, j]]
                        polarity = ID2POLARITY[pred_cross_polarity_target[b][i, j]]
                        aspect = tokenizer.decode(
                            list(origin_batch_X[b, _aspect_idx[i]].cpu().detach().numpy())).replace(' ', '')
                        opinion = tokenizer.decode(
                            list(origin_batch_X[b, _opinion_idx[j]].cpu().detach().numpy())).replace(' ', '')
                        # aspect = tokenizer.decode(list(batch_X[b, _aspect_idx[i]].cpu().detach().numpy())).replace(' ', '')
                        # opinion = tokenizer.decode(list(batch_X[b, _opinion_idx[j]].cpu().detach().numpy())).replace(' ', '')
                        aspect_beg = len(tokenizer.decode(
                            list(batch_X[b, 1:_aspect_idx[i][0]].cpu().detach().numpy())).replace(' ', ''))
                        aspect_end = aspect_beg + len(aspect)
                        opinion_beg = len(tokenizer.decode(
                            list(batch_X[b, 1:_opinion_idx[j][0]].cpu().detach().numpy())).replace(' ', ''))
                        opinion_end = opinion_beg + len(opinion)
                        label.append((batch_idx[b], aspect, opinion, category, polarity))
            for i in range(len(_aspect_idx)):
                if _aspect_cross[i] == False:
                    category = ID2CATEGORY[pred_single_aspect_category_target[b][i]]
                    polarity = ID2POLARITY[pred_single_aspect_polarity_target[b][i]]
                    aspect = tokenizer.decode(
                        list(origin_batch_X[b, _aspect_idx[i]].cpu().detach().numpy())).replace(' ', '')
                    # aspect = tokenizer.decode(list(batch_X[b, _aspect_idx[i]].cpu().detach().numpy())).replace(' ', '')
                    aspect_beg = len(tokenizer.decode(
                        list(batch_X[b, 1:_aspect_idx[i][0]].cpu().detach().numpy())).replace(' ', ''))
                    aspect_end = aspect_beg + len(aspect)
                    label.append((batch_idx[b], aspect, '_', category, polarity))
            for i in range(len(_opinion_idx)):
                if _opinion_cross[i] == False:
                    category = ID2CATEGORY[pred_single_opinion_category_target[b][i]]
                    polarity = ID2POLARITY[pred_single_opinion_polarity_target[b][i]]
                    opinion = tokenizer.decode(
                        list(origin_batch_X[b, _opinion_idx[i]].cpu().detach().numpy())).replace(' ', '')
                    # opinion = tokenizer.decode(list(batch_X[b, _opinion_idx[i]].cpu().detach().numpy())).replace(' ', '')
                    opinion_beg = len(tokenizer.decode(
                        list(batch_X[b, 1:_opinion_idx[i][0]].cpu().detach().numpy())).replace(' ', ''))
                    opinion_end = opinion_beg + len(opinion)
                    label.append((batch_idx[b], '_', opinion, category, polarity))

        for _label in label:
            _label = ','.join(list(map(lambda x: str(x), _label)))
            pred_file.write(_label + '\n')
        pbar.update(batch_size)
        pbar.set_description('step: %d' % step)

    pred_file.close()
    pbar.close()
def train_model(batch_size, lr, max_epochs, num_unmasked_weights, loss_weight,
                run_validation, print_grads, training_set, LAT):
    '''
    Trains the model and returns it together with a dictionary of loss statistics.

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
    > batch_size
    > lr - learning rate
    > max_epochs
    > num_unmasked_weights - the number of weights allowed through the linear
        combination, e.g. num_unmasked_weights = 1 means only one sample will be
        chosen, = 2 means 2 samples will be chosen, etc.
        output vector -> softmax -> mask -> ensure weights sum to 1
        (not softmax, because 0's stay 0).
    > loss_weight - a function that weighs the loss over time.
    > run_validation - if True, will run through the validation set after each epoch
    > print_grads - will display gradient flow every print_grads epochs.
        Set to False or 0 to avoid printing gradients.
    > training_set - descriptive string describing the training data
    > LAT - boolean, whether to include log attack time or not

    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
    > model
    > losses - dictionary with the following keys:
        'raw_losses' - losses unweighted by loss_weight
        'train_losses' - losses weighted by loss_weight
        'epoch_loss' - loss per epoch
        'val_losses' - validation loss
        'weights' - appends the weight vector of a whole batch once every few epochs
    '''
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    torch.backends.cudnn.benchmark = True
    print('device = {}'.format(device))

    params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 0}

    # Define datasets and loaders
    samples_path = r'X:\DS Training Data\samples'
    train_data = Dataset(samples_path, 'train.npy', LAT=LAT)
    train_loader = torch.utils.data.DataLoader(train_data, drop_last=True, **params)

    val_data = Dataset(samples_path, 'val.npy', LAT=LAT)
    val_loader = torch.utils.data.DataLoader(val_data, drop_last=True, **params)

    # test_data = Dataset(samples_path, 'test.npy')
    # test_loader = torch.utils.data.DataLoader(test_data, drop_last=True, **params)

    src_path = test_path = r'X:\DS Training Data\samples\base\src.npy'

    model = simpleVAE(d=32, src_path=src_path, batch_size=params['batch_size'],
                      device=device, num_to_keep=num_unmasked_weights, LAT=LAT).cuda()
    # print(model)

    criterion = simple_loss
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    balance = [100, 0, 1]  # [MAE, KL, l1]
    losses = {
        'raw_losses': [],      # unweighted losses
        'train_losses': [],    # weighted losses
        'epoch_loss': [],      # cumulative loss per epoch
        'val_losses': [],      # validation loss
        'loss_weight': [],     # overarching weight of loss
        'step': [],            # global step
        'weights': [],         # output weights of the model, recorded every X epochs
        'idxs': [],            # indices corresponding to weight vectors
        'KL': [],              # KL divergence
        'bits': [],            # KL divergence free bits
        'l1': [],              # l1 loss
        'MSE': [],             # MSE loss
        'MAE': [],             # MAE loss
        'balance': balance,    # relative MAE, KL, l1 loss
        'max_epochs': max_epochs,                      # maximum epochs
        'num_unmasked_weights': num_unmasked_weights,  # number of unmasked weights (defined in model)
        'training_set': training_set,                  # descriptive string describing training set
        'lr': lr                                       # learning rate
    }

    with torch.autograd.set_detect_anomaly(True):
        for epoch in range(1, max_epochs + 1):
            model.train()
            epoch_loss = 0
            displayed = False
            weights_recorded = False

            # Training
            for batch_idx, (local_batch, idx) in enumerate(train_loader):
                # Transfer to GPU
                local_batch = local_batch.to(device, dtype=torch.float32)

                # ================ forward ================
                candidate, mu, logvar, q_z, weight_vec = model(local_batch)
                loss = criterion(create_torch_stft(candidate).double(),
                                 local_batch[:, :1025].double(),
                                 loss_weight(epoch))
                epoch_loss += loss.item()

                # Keep track of some loss statistics
                losses['raw_losses'].append(loss.item() / loss_weight(epoch))
                losses['MSE'].append(losses['raw_losses'][-1])
                losses['train_losses'].append(loss.item())
                losses['loss_weight'].append(loss_weight(epoch))
                losses['step'].append(model.global_step)
                losses['KL'].append(0)
                losses['bits'].append(0)
                losses['l1'].append(0)
                losses['MAE'].append(0)

                # ================ backward ================
                loss.backward()
                if print_grads:
                    if epoch % print_grads == 0 and not displayed:
                        displayed = True
                        plot_grad_flow(model.named_parameters())
                optimizer.step()
                model.global_step += 1
                model.zero_grad()

                if batch_idx % 5 == 0:
                    # Record time per 10 batches
                    if batch_idx != 0:
                        end = time.time()
                        elapsed = end - start
                    else:
                        elapsed = 0
                    start = time.time()

                    # Record weights every few epochs, depending on max_epochs
                    if not weights_recorded:
                        if max_epochs <= 5:
                            losses['weights'].append(weight_vec)
                            losses['idxs'].append(idx)
                            weights_recorded = True
                        elif 5 < max_epochs <= 20:
                            if epoch % 2 == 0:
                                losses['weights'].append(weight_vec)
                                losses['idxs'].append(idx)
                                weights_recorded = True
                        elif 20 < max_epochs <= 100:
                            if epoch % 10 == 0:
                                losses['weights'].append(weight_vec)
                                losses['idxs'].append(idx)
                                weights_recorded = True
                        elif 100 < max_epochs <= 1000:
                            if epoch % 20 == 0:
                                losses['weights'].append(weight_vec)
                                losses['idxs'].append(idx)
                                weights_recorded = True
                        else:
                            if epoch % 50 == 0:
                                losses['weights'].append(weight_vec)
                                losses['idxs'].append(idx)
                                weights_recorded = True

                    # print training update
                    print('* * * * * * * * * * * * * * * * * *\n')
                    print('      epoch : {}/{}'.format(epoch, losses['max_epochs']))
                    print('      batch : ({}/{}) {:.0f}%'.format(
                        len(local_batch) * batch_idx, len(train_loader.dataset),
                        100. * batch_idx / len(train_loader)))
                    print('       step : {}'.format(model.global_step))
                    print('       time : {:.2f}\n'.format(elapsed))
                    print('       loss : ({}) * {:.3f}'.format(
                        loss_weight(epoch), losses['raw_losses'][-1]))
                    print('            = {}\n'.format(losses['train_losses'][-1]))
                    print('        max : {:.6f}'.format(torch.max(weight_vec[-1])))
                    print('    max idx : {:.0f}'.format(torch.argmax(weight_vec[-1])))
                    print('        min : {:.6f}\n'.format(torch.min(weight_vec[-1])))
                    print('*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*')

            losses['epoch_loss'].append(epoch_loss)

            if run_validation == True:
                # ================ validation ================
                with torch.no_grad():
                    model.eval()  # Put model in evaluation mode
                    for batch_idx, (local_batch, idx) in enumerate(val_loader):
                        # Transfer to GPU
                        local_batch = local_batch.to(device, dtype=torch.float32)

                        # Model computations
                        candidate, mu, logvar, q_z, _ = model(local_batch)
                        loss = criterion(create_torch_stft(candidate).double(),
                                         local_batch[:, :1025].double(),
                                         loss_weight(epoch))
                        print('loading validation loss...\n')
                        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
                        print('Validation Loss: {}'.format(loss.item()))
                        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n')
                        losses['val_losses'].append(loss.item())
            else:
                losses['val_losses'].append(0)

    return model, losses
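# Hypothetical call sketch for train_model(); the ramp-up loss_weight below and all
# hyperparameter values are illustrative assumptions, not values from the original code.
def linear_ramp(epoch, warmup=10):
    # weight the loss more heavily as training progresses, capped at 1.0
    return min(1.0, epoch / warmup)

# model, losses = train_model(batch_size=32, lr=1e-4, max_epochs=100,
#                             num_unmasked_weights=2, loss_weight=linear_ramp,
#                             run_validation=True, print_grads=0,
#                             training_set='samples/train.npy', LAT=False)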
def test(args):
    print('[*] TEST')
    device = torch.device('cuda:' + str(args.device))
    dataset_path = args.data
    onmt_path = args.pretrain
    unref_path = args.unref
    nbatch = int(args.batch)
    output = args.output

    # data load
    gold_reply_path = os.path.join(dataset_path, 'test_reply.txt')
    gene_reply_path = os.path.join(dataset_path, 'gene_reply.txt')
    test_query_path = os.path.join(dataset_path, 'test_query.txt')

    refer_test_data_path_list = [gene_reply_path, gold_reply_path]
    unref_test_data_path_list = [test_query_path, gold_reply_path]

    vocab_path = os.path.join(onmt_path, 'sample.vocab.pt')
    onmt_model_path = os.path.join(onmt_path, 'sample.model.pt')

    refer_test_dataset = Dataset(vocab_path=vocab_path,
                                 data_path_list=refer_test_data_path_list,
                                 max_length=50)
    refer_test_loader = DataLoader(dataset=refer_test_dataset,
                                   batch_size=nbatch,
                                   collate_fn=collate_fn,
                                   num_workers=8)

    unref_test_dataset = Dataset(vocab_path=vocab_path,
                                 data_path_list=unref_test_data_path_list,
                                 max_length=50)
    unref_test_loader = DataLoader(dataset=unref_test_dataset,
                                   batch_size=nbatch,
                                   collate_fn=collate_fn,
                                   num_workers=8)

    # unref model load
    unrefer_pos_model = torch.load(unref_path + '_pos.th')
    unrefer_neg_model = torch.load(unref_path + '_neg.th')

    # test
    positive_test = open(output + '/positive_result.txt', 'w', encoding='utf-8')
    negative_test = open(output + '/negative_result.txt', 'w', encoding='utf-8')

    for query, q_len, reply, r_len in unref_test_loader:
        prediction1 = unrefer_pos_model(query, q_len, reply, r_len)
        prediction2 = unrefer_neg_model(query, q_len, reply, r_len)
        # print(query, '/', q_len, '/', reply, '/', r_len)
        print(prediction1)
        print(prediction2)
        positive_test.write(str(prediction1.data))
        negative_test.write(str(prediction2.data))
        print('break')
        break

    positive_test.close()
    negative_test.close()

    # reference score
    encoder = build_pretrained_model(onmt_model_path, refer_test_dataset.vocab)
    ref_model = RefScorer(encoder, device)

    sim_output = open(output + '/similarity.txt', 'w', encoding='utf-8')
    for gold_indices, gold_lens, gen_indices, gen_lens in refer_test_loader:
        similarity = ref_model.get_ref_score(gold_indices, gold_lens,
                                             gen_indices, gen_lens)
        print(similarity.data)
        sim_output.write(str(similarity.data))
        break
    sim_output.close()
def train(args):
    print('[*] TRAIN')
    device = torch.device('cuda:' + str(args.device))
    ninput = int(args.dim)
    nlayer = int(args.layer)
    nbatch = int(args.batch)
    nhidden = int(args.hidden)
    margin = int(args.margin)
    epoch = int(args.epoch)
    learningrate = float(args.lr)
    dataset_path = args.data
    onmt_path = args.pretrain

    train_src_path = os.path.join(dataset_path, 'src_train.txt')
    train_tar_path = os.path.join(dataset_path, 'tar_train.txt')
    unref_train_data_path_list = [train_src_path, train_tar_path]
    vocab_path = os.path.join(onmt_path, 'sample.vocab.pt')

    # pre-trained model
    # onmt_vocab_path = os.path.join(onmt_path, 'sample.vocab.pt')
    # onmt_model_path = os.path.join(onmt_path, 'sample.model.pt')

    # data load
    unref_train_dataset = Dataset(vocab_path=vocab_path,
                                  data_path_list=unref_train_data_path_list,
                                  max_length=50)
    unref_nega_dataset = NegativeDataset(unref_train_dataset.data,
                                         unref_train_dataset.vocab)
    # print('positive ', unref_train_dataset[0], unref_train_dataset.data[0])
    # print('negative ', negative_dataset[0], negative_dataset.data[0])

    positive_loader = DataLoader(dataset=unref_train_dataset,
                                 batch_size=nbatch,
                                 collate_fn=collate_fn,
                                 num_workers=8)
    negative_loader = DataLoader(dataset=unref_nega_dataset,
                                 batch_size=nbatch,
                                 collate_fn=collate_fn,
                                 num_workers=8)

    vocab_size = unref_train_dataset.getVocabSize()
    batch_num = math.ceil(unref_train_dataset.getInstanceSize() / nbatch)
    print('[*] # of batch: ', batch_num, ' pos, neg :',
          len(positive_loader), len(negative_loader))

    # ninput, nhidden, nlayer, ntoken, nbatch
    unrefer_pos_model = UnrefScorer(ninput, nhidden, nlayer, vocab_size, nbatch, device)
    unrefer_pos_model = unrefer_pos_model.to(device)
    unrefer_neg_model = UnrefScorer(ninput, nhidden, nlayer, vocab_size, nbatch, device)
    unrefer_neg_model = unrefer_neg_model.to(device)

    loss_f = torch.nn.MarginRankingLoss(margin)
    optimizer1 = torch.optim.Adam(unrefer_pos_model.parameters(), lr=learningrate)
    optimizer2 = torch.optim.Adam(unrefer_neg_model.parameters(), lr=learningrate)

    total_loss = 0
    for i in range(epoch):  # epoch
        iter_positive = iter(positive_loader)
        iter_negative = iter(negative_loader)
        for mini in range(batch_num):
            # positive training
            pos_src, pos_src_len, pos_tar, pos_tar_len = next(iter_positive)
            neg_src, neg_src_len, neg_tar, neg_tar_len = next(iter_negative)
            # print('pos', pos_src, '/', pos_src_len, '/', pos_tar, '/', pos_tar_len)

            encd_pos = unrefer_pos_model(pos_src, pos_src_len, pos_tar, pos_tar_len)
            encd_neg = unrefer_neg_model(neg_src, neg_src_len, neg_tar, neg_tar_len)
            # print('next ', mini, encd_pos.size(), encd_neg.size())

            target = torch.ones(encd_pos.size(0), 1).to(device)  # batch
            loss = loss_f(encd_pos, encd_neg, target)
            total_loss = total_loss + loss.item()

            unrefer_pos_model.zero_grad()
            unrefer_neg_model.zero_grad()
            loss.backward()
            optimizer1.step()
            optimizer2.step()

        print('[-] epoch: ', i, ', total_loss :', total_loss)
        total_loss = 0

    torch.save(unrefer_pos_model, args.output + '_pos.th')
    torch.save(unrefer_neg_model, args.output + '_neg.th')
    FILE.write('%f\n' % total_stats[i][2][-1])
    FILE.close()


# -----------------------------------------------------------------------------------------

###########################
###     START HERE      ###
###########################
if __name__ == '__main__':
    # Get the command line arguments in a dictionary
    args = Arguments()

    # Create a Dataset object based on the <genome_file>
    refGenome = Dataset(args['<genome_file>'])
    refGenome.calcGenomeCoords()
    if args['-v'] in ['all', 'refG']:
        refGenome.printDataset('refGenome')

    # Create a Dataset object based on the <A_file>
    fileA = Dataset(args['<A_file>'])
    fileA.calcDataCoords(refGenome)
    if args['-v'] in ['all', 'fileA']:
        fileA.printDataset('fileA')

    # Print output header for the <A_file>
    output_header(args, fileA)

    # Foreach <B_files>
    if args['-c']:
        FH = open('testFiles/summary', 'w')
    mnRandom = 0
    for fileB_name in args['<B_files>']:
from model import Dataset, Classifier, GRADIENT_BOOSTING

if __name__ == "__main__":
    classifier = Classifier(GRADIENT_BOOSTING)
    classifier.deserialize_model("model.out")

    predict_dataset = Dataset()
    predict_dataset.append_candidates("candidates_temp.csv")

    proba = classifier.predict_proba(predict_dataset.features)
    predict_dataset.store_proba("probabilities.csv", proba)
def root_selection(source):
    data = Dataset.root.data()[source.indices(), :]
    dataset = Dataset(data, source.indices(), name='Root selection')
    return dataset
if torch.cuda.is_available():
    args.cuda = True
else:
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

set_seed_everywhere(args.seed, args.cuda)
handle_dirs(args.save_dir)

if args.reload_from_files and Path(args.vectorizer_file).exists():
    dataset = Dataset.load_dataset_and_load_vectorizer(args.dataset_csv,
                                                       args.vectorizer_file)
else:
    dataset = Dataset.load_dataset_and_make_vectorizer(args.dataset_csv)
    dataset.save_vectorizer(args.vectorizer_file)

vectorizer = dataset.get_vectorizer()

model = Model(
    source_vocab_size=len(vectorizer.source_vocab),
    source_embedding_size=args.source_embedding_size,
    target_vocab_size=len(vectorizer.target_vocab),
    target_embedding_size=args.target_embedding_size,
    encoding_size=args.encoding_size,
    target_bos_index=vectorizer.target_vocab.begin_seq_index,
)