def __init__(self, input_channels, output_channels, kernel, dropout=0.0, activation='identity',
             dilation=1, groups=1, batch_norm=True):
    """Set up a highway convolution block.

    The parent constructor receives every argument unchanged except the
    output width, which is doubled. A sigmoid module is stored as the gate.
    """
    # The parent block is asked for twice the requested number of output
    # channels -- presumably one half is the candidate and the other half
    # feeds the gate; confirm against forward().
    doubled_channels = 2 * output_channels
    super(HighwayConvBlock, self).__init__(
        input_channels,
        doubled_channels,
        kernel,
        dropout,
        activation,
        dilation,
        groups,
        batch_norm,
    )
    self._gate = Sigmoid()
class StructuredFunction:
    """Bundle of two activation functions.

    ``sum`` is the activation for Sum-Type outputs (results should sum to 1);
    ``prod`` is the activation for Product-Type outputs (results lie in
    [0, 1] and are independent of each other).
    """

    def __init__(self, sum, prod):
        self.sum, self.prod = sum, prod


class Linear(Function):
    """Identity activation: data and gradients pass through untouched."""

    @staticmethod
    def forward(ctx, input):
        # Nothing is stashed on ctx; the input is handed straight back.
        return input

    @staticmethod
    def backward(ctx, grad_outputs):
        # The derivative of the identity is 1, so the gradient is unchanged.
        return grad_outputs


# Callable form of the identity activation.
linear = Linear.apply

# Identity for both the sum-type and the product-type outputs.
structuredLinear = StructuredFunction(sum=linear, prod=linear)

# Softmax over dim 1 for sum-type outputs, element-wise sigmoid for product-type.
structuredSigmoid = StructuredFunction(sum=Softmax(dim=1), prod=Sigmoid())
def __init__(self, input_shape, n_convfilter, n_fc_filters, h_shape, conv3d_filter_shape):
    """Create the layers of the encoder.

    Args:
        input_shape: shape of the input batch; index 1 is used as the number
            of input channels (original note: (batch_size, 3, img_w, img_h)).
        n_convfilter: output widths of the six convolution stages
            (original note: [96, 128, 256, 256, 256, 256]).
        n_fc_filters: index 0 is the width of the fully connected layer.
        h_shape: forwarded to every ``BN_FCConv3DLayer_torch``.
        conv3d_filter_shape: forwarded to every ``BN_FCConv3DLayer_torch``.
    """
    print("\ninitializing \"encoder\"")
    super(encoder, self).__init__()

    def conv_bn(in_ch, out_ch, kernel, pad):
        # A 2-D convolution followed by the batch norm sized to its output.
        return Conv2d(in_ch, out_ch, kernel, padding=pad), BatchNorm2d(out_ch)

    width = n_convfilter
    fc_width = n_fc_filters[0]

    # Stage 1: 7x7 then 3x3 convolution.
    self.conv1a, self.bn1a = conv_bn(input_shape[1], width[0], 7, 3)
    self.conv1b, self.bn1b = conv_bn(width[0], width[0], 3, 1)

    # Stage 2: two 3x3 convolutions plus a 1x1 convolution from the stage
    # input width (presumably a shortcut projection -- confirm in forward()).
    self.conv2a, self.bn2a = conv_bn(width[0], width[1], 3, 1)
    self.conv2b, self.bn2b = conv_bn(width[1], width[1], 3, 1)
    self.conv2c, self.bn2c = conv_bn(width[0], width[1], 1, 0)

    # Stage 3: same layout as stage 2.
    self.conv3a, self.bn3a = conv_bn(width[1], width[2], 3, 1)
    self.conv3b, self.bn3b = conv_bn(width[2], width[2], 3, 1)
    self.conv3c, self.bn3c = conv_bn(width[1], width[2], 1, 0)

    # Stages 4-6: two 3x3 convolutions each, no 1x1 branch.
    self.conv4a, self.bn4a = conv_bn(width[2], width[3], 3, 1)
    self.conv4b, self.bn4b = conv_bn(width[3], width[3], 3, 1)
    self.conv5a, self.bn5a = conv_bn(width[3], width[4], 3, 1)
    self.conv5b, self.bn5b = conv_bn(width[4], width[4], 3, 1)
    self.conv6a, self.bn6a = conv_bn(width[4], width[5], 3, 1)
    self.conv6b, self.bn6b = conv_bn(width[5], width[5], 3, 1)

    # Shared pooling layer and non-linearities.
    self.pool = MaxPool2d(kernel_size=2, padding=1)
    self.leaky_relu = LeakyReLU(negative_slope=0.01)
    self.sigmoid = Sigmoid()
    self.tanh = Tanh()

    # Spatial size of the feature map entering the fully connected layer,
    # as reported by fc_in_featmap_size for six pooling steps.
    fc7_feat_w, fc7_feat_h = self.fc_in_featmap_size(input_shape, num_pooling=6)
    fc7_in_features = int(width[5] * fc7_feat_w * fc7_feat_h)
    self.fc7 = Linear(fc7_in_features, fc_width)

    # Three layers with identical arguments; the names suggest the update
    # gate, reset gate and candidate state of a 3-D convolutional GRU.
    # Original notes: conv3d_filter_shape = [128, 128, 3, 3, 3],
    # h_shape = (batch_size, 128, 4, 4, 4).
    self.t_x_s_update = BN_FCConv3DLayer_torch(fc_width, conv3d_filter_shape, h_shape)
    self.t_x_s_reset = BN_FCConv3DLayer_torch(fc_width, conv3d_filter_shape, h_shape)
    self.t_x_rs = BN_FCConv3DLayer_torch(fc_width, conv3d_filter_shape, h_shape)
def __init__(self, dimension):
    """Create the two square projections of a highway layer.

    Args:
        dimension: input and output size of both linear projections.
    """
    super(HighwayLayer, self).__init__()

    # Projection followed by ReLU.
    transform = Linear(dimension, dimension)
    self._linear = Sequential(transform, ReLU())

    # Projection followed by a sigmoid, stored as the gate.
    gate_projection = Linear(dimension, dimension)
    self._gate = Sequential(gate_projection, Sigmoid())
def __init__(self, in_node_feats, in_global_feats):
    """Create the node-level and global-level sigmoid heads.

    Args:
        in_node_feats: input size of the node head (2 outputs).
        in_global_feats: input size of the global head (5 outputs).
    """
    super().__init__()

    node_projection = Linear(in_node_feats, 2)
    self.node_fn = Sequential(node_projection, Sigmoid())

    global_projection = Linear(in_global_feats, 5)
    self.global_fn = Sequential(global_projection, Sigmoid())
def __init__(self, n_features, n_embeddings, n_units):
    """Create one encoder and five decoders, each a GRU plus a linear head.

    Args:
        n_features: input size of the encoder and output size of ``decoder``.
        n_embeddings: output size of the encoder; input size of ``decoder``,
            ``decoder3`` and ``decoder4``.
        n_units: hidden size of every GRU.
    """
    super(Net, self).__init__()
    self.n_features = n_features
    self.n_embeddings = n_embeddings
    self.n_units = n_units

    def gru_with_head(in_size, out_size):
        # 3-layer bidirectional GRU; the linear head takes 2 * n_units
        # inputs because both directions are concatenated.
        return ModuleDict({
            'gru': GRU(in_size, self.n_units, 3, dropout=0.1,
                       bidirectional=True, batch_first=True),
            'linear': Linear(2 * self.n_units, out_size),
        })

    self.encoder = gru_with_head(self.n_features, self.n_embeddings)
    self.decoder = gru_with_head(self.n_embeddings, self.n_features)
    # decoder1 / decoder2 use fixed sizes of 16 and 10.
    self.decoder1 = gru_with_head(16, 16)
    self.decoder2 = gru_with_head(10, 10)
    # decoder3 / decoder4 reduce the embedding to a single value per step.
    self.decoder3 = gru_with_head(self.n_embeddings, 1)
    self.decoder4 = gru_with_head(self.n_embeddings, 1)

    self.relu = ReLU()
    self.sigmoid = Sigmoid()
def main(in_path, outpath):
    """Predict answers for a SQuAD-style JSON file and write them to disk.

    Loads the saved span extractor and answer verifier from ``EXPERIMENT``,
    parses every paragraph into constituency trees, scores each question and
    writes a ``{question_id: predicted_text}`` mapping as JSON. Questions
    whose blended has-answer probability is below ``HAS_ANSWER_THRESHOLD``
    get an empty string.

    Args:
        in_path: path of the input JSON file (must contain a ``'data'`` key).
        outpath: path of the JSON file the predictions are written to.
    """
    nltk.download()

    # NOTE(review): torch.load unpickles the checkpoint files; only use
    # checkpoints from a trusted source.
    span_extractor = torch.load(os.path.join(EXPERIMENT, 'best_span_extractor.tar'),
                                map_location='cpu')
    answer_verifier = torch.load(os.path.join(EXPERIMENT, 'best_answer_verifier.tar'),
                                 map_location='cpu')
    span_extractor.use_cuda = False
    answer_verifier.use_cuda = False

    tokenizer = StanfordTokenizer(
        options={'ptb3Escaping': True})  # same tokenizer used by lexical parser
    parser = StanfordParser(java_options='-mx5g')

    # Close the input file as soon as it has been parsed.
    with open(in_path, 'r') as in_file:
        data = json.load(in_file)['data']

    batches = []
    official_eval = {}
    qaid_map = {}  # question id -> original (unmodified) paragraph text
    num_articles = len(data)

    for aidx, article in enumerate(data):
        print('\t- Article Count=%d/%d' % (aidx + 1, num_articles))
        for paragraph in article['paragraphs']:
            passage, qas = paragraph['context'], paragraph['qas']
            # Non-breaking spaces are replaced before sentence splitting.
            passage = passage.replace(u'\xa0', ' ')
            sentences = sent_tokenize(passage)
            sentence_tokens = [tokenizer.tokenize(sentence) for sentence in sentences]
            # Keep only the first parse of every sentence.
            raw_trees = [
                list(parses)[0]
                for parses in parser.parse_sents(sentence_tokens, verbose=True)
            ]
            squad_tree = TreePassage(raw_trees)

            for qa in qas:
                question_tokens = []
                for question_sentence in sent_tokenize(qa['question']):
                    question_tokens += tokenizer.tokenize(question_sentence)
                batches.append(
                    Batch([{
                        'apid': 'apid',
                        'qa_id': qa['id'],
                        'context_squad_tree': squad_tree,
                        'question_tokens': question_tokens,
                        'answers': [],
                        'is_impossible': 0
                    }], False))
                qaid_map[qa['id']] = paragraph['context']

    span_extractor.eval()
    answer_verifier.eval()
    sigmoid = Sigmoid()  # stateless, so one instance serves every batch

    for batch in batches:
        qa_id = batch.qa_id[0]
        node_scores, _expected_f1s, global_answer_score = span_extractor(
            batch, eval_system=True)
        score_confidence, predicted_node_idxs = node_scores.max(dim=1)
        score_confidence = variable_to_numpy(score_confidence, False)
        predicted_node_idxs = variable_to_numpy(predicted_node_idxs, False)

        # Answer score = predicted has answer probability
        answer_score = answer_verifier(batch,
                                       predicted_node_idxs=predicted_node_idxs,
                                       eval_system=True)
        answer_proba = variable_to_numpy(sigmoid(answer_score), False)
        global_answer_proba = variable_to_numpy(sigmoid(global_answer_score), False)

        # Fixed-weight blend of the three confidence signals.
        has_answer_proba = (0.3 * score_confidence + 0.4 * global_answer_proba +
                            0.3 * answer_proba)[0]
        predicted_span = batch.trees[0].span(predicted_node_idxs[0])
        predicted_has_answer = has_answer_proba >= HAS_ANSWER_THRESHOLD
        predicted_text = tokens_to_text(predicted_span, qaid_map[qa_id])
        official_eval[qa_id] = predicted_text if predicted_has_answer else ''

    # The context manager guarantees the predictions are flushed and closed.
    with open(outpath, 'w') as out_file:
        json.dump(official_eval, out_file)
def __init__(self):
    """Create a 345 -> 600 -> 30 network with a sigmoid after the first layer."""
    super(Net, self).__init__()
    hidden_width = 600
    self.hidden_feature = hidden_width
    self.linear1 = Linear(345, hidden_width)
    self.sigmoid1 = Sigmoid()
    self.linear2 = Linear(hidden_width, 30)
def torch_fn():
    """Return a newly constructed torch ``Sigmoid`` layer."""
    sigmoid_layer = Sigmoid()
    return sigmoid_layer
def _build_arg_parser():
    """Create the command-line parser for the training / evaluation script."""
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default="bert-base-uncased", type=str, required=False,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--bert_model_path", default="", type=str, required=False,
                        help="Bert pretrained saved pytorch model path.")
    parser.add_argument("--reformer_model_path", default=None, type=str, required=False,
                        help="Bert pretrained saved pytorch model path.")
    parser.add_argument("--experiment", default="attention", type=str, required=False,
                        help="4 types: attention, base, long, ablation. "
                             "base: original bert. "
                             "long: uses an lstm to keep track of all bert hidden representations, but backprop over the first. "
                             "attention: uses an lstm + attention mechanism to backprop over more than the first representation. "
                             "ablation: concat all the hidden representations")
    parser.add_argument("--model_name_or_path", default="bert-base-uncased", type=str, required=True)
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--reformer_hashes", default=4, type=int,
                        help="Reformer hash buckets")

    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--max_tokens", default=16384, type=int,
                        help="The total tokens for ease of processing")
    parser.add_argument("--token_shift", default=200, type=int, help="")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_pos_encoding", action='store_true',
                        help="train a model with positional coding.")
    parser.add_argument("--do_min_att", action='store_true',
                        help="ensure attention has a minimal alpha.")
    parser.add_argument("--do_truncate", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=16, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=2e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    # The four help texts below were copy-pasted from other options in the
    # original; they now describe how each value is actually used in main().
    parser.add_argument("--max_epochs", default=20, type=float,
                        help="Maximum number of training epochs to perform.")
    parser.add_argument("--warmup_epochs", default=1.0, type=float,
                        help="Number of epochs to perform linear learning rate warmup for.")
    parser.add_argument("--patience", default=3.0, type=float,
                        help="Number of epochs without validation loss improvement before training stops.")
    parser.add_argument("--val_split", default=0.05, type=float,
                        help="Proportion of the training set held out for validation. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    return parser


def main():
    """Train and/or evaluate a sequence classifier from command-line options.

    Steps, in order: parse arguments, pick the device, seed the RNGs, build
    the model (optionally warm-started from ``--bert_model_path``), run the
    training loop with early stopping on validation loss when ``--do_train``
    is given, then evaluate on the test split when ``--do_eval`` is given and
    write ``eval_results.txt``, ``val_loss.txt`` and ``results.csv`` into
    ``--output_dir``.

    Raises:
        ValueError: invalid ``gradient_accumulation_steps``, neither
            ``--do_train`` nor ``--do_eval`` given, a non-empty output
            directory without ``--overwrite_output_dir``, or an unknown task.
        ImportError: ``--fp16`` requested but apex is not installed.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()
    # Second, untouched namespace. `args` gets a `device` attribute below,
    # while this copy is what is dumped as JSON at the end of training
    # (presumably because the device object is not JSON-serialisable).
    save_args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    args.device = device

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # The per-step batch shrinks so the effective batch size stays as requested.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    cls_token = tokenizer.convert_tokens_to_ids(["[CLS]"])
    sep_token = tokenizer.convert_tokens_to_ids(["[SEP]"])
    model = get_model(args, num_labels, len(tokenizer.vocab), cls_token, sep_token, args.token_shift)

    if args.bert_model_path != "":
        print("Loading model from: " + args.bert_model_path)
        if args.do_train:
            pretrained_dict = torch.load(os.path.join(args.bert_model_path, "pytorch_model.bin"))
            model_dict = model.state_dict()
            # 1. filter out unnecessary keys
            pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
            # The first classifier head is always re-initialised when training.
            if 'classifier1.weight' in pretrained_dict:
                del pretrained_dict['classifier1.weight']
                del pretrained_dict['classifier1.bias']
            # 2. overwrite entries in the existing state dict
            model_dict.update(pretrained_dict)
            # 3. load the new state dict
            model.load_state_dict(model_dict)
        else:
            model.load_state_dict(torch.load(args.bert_model_path))

    if args.local_rank == 0:
        torch.distributed.barrier()

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    tr_loss = 0
    # Defined up front because the evaluation section writes it to
    # val_loss.txt even when --do_train was not given (previously a NameError).
    val_losses = []

    if args.do_train:
        if args.local_rank in [-1, 0]:
            tb_writer = SummaryWriter()

        UC = "" if args.do_lower_case else "UC"
        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}{3}'.format(
            list(filter(None, args.bert_model.split('/'))).pop(),
            str(task_name),
            str(args.max_tokens), UC))

        # Prepare data loader
        logger.info("Loading training dataset")
        train_data = load_dataset(cached_train_features_file, args, processor, tokenizer, output_mode,
                                  data_type="train")
        if args.task_name == "arxiv":
            logger.info("Loading validation dataset")
            # NOTE(review): this uses the same 'train_' cache file name as the
            # training set above -- confirm load_dataset keeps the two apart.
            cached_val_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}{3}'.format(
                list(filter(None, args.bert_model.split('/'))).pop(),
                str(task_name),
                str(args.max_tokens), UC))
            val_data = load_dataset(cached_val_features_file, args, processor, tokenizer, output_mode,
                                    data_type="val")
        else:
            logger.info("Spliting train dataset into validation dataset")
            train_data1, train_data2, train_data3 = train_data.tensors
            # Shuffle all three tensors with the same permutation, then cut
            # off the first `val_split` share as the validation set.
            rand = torch.randperm(train_data1.shape[0])
            train_data1 = train_data1[rand]
            train_data2 = train_data2[rand]
            train_data3 = train_data3[rand]

            val_size = int(train_data1.shape[0] * args.val_split)
            val_data1 = train_data1[:val_size]
            val_data2 = train_data2[:val_size]
            val_data3 = train_data3[:val_size]
            train_data1 = train_data1[val_size:]
            train_data2 = train_data2[val_size:]
            train_data3 = train_data3[val_size:]

            train_data = TensorDataset(train_data1, train_data2, train_data3)
            val_data = TensorDataset(val_data1, val_data2, val_data3)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
            val_sampler = RandomSampler(val_data)
        else:
            train_sampler = DistributedSampler(train_data)
            val_sampler = DistributedSampler(val_data)

        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
        val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=args.eval_batch_size)

        num_train_optimization_steps = (len(train_dataloader)) // args.gradient_accumulation_steps * args.max_epochs

        # Prepare optimizer: no weight decay on biases and LayerNorm parameters.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError as err:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                ) from err

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(warmup=(args.warmup_epochs / args.max_epochs),
                                                 t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=(args.warmup_epochs / args.max_epochs),
                                 t_total=num_train_optimization_steps)

        logger.info("***** Running training *****")
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

        best_val_loss = 90999990.0
        patience = 0  # consecutive epochs without validation improvement
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        e_iter = trange(int(args.max_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
        for i, _ in enumerate(e_iter):
            torch.cuda.empty_cache()
            model.train()
            tr_loss = 0
            t_iter = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
            for step, t_batch in enumerate(t_iter):
                input_ids, input_mask, label_ids = get_batch(args, t_batch, device, cls_token)

                outputs = model(input_ids, input_mask, labels=label_ids)
                loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

                if n_gpu > 1:
                    loss = loss.mean()
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if args.local_rank in [-1, 0]:
                        # loading gpus takes a while the first iteration, get a better estimate this way
                        if i == 0 and step == 0:
                            t_iter.start_t = time.time()
                            e_iter.start_t = time.time()
                        acc = np.sum(np.argmax(outputs[1].cpu().detach().numpy(), axis=1) == label_ids.cpu().numpy()) / label_ids.shape[0]
                        t_iter.set_description("loss{0:.3f},acc{1:.3f}".format(loss, acc))
                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                        tb_writer.add_scalar('loss', loss.item(), global_step)
                        tb_writer.add_scalar('acc', acc, global_step)

            # Validation pass at the end of every epoch.
            torch.cuda.empty_cache()
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for v_batch in tqdm(val_dataloader, desc="valuating"):
                    input_ids, input_mask, label_ids = get_batch(args, v_batch, device, cls_token)
                    outputs = model(input_ids, input_mask, labels=label_ids)
                    loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

                    if n_gpu > 1:
                        loss = loss.mean()
                    val_loss += loss.item()
            val_losses.append(val_loss)

            if val_loss < best_val_loss and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
                best_val_loss = val_loss
                patience = 0
                # NOTE(review): no checkpoint is written here or anywhere else
                # in this function; the reload further down expects
                # `output_model_file` to exist already -- confirm whether a
                # torch.save of the best model is missing.
                print("best epoch {} loss {}".format(i, best_val_loss))
            else:
                patience += 1
                if patience >= args.patience:
                    break

    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save the configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        if "reformer" not in args.experiment:
            model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Reload the vocabulary that was just written
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)

        # Good practice: save your training arguments together with the trained model
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)
        with open(os.path.join(args.output_dir, 'commandline_args.txt'), 'w') as f:
            json.dump(save_args.__dict__, f, indent=2)
    else:
        # Evaluation-only run (or a non-zero rank): rebuild the model from
        # the weights stored in the output directory and wrap it the same
        # way as the training model.
        model = get_model(args, num_labels, len(tokenizer.vocab), cls_token, sep_token, args.token_shift)
        model.load_state_dict(torch.load(output_model_file))
        model.to(device)
        if args.local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(model,
                                                              device_ids=[args.local_rank],
                                                              output_device=args.local_rank,
                                                              find_unused_parameters=True)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

    ### Evaluation
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        UC = "" if args.do_lower_case else "UC"
        cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}{3}'.format(
            list(filter(None, args.bert_model.split('/'))).pop(),
            str(task_name),
            str(args.max_tokens), UC))

        logger.info("Loading test dataset")
        eval_data = load_dataset(cached_eval_features_file, args, processor, tokenizer, output_mode,
                                 data_type="test")

        logger.info("***** Running evaluation *****")
        logger.info(" Batch size = %d", args.eval_batch_size)

        # Run prediction for full data
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)  # Note that this sampler samples randomly
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        out_label_ids = None
        model.token_shift = args.token_shift
        torch.cuda.empty_cache()
        for t_batch in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids, input_mask, label_ids = get_batch(args, t_batch, device, cls_token)

            with torch.no_grad():
                outputs = model(input_ids, input_mask, labels=label_ids)
                tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            # Accumulate logits and labels of all batches in preds[0] / out_label_ids.
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
                out_label_ids = label_ids.detach().cpu().numpy()
            else:
                preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        elif output_mode == "multi_classification":
            preds = preds > .5
        result = compute_metrics(task_name, preds, out_label_ids)

        # Training loss is only reported when training ran and at least one
        # optimizer step was taken (avoids a division by zero).
        loss = tr_loss / global_step if args.do_train and global_step else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        with open(os.path.join(args.output_dir, "eval_results.txt"), "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # One validation loss per line; empty when no training took place.
        with open(os.path.join(args.output_dir, 'val_loss.txt'), 'w') as f:
            for item in val_losses:
                f.write("%s\n" % item)

        acc = result['acc']
        with open(os.path.join(args.output_dir, "results.csv"), "w") as writer:
            writer.write(f"{args.task_name}, {args.experiment}, {args.model_name_or_path[13:]},{args.learning_rate},{args.reformer_hashes},{acc}\n")
# Inference script fragment: load the pickled test dataset, restore the
# fine-tuned BertLinear weights and iterate over the test batches without
# gradient tracking. Only the start of the batch loop is visible here.
# NOTE(review): pickle.load executes arbitrary code from the file -- only use
# it on trusted dataset files.
with open(testDataName, 'rb') as f:
    A = pickle.load(f)
# The dataset object supplies its own collate function.
loader = Data.DataLoader(dataset=A, batch_size=BATCHSIZE, collate_fn=A.collate_fn)
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# Build the model from the pretrained Chinese BERT, then overwrite its
# weights with the checkpoint stored at `modelName`.
model = BertLinear.from_pretrained('bert-base-chinese')
#modelName = config["checkpoint"] + 'BertLinear1000.pt'
model.load_state_dict(torch.load(modelName))
model = model.to(device)
model.eval()
# `f` is rebound here: from the (already closed) dataset file to a sigmoid module.
f = Sigmoid()
to_Write = {}
with torch.no_grad():
    with open(predictName, 'w') as f_predict:
        for batch in tqdm(loader):
            questionId = batch['id']
            #print(questionId)
            # Each batch field is converted to a tensor and moved to `device`.
            X = batch['input_ids']
            X = torch.tensor(X).to(device)
            print('X', X)  # debug output, printed for every batch
            token_type_ids = batch['token_type_ids']
            token_type_ids = torch.tensor(token_type_ids).to(device)
            attention_mask = batch['attention_mask']
            attention_mask = torch.tensor(attention_mask).to(device)
def train(self, verbose=True):
    """Train ``self.model`` as a multi-label classifier and save its weights.

    Uses the first ``self.num_train`` rows of ``self.features`` /
    ``self.labels``, Adam (lr 3e-4, weight decay 1e-3) and
    ``BCEWithLogitsLoss``. Every ``CHECKPOINT_AFTER`` batches the loss and
    exact-match accuracy on a random sample are reported; every
    ``SAVEPOINT_AFTER`` batches, and once more at the end, the state dict is
    written to ``self.model_fn``.

    Args:
        verbose: when true, print checkpoint, save and per-epoch messages.
    """
    # grab training params
    params = self.training_params
    BATCH_SIZE = params['BATCH_SIZE']
    TRAINING_ITERATIONS = params['TRAINING_ITERATIONS']
    CHECKPOINT_AFTER = params['CHECKPOINT_AFTER']
    SAVEPOINT_AFTER = params['SAVEPOINT_AFTER']
    TEST_BATCH_SIZE = params['TEST_BATCH_SIZE']

    model = self.model

    X = self.features[:self.num_train]
    Y = self.labels[:self.num_train]
    num_samples = X.shape[0]

    # See: https://discuss.pytorch.org/t/multi-label-classification-in-pytorch/905/45
    training_loss = torch.nn.BCEWithLogitsLoss()
    opt = optim.Adam(model.parameters(), lr=3e-4, weight_decay=0.001)
    sigmoid = Sigmoid()

    itr = 1  # global batch counter across all epochs
    for epoch in range(TRAINING_ITERATIONS):  # loop over the dataset multiple times
        t0 = time.time()
        running_loss = 0.0

        # Visit every sample once per epoch in random order. The original
        # built the index list with an upper bound of num_samples - 1, which
        # silently left the last sample out of training.
        rand_idx = list(np.arange(num_samples))
        random.shuffle(rand_idx)
        indices = [
            rand_idx[start:start + BATCH_SIZE]
            for start in range(0, num_samples, BATCH_SIZE)
        ]

        for idx in indices:
            # zero the parameter gradients
            opt.zero_grad()

            inputs = torch.from_numpy(X[idx, :]).float().to(device=self.device)
            y_true = torch.from_numpy(Y[idx, :]).float().to(device=self.device)

            # forward + backward + optimize
            outputs = model(inputs)
            loss = training_loss(outputs, y_true).float().to(device=self.device)
            loss.backward()
            opt.step()

            running_loss += loss.item()

            if itr % CHECKPOINT_AFTER == 0:
                # Report on a fresh random sample drawn from the training data.
                eval_idx = list(np.arange(num_samples))
                random.shuffle(eval_idx)
                test_inds = eval_idx[:TEST_BATCH_SIZE]

                # Reporting only: no gradients are needed here.
                with torch.no_grad():
                    inputs = torch.from_numpy(X[test_inds, :]).float().to(device=self.device)
                    y_out = torch.from_numpy(Y[test_inds]).float().to(device=self.device)

                    outputs = model(inputs)
                    loss = training_loss(outputs, y_out).float().to(device=self.device)
                    predictions = sigmoid(outputs).round()

                # A sample counts as correct only when every label matches.
                # Pairing rows instead of indexing range(TEST_BATCH_SIZE)
                # keeps this valid when fewer samples are available.
                accuracy = np.mean([
                    float(torch.equal(predicted, expected))
                    for predicted, expected in zip(predictions, y_out)
                ])
                if verbose:
                    print("loss: " + str(loss.item()) + " , acc: " + str(accuracy))

            if itr % SAVEPOINT_AFTER == 0:
                torch.save(model.state_dict(), self.model_fn)
                if verbose:
                    print('Saved model at {}'.format(self.model_fn))

            itr += 1

        if verbose:
            print('Done with epoch {} in {}s'.format(epoch, time.time() - t0))

    torch.save(model.state_dict(), self.model_fn)
    print('Saved model at {}'.format(self.model_fn))
    print('Done training')
def __init__(self, num_channels=32, feat_channels=[64, 128, 256, 512, 1024], residual='conv'):
    """Create the layers of the 3-D U-Net.

    Args:
        num_channels: channel count of the input and of the final 1x1x1
            convolution's output.
        feat_channels: widths of the five encoder levels; only read, never
            mutated.
        residual: forwarded to every ``Conv3D_Block``. Original note: 'conv'
            passes the residual input through a 1*1 conv at every layer,
            None removes the residuals.
    """
    super(UNet3D, self).__init__()
    widths = feat_channels

    def conv_block(in_ch, out_ch):
        # Every convolution block shares the same residual setting.
        return Conv3D_Block(in_ch, out_ch, residual=residual)

    # Encoder downsamplers: the kernel is 1 along the first pooled axis and
    # 2 along the other two.
    pool_kernel = (1, 2, 2)
    self.pool1 = MaxPool3d(pool_kernel)
    self.pool2 = MaxPool3d(pool_kernel)
    self.pool3 = MaxPool3d(pool_kernel)
    self.pool4 = MaxPool3d(pool_kernel)

    # Encoder convolutions
    self.conv_blk1 = conv_block(num_channels, widths[0])
    self.conv_blk2 = conv_block(widths[0], widths[1])
    self.conv_blk3 = conv_block(widths[1], widths[2])
    self.conv_blk4 = conv_block(widths[2], widths[3])
    self.conv_blk5 = conv_block(widths[3], widths[4])

    # Decoder convolutions: inputs are twice the level width (presumably the
    # upsampled features concatenated with the skip connection).
    self.dec_conv_blk4 = conv_block(2 * widths[3], widths[3])
    self.dec_conv_blk3 = conv_block(2 * widths[2], widths[2])
    self.dec_conv_blk2 = conv_block(2 * widths[1], widths[1])
    self.dec_conv_blk1 = conv_block(2 * widths[0], widths[0])

    # Decoder upsamplers
    self.deconv_blk4 = Deconv3D_Block(widths[4], widths[3])
    self.deconv_blk3 = Deconv3D_Block(widths[3], widths[2])
    self.deconv_blk2 = Deconv3D_Block(widths[2], widths[1])
    self.deconv_blk1 = Deconv3D_Block(widths[1], widths[0])

    # Final 1*1 Conv Segmentation map
    self.one_conv = Conv3d(widths[0], num_channels, kernel_size=1, stride=1, padding=0, bias=True)

    # Activation function
    self.sigmoid = Sigmoid()
def __init__(self, embedding_dim, bottleneck_dim, input_channels, output_channels, kernel,
             dropout=0.0, activation='identity', dilation=1, groups=1, batch_norm=True):
    """Initialise the parent block with doubled output channels and add a sigmoid gate.

    All arguments are passed positionally to the parent constructor in the
    order they are received; only ``output_channels`` is altered (multiplied
    by two) on the way.
    """
    doubled_channels = 2 * output_channels
    super(HighwayConvBlockGenerated, self).__init__(
        embedding_dim,
        bottleneck_dim,
        input_channels,
        doubled_channels,
        kernel,
        dropout,
        activation,
        dilation,
        groups,
        batch_norm,
    )
    # gate activation stored next to whatever the parent constructor created
    self._gate = Sigmoid()
def __init__(self):
    """Create the sigmoid module and a binary cross-entropy criterion that sums its terms."""
    super().__init__()
    squash = Sigmoid()
    summed_bce = BCELoss(reduction='sum')
    # assigned in the original order so the sub-modules are registered identically
    self.sig = squash
    self.loss = summed_bce
def __init__(self):
    """Create a ``Linear(2, 1)`` layer and a ``Sigmoid`` module."""
    super(LinearClassifier, self).__init__()
    n_inputs, n_outputs = 2, 1
    self.fully_connected = Linear(n_inputs, n_outputs)
    self.sigmoid = Sigmoid()
#! /usr/bin/env python3 from collections import OrderedDict from context import archetypes from context import utensils from utensils.datasets import MnistDataset from archetypes.autoencoder import Autoencoder import torch from torch.nn import Linear, Sigmoid, LeakyReLU sigmoid = Sigmoid() leaky_relu = LeakyReLU() mse = torch.nn.MSELoss(reduction='sum') bs = 256 shuf = True nepochs = 1 data_home = "/home/jamc/Data/MNIST_data" image_fn = f"{data_home}/train-images-idx3-ubyte.gz" label_fn = f"{data_home}/train-labels-idx1-ubyte.gz" dataset = MnistDataset(image_fn, label_fn, shape=(-1, )) training = torch.utils.data.DataLoader(dataset, batch_size=bs, shuffle=shuf) encoder = OrderedDict( (('Hidden_Layer_1', Linear(dataset.images.shape[-1],
def beta(self, state_index):
    """Return beta(state): the sigmoid of the state variable multiplied with ``self.upsilon``.

    Input : index in [0, n_states - 1]
    Return : beta(state), variable of shape (1)
    """
    features = self.varFromStateIndex(state_index)
    logit = torch.matmul(features, self.upsilon)
    squash = Sigmoid()
    return squash(logit)
def evaluate(model: PreTrainedModel, dataloader: DataLoader, device: str) -> (int, List[int], List[int]):
    """
    Evaluates a Bert Model on a labelled data set.

    The model is dispatched on its exact type (subclasses are not matched):
    the two Bert classes receive ``token_type_ids``, the DistilBert class does
    not. Single-label models are decoded with an argmax over axis 1,
    the multi-label model with a sigmoid thresholded at 0.5.

    Args:
        model: the BertModel to be evaluated
        dataloader: the DataLoader with the test data
        device: the device where evaluation will take place ("cpu" or "cuda")

    Returns:
        a tuple with (the evaluation loss averaged over the batches, a numpy
        array with the correct labels, and a numpy array with the predicted labels)

    Raises:
        TypeError: if ``type(model)`` is not one of the three supported classes.
    """
    model_type = type(model)
    takes_segment_ids = model_type in (BertForSequenceClassification,
                                       BertForMultiLabelSequenceClassification)
    is_multi_label = model_type == BertForMultiLabelSequenceClassification
    if not takes_segment_ids and model_type != DistilBertForSequenceClassification:
        # Previously this case fell through both if/elif chains and surfaced
        # later as an UnboundLocalError on the loss variable.
        raise TypeError("evaluate() does not support models of type "
                        + model_type.__name__)

    model.eval()
    sig = Sigmoid()  # stateless, so one instance serves every batch

    eval_loss = 0
    nb_eval_steps = 0
    predicted_labels, correct_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluation iteration"):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        with torch.no_grad():
            if takes_segment_ids:
                tmp_eval_loss, logits = model(input_ids, attention_mask=input_mask,
                                              token_type_ids=segment_ids, labels=label_ids)
            else:
                tmp_eval_loss, logits = model(input_ids, attention_mask=input_mask,
                                              labels=label_ids)

        if is_multi_label:
            outputs = sig(logits).to('cpu').numpy() >= 0.5
        else:
            # convert to numpy before the argmax so the list holds numpy
            # scalars instead of relying on numpy dispatching on a tensor
            outputs = np.argmax(logits.to('cpu').numpy(), axis=1)

        predicted_labels += list(outputs)
        correct_labels += list(label_ids.to('cpu').numpy())

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps

    correct_labels = np.array(correct_labels)
    predicted_labels = np.array(predicted_labels)

    return eval_loss, correct_labels, predicted_labels
def define_model(A, X):
    """Return a ``Sequential`` of two tanh ``Layer`` stages, a ``Linear(2, 1, ' ')`` and a sigmoid.

    ``A`` is handed to both ``Layer`` stages; the first stage's input width
    is read from ``X.shape[1]``.
    """
    n_features = X.shape[1]
    stages = [
        Layer(A, n_features, 4, 'tanh'),
        Layer(A, 4, 2, 'tanh'),
        Linear(2, 1, ' '),
        Sigmoid(),
    ]
    return Sequential(*stages)
def __init__(self, inplace=True):
    """Create the sigmoid used by this activation.

    ``inplace`` is part of the signature but is not read by this constructor.
    """
    super(h_swish, self).__init__()
    gate = Sigmoid()
    self.sigmoid = gate
def __init__(self):
    """Create the sigmoid and the mean-absolute-error criterion.

    The criterion is ``L1Loss`` so that it matches both the class name
    (MAE) and the attribute name (``__l1_loss__``); the previous code
    instantiated ``MSELoss`` here.
    """
    super(SigmoidMAELoss, self).__init__()
    # function-scope import, as before, so the block does not depend on
    # what the module header happens to import
    from torch.nn import L1Loss, Sigmoid
    self.__sigmoid__ = Sigmoid()
    self.__l1_loss__ = L1Loss()
def forward(self, x, m):
    """Multiply the ``main_net`` output by ``m`` and return ``(None, out)``.

    For ``BaseClassifier`` instances the product is passed through a
    sigmoid; the result is then reshaped to a single column with
    ``view(-1, 1)``. The first element of the returned tuple is always None.
    """
    x = m * self.main_net(x)
    if isinstance(self, BaseClassifier):
        out = Sigmoid()(x)
    # NOTE(review): `out` is only bound in the branch above, so any
    # instance that is not a BaseClassifier would fail on the next line --
    # confirm that no other subclass reaches this method.
    out = out.view(-1, 1)
    return None, out
def eval_on_model(args):
    """Run a checkpointed model on one audio file and print how often each tag fires.

    The file is looked up in the annotation CSV, cut into segments, and all
    segments are pushed through the model as a single batch. A tag counts
    once for every segment whose sigmoid score exceeds
    ``args.eval_threshold``; tags with a non-zero count are printed in
    descending order.

    Args:
        args: namespace; the fields read here are ``device``, ``p_out``,
            ``tensorboard_exp_name``, ``annotation_file``, ``audio_file``,
            ``eval_threshold``, ``loss``, ``checkpoint`` and the
            ``ckpt_*`` options forwarded to ``load_ckpt``.

    Raises:
        NotImplementedError: if ``args.device`` is ``'cpu'``.
        AssertionError: if the audio file has no row in the annotation
            file, or if no checkpoint is given.
    """
    device = args.device
    if device == 'cpu':
        raise NotImplementedError("CPU training is not implemented.")
    device = torch.device(args.device)
    torch.cuda.set_device(device)

    # build model
    model = build_model(args)
    model.to(device)

    # output dir
    p_out = Path(args.p_out).joinpath(f"{model.name}-{args.tensorboard_exp_name}")
    if not p_out.exists():
        p_out.mkdir(exist_ok=True, parents=True)

    # dataset & loader: match on the last two path components of the audio file
    annotation = pd.read_csv(args.annotation_file)
    query = annotation[annotation.mp3_path.str.match('/'.join(args.audio_file.split('/')[-2:]))]
    assert query.shape[0] != 0, f"Cannot find the audio file: {args.audio_file}"

    # split audio info and segment audio
    threshold = args.eval_threshold
    song_info = query[query.columns.values[50:]]  # columns after the first 50
    tags = query.columns.values[:50]              # first 50 columns are the tag columns
    labels = query[tags].values[0]
    label_names = tags[labels.astype(bool)]
    segments = _segment_audio(_load_audio(args.audio_file, sample_rate=22050), n_samples=59049)
    n_segments = len(segments)
    LOG.info(f"Song info: {song_info}")
    LOG.info(f"Number of segments: {n_segments}")
    LOG.info(f"Ground truth tags: {label_names}")
    LOG.info(f"Positive tag threshold: {threshold}")

    # create loss
    loss_fn = get_loss(args.loss)

    # load checkpoint; evaluation without one is refused
    if args.checkpoint is None:
        raise AssertionError("Pre-trained checkpoint must be provided.")
    state_dict = load_ckpt(args.checkpoint,
                           reset_epoch=args.ckpt_epoch,
                           no_scheduler=args.ckpt_no_scheduler,
                           no_optimizer=args.ckpt_no_optimizer,
                           no_loss_fn=args.ckpt_no_loss_fn,
                           map_values=args.ckpt_map_values)
    model_dict = {'model': model} if 'model' in state_dict else None
    apply_state_dict(state_dict, model=model_dict)
    best_val_loss = state_dict['val_loss']
    epoch = state_dict['epoch']
    global_i = state_dict['global_i']
    LOG.info(f"Checkpoint loaded. Epoch trained {epoch}, global_i {global_i}, best val {best_val_loss:.6f}")

    # start testing
    model.eval()
    sigmoid = Sigmoid().to(device)
    t_start = time.time()

    # concatenate segments into one batch with a singleton middle axis
    segments = torch.from_numpy(
        np.concatenate([seg.reshape(1, 1, -1) for seg in segments])
    ).to(torch.float32).cuda(device=device)
    # One copy of the label row per segment so the loss sees matching batch
    # sizes (this used to be a hard-coded 10).
    targets = torch.from_numpy(
        np.concatenate([labels.reshape(1, -1)] * n_segments)
    ).to(torch.float32).cuda(device=device)

    # forward pass
    with torch.no_grad():
        logits = model(segments)
        out = sigmoid(logits)
        loss = loss_fn(logits, targets)

    # binarise the scores, then count positives per tag across segments
    out = out.cpu().numpy()
    out[out > threshold] = 1
    out[out <= threshold] = 0
    out = np.sum(out, axis=0)

    res = pd.DataFrame(data={'tags': tags, 'freq': out})
    res = res[res.freq != 0].sort_values(by='freq', ascending=False)
    CONSOLE.print(res)

    LOG.info(f"Testing speed: {time.time() - t_start:.4f}s, "
             f"loss: {loss.item()}, ")
    return
def __init__(self, name, in_size, device):
    """Build the layers, move the module to ``device`` and create its SGD optimizer.

    Four two-convolution stages (32, 64, 96 and 128 output channels) are
    created, with a 2x2 max-pool after each of the first three. Every stage
    also gets a transposed convolution with a single output channel, and
    ``convScore`` is a 1x1 convolution over 4 channels followed by a sigmoid.

    Args:
        name: stored as ``self.name``.
        in_size: stored as ``self.in_size``; asserted to be a multiple of
            16 and used to size the transposed-convolution kernels.
        device: stored as ``self.device`` and used as the target of ``to``.
    """
    super(FCN, self).__init__()
    assert (in_size % 16 == 0)
    self.name = name
    self.in_size = in_size
    self.device = device

    def double_conv(n_in, n_out, lead_kernel, lead_stride, lead_padding):
        # conv -> batch-norm -> ReLU, twice; only the first conv varies between stages
        return Sequential(
            Conv2d(in_channels=n_in, kernel_size=lead_kernel, out_channels=n_out,
                   stride=lead_stride, padding=lead_padding),
            BatchNorm2d(num_features=n_out, momentum=0.1),
            ReLU(inplace=True),
            Conv2d(in_channels=n_out, kernel_size=3, out_channels=n_out, stride=1, padding=1),
            BatchNorm2d(num_features=n_out, momentum=0.1),
            ReLU(inplace=True)
        )

    def single_channel_upsampler(n_in, kernel):
        # transposed convolution producing one output channel
        return ConvTranspose2d(in_channels=n_in, kernel_size=kernel, out_channels=1,
                               stride=1, padding=0)

    def halving_pool():
        return MaxPool2d(kernel_size=2, stride=2)

    # Layers are created and assigned in the original order, which keeps
    # module registration and parameter order unchanged.
    self.convBlock1 = double_conv(3, 32, 5, 2, 2)
    self.upsampling1 = single_channel_upsampler(32, int(in_size / 2) + 1)
    self.pool1 = halving_pool()

    self.convBlock2 = double_conv(32, 64, 3, 1, 1)
    self.upsampling2 = single_channel_upsampler(64, 3 * int(in_size / 4) + 1)
    self.pool2 = halving_pool()

    self.convBlock3 = double_conv(64, 96, 3, 1, 1)
    self.upsampling3 = single_channel_upsampler(96, 7 * int(in_size / 8) + 1)
    self.pool3 = halving_pool()

    self.convBlock4 = double_conv(96, 128, 3, 1, 1)
    self.upsampling4 = single_channel_upsampler(128, 15 * int(in_size / 16) + 1)

    self.convScore = Sequential(
        Conv2d(in_channels=4, kernel_size=1, out_channels=1, stride=1, padding=0),
        Sigmoid()
    )

    # Module.to returns the module itself, so no rebinding of self is needed
    self.to(device)
    self.optimizer = SGD(self.parameters(), lr=LR_SGD, momentum=MOMENTUM_SGD,
                         nesterov=True, weight_decay=WD_SGD)
def test_on_model(args):
    """Evaluate a checkpointed model on the test split and log loss and AUC/AP scores.

    Batches are pulled from a ``DataPrefetcher`` until it returns no
    samples. The running mean loss is written to a summary writer after
    every batch, and the ``AUCMetric`` scores are logged once at the end.

    Raises ``NotImplementedError`` for ``args.device == 'cpu'`` and
    ``AssertionError`` when ``args.checkpoint`` is None.
    """
    requested_device = args.device
    if requested_device == 'cpu':
        raise NotImplementedError("CPU training is not implemented.")
    device = torch.device(args.device)
    torch.cuda.set_device(device)

    # model under test
    model = build_model(args)
    model.to(device)

    # make sure the output directory exists
    run_name = f"{model.name}-{args.tensorboard_exp_name}"
    p_out = Path(args.p_out).joinpath(run_name)
    if not p_out.exists():
        p_out.mkdir(exist_ok=True, parents=True)

    # test split; the last partial batch is kept
    test_dataset = MTTDataset(path=args.p_data, split='test')
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.n_workers,
                             pin_memory=True,
                             drop_last=False)
    test_steps = test_dataset.calc_steps(args.batch_size, drop_last=False)
    LOG.info(f"Total testing steps: {test_steps}")
    LOG.info(f"Testing data size: {len(test_dataset)}")

    loss_fn = get_loss(args.loss)
    metric = AUCMetric()

    # a checkpoint is mandatory for testing
    if args.checkpoint is None:
        raise AssertionError("Pre-trained checkpoint must be provided.")
    state_dict = load_ckpt(args.checkpoint,
                           reset_epoch=args.ckpt_epoch,
                           no_scheduler=args.ckpt_no_scheduler,
                           no_optimizer=args.ckpt_no_optimizer,
                           no_loss_fn=args.ckpt_no_loss_fn,
                           map_values=args.ckpt_map_values)
    model_dict = {'model': model} if 'model' in state_dict else None
    apply_state_dict(state_dict, model=model_dict)
    best_val_loss = state_dict['val_loss']
    epoch = state_dict['epoch']
    global_i = state_dict['global_i']
    LOG.info(f"Checkpoint loaded. Epoch trained {epoch}, global_i {global_i}, best val {best_val_loss:.6f}")

    writer = SummaryWriter(log_dir=p_out.as_posix(), filename_suffix='-test')

    model.eval()
    sigmoid = Sigmoid().to(device)
    status_col = TextColumn("")
    running_loss = 0

    # normalisation statistics are only passed on when requested
    norm_mean, norm_std = (MTT_MEAN, MTT_STD) if args.data_normalization else (None, None)
    fetcher = DataPrefetcher(test_loader, mean=norm_mean, std=norm_std, device=device)
    samples, targets = fetcher.next()

    with Progress("[progress.description]{task.description}",
                  "[{task.completed}/{task.total}]",
                  BarColumn(),
                  "[progress.percentage]{task.percentage:>3.0f}%",
                  TimeRemainingColumn(),
                  TextColumn("/"),
                  TimeElapsedColumn(),
                  status_col,
                  expand=False,
                  console=CONSOLE,
                  refresh_per_second=5) as progress:
        task = progress.add_task(description='[Test]', total=test_steps)
        n_done = 0  # batches processed so far
        t_start = time.time()

        with torch.no_grad():
            while samples is not None:
                logits = model(samples)
                out = sigmoid(logits)
                test_loss = loss_fn(logits, targets)

                # running mean of the loss, logged per batch
                running_loss += test_loss.item()
                n_done += 1
                writer.add_scalar('Test/Loss', running_loss / n_done, n_done)

                # the metric receives targets and sigmoid scores as numpy arrays
                metric.step(targets.cpu().numpy(), out.cpu().numpy())

                samples, targets = fetcher.next()

                if not progress.finished:
                    status_col.text_format = f"Test loss: {running_loss/n_done:.06f}"
                    progress.update(task, advance=1)

    auc_tag, auc_sample, ap_tag, ap_sample = metric.auc_ap_score
    LOG.info(f"Testing speed: {(time.time() - t_start)/n_done:.4f}s/it, "
             f"auc_tag: {auc_tag:.04f}, "
             f"auc_sample: {auc_sample:.04f}, "
             f"ap_tag: {ap_tag:.04f}, "
             f"ap_sample: {ap_sample:.04f}")
    writer.close()
def forward(self, input, target):
    """Return the mean per-sample loss over the head and the tail clusters.

    The label range is split at ``self.cutoffs``. Labels in the first range
    are scored directly by ``self.head`` with a logits-based binary
    cross-entropy. Every further range ``i`` is scored by
    ``self.tail[i - 1]``, whose sigmoid outputs are scaled by the sigmoid of
    that cluster's root logit from the head. For rows that contain a label
    of a cluster, the cluster's root term is removed from the head loss and
    the row's normaliser grows by ``self.cluster_size`` instead.

    Args:
        input: tensor whose first dimension is the batch; it is fed to
            ``self.head`` and ``self.tail[...]``.
        target: integer label tensor, reduced along dim 1 here
            (presumably (batch, labels-per-sample) with padding values
            outside every cutoff range -- TODO confirm against the caller).

    Returns:
        Scalar tensor: ``((head_loss + cluster_loss) / num_terms).mean()``.

    Raises:
        RuntimeError: if ``input`` and ``target`` differ in batch size.
    """
    if input.size(0) != target.size(0):
        raise RuntimeError('Input and target should have the same size '
                           'in the batch dimension.')

    batch_size = target.size(0)

    total_cluster_loss = input.new_zeros(batch_size)
    head_onehot = target.new_zeros(batch_size, self.cutoffs[0])
    cluster_onehot = target.new_zeros(batch_size, self.n_clusters)

    # both modules are stateless, so build them once instead of per cluster
    sig_func = Sigmoid()
    cluster_loss_fct = BCELoss(reduction='none')

    cutoff_values = [0] + self.cutoffs
    for i in range(len(cutoff_values) - 1):
        low_idx = cutoff_values[i]
        high_idx = cutoff_values[i + 1]
        num_idx = high_idx - low_idx

        # rows holding at least one label inside [low_idx, high_idx)
        target_mask = (target >= low_idx) & (target < high_idx)
        target_mask_row = torch.sum(target_mask, dim=1)
        row_indices = target_mask_row.nonzero().squeeze()

        if row_indices.numel() == 0:
            continue

        input_subset = input.index_select(0, row_indices)
        target_onehot = self.get_multi_hot_label(target, target_mask, row_indices,
                                                 low_idx, num_idx).detach()

        if i == 0:
            # shortlist labels are handled together with the head loss below
            head_onehot.index_copy_(0, row_indices, target_onehot)
        else:
            head_output = self.head(input_subset)
            cluster_root_output = head_output[:, self.shortlist_size + i - 1]
            # a diagonal matrix turns the per-row root probability into a
            # row-wise scaling of the cluster probabilities
            cluster_root_output = torch.diag(sig_func(cluster_root_output))
            cluster_output = self.tail[i - 1](input_subset)
            cluster_output = torch.mm(cluster_root_output, sig_func(cluster_output))

            # mark the rows that own a label of this cluster
            temp_onehot = target.new_zeros(batch_size).index_fill_(0, row_indices, 1)
            cluster_onehot[:, i - 1] = temp_onehot

            loss = cluster_loss_fct(cluster_output.view(-1, num_idx),
                                    target_onehot.view(-1, num_idx).float())
            loss = torch.sum(loss, dim=1)

            # scatter the per-row cluster loss back to full batch size
            temp_loss = input.new_zeros(batch_size)
            total_cluster_loss += temp_loss.index_copy_(0, row_indices, loss)

    head_output = self.head(input)
    head_onehot = torch.cat((head_onehot, cluster_onehot), dim=1)
    loss_fct = BCEWithLogitsLoss(reduction='none')
    head_loss = loss_fct(head_output.view(-1, self.head_size),
                         head_onehot.view(-1, self.head_size).float())

    # zero the root term of every cluster the row already paid for above
    cluster_root_loss = head_loss[:, self.shortlist_size:]
    multiplier = (cluster_onehot == 0).long()
    cluster_root_loss = cluster_root_loss * multiplier.float()
    head_loss[:, self.shortlist_size:] = cluster_root_loss
    head_loss = torch.sum(head_loss, dim=1)

    # Create the size tensor on the device of the tensor it multiplies; the
    # previous unconditional .cuda() failed on CPU-only and mixed-device runs.
    cluster_sizes = torch.tensor(self.cluster_size, device=cluster_onehot.device)
    multiplier += cluster_onehot * cluster_sizes
    num_loss = torch.sum(multiplier, dim=1) + self.shortlist_size

    loss = ((head_loss + total_cluster_loss) / num_loss.float()).mean()

    return loss
def __init__(self, input_shape, n_convfilter,
             n_fc_filters, h_shape, conv3d_filter_shape):
    """Create the six convolutions, the fully connected layer and the three FCConv3D layers.

    Args:
        input_shape: indexable; ``input_shape[1]`` is the channel count fed
            to ``conv1`` (noted in the original as
            ``(batch_size, 3, img_w, img_h)``). The whole shape is also
            passed to ``fc_in_featmap_size``.
        n_convfilter: output channel counts of ``conv1`` .. ``conv6``.
        n_fc_filters: ``n_fc_filters[0]`` is the output width of ``fc7``.
        h_shape: forwarded to every ``FCConv3DLayer_torch``.
        conv3d_filter_shape: forwarded to every ``FCConv3DLayer_torch``.
    """
    print("initializing \"encoder\"")
    #input_shape = (self.batch_size, 3, img_w, img_h)
    super(encoder, self).__init__()

    def same_padded_conv(n_in, n_out, kernel_size):
        # Kernel size and padding come from the same value. Previously
        # conv3..conv6 read conv2's kernel-size variable but padded by
        # their own, which only agreed because every value was 3.
        return Conv2d(in_channels=n_in,
                      out_channels=n_out,
                      kernel_size=kernel_size,
                      padding=int((kernel_size - 1) / 2))

    #conv1 uses a 7x7 kernel, conv2..conv6 use 3x3 kernels
    self.conv1 = same_padded_conv(input_shape[1], n_convfilter[0], 7)
    self.conv2 = same_padded_conv(n_convfilter[0], n_convfilter[1], 3)
    self.conv3 = same_padded_conv(n_convfilter[1], n_convfilter[2], 3)
    self.conv4 = same_padded_conv(n_convfilter[2], n_convfilter[3], 3)
    self.conv5 = same_padded_conv(n_convfilter[3], n_convfilter[4], 3)
    self.conv6 = same_padded_conv(n_convfilter[4], n_convfilter[5], 3)

    #pooling layer
    self.pool = MaxPool2d(kernel_size=2, padding=1)

    #nonlinearities of the network
    self.leaky_relu = LeakyReLU(negative_slope=0.01)
    self.sigmoid = Sigmoid()
    self.tanh = Tanh()

    #find the input feature map size of the fully connected layer
    fc7_feat_w, fc7_feat_h = self.fc_in_featmap_size(input_shape, num_pooling=6)

    #define the fully connected layer
    self.fc7 = Linear(int(n_convfilter[5] * fc7_feat_w * fc7_feat_h), n_fc_filters[0])

    #define the FCConv3DLayers in 3d convolutional gru unit
    self.t_x_s_update = FCConv3DLayer_torch(n_fc_filters[0], conv3d_filter_shape, h_shape)
    self.t_x_s_reset = FCConv3DLayer_torch(n_fc_filters[0], conv3d_filter_shape, h_shape)
    self.t_x_rs = FCConv3DLayer_torch(n_fc_filters[0], conv3d_filter_shape, h_shape)
# of the input parameters.
hidden_size = 2

# Since we are generating a binary class, we need to identify to which blob (class) the
# point belongs to.
output_size = 1

# Labels become a column so they line up with the single model output.
y = np.reshape(y, (len(y), 1))
inputs = torch.tensor(X, dtype=torch.float)
labels = torch.tensor(y, dtype=torch.float)

# We write a simple sequential two layer neural network model.
# The second Linear layer consumes the hidden layer's output, so its
# in_features is hidden_size (it used to be input_size, which only worked
# while both sizes happened to be equal).
model = Sequential(Linear(in_features=input_size, out_features=hidden_size),
                   ReLU(),
                   Linear(in_features=hidden_size, out_features=output_size),
                   Sigmoid())

# Setup the loss function. We are currently using Binary Cross Entropy
# You can also use torch.nn.BCEWithLogitsLoss and remove the Sigmoid
# layer from the model as this is already included in the loss function.
criterion = torch.nn.BCELoss(reduction='mean')

# Setup the optimizer to determine the parameters for the neural network
# to do binary classification. Do play around with other optimizers.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# How many epochs should be used for the model training?
num_epochs = 30

# At what frequency should we print the current loss.
print_freq = 10
def __init__(self):
    """Create three linear layers (8 -> 6, 6 -> 4, 4 -> 1) and a sigmoid module."""
    super(Model, self).__init__()
    widths = (8, 6, 4, 1)
    self.l1 = Linear(widths[0], widths[1])
    self.l2 = Linear(widths[1], widths[2])
    self.l3 = Linear(widths[2], widths[3])
    self.sigmoid = Sigmoid()