def __init__(self, args, pretrained_word_matrix):
    super(BiLSTM_CNN_CRF, self).__init__()
    self.args = args

    self.char_cnn = CharCNN(max_word_len=args.max_word_len,
                            kernel_lst=args.kernel_lst,
                            num_filters=args.num_filters,
                            char_vocab_size=args.char_vocab_size,
                            char_emb_dim=args.char_emb_dim,
                            final_char_dim=args.final_char_dim)

    if pretrained_word_matrix is not None:
        self.word_emb = nn.Embedding.from_pretrained(pretrained_word_matrix)
    else:
        self.word_emb = nn.Embedding(args.word_vocab_size, args.word_emb_dim, padding_idx=0)
        nn.init.uniform_(self.word_emb.weight, -0.25, 0.25)

    self.bi_lstm = nn.LSTM(input_size=args.word_emb_dim + args.final_char_dim,
                           hidden_size=args.hidden_dim // 2,  # Bidirectional will double the hidden_size
                           bidirectional=True,
                           batch_first=True)

    self.output_linear = nn.Linear(args.hidden_dim, len(get_labels(args)))

    self.crf = CRF(num_tags=len(get_labels(args)), batch_first=True)
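
# The CRF layer above is presumably the one from the pytorch-crf package (the
# num_tags/batch_first keywords match its constructor); a minimal standalone sketch
# of the usual train/decode pattern follows. Shapes and values are illustrative
# assumptions, not taken from this repository.
import torch
from torchcrf import CRF

num_tags, batch_size, seq_len = 5, 2, 7
crf = CRF(num_tags=num_tags, batch_first=True)

emissions = torch.randn(batch_size, seq_len, num_tags)      # e.g. output of a per-token linear layer
tags = torch.randint(num_tags, (batch_size, seq_len))       # gold tag ids
mask = torch.ones(batch_size, seq_len, dtype=torch.bool)    # True for real tokens, False for padding

loss = -crf(emissions, tags, mask=mask, reduction='mean')   # negative log-likelihood for training
best_paths = crf.decode(emissions, mask=mask)               # List[List[int]] of predicted tag ids
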
def process_image(self, image_path, waitTime=0):
    emotion_labels = get_labels('fer2013')
    gender_labels = get_labels('imdb')
    font = cv2.FONT_HERSHEY_SIMPLEX
    x_offset_emotion = 20
    y_offset_emotion = 40
    x_offset = 30
    y_offset = 60

    if type(image_path) is str:
        frame = cv2.imread(image_path)
    else:
        frame = image_path
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    faces = self.fd.process(frame)
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    for (x, y, w, h) in faces:
        face = frame[(y - y_offset):(y + h + y_offset), (x - x_offset):(x + w + x_offset)]
        gray_face = gray[(y - y_offset_emotion):(y + h + y_offset_emotion),
                         (x - x_offset_emotion):(x + w + x_offset_emotion)]
        try:
            face = cv2.resize(face, (48, 48))
            gray_face = cv2.resize(gray_face, (48, 48))
        except:
            continue

        face = np.expand_dims(face, 0)
        face = preprocess_input(face)
        gender_label_arg = np.argmax(self.gender_classifier.predict(face))
        gender = gender_labels[gender_label_arg]

        gray_face = preprocess_input(gray_face)
        gray_face = np.expand_dims(gray_face, 0)
        gray_face = np.expand_dims(gray_face, -1)
        emotion_label_arg = np.argmax(self.emotion_classifier.predict(gray_face))
        emotion = emotion_labels[emotion_label_arg]

        if gender == gender_labels[0]:
            gender_color = (0, 0, 255)
        else:
            gender_color = (255, 0, 0)

        cv2.rectangle(frame, (x, y), (x + w, y + h), gender_color, 2)
        cv2.putText(frame, emotion, (x, y - 40), font, 0.5, gender_color, 2, cv2.LINE_AA)
        cv2.putText(frame, gender, (x, y - 40 + 20), font, 0.5, gender_color, 2, cv2.LINE_AA)

    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    # cv2.imwrite('predicted_test_image.png', frame)
    cv2.imshow('predicted test image', frame)
    cv2.waitKey(waitTime)
def main():
    args = parse_args()
    logging.info("\n\targs: {}\n".format(args))

    labels = None
    if args.label is not None:
        logging.info("Init labels ...")
        labels = get_labels(args.label)

    logging.info("Init data loader ...")
    data_loader = DataLoader(args.link, args.node_count)

    with tf.Graph().as_default(), tf.Session() as session:
        # logging.info("loading train data")
        # train_samps = np.loadtxt(args.train_data, dtype=int)
        # train_samps = get_sample(args.train_data)
        total_samples = args.batch_size * args.num_batches

        logging.info("Initializing LINE model")
        model = LINE(args, session, args.node_count, total_samples)

        logging.info("training ...")
        model.train(data_loader, labels)

        if args.save != "":
            logging.info("Saving ...")
            model._saver.save(session,
                              os.path.join(args.save, "model.ckpt"),
                              global_step=args.epochs)
def __init__(self, args, train_dataset=None, dev_dataset=None, test_dataset=None):
    self.args = args
    self.train_dataset = train_dataset
    self.dev_dataset = dev_dataset
    self.test_dataset = test_dataset

    self.label_lst = get_labels(args)
    self.num_labels = len(self.label_lst)
    # Use the cross-entropy ignore index as the padding label id so that only real label ids contribute to the loss
    self.pad_token_label_id = args.ignore_index

    self.word_vocab, self.char_vocab, _, _ = load_vocab(args)
    self.pretrained_word_matrix = None
    if not args.no_w2v:
        self.pretrained_word_matrix = load_word_matrix(args, self.word_vocab)

    self.model = BiLSTM_CNN_CRF(args, self.pretrained_word_matrix)

    # GPU or CPU
    self.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
    self.model.to(self.device)

    self.test_texts = None
    if args.write_pred:
        self.test_texts = get_test_texts(args)
        # Empty the original prediction files
        if os.path.exists(args.pred_dir):
            shutil.rmtree(args.pred_dir)
def __init__(self, data_path, background_id=None, class_names=None, dataset_name=None,
             suffix='.jpg', use_bounding_boxes=False, use_classes=False):
    self.path_prefix = data_path
    self.background_id = background_id
    if class_names is None:
        self.arg_to_class = get_labels(dataset_name='german_open_2017')
        self.class_to_arg = {value: key for key, value in self.arg_to_class.items()}
        self.class_names = list(self.class_to_arg.keys())
        self.suffix = suffix
    else:
        if background_id is not None and background_id != -1:
            class_names.insert(background_id, 'background')
        elif background_id == -1:
            class_names.append('background')
        keys = np.arange(len(class_names))
        self.arg_to_class = dict(zip(keys, class_names))
        self.class_names = class_names
        # consider adding the suffix here as well
        # self.suffix = suffix
    self.data = dict()
    self.use_bounding_boxes = use_bounding_boxes
    self._preprocess_XML()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', default=None, type=str, required=True)
    parser.add_argument('--w2v_path', default=None, type=str, required=True)
    parser.add_argument('--labels', default=None, type=str, required=True)
    parser.add_argument('--batch_size', default=32, type=int)
    parser.add_argument('--epochs', default=3, type=int)
    parser.add_argument('--logging_steps', default=20, type=int)
    parser.add_argument('--learning_rate', default=5e-3, type=float)
    args = parser.parse_args()
    args.device = torch.device('cuda')

    labels = get_labels(args.labels)
    glove = GloVe(cache=args.w2v_path)

    # model
    model = LstmCrf(w2v=glove, num_tags=len(labels), hidden_dim=512)
    model.to(args.device)

    # dataset
    train_dataset = NerDataset(args.data_dir, labels, glove, mode='train')
    eval_dataset = NerDataset(args.data_dir, labels, glove, mode='dev')

    # train
    train(args, model, train_dataset)

    # eval
    result = eval(args, model, eval_dataset, labels)
    print(result)
def __init__(self, args, train_dataset=None, dev_dataset=None, test_dataset=None):
    self.args = args
    self.train_dataset = train_dataset
    self.dev_dataset = dev_dataset
    self.test_dataset = test_dataset

    self.label_lst = get_labels(args)
    self.num_labels = len(self.label_lst)
    # Use the cross-entropy ignore index as the padding label id so that only real label ids contribute to the loss
    self.pad_token_label_id = args.ignore_index

    self.config_class, self.model_class, _ = MODEL_CLASSES[args.model_type]
    self.bert_config = self.config_class.from_pretrained(args.model_name_or_path,
                                                         num_labels=self.num_labels,
                                                         finetuning_task=args.task)
    self.model = self.model_class(self.bert_config, args)

    # GPU or CPU
    self.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
    self.model.to(self.device)
def Nonlinear_Trainer():
    print("Load the training data...")
    start_time = time.time()
    train_imgs, train_idxs = load_train_data(data_dir)
    del train_imgs
    print("{:.4f} seconds".format(time.time() - start_time))

    print("Extract the image features...")
    train_features = np.load('./train_bow.npy')

    print('Train the classifiers...')
    accuracy = 0
    models = {}
    for class_name in category:
        target_idxs = np.array([
            read_txt(os.path.join(data_dir, '{}_train.txt'.format(class_name)))
        ])
        target_labels = get_labels(train_idxs, target_idxs)
        models[class_name] = nonlinear_classifier(train_features, target_labels)
        train_accuracy = models[class_name].score(train_features, target_labels)
        print('{} Classifier train accuracy: {:.4f}'.format(class_name, train_accuracy))
        accuracy += train_accuracy

    print('Average train accuracy: {:.4f}'.format(accuracy / len(category)))
    del train_features, target_labels, target_idxs
    return models
def Nonlinear_Test(models):
    print("Load the validation data...")
    start_time = time.time()
    val_imgs, val_idxs = load_val_data(data_dir)
    print("{:.4f} seconds".format(time.time() - start_time))
    del val_imgs

    print("Extract the image features...")
    val_features = np.load('./val_bow.npy')

    print('Test the classifiers...')
    accuracy = 0
    for class_name in category:
        target_idxs = np.array([
            read_txt(os.path.join(data_dir, '{}_val.txt'.format(class_name)))
        ])
        target_labels = get_labels(val_idxs, target_idxs)
        val_accuracy = models[class_name].score(val_features, target_labels)
        print('{} Classifier validation accuracy: {:.4f}'.format(class_name, val_accuracy))
        accuracy += val_accuracy

    del val_features, target_idxs, target_labels
    print('Average validation accuracy: {:.4f}'.format(accuracy / len(category)))
def train_model(train_inputs, train_labels, test_data, test_labels, model, optimizer,
                criterion, kmer_size, with_attention):
    losses = []
    print('Training the model:')
    start_time = time.time()
    train_accuracies, test_accuracies = [], []
    # labels_hat = []
    test_labels = utils.get_labels(Config.positive_test_sample_size,
                                   Config.negative_test_sample_size)
    bar = Bar('Processing', max=Config.num_epochs)
    # print('Attention Weights before training:', model.context)
    for epoch in range(Config.num_epochs):  # loop over the dataset multiple times
        loss, acc = train_epoch(model, train_inputs, train_labels, optimizer, criterion)
        losses.append(loss)
        train_accuracy = 100 * (acc[0] / (acc[0] + acc[1]))
        train_accuracies.append(train_accuracy)
        torch.save(model.state_dict(), Config.test_model_name)
        test_accuracy = test.test(test_data, test_labels, kmer_size, Config.test_model_name)
        test_accuracies.append(test_accuracy)
        bar.next()
    bar.finish()
    torch.save(model.state_dict(), Config.test_model_name)
    print('Finished. Training took %.3f' % ((time.time() - start_time) / 60), 'minutes.')
    # print('Attention Weights after training:', model.context)
    return losses, train_accuracies, test_accuracies
def gen_csv_report(test_file, pred_file, report_file=None):
    label2idx, idx2char = get_labels(os.path.join(BASE_PATH, 'corpus/labels.lst'))
    csv_dict = collections.OrderedDict()
    for key1 in label2idx:
        csv_dict[key1.strip()] = collections.OrderedDict()
        for key2 in label2idx:
            csv_dict[key1.strip()][key2.strip()] = 0
    # print(csv_dict)

    f_test = open(test_file, 'r', encoding='utf-8')
    f_pred = open(pred_file, 'r', encoding='utf-8')
    for (test_line, pred_line) in zip(f_test, f_pred):
        *test_s, test_label = test_line[:-1].split('\t')
        *pred_s, pred_label = pred_line[:-1].split('\t')
        csv_dict[test_label][pred_label] += 1

    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(' ')
        for key in label2idx:
            f.write(',' + key)
        f.write('\n')
        for key in label2idx:
            f.write(key)
            for k in label2idx:
                f.write(',' + str(csv_dict[key][k]))
            f.write('\n')
def __init__(self, args, train_dataset=None, dev_dataset=None, test_dataset=None):
    self.args = args
    self.train_dataset = train_dataset
    self.dev_dataset = dev_dataset
    self.test_dataset = test_dataset

    self.label_lst = get_labels(args)
    self.num_labels = len(self.label_lst)
    # Use the cross-entropy ignore index as the padding label id so that only real label ids contribute to the loss
    self.pad_token_label_id = torch.nn.CrossEntropyLoss().ignore_index

    self.config_class, self.model_class, _ = MODEL_CLASSES[args.model_type]
    self.config = self.config_class.from_pretrained(
        args.model_name_or_path,
        num_labels=self.num_labels,
        finetuning_task=args.task,
        id2label={str(i): label for i, label in enumerate(self.label_lst)},
        label2id={label: i for i, label in enumerate(self.label_lst)})
    self.model = self.model_class.from_pretrained(args.model_name_or_path, config=self.config)

    # GPU or CPU
    self.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
    self.model.to(self.device)

    self.test_texts = None
    if args.write_pred:
        self.test_texts = get_test_texts(args)
        # Empty the original prediction files
        if os.path.exists(args.pred_dir):
            shutil.rmtree(args.pred_dir)
def split_dataset(*guids, test_size, seed):
    random.seed(seed)
    labels = [
        '{}_{}'.format(*t)
        for t in get_labels(*guids, vendor=True, train=None, flatten=True)
    ]
    stats = defaultdict(list)
    for guid, label in zip(guids, labels):
        stats[label].append(guid)

    train_set = defaultdict(list)
    test_set = defaultdict(list)
    for label in stats:
        if len(stats[label]) < 2:
            continue
        num_sample = len(stats[label])
        num_test = max(int(num_sample * test_size), 1)
        idx_test = random.choice(num_sample, num_test, replace=False)
        for i in range(num_sample):
            if i in idx_test:
                test_set[label].append(stats[label][i])
            else:
                train_set[label].append(stats[label][i])

    with open(os.path.join(DATA_FOLDER, 'set', 'train.json'), 'w') as f:
        json.dump(train_set, f)
    with open(os.path.join(DATA_FOLDER, 'set', 'test.json'), 'w') as f:
        json.dump(test_set, f)
    return train_set, test_set
def KobertModelLoader():
    init_logger()
    global parser
    global pred_config
    global args
    global device
    global model
    global label_lst

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", default="./model", type=str,
                        help="Path to save, load model")
    parser.add_argument("--batch_size", default=32, type=int,
                        help="Batch size for prediction")
    parser.add_argument("--no_cuda", action="store_true",
                        help="Avoid using CUDA when available")
    pred_config = parser.parse_args()

    # load model and args
    args = get_args(pred_config)
    device = get_device(pred_config)
    model = load_model(pred_config, args, device)
    label_lst = get_labels(args)
def __init__(self):
    self.datasets = get_datasets(heart_diseases, n_inputs)
    self.label_data = get_labels(self.datasets)
    self.callbacks = []

    # Initialize callbacks
    tensorboard_logs_path = "tensorboard_data/cnn/"
    tb_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard_logs_path,
                                                 histogram_freq=1,
                                                 write_graph=True,
                                                 embeddings_freq=1)
    # load_weights_on_restart will read the filepath of the weights if it exists
    # and load the weights into the model
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath="saved_models/cnn/model.hdf5",
        save_best_only=True,
        save_weights_only=True,
        load_weights_on_restart=restore_model)
    self.callbacks.extend([tb_callback, cp_callback])

    self.set_data()
    self.define_model()
def load_data(self, is_training=True):
    if is_training:
        idx = np.random.choice(len(self.trainidx), size=self.batch_size)
        trajectories = [self.load_feature(self.trainidx[i]) for i in idx]
        if not len(self.labels):
            # Equi-partition subgoals
            labels = [
                utils.get_labels(len(trajectories[i]), self.num_subgoals)
                for i in range(len(idx))
            ]
        else:
            # Estimated subgoals
            labels = [self.labels[self.trainidx[i]] for i in idx]
        return trajectories, labels
    else:
        feat = []
        feat.append(self.load_feature(self.testidx[self.currenttestidx]))
        if self.currenttestidx == len(self.testidx) - 1:
            done = True
            self.currenttestidx = 0
        else:
            done = False
            self.currenttestidx += 1
        return feat, None, done
def accuracy(self, encoder, dataloader_eval, classes, device=torch.device("cpu"),
             print_summary=False):
    if self.conv_part:
        encoder_depth = 6
    else:
        encoder_depth = 9

    # Empty tensors to store predictions and labels
    predictions_soft = torch.Tensor().float().to(device)
    labels = np.array([])

    print('[Evaluation of the samples...]')
    self.eval()  # Validation mode
    encoder.eval()
    with torch.no_grad():  # No need to track the gradients
        for batch in tqdm(dataloader_eval):
            # Extract noisy waterfalls and move tensors to the selected device
            noisy_waterfalls, _, _, targets_labels = utils.get_labels(batch, device)

            encoded_waterfalls = encoder.encode(noisy_waterfalls.to(device),
                                                encoder_depth=encoder_depth)
            targets = self.forward(encoded_waterfalls)
            predictions_soft = torch.cat((predictions_soft, targets), dim=0)
            # loss = self.branches['NumTarget'].loss_fn(targets, targets_labels)

            # Flatten the signals and append them to the labels
            labels = np.append(labels, batch['Parameters']['num_Targets'])

    # Compute hard predictions and convert the tensors to numpy vectors
    try:
        _, preds = torch.max(predictions_soft.data, dim=1)
    except:
        preds = torch.zeros(0)
    preds = preds.cpu().numpy()

    print('[Computation of the accuracy metrics...]')
    # Collect several evaluation metrics
    conf_matrix = np.zeros((13, 13))
    try:
        conf_matrix = confusion_matrix(labels, preds, labels=classes['t'])
    except:
        pass

    result_metrics = {
        'matrix': conf_matrix,
        'accuracy': accuracy_score(labels, preds),
        'balanced_accuracy': balanced_accuracy_score(labels, preds)
    }

    return result_metrics
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    serv_motion = self.add_preload_service('MotionSensor')
    self.char_detected = serv_motion.configure_char('MotionDetected')
    self.engine = ClassificationEngine("./models/classify.tflite")
    self.is_trained = retrain()
    self.labels = get_labels()
    self.is_running = True
    logging.info(self.setup_message())
def test(models, test_data, configs, epochs=50):
    x_test, y_test = test_data
    labels = get_labels(configs)
    for model, label in zip(models, labels):
        y_pred = model.predict(x_test)
        print("Model")
        print(" Configuration\n ", end="")
        print(*label.replace(" -", ":").split(", "), sep="\n ")
        print(" Accuracy")
        print(f" Epoch {epochs}: {accuracy(y_test, y_pred) * 100:.2f}%")
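
# The accuracy() helper used above is not shown; a minimal sketch under the
# assumption that y_test and y_pred are 1-D arrays of class labels (a hypothetical
# implementation for illustration, not necessarily the project's own).
import numpy as np

def accuracy(y_true, y_pred):
    """Fraction of predictions that exactly match the ground-truth labels."""
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return float(np.mean(y_true == y_pred))
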
def set_test(model, test_iter):
    if not test_iter.is_test:
        test_iter.is_test = True
    labels = get_labels()
    idx2tag = dict(zip(range(len(labels)), labels))
    model.eval()
    with torch.no_grad():
        true_tags, pred_tags = [], []
        for input_ids_list, input_mask_list, segment_ids_list, label_ids_list, tokens_list in tqdm(test_iter):
            input_ids = list2ts2device(input_ids_list)
            input_mask = list2ts2device(input_mask_list)
            segment_ids = list2ts2device(segment_ids_list)
            batch_output = model(input_ids=input_ids,
                                 token_type_ids=segment_ids,
                                 attention_mask=input_mask)

            # Recover the true (unpadded) length of each label sequence
            real_batch_tags = []
            for i in range(config.batch_size):
                real_len = int(input_mask[i].sum())
                real_batch_tags.append(label_ids_list[i][:real_len])  # List[int]

            pred_tags.extend([idx2tag.get(idx) for indices in batch_output for idx in indices])
            true_tags.extend([idx2tag.get(idx) for indices in real_batch_tags for idx in indices])

    assert len(pred_tags) == len(true_tags), 'len(pred_tags) is not equal to len(true_tags)!'

    # Logging loss, F1 and report
    target_names = set(config.tags) - {"[PAD]", "[CLS]", "[SEP]", "O"}
    evaluation_dict = classification_report(true_tags, pred_tags, digits=4, output_dict=True)
    precision = 0
    recall = 0
    f1 = 0
    for key in evaluation_dict.keys():
        if key in target_names:
            precision += evaluation_dict[key]['precision']
            recall += evaluation_dict[key]['recall']
            f1 += evaluation_dict[key]['f1-score']
    f1 = f1 / len(target_names)
    precision = precision / len(target_names)
    recall = recall / len(target_names)
    print('precision: {:.4f}, recall: {:.4f}, F1: {:.4f}'.format(precision, recall, f1))
    return precision, recall, f1
def role_process_binary(input_file, output_file, is_predict=False):
    label_list = get_labels(task="role", mode="classification")
    label_map = {label: i for i, label in enumerate(label_list)}

    rows = open(input_file, encoding='utf-8').read().splitlines()
    results = []
    count = 0
    for row in rows:
        if len(row) == 1:
            print(row)
        row = json.loads(row)
        count += 1
        if "id" not in row:
            row["id"] = count

        start_labels = ['O'] * len(row["text"])
        end_labels = ['O'] * len(row["text"])
        arguments = []

        if is_predict:
            results.append({
                "id": row["id"],
                "tokens": list(row["text"]),
                "start_labels": start_labels,
                "end_labels": end_labels,
                "arguments": arguments
            })
            continue

        for event in row["event_list"]:
            event_type = event["event_type"]
            for arg in event["arguments"]:
                role = arg['role']
                role_id = label_map[role]
                argument = arg['argument']
                argument_start_index = arg["argument_start_index"]
                argument_end_index = argument_start_index + len(argument) - 1

                if start_labels[argument_start_index] == "O":
                    start_labels[argument_start_index] = role
                else:
                    start_labels[argument_start_index] += (" " + role)

                if end_labels[argument_end_index] == "O":
                    end_labels[argument_end_index] = role
                else:
                    end_labels[argument_end_index] += (" " + role)

                if arg['alias'] != []:
                    print(arg['alias'])
                arg.pop('alias')
                arguments.append(arg)

        results.append({
            "id": row["id"],
            "tokens": list(row["text"]),
            "start_labels": start_labels,
            "end_labels": end_labels,
            "arguments": arguments
        })

    write_file(results, output_file)
def index_output_segment_bin(test_file, prediction_file, output_file):
    label_list = get_labels(task='role', mode="classification")
    label_map = {i: label for i, label in enumerate(label_list)}

    tests = open(test_file, encoding='utf-8').read().splitlines()
    predictions = open(prediction_file, encoding='utf-8').read().splitlines()
    results = []
    index = 0
    max_length = 256 - 2
    for test, prediction in zip(tests, predictions):
        index += 1
        test = json.loads(test)
        start_labels = test.pop('start_labels')
        end_labels = test.pop('end_labels')
        tokens = test.pop('tokens')
        text = ''.join(tokens)
        test['text'] = text

        segment_ids = test.pop('segment_ids')
        trigger = ''.join([tokens[i] for i in range(len(tokens)) if segment_ids[i]])
        for i in range(len(tokens)):
            if segment_ids[i]:
                trigger_start_index = i
                break

        event = {}
        # event['trigger'] = trigger
        # event['trigger_start_index'] = trigger_start_index
        event_type = test.pop("event_type")
        event["event_type"] = event_type

        prediction = json.loads(prediction)
        arg_list = prediction["labels"]
        arguments = []
        for arg in arg_list:
            sub_dict = {}
            argument_start_index = arg[1] - 1
            argument_end_index = arg[2] - 1
            argument = text[argument_start_index:argument_end_index + 1]
            role = label_map[arg[3]]
            sub_dict["role"] = role
            sub_dict["argument"] = argument
            # sub_dict["argument_start_index"] = argument_start_index
            arguments.append(sub_dict)

        event["arguments"] = arguments
        test['event_list'] = [event]
        results.append(test)

    write_file(results, output_file)
def do_submission():
    train, test = load_dataset()
    train_X = train["tweet"]
    train_Y = get_labels(train)
    test_X = test["tweet"]
    feature_type = ["wordcount", "char"]
    test_ids = get_test_ids(test)
    meta_train_X, meta_test_X = get_extracted_features(feature_type, train_X, test_X)
    print("n_samples: %d, n_features: %d" % meta_train_X.shape)
    predict_and_sub(meta_train_X, train_Y.values, meta_test_X, test_ids, predict_ridge)
def load_and_cache_examples(config, task, tokenizer, evaluate=False, test=False):
    if config.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes the dataset;
        # the others will use the cache
        torch.distributed.barrier()

    processor = GeneralProcessor()
    output_mode = "classification"

    # Load data features from cache or dataset file
    evaluation_set_name = 'test' if test else 'dev'
    cached_features_file = os.path.join(
        config.data_dir,
        'cached_{}_{}_{}_{}'.format(
            evaluation_set_name if evaluate else 'train',
            list(filter(None, config.pretrained_model_name.split('/'))).pop(),
            str(config.max_seq_len),
            str(task)))
    if os.path.exists(cached_features_file):
        logger.info(f"Loading features from cached file {cached_features_file}")
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", config.data_dir)
        label_list = get_labels(config.data_dir)
        examples = (processor.get_dev_examples(config.data_dir) if evaluate
                    else processor.get_train_examples(config.data_dir))
        features = convert_examples_to_features(
            examples,
            label_list,
            config.max_seq_len,
            tokenizer,
            "classification",
            use_entity_indicator=config.use_entity_indicator)
        if config.local_rank in [-1, 0]:
            logger.info(f"Saving features into cached file {cached_features_file}")
            torch.save(features, cached_features_file)

    if config.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes the dataset;
        # the others will use the cache
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_e1_mask = torch.tensor([f.e1_mask for f in features], dtype=torch.long)  # add e1 mask
    all_e2_mask = torch.tensor([f.e2_mask for f in features], dtype=torch.long)  # add e2 mask
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,
                            all_e1_mask, all_e2_mask)
    return dataset
def __init__(self, weights=None, biases=None):
    self.weights = weights if weights else self.weights
    self.biases = biases if biases else self.biases
    self.datasets = get_datasets(heart_diseases, nr_inputs)
    self.label_data = get_labels(self.datasets)
    self.saver = ModelSaver(save_dir="saved_models/cnn/")

    logs_path = "tensorboard_data/cnn/"
    self.tensorboard_handler = TensorBoardHandler(logs_path)
    self.tensorboard_handler.add_histograms(self.weights)
    self.tensorboard_handler.add_histograms(self.biases)

    self.build()
def train(loader, model, optimizer, criterion, epoch, d1, d2, blind, noise_level):
    running_l1 = 0
    train_l1 = 0
    model.train(True)

    k1 = model.weight[0].unsqueeze(0).expand(loader.batch_size, -1, -1, -1)
    k2 = model.weight[1].unsqueeze(0).expand(loader.batch_size, -1, -1, -1)
    d1 = d1.expand(loader.batch_size, -1, -1, -1)
    d2 = d2.expand(loader.batch_size, -1, -1, -1)

    for i, data in tqdm.tqdm(enumerate(loader)):
        x, y, mag, ori = data
        x = x.to(device)
        y = y.to(device)
        mag = mag.to(device)
        ori = ori.to(device)
        ori = (90 - ori).add(360).fmod(180)
        labels = utils.get_labels(mag, ori)
        ori = ori * np.pi / 180

        if blind:
            nl = (noise_level - 0.5) * np.random.rand(1) + 0.5
        else:
            nl = noise_level
        nl = float(nl) / 255
        y += nl * torch.randn_like(y)
        y = y.clamp(0, 1)
        y.requires_grad_()

        optimizer.zero_grad()
        hat_x = model(y, mag, ori, labels, k1, k2, d1, d2)
        error = criterion(hat_x, x)
        error.backward()
        optimizer.step()

        # computing running loss
        running_l1 += F.l1_loss(hat_x[-1], x).item()
        train_l1 += F.l1_loss(hat_x[-1], x).item()
        if (i + 1) % 500 == 0:
            running_l1 /= 500
            print(' Running loss %2.5f' % (running_l1))
            running_l1 = 0

    return train_l1 / len(loader)
def parse_study_dir(data_dir, sample_to_label, label_to_encoding, genes_to_keep):
    '''This function extracts the gene expression data and labels for a single study

    Arguments
    ---------
    data_dir: str
        The path to the directories where the data are stored. These are generally directories
        within the unzipped main directory downloaded from refine.bio, and will contain data
        for a single study.
    sample_to_label: dict
        A dictionary mapping sample identifiers to their corresponding labels
    label_to_encoding: dict
        A dictionary mapping the string label (e.g. 'sepsis') to a numerical target like 0
    genes_to_keep: list of strs
        The list of gene identifiers to be kept in the dataframe

    Returns
    -------
    curr_df: pandas.DataFrame
        A single dataframe containing the expression data of all genes in genes_to_keep
        for all samples in the study
    study_labels: list of ints
        Labels corresponding to whether each sample represents septic or healthy gene expression
    '''
    study = os.path.basename(os.path.normpath(data_dir))
    study_file_name = study + '.tsv'
    data_file = os.path.join(data_dir, study_file_name)

    curr_df = pd.read_csv(data_file, sep='\t')
    curr_df = curr_df.set_index('Gene')

    # Remove samples that don't fall into a class of interest
    labels_to_keep = label_to_encoding.keys()
    curr_df = utils.keep_samples_with_labels(curr_df, sample_to_label, labels_to_keep)

    # If keep_samples_with_labels returns None, we should return None for the labels as well
    if curr_df is None:
        return (None, None)

    # Retrieve labels for each sample
    study_labels = utils.get_labels(curr_df, sample_to_label, label_to_encoding)

    curr_df = curr_df.loc[genes_to_keep, :]

    return curr_df, study_labels
def fix_masks(image_dir, masks_dir):
    images_filenames = np.array(sorted(glob.glob(image_dir + "/*.tif")))
    for filename in images_filenames:
        dataset = rasterio.open(filename)
        meta = dataset.profile
        labels_dict, num_labels = get_labels(
            meta, "IMG_PER1_20190217152904_ORT_P_000659.TIF", database_file)
        mask = get_income_level_segmentation_mask(
            labels_dict, levels_dict, (meta['width'], meta['height']), meta['transform'])
        out_filename = os.path.join(masks_dir, filename[filename.rfind("/") + 1:])
        pickle.dump(mask, open(out_filename, "wb"))
def __init__(self, model_dir, result_dir="results/"):
    self.model_dir = model_dir
    self.CONFIG = get_config(os.path.join(model_dir, "config.yaml"))
    self.embSize = self.CONFIG["emb_size"]
    self.device = self.CONFIG["device"]
    self.label2int, self.int2label = get_labels(os.path.join(model_dir, "labels.txt"))
    self.result_dir = result_dir
    self.createResultDir()
    self.featureExtractorPath = os.path.join(model_dir, "extractor.pth")
    self.featureExtractor = self.getModel()
    self.classifierPath = os.path.join(model_dir, "classifier.pkl")
    self.classifier = self.get_classifier()
    self.craft_net = load_craftnet_model(cuda=True, weight_path=self.CONFIG["CRAFT_WEIGHT"])
    self.threshold = [
        0.15658274643186937, 0.36065899509540494, 0.38856512542345073,
        0.40749227647561814, 0.16654823949181954, 0.1937299593585302,
        0.11403658569607081, 0.45526403121798326
    ]
def main():
    graph = utils.load_graph()
    position = utils.get_positions(graph)
    utils.make_dir('images/louvain')

    true_communities = utils.get_labels(graph, list(graph.nodes))
    utils.plot_communities(graph, position, true_communities, labels=True,
                           title='Butterfly Similarity Network - True Communities',
                           path='images/louvain/communities_true.png')

    communities = utils.group_communities(louvain_clustering(graph))
    utils.plot_communities(graph, position, communities, labels=False,
                           title='Butterfly Similarity Network - Louvain Communities',
                           path='images/louvain/communities_louvain.png')

    graph_nodes = sorted(list(graph.nodes))
    predictions = utils.predict_majority_class(graph, communities)
    preds = [predictions[n] for n in graph_nodes]
    labels = [graph.nodes[n]['label'] for n in graph_nodes]
    utils.accuracy(preds, labels)
    utils.confusion_matrix(preds, labels,
                           'Confusion Matrix - Majority Label Predictions from Louvain Communities',
                           'images/louvain/cm_louvain.png')
def accuracy(model, loader, device):
    correct_count = 0
    total_count = 0
    model.to(device)
    for images, true_labels in tqdm(loader, desc='Accuracy Test'):
        total_count += len(images)
        images = images.to(device)
        true_labels = true_labels.to(device)
        predicted_labels = utils.get_labels(model, images).detach()
        correct = torch.eq(predicted_labels, true_labels)
        correct_count += len(torch.nonzero(correct))
    return correct_count / total_count
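
# utils.get_labels(model, images) is not shown above; a minimal sketch under the
# assumption that the model returns per-class scores and the predicted label is the
# argmax over the class dimension (hypothetical helper for illustration only).
import torch

def get_labels(model, images):
    """Return the predicted class index for each image in the batch."""
    with torch.no_grad():
        logits = model(images)       # shape: (batch_size, num_classes)
    return logits.argmax(dim=1)      # shape: (batch_size,)
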
def predict(model, generator, model_name, data_path):
    '''
    Predict data from generator and visualize prediction samples

    Args:
        @model: CNN model
        @generator: generator of testing data
        @model_name: str, name of the model shown on the plot
        @data_path: path where the data is saved
    Return:
        scores: list, [loss, accuracy]
        true_labels: true labels
        predict_labels: predicted labels
    Call:
        show_prediction(images, labels, labels_pre, model_name)
        get_lookup_tables(path, generator)
        get_labels(path, indexes, generator)
    '''
    # evaluate model
    print('Start evaluating...')
    scores = model.evaluate(x=generator)
    print('Loss:{loss:.2f}\nAccuracy:{accuracy:.2f}'.format(loss=scores[0], accuracy=scores[1]))

    # predict with a loop to collect labels, because model.evaluate does not return labels
    print('Start predicting...will take a few minutes')
    # create lists to store labels from each iteration
    true_labels = []
    predict_labels = []
    # get lookup table: dictionary {index: class_name}
    lookup_table = get_lookup_tables(data_path, generator)
    for i in range(generator.samples // generator.batch_size):
        x, y = generator.next()
        y_pre = model.predict(x=x)
        # generate labels
        y_labels = get_labels(data_path, y, generator)
        y_pre_index = y_pre.argmax(axis=1)
        y_pre_labels = [lookup_table[i] for i in y_pre_index]
        # append results to the lists
        true_labels += y_labels
        predict_labels += y_pre_labels

    # plot prediction examples
    show_prediction(x, y_labels, y_pre_labels, model_name)

    return scores, true_labels, predict_labels
def train():
    train, _ = load_dataset()
    train_X = train["tweet"]
    train_Y = get_labels(train)
    n_samples = len(train_Y)
    X_train, _, y_train, _ = train_test_split(train_X[:n_samples],
                                              train_Y[:n_samples],
                                              test_size=0.2,
                                              random_state=1)
    t0 = time()
    feature_type = ["wordcount", "char"]
    rmse_avg = do_cross_val(X_train, y_train, feature_type, nfolds=3)
    print("Average RMSE %.6f" % rmse_avg)
    duration = time() - t0
    print("training time: %fs" % duration)
def train_model():
    train, _ = load_dataset()
    train_X = train["tweet"]
    train_Y = get_labels(train)
    n_samples = len(train_Y)
    X_train, _, y_train, _ = train_test_split(train_X[:n_samples],
                                              train_Y[:n_samples],
                                              test_size=0.2,
                                              random_state=1)
    scorer = make_scorer(rmse_score, greater_is_better=False)

    pipeline, parameters = get_ridge_model()
    # pipeline, parameters = get_three_predictor_model()
    # pipeline, parameters = get_elasticnet_model()
    # pipeline, parameters = get_three_predictor_model2()
    # pipeline, parameters = get_three_predictor_model3()
    # pipeline, parameters = get_ridge_model2()
    # pipeline, parameters = get_ridge_model3()
    # pipeline, parameters = get_advanced_ridge()

    do_gridsearch(X_train, y_train, pipeline, parameters, scorer)
# Adding images for testing
import cv2
from keras.models import load_model
import numpy as np
from statistics import mode

from utils import preprocess_input
from utils import get_labels

# parameters
image_path = '../images/test_image.jpg'
detection_model_path = '../trained_models/detection_models/haarcascade_frontalface_default.xml'
emotion_model_path = '../trained_models/emotion_models/simple_CNN.530-0.65.hdf5'
gender_model_path = '../trained_models/gender_models/simple_CNN.81-0.96.hdf5'
emotion_labels = get_labels('fer2013')
gender_labels = get_labels('imdb')
font = cv2.FONT_HERSHEY_SIMPLEX
x_offset_emotion = 20
y_offset_emotion = 40
x_offset = 30
y_offset = 60

# loading models
face_detection = cv2.CascadeClassifier(detection_model_path)
emotion_classifier = load_model(emotion_model_path)
gender_classifier = load_model(gender_model_path)

frame = cv2.imread(image_path)
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
def train_final():
    """
    Train a model using grid search for parameter estimation
    """
    train, test = load_dataset()
    train_X = train["tweet"]
    train_Y = get_labels(train)
    test_X = test["tweet"]

    tfidf1 = TfidfVectorizer(
        max_df=0.6,
        min_df=0.0000003,
        stop_words="english",
        strip_accents="unicode",
        token_pattern=r"\w{1,}",
        max_features=5000,
        norm="l2",
        use_idf=False,
        smooth_idf=False,
        ngram_range=(1, 3),
    )
    tfidf2 = TfidfVectorizer(
        max_df=0.6,
        analyzer="char",
        min_df=0.00001,
        stop_words="english",
        strip_accents="unicode",
        norm="l2",
        max_features=5000,
        ngram_range=(1, 7),
        smooth_idf=False,
        use_idf=False,
    )
    tfidf1.fit(np.hstack((train_X, test_X)))
    tfidf2.fit(np.hstack((train_X, test_X)))
    train_X1 = tfidf1.transform(train_X)
    train_X2 = tfidf2.transform(train_X)
    train_X = hstack([train_X1, train_X2]).tocsr()

    n_samples = len(train_Y)
    X_train, _, y_train, _ = train_test_split(train_X[:n_samples],
                                              train_Y[:n_samples],
                                              test_size=0.2,
                                              random_state=1)
    scorer = make_scorer(rmse_score, greater_is_better=False)

    pipeline, parameters = get_advanced_ridge2()
    # pipeline, parameters = get_three_predictor_model()
    # pipeline, parameters = get_elasticnet_model()
    # pipeline, parameters = get_three_predictor_model2()
    # pipeline, parameters = get_three_predictor_model3()
    # pipeline, parameters = get_ridge_model2()
    # pipeline, parameters = get_ridge_model3()
    # pipeline, parameters = get_advanced_ridge()

    best_estimator = do_gridsearch(X_train, y_train, pipeline, parameters,
                                   n_jobs=5, verbose=1, scoring=scorer)

    # predict test data
    test_1 = tfidf1.transform(test_X)
    test_2 = tfidf2.transform(test_X)
    test_d = hstack([test_1, test_2])
    final_preds = best_estimator.predict(test_d)
    save_prediction_subs(test["id"], final_preds)