Esempio n. 1
0
    def __init__(self, args, pretrained_word_matrix):
        """Build the char-CNN, word embedding, BiLSTM, projection and CRF layers.

        Args:
            args: Hyper-parameter namespace. This constructor reads
                ``max_word_len``, ``kernel_lst``, ``num_filters``,
                ``char_vocab_size``, ``char_emb_dim``, ``final_char_dim``,
                ``word_vocab_size``, ``word_emb_dim`` and ``hidden_dim`` from
                it, and also hands it to ``get_labels``.
            pretrained_word_matrix: Weights passed to
                ``nn.Embedding.from_pretrained``. When ``None`` a fresh
                embedding is created and filled with ``uniform_(-0.25, 0.25)``.
        """
        super(BiLSTM_CNN_CRF, self).__init__()
        self.args = args

        # Character-level encoder, configured entirely from ``args``.
        char_cnn_config = dict(
            max_word_len=args.max_word_len,
            kernel_lst=args.kernel_lst,
            num_filters=args.num_filters,
            char_vocab_size=args.char_vocab_size,
            char_emb_dim=args.char_emb_dim,
            final_char_dim=args.final_char_dim,
        )
        self.char_cnn = CharCNN(**char_cnn_config)

        # Word-level embedding: reuse the supplied matrix or start from scratch.
        if pretrained_word_matrix is None:
            embedding = nn.Embedding(args.word_vocab_size,
                                     args.word_emb_dim,
                                     padding_idx=0)
            nn.init.uniform_(embedding.weight, -0.25, 0.25)
        else:
            embedding = nn.Embedding.from_pretrained(pretrained_word_matrix)
        self.word_emb = embedding

        # The LSTM consumes word and character features side by side. Each
        # direction gets hidden_dim // 2 units because the bidirectional
        # output concatenates both directions.
        lstm_input_dim = args.word_emb_dim + args.final_char_dim
        self.bi_lstm = nn.LSTM(input_size=lstm_input_dim,
                               hidden_size=args.hidden_dim // 2,
                               bidirectional=True,
                               batch_first=True)

        # Projection to one score per label, followed by the CRF over labels.
        self.output_linear = nn.Linear(args.hidden_dim, len(get_labels(args)))
        self.crf = CRF(num_tags=len(get_labels(args)), batch_first=True)
Esempio n. 2
0
    def process_image(self, image_path, waitTime=0):
        """Label every detected face with predicted gender and emotion, then show it.

        Faces come from ``self.fd.process``. Each face is cropped with a fixed
        margin, resized to 48x48 and passed to ``self.gender_classifier``
        (colour crop) and ``self.emotion_classifier`` (grayscale crop). The
        annotated frame is shown in an OpenCV window.

        Args:
            image_path: A path string, which is read with ``cv2.imread``, or
                an already-loaded image, which is used as-is.
                NOTE(review): the colour conversions below assume BGR channel
                order -- confirm for callers passing arrays.
            waitTime: Value forwarded to ``cv2.waitKey`` after the frame is
                displayed.

        Raises:
            IOError: If ``image_path`` is a string and the image cannot be read.
        """
        emotion_labels = get_labels('fer2013')
        gender_labels = get_labels('imdb')
        font = cv2.FONT_HERSHEY_SIMPLEX

        # Margins (in pixels) added around the detected box before cropping.
        x_offset_emotion = 20
        y_offset_emotion = 40
        x_offset = 30
        y_offset = 60

        if isinstance(image_path, str):
            frame = cv2.imread(image_path)
            if frame is None:
                # cv2.imread signals failure by returning None; fail here with
                # a clear message instead of inside cvtColor.
                raise IOError('Could not read image: {}'.format(image_path))
        else:
            frame = image_path

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = self.fd.process(frame)
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        for (x, y, w, h) in faces:
            # Clamp the crop origin at 0: a negative start index would wrap
            # around to the other side of the image and yield an empty or
            # wrong crop for faces close to the top/left border.
            face = frame[max(y - y_offset, 0):(y + h + y_offset),
                         max(x - x_offset, 0):(x + w + x_offset)]
            gray_face = gray[max(y - y_offset_emotion, 0):(y + h + y_offset_emotion),
                             max(x - x_offset_emotion, 0):(x + w + x_offset_emotion)]
            try:
                face = cv2.resize(face, (48, 48))
                gray_face = cv2.resize(gray_face, (48, 48))
            except cv2.error:
                # Degenerate (empty) crop: skip this detection.
                continue

            face = np.expand_dims(face, 0)
            face = preprocess_input(face)
            gender_label_arg = np.argmax(self.gender_classifier.predict(face))
            gender = gender_labels[gender_label_arg]

            gray_face = preprocess_input(gray_face)
            gray_face = np.expand_dims(gray_face, 0)
            gray_face = np.expand_dims(gray_face, -1)
            emotion_label_arg = np.argmax(
                self.emotion_classifier.predict(gray_face))
            emotion = emotion_labels[emotion_label_arg]

            # One colour for the first gender label, another for the rest.
            if gender == gender_labels[0]:
                gender_color = (0, 0, 255)
            else:
                gender_color = (255, 0, 0)

            cv2.rectangle(frame, (x, y), (x + w, y + h), gender_color, 2)
            cv2.putText(frame, emotion, (x, y - 40), font,
                        0.5, gender_color, 2, cv2.LINE_AA)
            cv2.putText(frame, gender, (x, y - 40 + 20), font,
                        0.5, gender_color, 2, cv2.LINE_AA)

        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        cv2.imshow('predicted test image', frame)
        cv2.waitKey(waitTime)
Esempio n. 3
0
def main():
    """Parse CLI arguments, train a LINE model and optionally checkpoint it.

    Labels are loaded through ``get_labels`` only when ``args.label`` is set;
    otherwise ``None`` is handed to ``model.train``. A checkpoint named
    ``model.ckpt`` is written under ``args.save`` unless that option is the
    empty string.
    """
    args = parse_args()
    logging.info("\n\targs: {}\n".format(args))

    if args.label is None:
        labels = None
    else:
        logging.info("Init labels ...")
        labels = get_labels(args.label)

    logging.info("Init data loader ...")
    loader = DataLoader(args.link, args.node_count)

    graph = tf.Graph()
    with graph.as_default(), tf.Session() as sess:
        sample_budget = args.batch_size * args.num_batches
        logging.info("Initing LINE model")
        line_model = LINE(args, sess, args.node_count, sample_budget)
        logging.info("training ...")
        line_model.train(loader, labels)

        wants_checkpoint = args.save != ""
        if wants_checkpoint:
            logging.info("Saving ...")
            checkpoint_path = os.path.join(args.save, "model.ckpt")
            line_model._saver.save(sess,
                                   checkpoint_path,
                                   global_step=args.epochs)
Esempio n. 4
0
    def __init__(self,
                 args,
                 train_dataset=None,
                 dev_dataset=None,
                 test_dataset=None):
        """Set up datasets, vocabularies, the BiLSTM-CNN-CRF model and its device.

        Args:
            args: Run configuration. Attributes read here: ``ignore_index``,
                ``no_w2v``, ``no_cuda``, ``write_pred`` and ``pred_dir``; it
                is also forwarded to the label/vocabulary helpers and the model.
            train_dataset: Stored as ``self.train_dataset``.
            dev_dataset: Stored as ``self.dev_dataset``.
            test_dataset: Stored as ``self.test_dataset``.

        Side effects:
            When ``args.write_pred`` is set and ``args.pred_dir`` exists, that
            directory is deleted recursively.
        """
        self.args = args
        self.train_dataset, self.dev_dataset, self.test_dataset = (
            train_dataset, dev_dataset, test_dataset)

        labels = get_labels(args)
        self.label_lst = labels
        self.num_labels = len(labels)

        # Padding positions carry the loss's ignore index so that only real
        # label ids contribute to the loss later.
        self.pad_token_label_id = args.ignore_index

        word_vocab, char_vocab, _, _ = load_vocab(args)
        self.word_vocab = word_vocab
        self.char_vocab = char_vocab

        # Pretrained word vectors are optional.
        if args.no_w2v:
            self.pretrained_word_matrix = None
        else:
            self.pretrained_word_matrix = load_word_matrix(args, word_vocab)

        self.model = BiLSTM_CNN_CRF(args, self.pretrained_word_matrix)

        # Prefer the GPU unless it is unavailable or explicitly disabled.
        cuda_wanted = torch.cuda.is_available() and not args.no_cuda
        self.device = "cuda" if cuda_wanted else "cpu"
        self.model.to(self.device)

        if args.write_pred:
            self.test_texts = get_test_texts(args)
            # Start from a clean prediction directory.
            if os.path.exists(args.pred_dir):
                shutil.rmtree(args.pred_dir)
        else:
            self.test_texts = None
    def __init__(self,
                 data_path,
                 background_id=None,
                 class_names=None,
                 dataset_name=None,
                 suffix='.jpg',
                 use_bounding_boxes=False,
                 use_classes=False):
        """Build the class-index mappings and parse the XML annotations.

        Args:
            data_path: Stored as ``self.path_prefix``.
            background_id: Where to add a ``'background'`` class when
                ``class_names`` is given: ``-1`` appends it, any other
                non-``None`` value is used as the insertion index, ``None``
                adds nothing.
            class_names: Optional list of class names. When ``None`` the
                mapping returned by ``get_labels`` for ``'german_open_2017'``
                is used. The caller's list is not modified.
            dataset_name: Not read by this constructor.
                NOTE(review): the dataset passed to ``get_labels`` is
                hard-coded -- confirm whether this argument should be used.
            suffix: Stored as ``self.suffix``.
            use_bounding_boxes: Stored as ``self.use_bounding_boxes``.
            use_classes: Not read by this constructor.
        """
        self.path_prefix = data_path
        self.background_id = background_id
        # Set for both branches: previously only the default-label branch
        # stored the suffix, so instances built with explicit class_names had
        # no such attribute.
        self.suffix = suffix

        if class_names is None:
            self.arg_to_class = get_labels(dataset_name='german_open_2017')
            self.class_to_arg = {
                value: key
                for key, value in self.arg_to_class.items()
            }
            self.class_names = list(self.class_to_arg.keys())
        else:
            # Work on a copy so the caller's list is not mutated.
            class_names = list(class_names)
            if background_id is not None and background_id != -1:
                class_names.insert(background_id, 'background')
            elif background_id == -1:
                class_names.append('background')
            keys = np.arange(len(class_names))
            self.arg_to_class = dict(zip(keys, class_names))
            # Mirror the default branch so both mappings always exist.
            self.class_to_arg = {
                value: key
                for key, value in self.arg_to_class.items()
            }
            self.class_names = class_names

        self.data = dict()
        self.use_bounding_boxes = use_bounding_boxes
        self._preprocess_XML()
Esempio n. 6
0
def main():
    """Train an LSTM-CRF tagger on GloVe vectors, then evaluate and print it.

    Command-line options: ``--data_dir``, ``--w2v_path`` and ``--labels`` are
    required; ``--batch_size``, ``--epochs``, ``--logging_steps`` and
    ``--learning_rate`` have defaults. The chosen device is stored on the
    parsed namespace as ``args.device`` before it is passed on.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, required=True)
    parser.add_argument('--w2v_path', type=str, required=True)
    parser.add_argument('--labels', type=str, required=True)
    parser.add_argument('--batch_size', default=32, type=int)
    parser.add_argument('--epochs', default=3, type=int)
    parser.add_argument('--logging_steps', default=20, type=int)
    parser.add_argument('--learning_rate', default=5e-3, type=float)
    args = parser.parse_args()

    # Fall back to the CPU when CUDA is unavailable instead of failing later
    # at model.to().
    args.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')

    labels = get_labels(args.labels)
    glove = GloVe(cache=args.w2v_path)

    # model
    model = LstmCrf(w2v=glove, num_tags=len(labels), hidden_dim=512)
    model.to(args.device)

    # dataset
    train_dataset = NerDataset(args.data_dir, labels, glove, mode='train')
    eval_dataset = NerDataset(args.data_dir, labels, glove, mode='dev')

    # train
    train(args, model, train_dataset)

    # eval -- ``eval`` here is whatever this module has in scope under that
    # name; the call signature shown is not the builtin's.
    result = eval(args, model, eval_dataset, labels)

    print(result)
Esempio n. 7
0
    def __init__(self,
                 args,
                 train_dataset=None,
                 dev_dataset=None,
                 test_dataset=None):
        """Store the datasets, build the model from its config and move it to a device.

        Args:
            args: Run configuration. Attributes read here: ``ignore_index``,
                ``model_type``, ``model_name_or_path``, ``task`` and
                ``no_cuda``; it is also passed to ``get_labels`` and to the
                model constructor.
            train_dataset: Stored as ``self.train_dataset``.
            dev_dataset: Stored as ``self.dev_dataset``.
            test_dataset: Stored as ``self.test_dataset``.
        """
        self.args = args
        self.train_dataset, self.dev_dataset, self.test_dataset = (
            train_dataset, dev_dataset, test_dataset)

        labels = get_labels(args)
        self.label_lst = labels
        self.num_labels = len(labels)
        # Padding positions carry the loss's ignore index so that only real
        # label ids contribute to the loss later.
        self.pad_token_label_id = args.ignore_index

        # MODEL_CLASSES entries are triples; the third element is not needed.
        config_cls, model_cls, _ = MODEL_CLASSES[args.model_type]
        self.config_class = config_cls
        self.model_class = model_cls

        self.bert_config = config_cls.from_pretrained(
            args.model_name_or_path,
            num_labels=self.num_labels,
            finetuning_task=args.task)
        self.model = model_cls(self.bert_config, args)

        # Prefer the GPU unless it is unavailable or explicitly disabled.
        cuda_wanted = torch.cuda.is_available() and not args.no_cuda
        self.device = "cuda" if cuda_wanted else "cpu"
        self.model.to(self.device)
Esempio n. 8
0
def Nonlinear_Trainer():
    """Fit one non-linear classifier per entry of ``category``.

    Training indices come from ``load_train_data(data_dir)`` (the images it
    returns are discarded immediately) and the feature matrix is read from
    ``./train_bow.npy``. For each class, the targets are derived with
    ``get_labels`` from ``<class>_train.txt`` in ``data_dir``. Per-class and
    average training accuracy are printed.

    Returns:
        dict: Maps each class name to the classifier returned by
        ``nonlinear_classifier``.
    """
    print("Load the training data...")
    start_time = time.time()
    train_imgs, train_idxs = load_train_data(data_dir)
    # Only the indices are needed; drop the images right away.
    del train_imgs
    print("{:.4f} seconds".format(time.time() - start_time))

    print("Extract the image features...")
    train_features = np.load('./train_bow.npy')

    print('Train the classifiers...')
    accuracy = 0
    models = {}

    for class_name in category:
        target_idxs = np.array([
            read_txt(os.path.join(data_dir, '{}_train.txt'.format(class_name)))
        ])
        target_labels = get_labels(train_idxs, target_idxs)

        models[class_name] = nonlinear_classifier(train_features,
                                                  target_labels)
        train_accuracy = models[class_name].score(train_features,
                                                  target_labels)
        # Message fixed: it used to read "zClassifier".
        print('{} Classifier train accuracy:  {:.4f}'.format(
            class_name, train_accuracy))
        accuracy += train_accuracy

    print('Average train accuracy: {:.4f}'.format(accuracy / len(category)))
    del train_features, target_labels, target_idxs

    return models
Esempio n. 9
0
def Nonlinear_Test(models):
    """Score each per-class classifier on the validation split and print the results.

    Validation indices come from ``load_val_data(data_dir)`` (the images are
    discarded) and the feature matrix is read from ``./val_bow.npy``.

    Args:
        models: Mapping from class name to a fitted classifier exposing
            ``score(features, labels)``; it is indexed with every entry of
            ``category``.
    """
    print("Load the validation data...")
    started = time.time()
    val_imgs, val_idxs = load_val_data(data_dir)
    print("{:.4f} seconds".format(time.time() - started))

    # Only the indices are used below.
    del val_imgs

    print("Extract the image features...")
    val_features = np.load('./val_bow.npy')

    print('Test the classifiers...')
    scores = []
    for class_name in category:
        list_path = os.path.join(data_dir, '{}_val.txt'.format(class_name))
        target_idxs = np.array([read_txt(list_path)])
        target_labels = get_labels(val_idxs, target_idxs)

        val_accuracy = models[class_name].score(val_features, target_labels)
        print('{} Classifier validation accuracy:  {:.4f}'.format(
            class_name, val_accuracy))
        scores.append(val_accuracy)

    del val_features, target_idxs, target_labels

    mean_accuracy = sum(scores) / len(category)
    print('Average validation accuracy: {:.4f}'.format(mean_accuracy))
Esempio n. 10
0
def train_model(train_inputs, train_labels, test_data, test_labels, model,
                optimizer, criterion, kmer_size, with_attention):
    """Train for ``Config.num_epochs`` epochs, testing a saved checkpoint after each.

    After every epoch the model's state dict is saved to
    ``Config.test_model_name`` and ``test.test`` is run against that file.

    NOTE(review): the ``test_labels`` argument is replaced straight away by
    labels generated from ``Config.positive_test_sample_size`` and
    ``Config.negative_test_sample_size``, and ``with_attention`` is never
    read -- confirm both are intended.

    Returns:
        tuple: ``(losses, train_accuracies, test_accuracies)``, one entry per
        epoch in each list.
    """
    print('Training the model:')
    began = time.time()
    losses, train_accuracies, test_accuracies = [], [], []

    test_labels = utils.get_labels(Config.positive_test_sample_size,
                                   Config.negative_test_sample_size)
    bar = Bar('Processing', max=Config.num_epochs)

    for _ in range(Config.num_epochs):
        loss, acc = train_epoch(model, train_inputs, train_labels, optimizer,
                                criterion)
        losses.append(loss)
        # Accuracy in percent: first entry of acc over the sum of its first two.
        train_accuracies.append(100 * (acc[0] / (acc[0] + acc[1])))

        # The test routine loads the weights from disk, so save them first.
        torch.save(model.state_dict(), Config.test_model_name)
        epoch_test_accuracy = test.test(test_data, test_labels, kmer_size,
                                        Config.test_model_name)
        test_accuracies.append(epoch_test_accuracy)
        bar.next()

    bar.finish()
    torch.save(model.state_dict(), Config.test_model_name)

    elapsed_minutes = (time.time() - began) / 60
    print('Finished. Training took %.3f' % elapsed_minutes, 'minutes.')
    return losses, train_accuracies, test_accuracies
Esempio n. 11
0
def gen_csv_report(test_file, pred_file, report_file=None):
    """Write a confusion table (true label x predicted label) as CSV.

    Both input files are read line by line in lockstep; the last
    tab-separated field of each line is taken as the label. Rows of the
    report are true labels and columns are predicted labels, in the order of
    the labels returned by ``get_labels`` for ``corpus/labels.lst``.

    Args:
        test_file: Path to the UTF-8 file holding the reference labels.
        pred_file: Path to the UTF-8 file holding the predicted labels.
        report_file: Path of the CSV file to write. The default ``None`` is
            passed to ``open`` unchanged, so callers must supply a path.

    Raises:
        KeyError: If a label found in either file is not a known label.
    """
    label2idx, _ = get_labels(
        os.path.join(BASE_PATH, 'corpus/labels.lst'))

    # Strip once and use the same spelling for building, counting and writing
    # the table (it used to be built with stripped keys but read with raw ones).
    labels = [key.strip() for key in label2idx]

    csv_dict = collections.OrderedDict(
        (row, collections.OrderedDict((col, 0) for col in labels))
        for row in labels)

    # Context managers make sure both inputs are closed.
    with open(test_file, 'r', encoding='utf-8') as f_test, \
            open(pred_file, 'r', encoding='utf-8') as f_pred:
        for test_line, pred_line in zip(f_test, f_pred):
            # rstrip('\n') rather than [:-1]: a final line without a trailing
            # newline must not lose the last character of its label.
            test_label = test_line.rstrip('\n').split('\t')[-1]
            pred_label = pred_line.rstrip('\n').split('\t')[-1]
            csv_dict[test_label][pred_label] += 1

    with open(report_file, 'w', encoding='utf-8') as f:
        # Header row: a blank corner cell followed by the predicted labels.
        f.write(' ' + ''.join(',' + col for col in labels) + '\n')
        for row in labels:
            cells = ''.join(',' + str(csv_dict[row][col]) for col in labels)
            f.write(row + cells + '\n')
Esempio n. 12
0
 def __init__(self,
              args,
              train_dataset=None,
              dev_dataset=None,
              test_dataset=None):
     """Store the datasets, load the pretrained model and move it to a device.

     Args:
         args: Run configuration. Attributes read here: ``model_type``,
             ``model_name_or_path``, ``task``, ``no_cuda``, ``write_pred``
             and ``pred_dir``; it is also passed to ``get_labels`` and
             ``get_test_texts``.
         train_dataset: Stored as ``self.train_dataset``.
         dev_dataset: Stored as ``self.dev_dataset``.
         test_dataset: Stored as ``self.test_dataset``.

     Side effects:
         When ``args.write_pred`` is set and ``args.pred_dir`` exists, that
         directory is deleted recursively.
     """
     self.args = args
     self.train_dataset, self.dev_dataset, self.test_dataset = (
         train_dataset, dev_dataset, test_dataset)

     labels = get_labels(args)
     self.label_lst = labels
     self.num_labels = len(labels)
     # Padding positions carry the cross-entropy ignore index so that only
     # real label ids contribute to the loss later.
     self.pad_token_label_id = torch.nn.CrossEntropyLoss().ignore_index

     # MODEL_CLASSES entries are triples; the third element is not needed.
     config_cls, model_cls, _ = MODEL_CLASSES[args.model_type]
     self.config_class = config_cls
     self.model_class = model_cls

     # Label mappings stored in the config: string index -> label and back.
     id2label, label2id = {}, {}
     for position, label in enumerate(labels):
         id2label[str(position)] = label
         label2id[label] = position

     self.config = config_cls.from_pretrained(args.model_name_or_path,
                                              num_labels=self.num_labels,
                                              finetuning_task=args.task,
                                              id2label=id2label,
                                              label2id=label2id)
     self.model = model_cls.from_pretrained(args.model_name_or_path,
                                            config=self.config)

     # Prefer the GPU unless it is unavailable or explicitly disabled.
     cuda_wanted = torch.cuda.is_available() and not args.no_cuda
     self.device = "cuda" if cuda_wanted else "cpu"
     self.model.to(self.device)

     if args.write_pred:
         self.test_texts = get_test_texts(args)
         # Start from a clean prediction directory.
         if os.path.exists(args.pred_dir):
             shutil.rmtree(args.pred_dir)
     else:
         self.test_texts = None
Esempio n. 13
0
def split_dataset(*guids, test_size, seed):
    """Split guids into per-label train/test groups and dump both as JSON.

    Every guid is keyed by a ``'{}_{}'`` string built from the tuple that
    ``get_labels`` yields for it. Labels with fewer than two guids are
    skipped. For each remaining label, ``max(int(n * test_size), 1)``
    positions are drawn for the test group and the rest go to train.

    NOTE(review): ``random.choice(n, k, replace=False)`` is the NumPy call
    signature -- confirm that ``random`` here is ``numpy.random`` and not the
    standard-library module.

    Args:
        *guids: Identifiers to split.
        test_size: Fraction of each label's guids to put in the test group.
        seed: Seed passed to ``random.seed``.

    Returns:
        tuple: ``(train_set, test_set)``, each mapping label -> list of guids.
        Both are also written to ``train.json`` / ``test.json`` under
        ``DATA_FOLDER/set``.
    """
    random.seed(seed)
    label_tuples = get_labels(*guids, vendor=True, train=None, flatten=True)
    labels = ['{}_{}'.format(*t) for t in label_tuples]

    # Group the guids by their label key.
    stats = defaultdict(list)
    for guid, label in zip(guids, labels):
        stats[label].append(guid)

    train_set, test_set = defaultdict(list), defaultdict(list)
    for label, members in stats.items():
        num_sample = len(members)
        if num_sample < 2:
            continue
        num_test = max(int(num_sample * test_size), 1)
        idx_test = random.choice(num_sample, num_test, replace=False)
        for position, guid in enumerate(members):
            bucket = test_set if position in idx_test else train_set
            bucket[label].append(guid)

    output_dir = os.path.join(DATA_FOLDER, 'set')
    for file_name, subset in (('train.json', train_set),
                              ('test.json', test_set)):
        with open(os.path.join(output_dir, file_name), 'w') as f:
            json.dump(subset, f)
    return train_set, test_set
Esempio n. 14
0
def KobertModelLoader():
    """Parse the prediction CLI options and load the model into module globals.

    Rebinds the module-level names ``parser``, ``pred_config``, ``args``,
    ``device``, ``model`` and ``label_lst``.
    """
    global parser, pred_config, args, device, model, label_lst

    init_logger()

    # (flag, keyword arguments) for every supported option.
    option_specs = [
        ("--model_dir",
         dict(default="./model", type=str, help="Path to save, load model")),
        ("--batch_size",
         dict(default=32, type=int, help="Batch size for prediction")),
        ("--no_cuda",
         dict(action="store_true", help="Avoid using CUDA when available")),
    ]
    parser = argparse.ArgumentParser()
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    pred_config = parser.parse_args()

    # Load the stored training arguments, then the model and its labels.
    args = get_args(pred_config)
    device = get_device(pred_config)
    model = load_model(pred_config, args, device)
    label_lst = get_labels(args)
Esempio n. 15
0
    def __init__(self):
        """Load the datasets and labels, register the Keras callbacks and build the model.

        Two callbacks are appended to ``self.callbacks``: a TensorBoard logger
        writing to ``tensorboard_data/cnn/`` and a weights-only checkpoint
        written to ``saved_models/cnn/model.hdf5``. ``self.set_data()`` and
        ``self.define_model()`` are then called in that order.
        """
        self.datasets = get_datasets(heart_diseases, n_inputs)
        self.label_data = get_labels(self.datasets)
        self.callbacks = []

        # TensorBoard logging of histograms, the graph and embeddings.
        tensorboard_callback = tf.keras.callbacks.TensorBoard(
            log_dir="tensorboard_data/cnn/",
            histogram_freq=1,
            write_graph=True,
            embeddings_freq=1)

        # Weights-only checkpoint that keeps the best model. Whether existing
        # weights at the filepath are reloaded is controlled by the
        # module-level ``restore_model`` flag.
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath="saved_models/cnn/model.hdf5",
            save_best_only=True,
            save_weights_only=True,
            load_weights_on_restart=restore_model)

        for callback in (tensorboard_callback, checkpoint_callback):
            self.callbacks.append(callback)

        self.set_data()
        self.define_model()
Esempio n. 16
0
    def load_data(self, is_training=True):
        """Return a random training batch or the next test trajectory.

        Args:
            is_training: Compared with ``== True``. When the comparison holds
                a training batch is sampled, otherwise one test item is
                returned.

        Returns:
            Training: ``(trajectories, labels)`` for ``self.batch_size``
            indices drawn with replacement from ``self.trainidx``. Labels come
            from ``self.labels`` when it is non-empty, otherwise from
            ``utils.get_labels`` (equi-partitioned subgoals).

            Testing: ``([feature], None, done)``. ``done`` is True when the
            item was the last one in ``self.testidx``, in which case the
            cursor ``self.currenttestidx`` wraps to 0; otherwise it advances
            by one.
        """
        # Kept as an equality test on purpose: only values equal to True take
        # the training path.
        if not (is_training == True):
            feat = [self.load_feature(self.testidx[self.currenttestidx])]
            done = self.currenttestidx == len(self.testidx) - 1
            if done:
                self.currenttestidx = 0
            else:
                self.currenttestidx += 1
            return feat, None, done

        idx = np.random.choice(len(self.trainidx), size=self.batch_size)
        trajectories = [self.load_feature(self.trainidx[i]) for i in idx]
        if len(self.labels):
            # Estimated subgoals
            labels = [self.labels[self.trainidx[i]] for i in idx]
        else:
            # Equi-partition subgoals
            labels = [
                utils.get_labels(len(trajectory), self.num_subgoals)
                for trajectory in trajectories
            ]
        return trajectories, labels
Esempio n. 17
0
    def accuracy(self,
                 encoder,
                 dataloader_eval,
                 classes,
                 device=torch.device("cpu"),
                 print_summary=False):
        """Evaluate this classifier on encoded batches and collect accuracy metrics.

        Each batch is passed through ``utils.get_labels``, encoded with
        ``encoder.encode`` (depth 6 when ``self.conv_part`` is truthy, else 9)
        and fed to ``self.forward``. The hard prediction is the argmax over
        dimension 1 of the concatenated outputs. Reference labels are read
        from ``batch['Parameters']['num_Targets']``.

        Args:
            encoder: Object exposing ``eval()`` and
                ``encode(x, encoder_depth=...)``.
            dataloader_eval: Iterable of batches.
            classes: Mapping whose ``'t'`` entry is passed as ``labels=`` to
                ``confusion_matrix``.
            device: Device on which predictions are accumulated.
            print_summary: Not read by this method.

        Returns:
            dict: ``'matrix'`` (confusion matrix, or a 13x13 zero matrix when
            it cannot be computed), ``'accuracy'`` and ``'balanced_accuracy'``.
        """
        encoder_depth = 6 if self.conv_part else 9

        # Empty containers that grow with every batch.
        predictions_soft = torch.Tensor().float().to(device)
        labels = np.array([])

        print('[Evaluation of the samples...]')
        self.eval()  # Validation mode
        encoder.eval()
        with torch.no_grad():  # No need to track the gradients
            for batch in tqdm(dataloader_eval):
                # Only the first returned element (the noisy waterfalls) is used.
                noisy_waterfalls, _, _, _ = utils.get_labels(batch, device)

                encoded_waterfalls = encoder.encode(
                    noisy_waterfalls.to(device), encoder_depth=encoder_depth)

                targets = self.forward(encoded_waterfalls)
                predictions_soft = torch.cat((predictions_soft, targets),
                                             dim=0)

                # Flatten the reference values and append them to the labels
                labels = np.append(labels, batch['Parameters']['num_Targets'])

        # Hard predictions. The fallback covers the case where no prediction
        # could be reduced (e.g. an empty loader). ``Exception`` instead of a
        # bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        try:
            _, preds = torch.max(predictions_soft.data, dim=1)
        except Exception:
            preds = torch.zeros(0)

        preds = preds.cpu().numpy()

        print('[Computation of the accuracy metrics...]')
        # Best effort: keep the zero matrix when the confusion matrix fails.
        conf_matrix = np.zeros((13, 13))
        try:
            conf_matrix = confusion_matrix(labels, preds, labels=classes['t'])
        except Exception:
            pass

        result_metrics = {
            'matrix': conf_matrix,
            'accuracy': accuracy_score(labels, preds),
            'balanced_accuracy': balanced_accuracy_score(labels, preds)
        }

        return result_metrics
Esempio n. 18
0
 def __init__(self, *args, **kwargs):
     """Set up the MotionSensor service, the classification engine and labels.

     All positional and keyword arguments are forwarded unchanged to the
     parent constructor. ``retrain()`` runs before the labels are loaded and
     its result is stored as ``self.is_trained``.
     """
     super().__init__(*args, **kwargs)

     # Expose the 'MotionDetected' characteristic of a preloaded service.
     motion_service = self.add_preload_service('MotionSensor')
     self.char_detected = motion_service.configure_char('MotionDetected')

     model_path = "./models/classify.tflite"
     self.engine = ClassificationEngine(model_path)
     self.is_trained = retrain()
     self.labels = get_labels()
     self.is_running = True

     startup_message = self.setup_message()
     logging.info(startup_message)
Esempio n. 19
0
def test(models, test_data, configs, epochs=50):
    """Print the configuration and test accuracy of every model.

    Args:
        models: Models exposing ``predict``; paired positionally with the
            labels that ``get_labels(configs)`` returns.
        test_data: ``(x_test, y_test)`` pair.
        configs: Passed to ``get_labels`` to obtain one description string
            per model.
        epochs: Only used in the printed "Epoch N" text.
    """
    x_test, y_test = test_data
    descriptions = get_labels(configs)
    indent = "\n        "
    for model, label in zip(models, descriptions):
        y_pred = model.predict(x_test)
        # One setting per line: " -" becomes ":" and ", " separates settings.
        settings = label.replace(" -", ":").split(", ")
        print("Model")
        print("    Configuration\n        ", end="")
        print(*settings, sep=indent)
        print("    Accuracy")
        percent = accuracy(y_test, y_pred) * 100
        print(f"        Epoch {epochs}: {percent:.2f}%")
Esempio n. 20
0
def set_test(model, test_iter):
    """Run the model over the test iterator and report averaged P/R/F1.

    Predicted and reference tag ids are mapped to tag strings with the labels
    from ``get_labels()``. Precision, recall and F1 from
    ``classification_report`` are then summed over the tags in
    ``config.tags`` other than ``[PAD]``, ``[CLS]``, ``[SEP]`` and ``O``,
    and divided by the number of those tags.

    Args:
        model: Model called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask``; its output is iterated as one sequence of tag
            ids per sample.
        test_iter: Iterable of batches; its ``is_test`` flag is forced to True.

    Returns:
        tuple: ``(precision, recall, f1)``.
    """
    if not test_iter.is_test:
        test_iter.is_test = True
    labels = get_labels()
    idx2tag = dict(enumerate(labels))
    model.eval()
    with torch.no_grad():
        true_tags, pred_tags = [], []
        for input_ids_list, input_mask_list, segment_ids_list, label_ids_list, tokens_list in tqdm(
                test_iter):
            input_ids = list2ts2device(input_ids_list)
            input_mask = list2ts2device(input_mask_list)
            segment_ids = list2ts2device(segment_ids_list)
            batch_output = model(input_ids=input_ids,
                                 token_type_ids=segment_ids,
                                 attention_mask=input_mask)

            # Restore the true (unpadded) length of every label sequence.
            # Iterate over the samples actually present instead of
            # range(config.batch_size), which raised IndexError whenever the
            # final batch was smaller than the configured batch size.
            real_batch_tags = [
                label_ids[:int(mask.sum())]
                for mask, label_ids in zip(input_mask, label_ids_list)
            ]

            # Flatten both sides to lists of tag strings.
            pred_tags.extend([
                idx2tag.get(idx) for indices in batch_output for idx in indices
            ])
            true_tags.extend([
                idx2tag.get(idx) for indices in real_batch_tags
                for idx in indices
            ])
            assert len(pred_tags) == len(
                true_tags), 'len(pred_tags) is not equal to len(true_tags)!'

        # Average the per-tag metrics over the entity tags only.
        target_names = set(config.tags) - {"[PAD]", "[CLS]", "[SEP]", "O"}
        evaluation_dict = classification_report(true_tags,
                                                pred_tags,
                                                digits=4,
                                                output_dict=True)
        precision = 0
        recall = 0
        f1 = 0
        for key in evaluation_dict.keys():
            if key in target_names:
                precision += evaluation_dict[key]['precision']
                recall += evaluation_dict[key]['recall']
                f1 += evaluation_dict[key]['f1-score']
        f1 = f1 / len(target_names)
        precision = precision / len(target_names)
        recall = recall / len(target_names)

        print('precision: {:.4f}, recall: {:.4f}, F1: {:.4f}'.format(
            precision, recall, f1))
        return precision, recall, f1
Esempio n. 21
0
def role_process_binary(input_file, output_file, is_predict=False):
    """Convert JSON-lines event records into start/end role-label sequences.

    For every input line a record is produced with ``id``, ``tokens`` (the
    characters of ``text``), ``start_labels``, ``end_labels`` and
    ``arguments``. Each argument marks its role at its first and last
    character position; when several roles share a position they are joined
    with a single space. The records are passed to ``write_file``.

    Args:
        input_file: UTF-8 file with one JSON object per line.
        output_file: Destination passed to ``write_file``.
        is_predict: When True the ``event_list`` is not read and every label
            stays ``'O'``.

    Raises:
        KeyError: If an argument's role is not among the known role labels.
    """
    label_list = get_labels(task="role", mode="classification")
    label_map = {label: i for i, label in enumerate(label_list)}

    # Read everything up front and close the file straight away.
    with open(input_file, encoding='utf-8') as f:
        rows = f.read().splitlines()

    results = []
    for count, row in enumerate(rows, start=1):
        if len(row) == 1: print(row)
        row = json.loads(row)
        if "id" not in row:
            # Fall back to the 1-based line number.
            row["id"] = count
        text_length = len(row["text"])
        start_labels = ['O'] * text_length
        end_labels = ['O'] * text_length
        arguments = []

        if not is_predict:
            for event in row["event_list"]:
                for arg in event["arguments"]:
                    role = arg['role']
                    if role not in label_map:
                        # Unknown roles are rejected, as the id lookup did.
                        raise KeyError(role)
                    argument_start_index = arg["argument_start_index"]
                    argument_end_index = (argument_start_index +
                                          len(arg['argument']) - 1)

                    if start_labels[argument_start_index] == "O":
                        start_labels[argument_start_index] = role
                    else:
                        start_labels[argument_start_index] += (" " + role)
                    if end_labels[argument_end_index] == "O":
                        end_labels[argument_end_index] = role
                    else:
                        end_labels[argument_end_index] += (" " + role)

                    # Drop the alias field from the output. A record without
                    # it is tolerated instead of raising KeyError.
                    alias = arg.pop('alias', [])
                    if alias != []: print(alias)

                    arguments.append(arg)

        results.append({
            "id": row["id"],
            "tokens": list(row["text"]),
            "start_labels": start_labels,
            "end_labels": end_labels,
            "arguments": arguments
        })
    write_file(results, output_file)
Esempio n. 22
0
def index_output_segment_bin(test_file, prediction_file, output_file):
    """Merge test records and predictions into event-list records.

    Both inputs are JSON-lines files read in lockstep. For each pair, the
    test record loses its ``start_labels``, ``end_labels``, ``tokens``,
    ``segment_ids`` and ``event_type`` fields, gains ``text`` (the joined
    tokens) and gains an ``event_list`` holding one event. That event carries
    the original ``event_type`` and the predicted arguments. The records are
    passed to ``write_file``.

    Args:
        test_file: UTF-8 JSON-lines file with the test records.
        prediction_file: UTF-8 JSON-lines file; each line has a ``labels``
            list whose entries are indexed as ``[_, start, end, role_id]``.
        output_file: Destination passed to ``write_file``.
    """
    label_list = get_labels(task='role', mode="classification")
    label_map = dict(enumerate(label_list))

    # Read both inputs up front and close them straight away.
    with open(test_file, encoding='utf-8') as f:
        tests = f.read().splitlines()
    with open(prediction_file, encoding='utf-8') as f:
        predictions = f.read().splitlines()

    results = []
    for test, prediction in zip(tests, predictions):
        test = json.loads(test)

        # These fields are input-only and must not appear in the output.
        test.pop('start_labels')
        test.pop('end_labels')
        text = ''.join(test.pop('tokens'))
        test['text'] = text
        test.pop('segment_ids')

        event = {"event_type": test.pop("event_type")}

        arguments = []
        for arg in json.loads(prediction)["labels"]:
            # Predicted positions are shifted back by one.
            # NOTE(review): presumably compensating for a leading special
            # token -- confirm against the prediction writer.
            argument_start_index = arg[1] - 1
            argument_end_index = arg[2] - 1
            arguments.append({
                "role": label_map[arg[3]],
                "argument": text[argument_start_index:argument_end_index + 1],
            })
        event["arguments"] = arguments

        test['event_list'] = [event]
        results.append(test)
    write_file(results, output_file)
Esempio n. 23
0
def do_submission():
    """Extract features from the tweets, fit the ridge predictor and write a submission.

    Uses the ``"wordcount"`` and ``"char"`` feature types. The shape of the
    extracted training matrix is printed before prediction.
    """
    train, test = load_dataset()
    train_tweets = train["tweet"]
    train_targets = get_labels(train)
    test_tweets = test["tweet"]

    feature_type = ["wordcount", "char"]
    test_ids = get_test_ids(test)
    features = get_extracted_features(feature_type, train_tweets, test_tweets)
    meta_train_X, meta_test_X = features

    print("n_samples: %d, n_features: %d" % meta_train_X.shape)

    predict_and_sub(meta_train_X,
                    train_targets.values,
                    meta_test_X,
                    test_ids,
                    predict_ridge)
Esempio n. 24
0
def load_and_cache_examples(config, task, tokenizer, evaluate=False, test=False):
    """Load features from cache (or build and cache them) and wrap them in a TensorDataset.

    The cache file lives in ``config.data_dir`` and is named after the split,
    the last non-empty path component of ``config.pretrained_model_name``,
    ``config.max_seq_len`` and ``task``.

    NOTE(review): ``test`` only changes the split name used in the cache file
    name; when ``evaluate`` is true the examples always come from
    ``processor.get_dev_examples`` -- confirm this is intended.

    Args:
        config: Run configuration (``local_rank``, ``data_dir``,
            ``pretrained_model_name``, ``max_seq_len``,
            ``use_entity_indicator`` are read).
        task: Task name, used in the cache file name.
        tokenizer: Passed to ``convert_examples_to_features``.
        evaluate: Load the evaluation split instead of the training split.
        test: See the note above.

    Returns:
        TensorDataset: ``(input_ids, input_mask, segment_ids, label_ids,
        e1_mask, e2_mask)``.
    """
    waits_for_leader = config.local_rank not in [-1, 0]
    if waits_for_leader and not evaluate:
        # Only the first process in distributed training builds the dataset;
        # the others wait here and then use the cache.
        torch.distributed.barrier()

    processor = GeneralProcessor()
    output_mode = "classification"

    # Work out the cache file name.
    if evaluate:
        split_name = 'test' if test else 'dev'
    else:
        split_name = 'train'
    name_parts = [
        part for part in config.pretrained_model_name.split('/') if part
    ]
    cache_name = 'cached_{}_{}_{}_{}'.format(split_name,
                                             name_parts[-1],
                                             str(config.max_seq_len),
                                             str(task))
    cached_features_file = os.path.join(config.data_dir, cache_name)

    if os.path.exists(cached_features_file):
        logger.info(f"Loading features from cached file {cached_features_file}")
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", config.data_dir)
        label_list = get_labels(config.data_dir)
        if evaluate:
            examples = processor.get_dev_examples(config.data_dir)
        else:
            examples = processor.get_train_examples(config.data_dir)
        features = convert_examples_to_features(
            examples,
            label_list,
            config.max_seq_len,
            tokenizer,
            "classification",
            use_entity_indicator=config.use_entity_indicator)
        if config.local_rank in [-1, 0]:
            logger.info(f"Saving features into cached file {cached_features_file}")
            torch.save(features, cached_features_file)

    if config.local_rank == 0 and not evaluate:
        # The first process releases the others once the cache is ready.
        torch.distributed.barrier()

    def _column(attribute, dtype=torch.long):
        """Stack one attribute of every feature into a tensor."""
        return torch.tensor([getattr(f, attribute) for f in features],
                            dtype=dtype)

    # Convert to tensors and build the dataset.
    all_input_ids = _column('input_ids')
    all_input_mask = _column('input_mask')
    all_segment_ids = _column('segment_ids')
    all_e1_mask = _column('e1_mask')  # entity-1 mask
    all_e2_mask = _column('e2_mask')  # entity-2 mask
    if output_mode == "classification":
        all_label_ids = _column('label_id')
    elif output_mode == "regression":
        all_label_ids = _column('label_id', dtype=torch.float)

    return TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                         all_label_ids, all_e1_mask, all_e2_mask)
Esempio n. 25
0
    def __init__(self, weights=None, biases=None):
        """Store the parameters, load the data, set up saving/logging and build the graph.

        Args:
            weights: Replacement for ``self.weights``. A falsy value keeps the
                value already reachable as ``self.weights``.
            biases: Replacement for ``self.biases``, with the same fallback
                rule.
        """
        # Falsy arguments fall back to the existing attribute values.
        self.weights = weights or self.weights
        self.biases = biases or self.biases

        self.datasets = get_datasets(heart_diseases, nr_inputs)
        self.label_data = get_labels(self.datasets)

        self.saver = ModelSaver(save_dir="saved_models/cnn/")

        # Histogram summaries for all weights, then all biases.
        self.tensorboard_handler = TensorBoardHandler("tensorboard_data/cnn/")
        for parameters in (self.weights, self.biases):
            self.tensorboard_handler.add_histograms(parameters)

        self.build()
Esempio n. 26
0
def train(loader, model, optimizer, criterion, epoch, d1, d2, blind,
          noise_level):
    """Run one training pass over ``loader`` and return the mean L1 error.

    Args:
        loader: Iterable of ``(x, y, mag, ori)`` batches exposing
            ``batch_size`` and ``__len__``.
        model: Network to train. ``model.weight[0]`` and ``model.weight[1]``
            are expanded to the batch size and fed back in as ``k1``/``k2``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(hat_x, x)``.
        epoch: Not used by this function; kept for interface compatibility.
        d1: Tensor expanded along its first dimension to the batch size.
        d2: Tensor expanded along its first dimension to the batch size.
        blind: When truthy, a noise level is drawn uniformly from
            ``[0.5, noise_level)`` for every batch; otherwise
            ``noise_level`` is used as is.
        noise_level: Noise standard deviation on a 0-255 scale (it is
            divided by 255 before being applied to ``y``).

    Returns:
        The sum of per-batch L1 losses between ``hat_x[-1]`` and ``x``,
        divided by ``len(loader)``.
    """
    running_l1 = 0
    train_l1 = 0
    model.train(True)

    batch_size = loader.batch_size
    k1 = model.weight[0].unsqueeze(0).expand(batch_size, -1, -1, -1)
    k2 = model.weight[1].unsqueeze(0).expand(batch_size, -1, -1, -1)
    d1 = d1.expand(batch_size, -1, -1, -1)
    d2 = d2.expand(batch_size, -1, -1, -1)

    # Passing `total` lets tqdm show progress and an ETA; enumerate() hides
    # the loader's length from it otherwise.
    for i, data in tqdm.tqdm(enumerate(loader), total=len(loader)):
        x, y, mag, ori = data
        x = x.to(device)
        y = y.to(device)
        mag = mag.to(device)
        ori = ori.to(device)
        # Remap the orientation into [0, 180) degrees.
        ori = (90 - ori).add(360).fmod(180)

        labels = utils.get_labels(mag, ori)
        ori = ori * np.pi / 180  # degrees -> radians

        if blind:
            # rand() with no arguments yields a plain float, avoiding the
            # deprecated float() conversion of a 1-element array.
            nl = (noise_level - 0.5) * np.random.rand() + 0.5
        else:
            nl = noise_level
        nl = float(nl) / 255
        y += nl * torch.randn_like(y)
        y = y.clamp(0, 1)
        y.requires_grad_()

        optimizer.zero_grad()

        hat_x = model(y, mag, ori, labels, k1, k2, d1, d2)

        error = criterion(hat_x, x)
        error.backward()

        optimizer.step()

        # The monitoring loss is evaluated once and feeds both accumulators.
        batch_l1 = F.l1_loss(hat_x[-1], x).item()
        running_l1 += batch_l1
        train_l1 += batch_l1

        if (i + 1) % 500 == 0:
            running_l1 /= 500
            print('    Running loss %2.5f' % (running_l1))
            running_l1 = 0

    return train_l1 / len(loader)
Esempio n. 27
0
def parse_study_dir(data_dir, sample_to_label, label_to_encoding,
                    genes_to_keep):
    '''Load the expression matrix and the sample labels of a single study

    Arguments
    ---------
    data_dir: str
        Path to the study's directory. The directory is expected to contain
        a tab-separated file named after the directory itself
        (``<study>/<study>.tsv``) with a ``Gene`` column.
    sample_to_label: dict
        A dictionary mapping sample identifiers to their corresponding labels
    label_to_encoding: dict
        A dictionary mapping the string label (e.g. 'sepsis') to a numerical
        target like 0; only samples whose label is one of its keys are kept
    genes_to_keep: list of strs
        The gene identifiers (index values) to keep in the dataframe

    Returns
    -------
    curr_df: pandas.DataFrame or None
        The expression data restricted to genes_to_keep for the samples that
        were kept, or None when utils.keep_samples_with_labels returns None
    study_labels: list of ints or None
        The encoded label of each kept sample as produced by
        utils.get_labels, or None when curr_df is None
    '''
    # The data file carries the same name as the study directory
    study_name = os.path.basename(os.path.normpath(data_dir))
    expression_path = os.path.join(data_dir, study_name + '.tsv')
    expression_df = pd.read_csv(expression_path, sep='\t').set_index('Gene')

    # Drop every sample whose label is not one of the classes of interest
    filtered_df = utils.keep_samples_with_labels(expression_df,
                                                 sample_to_label,
                                                 label_to_encoding.keys())

    # No usable samples: propagate None for both the data and the labels
    if filtered_df is None:
        return (None, None)

    # Labels are looked up before the rows are narrowed down to the genes
    study_labels = utils.get_labels(filtered_df, sample_to_label,
                                    label_to_encoding)

    return filtered_df.loc[genes_to_keep, :], study_labels
Esempio n. 28
0
def fix_masks(image_dir, masks_dir):
    """Rebuild and pickle the income-level mask for every ``.tif`` image.

    For each ``*.tif`` file directly inside ``image_dir`` (in sorted order)
    the raster profile is read, labels are fetched with ``get_labels``, a
    mask is built with ``get_income_level_segmentation_mask`` and the result
    is pickled into ``masks_dir`` under the image's own file name.

    Args:
        image_dir: Directory that is searched for ``*.tif`` files.
        masks_dir: Existing directory that receives the pickled masks.
    """
    images_filenames = sorted(glob.glob(image_dir + "/*.tif"))

    for filename in images_filenames:
        # Only the profile is needed, so the dataset handle is released
        # right away instead of staying open for the rest of the run.
        with rasterio.open(filename) as dataset:
            meta = dataset.profile

        # NOTE(review): the image name handed to get_labels is hard-coded
        # and identical for every file -- confirm this is intentional.
        labels_dict, _num_labels = get_labels(
            meta, "IMG_PER1_20190217152904_ORT_P_000659.TIF", database_file)
        mask = get_income_level_segmentation_mask(
            labels_dict, levels_dict, (meta['width'], meta['height']),
            meta['transform'])

        # basename() also copes with platform-specific path separators.
        out_filename = os.path.join(masks_dir, os.path.basename(filename))

        # The context manager guarantees the pickle is flushed and closed.
        with open(out_filename, "wb") as mask_file:
            pickle.dump(mask, mask_file)
Esempio n. 29
0
 def __init__(self, model_dir, result_dir="results/"):
     """Load configuration, labels and models stored under ``model_dir``.

     Args:
         model_dir: Directory holding ``config.yaml``, ``labels.txt``,
             ``extractor.pth`` and ``classifier.pkl``.
         result_dir: Directory name stored on the instance before
             ``createResultDir()`` is called.
     """
     self.model_dir = model_dir

     # Configuration and the values read straight from it.
     config = get_config(os.path.join(model_dir, "config.yaml"))
     self.CONFIG = config
     self.embSize = config["emb_size"]
     self.device = config["device"]

     labels_file = os.path.join(model_dir, "labels.txt")
     self.label2int, self.int2label = get_labels(labels_file)

     self.result_dir = result_dir
     self.createResultDir()

     # Each path attribute is assigned before the loader that follows it.
     self.featureExtractorPath = os.path.join(model_dir, "extractor.pth")
     self.featureExtractor = self.getModel()
     self.classifierPath = os.path.join(model_dir, "classifier.pkl")
     self.classifier = self.get_classifier()

     self.craft_net = load_craftnet_model(
         cuda=True, weight_path=config["CRAFT_WEIGHT"])

     # Eight hard-coded threshold values (meaning not visible from here).
     self.threshold = [
         0.15658274643186937,
         0.36065899509540494,
         0.38856512542345073,
         0.40749227647561814,
         0.16654823949181954,
         0.1937299593585302,
         0.11403658569607081,
         0.45526403121798326,
     ]
Esempio n. 30
0
def main():
    """Plot true vs. Louvain communities and score majority-label predictions.

    Loads the graph, plots its true communities and the Louvain communities
    into ``images/louvain``, then reports accuracy and a confusion matrix for
    the majority-class prediction of every node.
    """
    graph = utils.load_graph()
    position = utils.get_positions(graph)
    utils.make_dir('images/louvain')

    # Reference partition taken from the node labels.
    true_communities = utils.get_labels(graph, list(graph.nodes))
    utils.plot_communities(
        graph,
        position,
        true_communities,
        labels=True,
        title='Butterfly Similarity Network - True Communities',
        path='images/louvain/communities_true.png',
    )

    # Partition found by Louvain clustering.
    detected = utils.group_communities(louvain_clustering(graph))
    utils.plot_communities(
        graph,
        position,
        detected,
        labels=False,
        title='Butterfly Similarity Network - Louvain Communities',
        path='images/louvain/communities_louvain.png',
    )

    # A fixed node order keeps predictions and labels aligned.
    ordered_nodes = sorted(graph.nodes)
    predictions = utils.predict_majority_class(graph, detected)
    preds = [predictions[node] for node in ordered_nodes]
    labels = [graph.nodes[node]['label'] for node in ordered_nodes]

    utils.accuracy(preds, labels)
    utils.confusion_matrix(
        preds,
        labels,
        'Confusion Matrix - Majority Label Predictions from Louvain Communities',
        'images/louvain/cm_louvain.png',
    )
Esempio n. 31
0
def accuracy(model, loader, device):
    """Return the fraction of samples in ``loader`` that are labelled correctly.

    Args:
        model: Model moved to ``device`` and passed to ``utils.get_labels``.
        loader: Iterable of ``(images, true_labels)`` batches.
        device: Device the model and every batch are moved to.

    Returns:
        Number of positions where predicted and true labels are equal,
        divided by the number of images seen.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    correct_count = 0
    total_count = 0

    model.to(device)

    # NOTE(review): the model's train/eval mode is left untouched here;
    # callers presumably switch to eval() beforehand -- confirm.
    # Evaluation needs no gradients, so autograd bookkeeping is disabled to
    # save memory and time.
    with torch.no_grad():
        for images, true_labels in tqdm(loader, desc='Accuracy Test'):
            total_count += len(images)
            images = images.to(device)
            true_labels = true_labels.to(device)

            predicted_labels = utils.get_labels(model, images).detach()

            # Summing the boolean mask counts matches directly, without
            # materialising an index tensor via nonzero().
            matches = torch.eq(predicted_labels, true_labels)
            correct_count += int(matches.sum().item())

    return correct_count / total_count
def predict(model, generator, model_name, data_path):
    '''
    Predict data from generator and visualize prediction sample
    Args: 
        @model: CNN model
        @generator: generator of testing data
        @model_name: str, name of the model showing on the plot
        @data_path: path where the data is saved
    Return: 
        scores: list, [loss, accuracy]
        true_labels: true labels
        predict_labels: predict labels
    Call: 
        show_prediction(images, labels, labels_pre, model_name)
        get_lookup_tables(path, generator)
        get_labels(path, indexes, generator)
    Note:
        Only ``generator.samples // generator.batch_size`` batches are drawn
        for the label lists, so a trailing partial batch is not included.
        The sample plot is skipped when that count is zero.
    '''

    # evaluate model
    print('Start evaluating...')
    scores = model.evaluate(x=generator)
    print('Loss:{loss:.2f}\nAccuracy:{accuracy:.2f}'.format(
        loss=scores[0], accuracy=scores[1]))

    # predict batch by batch to collect labels, because model.evaluate does
    # not return them
    print('Start predicting...will take a few minutes')
    true_labels = []
    predict_labels = []
    # get lookup table: dictionary {index: class_name}
    lookup_table = get_lookup_tables(data_path, generator)

    num_batches = generator.samples // generator.batch_size
    # Pre-bind the per-batch names so the code after the loop never touches
    # an unbound local when no batch is processed.
    x = y_labels = y_pre_labels = None

    for _ in range(num_batches):
        x, y = generator.next()
        y_pre = model.predict(x=x)
        # generate labels
        y_labels = get_labels(data_path, y, generator)
        y_pre_labels = [lookup_table[index] for index in y_pre.argmax(axis=1)]
        # append result to list
        true_labels.extend(y_labels)
        predict_labels.extend(y_pre_labels)

    # plot prediction examples from the last processed batch
    if num_batches > 0:
        show_prediction(x, y_labels, y_pre_labels, model_name)
    else:
        print('No full batch available, skipping prediction plot')
    return scores, true_labels, predict_labels
Esempio n. 33
0
def train():
    """Cross-validate the word-count + char features and print RMSE and timing.

    Loads the training data, keeps the training part of a fixed 80/20 split
    (``random_state=1``) and runs a 3-fold cross-validation on it.
    """
    dataset, _ = load_dataset()
    tweets = dataset["tweet"]
    targets = get_labels(dataset)

    n_samples = len(targets)
    X_train, _, y_train, _ = train_test_split(
        tweets[:n_samples],
        targets[:n_samples],
        test_size=0.2,
        random_state=1,
    )

    # Only the cross-validation itself is timed.
    started = time()
    rmse_avg = do_cross_val(X_train, y_train, ["wordcount", "char"], nfolds=3)
    print("Average RMSE %.6f" % rmse_avg)

    print("training time: %fs" % (time() - started))
Esempio n. 34
0
def train_model():
    """Grid-search the ridge pipeline on the training part of the data.

    Loads the training data, keeps the training part of a fixed 80/20 split
    (``random_state=1``) and hands it to ``do_gridsearch`` together with an
    RMSE scorer (lower is better).
    """
    dataset, _ = load_dataset()
    tweets = dataset["tweet"]
    targets = get_labels(dataset)

    n_samples = len(targets)
    X_train, _, y_train, _ = train_test_split(
        tweets[:n_samples],
        targets[:n_samples],
        test_size=0.2,
        random_state=1,
    )

    scorer = make_scorer(rmse_score, greater_is_better=False)

    # Alternative pipeline factories that can be swapped in here:
    #   get_three_predictor_model, get_three_predictor_model2,
    #   get_three_predictor_model3, get_elasticnet_model,
    #   get_ridge_model2, get_ridge_model3, get_advanced_ridge
    pipeline, parameters = get_ridge_model()

    do_gridsearch(X_train, y_train, pipeline, parameters, scorer)
#Adding Images for testing

import cv2
from keras.models import load_model
import numpy as np
from statistics import mode
from utils import preprocess_input
from utils import get_labels

# parameters: input image, model files and label maps
image_path = '../images/test_image.jpg'
detection_model_path = '../trained_models/detection_models/haarcascade_frontalface_default.xml'
emotion_model_path = '../trained_models/emotion_models/simple_CNN.530-0.65.hdf5'
gender_model_path = '../trained_models/gender_models/simple_CNN.81-0.96.hdf5'
emotion_labels = get_labels('fer2013')
gender_labels = get_labels('imdb')
font = cv2.FONT_HERSHEY_SIMPLEX

# pixel offsets (presumably padding around detected faces -- confirm at use site)
x_offset_emotion = 20
y_offset_emotion = 40
x_offset = 30
y_offset = 60

# loading models
face_detection = cv2.CascadeClassifier(detection_model_path)
emotion_classifier = load_model(emotion_model_path)
gender_classifier = load_model(gender_model_path)

# cv2.imread reports a missing/unreadable file by returning None instead of
# raising, so fail here with a clear message rather than inside cvtColor.
frame = cv2.imread(image_path)
if frame is None:
    raise FileNotFoundError('Could not read image: ' + image_path)
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
Esempio n. 36
0
def train_final():
    """
    Train a model using grid search for parameter estimation and save the
    predictions for the test set.

    Word-level and character-level n-gram features are fitted on the train
    and test tweets together, stacked side by side, and the training part of
    a fixed 80/20 split (``random_state=1``) is passed to ``do_gridsearch``.
    The estimator it returns predicts the test tweets, and the result is
    handed to ``save_prediction_subs``.
    """

    train, test = load_dataset()
    train_X = train["tweet"]
    train_Y = get_labels(train)
    test_X = test["tweet"]

    # Word n-grams (1-3).
    tfidf1 = TfidfVectorizer(
        max_df=0.6,
        min_df=0.0000003,
        stop_words="english",
        strip_accents="unicode",
        # Raw string: "\w" in a normal literal is an invalid escape sequence.
        token_pattern=r"\w{1,}",
        max_features=5000,
        norm="l2",
        use_idf=False,
        smooth_idf=False,
        ngram_range=(1, 3),
    )

    # Character n-grams (1-7). `stop_words` is omitted on purpose: it is only
    # applied by the word analyzer and is ignored for analyzer="char".
    tfidf2 = TfidfVectorizer(
        max_df=0.6,
        analyzer="char",
        min_df=0.00001,
        strip_accents="unicode",
        norm="l2",
        max_features=5000,
        ngram_range=(1, 7),
        smooth_idf=False,
        use_idf=False,
    )

    # The vocabularies are built from train and test tweets together.
    all_tweets = np.hstack((train_X, test_X))
    tfidf1.fit(all_tweets)
    tfidf2.fit(all_tweets)

    train_X1 = tfidf1.transform(train_X)
    train_X2 = tfidf2.transform(train_X)

    # CSR format so the matrix can be sliced by row below.
    train_X = hstack([train_X1, train_X2]).tocsr()

    n_samples = len(train_Y)

    X_train, _, y_train, _ = train_test_split(train_X[:n_samples], train_Y[:n_samples], test_size=0.2, random_state=1)

    scorer = make_scorer(rmse_score, greater_is_better=False)

    pipeline, parameters = get_advanced_ridge2()
    # pipeline, parameters = get_three_predictor_model()
    # pipeline, parameters = get_elasticnet_model()
    # pipeline, parameters = get_three_predictor_model2()
    # pipeline, parameters = get_three_predictor_model3()
    # pipeline, parameters = get_ridge_model2()
    # pipeline, parameters = get_ridge_model3()
    # pipeline, parameters = get_advanced_ridge()

    best_estimator = do_gridsearch(X_train, y_train, pipeline, parameters, n_jobs=5, verbose=1, scoring=scorer)

    # predict test data
    test_1 = tfidf1.transform(test_X)
    test_2 = tfidf2.transform(test_X)

    test_d = hstack([test_1, test_2])

    final_preds = best_estimator.predict(test_d)
    save_prediction_subs(test["id"], final_preds)