def load_fire16(data_dir=dataset_dir,
                filename='fire16_labeled',
                data_set='train'):
    """ Loads labelled FIRE16 data, mapping its labels to the 4-class
    SMERP17 label space and caching the result as a CSV. """
    mapped_file = filename + '_4class.csv'
    if exists(join(data_dir, mapped_file)):
        # data_df = read_labelled_json(data_dir=data_dir, filename=mapped_file, data_set=data_set)
        data_df = read_csv(data_dir=data_dir,
                           data_file=mapped_file,
                           index_col=0,
                           header=0)
    else:
        data_df = read_labelled_json(data_dir=data_dir,
                                     filename=filename,
                                     data_set=data_set)

        ## Match label space between two datasets:
        data_df = labels_mapper(data_df)

        # Drop all rows whose label columns sum to 0 (no relevant class)
        irrelevant_rows = []
        for i, row in data_df.iterrows():
            if sum(row[1:]) < 1:
                irrelevant_rows.append(i)

        data_df = data_df.drop(irrelevant_rows)

        data_df.to_csv(join(data_dir, mapped_file))

    return data_df


def load_smerp17(data_dir=dataset_dir,
                 filename='smerp17_labeled',
                 data_set='test'):
    """ Loads the labelled SMERP17 data. """
    data_df = read_labelled_json(data_dir=data_dir, filename=filename,
                                 data_set=data_set)

    # data_df.to_csv(join(data_dir, filename+'_4class'))

    return data_df
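## A hedged usage sketch for the two loaders above (assumes dataset_dir and
## the helper imports are configured as in the rest of this module).
## load_fire16 caches its 4-class mapping on the first call, so later calls
## read 'fire16_labeled_4class.csv' directly:
if __name__ == '__main__':
    train_df = load_fire16(data_set='train')
    test_df = load_smerp17(data_set='test')
    logger.info(f'{train_df.shape[0]} train / {test_df.shape[0]} test samples')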
Example #3
def split_target(df=None,
                 data_dir=dataset_dir,
                 labelled_dataname=cfg['data']['test'],
                 test_size=0.999,
                 train_size=None,
                 n_classes=cfg['data']['num_classes'],
                 stratified=False):
    """ Splits labelled target data to train and test set.

    :param data_dir:
    :param labelled_dataname:
    :param test_size:
    :param train_size:
    :param n_classes:
    :return:
    """
    logger.info('Splits labelled target data to train and test set.')
    ## Read target data
    if df is None:
        df = read_labelled_json(data_dir, labelled_dataname)
    df, t_lab_test_df = split_df(df,
                                 test_size=test_size,
                                 stratified=stratified,
                                 order=2,
                                 n_classes=n_classes)

    logger.info(f'Number of TEST samples: [{t_lab_test_df.shape[0]}]')

    if train_size is not None:
        _, df = split_df(df,
                         test_size=train_size,
                         stratified=stratified,
                         order=2,
                         n_classes=n_classes)
    logger.info(f'Number of TRAIN samples: [{df.shape[0]}]')

    # token_dist(t_lab_df)

    return df, t_lab_test_df
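## A hedged usage sketch for split_target (the 40% held-out proportion is
## hypothetical; cfg, dataset_dir and read_labelled_json as configured above):
#
#   labelled_df = read_labelled_json(dataset_dir, cfg['data']['test'])
#   target_train_df, target_test_df = split_target(df=labelled_df,
#                                                  test_size=0.4,
#                                                  stratified=True)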
    def read_data(self, data_dir=dataset_dir, filename=cfg['data']['train']):
        new_df = read_labelled_json(data_dir, filename)
        new_df = format_inputs(new_df)
        return new_df
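    ## Hypothetical call site for the method above (its enclosing class is
    ## not shown in this snippet; <DatasetClass> is a placeholder):
    #   reader = <DatasetClass>()
    #   train_df = reader.read_data(filename=cfg['data']['train'])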
Example #5
def classify(
    train_df=None,
    test_df=None,
    stoi=None,
    vectors=None,
    n_classes=cfg['data']['num_classes'],
    dim=cfg['embeddings']['emb_dim'],
    data_dir=dataset_dir,
    train_filename=cfg['data']['train'],
    test_filename=cfg['data']['test'],
    cls_thresh=None,
    epoch=cfg['training']['num_epoch'],
    num_layers=cfg['lstm_params']['num_layers'],
    num_hidden_nodes=cfg['lstm_params']['hid_size'],
    dropout=cfg['model']['dropout'],
    default_thresh=0.5,
    lr=cfg['model']['optimizer']['lr'],
    train_batch_size=cfg['training']['train_batch_size'],
    test_batch_size=cfg['training']['eval_batch_size'],
):
    """

    :param n_classes:
    :param test_batch_size:
    :param train_df:
    :param test_df:
    :param stoi:
    :param vectors:
    :param dim:
    :param data_dir:
    :param train_filename:
    :param test_filename:
    :param cls_thresh:
    :param epoch:
    :param num_layers:
    :param num_hidden_nodes:
    :param dropout:
    :param default_thresh:
    :param lr:
    :param train_batch_size:
    :return:
    """
    ## Prepare labelled source data (guard restored so the train_df=None
    ## default does not crash at to_csv below):
    logger.info('Prepare labelled source data')
    if train_df is None:
        train_df = read_labelled_json(data_dir, train_filename)
        train_df = labels_mapper(train_df)
    train_dataname = train_filename + "_4class.csv"
    train_df.to_csv(join(data_dir, train_dataname))

    if stoi is None:
        logger.critical('GLOVE features')
        train_dataset, (train_vocab, train_label) = get_dataset_fields(
            csv_dir=data_dir,
            csv_file=train_dataname,
            min_freq=1,
            labelled_data=True)
    else:
        logger.critical('GCN features')
        train_dataset, (train_vocab, train_label) = get_dataset_fields(
            csv_dir=data_dir,
            csv_file=train_dataname,
            min_freq=1,
            labelled_data=True,
            embedding_file=None,
            embedding_dir=None)
        train_vocab.vocab.set_vectors(stoi=stoi, vectors=vectors, dim=dim)

    ## Plot representations:
    # plot_features_tsne(train_vocab.vocab.vectors,
    #                    list(train_vocab.vocab.stoi.keys()))

    ## Prepare labelled target data:
    logger.info('Prepare labelled target data')
    if test_df is None:
        test_df = read_labelled_json(data_dir, test_filename)
    test_dataname = test_filename + "_4class.csv"
    test_df.to_csv(join(data_dir, test_dataname))
    test_dataset, (test_vocab, test_label) = get_dataset_fields(
        csv_dir=data_dir,
        csv_file=test_dataname,  # init_vocab=True,
        labelled_data=True)

    # check whether cuda is available
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    logger.info('Get iterator')
    train_iter, val_iter = dataset2bucket_iter(
        (train_dataset, test_dataset),
        batch_sizes=(train_batch_size, test_batch_size))

    size_of_vocab = len(train_vocab.vocab)
    num_output_nodes = n_classes

    # instantiate the model
    logger.info('instantiate the model')
    model = BiLSTM_Classifier(size_of_vocab,
                              num_hidden_nodes,
                              num_output_nodes,
                              dim,
                              num_layers,
                              dropout=dropout)

    # architecture
    logger.info(model)

    # No. of trainable parameters
    logger.info('No. of trainable parameters')
    count_parameters(model)

    # Initialize the pretrained embedding
    logger.info('Initialize the pretrained embedding')
    pretrained_embeddings = train_vocab.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings)

    logger.debug(pretrained_embeddings.shape)

    # label_cols = [str(cls) for cls in range(n_classes)]

    logger.info('Training model')
    model_best, val_preds_trues_best, val_preds_trues_all, losses = trainer(
        model, train_iter, val_iter, N_EPOCHS=epoch, lr=lr)

    plot_training_loss(losses['train'],
                       losses['val'],
                       plot_name='loss' + str(epoch) + str(lr))

    if cls_thresh is None:
        cls_thresh = [default_thresh] * n_classes

    predicted_labels = logit2label(DataFrame(
        val_preds_trues_best['preds'].cpu().numpy()),
                                   cls_thresh,
                                   drop_irrelevant=False)
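    ## e.g., with default_thresh=0.5, a sigmoid output row [0.7, 0.2, 0.6, 0.4]
    ## is thresholded to the multi-hot row [1, 0, 1, 0] (assumed behaviour of
    ## logit2label).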

    logger.info('Calculate performance')
    result = calculate_performance_pl(val_preds_trues_best['trues'],
                                      val_preds_trues_best['preds'])

    logger.info("Result: {}".format(result))

    # result_df = flatten_results(result)
    # result_df.round(decimals=4).to_csv(
    #     join(data_dir, test_filename + '_results.csv'))

    return result
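## A hedged end-to-end sketch driving classify with GLOVE features
## (stoi=None); the loaders are the fire16/smerp17 pair defined above:
if __name__ == '__main__':
    result = classify(train_df=load_fire16(), test_df=load_smerp17())
    logger.info(f'Result: {result}')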
Example #6
def prepare_datasets(train_df=None,
                     test_df=None,
                     stoi=None,
                     vectors=None,
                     dim=cfg['embeddings']['emb_dim'],
                     split_test=False,
                     get_iter=False,
                     data_dir=dataset_dir,
                     train_filename=cfg['data']['train'],
                     test_filename=cfg['data']['test']):
    """ Creates train and test dataset from df and returns data loader.

    :param get_iter: If iterator over the text samples should be returned
    :param split_test: Splits the testing data
    :param train_df: Training dataframe
    :param test_df: Testing dataframe
    :param vectors: Custom Vectors for each token
    :param dim: Embedding dim
    :param data_dir:
    :param train_filename:
    :param test_filename:
    :return:
    """
    logger.info(f'Prepare labelled train (source) data: {train_filename}')
    if train_df is None:
        if train_filename.startswith('fire16'):
            train_df = load_fire16()
        else:
            train_df = read_labelled_json(data_dir, train_filename)

    train_dataname = train_filename + "_4class.csv"
    train_df.to_csv(join(data_dir, train_dataname))

    if stoi is None:
        logger.critical('Setting GLOVE vectors:')
        train_dataset, (train_vocab, train_label) = get_dataset_fields(
            csv_dir=data_dir,
            csv_file=train_dataname,
            min_freq=1,
            labelled_data=True)
    else:
        logger.critical('Setting custom vectors:')
        train_dataset, (train_vocab, train_label) = get_dataset_fields(
            csv_dir=data_dir,
            csv_file=train_dataname,
            min_freq=1,
            labelled_data=True,
            embedding_file=None,
            embedding_dir=None)
        train_vocab.vocab.set_vectors(stoi=stoi, vectors=vectors, dim=dim)

    ## Plot representations:
    # plot_features_tsne(train_vocab.vocab.vectors,
    #                    list(train_vocab.vocab.stoi.keys()))

    # train_vocab = {
    #     'freqs':       train_vocab.vocab.freqs,
    #     'str2idx_map': dict(train_vocab.vocab.stoi),
    #     'idx2str_map': train_vocab.vocab.itos,
    #     'vectors': train_vocab.vocab.vectors,
    # }

    ## Prepare labelled target data:
    logger.info(f'Prepare labelled test (target) data: {test_filename}')
    if test_df is None:
        if test_filename.startswith('smerp17'):
            test_df = load_smerp17()
        else:
            test_df = read_labelled_json(data_dir,
                                         test_filename,
                                         data_set='test')

        if split_test:
            ## Keep 40% of the target data as the test split; the remainder
            ## (test_extra_df) is unused here:
            test_extra_df, test_df = split_target(df=test_df, test_size=0.4)
    test_dataname = test_filename + "_4class.csv"
    test_df.to_csv(join(data_dir, test_dataname))
    test_dataset, (test_vocab,
                   test_label) = get_dataset_fields(csv_dir=data_dir,
                                                    csv_file=test_dataname,
                                                    labelled_data=True)

    # test_vocab = {
    #     'freqs':       test_vocab.vocab.freqs,
    #     'str2idx_map': dict(test_vocab.vocab.stoi),
    #     'idx2str_map': test_vocab.vocab.itos,
    #     'vectors': test_vocab.vocab.vectors,
    # }

    logger.info('Get iterator')
    if get_iter:
        train_batch_size = cfg['training']['train_batch_size']
        test_batch_size = cfg['training']['eval_batch_size']
        train_iter, val_iter = dataset2bucket_iter(
            (train_dataset, test_dataset),
            batch_sizes=(train_batch_size, test_batch_size))

        return train_dataset, test_dataset, train_vocab, test_vocab, train_iter, val_iter

    return train_dataset, test_dataset, train_vocab, test_vocab
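## A hedged usage sketch for prepare_datasets, also fetching bucket iterators
## (all filenames come from the cfg defaults):
#
#   train_ds, test_ds, train_vocab, test_vocab, train_iter, val_iter = \
#       prepare_datasets(get_iter=True)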
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    ## Assumed reconstruction: the argument definitions preceding "-ne" were
    ## truncated in the source; names are inferred from their usage below.
    parser.add_argument("-d",
                        "--dataset_name",
                        default=cfg['data']['train'],
                        type=str)
    parser.add_argument("-m",
                        "--model_name",
                        type=str)
    parser.add_argument("-mt",
                        "--model_type",
                        type=str)
    parser.add_argument("-ne",
                        "--num_train_epochs",
                        default=cfg['training']['num_epoch'],
                        type=int)
    parser.add_argument("-c",
                        "--use_cuda",
                        default=cfg['model']['use_cuda'],
                        action='store_true')

    args = parser.parse_args()

    from File_Handlers.json_handler import read_labelled_json
    from Class_mapper.FIRE16_SMERP17_map import labels_mapper

    data_dir = dataset_dir

    train_df = read_labelled_json(data_dir, args.dataset_name)
    train_df = labels_mapper(train_df)

    test_df = read_labelled_json(data_dir, cfg['data']['target']['labelled'])
    test_df = test_df.sample(frac=1)  # shuffle the test rows

    result, model_outputs = BERT_classifier(train_df=train_df,
                                            test_df=test_df,
                                            dataset_name=args.dataset_name,
                                            model_name=args.model_name,
                                            model_type=args.model_type,
                                            num_epoch=args.num_train_epochs,
                                            use_cuda=args.use_cuda)