Beispiel #1
0
def train_and_test(df, preds, seed):
    '''
    Run a single trial:
        Shuffle df and split it into training and testing subsets
        Train a new model based on the training sets
        Test the model with testing set
        Add prediction data into preds array

    :param df: dataframe with full set of all available samples
        columns: id, cat1 (primary class), cat2 (secondary),
        title, titlen (cleaned title)
    :param preds: an array of predictions, each prediction is a dictionary
        cat: true category, pred: predicted category,
        conf: model confidence in its prediction (< 1.0),
        title: actual title of the chapter/sample
    :param seed: random seed passed to the classifier so individual
        trials are reproducible
    :return: tuple (classifier_key, accuracy, df_train) where
        classifier_key identifies the experiment (utils.get_exp_key),
        accuracy is the average testing accuracy and
        df_train is the training subset used for this trial
    '''
    # PREPS
    # randomly split the dataset
    df = utils.split_dataset(
        df,
        settings.CAT_DEPTH,
        settings.TRAIN_PER_CLASS_MIN,
        settings.TEST_PER_CLASS,
        settings.VALID_PER_CLASS,
    )

    # TRAIN
    classifier = Classifier.from_name(settings.CLASSIFIER, seed)
    # NOTE(review): titles_out_path is a module-level name defined elsewhere
    classifier.set_datasets(df, titles_out_path)
    classifier.train()

    df_test = classifier.df_test

    # optionally report accuracy on the training set itself
    # (useful to detect over/under-fitting)
    if settings.EVALUATE_TRAINING_SET:
        evaluate_model(classifier,
                       classifier.df_train,
                       display_prefix='TRAIN = ')
    accuracy = evaluate_model(classifier,
                              df_test,
                              preds,
                              display_prefix='TEST  = ')
    classifier_key = utils.get_exp_key(classifier)

    # free model memory (e.g. GPU) before the next trial
    classifier.release_resources()

    return classifier_key, accuracy, classifier.df_train
Beispiel #2
0
def prepare_dataset():
    '''Convert input .txt or .csv into a .csv file with all the necessary
    columns for training and testing classification models.

    Reads every trainable/testable file listed in settings.DATASET_FILES,
    concatenates them into one dataframe, saves it to titles_out_path
    and adds a normalised title column ('titlen').

    :return: dataframe with columns id, cat1, cat2, title, can_train,
        can_test, titlen
    '''

    # # experimental work done on first, small dataset.
    # utils.extract_transcripts_from_pdfs()
    # utils.learn_embeddings_from_transcripts()

    # load titles files into a single dataframe
    # (collect frames in a list and concat once: DataFrame.append was
    # deprecated in pandas 1.4 and removed in 2.0, and repeated appends
    # are quadratic anyway)
    frames = []
    for fileinfo in settings.DATASET_FILES:
        if not (fileinfo['can_train'] or fileinfo['can_test']):
            continue

        titles_path = utils.get_data_path('in', fileinfo['filename'])

        if not os.path.exists(titles_path):
            # NOTE(review): assumes utils.log_error aborts or the caller
            # tolerates the missing file — confirm, execution continues here
            utils.log_error(
                'The training file ({0}) is missing. See README.md for more info.'
                .format(titles_path))

        df = utils.read_df_from_titles(titles_path,
                                       use_full_text=settings.FULL_TEXT)
        # propagate the per-file training/testing flags to every row
        for flag in ['can_train', 'can_test']:
            df[flag] = fileinfo[flag]
        frames.append(df)

    # pd.concat([]) raises, so keep the original empty-DataFrame behavior
    df_all = (pd.concat(frames, ignore_index=True)
              if frames else pd.DataFrame())

    # save that as a csv
    df_all.to_csv(
        titles_out_path,
        columns=['id', 'cat1', 'cat2', 'title', 'can_train', 'can_test'],
        index=False)

    # normalise the title with the configured classifier's tokeniser
    classifier = Classifier.from_name(settings.CLASSIFIER, None)
    df_all['titlen'] = df_all['title'].apply(lambda v: classifier.tokenise(v))
    classifier.release_resources()

    return df_all