def train_and_test(df, preds, seed):
    '''Run a single trial:
        Shuffle df and split it into training and testing subsets
        Train a new model based on the training set
        Test the model with the testing set
        Add prediction data into the preds array

    :param df: dataframe with full set of all available samples
        columns: id, cat1 (primary class), cat2 (secondary),
        title, titlen (cleaned title)
    :param preds: an array of predictions; each prediction is a dictionary
        cat: true category, pred: predicted category,
        conf: model confidence in its prediction (< 1.0),
        title: actual title of the chapter/sample
    :param seed: random seed passed to the classifier for reproducibility
    :return: tuple (classifier_key, testing accuracy, training dataframe)
    '''
    # PREPS
    # randomly split the dataset into train / test / validation subsets
    df = utils.split_dataset(
        df,
        settings.CAT_DEPTH,
        settings.TRAIN_PER_CLASS_MIN,
        settings.TEST_PER_CLASS,
        settings.VALID_PER_CLASS,
    )

    # TRAIN
    classifier = Classifier.from_name(settings.CLASSIFIER, seed)
    # NOTE(review): titles_out_path is a module-level name defined elsewhere
    classifier.set_datasets(df, titles_out_path)
    classifier.train()

    df_test = classifier.df_test

    # optionally report accuracy on the training set as well
    if settings.EVALUATE_TRAINING_SET:
        evaluate_model(
            classifier, classifier.df_train, display_prefix='TRAIN = ')
    accuracy = evaluate_model(
        classifier, df_test, preds, display_prefix='TEST = ')

    classifier_key = utils.get_exp_key(classifier)

    classifier.release_resources()

    return classifier_key, accuracy, classifier.df_train
def prepare_dataset():
    '''Convert input .txt or .csv files into a single .csv file with all the
    necessary columns for training and testing classification models.

    :return: dataframe with columns id, cat1, cat2, title, can_train,
        can_test plus titlen (normalised title)
    '''
    # # experimental work done on first, small dataset.
    # utils.extract_transcripts_from_pdfs()
    # utils.learn_embeddings_from_transcipts()

    # load each titles file into a dataframe
    frames = []
    for fileinfo in settings.DATASET_FILES:
        # skip files usable neither for training nor for testing
        if not (fileinfo['can_train'] or fileinfo['can_test']):
            continue
        titles_path = utils.get_data_path('in', fileinfo['filename'])
        if not os.path.exists(titles_path):
            utils.log_error(
                'The training file ({0}) is missing. See README.md for more info.'
                .format(titles_path))
        df = utils.read_df_from_titles(
            titles_path, use_full_text=settings.FULL_TEXT)
        for flag in ['can_train', 'can_test']:
            df[flag] = fileinfo[flag]
        frames.append(df)

    # DataFrame.append() was removed in pandas 2.0 (and was O(n^2) when
    # called in a loop); concatenate all frames in one pass instead.
    df_all = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # save that as a csv
    df_all.to_csv(
        titles_out_path,
        columns=['id', 'cat1', 'cat2', 'title', 'can_train', 'can_test'],
        index=False)

    # normalise the title
    classifier = Classifier.from_name(settings.CLASSIFIER, None)
    df_all['titlen'] = df_all['title'].apply(lambda v: classifier.tokenise(v))
    classifier.release_resources()

    return df_all