Example #1
def train_and_save(train_file: str, valid_file: str, train_sample: int,
                   valid_sample: int, model: str, feature_model: str,
                   classifier: str):
    """Train a model on sampled data, report validation F1, and save it."""
    # resolve the model class by name from the models module
    Model = getattr(models, model)
    df_train = get_dataset(train_file)
    df_valid = get_dataset(valid_file)
    n_train_total, n_valid_total = df_train.shape[0], df_valid.shape[0]
    n_train_sample, n_valid_sample = train_sample, valid_sample
    if n_train_sample > n_train_total:
        logging.warning(
            f'Training sample size ({n_train_sample}) cannot be larger '
            f'than the training dataset (n={n_train_total:,d}); '
            f'using the full dataset.')
        n_train_sample = n_train_total
    if n_valid_sample > n_valid_total:
        logging.warning(
            f'Validation sample size ({n_valid_sample}) cannot be larger '
            f'than the validation dataset (n={n_valid_total:,d}); '
            f'using the full dataset.')
        n_valid_sample = n_valid_total
    X_train, Y_train = read_data(df_train, sample_n=n_train_sample)
    X_valid, Y_valid = read_data(df_valid, sample_n=n_valid_sample)
    # note: `model` is rebound here from the class-name string to the instance
    model = Model(classifier=classifier,
                  steps=[feature_model],
                  memory='data/feature_cache')

    with joblib.parallel_backend('threading', n_jobs=2):
        model.fit(X_train, Y_train)
        score = model.score(X_valid, Y_valid)
        logging.info('')
        logging.info(f'Overall F1: {score:.4f}')
        logging.info('')
    save_model(model)
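
A minimal invocation sketch, assuming the surrounding module exposes models, get_dataset, read_data, and save_model as used above; the file paths and the model/feature/classifier names below are hypothetical:

# Hypothetical usage; paths and names are illustrative, not from the project.
train_and_save(
    train_file='data/train.csv',
    valid_file='data/valid.csv',
    train_sample=10000,
    valid_sample=1000,
    model='BaselineModel',     # resolved via getattr(models, ...)
    feature_model='tfidf',     # first step of the pipeline
    classifier='SVC',          # forwarded to the Model constructor
)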
Example #2
def predict_one(dataset, dfs, totals, seed, fm, clf, **kwargs):
    """Predict for a random review"""
    lang = 'en' if '_en' in dataset else 'zh'
    # read once so the label columns (y.columns) are available even when no review matches
    X, y = read_data(dfs[0])
    if totals[0] == 0:
        review = {
            'id': 'N/A',
            'content_html': '--  No matching reviews found. Please remove keyword. --'
        }
        true_labels, probas = None, None
        # keep the remaining result fields defined so the return dict below works
        predict_labels = None
        n_correct_labels = None
        true_label_counts, predict_label_counts = None, None
    else:
        # get a random review
        random_review = dfs[0].sample(1, random_state=seed)
        # split to feature and labels
        X, y = read_data(random_review)
        model = load_model(fm, clf)
        review = random_review.to_dict('records')[0]
        review = {
            'id': review['id'],
            'content_html': highlight_subsetence(
                review['content_raw'], lang
            ).replace('\n', '<br>')
        }
        probas = predict_proba(clf, model, X)

        true_labels = y.replace({np.nan: None}).values
        predict_labels = model.predict(X)
        # number of correct predictions
        n_correct_labels = np.sum(true_labels == predict_labels,
                                  axis=1).tolist()
        true_labels = true_labels.tolist()
        predict_labels = predict_labels.tolist()
        true_label_counts = [Counter(x) for x in true_labels]
        predict_label_counts = [Counter(x) for x in predict_labels]

    label_names = y.columns.tolist()
    n_total_labels = len(label_names)  # number of labels to predict
    return {
        'review': review,
        'label_names': label_names,
        'n_total_labels': n_total_labels,
        'n_correct_labels': n_correct_labels,
        'n_correct_labels_html': render_template(
            'single/correct_count.jinja', **locals()
        ),
        'true_label_counts': true_label_counts,
        'predict_label_counts': predict_label_counts,
        'true_labels': true_labels,
        'predict_labels': predict_labels,
        'probas': probas,
        'filter_results': render_template(
            'single/filter_results.jinja', **{**kwargs, **locals()})
    }
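
A hedged usage sketch for predict_one; the shapes of dfs and totals are inferred from the snippet (a list of candidate data frames and their matching-review counts), and the keyword argument is hypothetical:

# Hypothetical call; argument values are illustrative.
result = predict_one(
    dataset='reviews_en',          # '_en' selects English highlighting
    dfs=[df_filtered],             # first data frame is the sampling pool
    totals=[len(df_filtered)],     # 0 triggers the "no matching reviews" branch
    seed=42,
    fm='tfidf',                    # feature-model name passed to load_model
    clf='SVC',                     # classifier name passed to load_model
    keyword='battery',             # extra kwargs reach the Jinja templates
)
print(result['review']['id'], result['n_total_labels'])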
Example #3
def predict_df(self, df, save_to=None):
    """Make predictions on a data frame and optionally save the output"""
    # read_data returns a copy of df
    X, y, df = read_data(df, return_df=True)
    # blank out the raw content column before writing predictions
    df['content'] = ''
    df[y.columns] = self.predict(X)
    if save_to:
        logger.info(f'Saving predictions to {save_to}...')
        # utf_8_sig adds a BOM so the CSV opens cleanly in Excel
        df.to_csv(save_to, encoding='utf_8_sig', index=False)
    return df
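
A short sketch of how this method might be called, assuming it lives on the model class returned by load_model in Example #2 (the names and path here are illustrative):

# Hypothetical usage; 'tfidf'/'SVC' and the output path are assumptions.
model = load_model('tfidf', 'SVC')
df_pred = model.predict_df(df_test, save_to='data/predictions.csv')
print(df_pred.head())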
Example #4
                        '--classifier',
                        default='SVC',
                        choices=classifier_choices,
                        help='Classifier used by the model')
    parser.add_argument('--train',
                        default=10000,
                        type=int,
                        help='Number of training samples to use')
    parser.add_argument('--valid',
                        default=1000,
                        type=int,
                        help='Number of validation samples to use')
    args = parser.parse_args()

    logging.info(f'{args}')

    Model = getattr(models, args.model)
    Classifier = getattr(classifiers, args.classifier)
    X_train, Y_train = read_data(get_dataset('train'), sample_n=args.train)
    X_valid, Y_valid = read_data(get_dataset('valid'), sample_n=args.valid)
    model = Model(classifier=Classifier,
                  steps=[args.feature_model],
                  memory='data/feature_cache')

    with joblib.parallel_backend('threading', n_jobs=4):
        model.fit(X_train, Y_train)
        score = model.score(X_valid, Y_valid)
        logging.info('')
        logging.info(f'Overall F1: {score:.4f}')
        logging.info('')

    save_model(model)
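
Both training scripts resolve classes by name with getattr, which is what lets --model and --classifier stay plain strings on the command line. A self-contained sketch of the same lookup pattern (the module and class names here are stand-ins, not from the project):

import argparse
from types import SimpleNamespace

class LinearModel: ...
class TreeModel: ...

# stand-in for the project's `models` module
models = SimpleNamespace(LinearModel=LinearModel, TreeModel=TreeModel)

parser = argparse.ArgumentParser()
parser.add_argument('--model', default='LinearModel',
                    choices=['LinearModel', 'TreeModel'])
args = parser.parse_args([])          # parse defaults for this demo
Model = getattr(models, args.model)   # class object selected by its name
print(Model)                          # -> <class '__main__.LinearModel'>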
Example #5
                        default='SVC',
                        choices=classifier_choices,
                        help='Classifier used by the model')
    parser.add_argument('--train',
                        default=10000,
                        type=int,
                        help='Number of training samples to use')
    parser.add_argument('--valid',
                        default=1000,
                        type=int,
                        help='Number of validation samples to use')
    args = parser.parse_args()

    logging.info(f'{args}')

    Model = getattr(models, args.model)
    Classifier = getattr(classifiers, args.classifier)
    X_train, Y_train = read_data(config.train_data_path, sample_n=args.train)
    X_valid, Y_valid = read_data(config.valid_data_path, sample_n=args.valid)
    model = Model(classifier=Classifier,
                  steps=[args.feature_model],
                  memory='data/feature_cache')

    with joblib.parallel_backend('threading', n_jobs=4):
        model.fit(X_train, Y_train)
        score = model.score(X_valid, Y_valid)
        logging.info('')
        logging.info(f'Overall F1: {score:.4f}')
        logging.info('')

    filename = f'{args.feature_model}_{model.name}.pkl'
    save_model(model, os.path.join(config.model_save_path, filename))
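
save_model itself is not shown in these snippets; a plausible minimal body using joblib, consistent with the path handling above. This is an assumption, not the project's actual helper; the default path is a guess that reconciles the one-argument calls in Examples #1 and #4:

import os
import joblib

def save_model(model, path='data/models/model.pkl'):
    """Persist a fitted model with joblib (sketch; the real helper may differ)."""
    dirname = os.path.dirname(path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    joblib.dump(model, path)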