# --- test script: build features for a training set and a held-out test set ---
from argparse import ArgumentParser
from logging import debug, info, basicConfig

from cmdline import add_args
# Project-local helpers; the module paths below are assumptions.
from data import load
from features import doc_to_ngrams, preprocess

ap = ArgumentParser()
add_args(ap, ('general', 'preproc', 'linear', 'test'))
opt = ap.parse_args()

# Translate the boolean command-line switch into the value expected
# downstream (e.g., by scikit-learn's class_weight parameter).
if opt.class_weight:
    opt.class_weight = "balanced"
else:
    opt.class_weight = None

basicConfig(level=opt.log_level, format='%(asctime)s %(message)s')

data_trn = load(opt.input_prefix)
data_tst = load(opt.test_prefix)

# Fit the n-gram vectorizer on the training documents; `v` is the
# fitted vectorizer, needed later to transform the test documents.
docs_trn, v, _ = doc_to_ngrams(data_trn.docs, min_df=opt.min_df,
                               cache_dir=opt.cache_dir,
                               dim_reduce=opt.dim_reduce,
                               c_ngmin=opt.c_ngmin, c_ngmax=opt.c_ngmax,
                               w_ngmin=opt.w_ngmin, w_ngmax=opt.w_ngmax,
                               lowercase=opt.lowercase)
# Apply the same n-gram preprocessing to the test documents.
docs_tst = preprocess(data_tst.docs,
                      c_ngmin=opt.c_ngmin, c_ngmax=opt.c_ngmax,
                      w_ngmin=opt.w_ngmin, w_ngmax=opt.w_ngmax,
                      lowercase=opt.lowercase)
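# A minimal sketch of the likely next step (not in the source): the
# script stops after feature extraction, so the classifier below and
# the v.transform() call on the preprocessed test documents are
# assumptions, suggested by the 'linear' option group and class_weight.
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

model = LinearSVC(class_weight=opt.class_weight)
model.fit(docs_trn, data_trn.labels)
predicted = model.predict(v.transform(docs_tst))
info('test accuracy: {:.4f}'.format(accuracy_score(data_tst.labels, predicted)))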
# --- training script: build features and labels from a single data set ---
from argparse import ArgumentParser
from logging import debug, info, basicConfig

import numpy as np

from cmdline import add_args
# Module paths assumed, as above.
from data import load
from features import doc_to_ngrams

seed = 1234

ap = ArgumentParser()
# The parser setup was missing here; arg groups assumed to mirror the
# script above, minus the 'test' group.
add_args(ap, ('general', 'preproc', 'linear'))
opt = ap.parse_args()

if opt.class_weight:
    opt.class_weight = "balanced"
else:
    opt.class_weight = None

basicConfig(level=opt.log_level, format='%(asctime)s %(message)s')
info('----start----')
# Log all option values for reproducibility.
info(','.join([k + '=' + str(vars(opt)[k]) for k in sorted(vars(opt))]))

# ---main---
data = load(opt.input_prefix)
docs, v, _ = doc_to_ngrams(data.docs, min_df=opt.min_df,
                           cache_dir=opt.cache_dir,
                           dim_reduce=opt.dim_reduce,
                           c_ngmin=opt.c_ngmin, c_ngmax=opt.c_ngmax,
                           w_ngmin=opt.w_ngmin, w_ngmax=opt.w_ngmax,
                           lowercase=opt.lowercase,
                           input_name=opt.input_prefix)
labels = np.array(data.labels)
info("number of word/character features ({}/{}): {}".format(
    opt.w_ngmax, opt.c_ngmax, len(v.vocabulary_)))
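# A minimal sketch of the likely next step (not in the source):
# cross-validate a linear classifier on `docs`/`labels`, reusing
# `seed` for reproducible fold shuffling. The model choice is an
# assumption based on the 'linear' option group.
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold, cross_val_score

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
scores = cross_val_score(LinearSVC(class_weight=opt.class_weight),
                         docs, labels, cv=skf)
info('CV accuracy: {:.4f} (+/- {:.4f})'.format(scores.mean(), scores.std()))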