Ejemplo n.º 1
0
from argparse import ArgumentParser
from logging import debug, info, basicConfig

from cmdline import add_args

# Assemble the parser from the shared option groups and read the arguments.
ap = ArgumentParser()
add_args(ap, ('general', 'preproc', 'linear', 'test'))
opt = ap.parse_args()

# Collapse the class_weight option: any truthy value becomes "balanced",
# anything else becomes None.
opt.class_weight = "balanced" if opt.class_weight else None

basicConfig(format='%(asctime)s %(message)s', level=opt.log_level)

# Training data comes from the input prefix, test data from the test prefix.
data_trn = load(opt.input_prefix)
data_tst = load(opt.test_prefix)

# N-gram range and casing options, gathered once and forwarded as keywords.
ngram_kwargs = dict(
    c_ngmin=opt.c_ngmin,
    c_ngmax=opt.c_ngmax,
    w_ngmin=opt.w_ngmin,
    w_ngmax=opt.w_ngmax,
    lowercase=opt.lowercase,
)

# Only the first two returned values are kept; the third is discarded.
docs_trn, v, _ = doc_to_ngrams(
    data_trn.docs,
    min_df=opt.min_df,
    cache_dir=opt.cache_dir,
    dim_reduce=opt.dim_reduce,
    **ngram_kwargs
)
docs_tst = preprocess(data_tst.docs,
                      c_ngmin=opt.c_ngmin,
                      c_ngmax=opt.c_ngmax,
                      w_ngmin=opt.w_ngmin,
Ejemplo n.º 2
0
seed = 1234

# Collapse the class_weight option: any truthy value becomes "balanced",
# anything else becomes None.
opt.class_weight = "balanced" if opt.class_weight else None

from logging import debug, info, basicConfig
basicConfig(format='%(asctime)s %(message)s', level=opt.log_level)

info('----start----')
# Log every parsed option as a comma-separated "name=value" list, sorted by
# option name.
opt_items = sorted(vars(opt).items())
info(','.join('='.join((name, str(value))) for name, value in opt_items))

# ---main---

data = load(opt.input_prefix)

# All feature-extraction settings, gathered once and forwarded as keywords.
ngram_kwargs = dict(
    min_df=opt.min_df,
    cache_dir=opt.cache_dir,
    dim_reduce=opt.dim_reduce,
    c_ngmin=opt.c_ngmin,
    c_ngmax=opt.c_ngmax,
    w_ngmin=opt.w_ngmin,
    w_ngmax=opt.w_ngmax,
    lowercase=opt.lowercase,
    input_name=opt.input_prefix,
)

# Only the first two returned values are kept; the third is discarded.
docs, v, _ = doc_to_ngrams(data.docs, **ngram_kwargs)
labels = np.array(data.labels)

# Report the vocabulary size together with the maximum word/character
# n-gram orders.
feature_msg = "number of word/character features ({}/{}): {}"
info(feature_msg.format(opt.w_ngmax, opt.c_ngmax, len(v.vocabulary_)))
Ejemplo n.º 3
0
from argparse import ArgumentParser
from logging import debug, info, basicConfig

from cmdline import add_args

# Assemble the parser from the shared option groups and read the arguments.
ap = ArgumentParser()
add_args(ap, ('general', 'preproc', 'linear', 'test'))
opt = ap.parse_args()

# Collapse the class_weight option: any truthy value becomes "balanced",
# anything else becomes None.
opt.class_weight = "balanced" if opt.class_weight else None

basicConfig(format='%(asctime)s %(message)s', level=opt.log_level)

# Training data comes from the input prefix, test data from the test prefix.
data_trn = load(opt.input_prefix)
data_tst = load(opt.test_prefix)

# N-gram range and casing options shared by the training and test calls, so
# both receive exactly the same settings.
ngram_kwargs = dict(
    c_ngmin=opt.c_ngmin,
    c_ngmax=opt.c_ngmax,
    w_ngmin=opt.w_ngmin,
    w_ngmax=opt.w_ngmax,
    lowercase=opt.lowercase,
)

# Only the first two returned values are kept; the third is discarded.
docs_trn, v, _ = doc_to_ngrams(
    data_trn.docs,
    min_df=opt.min_df,
    cache_dir=opt.cache_dir,
    dim_reduce=opt.dim_reduce,
    **ngram_kwargs
)
docs_tst = preprocess(data_tst.docs, **ngram_kwargs)
Ejemplo n.º 4
0
# Collapse the class_weight option: any truthy value becomes "balanced",
# anything else becomes None.
opt.class_weight = "balanced" if opt.class_weight else None

from logging import debug, info, basicConfig
basicConfig(format='%(asctime)s %(message)s', level=opt.log_level)

info('----start----')
# Log every parsed option as a comma-separated "name=value" list, sorted by
# option name.
opt_items = sorted(vars(opt).items())
info(','.join('='.join((name, str(value))) for name, value in opt_items))


# ---main---

data = load(opt.input_prefix)

# All feature-extraction settings, gathered once and forwarded as keywords.
ngram_kwargs = dict(
    min_df=opt.min_df,
    cache_dir=opt.cache_dir,
    dim_reduce=opt.dim_reduce,
    c_ngmin=opt.c_ngmin,
    c_ngmax=opt.c_ngmax,
    w_ngmin=opt.w_ngmin,
    w_ngmax=opt.w_ngmax,
    lowercase=opt.lowercase,
    input_name=opt.input_prefix,
)

# Only the first two returned values are kept; the third is discarded.
docs, v, _ = doc_to_ngrams(data.docs, **ngram_kwargs)
labels = np.array(data.labels)

# Report the vocabulary size together with the maximum word/character
# n-gram orders.
feature_msg = "number of word/character features ({}/{}): {}"
info(feature_msg.format(opt.w_ngmax, opt.c_ngmax, len(v.vocabulary_)))