def main(self, args=None):
    """Compute out-of-fold decision-function values and write them per tweet.

    Parses ``args``, takes the first entry of the params file as the
    ``TextModel`` configuration, builds the text model over the whole
    training corpus and then, for every K-fold split, fits an ``SVC`` on
    the training indices and stores its ``decision_function`` output for
    the held-out indices.  Every tweet of the training set is then written
    to the output file as one JSON line with an added
    ``decision_function`` field.

    :param args: argument list forwarded to ``self.parser.parse_args``.
    :returns: list holding, per training example, the decision-function
        value computed while that example was held out.
    :raises AssertionError: if the parsed ``update_klass`` option is set.
    """
    self.data = self.parser.parse_args(args=args)
    assert not self.data.update_klass
    logging.basicConfig(level=self.data.verbose)
    logger = logging.getLogger('b4msa')
    logger.setLevel(self.data.verbose)
    # Only the first configuration stored in the params file is used.
    best = load_json(self.data.params_fname)[0]
    print(self.data.params_fname, self.data.training_set)
    corpus, labels = read_data_labels(self.data.training_set)
    le = LabelEncoder()
    le.fit(labels)
    y = le.transform(labels)
    t = TextModel(corpus, **best)
    X = [t[x] for x in corpus]
    # One slot per example; each slot is filled by the fold that holds
    # the example out.
    hy = [None] * len(y)
    folds = KFold(n_splits=self.data.kratio, shuffle=True,
                  random_state=self.data.seed)
    for tr, ts in folds.split(X):
        c = SVC(model=t)
        c.fit([X[x] for x in tr], [y[x] for x in tr])
        scores = c.decision_function([X[x] for x in ts])
        for k, v in zip(ts, scores):
            hy[k] = v
    with open(self.get_output(), 'w') as fpt:
        # The training set is read again so the full tweet records, not
        # just text and label, are written back out.
        for i, tweet in enumerate(tweet_iterator(self.data.training_set)):
            tweet['decision_function'] = hy[i].tolist()
            fpt.write(json.dumps(tweet) + "\n")
    return hy
def test_test():
    """Run the ``params`` -> ``train`` -> ``test`` command-line pipeline.

    Each entry point is driven through ``sys.argv``.  The prediction file
    written by ``test`` must yield a non-empty label list and every record
    in it must carry a ``q_voc_ratio`` key.

    Scratch files live in a private temporary directory that is removed
    even when a step fails, and ``sys.argv`` is restored afterwards so the
    test does not leak state into other tests.
    """
    from b4msa.command_line import params, train, test
    from b4msa.utils import read_data_labels
    import os
    import shutil
    import sys
    import tempfile
    import json
    fname = os.path.dirname(__file__) + '/text.json'
    # mkdtemp replaces the deprecated, race-prone tempfile.mktemp while
    # still handing the commands paths that do not exist yet.
    workdir = tempfile.mkdtemp()
    output = os.path.join(workdir, 'output')
    output2 = os.path.join(workdir, 'output2')
    saved_argv = sys.argv
    try:
        sys.argv = ['b4msa', '-o', output, '-k', '2', fname, '-s', '2']
        params()
        # ``train`` reads the params file and overwrites it with its own
        # output.
        sys.argv = ['b4msa', '-m', output, fname, '-o', output]
        train()
        sys.argv = ['b4msa', '-m', output, fname, '-o', output2]
        test()
        _, y = read_data_labels(output2)
        print(y)
        with open(output2) as fpt:
            a = [json.loads(x) for x in fpt]
    finally:
        sys.argv = saved_argv
        shutil.rmtree(workdir, ignore_errors=True)
    for x in a:
        assert 'q_voc_ratio' in x
    assert len(y)
def test_kfold():
    """Check that 10-fold predictions of ``SVC.predict_kfold`` are known labels."""
    import os
    from b4msa.classifier import SVC
    from b4msa.utils import read_data_labels
    valid_labels = ['POS', 'NEU', 'NEG']
    data_path = os.path.dirname(__file__) + '/text.json'
    texts, klasses = read_data_labels(data_path, get_klass='klass',
                                      get_tweet='text')
    predictions = SVC.predict_kfold(texts, klasses, n_folds=10)
    for predicted in predictions:
        assert predicted in valid_labels
def fit_file(self, fname, get_tweet='text', get_klass='klass', maxitems=1e100):
    """Fit this classifier on the examples stored in ``fname``.

    :param fname: file handed to ``read_data_labels``.
    :param get_tweet: forwarded to ``read_data_labels`` as ``get_tweet``.
    :param get_klass: forwarded to ``read_data_labels`` as ``get_klass``.
    :param maxitems: forwarded to ``read_data_labels`` as ``maxitems``.
    :returns: ``self``, so calls can be chained.
    """
    texts, klasses = read_data_labels(fname, get_klass=get_klass,
                                      get_tweet=get_tweet, maxitems=maxitems)
    # Each text is mapped through the text model before fitting.
    vectors = [self.model[text] for text in texts]
    self.fit(vectors, klasses)
    return self
def test_kfold_pool():
    """Check ``SVC.predict_kfold`` with a two-process pool yields known labels.

    The pool is closed and joined in a ``finally`` clause so the worker
    processes are released even when the prediction call or an assertion
    fails.
    """
    import os
    from b4msa.classifier import SVC
    from b4msa.utils import read_data_labels
    from multiprocessing import Pool
    fname = os.path.dirname(__file__) + '/text.json'
    X, y = read_data_labels(fname, get_klass='klass', get_tweet='text')
    pool = Pool(2)
    try:
        hy = SVC.predict_kfold(X, y, n_folds=2, pool=pool)
        for x in hy:
            assert x in ['POS', 'NEU', 'NEG']
    finally:
        pool.close()
        pool.join()
def test_SVC_predict():
    """Fit an ``SVC`` on ``text.json`` and expect 'POS' for a sample sentence."""
    import os
    from b4msa.utils import read_data_labels
    from b4msa.textmodel import TextModel
    from b4msa.classifier import SVC
    data_path = os.path.dirname(__file__) + '/text.json'
    corpus, _ = read_data_labels(data_path)
    text_model = TextModel(corpus)
    classifier = SVC(text_model)
    classifier.fit_file(data_path)
    predicted = classifier.predict_text('Excelente dia b4msa')
    assert predicted == 'POS'
def test_SVC_predict_from_file():
    """Fit an ``SVC`` on ``text.json`` and check its predictions for that same file."""
    import os
    from b4msa.utils import read_data_labels
    from b4msa.textmodel import TextModel
    from b4msa.classifier import SVC
    valid_labels = ['POS', 'NEU', 'NEG']
    data_path = os.path.dirname(__file__) + '/text.json'
    corpus, _ = read_data_labels(data_path)
    text_model = TextModel(corpus)
    classifier = SVC(text_model)
    classifier.fit_file(data_path)
    for predicted in classifier.predict_file(data_path):
        assert predicted in valid_labels
def test_SVC_predict_from_file():
    """Fit an ``SVC`` on ``text.json`` and print its labels for ``test_text.json``.

    Both file names are given without a directory, so they are resolved
    against the current working directory.  The test makes no assertions;
    it only checks that fitting and predicting run without raising.
    """
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from b4msa.utils import read_data_labels
    fname = 'text.json'
    X, y = read_data_labels(fname)
    t = TextModel(X)
    c = SVC(t)
    c.fit_file(fname)
    y = c.predict_file("test_text.json")
    # Single-argument print(...) calls are valid on both Python 2 and
    # Python 3; the former print statements were a SyntaxError on Python 3.
    print("Final Labels")
    print(y)
def predict_kfold_params(cls, fname, n_folds=10, score=None, numprocs=None, seed=0, param_kwargs=None):
    """Run a parameter search scored by k-fold prediction on ``fname``.

    :param fname: file handed to ``read_data_labels``.
    :param n_folds: number of folds forwarded to ``Wrapper``.
    :param score: forwarded unchanged to ``Wrapper``.
    :param numprocs: size of the process pool to create; ``None`` runs
        without a pool.
    :param seed: forwarded to ``Wrapper`` as ``seed``.
    :param param_kwargs: extra keyword arguments for
        ``ParameterSelection.search``; ``None`` means no extras.
    :returns: whatever ``ParameterSelection().search`` returns.
    """
    from b4msa.params import ParameterSelection, Wrapper
    if param_kwargs is None:
        param_kwargs = {}
    X, y = read_data_labels(fname)
    pool = Pool(numprocs) if numprocs is not None else None
    workers = 1 if pool is None else numprocs
    try:
        if n_folds % workers == 0:
            # Folds split evenly over the workers: the pool goes to the
            # wrapper and the search itself runs without one.
            f = Wrapper(X, y, score, n_folds, cls, pool=pool, seed=seed)
            search_pool = None
        else:
            # Otherwise the pool goes to the search instead.
            f = Wrapper(X, y, score, n_folds, cls, seed=seed)
            search_pool = pool
        return ParameterSelection().search(f.f, pool=search_pool,
                                           **param_kwargs)
    finally:
        # The pool is created here, so it is released here; previously
        # its worker processes were never shut down.
        if pool is not None:
            pool.close()
            pool.join()
def fit_from_file(cls, fname, textModel_params=None):
    """Build a ``TextModel`` from ``fname`` and fit a ``cls`` classifier on it.

    :param fname: file handed to ``read_data_labels``.
    :param textModel_params: keyword arguments for ``TextModel``; ``None``
        means the ``TextModel`` defaults.
    :returns: the value returned by the classifier's ``fit`` call.
    """
    # None replaces the former shared mutable ``{}`` default.
    if textModel_params is None:
        textModel_params = {}
    X, y = read_data_labels(fname)
    model = TextModel(X, **textModel_params)
    svc = cls(model)
    return svc.fit([model[x] for x in X], y)
def test_read_data_labels():
    """Smoke test: ``read_data_labels`` loads the bundled ``text.json`` without raising."""
    from b4msa.utils import read_data_labels
    import os
    here = os.path.dirname(__file__)
    data_path = os.path.join(here, "text.json")
    read_data_labels(data_path)