def main(self, args=None):
    """Cross-validate a classifier and dump per-sample decision values.

    Parses the command line, loads every training file, builds the text
    model, runs a shuffled k-fold, and writes each input record back to the
    output file with an added ``decision_function`` entry.

    Parameters
    ----------
    args : list of str, optional
        Forwarded to ``self.parser.parse_args``.

    Returns
    -------
    list
        One decision-function value per training sample, in the order the
        samples were read.

    Notes
    -----
    If the ``TEXTMODEL_KLASSES`` environment variable is set (comma-separated
    labels), the text model is built only from the documents carrying one of
    those labels; the classifier is still cross-validated on every document.
    """
    self.data = self.parser.parse_args(args=args)
    assert not self.data.update_klass
    logging.basicConfig(level=self.data.verbose)
    if self.data.conf:
        best = json.loads(self.data.conf)
    else:
        best = load_json(self.data.params_fname)[0]

    # Concatenate all training files into a single corpus.
    corpus, labels = [], []
    for train in self.data.training_set:
        X_, y_ = read_data_labels(train)
        corpus.extend(X_)
        labels.extend(y_)

    le = LabelEncoder()
    if self.data.labels:
        # An explicit, comma-separated label list overrides the labels
        # observed in the data.
        le.fit(self.data.labels.split(','))
    else:
        le.fit(labels)
    y = le.transform(labels)

    model_klasses = os.environ.get('TEXTMODEL_KLASSES')
    best.setdefault('dist_vector', OPTION_NONE)
    if model_klasses:
        # A set gives constant-time membership tests in the filter below.
        model_klasses = set(le.transform(model_klasses.split(',')))
        model_docs = []
        model_labels = []
        for doc, klass in zip(corpus, y):
            if klass in model_klasses:
                model_docs.append(doc)
                model_labels.append(klass)
    else:
        model_docs, model_labels = corpus, y

    t = TextModel(model_docs, **best)
    if best['dist_vector'] != OPTION_NONE:
        t = DistTextModel(t, model_docs, model_labels,
                          le.classes_.shape[0], best['dist_vector'])

    X = [t[x] for x in corpus]
    hy = [None] * len(y)
    folds = KFold(n_splits=self.data.kratio, shuffle=True,
                  random_state=self.data.seed)
    for tr, ts in folds.split(X):
        c = ClassifierWrapper()
        c.fit([X[x] for x in tr], [y[x] for x in tr])
        scores = c.decision_function([X[x] for x in ts])
        # Put each held-out score back at its original sample position.
        for k, v in zip(ts, scores):
            hy[k] = v

    # The records are re-read in the same file order used to build `corpus`,
    # so the running index lines up with `hy`.
    i = 0
    with open(self.get_output(), 'w') as fpt:
        for train in self.data.training_set:
            for tweet in tweet_iterator(train):
                tweet['decision_function'] = hy[i].tolist()
                i += 1
                fpt.write(json.dumps(tweet) + "\n")
    return hy
def test_test():
    """Run the b4msa params -> train -> test pipeline end to end.

    Every record written by the ``test`` command must carry a ``klass`` key,
    and at least one label must be readable from the prediction file.
    """
    from b4msa.command_line import params, train, test
    from microtc.utils import read_data_labels
    import os
    import sys
    import tempfile
    import json

    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    saved_argv = sys.argv
    # A private temporary directory replaces the race-prone tempfile.mktemp
    # and is removed even when a step or an assertion fails.
    with tempfile.TemporaryDirectory() as tmpdir:
        output = os.path.join(tmpdir, 'model')
        output2 = os.path.join(tmpdir, 'predicted')
        try:
            sys.argv = ['b4msa', '-o', output, '-k', '2', fname, '-s', '2']
            params()
            # The model overwrites the parameters file, as in the CLI usage.
            sys.argv = ['b4msa', '-m', output, fname, '-o', output]
            train()
            sys.argv = ['b4msa', '-m', output, fname, '-o', output2]
            test()
        finally:
            # Do not leak the fake command line into other tests.
            sys.argv = saved_argv
        X, y = read_data_labels(output2)
        print(y)
        with open(output2) as fpt:
            a = [json.loads(x) for x in fpt]
    for x in a:
        assert 'klass' in x
    assert len(y)
def main(self, args=None):
    """Cross-validate an SVC on the training set and dump decision values.

    Parses the command line, loads the stored parameters (the first entry
    when the file holds a list), builds the text model, runs a shuffled
    k-fold, and writes each input record back to the output file with an
    added ``decision_function`` entry.

    Parameters
    ----------
    args : list of str, optional
        Forwarded to ``self.parser.parse_args``.

    Returns
    -------
    list
        One decision-function value per training sample, in input order.
    """
    self.data = self.parser.parse_args(args=args)
    assert not self.data.update_klass
    best = load_json(self.data.params_fname)
    if isinstance(best, list):
        best = best[0]
    best = clean_params(best)
    print(self.data.params_fname, self.data.training_set)

    corpus, labels = read_data_labels(self.data.training_set)
    le = LabelEncoder()
    le.fit(labels)
    y = le.transform(labels)
    t = TextModel(corpus, **best)
    X = [t[x] for x in corpus]

    hy = [None] * len(y)
    folds = KFold(n_splits=self.data.kratio, shuffle=True,
                  random_state=self.data.seed)
    for tr, ts in folds.split(X):
        c = SVC(model=t)
        c.fit([X[x] for x in tr], [y[x] for x in tr])
        scores = c.decision_function([X[x] for x in ts])
        # Put each held-out score back at its original sample position.
        for k, v in zip(ts, scores):
            hy[k] = v

    # The records are re-read from the same file used to build `corpus`, so
    # the enumeration index lines up with `hy`.
    with open(self.get_output(), 'w') as fpt:
        for i, tweet in enumerate(tweet_iterator(self.data.training_set)):
            tweet['decision_function'] = hy[i].tolist()
            fpt.write(json.dumps(tweet) + "\n")
    return hy
def test_kfold():
    """Check that 2-fold prediction only produces the known labels."""
    import os
    from b4msa.classifier import SVC
    from microtc.utils import read_data_labels

    known_labels = ['POS', 'NEU', 'NEG']
    fname = os.path.dirname(__file__) + '/text.json'
    texts, klasses = read_data_labels(fname, get_klass='klass',
                                      get_tweet='text')
    predictions = SVC.predict_kfold(texts, klasses, n_folds=2)
    assert all(predicted in known_labels for predicted in predictions)
def fit_file(self, fname, get_tweet='text', get_klass='klass', maxitems=1e100):
    """Fit the classifier from a labelled data file.

    The texts and labels are read with ``read_data_labels``; each text is
    converted with ``self.model`` before being handed to ``self.fit``.

    Parameters
    ----------
    fname : passed through to ``read_data_labels``.
    get_tweet, get_klass : passed through to ``read_data_labels``.
    maxitems : passed through to ``read_data_labels``.

    Returns
    -------
    self
    """
    texts, klasses = read_data_labels(
        fname,
        get_klass=get_klass,
        get_tweet=get_tweet,
        maxitems=maxitems,
    )
    vectors = []
    for text in texts:
        vectors.append(self.model[text])
    self.fit(vectors, klasses)
    return self
def test_kfold_pool():
    """Check 2-fold prediction through a process pool.

    Every predicted label must be one of the known labels. The pool is
    always shut down, even when prediction or an assertion fails, so no
    worker processes are left behind.
    """
    import os
    from b4msa.classifier import SVC
    from microtc.utils import read_data_labels
    from multiprocessing import Pool

    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    X, y = read_data_labels(fname, get_klass='klass', get_tweet='text')
    pool = Pool(2)
    try:
        hy = SVC.predict_kfold(X, y, n_folds=2, pool=pool)
        for x in hy:
            assert x in ['POS', 'NEU', 'NEG']
    finally:
        # close() stops accepting work; join() waits for the workers to exit.
        pool.close()
        pool.join()
def test_SVC_predict():
    """A clearly positive sentence must be classified as ``POS``."""
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from microtc.utils import read_data_labels
    import os

    fname = os.path.dirname(__file__) + '/text.json'
    corpus, _labels = read_data_labels(fname)
    text_model = TextModel(corpus)
    classifier = SVC(text_model)
    classifier.fit_file(fname)
    predicted = classifier.predict_text('Excelente dia b4msa')
    assert predicted == 'POS'
def test_SVC_predict_from_file():
    """Predicting a whole file must only produce the known labels."""
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from microtc.utils import read_data_labels
    import os

    known_labels = ['POS', 'NEU', 'NEG']
    fname = os.path.dirname(__file__) + '/text.json'
    corpus, _labels = read_data_labels(fname)
    text_model = TextModel(corpus)
    classifier = SVC(text_model)
    classifier.fit_file(fname)
    predictions = classifier.predict_file(fname)
    assert all(predicted in known_labels for predicted in predictions)
def predict_kfold_params(cls, fname, n_folds=10, score=None, numprocs=None, seed=0, param_kwargs={}):
    """Search for parameters using k-fold evaluation on a data file.

    Parameters
    ----------
    fname : data file read with ``read_data_labels``.
    n_folds : int
        Number of folds handed to ``Wrapper``.
    score : forwarded to ``Wrapper``.
    numprocs : int, optional
        When given, a ``Pool`` with that many processes is created for the
        duration of the search.
    seed : forwarded to ``Wrapper``.
    param_kwargs : dict
        Extra keyword arguments for ``ParameterSelection.search``. The
        default dict is only unpacked, never mutated, so sharing it across
        calls is harmless.

    Returns
    -------
    Whatever ``ParameterSelection().search`` returns.
    """
    from b4msa.params import ParameterSelection, Wrapper

    X, y = read_data_labels(fname)
    if numprocs is not None:
        owned_pool = Pool(numprocs)
    else:
        owned_pool = None
        numprocs = 1
    try:
        if n_folds % numprocs == 0:
            # The folds split evenly over the processes: give the pool to
            # the wrapper and run the search itself without one.
            f = Wrapper(X, y, score, n_folds, cls, pool=owned_pool, seed=seed)
            search_pool = None
        else:
            f = Wrapper(X, y, score, n_folds, cls, seed=seed)
            search_pool = owned_pool
        return ParameterSelection().search(f.f, pool=search_pool,
                                           **param_kwargs)
    finally:
        # The pool is created here, so it is released here; previously its
        # worker processes were never shut down.
        if owned_pool is not None:
            owned_pool.close()
            owned_pool.join()
def test_test():
    """Run the microtc params -> train -> predict pipeline end to end.

    ``params`` is exercised with both a ratio-style ``-k`` value and an
    integer one; each pipeline stage must leave its output file behind.
    """
    from microtc.command_line import params, train, predict
    from microtc.utils import read_data_labels
    import os
    import tempfile

    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    # A private temporary directory replaces the race-prone tempfile.mktemp
    # and is removed even when a step or an assertion fails.
    with tempfile.TemporaryDirectory() as tmpdir:
        output = os.path.join(tmpdir, 'params')
        model = output + '.model'
        predicted = output + '.predicted'
        params('-o', output, '-k', '0.5:0.5', fname, '-s', '2')
        params('-o', output, '-k', '2', fname, '-s', '2')
        train('-o', model, '-m', output, fname)
        predict('-m', model, fname, '-o', predicted)
        # The original os.unlink calls failed if a file was missing; keep
        # that check explicit now that cleanup is automatic.
        for produced in (output, model, predicted):
            assert os.path.isfile(produced)
    # NOTE(review): labels are read from the input file, not from the
    # prediction output -- confirm this is intended.
    X, y = read_data_labels(fname)
    print(y)
    assert len(y)
def test_predict_from_file():
    """Train on the bundled corpus and check the decoded predictions.

    Labels are encoded before fitting and decoded after predicting; every
    decoded prediction must be one of the known labels.
    """
    from microtc.wrappers import ClassifierWrapper
    from microtc.textmodel import TextModel
    from microtc.utils import read_data_labels
    from sklearn.preprocessing import LabelEncoder
    import os

    known_labels = ['POS', 'NEU', 'NEG']
    fname = os.path.dirname(__file__) + '/text.json'
    corpus, labels = read_data_labels(fname)

    text_model = TextModel(corpus)
    encoder = LabelEncoder()
    encoder.fit(labels)
    encoded = encoder.transform(labels)

    classifier = ClassifierWrapper()
    vectors = []
    for text in corpus:
        vectors.append(text_model[text])
    classifier.fit(vectors, encoded)

    raw_predictions = classifier.predict(vectors)
    decoded = encoder.inverse_transform(raw_predictions)
    assert all(label in known_labels for label in decoded)
def test_test():
    """Run the microtc params -> train -> predict pipeline via ``sys.argv``.

    At least one label must be readable from the prediction file.
    """
    from microtc.command_line import params, train, predict
    from microtc.utils import read_data_labels
    import os
    import sys
    import tempfile

    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    saved_argv = sys.argv
    # A private temporary directory replaces the race-prone tempfile.mktemp
    # and is removed even when a step or an assertion fails.
    with tempfile.TemporaryDirectory() as tmpdir:
        output = os.path.join(tmpdir, 'model')
        output2 = os.path.join(tmpdir, 'predicted')
        try:
            sys.argv = ['microtc', '-o', output, '-k', '2', fname, '-s', '2']
            params()
            # The model overwrites the parameters file, as in the CLI usage.
            sys.argv = ['microtc', '-m', output, fname, '-o', output]
            train()
            sys.argv = ['microtc', '-m', output, fname, '-o', output2]
            predict()
        finally:
            # Do not leak the fake command line into other tests.
            sys.argv = saved_argv
        X, y = read_data_labels(output2)
        print(y)
    assert len(y)
def test_read_data_labels():
    """Smoke test: reading the bundled ``text.json`` must not raise."""
    import os
    from microtc.utils import read_data_labels

    here = os.path.dirname(__file__)
    read_data_labels(os.path.join(here, "text.json"))