def main(self, args=None):
    self.data = self.parser.parse_args(args=args)
    assert not self.data.update_klass
    logging.basicConfig(level=self.data.verbose)
    logger = logging.getLogger('b4msa')
    logger.setLevel(self.data.verbose)
    best = load_json(self.data.params_fname)[0]
    print(self.data.params_fname, self.data.training_set)
    corpus, labels = read_data_labels(self.data.training_set)
    le = LabelEncoder()
    le.fit(labels)
    y = le.transform(labels)
    t = TextModel(corpus, **best)
    X = [t[x] for x in corpus]
    hy = [None for _ in y]
    # Collect out-of-fold decision-function values, one entry per training sample
    for tr, ts in KFold(n_splits=self.data.kratio, shuffle=True,
                        random_state=self.data.seed).split(X):
        c = SVC(model=t)
        c.fit([X[x] for x in tr], [y[x] for x in tr])
        df = c.decision_function([X[x] for x in ts])
        for k, v in zip(ts, df):
            hy[k] = v
    i = 0
    with open(self.get_output(), 'w') as fpt:
        for tweet in tweet_iterator(self.data.training_set):
            tweet['decision_function'] = hy[i].tolist()
            i += 1
            fpt.write(json.dumps(tweet) + "\n")
    return hy
def main(self, args=None):
    self.data = self.parser.parse_args(args=args)
    assert not self.data.update_klass
    best = load_json(self.data.params_fname)
    if isinstance(best, list):
        best = best[0]
    best = clean_params(best)
    print(self.data.params_fname, self.data.training_set)
    corpus, labels = read_data_labels(self.data.training_set)
    le = LabelEncoder()
    le.fit(labels)
    y = le.transform(labels)
    t = TextModel(corpus, **best)
    X = [t[x] for x in corpus]
    hy = [None for _ in y]
    # Collect out-of-fold decision-function values, one entry per training sample
    for tr, ts in KFold(n_splits=self.data.kratio, shuffle=True,
                        random_state=self.data.seed).split(X):
        c = SVC(model=t)
        c.fit([X[x] for x in tr], [y[x] for x in tr])
        df = c.decision_function([X[x] for x in ts])
        for k, v in zip(ts, df):
            hy[k] = v
    i = 0
    with open(self.get_output(), 'w') as fpt:
        for tweet in tweet_iterator(self.data.training_set):
            tweet['decision_function'] = hy[i].tolist()
            i += 1
            fpt.write(json.dumps(tweet) + "\n")
    return hy
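# The two k-fold mains above write one JSON object per line, each carrying a
# 'decision_function' field. A minimal sketch (not part of b4msa) of reading
# those scores back; the file name 'output.json' is only an assumption.
import json

def read_decision_functions(path):
    """Collect the per-tweet decision_function vectors from a JSON-lines file."""
    scores = []
    with open(path) as fpt:
        for line in fpt:
            scores.append(json.loads(line)['decision_function'])
    return scores

# Example: scores = read_decision_functions('output.json')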
def test_SVC_predict():
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from b4msa.utils import read_data_labels
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    X, y = read_data_labels(fname)
    t = TextModel(X)
    c = SVC(t)
    c.fit_file(fname)
    y = c.predict_text('Excelente dia b4msa')
    assert y == 'POS'
def test_SVC_predict_from_file():
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from b4msa.utils import read_data_labels
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    X, y = read_data_labels(fname)
    t = TextModel(X)
    c = SVC(t)
    c.fit_file(fname)
    y = c.predict_file(fname)
    for i in y:
        assert i in ['POS', 'NEU', 'NEG']
def test_SVC_predict_from_file():
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from b4msa.utils import read_data_labels
    import os
    fname = os.path.dirname(__file__) + '/text.json'
    X, y = read_data_labels(fname)
    t = TextModel(X)
    c = SVC(t)
    c.fit_file(fname)
    y = c.predict_file("test_text.json")
    print("Final Labels")
    print(y)
def main(self):
    self.data = self.parser.parse_args()
    if self.data.numprocs == 1:
        numprocs = None
    elif self.data.numprocs == 0:
        numprocs = cpu_count()
    else:
        numprocs = self.data.numprocs
    n_folds = self.data.n_folds
    n_folds = n_folds if n_folds is not None else 5
    assert self.data.score.split(":")[0] in ('macrorecall', 'macrof1',
                                             'microf1', 'weightedf1',
                                             'accuracy', 'avgf1',
                                             'avgf1f0'), \
        "Unknown score {0}".format(self.data.score)
    best_list = SVC.predict_kfold_params(
        self.data.training_set,
        n_folds=n_folds,
        score=self.data.score,
        numprocs=numprocs,
        seed=self.data.seed,
        param_kwargs=dict(
            bsize=self.data.samplesize,
            hill_climbing=self.data.hill_climbing,
            # qsize=self.data.qsize,
            lang=self.data.lang
        )
    )
    output = self.get_output()
    if output.endswith('.gz'):
        with gzip.open(output, 'wb') as fpt:
            cdn = json.dumps(best_list, indent=2, sort_keys=True)
            fpt.write(bytes(cdn, encoding='utf-8'))
    else:
        with open(output, 'w') as fpt:
            fpt.write(json.dumps(best_list, indent=2, sort_keys=True))
def test_kfold():
    import os
    from b4msa.classifier import SVC
    from b4msa.utils import read_data_labels
    fname = os.path.dirname(__file__) + '/text.json'
    X, y = read_data_labels(fname, get_klass='klass', get_tweet='text')
    hy = SVC.predict_kfold(X, y, n_folds=10)
    for x in hy:
        assert x in ['POS', 'NEU', 'NEG']
def main(self):
    self.data = self.parser.parse_args()
    logging.basicConfig(level=self.data.verbose)
    logger = logging.getLogger('b4msa')
    logger.setLevel(self.data.verbose)
    params_fname = self.data.params_fname
    param_list = load_json(params_fname)
    best = param_list[0]
    svc = SVC.fit_from_file(self.data.training_set, best)
    with open(self.get_output(), 'wb') as fpt:
        pickle.dump(svc, fpt)
def test_kfold_pool():
    import os
    from b4msa.classifier import SVC
    from b4msa.utils import read_data_labels
    from multiprocessing import Pool
    fname = os.path.dirname(__file__) + '/text.json'
    X, y = read_data_labels(fname, get_klass='klass', get_tweet='text')
    pool = Pool(2)
    hy = SVC.predict_kfold(X, y, n_folds=2, pool=pool)
    for x in hy:
        assert x in ['POS', 'NEU', 'NEG']
    pool.close()
def main(self):
    self.data = self.parser.parse_args()
    params_fname = self.data.params_fname
    if params_fname is not None:
        best = load_json(params_fname)
        if isinstance(best, list):
            best = best[0]
    else:
        best = dict()
    best = clean_params(best)
    kw = json.loads(self.data.kwargs) if self.data.kwargs is not None else dict()
    best.update(kw)
    svc = SVC.fit_from_file(self.data.training_set, best)
    save_model(svc, self.get_output())
def main(self):
    self.data = self.parser.parse_args()
    logging.basicConfig(level=self.data.verbose)
    params_fname = self.data.params_fname
    if params_fname.endswith('.gz'):
        with gzip.open(params_fname) as fpt:
            cdn = fpt.read()
            param_list = json.loads(str(cdn, encoding='utf-8'))
    else:
        with open(params_fname) as fpt:
            param_list = json.loads(fpt.read())
    best = param_list[0]
    svc = SVC.fit_from_file(self.data.training_set, best)
    with open(self.get_output(), 'wb') as fpt:
        pickle.dump(svc, fpt)
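# Usage sketch (not part of the repository): the mains above persist the fitted
# classifier with pickle, and the tests call predict_text on an SVC instance.
# A minimal way to reuse a saved model; the path 'b4msa.model' is hypothetical.
import pickle

def load_and_predict(model_fname, text):
    """Load a pickled SVC and classify a single text."""
    with open(model_fname, 'rb') as fpt:
        svc = pickle.load(fpt)
    return svc.predict_text(text)

# Example: label = load_and_predict('b4msa.model', 'Excelente dia b4msa')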