예제 #1
0
    def main(self, args=None):
        """Compute out-of-fold SVC decision values for the training set and save them.

        The first entry of the params file is used to build a ``TextModel`` over
        the whole corpus.  For every K-fold split a fresh ``SVC`` is fitted on
        the training part and its ``decision_function`` output is stored for the
        held-out samples.  Each tweet of the training set is then written to
        ``self.get_output()`` as one JSON line carrying an extra
        ``'decision_function'`` entry.

        Args:
            args: Forwarded as-is to ``self.parser.parse_args``.

        Returns:
            list: One decision-function value per training sample, indexed like
            the corpus.
        """
        self.data = self.parser.parse_args(args=args)
        assert not self.data.update_klass
        verbosity = self.data.verbose
        logging.basicConfig(level=verbosity)
        logging.getLogger('b4msa').setLevel(verbosity)
        best = load_json(self.data.params_fname)[0]
        print(self.data.params_fname, self.data.training_set)
        corpus, labels = read_data_labels(self.data.training_set)
        encoder = LabelEncoder()
        encoder.fit(labels)
        y = encoder.transform(labels)
        text_model = TextModel(corpus, **best)
        X = [text_model[text] for text in corpus]
        hy = [None] * len(y)
        folds = KFold(n_splits=self.data.kratio,
                      shuffle=True,
                      random_state=self.data.seed)
        for train_idx, test_idx in folds.split(X):
            classifier = SVC(model=text_model)
            classifier.fit([X[j] for j in train_idx],
                           [y[j] for j in train_idx])
            decisions = classifier.decision_function([X[j] for j in test_idx])
            # Put each held-out decision value back at its sample's position.
            for position, value in zip(test_idx, decisions):
                hy[position] = value

        # NOTE(review): assumes tweet_iterator yields the tweets in the same
        # order read_data_labels returned them -- confirm.
        with open(self.get_output(), 'w') as fpt:
            for position, tweet in enumerate(
                    tweet_iterator(self.data.training_set)):
                tweet['decision_function'] = hy[position].tolist()
                fpt.write(json.dumps(tweet) + "\n")
        return hy
예제 #2
0
    def main(self, args=None):
        """Compute out-of-fold SVC decision values for the training set and save them.

        The params file may hold either a single parameter set or a list of
        them; in the latter case the first entry is used.  The parameters are
        passed through ``clean_params`` and used to build a ``TextModel`` over
        the whole corpus.  For every K-fold split a fresh ``SVC`` is fitted on
        the training part and its ``decision_function`` output is stored for
        the held-out samples.  Each tweet of the training set is then written
        to ``self.get_output()`` as one JSON line carrying an extra
        ``'decision_function'`` entry.

        Args:
            args: Forwarded as-is to ``self.parser.parse_args``.

        Returns:
            list: One decision-function value per training sample, indexed like
            the corpus.
        """
        self.data = self.parser.parse_args(args=args)
        assert not self.data.update_klass
        loaded = load_json(self.data.params_fname)
        best = clean_params(loaded[0] if isinstance(loaded, list) else loaded)
        print(self.data.params_fname, self.data.training_set)
        corpus, labels = read_data_labels(self.data.training_set)
        encoder = LabelEncoder()
        encoder.fit(labels)
        y = encoder.transform(labels)
        text_model = TextModel(corpus, **best)
        X = [text_model[text] for text in corpus]
        hy = [None] * len(y)
        folds = KFold(n_splits=self.data.kratio,
                      shuffle=True, random_state=self.data.seed)
        for train_idx, test_idx in folds.split(X):
            classifier = SVC(model=text_model)
            classifier.fit([X[j] for j in train_idx],
                           [y[j] for j in train_idx])
            decisions = classifier.decision_function([X[j] for j in test_idx])
            # Put each held-out decision value back at its sample's position.
            for position, value in zip(test_idx, decisions):
                hy[position] = value

        # NOTE(review): assumes tweet_iterator yields the tweets in the same
        # order read_data_labels returned them -- confirm.
        with open(self.get_output(), 'w') as fpt:
            for position, tweet in enumerate(
                    tweet_iterator(self.data.training_set)):
                tweet['decision_function'] = hy[position].tolist()
                fpt.write(json.dumps(tweet)+"\n")
        return hy
예제 #3
0
def test_SVC_predict():
    """Fit an SVC on ``text.json`` and check the label predicted for one text.

    The fixture ``text.json`` is looked up in the directory of this test
    module.
    """
    import os
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from b4msa.utils import read_data_labels
    # os.path.join instead of string concatenation: when dirname() is empty
    # (module referenced by a bare relative file name) concatenating
    # '/text.json' would point at the filesystem root.
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    corpus, _ = read_data_labels(fname)
    classifier = SVC(TextModel(corpus))
    classifier.fit_file(fname)
    predicted = classifier.predict_text('Excelente dia b4msa')
    assert predicted == 'POS'
예제 #4
0
def test_SVC_predict_from_file():
    """Fit an SVC on ``text.json`` and check every file prediction is a known label.

    The fixture ``text.json`` is looked up in the directory of this test
    module; the same file is used for fitting and for prediction.
    """
    import os
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from b4msa.utils import read_data_labels
    # os.path.join instead of string concatenation: when dirname() is empty
    # (module referenced by a bare relative file name) concatenating
    # '/text.json' would point at the filesystem root.
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    corpus, _ = read_data_labels(fname)
    classifier = SVC(TextModel(corpus))
    classifier.fit_file(fname)
    for label in classifier.predict_file(fname):
        assert label in ['POS', 'NEU', 'NEG']
예제 #5
0
def test_SVC_predict_from_file():
    """Fit an SVC on ``text.json`` and print its predictions for ``test_text.json``.

    Both file names are relative, so they are resolved against the current
    working directory.  The function only prints the predicted labels; it
    makes no assertions about them.
    """
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from b4msa.utils import read_data_labels
    fname = 'text.json'
    corpus, _ = read_data_labels(fname)
    classifier = SVC(TextModel(corpus))
    classifier.fit_file(fname)
    predicted = classifier.predict_file("test_text.json")
    # print() called with a single argument produces the same output on
    # Python 2 and Python 3; the bare print statement is a syntax error on 3.
    print("Final Labels")
    print(predicted)
예제 #6
0
    def main(self):
        """Run the k-fold parameter search and write the ranked results as JSON.

        Command-line options are parsed into ``self.data``.  ``numprocs`` is
        mapped as follows: ``1`` becomes ``None``, ``0`` becomes
        ``cpu_count()``, any other value is used unchanged.  ``n_folds``
        falls back to 5 when it was not given.  The result of
        ``SVC.predict_kfold_params`` is written to ``self.get_output()``,
        gzip-compressed when the output name ends in ``.gz``.

        Raises:
            AssertionError: If the score name (the part of ``score`` before
                the first ``:``) is not one of the supported scores.
        """
        self.data = self.parser.parse_args()
        requested_procs = self.data.numprocs
        if requested_procs == 0:
            numprocs = cpu_count()
        elif requested_procs == 1:
            numprocs = None
        else:
            numprocs = requested_procs

        n_folds = self.data.n_folds
        if n_folds is None:
            n_folds = 5

        known_scores = ('macrorecall', 'macrof1', 'microf1', 'weightedf1',
                        'accuracy', 'avgf1', 'avgf1f0')
        score_name = self.data.score.split(":")[0]
        assert score_name in known_scores, "Unknown score {0}".format(self.data.score)

        search_options = dict(
            bsize=self.data.samplesize,
            hill_climbing=self.data.hill_climbing,
            lang=self.data.lang
        )
        best_list = SVC.predict_kfold_params(
            self.data.training_set,
            n_folds=n_folds,
            score=self.data.score,
            numprocs=numprocs,
            seed=self.data.seed,
            param_kwargs=search_options
        )

        output = self.get_output()
        if not output.endswith('.gz'):
            with open(output, 'w') as fpt:
                fpt.write(json.dumps(best_list, indent=2, sort_keys=True))
            return
        # gzip.open in 'wb' mode expects bytes, so encode the JSON text.
        with gzip.open(output, 'wb') as fpt:
            serialized = json.dumps(best_list, indent=2, sort_keys=True)
            fpt.write(serialized.encode('utf-8'))
예제 #7
0
def test_kfold():
    """Check that 10-fold ``SVC.predict_kfold`` only predicts known labels.

    The fixture ``text.json`` is looked up in the directory of this test
    module; labels are read from its ``klass`` field and texts from ``text``.
    """
    import os
    from b4msa.classifier import SVC
    from b4msa.utils import read_data_labels
    # os.path.join instead of string concatenation: when dirname() is empty
    # (module referenced by a bare relative file name) concatenating
    # '/text.json' would point at the filesystem root.
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    X, y = read_data_labels(fname, get_klass='klass', get_tweet='text')
    for label in SVC.predict_kfold(X, y, n_folds=10):
        assert label in ['POS', 'NEU', 'NEG']
예제 #8
0
 def main(self):
     """Fit an SVC with the first parameter set of the params file and pickle it.

     Command-line options are parsed into ``self.data``; ``verbose`` is used
     as the level for both the root logging configuration and the ``b4msa``
     logger.  The fitted model is pickled to ``self.get_output()``.
     """
     self.data = self.parser.parse_args()
     verbosity = self.data.verbose
     logging.basicConfig(level=verbosity)
     logging.getLogger('b4msa').setLevel(verbosity)
     best = load_json(self.data.params_fname)[0]
     model = SVC.fit_from_file(self.data.training_set, best)
     with open(self.get_output(), 'wb') as fpt:
         pickle.dump(model, fpt)
예제 #9
0
def test_kfold_pool():
    """Check that 2-fold ``SVC.predict_kfold`` run with a process pool only predicts known labels.

    The fixture ``text.json`` is looked up in the directory of this test
    module; labels are read from its ``klass`` field and texts from ``text``.
    """
    import os
    from multiprocessing import Pool
    from b4msa.classifier import SVC
    from b4msa.utils import read_data_labels
    # os.path.join instead of string concatenation: when dirname() is empty
    # (module referenced by a bare relative file name) concatenating
    # '/text.json' would point at the filesystem root.
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    X, y = read_data_labels(fname, get_klass='klass', get_tweet='text')
    pool = Pool(2)
    try:
        hy = SVC.predict_kfold(X, y, n_folds=2, pool=pool)
        for label in hy:
            assert label in ['POS', 'NEU', 'NEG']
    finally:
        # Shut the workers down even when the call or an assertion fails,
        # so a failing test does not leave worker processes behind.
        pool.close()
        pool.join()
예제 #10
0
 def main(self):
     """Fit an SVC from the training set and save the resulting model.

     Parameters come from the params file when one is given (the first entry
     is used if the file holds a list), otherwise an empty dict is used.
     They are passed through ``clean_params`` and then updated with the JSON
     object given in the ``kwargs`` option, if any.  The fitted model is
     stored with ``save_model`` at ``self.get_output()``.
     """
     self.data = self.parser.parse_args()
     params_fname = self.data.params_fname
     if params_fname is None:
         best = dict()
     else:
         best = load_json(params_fname)
         if isinstance(best, list):
             best = best[0]
     best = clean_params(best)
     # Command-line overrides take precedence over the file's parameters.
     overrides = dict() if self.data.kwargs is None else json.loads(self.data.kwargs)
     best.update(overrides)
     model = SVC.fit_from_file(self.data.training_set, best)
     save_model(model, self.get_output())
예제 #11
0
    def main(self):
        """Fit an SVC with the first parameter set of the params file and pickle it.

        The params file is read as JSON; a name ending in ``.gz`` is opened
        with ``gzip`` and decoded as UTF-8 first.  The fitted model is
        pickled to ``self.get_output()``.
        """
        self.data = self.parser.parse_args()
        logging.basicConfig(level=self.data.verbose)
        params_fname = self.data.params_fname
        if not params_fname.endswith('.gz'):
            with open(params_fname) as fpt:
                raw_json = fpt.read()
        else:
            # gzip.open defaults to binary mode, so decode the bytes.
            with gzip.open(params_fname) as fpt:
                raw_json = fpt.read().decode('utf-8')
        best = json.loads(raw_json)[0]
        model = SVC.fit_from_file(self.data.training_set, best)

        with open(self.get_output(), 'wb') as fpt:
            pickle.dump(model, fpt)