Example #1
0
    def main(self, args=None):
        """Write k-fold decision-function values for every training tweet.

        The text model is built from the training sets (optionally only
        from the classes named in the ``TEXTMODEL_KLASSES`` environment
        variable).  A fresh ``ClassifierWrapper`` is fitted on each fold
        and its ``decision_function`` output for the held-out examples is
        collected.  Every tweet is then written as one JSON line, extended
        with a ``decision_function`` entry, to ``self.get_output()``.

        :param args: argument list forwarded to ``self.parser.parse_args``
        :return: list holding the decision-function value of each example
        """
        self.data = self.parser.parse_args(args=args)
        assert not self.data.update_klass
        logging.basicConfig(level=self.data.verbose)

        # An inline JSON configuration takes precedence over the params file.
        if self.data.conf:
            best = json.loads(self.data.conf)
        else:
            best = load_json(self.data.params_fname)[0]

        corpus = []
        labels = []
        for fname in self.data.training_set:
            texts, klasses = read_data_labels(fname)
            corpus += texts
            labels += klasses

        le = LabelEncoder()
        le.fit(self.data.labels.split(',') if self.data.labels else labels)
        y = le.transform(labels)

        best.setdefault('dist_vector', OPTION_NONE)
        klass_filter = os.environ.get('TEXTMODEL_KLASSES')
        if klass_filter:
            # Only documents of the selected classes feed the text model.
            selected = le.transform(klass_filter.split(','))
            model_docs = []
            model_targets = []
            for pos, doc in enumerate(corpus):
                if y[pos] in selected:
                    model_docs.append(doc)
                    model_targets.append(y[pos])
        else:
            model_docs, model_targets = corpus, y

        t = TextModel(model_docs, **best)
        if best['dist_vector'] != OPTION_NONE:
            t = DistTextModel(t, model_docs, model_targets,
                              le.classes_.shape[0], best['dist_vector'])

        X = [t[doc] for doc in corpus]
        hy = [None] * len(y)
        folds = KFold(n_splits=self.data.kratio,
                      shuffle=True,
                      random_state=self.data.seed)
        for tr, ts in folds.split(X):
            c = ClassifierWrapper()
            c.fit([X[k] for k in tr], [y[k] for k in tr])
            scores = c.decision_function([X[k] for k in ts])
            # Store each held-out score at its original position.
            for k, value in zip(ts, scores):
                hy[k] = value

        with open(self.get_output(), 'w') as fpt:
            tweets = (tweet
                      for fname in self.data.training_set
                      for tweet in tweet_iterator(fname))
            for pos, tweet in enumerate(tweets):
                tweet['decision_function'] = hy[pos].tolist()
                fpt.write(json.dumps(tweet) + "\n")
        return hy
Example #2
0
def test_test():
    """Run the b4msa ``params`` -> ``train`` -> ``test`` commands end to end.

    Each command is driven through ``sys.argv``.  The output written by
    ``test`` must contain at least one labelled example and every JSON
    line in it must carry a ``klass`` key.
    """
    from b4msa.command_line import params, train, test
    from microtc.utils import read_data_labels
    import os
    import sys
    import tempfile
    import json
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    saved_argv = sys.argv
    try:
        # A private temporary directory replaces the deprecated, race-prone
        # tempfile.mktemp and is removed even when a step below fails.
        with tempfile.TemporaryDirectory() as tmpdir:
            output = os.path.join(tmpdir, 'params')
            output2 = os.path.join(tmpdir, 'predicted')
            sys.argv = ['b4msa', '-o', output, '-k', '2', fname, '-s', '2']
            params()
            # The model deliberately overwrites the parameters file.
            sys.argv = ['b4msa', '-m', output, fname, '-o', output]
            train()
            sys.argv = ['b4msa', '-m', output, fname, '-o', output2]
            test()
            _, y = read_data_labels(output2)
            print(y)
            with open(output2) as fpt:
                a = [json.loads(x) for x in fpt]
    finally:
        # Do not leak the fake command line into other tests.
        sys.argv = saved_argv
    for x in a:
        assert 'klass' in x
    assert len(y)
Example #3
0
def test_test():
    """Run the b4msa ``params`` -> ``train`` -> ``test`` commands end to end.

    Each command is driven through ``sys.argv``.  The output written by
    ``test`` must contain at least one labelled example and every JSON
    line in it must carry a ``klass`` key.
    """
    from b4msa.command_line import params, train, test
    from microtc.utils import read_data_labels
    import os
    import sys
    import tempfile
    import json
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    saved_argv = sys.argv
    try:
        # A private temporary directory replaces the deprecated, race-prone
        # tempfile.mktemp and is removed even when a step below fails.
        with tempfile.TemporaryDirectory() as tmpdir:
            output = os.path.join(tmpdir, 'params')
            output2 = os.path.join(tmpdir, 'predicted')
            sys.argv = ['b4msa', '-o', output, '-k', '2', fname, '-s', '2']
            params()
            # The model deliberately overwrites the parameters file.
            sys.argv = ['b4msa', '-m', output, fname, '-o', output]
            train()
            sys.argv = ['b4msa', '-m', output, fname, '-o', output2]
            test()
            _, y = read_data_labels(output2)
            print(y)
            with open(output2) as fpt:
                a = [json.loads(x) for x in fpt]
    finally:
        # Do not leak the fake command line into other tests.
        sys.argv = saved_argv
    for x in a:
        assert 'klass' in x
    assert len(y)
Example #4
0
    def main(self, args=None):
        """Write k-fold decision-function values for every training tweet.

        Loads the parameters (the first entry when the file holds a list),
        builds a ``TextModel`` over the training corpus, fits an ``SVC`` on
        each fold and collects its ``decision_function`` output for the
        held-out examples.  Each tweet is written as one JSON line,
        extended with a ``decision_function`` entry, to
        ``self.get_output()``.

        :param args: argument list forwarded to ``self.parser.parse_args``
        :return: list holding the decision-function value of each example
        """
        self.data = self.parser.parse_args(args=args)
        assert not self.data.update_klass

        loaded = load_json(self.data.params_fname)
        best = clean_params(loaded[0] if isinstance(loaded, list) else loaded)
        print(self.data.params_fname, self.data.training_set)

        corpus, labels = read_data_labels(self.data.training_set)
        le = LabelEncoder()
        le.fit(labels)
        y = le.transform(labels)

        t = TextModel(corpus, **best)
        X = [t[doc] for doc in corpus]
        hy = [None] * len(y)
        folds = KFold(n_splits=self.data.kratio,
                      shuffle=True, random_state=self.data.seed)
        for tr, ts in folds.split(X):
            c = SVC(model=t)
            c.fit([X[k] for k in tr], [y[k] for k in tr])
            scores = c.decision_function([X[k] for k in ts])
            # Store each held-out score at its original position.
            for k, value in zip(ts, scores):
                hy[k] = value

        with open(self.get_output(), 'w') as fpt:
            tweets = tweet_iterator(self.data.training_set)
            for pos, tweet in enumerate(tweets):
                tweet['decision_function'] = hy[pos].tolist()
                fpt.write(json.dumps(tweet)+"\n")
        return hy
Example #5
0
def test_kfold():
    """Check that ``SVC.predict_kfold`` only yields the known labels."""
    import os
    from b4msa.classifier import SVC
    from microtc.utils import read_data_labels
    here = os.path.dirname(__file__)
    texts, klasses = read_data_labels(here + '/text.json',
                                      get_klass='klass',
                                      get_tweet='text')
    valid = ['POS', 'NEU', 'NEG']
    for predicted in SVC.predict_kfold(texts, klasses, n_folds=2):
        assert predicted in valid
Example #6
0
 def fit_file(self, fname, get_tweet='text', get_klass='klass',
              maxitems=1e100):
     """Fit the classifier with the examples stored in ``fname``.

     Texts and labels are read with ``read_data_labels``; each text is
     mapped through ``self.model`` before being handed to ``self.fit``.

     :param fname: file read by ``read_data_labels``
     :param get_tweet: forwarded to ``read_data_labels`` as ``get_tweet``
     :param get_klass: forwarded to ``read_data_labels`` as ``get_klass``
     :param maxitems: forwarded to ``read_data_labels`` as ``maxitems``
     :return: ``self``
     """
     reader_options = dict(get_klass=get_klass,
                           get_tweet=get_tweet,
                           maxitems=maxitems)
     texts, klasses = read_data_labels(fname, **reader_options)
     vectors = [self.model[text] for text in texts]
     self.fit(vectors, klasses)
     return self
Example #7
0
def test_kfold_pool():
    """Check ``SVC.predict_kfold`` when the folds run on a process pool.

    Every prediction must be one of the known labels.
    """
    import os
    from b4msa.classifier import SVC
    from microtc.utils import read_data_labels
    from multiprocessing import Pool
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    X, y = read_data_labels(fname, get_klass='klass', get_tweet='text')
    pool = Pool(2)
    try:
        hy = SVC.predict_kfold(X, y, n_folds=2, pool=pool)
    finally:
        # Release the worker processes even when the prediction fails.
        pool.close()
        pool.join()
    for x in hy:
        assert x in ['POS', 'NEU', 'NEG']
Example #8
0
def test_SVC_predict():
    """Fit an ``SVC`` from the sample file and classify a positive text."""
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from microtc.utils import read_data_labels
    import os
    here = os.path.dirname(__file__)
    fname = here + '/text.json'
    texts, _ = read_data_labels(fname)
    classifier = SVC(TextModel(texts))
    classifier.fit_file(fname)
    predicted = classifier.predict_text('Excelente dia b4msa')
    assert predicted == 'POS'
Example #9
0
def test_SVC_predict_from_file():
    """Fit an ``SVC`` from the sample file and predict that same file.

    Every prediction must be one of the known labels.
    """
    from b4msa.classifier import SVC
    from b4msa.textmodel import TextModel
    from microtc.utils import read_data_labels
    import os
    here = os.path.dirname(__file__)
    fname = here + '/text.json'
    texts, _ = read_data_labels(fname)
    classifier = SVC(TextModel(texts))
    classifier.fit_file(fname)
    valid = ['POS', 'NEU', 'NEG']
    for predicted in classifier.predict_file(fname):
        assert predicted in valid
Example #10
0
    def predict_kfold_params(cls, fname, n_folds=10, score=None, numprocs=None, seed=0, param_kwargs=None):
        """Search for parameters using k-fold evaluation on ``fname``.

        When ``numprocs`` is given a process pool of that size is created.
        If ``n_folds`` is a multiple of ``numprocs`` the pool is handed to
        the ``Wrapper`` (which receives ``n_folds``); otherwise it is
        handed to ``ParameterSelection.search``.  The pool is always
        closed before returning.

        :param fname: file read with ``read_data_labels``
        :param n_folds: number of folds passed to ``Wrapper``
        :param score: score specification passed to ``Wrapper``
        :param numprocs: pool size; ``None`` means no pool is created
        :param seed: seed passed to ``Wrapper``
        :param param_kwargs: extra keyword arguments for ``search``
            (``None`` is treated as no extra arguments)
        :return: whatever ``ParameterSelection().search`` returns
        """
        from b4msa.params import ParameterSelection, Wrapper
        X, y = read_data_labels(fname)
        if numprocs is not None:
            pool = Pool(numprocs)
        else:
            pool = None
            numprocs = 1

        try:
            if n_folds % numprocs == 0:
                f = Wrapper(X, y, score, n_folds, cls, pool=pool, seed=seed)
                search_pool = None
            else:
                f = Wrapper(X, y, score, n_folds, cls, seed=seed)
                search_pool = pool

            return ParameterSelection().search(f.f, pool=search_pool,
                                               **(param_kwargs or {}))
        finally:
            # The pool is owned by this call: release its worker processes
            # whether the search succeeded or raised.
            if pool is not None:
                pool.close()
                pool.join()
Example #11
0
def test_test():
    """Run the microtc ``params`` -> ``train`` -> ``predict`` commands.

    The commands receive their arguments directly.  The parameters,
    model and prediction files must all have been produced, and the
    sample file must contain labelled examples.
    """
    from microtc.command_line import params, train, predict
    from microtc.utils import read_data_labels
    import os
    import tempfile
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    # A private temporary directory replaces the deprecated, race-prone
    # tempfile.mktemp and is removed even when a step below fails.
    with tempfile.TemporaryDirectory() as tmpdir:
        output = os.path.join(tmpdir, 'params')
        params('-o', output, '-k', '0.5:0.5', fname, '-s', '2')
        params('-o', output, '-k', '2', fname, '-s', '2')
        train('-o', output + '.model', '-m', output, fname)
        predict('-m', output + '.model', fname, '-o', output + '.predicted')
        # Explicit stand-in for the existence check the former
        # os.unlink calls performed implicitly.
        assert os.path.isfile(output)
        assert os.path.isfile(output + '.model')
        assert os.path.isfile(output + '.predicted')
    X, y = read_data_labels(fname)
    print(y)
    assert len(y)
Example #12
0
def test_predict_from_file():
    """Fit a ``ClassifierWrapper`` on the sample file and predict it back.

    Labels are encoded with ``LabelEncoder`` for fitting and decoded
    again before checking that every prediction is a known label.
    """
    from microtc.wrappers import ClassifierWrapper
    from microtc.textmodel import TextModel
    from microtc.utils import read_data_labels
    from sklearn.preprocessing import LabelEncoder

    import os
    here = os.path.dirname(__file__)
    corpus, labels = read_data_labels(here + '/text.json')
    model = TextModel(corpus)
    encoder = LabelEncoder()
    encoder.fit(labels)
    targets = encoder.transform(labels)
    classifier = ClassifierWrapper()
    vectors = [model[text] for text in corpus]
    classifier.fit(vectors, targets)
    predicted = encoder.inverse_transform(classifier.predict(vectors))
    valid = ['POS', 'NEU', 'NEG']
    for klass in predicted:
        assert klass in valid
Example #13
0
def test_test():
    """Run the microtc ``params`` -> ``train`` -> ``predict`` commands.

    Each command is driven through ``sys.argv``.  The file written by
    ``predict`` must contain at least one labelled example.
    """
    from microtc.command_line import params, train, predict
    from microtc.utils import read_data_labels
    import os
    import sys
    import tempfile
    fname = os.path.join(os.path.dirname(__file__), 'text.json')
    saved_argv = sys.argv
    try:
        # A private temporary directory replaces the deprecated, race-prone
        # tempfile.mktemp and is removed even when a step below fails.
        with tempfile.TemporaryDirectory() as tmpdir:
            output = os.path.join(tmpdir, 'params')
            output2 = os.path.join(tmpdir, 'predicted')
            sys.argv = ['microtc', '-o', output, '-k', '2', fname, '-s', '2']
            params()
            # The model deliberately overwrites the parameters file.
            sys.argv = ['microtc', '-m', output, fname, '-o', output]
            train()
            sys.argv = ['microtc', '-m', output, fname, '-o', output2]
            predict()
            _, y = read_data_labels(output2)
            print(y)
    finally:
        # Do not leak the fake command line into other tests.
        sys.argv = saved_argv
    assert len(y)
Example #14
0
    def predict_kfold_params(cls,
                             fname,
                             n_folds=10,
                             score=None,
                             numprocs=None,
                             seed=0,
                             param_kwargs=None):
        """Search for parameters using k-fold evaluation on ``fname``.

        When ``numprocs`` is given a process pool of that size is created.
        If ``n_folds`` is a multiple of ``numprocs`` the pool is handed to
        the ``Wrapper`` (which receives ``n_folds``); otherwise it is
        handed to ``ParameterSelection.search``.  The pool is always
        closed before returning.

        :param fname: file read with ``read_data_labels``
        :param n_folds: number of folds passed to ``Wrapper``
        :param score: score specification passed to ``Wrapper``
        :param numprocs: pool size; ``None`` means no pool is created
        :param seed: seed passed to ``Wrapper``
        :param param_kwargs: extra keyword arguments for ``search``
            (``None`` is treated as no extra arguments)
        :return: whatever ``ParameterSelection().search`` returns
        """
        from b4msa.params import ParameterSelection, Wrapper
        X, y = read_data_labels(fname)
        if numprocs is not None:
            pool = Pool(numprocs)
        else:
            pool = None
            numprocs = 1

        try:
            if n_folds % numprocs == 0:
                f = Wrapper(X, y, score, n_folds, cls, pool=pool, seed=seed)
                search_pool = None
            else:
                f = Wrapper(X, y, score, n_folds, cls, seed=seed)
                search_pool = pool

            return ParameterSelection().search(f.f, pool=search_pool,
                                               **(param_kwargs or {}))
        finally:
            # The pool is owned by this call: release its worker processes
            # whether the search succeeded or raised.
            if pool is not None:
                pool.close()
                pool.join()
Example #15
0
 def fit_file(self,
              fname,
              get_tweet='text',
              get_klass='klass',
              maxitems=1e100):
     """Fit the classifier with the examples stored in ``fname``.

     Texts and labels are read with ``read_data_labels``; each text is
     mapped through ``self.model`` before being handed to ``self.fit``.

     :param fname: file read by ``read_data_labels``
     :param get_tweet: forwarded to ``read_data_labels`` as ``get_tweet``
     :param get_klass: forwarded to ``read_data_labels`` as ``get_klass``
     :param maxitems: forwarded to ``read_data_labels`` as ``maxitems``
     :return: ``self``
     """
     reader_options = dict(get_klass=get_klass,
                           get_tweet=get_tweet,
                           maxitems=maxitems)
     texts, klasses = read_data_labels(fname, **reader_options)
     vectors = [self.model[text] for text in texts]
     self.fit(vectors, klasses)
     return self
Example #16
0
def test_read_data_labels():
    """Smoke test: ``read_data_labels`` loads the sample file without error."""
    import os
    from microtc.utils import read_data_labels
    here = os.path.dirname(__file__)
    read_data_labels(os.path.join(here, "text.json"))