# Example 1
def main():
    """Train a sentiment model on the Twitter data, write per-document
    prediction files for the val/test splits, and print evaluation scores.

    Side effects: creates ``<data_dir>/results/<split>/<pipeline>.json`` with
    one JSON object per document (id, predicted label, per-class probs).
    """
    sys.setrecursionlimit(5000)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    data = TwitterData()
    with data:
        # fix seed for reproducibility
        seed_rng(1234)

        # train -- other fitters (fit_voting, fit_logreg, fit_word2vec_bayes,
        # fit_nn_word, fit_cnn_char, fit_multiview_cnn_word_cnn_char,
        # fit_rnn_char_cnn_word) can be swapped in here
        senti_models = SentiModels(data)
        pipeline_name, pipeline = senti_models.fit_svm()

        test_data = [
            ('val', data.val_objs, data.val_docs, data.val_labels),
            ('test', data.test_objs, data.test_docs, data.test_labels)
        ]

        # predict & write results
        for name, objs, docs, labels in test_data:
            try:
                probs = pipeline.predict_proba(docs)
            except AttributeError:
                # pipeline has no predict_proba -- fall back to one-hot
                # encoded hard predictions
                probs = LabelBinarizer().fit(data.classes_).transform(pipeline.predict(docs))
            results_dir = os.path.join(data.data_dir, 'results', name)
            os.makedirs(results_dir, exist_ok=True)
            with temp_chdir(results_dir):
                with open('{}.json'.format(pipeline_name), 'w') as results_sr:
                    for obj, prob in zip(objs, probs):
                        # inner zip variable renamed to `p` -- it previously
                        # shadowed the outer `prob`, which was confusing
                        results_sr.write(json.dumps({
                            'id': obj['id'], 'label': data.classes_[np.argmax(prob)],
                            'probs': [(c, p.item()) for c, p in zip(data.classes_, prob)]
                        }) + '\n')
                with suppress(KeyError):
                    # splits without gold labels raise KeyError here and are
                    # simply not scored
                    labels = np.fromiter(labels, dtype='int32')
                    print('{} data: '.format(name))
                    write_score('{}'.format(pipeline_name), labels, probs, data.classes_, data.average_classes)
# Example 2
 def __init__(self):
     """Wire up the Yelp review splits; every file handle is registered on
     the instance's exit stack and stays open until the context exits."""
     super().__init__()
     stack = self._stack
     # classes: star ratings 1-5; all of them count toward averaged metrics
     self.classes_ = [1, 2, 3, 4, 5]
     self.average_classes = [1, 2, 3, 4, 5]
     # data
     self.data_dir = 'data/yelp'
     with temp_chdir(self.data_dir):
         def load_split(filename):
             # decode a JSON-lines split and expose text/label field streams
             objs = JSONDecoder(stack.enter_context(open(filename)))
             return objs, FieldExtractor(objs, 'text'), FieldExtractor(objs, 'stars')
         self.train_objs, self.train_docs, train_stars = load_split('train.json')
         # train labels are materialized eagerly as int32
         self.train_labels = np.fromiter(train_stars, 'int32')
         self.val_objs, self.val_docs, self.val_labels = load_split('val.json')
         self.test_objs, self.test_docs, self.test_labels = load_split('test.json')
# Example 3
 def __init__(self):
     """Wire up the IMDB splits plus unsupervised docs; file handles are
     registered on the instance's exit stack."""
     super().__init__()
     stack = self._stack
     # classes: 0/1/2 sentiment labels; the middle class is excluded from
     # averaged metrics
     self.classes_ = [0, 1, 2]
     self.average_classes = [0, 2]
     # data
     self.data_dir = 'data/imdb'
     with temp_chdir(self.data_dir):
         def load_split(filename):
             # decode a JSON-lines split and expose text/label field streams
             objs = JSONDecoder(stack.enter_context(open(filename)))
             return objs, FieldExtractor(objs, 'text'), FieldExtractor(objs, 'label')
         self.train_objs, self.train_docs, train_labels = load_split('train.json')
         # train labels are materialized eagerly as int32
         self.train_labels = np.fromiter(train_labels, 'int32')
         # NOTE(review): unlike the labelled splits, unsup.json is passed to
         # FieldExtractor without a JSONDecoder wrapper -- confirm intended
         unsup_sr = stack.enter_context(open('unsup.json'))
         self.unsup_docs = BalancedSlice([FieldExtractor(unsup_sr, 'text')])
         self.val_objs, self.val_docs, self.val_labels = load_split('val.json')
         self.test_objs, self.test_docs, self.test_labels = load_split('test.json')
# Example 4
 def __init__(self):
     """Wire up the SemEval 2016 Twitter splits plus distant-supervision and
     unsupervised streams; file handles live on the instance's exit stack."""
     super().__init__()
     stack = self._stack
     # classes: 0/1/2 sentiment labels; the middle class is excluded from
     # averaged metrics
     self.classes_ = [0, 1, 2]
     self.average_classes = [0, 2]
     # data
     self.data_dir = 'data/twitter/semeval_2016_submit'
     with temp_chdir(self.data_dir):
         def load_split(filename):
             # decode a JSON-lines split and expose text/label field streams
             objs = JSONDecoder(stack.enter_context(open(filename)))
             return objs, FieldExtractor(objs, 'text'), FieldExtractor(objs, 'label')
         self.train_objs, self.train_docs, train_labels = load_split('train.json')
         # train labels are materialized eagerly as int32
         self.train_labels = np.fromiter(train_labels, 'int32')
         # distantly supervised docs: one emoticon-class file per end class,
         # paired with constant label streams
         distant_srs = [stack.enter_context(open('../emote/class_{}.txt'.format(i), encoding='utf-8')) for i in [0, 2]]
         self.distant_docs = BalancedSlice(distant_srs)
         self.distant_labels = BalancedSlice((RepeatSr(0), RepeatSr(2)))
         # raw unlabeled text, one doc per line
         unsup_sr = stack.enter_context(open('../unsup/all.txt', encoding='utf-8'))
         self.unsup_docs = BalancedSlice([unsup_sr])
         self.val_objs, self.val_docs, self.val_labels = load_split('val.json')
         self.test_objs, self.test_docs, self.test_labels = load_split('test.json')