Example 1
    def inference_html(self, tag):
        # Tokenize the attribute values of the HTML tag.
        input_tag_tokenizer = tokenizer.InputTagTokenizer()
        tokens = input_tag_tokenizer.get_attrs_value(tag.html)
        # Convert the tokens to a dense bag-of-words vector of size in_units.
        bow = self.dictionary.doc2bow(tokens)
        vec = matutils.corpus2dense([bow], self.in_units).T[0]
        x = chainer.Variable(np.asarray([vec], dtype=np.float32))
        # Run the predictor in test mode (disables dropout and similar layers).
        with chainer.using_config('train', False):
            y = self.classifier.predictor(x)
        # Pick the highest-scoring class and map it back to a label name.
        i = int(np.argmax(y.data, axis=1)[0])
        return self._label_name_from_id(i)
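For context, here is a minimal, self-contained sketch of the gensim conversion this method depends on: building a Dictionary, turning a token list into a sparse bag-of-words, and densifying it with corpus2dense. The token lists are invented for illustration; only the surrounding class and tokenizer.InputTagTokenizer come from the original project.

    import numpy as np
    from gensim import corpora, matutils

    # Hypothetical training token lists, standing in for InputTagTokenizer output.
    documents = [['text', 'username'], ['password'], ['submit', 'login']]
    dictionary = corpora.Dictionary(documents)
    in_units = len(dictionary)                 # vocabulary size = input dimension

    tokens = ['username', 'password']          # tokens of one HTML tag
    bow = dictionary.doc2bow(tokens)           # sparse [(token_id, count), ...]
    vec = matutils.corpus2dense([bow], in_units).T[0]
    print(vec.shape)                           # (in_units,) dense float vector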
Example 2
    def inference_html(self, tag):
        # Tokenize the attribute values of the HTML tag.
        input_tag_tokenizer = tokenizer.InputTagTokenizer()
        tokens = input_tag_tokenizer.get_attrs_value(tag.html)
        # Project the bag-of-words vector into LSI topic space.
        vec_bow = self.dictionary.doc2bow(tokens)
        vec_lsi = self.__sparse_to_dense(self.lsi[vec_bow])
        if len(vec_lsi) == 0:
            # None of the tokens are in the dictionary: nothing to classify.
            return 'unknown'
        # Classify the dense LSI vector with logistic regression.
        predict_value = self.lr.predict([vec_lsi])[0]
        return self._label_name_from_id(predict_value)
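The helper __sparse_to_dense is referenced but not shown in this snippet. A plausible sketch, assuming it expands gensim's sparse (topic_id, weight) pairs into a plain list of weights and preserves the empty case that the caller checks for:

    def __sparse_to_dense(self, sparse_vec):
        # Assumed behavior: [(topic_id, weight), ...] -> dense list indexed by
        # topic id; an empty projection stays empty so the caller can bail out.
        if not sparse_vec:
            return []
        dense = [0.0] * self.lsi.num_topics
        for topic_id, weight in sparse_vec:
            dense[topic_id] = weight
        return dense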
Example 3
    def __convert_to_word_vecs(self, records, with_topic=False):
        input_tag_tokenizer = tokenizer.InputTagTokenizer()
        word_vecs = []
        topics = []
        for r in records:
            word_vecs.append(input_tag_tokenizer.get_attrs_value(r.html))
            if with_topic:
                # Note: use canonical topic instead of raw topic in mysql
                topics.append(r.canonical_topic)
        return (word_vecs, topics)
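InputTagTokenizer itself is project-internal and not shown anywhere in these snippets. As a purely hypothetical stand-in, assuming get_attrs_value collects the attribute values of <input> tags as tokens, it might look like this sketch built on the standard library's html.parser:

    from html.parser import HTMLParser

    class InputTagTokenizer(HTMLParser):
        # Hypothetical stand-in: collect attribute values of <input> tags.
        def __init__(self):
            super().__init__()
            self.tokens = []

        def handle_starttag(self, tag, attrs):
            if tag == 'input':
                self.tokens.extend(v for _, v in attrs if v)

        def get_attrs_value(self, html):
            self.tokens = []
            self.feed(html)
            return self.tokens

    t = InputTagTokenizer()
    print(t.get_attrs_value('<input type="email" name="user_email">'))
    # ['email', 'user_email']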
Example 4
    def __convert_training(self, training):
        input_tag_tokenizer = tokenizer.InputTagTokenizer()
        word_vecs = []
        labels = []
        for r in training:
            word_vecs.append(input_tag_tokenizer.get_attrs_value(r.html))
            labels.append(r.label)

        # Map each distinct label to an integer id. Note that set ordering is
        # arbitrary, so the ids are only stable within a single run.
        label_types = list(set(labels))
        label_ids = [label_types.index(x) for x in labels]

        return (word_vecs, label_ids, label_types)
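A toy run illustrates the label-to-id mapping. Because set ordering varies between interpreter runs, sorting the label types (a deterministic variant, not what the original code does) makes the ids reproducible:

    labels = ['text', 'email', 'text', 'password']
    label_types = sorted(set(labels))      # deterministic variant of list(set(...))
    label_ids = [label_types.index(x) for x in labels]
    print(label_types)                     # ['email', 'password', 'text']
    print(label_ids)                       # [2, 0, 2, 1]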
Example 5
    def __convert_tests(self, tests):
        # The tokenizer is stateless per call, so build it once outside the loop.
        input_tag_tokenizer = tokenizer.InputTagTokenizer()
        data = []
        labels = []
        for r in tests:
            if r.label not in self.label_types:
                continue  # skip labels undefined in training data
            # Tokenize and densify, mirroring the training-time conversion.
            tokens = input_tag_tokenizer.get_attrs_value(r.html)
            bow = self.dictionary.doc2bow(tokens)
            vec = matutils.corpus2dense([bow], self.in_units).T[0]
            label_id = self.label_types.index(r.label)
            data.append(np.array(vec).astype(np.float32))
            labels.append(np.int32(label_id))
        return tuple_dataset.TupleDataset(data, labels)
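The returned TupleDataset plugs directly into Chainer's iterators. A minimal usage sketch, with made-up vectors and label ids standing in for the converted test records:

    import numpy as np
    from chainer.datasets import tuple_dataset
    from chainer import iterators

    # Made-up stand-ins for the converted vectors and label ids.
    data = [np.zeros(10, dtype=np.float32), np.ones(10, dtype=np.float32)]
    labels = [np.int32(0), np.int32(1)]
    test = tuple_dataset.TupleDataset(data, labels)

    it = iterators.SerialIterator(test, batch_size=2, repeat=False, shuffle=False)
    for batch in it:
        xs, ts = zip(*batch)   # each item is a (vector, label) pair
        print(len(xs), ts)     # 2 (0, 1)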