Example no. 1
0
    def _prepare_data(self, files):
        """Read labeled question files and build training data.

        Each line of each input file is expected to look like
        ``<question>#<label>`` (tabs and spaces are removed before
        splitting on ``#``; Python 2 byte strings are decoded as UTF-8).

        :param files: iterable of file paths to read.
        :return: tuple ``(embeddings, labels, queries)`` where
            ``embeddings`` is a 2-D numpy array of feature vectors,
            ``labels`` is a list of integer indices into
            ``self.named_labels``, and ``queries`` is the list of
            cleaned question strings.
        :raises ValueError: if a line's label is not in
            ``self.named_labels``.
        """
        print('prepare data...')

        embeddings = list()
        queries = list()
        labels = list()

        # Iterate the paths directly rather than indexing with
        # xrange(len(files)).
        for path in files:
            with open(path, 'r') as f:
                for line in f:
                    # Normalize the raw line: drop tabs/spaces, strip the
                    # trailing newline, decode to unicode, then split
                    # question from label on '#'. Use a new name instead
                    # of rebinding `line` from str to list.
                    parts = line.replace('\t', '').replace(
                        ' ', '').strip('\n').decode('utf-8').split('#')
                    question = QueryUtils.static_simple_remove_punct(
                        str(parts[0]))
                    # Map the label text to its position in the known
                    # label list (raises ValueError for unknown labels).
                    label = self.named_labels.index(
                        str(parts[1].encode('utf-8')))
                    queries.append(question)
                    labels.append(label)
                    # Tokenize and vectorize one question at a time;
                    # transform() yields a (1, d) sparse row.
                    tokens = [self.cut(question)]
                    embedding = self.feature_extractor.transform(
                        tokens).toarray()
                    embeddings.append(embedding)

        # Stack the (1, d) rows and drop the singleton axis -> (n, d).
        embeddings = np.squeeze(np.array(embeddings))

        return embeddings, labels, queries
Example no. 2
0
 def cut(self, input_):
     """Tokenize a query string.

     Removes punctuation, segments the text with jieba in accurate
     (non-full) mode, and returns the space-joined tokens after
     unescaping them with _uniout.
     """
     cleaned = QueryUtils.static_simple_remove_punct(input_)
     segmented = " ".join(jieba.cut(cleaned, cut_all=False))
     return _uniout.unescape(str(segmented), 'utf8')