Example #1
    # Requires: from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    def _build_feature_extractor(self, mode, files):
        print('Building feature extractor...')
        corpus = []

        # Read one question per line, strip whitespace and Chinese
        # punctuation, then segment it into tokens.
        for path in files:
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    question = line.replace('\t', '').replace(' ', '').strip('\n')
                    question = QueryUtils.static_remove_cn_punct(question)
                    tokens = self.cut(question)
                    corpus.append(tokens)

        if mode == 'ngram':
            # Binary presence/absence features over character uni- and bigrams.
            # Note: sklearn ignores stop_words when analyzer='char'.
            bigram_vectorizer = CountVectorizer(
                ngram_range=(1, 2),
                min_df=0.0,
                max_df=1.0,
                analyzer='char',
                stop_words=[',', '?', '我', '我要'],
                binary=True)
            self.feature_extractor = bigram_vectorizer.fit(corpus)
        elif mode == 'tfidf':
            # TF-IDF weights over the same character n-grams,
            # with sublinear (1 + log) term-frequency scaling.
            print_cn('use {0}'.format(mode))
            tfidf_vectorizer = TfidfVectorizer(analyzer='char',
                                               ngram_range=(1, 2),
                                               max_df=1.0,
                                               min_df=1,
                                               sublinear_tf=True)
            self.feature_extractor = tfidf_vectorizer.fit(corpus)
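For orientation, a minimal self-contained sketch of the same character n-gram TF-IDF technique, with the class and the file I/O stripped away (the sample questions are invented for illustration):

from sklearn.feature_extraction.text import TfidfVectorizer

# Invented sample corpus: one cleaned question per entry.
corpus = ['我要退款', '怎么申请退款', '我要开发票']

vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 2),
                             min_df=1, sublinear_tf=True)
X = vectorizer.fit_transform(corpus)
print(X.shape)  # (3, number of distinct character uni-/bigrams)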
Example #2
    # Requires: import numpy as np
    #           from sklearn.preprocessing import MultiLabelBinarizer
    def _prepare_data(self, files):
        print('Preparing data...')

        embeddings = []
        queries = []
        queries_by_label = dict()
        labels = []
        mlb = MultiLabelBinarizer()

        # Each file holds one question per line; the file's position in
        # `files` selects its label from self.named_labels.
        for index, path in enumerate(files):
            label = self.named_labels[index]
            queries_by_label[label] = []
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    question = line.replace('\t', '').replace(' ', '').strip('\n')
                    question = QueryUtils.static_remove_cn_punct(question)
                    tokens = QueryUtils.static_jieba_cut(question)
                    if len(tokens) == 0:
                        continue
                    queries_by_label[label].append(question)

        # Merge questions that appear under several files into a single
        # multi-label entry, embedding each distinct question only once.
        for label, questions in queries_by_label.items():
            for question in questions:
                if question in queries:
                    index = queries.index(question)
                    if label not in labels[index]:
                        labels[index].append(label)
                else:
                    queries.append(question)
                    labels.append([label])
                    tokens = self.cut(question).split(' ')
                    embedding = self.get_w2v_emb(tokens)
                    embeddings.append(embedding)

        embeddings = np.squeeze(np.array(embeddings))
        # Turn each question's label list into a multi-hot row.
        self.mlb = mlb.fit(labels)
        labels = self.mlb.transform(labels)

        return embeddings, labels, queries
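The MultiLabelBinarizer step at the end is what converts each question's label list into a fixed-width multi-hot row; a self-contained illustration with invented label names:

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y = mlb.fit_transform([['refund'], ['refund', 'invoice'], ['invoice']])
print(mlb.classes_)  # ['invoice' 'refund'] -- classes are sorted
print(y)             # rows are multi-hot: [[0 1] [1 1] [1 0]]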
Example #3
# Requires: import jieba
def cut(input_):
    # Precise-mode segmentation: returns a list of tokens.
    input_ = QueryUtils.static_remove_cn_punct(input_)
    tokens = list(jieba.cut(input_, cut_all=False))
    return tokens
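A quick standalone sketch of the precise-mode call, assuming jieba is installed (the input string is invented, and the exact split depends on jieba's dictionary):

import jieba

tokens = list(jieba.cut('我要申请退款', cut_all=False))
print(tokens)  # word-level tokens, e.g. ['我', '要', '申请', '退款']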
Example #4
    def cut(self, input_):
        # Full-mode segmentation: emits every dictionary word found,
        # including overlapping ones, joined into one space-separated string.
        input_ = QueryUtils.static_remove_cn_punct(input_)
        tokens = jieba.cut(input_, cut_all=True)
        return ' '.join(tokens)
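Examples #3 and #4 differ only in jieba's cut_all flag; a standalone comparison of the two modes (the sample sentence is the one used in jieba's own documentation):

import jieba

text = '我来到北京清华大学'
print('/'.join(jieba.cut(text, cut_all=False)))  # precise mode: one best, non-overlapping split
print('/'.join(jieba.cut(text, cut_all=True)))   # full mode: every dictionary word, overlaps included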