def bow():

    # segment the sentences and (optionally) remove stopwords
    logging.debug('Segmenting; remove stopwords: %s' % (config['remove_stopword']))

    sentence_to_seg = lambda x: seg(x,sep='|',full_mode=config['full_mode'],remove_stopword=config['remove_stopword'])

    train_data['BOW_WORDS'] = train_data['SENTENCE'].apply(sentence_to_seg)
    test_data['BOW_WORDS'] = test_data['SENTENCE'].apply(sentence_to_seg)



    logging.debug('-'*20)
    logging.info('Beginning to fit the bag-of-words model')
    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.
    CHOICES = config['model']
    if CHOICES == 'tfidf':
        logging.debug('Using model: %s' % (CHOICES))
        vectorizer = TfidfVectorizer(analyzer="word",
                                     token_pattern=u'(?u)\\b\\w+\\b',
                                     tokenizer=None,
                                     preprocessor=None,
                                     lowercase=False,
                                     stop_words=None,
                                     max_features=config['max_num_features'])
    else:
        logging.debug('Using model: %s' % (CHOICES))
        vectorizer = CountVectorizer(analyzer="word",
                                     token_pattern=u'(?u)\\b\\w+\\b',
                                     tokenizer=None,
                                     preprocessor=None,
                                     lowercase=False,
                                     stop_words=None,
                                     max_features=config['max_num_features'])

    print train_data.head()
    print test_data.head()

    train_data_features = train_data['BOW_WORDS'].as_matrix()
    train_data_features = vectorizer.fit_transform(train_data_features)
    vocab = vectorizer.get_feature_names()
    print vocab
    print('Vocabulary size: %d' % (len(vocab)))
    logging.debug('Vocabulary size: %d' % (len(vocab)))
    logging.debug(u'Vocabulary details: %s' % (','.join(vocab)))
    train_data_features = train_data_features.toarray()
    logging.info('the shape of the training data features is: %d,%d' % (train_data_features.shape))


    test_data_features = test_data['BOW_WORDS'].as_matrix()
    test_data_features = vectorizer.transform(test_data_features)
    test_data_features = test_data_features.toarray()

    logging.info('the shape of the test data features is: %d,%d' % (test_data_features.shape))

    return train_data_features,test_data_features
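
As a point of reference, here is a minimal, self-contained sketch of the same idea: fitting a vectorizer on already-segmented, separator-joined text. The sample sentences, the space separator and the max_features value are illustrative assumptions, not values from the original config.

from sklearn.feature_extraction.text import CountVectorizer

# already-segmented sentences, tokens joined by a separator (here a space)
segmented_train = [u'hello world', u'hello bag of words', u'bag of words model']
segmented_test = [u'hello model']

# this token_pattern keeps single-character tokens, which matters for Chinese words
vectorizer = CountVectorizer(analyzer='word',
                             token_pattern=u'(?u)\\b\\w+\\b',
                             lowercase=False,
                             max_features=2000)

train_features = vectorizer.fit_transform(segmented_train).toarray()
test_features = vectorizer.transform(segmented_test).toarray()
print(train_features.shape)
print(test_features.shape)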
Example #2
 def __seg__(self, sentence):
     """
     对句子进行分词,使用jieba分词
     :param sentence: 句子
     :type sentence: str
     :return:
     """
     sentence_to_seg = lambda x: seg(
         x, sep=" ", full_mode=self.__full_mode__, remove_stopword=self.__remove_stopword__, verbose=self.__verbose__
     )
     return sentence_to_seg(sentence)
Example #3
 def __seg__(self, sentence):
     '''
     Segment the sentence with jieba
     :param sentence: the sentence to segment
     :type sentence: str
     :return:
     '''
     sentence_to_seg = lambda x: seg(x,
                                     sep=' ',
                                     full_mode=self.__full_mode__,
                                     remove_stopword=self.__remove_stopword__,
                                     verbose=self.__verbose__)
     return sentence_to_seg(sentence)
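
The seg helper used throughout these examples is imported from jiebanlp.toolSet and is not shown on this page; since the docstrings say it wraps jieba, a rough sketch of such a wrapper might look as follows (the function name, the stopword list and the exact parameter handling are assumptions).

import jieba

STOPWORDS = set([u'的', u'了', u'吗'])  # placeholder stopword list

def seg_sketch(sentence, sep=' ', full_mode=False, remove_stopword=False):
    # cut_all=True is jieba's "full mode": return every possible word
    words = jieba.cut(sentence, cut_all=full_mode)
    if remove_stopword:
        words = [w for w in words if w not in STOPWORDS]
    return sep.join(words)

print(seg_sketch(u'我来到北京清华大学', sep='|', full_mode=True))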
Example #4
def count_sentences_length(sentences):
    '''
        Count the length of a sentence, in words
    :type sentences: str
    :param sentences: the sentence
    :return: sentence length
    :rtype: int
    '''
    sentence_to_seg = lambda x: seg(x,
                                    sep=' ',
                                    full_mode=True,
                                    remove_stopword=True,
                                    verbose=0
                                    )
    segmented_sentences = sentence_to_seg(sentences)
    sentences_length = len(segmented_sentences.split())
    # print sentences_length

    return sentences_length
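
Elsewhere on this page such helpers are applied column-wise with pandas; a small hypothetical usage (the DataFrame and column name are made up):

import pandas as pd

df = pd.DataFrame({'SENTENCE': [u'今天天气很好', u'你好']})
df['LENGTH'] = df['SENTENCE'].apply(count_sentences_length)
print(df)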
Example #5
    def trancate_sentence(self):
        """
        选取句子对齐的最大长度,并生成补齐的w2v特征向量
        """
        logging.debug("=" * 20)
        logging.debug("选取句子对齐的长度")
        # 使用正常模型的切词
        sentence_to_seg = lambda x: seg(
            x, sep=" ", full_mode=False, remove_stopword=self.__remove_stopword__, verbose=self.__verbose__
        )

        train_sentence_seg = map(sentence_to_seg, self.__train_data__)

        train_sentences_length = np.array([len(item.split()) for item in train_sentence_seg])
        sorted_index = np.argsort(train_sentences_length)
        print ("Maximum sentence length in the training corpus: %d" % (max(train_sentences_length)))
        logging.debug("Maximum sentence length in the training corpus: %d" % (max(train_sentences_length)))
        print self.__train_data__[sorted_index[-1]]
        logging.debug(u"Longest sentence in the training data: %s" % (self.__train_data__[sorted_index[-1]]))
        print ("Average sentence length in the training corpus: %d" % (np.average(train_sentences_length)))
        logging.debug("Average sentence length in the training corpus: %f" % (np.average(train_sentences_length)))

        # tolerance: the minimum fraction of records that must NOT be truncated, e.g. 0.9 means at least 90% of the sentences stay intact
        trancate_rate = 0.9
        logging.debug("Truncation tolerance ratio: %f" % (trancate_rate))
        trancate_index = int(len(train_sentences_length) * trancate_rate)
        trancate_value = train_sentences_length[sorted_index[trancate_index]]
        logging.debug("Truncation length set to: %d; note that sentences longer than this will be cut!" % (trancate_value))
        print trancate_value
        logging.debug(
            "被截断个数:%d,比例为:%f"
            % (
                sum(train_sentences_length > trancate_value),
                sum(train_sentences_length > trancate_value) / (len(train_sentences_length) * 1.0),
            )
        )
        print sum(train_sentences_length > trancate_value) / (len(train_sentences_length) * 1.0)
        print (len(train_sentences_length) * 1.0)
        print sum(train_sentences_length > trancate_value)
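
The tolerance logic above is essentially a percentile of the sentence-length distribution; a self-contained sketch of the same computation with plain numpy (the sample lengths are made up):

import numpy as np

train_sentences_length = np.array([2, 3, 3, 4, 4, 5, 5, 6, 6, 7,
                                   7, 8, 8, 9, 9, 10, 11, 12, 15, 40])
trancate_rate = 0.9  # keep at least 90% of the sentences untruncated

sorted_index = np.argsort(train_sentences_length)
trancate_index = min(int(len(train_sentences_length) * trancate_rate),
                     len(train_sentences_length) - 1)
trancate_value = train_sentences_length[sorted_index[trancate_index]]

print('truncation length: %d' % trancate_value)
print('sentences that would be truncated: %d' % np.sum(train_sentences_length > trancate_value))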
Example #6
np.save(file(''.join(config['label_file_path']), 'wb'), index_to_label)

label_to_index = {label:idx for idx,label in enumerate(index_to_label)}

train_data['LABEL_INDEX'] = train_data['LABEL'].map(label_to_index)
test_data['LABEL_INDEX'] = test_data['LABEL'].map(label_to_index)
# print train_data.head()


logging.debug('=' * 20)
logging.debug('Segmenting the data...')
logging.debug('-' * 20)

sentence_to_seg = lambda x: seg(x,
                                sep=' ',
                                full_mode=config['full_mode'],
                                verbose=0,
                                replace_number=True,
                                remove_stopword=config['remove_stopword'])


train_data['WORDS'] = train_data['SENTENCE'].apply(sentence_to_seg)
test_data['WORDS'] = test_data['SENTENCE'].apply(sentence_to_seg)
print train_data.head()

# ------------------------------------------------------------------------------
# --------------region start : compute weights with expected cross entropy and pick the top n keywords-------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('Compute weights with expected cross entropy and pick the top n keywords')
logging.debug('Start generating the feature vectors...')
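
The expected-cross-entropy part of this example is cut off above. One common formulation weights a term t as ECE(t) = P(t) * sum_c P(c|t) * log(P(c|t) / P(c)); the sketch below implements that scoring on a toy corpus and is an assumption about the method, not the original code.

import math
from collections import Counter, defaultdict

def expected_cross_entropy_top_n(docs, labels, top_n=5):
    # docs: list of token lists, labels: parallel list of class labels
    n_docs = float(len(docs))
    label_count = Counter(labels)            # number of docs per class
    term_doc_count = Counter()               # df(t)
    term_label_count = defaultdict(Counter)  # df(t, c)
    for tokens, label in zip(docs, labels):
        for t in set(tokens):
            term_doc_count[t] += 1
            term_label_count[t][label] += 1

    scores = {}
    for t, df in term_doc_count.items():
        p_t = df / n_docs
        ece = 0.0
        for c, df_tc in term_label_count[t].items():
            p_c_given_t = df_tc / float(df)
            p_c = label_count[c] / n_docs
            ece += p_c_given_t * math.log(p_c_given_t / p_c)
        scores[t] = p_t * ece
    return sorted(scores, key=scores.get, reverse=True)[:top_n]

docs = [[u'天气', u'好'], [u'天气', u'差'], [u'价格', u'贵'], [u'价格', u'便宜']]
labels = [u'weather', u'weather', u'price', u'price']
print(expected_cross_entropy_top_n(docs, labels, top_n=3))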
Example #7
# -------------- region start : re-count after removing sentences wrongly flagged by OOD detection -------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('Re-count after removing sentences wrongly flagged by OOD detection')
from jiebanlp.toolSet import seg
import io

# use the current AIML training corpus to check whether a sentence is matched by AIML
train_data_file_path = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/ood_dataset/dev_vesion/20160526/train_all.csv'

train_data = pd.read_csv(train_data_file_path,sep='\t',encoding='utf8')
train_data = train_data[train_data['LABEL']!=u'其他#其他']
# segmentation
sentence_to_seg = lambda x: seg(x, sep=' ',
                                full_mode=False,
                                remove_stopword=False,
                                replace_number=False,
                                verbose=0)
train_data['WORDS'] = train_data['SENTENCE'].apply(sentence_to_seg)

ood_sentence = pd.read_csv('/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/ood_dataset/20150614-20150615测试集/20160125/data/ood_sentence.csv',sep='\t',header=0,index_col=0)
pattern = re.compile(u'【.*')
filter_sentence = lambda x: pattern.sub('', ''.join(x.decode('utf8').split('|')[1:])).replace(' ', '')
ood_sentence['PAST1_SENTENCE'] = ood_sentence['PAST1_SENTENCE'].apply(filter_sentence)
ood_sentence['CUR_SENTENCE'] = ood_sentence['CUR_SENTENCE'].apply(filter_sentence)
ood_sentence['WORDS'] = ood_sentence['PAST1_SENTENCE'].apply(sentence_to_seg)

print ood_sentence.head()
print ood_sentence.shape
# print ood_sentence['PAST1_SENTENCE']
print 'After deduplication'
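
The deduplication step announced by the print above is cut off; a plausible continuation with pandas (assuming duplicates are identified by the current sentence text and a pandas version that supports drop_duplicates(subset=...)):

# hypothetical continuation: drop duplicate sentences before re-counting
ood_sentence_dedup = ood_sentence.drop_duplicates(subset=['CUR_SENTENCE'])
print(ood_sentence_dedup.shape)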
Example #8

def word_similarity(word1, word2):
    '''
    Compute the similarity between two words
    :param word1:
    :param word2:
    :return:
    '''
    try:
        return word2vec_model.n_similarity(word1, word2)
    except:
        return 0

sentence_to_seg = lambda x: seg(x,sep=' ',
                                full_mode=config['full_mode'],
                                remove_stopword=config['remove_stopword'],
                                verbose=0)


def replace_oov_with_similar_word(sentence, keywords):
    result = []
    # print keywords
    for word in sentence.split():
        if word not in keywords:
            # print '*' * 10
            logging.debug('*' * 10)
            logging.debug(u'word (%s) is an OOV word' % (word))
            # print(u'word (%s) is an OOV word' % (word))
            # print word
            keywords_sim_score = np.asarray([word_similarity(word, item) for item in keywords])
            sorted_index = np.argsort(keywords_sim_score)[-1::-1]
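
This example stops right after ranking the keywords by similarity; a hedged, self-contained sketch of how such an OOV replacement might finish (the function name and the fall-back rule are illustrative, not the original code):

import numpy as np

def replace_oov_sketch(sentence, keywords, similarity):
    # keywords: list of in-vocabulary words; similarity: callable(word, keyword) -> float
    result = []
    for word in sentence.split():
        if word in keywords:
            result.append(word)
            continue
        scores = np.asarray([similarity(word, k) for k in keywords])
        best = int(np.argmax(scores))
        # fall back to the original word when nothing is similar at all
        result.append(keywords[best] if scores[best] > 0 else word)
    return ' '.join(result)

# e.g. replace_oov_sketch(u'今天 天气 咋样', [u'天气', u'如何'], word_similarity)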
Example #9
    def __seg__(self,sentence):

        sentence_to_seg = lambda x: seg(x, sep=' ', full_mode=self._full_mode,
                                        remove_stopword=config['remove_stopword'])

        return sentence_to_seg(sentence)
output1_file = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/segmentedCorpus/' \
              'jiebanlp/test_data/ch2r_test_file_seg.csv'
output2_file = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/segmentedCorpus/' \
              'jiebanlp/test_data/ch2r_test_file_seg_replace.csv'

train_file = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/segmentedCorpus/' \
             'jiebanlp/train_data/train_all.csv'

test = pd.read_csv(input_file,sep='\t',header=0,encoding='utf8')
train = pd.read_csv(train_file,sep='\t',header=0,encoding='utf8')



logging.debug( 'segment the file..')
test['SEGMENT'] = test['SENTENCE'].apply(lambda x: seg(x, sep='|', full_mode = False))
test['SEGMENT_FULL'] = test['SENTENCE'].apply(lambda x: seg(x, sep='|', full_mode = True))
test['SEGMENT_EVERYWORD'] = test['SEGMENT'].apply(lambda x: '|'.join(list(''.join(x.split('|')))))

from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

model_file = '/home/jdwang/PycharmProjects/corprocessor/word2vec/vector/weibo_50size.gem'
word2vec_model = Word2Vec.load(model_file)

def cosin_sim(w1,w2):
    try:
        sim = cosine_similarity(word2vec_model[w1].reshape(1, -1), word2vec_model[w2].reshape(1, -1))
        return sim
    except:
        # words missing from the word2vec vocabulary get zero similarity
        return 0
Example #11
print ("总共类别数:%d" % (len(index_to_label)))

np.save(file("".join(config["label_file_path"]), "wb"), index_to_label)

label_to_index = {label: idx for idx, label in enumerate(index_to_label)}

train_data["LABEL_INDEX"] = train_data["LABEL"].map(label_to_index)
test_data["LABEL_INDEX"] = test_data["LABEL"].map(label_to_index)
# print train_data.head()


logging.debug("=" * 20)
logging.debug("对数据进行分词...")
logging.debug("-" * 20)

sentence_to_seg = lambda x: seg(x, sep=" ", full_mode=config["full_mode"], remove_stopword=config["remove_stopword"])


train_data["WORDS"] = train_data["SENTENCE"].apply(sentence_to_seg)
test_data["WORDS"] = test_data["SENTENCE"].apply(sentence_to_seg)
print train_data.head()

if config["word2vec_to_solve_oov"]:
    word2vec_model = Word2Vec.load(config["word2vec_model_file_path"])


def word_similarity(word1, word2):
    """
    Compute the similarity between two words
    :param word1:
    :param word2:
    """
    try:
        return word2vec_model.n_similarity(word1, word2)
    except:
        return 0
from jiebanlp.toolSet import seg
import io
out = io.open(output_file_path,'w',encoding='utf8')
counter = 0
error_line_counter = 0
for line in io.open(input_file_path,'r',encoding='utf8'):
    if (counter + 1) % 1e5 == 0:
        logging.debug('Record #%d' % (counter + 1))
    if (counter + 1) % 1e6 == 0:
        print('Record #%d' % (counter + 1))
    try:
        # backup = line
        line = line.strip()
        if len(line) == 0:
            continue
        seg_str = seg(line,sep=' ',full_mode=False)
        out.write(seg_str+'\n')
        # print seg_str
        # break
    except:
        logging.debug(u'Exception! In line %d: %s' % (counter, line))
        error_line_counter+=1
    counter += 1
logging.debug('%d sentences in total' % (counter))
logging.debug('Number of lines that failed to convert: %d' % (error_line_counter))
end_time = timeit.default_timer()
print 'end! Running time:%ds!' % (end_time - start_time)
logging.debug('='*20)
logging.debug('end! Running time:%ds!' % (end_time - start_time))
def sentenceSegment():
    logging.info('beginning segment sentences...')
    # if output dir is not existed,create a new dir.
    if not os.path.exists(segmented_crpus_dir):
        logging.error("the segmentedCorpusDir/" + segmented_crpus_dir + " does not exist!\nCreating dir...")
        os.mkdir(segmented_crpus_dir)
    train = pd.DataFrame(columns=['LABEL','LABEL1','LABEL2','SENTENCE','SEGMENT','SEGMENT_FULL','SEGMENT_EVERYWORD'])
    # get all the file list in the dir
    fileList = os.listdir(outputCorpusDir)
    # get the number of the files
    numFile = len(fileList)
    # print numFile
    for i in xrange(numFile):
        fileName = fileList[i]
        if fileName.endswith(".clean"):
            print fileName
            logging.info('segment the file(%s)...'%(fileName))
            # open the file
            fileIn = open(outputCorpusDir + fileName, "r")
            # with open(outputCorpusDir + fileName, "r") as f:
            #     for line in f:
            #         print line
            fileOut = open(segmented_crpus_dir + fileName + ".seg", "w")
            # enumerate each line in file
            for line in fileIn:
                line = line.strip()
                line = line.decode('utf8')
                # print line

                # filter the space line
                if len(line) == 0:
                    continue
                # segment sentences using the bosonnlp tool
                # using the '|' segment the item
                sentence_seg = seg(line, sep='|', full_mode = False)

                sentence_seg_full_mode = seg(line, sep='|', full_mode = True)
                sentence_seg_everyword = '|'.join(list(''.join(sentence_seg.split('|'))))
                # print sentence_seg_everyword
                # quit()
                # write the segmented sentence to the file
                fileOut.write(sentence_seg.encode('utf8')+"\n")
                label_full_name = fileName.replace('.clean','')
                label1 = label_full_name.split('#')[0]
                label2 = '#'.join(label_full_name.split('#')[1:])
                train.loc[len(train)] = [label_full_name,
                                         label1,
                                         label2,
                                         line,
                                         sentence_seg,
                                         sentence_seg_full_mode,
                                         sentence_seg_everyword]
                # print sentence_seg
            # close the file
            fileIn.close()
            fileOut.close()

            logging.info('finished the file(%s)!'%(fileName))

    logging.info('finished segment sentences!')
    # print fit
    all_data_file_path = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/segmentedCorpus/' \
                         'jiebanlp/train_data/train_all.csv'
    train.to_csv(all_data_file_path,
                 sep='\t',
                 encoding='utf8',
                 index=None)
    print train.head()