def bow():
    # Segment the sentences and optionally remove stopwords
    logging.debug('Segmenting; remove stopwords: %s' % (config['remove_stopword']))
    sentence_to_seg = lambda x: seg(x,
                                    sep='|',
                                    full_mode=config['full_mode'],
                                    remove_stopword=config['remove_stopword'])
    train_data['BOW_WORDS'] = train_data['SENTENCE'].apply(sentence_to_seg)
    test_data['BOW_WORDS'] = test_data['SENTENCE'].apply(sentence_to_seg)
    logging.debug('-' * 20)
    logging.info('Begin fitting the bag-of-words model')
    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag-of-words tool.
    CHOICES = config['model']
    if CHOICES == 'tfidf':
        logging.debug('Model in use: %s' % (CHOICES))
        vectorizer = TfidfVectorizer(analyzer="word",
                                     token_pattern=u'(?u)\\b\\w+\\b',
                                     tokenizer=None,
                                     preprocessor=None,
                                     lowercase=False,
                                     stop_words=None,
                                     max_features=config['max_num_features'])
    else:
        logging.debug('Model in use: %s' % (CHOICES))
        vectorizer = CountVectorizer(analyzer="word",
                                     token_pattern=u'(?u)\\b\\w+\\b',
                                     tokenizer=None,
                                     preprocessor=None,
                                     lowercase=False,
                                     stop_words=None,
                                     max_features=config['max_num_features'])
    print train_data.head()
    print test_data.head()
    train_data_features = train_data['BOW_WORDS'].as_matrix()
    train_data_features = vectorizer.fit_transform(train_data_features)
    vocab = vectorizer.get_feature_names()
    print vocab
    print('Vocabulary size: %d' % (len(vocab)))
    logging.debug('Vocabulary size: %d' % (len(vocab)))
    logging.debug(u'Vocabulary: %s' % (','.join(vocab)))
    train_data_features = train_data_features.toarray()
    logging.info('the feature shape of the training data is: %d,%d' % (train_data_features.shape))
    test_data_features = test_data['BOW_WORDS'].as_matrix()
    test_data_features = vectorizer.transform(test_data_features)
    test_data_features = test_data_features.toarray()
    logging.info('the feature shape of the test data is: %d,%d' % (test_data_features.shape))
    return train_data_features, test_data_features
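# A small, self-contained sketch (not part of the original module) of why bow() sets
# lowercase=False and token_pattern=u'(?u)\\b\\w+\\b': the BOW_WORDS column already
# contains jieba-segmented, separator-joined tokens, so the vectorizer only needs to
# split on token boundaries and must keep single-character tokens, which the default
# token_pattern (two or more word characters) would drop. The sample strings and
# max_features value below are made up for illustration.
from sklearn.feature_extraction.text import CountVectorizer

pre_segmented = [u'buy a phone', u'a cheap phone']  # stand-ins for space-joined jieba tokens
vectorizer = CountVectorizer(analyzer='word',
                             token_pattern=u'(?u)\\b\\w+\\b',  # keep 1-character tokens such as 'a'
                             lowercase=False,
                             max_features=2000)
features = vectorizer.fit_transform(pre_segmented).toarray()
print vectorizer.get_feature_names()   # [u'a', u'buy', u'cheap', u'phone']
print features.shape                   # (2, 4)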
def __seg__(self, sentence):
    """
    Segment the sentence into words with jieba.

    :param sentence: the sentence to segment
    :type sentence: str
    :return:
    """
    sentence_to_seg = lambda x: seg(
        x,
        sep=" ",
        full_mode=self.__full_mode__,
        remove_stopword=self.__remove_stopword__,
        verbose=self.__verbose__
    )
    return sentence_to_seg(sentence)
def count_sentences_length(sentences):
    '''
    Count the length of a sentence, measured in words.

    :type sentences: str
    :param sentences: the sentence
    :return: sentence length
    :rtype: int
    '''
    sentence_to_seg = lambda x: seg(x,
                                    sep=' ',
                                    full_mode=True,
                                    remove_stopword=True,
                                    verbose=0)
    segmented_sentences = sentence_to_seg(sentences)
    sentences_length = len(segmented_sentences.split())
    # print sentences_length
    return sentences_length
def trancate_sentence(self):
    """
    Choose the maximum sentence length used for alignment and build the
    padded w2v feature vectors.
    """
    logging.debug("=" * 20)
    logging.debug("Choosing the sentence alignment length")
    # Segment with the normal (non-full) mode
    sentence_to_seg = lambda x: seg(
        x,
        sep=" ",
        full_mode=False,
        remove_stopword=self.__remove_stopword__,
        verbose=self.__verbose__
    )
    train_sentence_seg = map(sentence_to_seg, self.__train_data__)
    train_sentences_length = np.array([len(item.split()) for item in train_sentence_seg])
    sorted_index = np.argsort(train_sentences_length)
    print("Maximum sentence length in the training corpus: %d" % (max(train_sentences_length)))
    logging.debug("Maximum sentence length in the training corpus: %d" % (max(train_sentences_length)))
    print self.__train_data__[sorted_index[-1]]
    logging.debug(u"Longest sentence in the training data: %s" % (self.__train_data__[sorted_index[-1]]))
    print("Average sentence length in the training corpus: %d" % (np.average(train_sentences_length)))
    logging.debug("Average sentence length in the training corpus: %f" % (np.average(train_sentences_length)))
    # Tolerance: the proportion of records that must NOT be truncated.
    # For example, 0.9 means at least 90% of the sentences are kept untruncated.
    trancate_rate = 0.9
    logging.debug("Truncation tolerance rate: %f" % (trancate_rate))
    trancate_index = int(len(train_sentences_length) * trancate_rate)
    trancate_value = train_sentences_length[sorted_index[trancate_index]]
    logging.debug("Truncation length set to: %d; note that longer sentences will be truncated!" % (trancate_value))
    print trancate_value
    logging.debug(
        "Number of truncated sentences: %d, proportion: %f" % (
            sum(train_sentences_length > trancate_value),
            sum(train_sentences_length > trancate_value) / (len(train_sentences_length) * 1.0),
        )
    )
    print sum(train_sentences_length > trancate_value) / (len(train_sentences_length) * 1.0)
    print (len(train_sentences_length) * 1.0)
    print sum(train_sentences_length > trancate_value)
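# An illustrative check of the truncation threshold above, on made-up sentence lengths
# (numbers are not from the project): indexing the sorted lengths at
# int(n * trancate_rate) picks roughly the trancate_rate-th percentile, i.e. about 90%
# of the sentences keep their full length and only the longest ~10% are cut down to
# trancate_value.
import numpy as np

np.random.seed(0)
train_sentences_length = np.random.randint(1, 40, size=200)   # fake word counts
trancate_rate = 0.9
sorted_index = np.argsort(train_sentences_length)
trancate_index = int(len(train_sentences_length) * trancate_rate)
trancate_value = train_sentences_length[sorted_index[trancate_index]]
print trancate_value                                   # length that covers ~90% of sentences
print np.percentile(train_sentences_length, 90)        # roughly the same threshold
print sum(train_sentences_length > trancate_value)     # ~10% of the 200 sentences get truncated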
np.save(file(''.join(config['label_file_path']), 'w'), index_to_label)
label_to_index = {label: idx for idx, label in enumerate(index_to_label)}
train_data['LABEL_INDEX'] = train_data['LABEL'].map(label_to_index)
test_data['LABEL_INDEX'] = test_data['LABEL'].map(label_to_index)
# print train_data.head()
logging.debug('=' * 20)
logging.debug('Segmenting the data...')
logging.debug('-' * 20)
sentence_to_seg = lambda x: seg(x,
                                sep=' ',
                                full_mode=config['full_mode'],
                                verbose=0,
                                replace_number=True,
                                remove_stopword=config['remove_stopword'])
train_data['WORDS'] = train_data['SENTENCE'].apply(sentence_to_seg)
test_data['WORDS'] = test_data['SENTENCE'].apply(sentence_to_seg)
print train_data.head()
# ------------------------------------------------------------------------------
# ---- region start : compute weights with expected cross entropy and pick top n keywords ----
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('Compute weights with expected cross entropy and pick the top n keywords')
logging.debug('Start building feature vectors...')
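# The region above only names the keyword-selection step. As a reference, a minimal
# sketch of the usual expected cross entropy weight for a term t,
# ECE(t) = P(t) * sum_c P(c|t) * log(P(c|t) / P(c)), computed from raw document counts.
# This is a generic textbook formulation and example counts, not the project's actual
# implementation.
import numpy as np

def expected_cross_entropy(n_docs_with_term_per_class, n_docs_per_class):
    n_docs = float(sum(n_docs_per_class))
    n_docs_with_term = float(sum(n_docs_with_term_per_class))
    p_t = n_docs_with_term / n_docs
    score = 0.0
    for n_tc, n_c in zip(n_docs_with_term_per_class, n_docs_per_class):
        if n_tc == 0:
            continue
        p_c_given_t = n_tc / n_docs_with_term   # P(c|t)
        p_c = n_c / n_docs                      # P(c)
        score += p_c_given_t * np.log(p_c_given_t / p_c)
    return p_t * score

# e.g. a term appearing in 8 of 10 documents of class A and 1 of 90 documents of class B
print expected_cross_entropy([8, 1], [10, 90])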
# -------------- region start : re-run the statistics after removing sentences mis-detected by OOD -------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('Re-run the statistics after removing sentences mis-detected by OOD')
from jiebanlp.toolSet import seg
import io
import re
# Use the current AIML training library to check whether a sentence has an AIML match
train_data_file_path = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/ood_dataset/dev_vesion/20160526/train_all.csv'
train_data = pd.read_csv(train_data_file_path, sep='\t', encoding='utf8')
train_data = train_data[train_data['LABEL'] != u'其他#其他']
# Word segmentation
sentence_to_seg = lambda x: seg(x,
                                sep=' ',
                                full_mode=False,
                                remove_stopword=False,
                                replace_number=False,
                                verbose=0)
train_data['WORDS'] = train_data['SENTENCE'].apply(sentence_to_seg)
ood_sentence = pd.read_csv('/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/ood_dataset/20150614-20150615测试集/20160125/data/ood_sentence.csv',
                           sep='\t', header=0, index_col=0)
pattern = re.compile(u'【.*')
filter_sentence = lambda x: pattern.sub('', ''.join(x.decode('utf8').split('|')[1:])).replace(' ', '')
ood_sentence['PAST1_SENTENCE'] = ood_sentence['PAST1_SENTENCE'].apply(filter_sentence)
ood_sentence['CUR_SENTENCE'] = ood_sentence['CUR_SENTENCE'].apply(filter_sentence)
ood_sentence['WORDS'] = ood_sentence['PAST1_SENTENCE'].apply(sentence_to_seg)
print ood_sentence.head()
print ood_sentence.shape
# print ood_sentence['PAST1_SENTENCE']
print 'After deduplication'
def word_similarity(word1, word2):
    '''
    Compute the similarity between two words.

    :param word1:
    :param word2:
    :return:
    '''
    try:
        return word2vec_model.n_similarity(word1, word2)
    except:
        return 0

sentence_to_seg = lambda x: seg(x,
                                sep=' ',
                                full_mode=config['full_mode'],
                                remove_stopword=config['remove_stopword'],
                                verbose=0)

def replace_oov_with_similar_word(sentence, keywords):
    result = []
    # print keywords
    for word in sentence.split():
        if word not in keywords:
            # print '*' * 10
            logging.debug('*' * 10)
            logging.debug(u'word (%s) is an OOV word' % (word))
            # print(u'word (%s) is an OOV word' % (word))
            # print word
            keywords_sim_score = np.asarray([word_similarity(word, item) for item in keywords])
            sorted_index = np.argsort(keywords_sim_score)[-1::-1]
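# The replace_oov_with_similar_word() snippet above stops after computing sorted_index.
# A minimal sketch of a plausible continuation, assuming the intent is to swap each OOV
# word for its most word2vec-similar keyword; the sim_threshold value and the fallback
# of keeping the original word are assumptions, not taken from the project.
def replace_oov_with_similar_word_sketch(sentence, keywords, sim_threshold=0.5):
    result = []
    for word in sentence.split():
        if word in keywords:
            result.append(word)
            continue
        # score every keyword against the OOV word using the word2vec model
        keywords_sim_score = np.asarray([word_similarity(word, item) for item in keywords])
        best = int(np.argmax(keywords_sim_score))
        if keywords_sim_score[best] > sim_threshold:
            result.append(keywords[best])   # replace the OOV word with its closest keyword
        else:
            result.append(word)             # no keyword is close enough: keep the word
    return ' '.join(result)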
def __seg__(self, sentence):
    sentence_to_seg = lambda x: seg(x,
                                    sep=' ',
                                    full_mode=self._full_mode,
                                    remove_stopword=config['remove_stopword'])
    return sentence_to_seg(sentence)
output1_file = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/segmentedCorpus/' \
               'jiebanlp/test_data/ch2r_test_file_seg.csv'
output2_file = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/segmentedCorpus/' \
               'jiebanlp/test_data/ch2r_test_file_seg_replace.csv'
train_file = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/segmentedCorpus/' \
             'jiebanlp/train_data/train_all.csv'
test = pd.read_csv(input_file, sep='\t', header=0, encoding='utf8')
train = pd.read_csv(train_file, sep='\t', header=0, encoding='utf8')
logging.debug('segment the file...')
test['SEGMENT'] = test['SENTENCE'].apply(lambda x: seg(x, sep='|', full_mode=False))
test['SEGMENT_FULL'] = test['SENTENCE'].apply(lambda x: seg(x, sep='|', full_mode=True))
test['SEGMENT_EVERYWORD'] = test['SEGMENT'].apply(lambda x: '|'.join(list(''.join(x.split('|')))))

from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

model_file = '/home/jdwang/PycharmProjects/corprocessor/word2vec/vector/weibo_50size.gem'
word2vec_model = Word2Vec.load(model_file)

def cosin_sim(w1, w2):
    try:
        sim = cosine_similarity(word2vec_model[w1].reshape(1, -1),
                                word2vec_model[w2].reshape(1, -1))
        return sim
    except:
        # word missing from the word2vec vocabulary
        return 0
print ("总共类别数:%d" % (len(index_to_label))) np.save(file("".join(config["label_file_path"]), "w"), index_to_label) label_to_index = {label: idx for idx, label in enumerate(index_to_label)} train_data["LABEL_INDEX"] = train_data["LABEL"].map(label_to_index) test_data["LABEL_INDEX"] = test_data["LABEL"].map(label_to_index) # print train_data.head() logging.debug("=" * 20) logging.debug("对数据进行分词...") logging.debug("-" * 20) sentence_to_seg = lambda x: seg(x, sep=" ", full_mode=config["full_mode"], remove_stopword=config["remove_stopword"]) train_data["WORDS"] = train_data["SENTENCE"].apply(sentence_to_seg) test_data["WORDS"] = test_data["SENTENCE"].apply(sentence_to_seg) print train_data.head() if config["word2vec_to_solve_oov"]: word2vec_model = Word2Vec.load(config["word2vec_model_file_path"]) def word_similarity(word1, word2): """ 计算两个词的相似性 :param word1: :param word2:
from jiebanlp.toolSet import seg
import io

out = io.open(output_file_path, 'w', encoding='utf8')
counter = 0
error_line_counter = 0
for line in io.open(input_file_path, 'r', encoding='utf8'):
    if (counter + 1) % 1e5 == 0:
        logging.debug('Record #%d' % (counter + 1))
    if (counter + 1) % 1e6 == 0:
        print('Record #%d' % (counter + 1))
    try:
        # backup = line
        line = line.strip()
        if len(line) == 0:
            continue
        seg_str = seg(line, sep=' ', full_mode=False)
        out.write(seg_str + '\n')
        # print seg_str
        # break
    except:
        logging.debug(u'exception! In line %d: %s' % (counter, line))
        error_line_counter += 1
    counter += 1

logging.debug('%d sentences in total' % (counter))
logging.debug('Number of lines that failed to convert: %d' % (error_line_counter))
end_time = timeit.default_timer()
print 'end! Running time: %ds!' % (end_time - start_time)
logging.debug('=' * 20)
logging.debug('end! Running time: %ds!' % (end_time - start_time))
def sentenceSegment():
    logging.info('beginning to segment sentences...')
    # if the output dir does not exist, create it
    if not os.path.exists(segmented_crpus_dir):
        logging.error("the segmentedCorpusDir/" + segmented_crpus_dir + " does not exist!\nCreating dir...")
        os.mkdir(segmented_crpus_dir)
    train = pd.DataFrame(columns=['LABEL', 'LABEL1', 'LABEL2', 'SENTENCE',
                                  'SEGMENT', 'SEGMENT_FULL', 'SEGMENT_EVERYWORD'])
    # get all the files in the dir
    fileList = os.listdir(outputCorpusDir)
    # get the number of files
    numFile = len(fileList)
    # print numFile
    for i in xrange(numFile):
        fileName = fileList[i]
        if fileName.endswith(".clean"):
            print fileName
            logging.info('segmenting the file (%s)...' % (fileName))
            # open the input and output files
            fileIn = open(outputCorpusDir + fileName, "r")
            fileOut = open(segmented_crpus_dir + fileName + ".seg", "w")
            # enumerate each line in the file
            for line in fileIn:
                line = line.strip()
                line = line.decode('utf8')
                # skip blank lines
                if len(line) == 0:
                    continue
                # segment the sentence with jieba, joining items with '|'
                sentence_seg = seg(line, sep='|', full_mode=False)
                sentence_seg_full_mode = seg(line, sep='|', full_mode=True)
                sentence_seg_everyword = '|'.join(list(''.join(sentence_seg.split('|'))))
                # write the segmented sentence to the output file
                fileOut.write(sentence_seg.encode('utf8') + "\n")
                # the file name ("label1#label2.clean") encodes the label
                label_full_name = fileName.replace('.clean', '')
                label1 = label_full_name.split('#')[0]
                label2 = '#'.join(label_full_name.split('#')[1:])
                train.loc[len(train)] = [label_full_name, label1, label2, line, sentence_seg,
                                         sentence_seg_full_mode, sentence_seg_everyword]
            # close the files
            fileIn.close()
            fileOut.close()
            logging.info('finished the file (%s)!' % (fileName))
    logging.info('finished segmenting sentences!')
    all_data_file_path = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/segmentedCorpus/' \
                         'jiebanlp/train_data/train_all.csv'
    train.to_csv(all_data_file_path, sep='\t', encoding='utf8', index=None)
    print train.head()