def fetch_segwords(tablename):
    """Fetch segmented keywords with class IDs for training and for the bayes test set.

    Args:
        tablename: Unused. Both queries target fixed ``pzbase`` tables; the
            parameter is kept so existing callers keep working.

    Returns:
        A ``(rst1, rst2)`` tuple. ``rst1`` is the result of the training
        query (``keyword_segmented``, ``seqno`` for processed training rows
        whose keyword is not in the bayes test set); ``rst2`` is the result
        of the test query (rows with ``predict_method='bayes'``).
    """
    train_sql = (
        "SELECT t1.keyword_segmented,t2.seqno "
        "FROM pzbase.ai_keywords_classification_train t1 "
        "inner join pzbase.ai_keywords_classification_classdef t2 "
        "on t1.class_level1=t2.class_level1 and t1.class_level2=t2.class_level2 "
        "and t1.class_level3=t2.class_level3 and t2.search_word_flag=1 "
        "where t1.proc_flag=1 and t1.keyword not in "
        "(select keyword from pzbase.ai_keywords_classification_test where predict_method='bayes')"
    )
    test_sql = (
        "SELECT keyword_segmented,seqno FROM pzbase.ai_keywords_classification_test "
        "where predict_method='bayes'"
    )

    mysql = msc.MyPymysqlPool("dbMysql")
    try:
        rst1 = mysql.getAll(train_sql)
        rst2 = mysql.getAll(test_sql)
    finally:
        # Release the pooled connection even when a query fails.
        mysql.dispose()

    loginfo = ' %d, %d segmented keywords and classIDs are fetched.' % (
        len(rst1), len(rst2))
    gl.write_log(logpath, 'info', loginfo)
    return rst1, rst2
def load_wordVectors(model_filepath):
    """Load a saved word2vec model and list its vocabulary.

    Args:
        model_filepath: Path handed to ``word2vec.Word2Vec.load``.

    Returns:
        A ``(model, vocab)`` tuple where ``vocab`` is a list of every key in
        ``model.wv.vocab``.
    """
    gl.write_log(logpath, 'info', 'loding question word2ven model...')
    model = word2vec.Word2Vec.load(model_filepath)
    # Every word known to the model.
    vocab = [word for word in model.wv.vocab.keys()]
    return model, vocab
def index_search(query_words, title_number):
    """Query the inverted index for each query word and collect the hits.

    Args:
        query_words: Sequence of segmented query words; indexed positionally
            to label each result set.
        title_number: Maximum hits requested per word. A falsy value falls
            back to the module-level ``get_title_number``.

    Returns:
        A dict mapping each query word to a dict keyed by the hit's
        ``seq_no``; every value holds the hit's ``source``, ``ad_title`` and
        ``segmented_words``.
    """
    limit = title_number if title_number else get_title_number

    # One result set is expected per segmented query word, in the same order.
    words_results = tml.query_index(query_words, int(limit), index_searcher, query_parser, logpath)

    hit_total = 0
    words_results_dict = {}
    for position, word_results in enumerate(words_results):
        hits_by_seq_no = {}
        for hit in word_results:
            hits_by_seq_no[hit.get(key="seq_no")] = {
                'source': hit.get(key="source"),
                'ad_title': hit.get(key="title"),
                'segmented_words': hit.get(key="segwords"),
            }
            hit_total += 1
        words_results_dict[query_words[position]] = hits_by_seq_no

    gl.write_log(logpath, 'info',
                 ' query words: %s, result number: %d.' % (query_words, hit_total))
    return words_results_dict
def load_dicts(path, logpath):
    """Load a jieba user dictionary and boost the frequency of each entry.

    Args:
        path: UTF-8 text file with one dictionary entry per line.
        logpath: Log destination passed to ``gl.write_log``.
    """
    jieba.load_userdict(path)

    # Tune word frequencies so that out-of-vocabulary entries rank first and
    # are matched preferentially. The file is opened in a ``with`` block so
    # the handle is closed, and blank lines are skipped so an empty string is
    # never submitted as a word.
    with open(path, 'r', encoding='utf8') as dict_file:
        for line in dict_file:
            entry = line.strip()
            if entry:
                jieba.suggest_freq(entry, tune=True)

    loginfo = ' User dict %s has beed loaded.' % path
    gl.write_log(logpath, 'info', loginfo)
def get_similarity_words(model, query_word, number):
    """Return the ``number`` entries most similar to ``query_word``.

    Args:
        model: Object exposing ``most_similar(word, topn=...)``.
        query_word: Word to look up.
        number: Passed through as ``topn``.

    Returns:
        Whatever ``model.most_similar`` returns, or an empty string when the
        lookup raises ``KeyError`` (the failure is logged).
    """
    try:
        return model.most_similar(query_word, topn=number)
    except KeyError:
        gl.write_log(logpath, 'error', ' The word is not in vocabulary!')
    return ''
def fetch_segwords(tablename):
    """Fetch every non-null ``ad_title_segwords`` value from ``tablename``.

    Args:
        tablename: Table to read; interpolated directly into the SQL text,
            so it must come from a trusted source.

    Returns:
        The result of ``mysql.getAll`` for the query.
    """
    sql = "SELECT ad_title_segwords FROM %s where ad_title_segwords is not null" % tablename

    mysql = msc.MyPymysqlPool("dbMysql")
    try:
        rst = mysql.getAll(sql)
    finally:
        # Release the pooled connection even when the query fails.
        mysql.dispose()

    loginfo = ' %d rows are fetched.' % len(rst)
    gl.write_log(logpath, 'info', loginfo)
    return rst
def fetch_segwords(tablename):
    """Fetch ``question_stem_segment`` for every segmented row of ``tablename``.

    Args:
        tablename: Table to read; concatenated directly into the SQL text,
            so it must come from a trusted source.

    Returns:
        The result of ``mysql.getAll`` for the query.
    """
    sql = ''.join(['SELECT question_stem_segment FROM ', tablename, " where is_segmented=1"])

    mysql = msc.MyPymysqlPool("dbMysql")
    try:
        rst = mysql.getAll(sql)
    finally:
        # Release the pooled connection even when the query fails.
        mysql.dispose()

    loginfo = ' %d rows have been fetched.' % len(rst)
    gl.write_log(logpath, 'info', loginfo)
    return rst
def incrementally_build_model(original_modelpath, new_model_filepath, sentences):
    """Continue training a saved word2vec model and save it under a new path.

    Args:
        original_modelpath: Path of the model to load.
        new_model_filepath: Path the updated model is saved to.
        sentences: Corpus used to extend the vocabulary and to train.
    """
    model = word2vec.Word2Vec.load(original_modelpath)
    model.build_vocab(sentences, update=True)

    # Read after build_vocab and reused for both the debug print and training.
    example_count = model.corpus_count
    epoch_count = model.iter
    print(example_count, epoch_count)

    model.train(sentences, total_examples=example_count, epochs=epoch_count)
    model.save(new_model_filepath)

    gl.write_log(
        logpath, 'info',
        ' word2vec model %s has been built incrementally based on %s!' % (new_model_filepath,
                                                                          original_modelpath))
def calculate_similarity(model, word1_list, word2_list):
    """Return the similarity between two word lists.

    Args:
        model: Object exposing ``n_similarity(list_a, list_b)``.
        word1_list: First list of words.
        word2_list: Second list of words.

    Returns:
        The value of ``model.n_similarity``, or ``-1`` when it raises
        ``KeyError`` (the failure is logged).
    """
    try:
        return model.n_similarity(word1_list, word2_list)
    except KeyError:
        gl.write_log(logpath, 'error', ' The words similarity is not available!')
        return -1
def fetch_dest_segwords(tablename, field_value):
    """Fetch segmented questions of ``tablename`` that belong to ``field_value``.

    Args:
        tablename: Table to read; concatenated directly into the SQL text.
        field_value: Value compared against the ``in_tablename`` column.

    Returns:
        The result of ``mysql.getAll``: ``question_seqno``, ``question_stem``
        and ``question_stem_segment_clear`` for rows with ``is_segmented=1``.
    """
    # NOTE(review): both arguments are spliced into the statement unescaped;
    # a quote inside field_value breaks the query. Pass only trusted values,
    # or switch to bound parameters if the pool's getAll supports them.
    sql = ''.join([
        'SELECT question_seqno, question_stem, question_stem_segment_clear FROM ',
        tablename,
        " where is_segmented=1 and in_tablename='",
        field_value,
        "'",
    ])

    mysql = msc.MyPymysqlPool("dbMysql")
    try:
        rst = mysql.getAll(sql)
    finally:
        # Release the pooled connection even when the query fails.
        mysql.dispose()

    loginfo = ' %d matched questions have been fetched.' % len(rst)
    gl.write_log(logpath, 'info', loginfo)
    return rst
def get_stopwords(path, logpath):
    """Read a stop-word file into a list.

    Args:
        path: UTF-8 text file with one stop word per line.
        logpath: Log destination passed to ``gl.write_log``.

    Returns:
        A list with each line of the file, stripped of surrounding
        whitespace, in file order.
    """
    with open(path, "r", encoding='utf8') as handle:
        stopwords = [raw_line.strip() for raw_line in handle.readlines()]

    gl.write_log(logpath, 'info', ' Stop words dict %s has beed loaded.' % path)
    return stopwords
def get_records(tablename):
    """Fetch the not-yet-segmented questions of ``tablename``.

    Args:
        tablename: Table to read; concatenated directly into the SQL text,
            so it must come from a trusted source.

    Returns:
        The result of ``mysql.getAll``: ``question_seqno`` and the
        upper-cased ``question_stem`` for rows with ``is_segmented=0``.
    """
    sql = ''.join([
        'SELECT question_seqno, upper(question_stem) as question_stem FROM ',
        tablename,
        " where is_segmented=0",
    ])

    mysql = msc.MyPymysqlPool("dbMysql")
    try:
        rst = mysql.getAll(sql)
    finally:
        # Release the pooled connection even when the query fails.
        mysql.dispose()

    loginfo = ' %d rows are fetched.' % len(rst)
    gl.write_log(logpath, 'info', loginfo)
    return rst
def incrementally_build_model(model_path, sentences):
    """Continue training the word2vec model at ``model_path`` and save it in place.

    Args:
        model_path: Path the model is loaded from and saved back to.
        sentences: Corpus used to extend the vocabulary and to train.
    """
    model = word2vec.Word2Vec.load(model_path)
    model.build_vocab(sentences, update=True)

    # Read after build_vocab and reused for both the debug print and training.
    example_count = model.corpus_count
    epoch_count = model.iter
    print(example_count, epoch_count)

    model.train(sentences, total_examples=example_count, epochs=epoch_count)
    model.save(model_path)

    gl.write_log(logpath, 'info', ' model %s has been built incrementally!' % model_path)
def get_titles(tablename):
    """Fetch the ad titles of ``tablename`` that are not marked as processed.

    Args:
        tablename: Table to read; interpolated directly into the SQL text,
            so it must come from a trusted source.

    Returns:
        The result of ``mysql.getAll``; per the original note, a list of
        dicts such as ``{'seq_no': 97, 'ad_title': '...'}``.
    """
    # Titles are upper-cased so that case variants of the same brand name
    # (e.g. Boss / BOSS / boss) match each other.
    sql = ("SELECT seq_no, upper(ad_title) as ad_title FROM %s "
           "where proc_flag is null or proc_flag<>1" % tablename)

    mysql = msc.MyPymysqlPool("dbMysql")
    try:
        rst = mysql.getAll(sql)
    finally:
        # Release the pooled connection even when the query fails.
        mysql.dispose()

    loginfo = ' %d titles are fetched.' % len(rst)
    gl.write_log(logpath, 'info', loginfo)
    return rst
def get_titles(tablename):
    """Fetch every keyword of ``tablename`` ordered by ``seqno``.

    Args:
        tablename: Table to read; interpolated directly into the SQL text,
            so it must come from a trusted source.

    Returns:
        The result of ``mysql.getAll``; per the original note, a list of
        dicts such as ``{'seqno': 97, 'keyword': '...'}``.
    """
    # Keywords are upper-cased so that case variants (Boss / BOSS / boss)
    # match each other. An earlier variant of this query was restricted to
    # rows with proc_flag=0; all rows are read now.
    sql = "SELECT seqno, upper(keyword) as keyword FROM %s order by seqno" % tablename

    mysql = msc.MyPymysqlPool("dbMysql")
    try:
        rst = mysql.getAll(sql)
    finally:
        # Release the pooled connection even when the query fails.
        mysql.dispose()

    loginfo = ' %d titles are fetched.' % len(rst)
    gl.write_log(logpath, 'info', loginfo)
    return rst
def create_index(index_dir):
    """Build the inverted index and, if it was built in a staging directory, swap it in.

    Args:
        index_dir: Final location of the inverted index.
    """
    gl.write_log(logpath, 'info', 'inverted index are creating...')

    index_dir_writen, index_dir_processing = check_index_directory(index_dir)

    # The whole table is re-indexed each time; the title count is small.
    write_index_file(index_dir_writen, tablename)

    if index_dir_writen != index_dir_processing:
        # The index was written straight into index_dir; nothing to swap.
        return

    # Replace the live index with the freshly built staging directory.
    shutil.rmtree(index_dir)
    os.rename(index_dir_writen, index_dir)
    gl.write_log(logpath, 'info',
                 'Inverted index directory %s has been renamed.' % index_dir)
def build_model(model_path, sentences, embedding_size=128, in_window=5, in_min_count=5):
    """Train a word2vec model from scratch and save it.

    Args:
        model_path: Path the trained model is saved to.
        sentences: Training corpus handed to ``word2vec.Word2Vec``.
        embedding_size: Passed as ``size``.
        in_window: Passed as ``window``.
        in_min_count: Passed as ``min_count``.
    """
    training_options = {
        'sg': 1,
        'size': embedding_size,
        'window': in_window,
        'min_count': in_min_count,
        # One worker per CPU reported by multiprocessing.
        'workers': multiprocessing.cpu_count(),
    }
    trained = word2vec.Word2Vec(sentences, **training_options)
    trained.save(model_path)

    gl.write_log(logpath, 'info', ' model %s has been built initially!' % model_path)
def segment_title(titles, tablename):
    """Segment every ad title and write the result back to ``tablename``.

    Args:
        titles: Iterable of mappings with ``seq_no`` and ``ad_title`` keys.
        tablename: Table passed to ``write_segmented_words``.
    """
    handled = 0
    for handled, title in enumerate(titles, start=1):
        seq_no = title["seq_no"]
        ad_title = title["ad_title"]

        raw_segments = tml.words_segment(ad_title, stopwords, GOOD_WORDS)
        words = re.sub(CUT_WORDS, '', str(raw_segments))
        write_segmented_words(tablename, seq_no, words)

        # Progress is reported with the zero-based position of the title
        # just written, once every 1000 titles.
        position = handled - 1
        if position % 1000 == 0:
            gl.write_log(logpath, 'info', ' progress status: %d ' % position)

    gl.write_log(logpath, 'info',
                 " Total %d titles's segmented words have been writen." % handled)
def segment_questions(records, tablename):
    """Segment every question stem and write the result back to ``tablename``.

    Args:
        records: Iterable of mappings with ``question_seqno`` and
            ``question_stem`` keys.
        tablename: Table passed to ``write_segmented_words``.
    """
    handled = 0
    for handled, record in enumerate(records, start=1):
        seqno = record["question_seqno"]
        stemwords = record["question_stem"]

        # HMM is enabled because it gave better results in practice.
        raw_segments = tml.words_segment(stemwords, stopwords, GOOD_WORDS, HMM=True)
        segwords = re.sub(CUT_WORDS, '', str(raw_segments))
        segwords = tml.iterate_replacements(segwords, '\\', '')
        # Collapse runs of spaces into a single space.
        # NOTE(review): the search pattern is assumed to be two spaces, as
        # implied by the original comment about squeezing repeated spaces.
        # Confirm against the repository copy.
        segwords = tml.iterate_replacements(segwords, '  ', ' ')
        write_segmented_words(tablename, seqno, stemwords, segwords)

        # Progress is reported with the zero-based position of the record
        # just written, once every 500 records.
        position = handled - 1
        if position % 500 == 0:
            gl.write_log(logpath, 'info', ' progress status: %d ' % position)

    gl.write_log(logpath, 'info',
                 " Total %d keyword's segmented words have been writen." % handled)
def check_index_directory(index_dir):
    """Decide which directory a new inverted index should be written into.

    Building an index can take a while, and the existing index should stay
    readable in the meantime. When ``index_dir`` already exists, a fresh
    ``<index_dir>_processing`` directory is created for the build so the
    caller can swap it in afterwards.

    Args:
        index_dir: Final location of the inverted index.

    Returns:
        A ``(index_dir_writen, index_dir_processing)`` tuple: the directory
        to write into, and the staging directory name. The two are equal
        only when staging is in use.
    """
    index_dir_processing = index_dir + "_processing"

    if os.path.exists(index_dir):
        # Start from an empty staging directory.
        if os.path.exists(index_dir_processing):
            shutil.rmtree(index_dir_processing)
        os.mkdir(index_dir_processing)
        gl.write_log(
            logpath, 'info',
            ' Temporary inverted index directory %s has been created.' % index_dir_processing)
        return index_dir_processing, index_dir_processing

    # No index yet: write directly into the final location.
    os.mkdir(index_dir)
    gl.write_log(logpath, 'info',
                 ' Inverted index directory %s has been created.' % index_dir)
    return index_dir, index_dir_processing
def segment_title(titles, tablename):
    """Segment every keyword and write the result back to ``tablename``.

    Args:
        titles: Iterable of mappings with ``seqno`` and ``keyword`` keys.
        tablename: Table passed to ``write_segmented_words``.
    """
    handled = 0
    for handled, title in enumerate(titles, start=1):
        seqno = title["seqno"]
        keyword = title["keyword"]

        # Segmentation runs with iscutall=False; iscutall=True was the
        # alternative tried previously.
        raw_segments = tml.words_segment(keyword, stopwords, GOOD_WORDS, iscutall=False)
        words = re.sub(CUT_WORDS, '', str(raw_segments))
        write_segmented_words(tablename, seqno, words)

        # Progress is reported with the zero-based position of the keyword
        # just written, once every 1000 keywords.
        position = handled - 1
        if position % 1000 == 0:
            gl.write_log(logpath, 'info', ' progress status: %d ' % position)

    gl.write_log(logpath, 'info',
                 " Total %d keyword's segmented words have been writen." % handled)
def insert_question_similarity(model, vocab, dest_records, base_records, threshold, tablename):
    """Score every destination question against every base question and store close pairs.

    Each pair whose similarity is at least ``threshold`` is inserted into
    ``tablename``. For every destination record the best-matching base
    sequence number and its score are printed.

    Args:
        model: Passed to ``calculate_similarity``.
        vocab: Not used by this function.
        dest_records: Records indexed positionally as
            ``(seqno, stem, segmented words)``.
        base_records: Records with the same positional layout; the sequence
            number is converted with ``int``.
        threshold: Minimum similarity for a pair to be inserted.
        tablename: Destination table; spliced into the SQL text.
    """
    mysql = msc.MyPymysqlPool("dbMysql")

    # NOTE(review): stems and segmented words are embedded in the statement
    # between single quotes without escaping; text containing a quote will
    # break the insert. Confirm upstream cleaning or use bound parameters.
    insert_template = (
        "insert into %s(question_seqno1,question_seqno2,question_stem1,question_stem2,"
        "question_stem_segment1,question_stem_segment2,similarity,load_time)"
        " values(%s,%s,'%s','%s','%s','%s',%s, CURRENT_TIMESTAMP());"
    )

    inserted = 0
    for dest_record in dest_records:
        seqno1 = dest_record[0]
        stem1 = dest_record[1]
        segwords1 = dest_record[2]
        segwords1_list = segwords1.split()

        max_similarity = -1
        max_seqno = -1
        for base_record in base_records:
            seqno2 = int(base_record[0])
            stem2 = base_record[1]
            segwords2 = base_record[2]
            segwords2_list = segwords2.split()

            # Only pairs where both sides have at least one word are scored.
            if not segwords1_list or not segwords2_list:
                continue

            similarity = calculate_similarity(model, segwords1_list, segwords2_list)
            if similarity > max_similarity:
                max_similarity = similarity
                max_seqno = seqno2
            if similarity >= threshold:
                sql = insert_template % (
                    tablename, str(seqno1), str(seqno2), str(stem1), str(stem2),
                    str(segwords1), str(segwords2), str(similarity))
                mysql.insert(sql)
                inserted += 1

        print('seqno1, max_seqno, max_similarity: ', seqno1, max_seqno, max_similarity)

    mysql.end()  # finish and commit
    mysql.dispose()

    gl.write_log(logpath, 'info',
                 '%d similar rows have been inserted into %s!' % (inserted, tablename))
def write_index_file(index_dir, tablename):
    """Create a whoosh inverted index in ``index_dir`` from the rows of ``tablename``.

    Args:
        index_dir: Existing directory the index is created in.
        tablename: Table passed to ``fetch_segwords``; each returned row is
            read by the keys ``seq_no``, ``ad_title_source``, ``ad_title``
            and ``ad_title_segwords``.
    """
    analyzer = ChineseAnalyzer(minsize=1)  # minsize=1 so single-character words are indexed
    schema = Schema(seq_no=NUMERIC(stored=True),
                    source=TEXT(stored=True),
                    title=TEXT(stored=True),
                    segwords=TEXT(stored=True, analyzer=analyzer))
    ix = create_in(index_dir, schema)

    # Fetch before opening the writer so a database failure cannot leave an
    # open writer behind.
    datasets = fetch_segwords(tablename)

    # As a context manager the writer commits on success and is cancelled if
    # a row fails, instead of staying open.
    with ix.writer() as writer:
        for dataset in datasets:
            writer.add_document(
                seq_no=int(dataset["seq_no"]),
                source=dataset["ad_title_source"],
                title=dataset["ad_title"].replace('\n', ''),
                segwords=dataset["ad_title_segwords"].replace('\n', ''),
            )

    loginfo = 'Inverted index for %s has been created.' % tablename
    gl.write_log(logpath, 'info', loginfo)
help="the train/test table name", default='pzbase.ai_keywords_classification_test') args = args.parse_args() args_dict = args.__dict__ return args_dict if __name__ == '__main__': global logpath args_dict = comand_line_set() tablename = args_dict.get("tablename") logpath = args_dict.get("logpath") gl.write_log(logpath, 'info', '\n\n') loginfo = 'segwords vectorization starting...' gl.write_log(logpath, 'info', loginfo) # get segmeted keywords rst1, rst2 = fetch_segwords(tablename) word2vec_vectorizer(rst1, rst2) exit() # tfidf_vec_trainX, tfidf_vec_testX, trainy, testy = tfidf_vectorizer1(rst1, 0.1) # vec_trainX, vec_testX, trainy, seqno_test = tfidf_vectorizer2(rst1, rst2) vec_trainX, vec_testX, trainy, seqno_test = tf_vectorizer(rst1, rst2) model = nativebayes_model_train(vec_trainX, trainy) # joblib.dump((tfidf_vec_trainX, tfidf_vec_testX, trainy, seqno_test), 'vec_data.pkl'.format(), compress=3) # joblib.dump(model, 'vec_model.pkl'.format(), compress=3)
global get_title_number global get_similar_number global index_searcher global query_parser args_dict = comand_line_set() index_path = args_dict.get("indexpath") logpath = args_dict.get("logpath") user_dict_path = args_dict.get("userdictpath") comp_dict_path = args_dict.get("compdictpath") stop_word_path = args_dict.get("stopwordpath") modelpath = args_dict.get("modelpath") get_title_number = args_dict.get("titlenumber") get_similar_number = args_dict.get("similarnumber") gl.write_log(logpath, 'info', "\n\n") loginfo = ' word retrieval service starting...' gl.write_log(logpath, 'info', loginfo) # preload dicts to save running time tml.load_dicts(user_dict_path, logpath) tml.load_dicts(comp_dict_path, logpath) stopwords = tml.get_stopwords(stop_word_path, logpath) ix = open_dir(index_path) # for read only index_searcher = ix.searcher() query_parser = QueryParser("segwords", schema=ix.schema) loginfo = ' inverted index file %s has been opened.' % index_path gl.write_log(logpath, 'info', loginfo) # preload similar model to save running time