def gen_true_data(source_path, out_path):
    t = Tokenizer()
    res = []
    total_num = len(open(source_path, encoding="utf8").readlines())
    with open(source_path, encoding="utf8") as fin:
        for line in tqdm(fin, total=total_num):
            query = line.strip().split("\t")[0]
            senten2term, _ = t.tokenize(query)
            if len(senten2term) < 2:
                continue
            res.append("\t".join(senten2term) + "\n")
    with open(out_path, "w", encoding="utf8") as fout:
        fout.write("".join(res))
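
# Usage sketch (hypothetical paths): the input file is expected to hold one query per
# line with the query in the first tab-separated field; the output keeps only queries
# that segment into at least two terms.
if __name__ == "__main__":
    gen_true_data("data/query_log.tsv", "data/query_tokens.tsv")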
def get_corpus(file_path="position_name_desc_re"):
    title_entitys = {}
    token = Tokenizer()
    for file_name in os.listdir(file_path):  # iterate over the files in the folder
        text = [
            line.strip().lower().replace("\\n", "").split('\t')
            for line in open(file_path + "/" + file_name, encoding="utf8").readlines()
        ]
        for line in tqdm(text, total=len(text)):
            if len(line) <= 1:
                continue
            import_tokens = token.select_important_tokens("".join(line[1:]))
            if line[0] not in title_entitys:
                title_entitys[line[0]] = []
            title_entitys[line[0]].extend(import_tokens)
    return title_entitys
def label_data(path, out_path):
    print("generate query weighting label data")
    t = Tokenizer()
    res = []
    total_num = len(open(path, encoding="utf8").readlines())
    for i, line in enumerate(tqdm(open(path, encoding="utf8"), total=total_num)):
        line_info = json.loads(line)
        cv_info, jd_info = json.loads(line_info['cv']), json.loads(line_info['jd'])
        senten2term, word_seg = t.tokenize(jd_info['name'])
        weight_cv = cv_weight(cv_info, senten2term, t)
        weight_jd = jd_weight(jd_info, senten2term, t)
        # blend the JD-side and CV-side weights (0.6 / 0.4) for each term
        tmp = "\t".join([
            weight_cv[j][0] + ":" + str(round(0.6 * weight_jd[j][1] + 0.4 * weight_cv[j][1], 3))
            for j in range(len(weight_cv))
        ]) + "\n"
        res.append(tmp)
    print("writing label data %s" % (out_path))
    with open(out_path, "w", encoding="utf8") as fout:
        fout.write("".join(res))
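
# Minimal worked example of the 0.6/0.4 blend used above (made-up weights for one term):
# jd_weight says ("java", 0.5) and cv_weight says ("java", 0.3).
example_jd, example_cv = [("java", 0.5)], [("java", 0.3)]
blended = [(w, round(0.6 * wj + 0.4 * wc, 3))
           for (w, wj), (_, wc) in zip(example_jd, example_cv)]
# blended == [("java", 0.42)], serialized in the label file as "java:0.42"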
class LanguageModelScore:
    """Value = perplexity(entire query) / perplexity(entire query without current term).
    This feature reflects the query quality with/without the current term."""

    def __init__(self):
        self.seg = Tokenizer()
        self.lm = kenlm.Model(conf.lm)
        self.punctuation_list = ".。,,,、??::;;{}[]【】“‘’”《》/!!%……()<>@#$~^¥%&*\"\'=+-_——「」"
        self.stopwords = [
            e.strip() for e in open(conf.stop_words, encoding="utf8").readlines()
        ]

    def weight_lm(self, sentence):
        senten2term, word_seg = self.seg.tokenize(sentence)
        total_score = self.lm.perplexity(' '.join(senten2term))
        weight, weight_sum = [], 1e-8  # epsilon avoids division by zero when every term is filtered out
        for i in range(len(senten2term)):
            # perplexity of the query with the i-th term removed
            tmp = [senten2term[j] for j in range(len(senten2term)) if i != j]
            score = self.lm.perplexity(' '.join(tmp))
            val = total_score / score
            if senten2term[i] in self.punctuation_list or senten2term[i] in self.stopwords:
                val = 0.0
            weight.append((senten2term[i], val))
            weight_sum += val
        token_weight = [(k, round(v / weight_sum, 3)) for k, v in weight]
        return token_weight
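
# Usage sketch for LanguageModelScore, assuming conf.lm points to a trained KenLM
# binary and conf.stop_words to a one-word-per-line stopword file:
if __name__ == "__main__":
    lm_scorer = LanguageModelScore()
    # returns [(token, weight), ...] with weights normalized to sum to ~1.0;
    # stopwords and punctuation receive weight 0.0
    print(lm_scorer.weight_lm("资深java开发工程师"))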
def __init__(self, ckpt_num=156000, is_training=False):
    #init_log()
    self.logs = {}
    batch_size = 1
    logging.info("Init query weight model ...")
    self.sp = Tokenizer()
    self.lm = language_model()
    self.xgb_model = xgb.Booster(model_file=conf.rank_model)
    #self.xgb_dict = parse_xgb_dict(conf.rank_model + '.txt')
    tf.logging.set_verbosity(tf.logging.INFO)
    tf_float = tf.bfloat16 if FLAGS.use_bfloat16 else tf.float32
    self.input_ids = tf.placeholder(dtype=tf.int64, shape=[batch_size, FLAGS.seq_len], name="input_ids")
    self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[batch_size, FLAGS.seq_len], name="segment_ids")
    self.input_mask = tf.placeholder(dtype=tf_float, shape=[batch_size, FLAGS.seq_len], name="input_mask")
    self.label_ids = tf.placeholder(dtype=tf.int64, shape=[batch_size], name="label_ids")
    inp = tf.transpose(self.input_ids, [1, 0])
    seg_id = tf.transpose(self.segment_ids, [1, 0])
    inp_mask = tf.transpose(self.input_mask, [1, 0])
    self.sess = tf.Session()
    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)
    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)
    self.output, self.attn_prob, self.attention_out = \
        xlnet_model.output_encode, xlnet_model.attn_prob, xlnet_model.attention_out
    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('#params: {}'.format(num_params))
    #### load pretrained models
    xlnet_model.saver.restore(self.sess, FLAGS.init_checkpoint + "/model.ckpt-" + str(ckpt_num))
    # scaffold_fn = model_utils.init_from_checkpoint(FLAGS)
    logging.info("Init query weight model finished ...")
class queryDocSim:
    def __init__(self, model_type='rnn', ckpt_num=14000):
        tf.logging.set_verbosity(tf.logging.INFO)
        self.tokenizer = Tokenizer()
        self.encoder = Encoder(model_type)
        self.estimator_save_name = FLAGS.model_dir + "/model.ckpt-" + str(ckpt_num)
        # model inputs
        self.a_in = tf.placeholder(tf.int32, [None, SEQ_LEN], name='a')  # [batch_size, SEQ_LEN]
        self.b_in = tf.placeholder(tf.int32, [None, None, SEQ_LEN], name='b')  # [batch_size, 1 + MAX_NUM_NEG, SEQ_LEN]
        self.is_training = tf.placeholder_with_default(False, shape=())
        # create the session and build the semantic encoders
        self.session = tf.Session()
        self.word_embed, self.intent_embed = self.encoder.create_tf_embed(
            self.a_in, self.b_in, self.is_training)
        # restore the trained model
        tf.train.Saver().restore(self.session, self.estimator_save_name)

    def run_step(self, entity, entity_list):
        x_batch = np.array([seq2ids(entity)])
        y_batch = np.array([[seq2ids(e) for e in entity_list]])
        fetch = self.session.run(
            {'emb_a': self.encoder.emb_a, 'emb_b': self.encoder.emb_b, 'sim_ab': self.encoder.sim_ab},
            feed_dict={self.a_in: x_batch, self.b_in: y_batch, self.is_training: False})
        res = {
            entity + "-" + e: fetch['sim_ab'][0][i]
            for i, e in enumerate(entity_list)
        }
        sorted_res = sorted(res.items(), key=lambda d: d[1], reverse=True)
        return sorted_res

    def sim(self, query, doc, topk=5):
        sim_sum = 0.0
        tmp = self.tokenizer.select_important_tokens(doc)
        sim_res = self.run_step(query, tmp)
        prob_res = [(k, sigmoid(v)) for k, v in sim_res]
        for k, v in prob_res[:topk]:
            sim_sum += v
        sim_avg = round(sim_sum / topk, 3)
        return sim_avg, prob_res

    def cal_sim(self, req_dict):
        similarity, prob_res = 0.0, []
        try:
            query = req_dict['request']['p']['query']
            doc = req_dict['request']['p']['doc']
            similarity, prob_res = self.sim(query, doc)
        except Exception as e:
            logging.warning("run_error: %s" % traceback.format_exc())
        return similarity, prob_res
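
# Request-format sketch for queryDocSim.cal_sim; the query/doc strings are made up,
# the nesting ("request" -> "p" -> "query"/"doc") follows cal_sim above, and the
# checkpoint number assumes a trained model exists under FLAGS.model_dir.
if __name__ == "__main__":
    sim_model = queryDocSim(model_type='rnn', ckpt_num=14000)
    req = {"request": {"p": {"query": "java开发工程师",
                             "doc": "负责后端服务开发,熟悉java、spring、mysql"}}}
    similarity, prob_res = sim_model.cal_sim(req)
    print(similarity, prob_res[:5])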
def gen_entity_dict():
    token = Tokenizer()
    '''
    title_freq = Counter([line.split("\t")[0] for line in open("data/jdtitledesc", encoding="utf8").readlines()])
    top_title_freq = title_freq.most_common()
    with open("data/total_title", "w", encoding="utf8") as fin:
        for t, f in top_title_freq:
            fin.write(t + "\t" + str(f) + "\n")
    '''
    match_obj = re.compile("(.+)\t([0-9]+)", re.M | re.I)
    titles, title_words = [], []
    stop_word_re = "[" + "|".join(STOP_WORDS) + "]{1,}"
    custom_word_re = "[急聘|诚聘|双休|代表|高薪|五险]{1,}"
    punction_re = "[" + "\\|".join([e for e in PUNCTUATION_LIST]) + "]{1,}"
    salary_re = "50k"
    # sample string used to sanity-check the cleaning pattern
    t = re.sub(custom_word_re + stop_word_re, "", "50k,{急聘客服专员(双休 五险一金)")
    text = [
        line.strip().lower()
        for line in open("data/total_title", encoding="utf8").readlines()
    ]
    for line in tqdm(text, total=len(text)):
        match_res = match_obj.match(line)
        if not match_res:
            continue
        title, freq = match_res.group(1), int(match_res.group(2))
        if freq <= 2 or len(title) >= 10:
            continue
        #title = "50k,急聘客服专员(双休 五险一金)"
        title = re.split("[(|( )/]", title)[0]
        title = re.sub(custom_word_re + stop_word_re, "", title)
        titles.append(title)
        title_words.extend(token.cut(title)[0])
    title_freq = Counter(title_words).most_common()
    with open("data/title_entitys", "w", encoding="utf8") as fout:
        for t, f in title_freq:
            fout.write(t + "\n")
    with open("data/valid_titles", "w", encoding="utf8") as fout:
        for t, f in Counter(titles).most_common():
            fout.write(t + "\n")
class entitySimilar:
    def __init__(self, model_type='rnn', ckpt_num=0):
        tf.logging.set_verbosity(tf.logging.INFO)
        self.tokenizer = Tokenizer()
        self.encoder = Encoder(model_type)
        self.estimator_save_name = FLAGS.model_dir + "/model.ckpt-" + str(ckpt_num)
        # model inputs
        self.a_in = tf.placeholder(tf.int32, [None, SEQ_LEN], name='a')  # [batch_size, SEQ_LEN]
        self.b_in = tf.placeholder(tf.int32, [None, None, SEQ_LEN], name='b')  # [batch_size, 1 + MAX_NUM_NEG, SEQ_LEN]
        self.is_training = tf.placeholder_with_default(False, shape=())
        # create the session and build the semantic encoders
        self.session = tf.Session()
        self.word_embed, self.intent_embed = self.encoder.create_tf_embed(
            self.a_in, self.b_in, self.is_training)
        # restore the trained model
        tf.train.Saver().restore(self.session, self.estimator_save_name)

    def run_step(self, entity, entity_list):
        x_batch = np.array([seq2ids(entity)])
        y_batch = np.array([[seq2ids(e) for e in entity_list]])
        fetch = self.session.run(
            {'emb_a': self.encoder.emb_a, 'emb_b': self.encoder.emb_b, 'sim_ab': self.encoder.sim_ab},
            feed_dict={self.a_in: x_batch, self.b_in: y_batch, self.is_training: False})
        res = {
            entity + "-" + e: fetch['sim_ab'][0][i]
            for i, e in enumerate(entity_list)
        }
        sorted_res = sorted(res.items(), key=lambda d: d[1], reverse=True)
        return sorted_res

    def analyze(self, word, text):
        tmp = self.tokenizer.select_important_tokens(text)
        sim_res = self.run_step(word, tmp)
        prob_res = [(k, sigmoid(v)) for k, v in sim_res]
        return prob_res
class query_weight:
    def __init__(self, ckpt_num=156000, is_training=False):
        #init_log()
        batch_size = 1
        logging.info("Init query weight model ...")
        self.sp = Tokenizer()
        self.lm = language_model()
        self.xgb_model = xgb.Booster(model_file=conf.rank_model)
        tf.logging.set_verbosity(tf.logging.INFO)
        tf_float = tf.bfloat16 if FLAGS.use_bfloat16 else tf.float32
        self.input_ids = tf.placeholder(dtype=tf.int64, shape=[batch_size, FLAGS.seq_len], name="input_ids")
        self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[batch_size, FLAGS.seq_len], name="segment_ids")
        self.input_mask = tf.placeholder(dtype=tf_float, shape=[batch_size, FLAGS.seq_len], name="input_mask")
        self.label_ids = tf.placeholder(dtype=tf.int64, shape=[batch_size], name="label_ids")
        # XLNet expects [seq_len, batch_size] ordering
        inp = tf.transpose(self.input_ids, [1, 0])
        seg_id = tf.transpose(self.segment_ids, [1, 0])
        inp_mask = tf.transpose(self.input_mask, [1, 0])
        self.sess = tf.Session()
        xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
        run_config = xlnet.create_run_config(is_training, True, FLAGS)
        xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                       run_config=run_config,
                                       input_ids=inp,
                                       seg_ids=seg_id,
                                       input_mask=inp_mask)
        self.output, self.attn_prob, self.attention_out = \
            xlnet_model.output_encode, xlnet_model.attn_prob, xlnet_model.attention_out
        num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
        tf.logging.info('#params: {}'.format(num_params))
        #### load pretrained models
        xlnet_model.saver.restore(self.sess, FLAGS.init_checkpoint + "/model.ckpt-" + str(ckpt_num))
        # scaffold_fn = model_utils.init_from_checkpoint(FLAGS)
        logging.info("Init query weight model finished ...")

    def run(self, req_dict):
        result = None
        try:
            query = req_dict["request"]["p"]["query"]
            result = self.run_step(query)
        except Exception as e:
            logging.warning("run_error: %s" % traceback.format_exc())
        return result

    def run_step(self, text):
        cur_sent = preprocess_text(text.strip(), lower=FLAGS.uncased)
        tokens, ids = self.sp.encode_ids(cur_sent)
        sent_len, diff_len = len(ids) - 1, FLAGS.seq_len - len(ids)
        # pad with <sep> ids and finish with <cls>
        # cat_data = np.concatenate([inp, a_data, sep_array, b_data, sep_array, cls_array])
        input_ids = ids + [SEP_ID] * (diff_len - 1) + [CLS_ID]
        input_tokens = tokens + ["<sep>"] * (diff_len - 1) + ["<cls>"]
        input_mask = [1] + [0] * sent_len + [1] * diff_len
        # seg_id = ([0] * (reuse_len + a_data.shape[0]) + [0] + [1] * b_data.shape[0] + [1] + [2])
        segment_ids = [0] * (sent_len + 1) + [2] * diff_len
        # truncate everything to the model's sequence length
        input_ids = input_ids[:FLAGS.seq_len]
        input_tokens = input_tokens[:FLAGS.seq_len]
        input_mask = input_mask[:FLAGS.seq_len]
        segment_ids = segment_ids[:FLAGS.seq_len]
        '''
        logging.info("text: %s, seg_text: %s" % (text, " ".join([str(x) for x in tokens])))
        logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        '''
        il = {'text': text,
              'seg_text': " ".join([str(x) for x in tokens]),
              'input_ids': " ".join([str(x) for x in input_ids]),
              'input_mask': " ".join([str(x) for x in input_mask]),
              'segment_ids': " ".join([str(x) for x in segment_ids])}
        logging.info(json.dumps(il, ensure_ascii=False))
        feed_dict = {
            self.input_ids: [input_ids],
            self.segment_ids: [segment_ids],
            self.input_mask: [input_mask]
        }
        fetch = self.sess.run([self.output, self.attn_prob, self.attention_out], feed_dict)
        out_encode, atten_prob = fetch[0], fetch[1]
        #weight0 = normalization(self.cal_weight(out_encode, input_tokens))
        weight_attn = normalization(self.weight_attenprob(atten_prob, tokens))
        weight_idf = normalization(self.sp.cal_weight_idf(tokens[1:]))
        weight_lm = normalization(self.lm.cal_weight_lm(tokens[1:]))
        weight_rule = self.merge_weight([(weight_attn, 0.5), (weight_idf, 0.5), (weight_lm, 0.5)])
        self.weight_attn, self.weight_idf, self.weight_lm = weight_attn, weight_idf, weight_lm
        sen2terms = [e for e in tokens[1:]]
        weightrank = self.rank_weight(sen2terms, weight_attn, weight_idf, weight_lm)
        weight_rank = normalization(weightrank)
        weight = self.merge_weight([(weight_rank, 0.7), (weight_rule, 0.0)])  # 0.6-0.4
        wl = {'weight_rank': ' '.join([str(k) + ':' + str(v) for k, v in weight_rank]),
              'weight_rule': ' '.join([str(k) + ':' + str(v) for k, v in weight_rule]),
              'weight': ' '.join([str(k) + ':' + str(v) for k, v in weight])}
        logging.info(json.dumps(wl, ensure_ascii=False))
        return weight

    def rank_weight(self, sen2terms, weight_attn, weight_idf, weight_lm):
        tmp, score_sum = [], 1e-8
        for term in sen2terms:
            feature_vector, _ = get_feature(term, sen2terms, weight_attn, weight_idf, weight_lm)
            feature = np.array(feature_vector)
            feature_csr = sparse.csr_matrix(feature)
            dmatrix = DMatrix(feature_csr)
            score = self.xgb_model.predict(dmatrix)[0]
            prob = 1.0 / (1 + math.exp(-1 * score))
            tmp.append((term, prob))
            score_sum += prob
        res = [(k, round(v / score_sum, 3)) for k, v in tmp]
        return res

    def merge_weight(self, weight_tuple):
        weight, weight_sum = [], 1e-8
        for j in range(len(weight_tuple[0][0])):
            tmp = 0.0
            for i in range(len(weight_tuple)):
                (word, val), coef = weight_tuple[i][0][j], weight_tuple[i][1]
                tmp += val * coef
            weight.append((weight_tuple[0][0][j][0], tmp))
            weight_sum += tmp
        token_weight = [(k, round(v / weight_sum, 3)) for k, v in weight]
        return token_weight

    def weight_attenprob(self, attention_probs, input_tokens):
        weights = []
        (row, col, batch, dim) = attention_probs.shape
        for j in range(col):
            tmp = 0.0
            for i in range(row):
                if i == j:
                    continue
                tmp += attention_probs[i][j][0][0]
            weights.append(tmp)
        token_weight = [(input_tokens[i], weights[i])
                        for i in range(min(len(input_tokens), len(weights)))
                        if input_tokens[i] not in special_words]
        token_weights = token_weight + [
            (input_tokens[i], 0.0)
            for i in range(len(token_weight) + 1, len(input_tokens))
        ]
        return token_weights

    def cal_weight(self, encode_vects, input_tokens):
        vects, vect = encode_vects[0], np.sum(encode_vects, axis=1)[0]
        token_weights = [(input_tokens[i], cal_sim(vect, vects[i]))
                         for i in range(len(vects))
                         if input_tokens[i] not in special_words]
        #token_weight = [(input_tokens[i], weight[i-1]) if input_tokens[i] not in special_words else (input_tokens[i], 0.0) for i in range(len(input_tokens))]
        return token_weights
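
# End-to-end usage sketch for query_weight: the request nesting matches run() above,
# and the checkpoint number assumes a trained XLNet model under FLAGS.init_checkpoint.
if __name__ == "__main__":
    qw = query_weight(ckpt_num=156000)
    req = {"request": {"p": {"query": "资深java开发工程师"}}}
    # returns [(token, weight), ...] with weights normalized to roughly sum to 1.0
    print(qw.run(req))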
def _create_data(idx, input_paths):
    # Load sentence-piece model
    #sp = spm.SentencePieceProcessor(); sp.Load(FLAGS.sp_path)
    sp = Tokenizer()

    input_shards = []
    total_line_cnt = 0
    for input_path in input_paths:
        input_data, sent_ids = [], []
        sent_id, line_cnt = True, 0
        tf.logging.info("Processing %s", input_path)
        for line in tf.gfile.Open(input_path):
            if line_cnt % 100000 == 0:
                tf.logging.info("Loading line %d", line_cnt)
            line_cnt += 1
            if not line.strip():
                if FLAGS.use_eod:
                    sent_id = not sent_id
                    cur_sent = [EOD_ID]
                else:
                    continue
            else:
                if FLAGS.from_raw_text:
                    cur_sent = preprocess_text(line.strip(), lower=FLAGS.uncased)
                    #cur_sent = encode_ids(sp, cur_sent)
                    _, cur_sent = sp.encode_ids(cur_sent)
                    #a = sp.encode_ids("java开发工程师")
                else:
                    cur_sent = list(map(int, line.strip().split()))
            input_data.extend(cur_sent)
            sent_ids.extend([sent_id] * len(cur_sent))
            sent_id = not sent_id
        tf.logging.info("Finish with line %d", line_cnt)
        if line_cnt == 0:
            continue
        input_data = np.array(input_data, dtype=np.int64)
        sent_ids = np.array(sent_ids, dtype=np.bool)
        total_line_cnt += line_cnt
        input_shards.append((input_data, sent_ids))

    tf.logging.info("[Task %d] Total number line: %d", idx, total_line_cnt)

    tfrecord_dir = os.path.join(FLAGS.save_dir, "tfrecords")
    filenames, num_batch = [], 0

    # Randomly shuffle input shards (with a fixed but distinct random seed)
    np.random.seed(100 * FLAGS.task + FLAGS.pass_id)
    perm_indices = np.random.permutation(len(input_shards))
    tf.logging.info("Using perm indices %s for pass %d",
                    perm_indices.tolist(), FLAGS.pass_id)

    input_data_list, sent_ids_list = [], []
    prev_sent_id = None
    for perm_idx in perm_indices:
        input_data, sent_ids = input_shards[perm_idx]
        # make sure that `sent_ids[0] == not prev_sent_id`
        if prev_sent_id is not None and sent_ids[0] == prev_sent_id:
            sent_ids = np.logical_not(sent_ids)

        # append to temporary list
        input_data_list.append(input_data)
        sent_ids_list.append(sent_ids)

        # update `prev_sent_id`
        prev_sent_id = sent_ids[-1]

    input_data = np.concatenate(input_data_list)
    sent_ids = np.concatenate(sent_ids_list)

    file_name, cur_num_batch = create_tfrecords(
        save_dir=tfrecord_dir,
        basename="{}-{}-{}".format(FLAGS.split, idx, FLAGS.pass_id),
        data=[input_data, sent_ids],
        bsz_per_host=FLAGS.bsz_per_host,
        seq_len=FLAGS.seq_len,
        bi_data=FLAGS.bi_data,
        sp=sp,
    )
    filenames.append(file_name)
    num_batch += cur_num_batch

    record_info = {"filenames": filenames, "num_batch": num_batch}
    return record_info
import os, traceback, logging, time, re, sys
from pyspark import SparkContext, SparkConf
from seg_utils import Tokenizer

static_jd_title = "titledesc"  # one of: titledesc, title, desc
token = Tokenizer()


def parse_line_jdtitle(line):
    title = []
    try:
        seg_line = line.strip().split('\t')
        if seg_line[0].isdigit() and len(seg_line) >= 4:
            title.append("_".join([seg_line[0], seg_line[3]]))
    except Exception as e:
        logging.warning('parse_line_jdtitle_err=%s,line:%s' % (repr(e), line))
    return title


def parse_line_jd(line):
    desc = []
    try:
        seg_line = line.strip().split('\t')
        if seg_line[0].isdigit() and len(seg_line) >= 34:
            important_tokens = token.select_important_tokens(
                seg_line[33].replace('\\n', ""))
            desc = ["_".join([seg_line[0], e]) for e in important_tokens]
    except Exception as e:
        logging.warning('parse_line_jd_err=%s,line:%s' % (repr(e), line))
    return desc
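
# Sketch of wiring the parsers above into a Spark job; the HDFS paths are placeholders.
# flatMap works here because both parsers return a (possibly empty) list per input line.
if __name__ == "__main__":
    sc = SparkContext(conf=SparkConf().setAppName("jd_desc_tokens"))
    (sc.textFile("hdfs:///path/to/jd_dump")
        .flatMap(parse_line_jd)
        .saveAsTextFile("hdfs:///path/to/jd_desc_tokens"))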
class TrainData():
    def __init__(self):
        self.tokenizer = Tokenizer()

    def original2corp(self):
        text = []
        print("extract corpus from original file: %s --> corpus file: %s"
              % (FLAGS.original_file, FLAGS.corpus_file))
        for line in open(FLAGS.original_file, encoding="utf8").readlines():
            try:
                e = line.strip().split("\t")[33].replace("\\n", "").lower()
            except:
                continue
            text.append(e)
        with open(FLAGS.corpus_file, "w", encoding="utf8") as fout:
            fout.write("\n".join(text))

    def gen_train_samples(self):
        self.original2corp()
        sample_set = {}
        np.random.seed(8)
        # load the data, one document per line (note: only the first 10 documents are read here)
        important_tokens = []
        text = open(FLAGS.corpus_file, encoding="utf8").readlines()[:10]
        print("select important tokens...")
        for e in tqdm(text, total=len(text)):
            tmp = self.tokenizer.select_important_tokens(clean_line(e.strip()))
            if len(tmp) < 10:
                continue
            important_tokens.append(tmp)
        # sample positive/negative pairs: words from the same document are positives,
        # words from other documents are negatives
        print("sample(1+k negative) train and valid set...")
        num_neg = min(len(important_tokens) - 1, MAX_NUM_NEG)
        for cur_index, cur_ele in tqdm(enumerate(important_tokens), total=len(important_tokens)):
            np.random.shuffle(cur_ele)
            cut_index = int(len(cur_ele) / 3)
            lhs, rhs = cur_ele[:cut_index], cur_ele[cut_index:]
            for word_index, word in enumerate(lhs):
                if word in sample_set:
                    continue
                positive_entity = rhs[word_index]  # positive sample
                # negative sampling
                negative_entitys, negs = [], []
                negative_indexes = [
                    i for i in range(len(important_tokens)) if i != cur_index
                ]
                random.shuffle(negative_indexes)
                for e in negative_indexes:
                    if len(negs) >= num_neg:
                        break
                    if word in important_tokens[e] or positive_entity in important_tokens[e]:
                        continue
                    negs.append(e)
                for neg_index in negs:
                    while True:
                        neg_tmp = random.sample(important_tokens[neg_index], 1)[0]
                        if neg_tmp != word and neg_tmp not in negative_entitys:
                            break
                    negative_entitys.append(neg_tmp)
                assert len(negative_entitys) == num_neg
                # pad when fewer negatives were sampled than required
                #if len(negative_entitys) < num_neg:
                #    negative_entitys += ["PAD"] * (num_neg - len(negative_entitys))
                sample_set[word] = [positive_entity, negative_entitys]
        # build the vocabulary
        token_freq = defaultdict(int)
        token_freq["UNKNOWN"] = 1e8
        #token_freq["PAD"] = 1e8 - 1
        for k, (p, n) in sample_set.items():
            tmp = [k, p] + n
            for t in tmp:
                if re_en.fullmatch(t):
                    token_freq[t] += 1
                else:
                    for e in list(t):
                        token_freq[e] += 1
        sorted_token_freq = sorted(token_freq.items(), key=lambda d: d[1], reverse=True)[:VOCAB_SIZE]
        word2id = {w: i for i, (w, f) in enumerate(sorted_token_freq)}
        if conf.over_write_vocab:
            print("generate word2id file: %s" % (conf.vocab))
            json.dump(word2id, open(conf.vocab, "w", encoding="utf8"),
                      ensure_ascii=False, indent=2)
        _keys_ = list(sample_set.keys())
        train_set = {
            k: sample_set[k]
            for k in _keys_[:int(len(_keys_) * conf.train_valid_ratio)]
        }
        valid_set = {
            k: sample_set[k]
            for k in _keys_[int(len(_keys_) * conf.train_valid_ratio):]
        }
        print("total_sample: %d\ttrain_sample: %d\tvalid_sample :%d"
              % (len(sample_set), len(train_set), len(valid_set)))
        print("generate train sample file :%s\tvalid sample file: %s"
              % (conf.train_samples, conf.valid_samples))
        json.dump(train_set, open(conf.train_samples, "w", encoding="utf8"),
                  ensure_ascii=False, indent=2)
        json.dump(valid_set, open(conf.valid_samples, "w", encoding="utf8"),
                  ensure_ascii=False, indent=2)

    def gen_vocab(self, title2entitys):
        token_freq = defaultdict(int)
        token_freq["UNKNOWN"] = 1e8
        for title, entitys in title2entitys.items():
            line = [title] + entitys
            for t in line:
                if re_en.fullmatch(t):
                    token_freq[t] += 1
                else:
                    for e in list(t):
                        token_freq[e] += 1
        sorted_token_freq = sorted(token_freq.items(), key=lambda d: d[1], reverse=True)[:VOCAB_SIZE]
        word2id = {w: i for i, (w, f) in enumerate(sorted_token_freq)}
        print("generate word2id file: %s" % (conf.vocab))
        json.dump(word2id, open(conf.vocab, "w", encoding="utf8"),
                  ensure_ascii=False, indent=2)

    def gen_train_sample_based_title_desc(self):
        entity_dicts = {
            line.strip(): 1
            for line in open(conf.new_entity_file, encoding="utf8").readlines()
        }
        valid_titles = {
            line.strip(): 1
            for line in open("data/valid_titles", encoding="utf8").readlines()
        }
        title_entitys, entity_title, sample_set = {}, {}, []
        matchObj = re.compile(r'(.+)&([0-9]+)', re.M | re.I)
        title2entitys = {
            line.strip().lower().split('\t')[0]: line.strip().lower().split('\t')[1:]
            for line in open("data/cv_title2entitys_corpu", encoding="utf8").readlines()
        }
        title_entitys = {
            k: v for k, v in title2entitys.items() if len(v) >= 10 and len(v) < 20
        }
        if conf.over_write_vocab:
            self.gen_vocab(title_entitys)
        _keys_ = list(title_entitys.keys())
        print("sample(1+k negative) train and valid set...")
        num_neg = min(len(title_entitys) - 1, MAX_NUM_NEG)
        # sampling
        for title, entitys in tqdm(title_entitys.items(), total=len(title_entitys)):
            positive_entitys = random.sample(entitys, min(len(entitys), 10))  # positive samples
            negative_titles_candidate = [e for e in _keys_ if e != title]
            for pos_entity in positive_entitys:
                # negative samples
                negative_entitys = []
                negs = random.sample(negative_titles_candidate, num_neg)
                for neg_tit in negs:
                    try:
                        negative_entitys.append(random.sample(title_entitys[neg_tit], 1)[0])
                    except:
                        pass
                if len(negative_entitys) < num_neg:
                    negative_entitys += [negative_entitys[0]] * (num_neg - len(negative_entitys))
                assert len(negative_entitys) == num_neg
                sample_set.append([title, pos_entity, list(negative_entitys)])
        #exit()
        train_set = {
            i: ele for i, ele in enumerate(
                sample_set[:int(len(sample_set) * conf.train_valid_ratio)])
        }
        valid_set = {
            i: ele for i, ele in enumerate(
                sample_set[int(len(sample_set) * conf.train_valid_ratio):])
        }
        print("total_sample: %d\ttrain_sample: %d\tvalid_sample :%d"
              % (len(sample_set), len(train_set), len(valid_set)))
        print("generate train sample file :%s\tvalid sample file: %s"
              % (FLAGS.train_samples, FLAGS.valid_samples))
        json.dump(train_set, open(FLAGS.train_samples, "w", encoding="utf8"),
                  ensure_ascii=False, indent=2)
        json.dump(valid_set, open(FLAGS.valid_samples, "w", encoding="utf8"),
                  ensure_ascii=False, indent=2)
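
# Typical entry point sketch: which generator to run depends on whether the
# title/entity corpus (data/cv_title2entitys_corpu) has been prepared.
if __name__ == "__main__":
    data = TrainData()
    data.gen_train_samples()                    # document-level positive/negative sampling
    # data.gen_train_sample_based_title_desc() # alternative: title -> entity sampling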