def read_urls(path, vocab, is_train, repr="3gram", ngram_size=3):
    urls = []
    max_len = 0
    if os.path.exists(path):
        with codecs.open(path, "r", "UTF-8") as f:
            for i, line in enumerate(f):
                line = line.strip()
                if len(line) == 0:
                    line = DEFAULT_URL
                # strip the leading "http://" (7 chars) and anything from '?' onwards
                marker_index = line.find('?')
                url = line[7:marker_index]
                if sys.version_info[0] < 3:
                    q_tokens = split_sent(normalize_unicode(unicode(url)), repr, ngram_size)
                else:
                    q_tokens = split_sent(normalize_unicode(str(url)), repr, ngram_size)
                token_ids = []
                if len(q_tokens) > max_len:
                    max_len = len(q_tokens)
                for token in q_tokens:
                    # grow the vocabulary only on training data; unseen test tokens map to OOV_WORD_INDEX
                    if token not in vocab['url']:
                        if is_train:
                            vocab['url'][token] = len(vocab['url'])
                    if token in vocab['url']:
                        token_ids.append(vocab['url'][token])
                    else:
                        token_ids.append(OOV_WORD_INDEX)
                urls.append(token_ids)
    return urls, max_len
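# Hedged usage sketch for read_urls, assuming the surrounding module defines
# DEFAULT_URL, OOV_WORD_INDEX, split_sent and normalize_unicode. The file paths
# and the vocab layout below are illustrative assumptions, not values from the repo.
vocab = {'url': {}}  # token -> id map, grown only when is_train=True
train_urls, train_max_len = read_urls("data/train_urls.txt", vocab, is_train=True)
test_urls, test_max_len = read_urls("data/test_urls.txt", vocab, is_train=False)
print(len(vocab['url']), train_max_len, test_max_len)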
def read_sentences(path, vocab, is_train, repr="word", ngram_size=3, test_vocab=None):
    questions = []
    max_len = 0
    with codecs.open(path, "r", "UTF-8") as f:
        for i, line in enumerate(f):
            q_tokens = split_sent(normalize_unicode(line.strip()), repr, ngram_size)
            token_ids = []
            if len(q_tokens) > max_len:
                max_len = len(q_tokens)
            for token in q_tokens:
                if token not in vocab[repr]:
                    if is_train:
                        vocab[repr][token] = len(vocab[repr])
                    # unseen test-time word tokens get ids appended after the training vocabulary
                    elif repr == "word" and token not in test_vocab[repr]:
                        test_vocab[repr][token] = len(vocab[repr]) + len(test_vocab[repr])
                if token in vocab[repr]:
                    token_ids.append(vocab[repr][token])
                elif repr == "word":
                    token_ids.append(test_vocab[repr][token])
                else:
                    token_ids.append(OOV_WORD_INDEX)
            questions.append(token_ids)
    return questions, max_len
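# Hedged usage sketch for read_sentences; the paths and the vocab/test_vocab
# dictionaries keyed by representation ("word") are assumptions for illustration.
vocab = {'word': {}}
test_vocab = {'word': {}}
train_qs, train_len = read_sentences("data/train_questions.txt", vocab, is_train=True)
test_qs, test_len = read_sentences("data/test_questions.txt", vocab, is_train=False,
                                   test_vocab=test_vocab)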
def generate_idf(base_dir):
    # accumulate raw unigram/bigram/trigram collection frequencies over all datasets/files
    freq_dict = {"unigram": defaultdict(int),
                 "bigram": defaultdict(int),
                 "trigram": defaultdict(int)}
    total_word_freq, total_bigram_freq, total_ngram_freq = 0, 0, 0
    for dataset in datasets:
        for file in files:
            path = '%s/%s/%s' % (base_dir, dataset, file)
            if os.path.exists(path):
                with codecs.open(path, 'r', 'UTF-8') as f:
                    for i, line in enumerate(f):
                        tokens = split_sent(normalize_unicode(line.strip()), "word")
                        for j in range(len(tokens)):
                            freq_dict["unigram"][tokens[j].lower()] += 1
                            #freq_dict["word"][ps.stem(tokens[j])] += 1
                            if j >= 1:
                                bigram = " ".join([token.lower() for token in tokens[j - 1:j + 1]])
                                #bigram = " ".join([ps.stem(token) for token in tokens[j-1:j+1]])
                                freq_dict["bigram"][bigram] += 1
                            if j >= 2:
                                trigram = " ".join([token.lower() for token in tokens[j - 2:j + 1]])
                                #trigram = " ".join([ps.stem(token) for token in tokens[j-2:j+1]])
                                freq_dict["trigram"][trigram] += 1
                        total_word_freq += len(tokens)
    with open("%s/collection_raw_idf.json" % base_dir, "w") as out_f:
        json.dump(freq_dict, out_f)
    return freq_dict, total_word_freq
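# generate_idf only stores raw collection frequencies (and relies on module-level
# `datasets` and `files`). A minimal sketch of turning the unigram counts into
# IDF-style weights; the log formula here is an assumption, not taken from the repo.
import math

freq_dict, total_word_freq = generate_idf("data")
idf = {term: math.log(total_word_freq / (1.0 + count))
       for term, count in freq_dict["unigram"].items()}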
def fill_example_queue(self):
    input_gen = text_generator(
        utils.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            (op, evd, kp, arg) = next(input_gen)
        except StopIteration:
            tf.logging.info("The example generator for this example "
                            "queue filling thread has exhausted data.")
            if self._single_pass:
                tf.logging.info(
                    "Finished reading dataset. This thread is stopping!!")
                self._finished_reading = True
                break
            else:
                raise Exception(
                    "The example generator is out of data; error.")
        kp_sents = [sent.strip() for sent in utils.split_sent(kp, "kp")]
        arg_sents = [sent.strip() for sent in utils.split_sent(arg, "arg")]
        example = DataSample(op, evd, kp_sents, arg_sents,
                             self._src_vocab, self._tgt_vocab, self._hps)
        self._example_queue.put(example)
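# Hedged sketch of how a filling thread for this queue might be started and
# drained; `batcher` is a hypothetical instance of the class that owns
# fill_example_queue, and the Thread wiring below is an illustrative assumption.
from threading import Thread

filler = Thread(target=batcher.fill_example_queue)
filler.daemon = True
filler.start()
example = batcher._example_queue.get()  # blocks until a DataSample is available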
def predicting(fname, weight_file):
    print('loading models')
    model = model_build(MODEL_NAME, model_cfg, embedding_matrix)
    model.load_weights(weight_file)
    df_predict = pd.read_excel(fname)
    # keep only rows with a non-empty 'segment' column
    df_predict = df_predict[pd.notnull(df_predict['segment'])]
    p_x = df_predict['segment'].tolist()
    if MODEL_NAME in ['HAN', 'MHAN']:
        p_x = [split_sent(sent, MAX_WORDS, MAX_SENTS, CUT_MODE) for sent in p_x]
    p_x = get_sequences(tokenizer, p_x, TEXT_FORMAT, MAX_WORDS)
    print('data shape', p_x.shape)
    print('making predictions...')
    predicted = model.predict(p_x)
    # threshold the positive-class probability at 0.9
    df_predict['label_90'] = [1 if p[0] > 0.9 else 0 for p in predicted]
    #df_predict[df_predict['label_90'] == 1]['cleaned_text'][0:10]
    df_predict.to_excel('./data/taobao_0_30000_predict.xlsx', index=None)
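# Hedged usage sketch; both file names are illustrative assumptions, and the
# globals predicting() relies on (model_cfg, embedding_matrix, tokenizer,
# TEXT_FORMAT, MAX_WORDS, ...) must already be set up as in the surrounding script.
predicting('./data/taobao_0_30000.xlsx', './model/HAN_weights_201901010000.hdf5')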
x_w, x_c, y = get_x_y(DATASET)

tokenizer_w = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
tokenizer_c = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
tokenizer_w.fit_on_texts(x_w)
tokenizer_c.fit_on_texts(x_c)
vocab_w = tokenizer_w.word_index
vocab_w['UNK'] = 0
vocab_c = tokenizer_c.word_index
vocab_c['UNK'] = 0

DATE = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
m_name = './model/' + MODEL_NAME + '_' + DATE + '.yaml'
weights_name = './model/' + MODEL_NAME + '_weights_' + DATE + '.hdf5'

MAX_WORDS = [20, 30]
MAX_SENTS = [6, 6]
x_w = [split_sent(sent, MAX_WORDS[0], MAX_SENTS[0], CUT_MODE) for sent in x_w]
x_c = [split_sent(sent, MAX_WORDS[1], MAX_SENTS[1], CUT_MODE) for sent in x_c]

embed_mat_w = load_embeddings(EMBED_FILE_word, vocab_w, EMBED_DIMS)
embed_mat_c = load_embeddings(EMBED_FILE_char, vocab_c, EMBED_DIMS)

result = train(CV, [x_w, x_c], y, [tokenizer_w, tokenizer_c], DATE)
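# Hedged sketch of how the word and char views might be padded for the model,
# reusing the get_sequences helper seen in predicting(); the 'seq' text format
# and the call shape are assumptions carried over from the other snippets.
seq_w = get_sequences(tokenizer_w, x_w, 'seq', MAX_WORDS[0])
seq_c = get_sequences(tokenizer_c, x_c, 'seq', MAX_WORDS[1])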
DATE = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
model_file = './model/' + MODEL_NAME + '_' + DATE + '.h5'
weight_file = './model/' + MODEL_NAME + '_weights_' + DATE + '.hdf5'

# Text input format required by the HAN family of models
if MODEL_NAME in ['HAN', 'MHAN']:
    print('prepare inputs for HAN series model...')
    if EMBED_TYPE in ('word', 'scratch'):
        MAX_WORDS = 20
        MAX_SENTS = 5
    elif EMBED_TYPE == 'char':
        MAX_WORDS = 30
        MAX_SENTS = 6
    N_LIMIT = MAX_WORDS * MAX_SENTS
    sents = [split_sent(sent, MAX_WORDS, MAX_SENTS, CUT_MODE) for sent in sents]
    TEXT_FORMAT = 'seq'
    new_name = MODEL_NAME + '_' + str(MAX_WORDS) + '_' + str(MAX_SENTS)
    model_file = './model/' + new_name + '_' + DATE + '.h5'
    weight_file = './model/' + new_name + '_weights_' + DATE + '.hdf5'

# Initialize configuration settings
model_cfg = ModelConfig(MAX_WORDS, MAX_SENTS, EMBED_DIMS, len(vocab) + 1,
                        MODEL_NAME, ntags=2)
train_cfg = TrainingConfig(ntags=2, model_name=MODEL_NAME)
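# Hedged follow-up sketch: building and fitting the configured model with the
# model_build helper used in predicting(); x_train, y_train and the fit
# arguments are illustrative assumptions rather than values from this repo.
model = model_build(MODEL_NAME, model_cfg, embedding_matrix)
model.fit(x_train, y_train, epochs=10, batch_size=64)
model.save_weights(weight_file)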
# Paths for saving the model and its weights
DATE = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
m_name = './model/' + MODEL_NAME + '_' + DATE + '.yaml'
weights_name = './model/' + MODEL_NAME + '_weights_' + DATE + '.hdf5'

# Text input format required by HAN-style models
if MODEL_NAME in ['HAN', 'HMAN']:
    print('prepare inputs for HAN series model...')
    if EMBED_TYPE in ('word', 'scratch'):
        MAX_WORDS = 20
        MAX_SENTS = 5
    elif EMBED_TYPE == 'char':
        MAX_WORDS = 30
        MAX_SENTS = 6
    N_LIMIT = MAX_WORDS * MAX_SENTS
    x = [split_sent(sent, MAX_WORDS, MAX_SENTS, CUT_MODE) for sent in x]
    if PREDICT:
        p_x = [split_sent(sent, MAX_WORDS, MAX_SENTS, CUT_MODE) for sent in predict_text]
    TEXT_FORMAT = 'seq'
    new_name = MODEL_NAME + '_' + str(MAX_WORDS) + '_' + str(MAX_SENTS)
    m_name = './model/' + new_name + '_' + DATE + '.yaml'
    weights_name = './model/' + new_name + '_weights_' + DATE + '.hdf5'

# Load the pre-trained word embedding matrix
if PRE_TRAINED and MODEL_NAME != 'one-hot':
    print('loading word embeddings...')
    embedding_matrix = load_embeddings(EMBED_FILE, vocab, EMBED_DIMS)
else:
    embedding_matrix = None
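# Hedged sketch of reloading a model saved under m_name/weights_name, assuming
# an older Keras release where model_from_yaml is still available; custom layers
# (e.g. attention) would need custom_objects, omitted here for brevity.
from keras.models import model_from_yaml

with open(m_name) as f:
    restored = model_from_yaml(f.read())
restored.load_weights(weights_name)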