def prepare(path_univ_dir):
    """Normalise and de-duplicate every file in ``path_univ_dir`` in place.

    Each entry returned by ``os.listdir`` is opened as a text file. Every
    line is stripped, has matches of ``stop_word_re`` removed, has matches of
    each pattern in ``word_type_re`` replaced by that pattern's key, and is
    then passed through ``word_replace`` with ``homo_dict`` followed by
    ``syno_dict``. Only the first occurrence of each resulting text is kept
    (original order preserved), and the file is overwritten with the kept
    texts, one per line.

    Args:
        path_univ_dir: Directory whose entries are rewritten.
    """
    for file_name in os.listdir(path_univ_dir):
        file_path = os.path.join(path_univ_dir, file_name)
        seen = set()
        unique_texts = []
        with open(file_path, 'r') as f:
            for line in f:
                cleaned = re.sub(stop_word_re, '', line.strip())
                for word_type, word_re in word_type_re.items():
                    cleaned = re.sub(word_re, word_type, cleaned)
                cleaned = word_replace(cleaned, homo_dict)
                cleaned = word_replace(cleaned, syno_dict)
                # De-duplicate on the cleaned form, keeping the first hit.
                if cleaned in seen:
                    continue
                seen.add(cleaned)
                unique_texts.append(cleaned)
        # The read handle is closed before the same path is truncated.
        with open(file_path, 'w') as f:
            f.writelines(cleaned + '\n' for cleaned in unique_texts)
def predict(text, name):
    """Predict a label for ``text`` by nearest cached encoding.

    The text is stripped, has matches of ``stop_word_re`` removed, has
    matches of each pattern in ``word_type_re`` replaced by that pattern's
    key, and is passed through ``word_replace`` with ``homo_dict`` and then
    ``syno_dict``. It is then indexed with ``word2ind``, padded to
    ``seq_len`` and encoded with the model selected by ``name``. Euclidean
    distances between that encoding and every row of the cache selected by
    ``name`` are computed, and the three smallest are kept.

    Args:
        text: Raw input sentence.
        name: Key passed to ``map_item`` to select both the cached encodings
            and the model.

    Returns:
        When the module runs as a script (``__name__ == '__main__'``), a
        string of up to three ``"<label> <distance> <text>"`` entries joined
        by ``', '``, nearest first. Otherwise, the label of the single
        nearest cached row.
    """
    text = re.sub(stop_word_re, '', text.strip())
    for word_type, word_re in word_type_re.items():
        text = re.sub(word_re, word_type, text)
    text = word_replace(text, homo_dict)
    text = word_replace(text, syno_dict)

    cache_sents = map_item(name, caches)
    seq = word2ind.texts_to_sequences([text])[0]
    pad_seq = pad_sequences([seq], maxlen=seq_len)
    model = map_item(name, models)
    encode_seq = model.predict([pad_seq])

    # Broadcasting compares the query encoding with every cached row without
    # materialising a repeated copy of it.
    # NOTE(review): assumes encode_seq has a single row, i.e. shape
    # (1, dim) - confirm against the model's output.
    diff = np.asarray(encode_seq) - cache_sents
    dists = np.sqrt(np.sum(np.square(diff), axis=1))

    # Sort once, and read the distances through the same indices used for the
    # labels so the reported distance always belongs to the reported label.
    min_inds = np.argsort(dists)[:3]
    min_dists = dists[min_inds]
    min_preds = [labels[ind] for ind in min_inds]

    if __name__ == '__main__':
        min_texts = [texts[ind] for ind in min_inds]
        formats = [
            '{} {:.3f} {}'.format(pred, dist, near_text)
            for pred, dist, near_text in zip(min_preds, min_dists, min_texts)
        ]
        return ', '.join(formats)
    return min_preds[0]
def clean(text):
    """Normalise ``text`` for downstream matching.

    Removes matches of ``stop_word_re``, replaces matches of each pattern in
    ``word_type_re`` with that pattern's key, then applies ``word_replace``
    with ``homo_dict`` followed by ``syno_dict``. The input is not stripped.

    Args:
        text: Raw input string.

    Returns:
        The value returned by the final ``word_replace`` call.
    """
    cleaned = re.sub(stop_word_re, '', text)
    for placeholder, pattern in word_type_re.items():
        cleaned = re.sub(pattern, placeholder, cleaned)
    # Inner call uses homo_dict, outer call uses syno_dict.
    return word_replace(word_replace(cleaned, homo_dict), syno_dict)
def clean(text):
    """Normalise ``text`` without word-type substitution.

    Strips surrounding whitespace, removes matches of ``stop_word_re``, then
    applies ``word_replace`` with ``homo_dict`` followed by ``syno_dict``.

    Args:
        text: Raw input string.

    Returns:
        The value returned by the final ``word_replace`` call.
    """
    stripped = text.strip()
    without_stop_words = re.sub(stop_word_re, '', stripped)
    after_homo = word_replace(without_stop_words, homo_dict)
    after_syno = word_replace(after_homo, syno_dict)
    return after_syno