class PhraseCleaner:
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.tokenizer.load()

    def process(self, phrase):
        return u' '.join(self.tokenizer.tokenize(phrase.lower()))
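# A minimal usage sketch (assumption: Tokenizer is the same Russian tokenizer class
# loaded elsewhere in this repo, which splits punctuation off into separate tokens):
cleaner = PhraseCleaner()
normalized = cleaner.process(u'Кошки ловят мышей.')
# `normalized` should look roughly like u'кошки ловят мышей .'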
facts_path = args.facts
max_nb_facts = args.max_nb_facts
metric = args.metric

if metric == 'jaccard':
    shingle_len = args.shingle_len

if metric in ['wmd', 'w2v']:
    word2vector_path = args.word2vector
    print('Loading the w2v model "{}"'.format(word2vector_path))
    w2v = gensim.models.KeyedVectors.load_word2vec_format(word2vector_path, binary=True)
    #w2v_dims = len(w2v.syn0[0])

tokenizer = Tokenizer()
tokenizer.load()

phrases1 = []
with io.open(facts_path, 'r', encoding='utf-8') as rdr:
    for phrase in rdr:
        words = tokenizer.tokenize(phrase.strip())
        if len(words) > 0:
            if metric == 'w2v':
                phrases1.append((phrase.strip(), words, get_average_vector(words, w2v)))
            else:
                phrases1.append((phrase.strip(), words))
            if len(phrases1) >= max_nb_facts:
                break

nb_phrases = len(phrases1)
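# get_average_vector() is referenced above but not defined in this excerpt. A plausible
# sketch of such a helper (an assumption, not the repository's actual implementation) is
# mean-pooling of the word2vec vectors of the in-vocabulary tokens:
import numpy as np

def get_average_vector(words, w2v):
    # keep only the words present in the word2vec vocabulary
    vectors = [w2v[word] for word in words if word in w2v]
    if not vectors:
        # no known words => return a zero vector of the model's dimensionality
        return np.zeros(w2v.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)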
def __init__(self):
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
str_pairs1 = [self.str_pairs[i] for i in pos_sample_indexes]
relevancy1 = [self.relevancy[i] for i in pos_sample_indexes]
weights1 = [self.weights[i] for i in pos_sample_indexes]

# negative samples remaining after the truncation
str_pairs0 = [self.str_pairs[i] for i in neg_sample_indexes]
relevancy0 = [self.relevancy[i] for i in neg_sample_indexes]
weights0 = [self.weights[i] for i in neg_sample_indexes]

self.str_pairs = list(itertools.chain(str_pairs1, str_pairs0))
self.relevancy = list(itertools.chain(relevancy1, relevancy0))
self.weights = list(itertools.chain(weights1, weights0))


if __name__ == '__main__':
    tokenizer = Tokenizer()
    tokenizer.load()

    random_questions = CorpusSearcher()
    random_facts = CorpusSearcher()

    # Read the list of random questions from a pre-built file
    # (see the C# code at https://github.com/Koziev/chatbot/tree/master/CSharpCode/ExtractFactsFromParsing
    # and its output at https://github.com/Koziev/NLP_Datasets/blob/master/Samples/questions4.txt)
    print('Loading random questions and facts...')
    with codecs.open(questions_path, 'r', 'utf-8') as rdr:
        for line in rdr:
            if len(line) < 40:
                question = line.strip()
                question = ru_sanitize(u' '.join(tokenizer.tokenize(question.lower())))
                random_questions.add_phrase(normalize_qline(question))
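# The three parallel lists above (str_pairs, relevancy, weights) are always filtered with
# the same index lists so that each sample keeps its label and weight. The same pattern as
# a small helper (a sketch for illustration only, not part of the original code):
def take_by_indexes(items, indexes):
    # rebuild the list keeping only the elements at the given positions, in order
    return [items[i] for i in indexes]

# e.g. the concatenation above is equivalent to:
# self.str_pairs = take_by_indexes(self.str_pairs, pos_sample_indexes) + \
#                  take_by_indexes(self.str_pairs, neg_sample_indexes)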
max_seq_len = 100  # max sentence length in sentencepiece units, i.e. roughly 3 times the number of words
nb_epochs = 200
spm_items = 8000  # when training sentencepiece, the model vocabulary is limited to this many items

tmp_folder = '../tmp'
corpus_path = '/media/inkoziev/corpora/Corpus/Raw/ru/text_blocks.txt'


def split_str(s):
    #return tokenizer.tokenize(phrase1)
    return sp.EncodeAsPieces(s)
    #return list(itertools.chain(*(word2pieces(word) for word in s.split())))


segmenter = Segmenter()

tokenizer = Tokenizer()
tokenizer.load()

# --------------- SENTENCEPIECE ----------------------

# Prepare the corpus for training SentencePiece
sentencepiece_corpus = os.path.join(tmp_folder, 'sentencepiece_corpus.txt')
nb_from_corpus = 0
max_nb_samples = 10000000  # max number of sentences used to train SentencePiece
with io.open(sentencepiece_corpus, 'w', encoding='utf-8') as wrt:
    for file in glob.glob(os.path.join('../data', '*.csv')):
        print(u'Loading samples from {}...'.format(file))
        df = pd.read_csv(file, encoding='utf-8', delimiter='\t',
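# Not shown in this excerpt: how the `sp` processor used by split_str() is created. A
# plausible continuation (an assumption based on the standard sentencepiece API, not the
# original code) trains a model on sentencepiece_corpus with a vocabulary of spm_items
# pieces and then loads it:
import sentencepiece as spm

spm.SentencePieceTrainer.Train(
    '--input={} --model_prefix={} --vocab_size={}'.format(
        sentencepiece_corpus, os.path.join(tmp_folder, 'spm_model'), spm_items))

sp = spm.SentencePieceProcessor()
sp.Load(os.path.join(tmp_folder, 'spm_model.model'))
# after this, sp.EncodeAsPieces() returns the subword pieces consumed by split_str()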