Example #1
class PhraseCleaner:
    def __init__(self):
        # Load the project's tokenizer once and reuse it for all phrases.
        self.tokenizer = Tokenizer()
        self.tokenizer.load()

    def process(self, phrase):
        # Lowercase the phrase and rejoin its tokens with single spaces.
        return u' '.join(self.tokenizer.tokenize(phrase.lower()))
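A minimal usage sketch (the sample input and the exact token split are only illustrative; Tokenizer is the project's own class with the load()/tokenize() interface shown above):

cleaner = PhraseCleaner()
normalized = cleaner.process(u'Мама мыла раму')
print(normalized)  # lowercase, space-separated tokens, e.g. u'мама мыла раму'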
Example #2
facts_path = args.facts
max_nb_facts = args.max_nb_facts
metric = args.metric

if metric == 'jaccard':
    shingle_len = args.shingle_len

if metric in ['wmd', 'w2v']:
    word2vector_path = args.word2vector
    print('Loading the w2v model "{}"'.format(word2vector_path))
    w2v = gensim.models.KeyedVectors.load_word2vec_format(word2vector_path, binary=True)
    #w2v_dims = len(w2v.syn0[0])


tokenizer = Tokenizer()
tokenizer.load()

# Read up to max_nb_facts non-empty phrases from the facts file.
phrases1 = []
with io.open(facts_path, 'r', encoding='utf-8') as rdr:
    for phrase in rdr:
        words = tokenizer.tokenize(phrase.strip())
        if len(words) > 0:
            if metric == 'w2v':
                phrases1.append((phrase.strip(), words, get_average_vector(words, w2v)))
            else:
                phrases1.append((phrase.strip(), words))
            if len(phrases1) >= max_nb_facts:
                break

nb_phrases = len(phrases1)
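The helper get_average_vector is not shown in this excerpt; a minimal sketch of what it presumably does, assuming a gensim KeyedVectors model and mean pooling over in-vocabulary words (both assumptions):

import numpy as np

def get_average_vector(words, w2v):
    # Average the word2vec vectors of the known words; zero vector if none are known.
    vectors = [w2v[w] for w in words if w in w2v]
    if not vectors:
        return np.zeros(w2v.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)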
Example #3
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.tokenizer.load()
Example #4
        # positive examples selected by pos_sample_indexes
        str_pairs1 = [self.str_pairs[i] for i in pos_sample_indexes]
        relevancy1 = [self.relevancy[i] for i in pos_sample_indexes]
        weights1 = [self.weights[i] for i in pos_sample_indexes]

        # negative examples remaining after truncation
        str_pairs0 = [self.str_pairs[i] for i in neg_sample_indexes]
        relevancy0 = [self.relevancy[i] for i in neg_sample_indexes]
        weights0 = [self.weights[i] for i in neg_sample_indexes]

        self.str_pairs = list(itertools.chain(str_pairs1, str_pairs0))
        self.relevancy = list(itertools.chain(relevancy1, relevancy0))
        self.weights = list(itertools.chain(weights1, weights0))


if __name__ == '__main__':
    tokenizer = Tokenizer()
    tokenizer.load()
    random_questions = CorpusSearcher()
    random_facts = CorpusSearcher()

    # read the list of random questions from a pre-built file
    # (see the C# code at https://github.com/Koziev/chatbot/tree/master/CSharpCode/ExtractFactsFromParsing
    # and its output at https://github.com/Koziev/NLP_Datasets/blob/master/Samples/questions4.txt)
    print('Loading random questions and facts...')
    with codecs.open(questions_path, 'r', 'utf-8') as rdr:
        for line in rdr:
            if len(line) < 40:
                question = line.strip()
                question = ru_sanitize(u' '.join(
                    tokenizer.tokenize(question.lower())))
                random_questions.add_phrase(normalize_qline(question))
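The index lists pos_sample_indexes and neg_sample_indexes come from code outside this excerpt; a minimal sketch of how such a rebalancing step might look, assuming relevancy holds 0/1 labels, all positives are kept, and negatives are randomly subsampled (the helper name and the ratio are assumptions):

import random

def make_balanced_indexes(relevancy, neg_to_pos_ratio=1.0):
    # Keep every positive example and a random subset of the negatives.
    pos_sample_indexes = [i for i, y in enumerate(relevancy) if y == 1]
    neg_indexes = [i for i, y in enumerate(relevancy) if y == 0]
    nb_neg = min(len(neg_indexes), int(len(pos_sample_indexes) * neg_to_pos_ratio))
    neg_sample_indexes = random.sample(neg_indexes, nb_neg)
    return pos_sample_indexes, neg_sample_indexes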
Example #5
max_seq_len = 100  # max. sentence length in sentencepiece units, i.e. roughly 3x the number of words
nb_epochs = 200
spm_items = 8000  # cap the sentencepiece model vocabulary at this many items during training

tmp_folder = '../tmp'
corpus_path = '/media/inkoziev/corpora/Corpus/Raw/ru/text_blocks.txt'


def split_str(s):
    # Segment a string into sentencepiece pieces.
    #return tokenizer.tokenize(phrase1)
    return sp.EncodeAsPieces(s)
    #return list(itertools.chain(*(word2pieces(word) for word in s.split())))


segmenter = Segmenter()
tokenizer = Tokenizer()
tokenizer.load()

# --------------- SENTENCEPIECE ----------------------

# Prepare the corpus for training SentencePiece
sentencepiece_corpus = os.path.join(tmp_folder, 'sentencepiece_corpus.txt')

nb_from_corpus = 0
max_nb_samples = 10000000  # max. number of sentences used to train SentencePiece
with io.open(sentencepiece_corpus, 'w', encoding='utf-8') as wrt:
    for file in glob.glob(os.path.join('../data', '*.csv')):
        print(u'Loading samples from {}...'.format(file))
        df = pd.read_csv(file,
                         encoding='utf-8',
                         delimiter='\t',
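The excerpt breaks off while loading the CSV samples; once sentencepiece_corpus.txt has been written, the sp object used by split_str is presumably trained and loaded along these lines (the model prefix and model_type are assumptions; vocab_size reuses spm_items from above):

import sentencepiece as spm

spm_model_prefix = os.path.join(tmp_folder, 'sentencepiece_model')  # hypothetical prefix
spm.SentencePieceTrainer.Train(
    '--input={} --model_prefix={} --vocab_size={} --model_type=bpe'.format(
        sentencepiece_corpus, spm_model_prefix, spm_items))

sp = spm.SentencePieceProcessor()
sp.Load(spm_model_prefix + '.model')
print(split_str(u'пример предложения'))  # list of sentencepiece pieces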