Ejemplo n.º 1
0
def test_file_as_list(tmp_path):
    """file_as_list on a csv file yields one stripped string per line."""
    csv_path = tmp_path / 'test.csv'
    csv_path.write_text(CONTENT)
    result = file_as_list(csv_path, local=False)
    expected = ['category1;answer1', 'category2;answer2', 'category3;answer3']
    assert result == expected
Ejemplo n.º 2
0
def training(processors, wiki_extracts, questions, name):
    """Train a word2vec model from wiki extracts and a question file.

    :param processors: iterable of callables; each takes an iterable of
        texts and returns a processed iterable (applied to both streams)
    :param wiki_extracts: folder holding the wiki extract files
    :param questions: path to the question file (one question per line)
    :param name: tag embedded in the saved model/vector file names
    """
    start = watch.time()

    log_info(f'collecting wiki_extracts from folder {wiki_extracts}')

    concepts = get_concepts_from_folder(wiki_extracts)
    concepts_sentences = get_sentences_from_concepts(concepts)
    # Lazily strip line endings; the processors below consume iterables.
    questions = (line.rstrip('\r\n')
                 for line in file_as_list(questions, local=False))

    for processor in processors:
        questions = processor(questions)
        concepts_sentences = processor(concepts_sentences)

    # Materialize so the counts can be logged before further use.
    concepts_sentences = list(concepts_sentences)
    questions = list(questions)

    log_info(f'found {len(concepts_sentences)} sentences')
    log_info(f'collected {len(questions)} questions')

    sentences = get_words_from_sentences(concepts_sentences)
    questions = get_words_from_sentences(questions)

    # Fixed F541: the message has no placeholders, so no f-prefix.
    log_info('creating language model')
    model, vectors = create_w2v_model(sentences, questions)

    save_model(model, f'w2v_{name}100_model.w2v')
    save_pre_computed_vectors(vectors, f'w2v_{name}100_vectors.pickle')

    log_info(f'training completed in {watch.time() - start}s\n')
Ejemplo n.º 3
0
def remove_stopwords_stream(doc, stopwords='res/custom_ch_stopwords.txt'):
    """Lazily remove stop words (defined in a file) from a stream of lines.

    :param doc: iterable of lines read from stdin
    :param stopwords: path to the stop-word list file
    :return: iterator over the lines with stop words removed
    """
    words = file_as_list(stopwords)
    non_blank = (line for line in doc if line != '\n')
    return (remove(line, words) for line in non_blank)
Ejemplo n.º 4
0
def remove_stopwords(doc, stopwords='res/custom_ch_stopwords.txt'):
    """Remove stop words (defined in a file) from every entry of a list.

    :param doc: list of texts
    :param stopwords: path to the stop-word list file
    :return: new list with empty entries dropped and stop words removed
    """
    words = file_as_list(stopwords)
    non_empty = [entry for entry in doc if entry != '']
    return [remove(entry, words) for entry in non_empty]
Ejemplo n.º 5
0
def category_to_answer(map, file):
    """Print every category from *file* that has no entry in the *map* file.

    NOTE(review): the parameters shadow the builtins ``map`` and ``file``;
    renaming them would change the keyword interface, so they are kept.

    :param map: path to a ';'-separated category-to-answer mapping file
    :param file: path to a file listing categories, one per line
    """
    lookup = file_as_dict(map, sep=';', local=False)
    lookup = {key.lower(): value for key, value in lookup.items()}
    categories = file_as_list(file, local=False)

    for category in categories:
        if category not in lookup:
            print(category)
Ejemplo n.º 6
0
def remove_stopwords_json_stream(json_docs,
                                 stopwords='res/custom_ch_stopwords.txt',
                                 text_key='text'):
    '''
    Removes stopwords in stdin defined as json documents.

    :param json_docs: iterable of dict-like json documents
    :param stopwords: path to the stop-word list file
    :param text_key: key of the text property to clean in each document
    '''
    stopwords = file_as_list(stopwords)
    # Hoisted out of the loop: the stdout stream is loop-invariant, so
    # acquiring it once avoids a redundant lookup per document.
    stdout = click.get_text_stream('stdout', 'utf-8')
    for doc in json_docs:
        doc[text_key] = remove(doc[text_key], stopwords)
        stdout.write(dump_json(doc) + '\n')
Ejemplo n.º 7
0
def remove_stopwords_json(json_doc,
                          text_key,
                          stopwords='res/custom_ch_stopwords.txt'):
    '''
    Remove stopwords in a single text property of the json document.
    :param json_doc: The json document
    :param text_key: The key of the text property
    :param stopwords: The filepath to a stop word list
    :return: The json document without stop words
    '''
    words = file_as_list(stopwords)
    cleaned_text = remove(json_doc[text_key], words)
    json_doc[text_key] = cleaned_text
    return json_doc
Ejemplo n.º 8
0
def process_pipeline(processors, wiki_extracts, questions, name):
    """Train three tf-idf models at different pruning thresholds and save them.

    :param processors: iterable of callables; each takes an iterable of
        texts and returns a processed iterable (applied to both streams)
    :param wiki_extracts: folder holding the wiki extract files
    :param questions: path to the question file (one question per line)
    :param name: tag embedded in the saved model/vector file names
    """
    start = watch.time()
    concepts = get_concepts_from_folder(wiki_extracts)
    # Lazily strip line endings; the processors below consume iterables.
    questions = (line.rstrip('\r\n')
                 for line in file_as_list(questions, local=False))
    for processor in processors:
        questions = processor(questions)
        concepts = processor(concepts)
    concepts = list(concepts)
    questions = list(questions)
    # Fixed F541: the message has no placeholders, so no f-prefix.
    log_info('creating language model')
    model_100, vectorizer_100 = create_language_model(concepts, questions, 0)
    # Renamed locals (was model_95/model_90): the names now match both the
    # artifact names they are saved under (75/50) and the thresholds passed.
    model_75, vectorizer_75 = create_language_model(concepts, questions, 0.25)
    model_50, vectorizer_50 = create_language_model(concepts, questions, 0.5)
    save(model_100, vectorizer_100, f'tfidf_{name}100_model', f'tfidf_{name}100_vectors')
    save(model_75, vectorizer_75, f'tfidf_{name}75_model', f'tfidf_{name}75_vectors')
    save(model_50, vectorizer_50, f'tfidf_{name}50_model', f'tfidf_{name}50_vectors')
    log_info(f'training completed in {watch.time() - start}s\n')
Ejemplo n.º 9
0
def load_tags_answers(dataset):
    """Load the tags, answers and questions files for *dataset*.

    :param dataset: dataset name, used to build the /mnt/data/raw file paths
    :return: tuple (tags, answers, questions), each a list of lines
    """
    base = f'/mnt/data/raw/{dataset}'
    tags = file_as_list(f'{base}_tags.txt', local=False)
    answers = file_as_list(f'{base}_answers.txt', local=False)
    questions = file_as_list(f'{base}_questions.txt', local=False)
    return tags, answers, questions
Ejemplo n.º 10
0
def ionesoft_stopwords(question):
    """Normalize an ionesoft question and strip the custom stop words."""
    normalized = ionesoft_normalize(question)
    words = file_as_list('res/custom_ch_stopwords.txt')
    return remove(normalized, words)
Ejemplo n.º 11
0
def stackexchange_stopwords(question):
    """Normalize a stackexchange question and strip the custom stop words."""
    normalized = stackexchange_normalize(question)
    words = file_as_list('res/custom_en_stopwords.txt')
    return remove(normalized, words)