Example 1
def training(processors, wiki_extracts, questions, name):
    start = watch.time()

    log_info(f'collecting wiki_extracts from folder {wiki_extracts}')

    concepts = get_concepts_from_folder(wiki_extracts)
    concepts_sentences = get_sentences_from_concepts(concepts)
    questions = (line.rstrip('\r\n')
                 for line in file_as_list(questions, local=False))

    # Apply every processor lazily to both the questions and the concept sentences.
    for processor in processors:
        questions = processor(questions)
        concepts_sentences = processor(concepts_sentences)

    # Materialize the processed streams so they can be counted and reused.
    concepts_sentences = list(concepts_sentences)
    questions = list(questions)

    log_info(f'found {len(concepts_sentences)} sentences')
    log_info(f'collected {len(questions)} questions')

    # Tokenize both collections before building the word2vec model.
    sentences = get_words_from_sentences(concepts_sentences)
    questions = get_words_from_sentences(questions)

    log_info(f'creating language model')
    model, vectors = create_w2v_model(sentences, questions)

    save_model(model, f'w2v_{name}100_model.w2v')
    save_pre_computed_vectors(vectors, f'w2v_{name}100_vectors.pickle')

    log_info(f'training completed in {watch.time() - start}s\n')
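create_w2v_model itself is not shown in these examples. The sketch below is a guess at what it could look like, assuming gensim's Word2Vec with 100-dimensional vectors (suggested by the 100 in the saved filenames) and pre-computed question vectors obtained by averaging word vectors; every name and parameter here is an assumption, not the original implementation.

# Hypothetical sketch of create_w2v_model, assuming gensim >= 4.0.
import numpy as np
from gensim.models import Word2Vec

def create_w2v_model(sentences, questions, size=100):
    sentences = list(sentences)
    questions = list(questions)
    # Train word2vec on the tokenized concept sentences plus the questions.
    model = Word2Vec(sentences=sentences + questions, vector_size=size,
                     window=5, min_count=1, workers=4)
    # Pre-compute one vector per question by averaging its word vectors;
    # words missing from the vocabulary are skipped.
    vectors = []
    for question in questions:
        word_vectors = [model.wv[word] for word in question if word in model.wv]
        vectors.append(np.mean(word_vectors, axis=0) if word_vectors else np.zeros(size))
    return model, vectors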
Example 2
def get_concepts_from_folder(wiki_extracts):
    concepts = []
    for extract in os.listdir(wiki_extracts):
        if extract.startswith('wiki_'):
            log_info(f'- collecting {extract}')
            # Each wiki_* file holds one JSON document per line; keep only its text.
            with open(os.path.join(wiki_extracts, extract), 'r', encoding='utf-8') as f:
                for line in f:
                    json_concept = from_str_to_json(line)
                    concepts.append(json_concept['text'])
    return concepts
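from_str_to_json is not included either; given that each wiki_ file is read line by line and only the 'text' field is kept, it is presumably a thin wrapper around json.loads. A minimal sketch under that assumption (the other field names in the comment are illustrative):

# Hypothetical sketch of from_str_to_json: each line is assumed to be one
# JSON document, e.g. {"id": "...", "title": "...", "text": "..."}.
import json

def from_str_to_json(line):
    return json.loads(line)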
Example 3
def normalize():
    start = watch.time()
    doc = click.get_text_stream('stdin', 'utf-8').read()

    log_info(f'normalizing json')

    doc = extract_json_documents(doc)
    normalize_json_stream(doc)

    log_info(f'normalization completed in {watch.time() - start}s\n')
Example 4
def stopwords(stopwords):
    start = watch.time()
    doc = click.get_text_stream('stdin', 'utf-8').read()

    log_info(f'remove stopwords json')

    doc = extract_json_documents(doc)
    remove_stopwords_json_stream(doc, stopwords)

    log_info(f'removal of stopwords completed in {watch.time() - start}s\n')
Example 5
def stackexchange(xml_file, questions_file, answers_file, tags_file):
    start = watch.time()

    log_info(f'extract questions with answers from stackexchange')

    xml = untangle.parse(xml_file)
    extract = extract_questions_with_answers(xml)

    save(extract, questions_file, answers_file, tags_file)

    log_info(f'extraction completed in {watch.time() - start}s\n')
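extract_questions_with_answers is not shown. A rough sketch of how it might traverse the parsed document, assuming the input is a StackExchange Posts.xml dump read with untangle; the element and attribute names follow the public dump schema, and the returned structure is an assumption rather than the original code.

# Hypothetical sketch of extract_questions_with_answers for a StackExchange
# Posts.xml dump parsed with untangle; the returned shape is an assumption.
def extract_questions_with_answers(xml):
    questions_answers = []
    for row in xml.posts.row:
        # PostTypeId '1' marks a question; AcceptedAnswerId links its answer.
        if row['PostTypeId'] == '1' and row['AcceptedAnswerId'] is not None:
            questions_answers.append({
                'question': row['Title'],
                'answer_id': row['AcceptedAnswerId'],
                'tags': row['Tags'],
            })
    return questions_answers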
Example 6
def extract():
    start = watch.time()
    doc = click.get_text_stream('stdin', 'utf-8').read()

    log_info(f'extract articles from raw data')

    json_docs = extract_json_documents(doc)

    for json_doc in json_docs:
        write_json_doc_out(json_doc)

    log_info(f'article extracting ended in {watch.time() - start}s\n')
Example 7
def update_answer_or_save(row, answer_ids, store):
    question, question_id = extract_text_with_id(row)
    answer_id = extract_answer_id(row)
    tags = extract_tags(row)
    log_info(tags)
    if answer_id in answer_ids:
        # An entry with this answer id already exists: update its question
        # text and tags in place.
        obj = next(qa for qa in store['questions_answers']
                   if qa['answer_id'] == answer_id)
        obj['question'] = question
        obj['tags'] = tags
    else:
        # First time this answer id is seen: register it and store the
        # question with no answer text yet.
        answer_ids.append(answer_id)
        save_as_json(question, answer_id, None, tags, store)
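save_as_json is not part of the example; judging from the reads above (qa['answer_id'], obj['question'], obj['tags']), it likely appends one dict per question to store['questions_answers']. A minimal sketch under that assumption, with the 'answer' key being a guess:

# Hypothetical sketch of save_as_json, inferred from how the store is read
# in update_answer_or_save; the 'answer' key is an assumption.
def save_as_json(question, answer_id, answer, tags, store):
    store['questions_answers'].append({
        'question': question,
        'answer_id': answer_id,
        'answer': answer,
        'tags': tags,
    })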
Example 8
def process_pipeline(processors, wiki_extracts, questions, name):
    start = watch.time()
    concepts = get_concepts_from_folder(wiki_extracts)
    questions = (line.rstrip('\r\n') for line in file_as_list(questions, local=False))
    # Apply every processor lazily to both streams before materializing them.
    for processor in processors:
        questions = processor(questions)
        concepts = processor(concepts)
    concepts = list(concepts)
    questions = list(questions)
    log_info(f'creating language model')
    # Build three TF-IDF models with different cut-off parameters and save
    # them under the matching 100/75/50 suffixes.
    model_100, vectorizer_100 = create_language_model(concepts, questions, 0)
    model_75, vectorizer_75 = create_language_model(concepts, questions, 0.25)
    model_50, vectorizer_50 = create_language_model(concepts, questions, 0.5)
    save(model_100, vectorizer_100, f'tfidf_{name}100_model', f'tfidf_{name}100_vectors')
    save(model_75, vectorizer_75, f'tfidf_{name}75_model', f'tfidf_{name}75_vectors')
    save(model_50, vectorizer_50, f'tfidf_{name}50_model', f'tfidf_{name}50_vectors')
    log_info(f'training completed in {watch.time() - start}s\n')
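create_language_model is not shown either. A plausible sketch assuming scikit-learn's TfidfVectorizer, where the third argument is read as the share of the most document-frequent terms to drop (0 keeps everything, 0.25 and 0.5 prune more, matching the 100/75/50 filename suffixes); both that reading and the returned values are assumptions.

# Hypothetical sketch of create_language_model, assuming scikit-learn's
# TfidfVectorizer; treating cut as a document-frequency cut-off is a guess.
from sklearn.feature_extraction.text import TfidfVectorizer

def create_language_model(concepts, questions, cut):
    # Ignore terms that appear in more than (1 - cut) of the documents.
    vectorizer = TfidfVectorizer(max_df=1.0 - cut)
    model = vectorizer.fit_transform(concepts + questions)
    return model, vectorizer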
Example 9
def extract(column, separator):
    start = watch.time()
    csv = click.get_text_stream('stdin', 'utf-8').read()

    log_info(f'extracting column: {column} separator: {separator}')

    lines = extract_column_stream(csv, column, separator)

    log_info(f'extracted {lines} lines')
    log_info(f'extraction completed in {watch.time() - start}s\n')
Example 10
def lemm(language):
    start = watch.time()
    doc = click.get_text_stream('stdin', 'utf-8').readlines()

    log_info(f'lemming')

    # Count the lines as they are streamed to stdout (stays 0 for empty input).
    count = 0
    for count, line in enumerate(lemm_doc_stream(doc, language), start=1):
        click.get_text_stream('stdout', 'utf-8').write(f'{line}\n')

    log_info(f'lemmed {count} lines')
    log_info(f'lemming completed in {watch.time() - start}s\n')
Example 11
def stopwords(stopwords):
    start = watch.time()
    doc = click.get_text_stream('stdin', 'utf-8').readlines()

    log_info(f'remove stopwords')

    # Count the lines as they are streamed to stdout (stays 0 for empty input).
    count = 0
    for count, line in enumerate(remove_stopwords_stream(doc, stopwords), start=1):
        click.get_text_stream('stdout', 'utf-8').write(f'{line}')

    log_info(f'removed stopwords from {count} lines')
    log_info(f'removal of stopwords completed in {watch.time() - start}s\n')
Example 12
def normalize():
    start = watch.time()
    doc = click.get_text_stream('stdin', 'utf-8').readlines()

    log_info(f'normalizing')

    # Count the lines as they are streamed to stdout (stays 0 for empty input).
    count = 0
    for count, line in enumerate(normalize_doc_stream(doc), start=1):
        click.get_text_stream('stdout', 'utf-8').write(f'{line}\n')

    log_info(f'normalized {count} lines')
    log_info(f'normalization completed in {watch.time() - start}s\n')
Example 13
def grammar(grammar):
    start = watch.time()
    doc = click.get_text_stream('stdin', 'utf-8').readlines()

    log_info(f'correcting with grammar: {grammar}')

    # Count the lines as they are streamed to stdout (stays 0 for empty input).
    count = 0
    for count, line in enumerate(correct_grammar_stream(doc, grammar), start=1):
        click.get_text_stream('stdout', 'utf-8').write(f'{line}')

    log_info(f'corrected {count} lines')
    log_info(f'correction completed in {watch.time() - start}s\n')
Example 14
def test_log_info(capsys):
    logging.getLogger().addHandler(logging.StreamHandler())
    log_info(msg='I am in a unit test', log=logging)
    captured = capsys.readouterr()
    log_msg = 'I am in a unit test\n'
    assert captured.err == log_msg