def training(processors, wiki_extracts, questions, name):
    start = watch.time()
    log_info(f'collecting wiki_extracts from folder {wiki_extracts}')
    concepts = get_concepts_from_folder(wiki_extracts)
    concepts_sentences = get_sentences_from_concepts(concepts)
    questions = (line.rstrip('\r\n') for line in file_as_list(questions, local=False))
    # run every preprocessing step over both the questions and the concept sentences
    for processor in processors:
        questions = processor(questions)
        concepts_sentences = processor(concepts_sentences)
    # materialize the lazy generators so they can be counted and reused
    concepts_sentences = list(concepts_sentences)
    questions = list(questions)
    log_info(f'found {len(concepts_sentences)} sentences')
    log_info(f'collected {len(questions)} questions')
    sentences = get_words_from_sentences(concepts_sentences)
    questions = get_words_from_sentences(questions)
    log_info('creating language model')
    model, vectors = create_w2v_model(sentences, questions)
    save_model(model, f'w2v_{name}100_model.w2v')
    save_pre_computed_vectors(vectors, f'w2v_{name}100_vectors.pickle')
    log_info(f'training completed in {watch.time() - start}s\n')
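# Illustrative usage sketch, not part of the original module: `processors` is assumed to
# be a sequence of callables that each take an iterable of strings and return an iterable
# of strings (e.g. lazy generator pipelines). The processor and the paths below are
# hypothetical placeholders.
#
#   def lowercase_processor(lines):
#       return (line.lower() for line in lines)
#
#   training([lowercase_processor], 'wiki_extracts/', 'questions.txt', 'example')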
def get_concepts_from_folder(wiki_extracts):
    concepts = []
    for extract in os.listdir(wiki_extracts):
        if extract.startswith('wiki_'):
            log_info(f'- collecting {extract}')
            with open(os.path.join(wiki_extracts, extract), 'r', encoding='utf-8') as f:
                for line in f:
                    json_concept = from_str_to_json(line)
                    concepts.append(json_concept['text'])
    return concepts
def normalize():
    start = watch.time()
    doc = click.get_text_stream('stdin', 'utf-8').read()
    log_info('normalizing json')
    doc = extract_json_documents(doc)
    normalize_json_stream(doc)
    log_info(f'normalization completed in {watch.time() - start}s\n')
def stopwords(stopwords):
    start = watch.time()
    doc = click.get_text_stream('stdin', 'utf-8').read()
    log_info('removing stopwords from json')
    doc = extract_json_documents(doc)
    remove_stopwords_json_stream(doc, stopwords)
    log_info(f'removal of stopwords completed in {watch.time() - start}s\n')
def stackexchange(xml_file, questions_file, answers_file, tags_file):
    start = watch.time()
    log_info('extract questions with answers from stackexchange')
    xml = untangle.parse(xml_file)
    extract = extract_questions_with_answers(xml)
    save(extract, questions_file, answers_file, tags_file)
    log_info(f'extraction completed in {watch.time() - start}s\n')
def extract():
    start = watch.time()
    doc = click.get_text_stream('stdin', 'utf-8').read()
    log_info('extracting articles from raw data')
    json_docs = extract_json_documents(doc)
    for json_doc in json_docs:
        write_json_doc_out(json_doc)
    log_info(f'article extraction completed in {watch.time() - start}s\n')
def update_answer_or_save(row, answer_ids, store):
    question, question_id = extract_text_with_id(row)
    answer_id = extract_answer_id(row)
    tags = extract_tags(row)
    log_info(tags)
    if answer_id in answer_ids:
        # a record with this answer id already exists: update it with the question text and tags
        obj = next(qa for qa in store['questions_answers'] if qa['answer_id'] == answer_id)
        obj['question'] = question
        obj['tags'] = tags
    else:
        # first time this answer id is seen: track it and save a new record
        answer_ids.append(answer_id)
        save_as_json(question, answer_id, None, tags, store)
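# Assumed shape of `store`, inferred from the lookups above and given only as an
# illustration: a dict whose 'questions_answers' entry is a list of records keyed by
# 'answer_id'; the values below are made up.
#
#   store = {
#       'questions_answers': [
#           {'answer_id': 42, 'question': 'How do I ...?', 'tags': ['python', 'json']},
#       ]
#   }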
def process_pipeline(processors, wiki_extracts, questions, name):
    start = watch.time()
    concepts = get_concepts_from_folder(wiki_extracts)
    questions = (line.rstrip('\r\n') for line in file_as_list(questions, local=False))
    # run every preprocessing step over both the questions and the wiki concepts
    for processor in processors:
        questions = processor(questions)
        concepts = processor(concepts)
    concepts = list(concepts)
    questions = list(questions)
    log_info('creating language model')
    # build three model variants; the 0 / 0.25 / 0.5 values match the 100 / 75 / 50 file suffixes
    model_100, vectorizer_100 = create_language_model(concepts, questions, 0)
    model_75, vectorizer_75 = create_language_model(concepts, questions, 0.25)
    model_50, vectorizer_50 = create_language_model(concepts, questions, 0.5)
    save(model_100, vectorizer_100, f'tfidf_{name}100_model', f'tfidf_{name}100_vectors')
    save(model_75, vectorizer_75, f'tfidf_{name}75_model', f'tfidf_{name}75_vectors')
    save(model_50, vectorizer_50, f'tfidf_{name}50_model', f'tfidf_{name}50_vectors')
    log_info(f'training completed in {watch.time() - start}s\n')
def extract(column, separator):
    start = watch.time()
    csv = click.get_text_stream('stdin', 'utf-8').read()
    log_info(f'extracting column: {column} separator: {separator}')
    lines = extract_column_stream(csv, column, separator)
    log_info(f'extracted {lines} lines')
    log_info(f'extraction completed in {watch.time() - start}s\n')
def lemm(language):
    start = watch.time()
    doc = click.get_text_stream('stdin', 'utf-8').readlines()
    log_info('lemming')
    for i, line in enumerate(lemm_doc_stream(doc, language)):
        click.get_text_stream('stdout', 'utf-8').write(f'{line}\n')
    log_info(f'lemmed {i+1} lines')
    log_info(f'lemming completed in {watch.time() - start}s\n')
def stopwords(stopwords):
    start = watch.time()
    doc = click.get_text_stream('stdin', 'utf-8').readlines()
    log_info('remove stopwords')
    for i, line in enumerate(remove_stopwords_stream(doc, stopwords)):
        click.get_text_stream('stdout', 'utf-8').write(line)
    log_info(f'removed stopwords from {i+1} lines')
    log_info(f'removal of stopwords completed in {watch.time() - start}s\n')
def normalize():
    start = watch.time()
    doc = click.get_text_stream('stdin', 'utf-8').readlines()
    log_info('normalizing')
    for i, line in enumerate(normalize_doc_stream(doc)):
        click.get_text_stream('stdout', 'utf-8').write(f'{line}\n')
    log_info(f'normalized {i+1} lines')
    log_info(f'normalization completed in {watch.time() - start}s\n')
def grammar(grammar):
    start = watch.time()
    doc = click.get_text_stream('stdin', 'utf-8').readlines()
    log_info(f'correcting with grammar: {grammar}')
    for i, line in enumerate(correct_grammar_stream(doc, grammar)):
        click.get_text_stream('stdout', 'utf-8').write(line)
    log_info(f'corrected {i+1} lines')
    log_info(f'correction completed in {watch.time() - start}s\n')
def test_log_info(capsys):
    logging.getLogger().addHandler(logging.StreamHandler())
    log_info(msg='I am in a unit test', log=logging)
    captured = capsys.readouterr()
    log_msg = 'I am in a unit test\n'
    assert captured.err == log_msg