import zipfile

import pandas as pd

import text_analytic_tools.utility as utility

logger = utility.getLogger("text_analytic_tools")


def store_tokenized_corpus_as_archive(tokenized_docs, target_filename):
    """Stores a tokenized (string) corpus to a zip archive

    Parameters
    ----------
    tokenized_docs : iterable of (document_id, document_name, chunk_index, tokens)
        Stream of tokenized document chunks to store.
    target_filename : str
        Path of the zip archive to create.

    Returns
    -------
    list of tuple
        Per-chunk statistics (document_id, document_name, chunk_index, token count).
    """
    file_stats = []
    process_count = 0

    # TODO: Enable store of all documents line-by-line in a single file
    with zipfile.ZipFile(target_filename, "w") as zf:
        for document_id, document_name, chunk_index, tokens in tokenized_docs:
            # NOTE: the loop body is truncated in the source; the lines below
            # are a minimal completion that stores each chunk as a
            # whitespace-joined text file and records simple per-chunk stats.
            store_name = "{}_{}.txt".format(document_name, chunk_index)
            zf.writestr(store_name, " ".join(tokens))
            file_stats.append((document_id, document_name, chunk_index, len(tokens)))
            process_count += 1
            if process_count % 100 == 0:
                logger.info("%s documents stored...", process_count)

    return file_stats
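
# --- Usage sketch (illustrative; not part of the original module) -----------
# A minimal invocation of `store_tokenized_corpus_as_archive`. The sample
# document tuples below are hypothetical stand-ins for whatever upstream
# tokenizer yields (document_id, document_name, chunk_index, tokens).
if __name__ == "__main__":
    sample_docs = [
        (1, "document_one", 0, ["this", "is", "a", "tokenized", "chunk"]),
        (2, "document_two", 0, ["another", "small", "chunk", "of", "tokens"]),
    ]
    stats = store_tokenized_corpus_as_archive(sample_docs, "tokenized_corpus.zip")
    logger.info("stored %s chunks", len(stats))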
# -*- coding: utf-8 -*-
import re
import zipfile

import ftfy
import textacy

import text_analytic_tools.utility as utility

from . import utils

logger = utility.getLogger('corpus_text_analysis')

HYPHEN_REGEXP = re.compile(r'\b(\w+)-\s*\r?\n\s*(\w+)\b', re.UNICODE)


def preprocess_text(source_filename, target_filename, tick=utility.noop):
    '''
    Pre-processes a zipped archive of text documents, writing the cleaned
    documents to a new zip archive at `target_filename`.
    '''
    filenames = utility.zip_get_filenames(source_filename)
    texts = ((filename, utility.zip_get_text(source_filename, filename)) for filename in filenames)

    logger.info('Preparing text corpus...')
    tick(0, len(filenames))

    with zipfile.ZipFile(target_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
        for filename, text in texts:
            # NOTE: the loop body is truncated in the source; this minimal
            # completion repairs broken unicode with ftfy and rejoins words
            # hyphenated across line breaks. The `textacy` import above
            # suggests the full version applied further normalization as well.
            text = ftfy.fix_text(text)
            text = HYPHEN_REGEXP.sub(r'\1\2\n', text)
            zf.writestr(filename, text)
            tick()
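
# --- Usage sketch (illustrative; not part of the original module) -----------
# Demonstrates the dehyphenation HYPHEN_REGEXP enables: words split by a
# hyphen at a line break are rejoined. The replacement r'\1\2\n' is an
# assumption consistent with the regex's intent; the relative import above
# means this module cannot be executed directly, so the demo is a function.
def _demo_dehyphenate():
    sample = 'The pre-\nprocessing step rejoins hyphen-\r\nated words.'
    # Expected: 'The preprocessing\n step rejoins hyphenated\n words.'
    return HYPHEN_REGEXP.sub(r'\1\2\n', sample)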
import text_analytic_tools.domain.common_logic as common_logic
import text_analytic_tools
import text_analytic_tools.utility as utility
import text_analytic_tools.common.textacy_utility as textacy_utility
import text_analytic_tools.common as common

logger = utility.getLogger('tCoIR')

current_domain = text_analytic_tools.CURRENT_DOMAIN

container = None
source_path = '/home/roger/source/text_analytic_tools/data/tCoIR/tCoIR_en_45-72.txt.zip'

if container is None:
    container = textacy_utility.load_or_create(
        source_path=source_path,
        language='en',
        document_index=None,
        merge_entities=False,
        overwrite=False,
        use_compression=True,
        disabled_pipes=tuple(("ner", "parser", "textcat")),
    )

corpus = container.textacy_corpus

min_freq_stats = {
    k: textacy_utility.generate_word_count_score(corpus, k, 10)
    for k in ['lemma', 'lower', 'orth']
}

max_doc_freq_stats = {
    # NOTE: the source is truncated here; this completion is assumed to mirror
    # `min_freq_stats` with a document-frequency counterpart.
    # `generate_word_document_count_score` is a hypothetical helper name and
    # the threshold value is likewise illustrative.
    k: textacy_utility.generate_word_document_count_score(corpus, k, 100)
    for k in ['lemma', 'lower', 'orth']
}
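
# --- Usage sketch (illustrative; not part of the original script) -----------
# One way the threshold statistics above might be inspected. The shape of the
# returned value is an assumption: it is treated here as a sized mapping from
# a normalized token form ('lemma', 'lower', or 'orth') to a count score.
for normalize_by, scores in min_freq_stats.items():
    logger.info('min-frequency scores (%s): %s entries', normalize_by, len(scores))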