def read_payload(filename: str) -> DocumentPayload:
    """Read a tagged-frame payload back from its feather file."""
    feather_name = replace_extension(filename, ".feather")
    frame = pd.read_feather(feather_name)
    # The payload keeps the logical .csv name (path stripped), not the feather path.
    csv_name = replace_extension(strip_paths(feather_name), ".csv")
    return DocumentPayload(
        content_type=ContentType.TAGGED_FRAME,
        content=frame,
        filename=csv_name,
    )
def load(folder: str, tag: str) -> "TokenWindowCountMatrix":
    """Loads documents' (rows) token (column) window counts matrix"""
    filename = to_filename(folder=folder, tag=tag, postfix=DOCUMENT_COUNTS_POSTFIX)
    npz_name = replace_extension(filename, '.npz')
    npy_name = replace_extension(filename, '.npy')
    matrix: scipy.sparse.spmatrix = None
    if os.path.isfile(npz_name):
        matrix = scipy.sparse.load_npz(npz_name)
    # A .npy file, when present, overrides the .npz result (same order as original).
    if os.path.isfile(npy_name):
        matrix = np.load(npy_name, allow_pickle=True).item()
    return TokenWindowCountMatrix(document_term_window_counts=matrix)
def store(self, folder: str, tag: str, compressed: bool = True) -> None:
    """Stores documents' (rows) token (column) window counts matrix"""
    target = to_filename(folder=folder, tag=tag, postfix=DOCUMENT_COUNTS_POSTFIX)
    if not compressed:
        # Dense/pickled fallback path.
        np.save(replace_extension(target, '.npy'), self.document_term_window_counts, allow_pickle=True)
        return
    assert scipy.sparse.issparse(self.document_term_window_counts)
    scipy.sparse.save_npz(replace_extension(target, '.npz'), self.document_term_window_counts, compressed=True)
def load_options(filename: str) -> dict:
    """Loads co-occurrence compute options"""
    options_filename = replace_extension(filename, 'json')
    if not os.path.isfile(options_filename):
        # Preserve the original "marker dict" contract for a missing file.
        return {'not_found': options_filename}
    return read_json(options_filename)
def load_feather(filename: str) -> CoOccurrenceDataFrame:
    """Reads co-occurrences stored in Apache Arrow feather file format"""
    feather_filename: str = replace_extension(filename, ".feather")
    if not os.path.isfile(feather_filename):
        return None
    return pd.read_feather(feather_filename)
def load_co_occurrences(filename: str) -> CoOccurrenceDataFrame:
    """Load co-occurrences from CSV-file if exists on disk"""
    feather_filename: str = replace_extension(filename, ".feather")
    # Prefer the cached FEATHER file when present.
    if os.path.isfile(feather_filename):
        frame: pd.DataFrame = pd.read_feather(feather_filename)
        if 'index' in frame.columns:
            frame.drop(columns='index', inplace=True)
        return frame
    # Fall back to the original CSV source.
    if not os.path.isfile(filename):
        return None
    frame = pd.read_csv(filename, sep='\t', header=0, decimal=',', index_col=0, engine='c')
    # Best-effort: cache the CSV as feather for faster subsequent loads.
    with contextlib.suppress(Exception):
        logger.info("caching to FEATHER file")
        store_feather(feather_filename, frame)
    return frame
def store_co_occurrences(*, filename: str, co_occurrences: CoOccurrenceDataFrame, store_feather: bool = True) -> None:
    """Store co-occurrence result data to CSV-file (if loaded)"""
    if co_occurrences is None:
        return
    compression = 'infer'
    if filename.endswith('zip'):
        # Zip archives need an explicit inner member name.
        compression = dict(method='zip', archive_name=f"{strip_path_and_extension(filename)}.csv")
    logger.info("storing co-occurrences (CSV)")
    co_occurrences.to_csv(filename, sep='\t', header=True, compression=compression, decimal=',')
    if not store_feather:
        return
    # Best-effort feather sidecar for faster reloads.
    with contextlib.suppress(Exception):
        logger.info("storing co-occurrences (feather)")
        co_occurrences.reset_index(drop=True).to_feather(replace_extension(filename, ".feather"), compression="lz4")
def write_payload(folder: str, payload: DocumentPayload) -> DocumentPayload:
    """Persist the payload's content as a feather file inside *folder*."""
    target: str = jj(folder, replace_extension(payload.filename, ".feather"))
    frame = payload.content.reset_index(drop=True)
    frame.to_feather(target, compression="lz4")
    return payload
def store(self, filename: str) -> "Token2Id":
    """Store dictionary as CSV"""
    entry_name = replace_extension(strip_paths(filename), ".csv")
    with zipfile.ZipFile(filename, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        csv_text = self.to_dataframe().to_csv(sep='\t', header=True)
        archive.writestr(entry_name, data=csv_text)
    self.store_tf(filename)
    return self
def prepare_train_corpus(
    input_filename,
    pos_includes,
    pos_excludes,
    chunk_size,
    lemmatize,
    lower,
    remove_stopwords,
    min_word_length,
    keep_symbols,
    keep_numerals,
    version,
):
    """Prepares a training corpus from a Sparv XML archive."""
    extract_opts = ExtractTaggedTokensOpts(
        pos_includes=pos_includes,
        pos_excludes=pos_excludes,
        lemmatize=lemmatize,
    )
    # `remove_stopwords` doubles as the stopword language code (None disables it).
    transform_opts: TokensTransformOpts = TokensTransformOpts(
        to_lower=lower,
        remove_stopwords=remove_stopwords is not None,
        language=remove_stopwords,
        min_len=min_word_length,
        max_len=None,
        keep_numerals=keep_numerals,
        keep_symbols=keep_symbols,
    )
    target_filename = replace_extension(timestamp_filename(suffix_filename(input_filename, "text")), 'zip')
    reader_opts = {'chunk_size': chunk_size}
    sparv_corpus.sparv_xml_extract_and_store(
        source=input_filename,
        target=target_filename,
        version=version,
        extract_opts=extract_opts,
        reader_opts=reader_opts,
        transform_opts=transform_opts,
    )
    # Record the compute options next to the generated corpus.
    store_options_to_json_file(
        input_filename,
        target_filename,
        transform_opts,
        dict(version=version, extract_tokens_opts=extract_opts, reader_opts=reader_opts),
    )
def store_options_to_json_file(input_filename, output_filename, transform_opts: TokensTransformOpts, sparv_extract_opts):
    """Write corpus compute options as a JSON file next to the output corpus."""
    options = dict(
        input=input_filename,
        output=output_filename,
        transform_opts=transform_opts.props,
        sparv_extract_opts=sparv_extract_opts,
    )
    with open(replace_extension(output_filename, 'json'), 'w') as fp:
        json.dump(options, fp)
def smart_load(
    filename: str, *, missing_ok: bool = False, feather_pipe: Callable[[pd.DataFrame], pd.DataFrame] = None, **kwargs
) -> pd.DataFrame:
    """Load a DataFrame, preferring a cached feather file over the CSV source.

    Args:
        filename: CSV source path; a sibling ``.feather`` file is tried first.
        missing_ok: when True, return None instead of raising if neither file exists.
        feather_pipe: optional transform applied (via ``DataFrame.pipe``) to feather data only.
        **kwargs: forwarded to ``feather_pipe``.

    Raises:
        FileNotFoundError: when no file exists and ``missing_ok`` is False.
    """
    feather_filename: str = pu.replace_extension(filename, "feather")
    if isfile(feather_filename):
        data: pd.DataFrame = pd.read_feather(feather_filename)
        if feather_pipe is not None:
            data = data.pipe(feather_pipe, **kwargs)
    elif isfile(filename):
        data = pd.read_csv(filename, **CSV_OPTS)
    else:
        if missing_ok:
            return None
        # Fix: original raised FileNotFoundError(f"(unknown)") — an f-string with
        # no placeholder that hid which file was actually missing.
        raise FileNotFoundError(filename)
    return data
def _store_csv(self, target_folder: str) -> None:
    """Write each model frame to its own zipped CSV inside *target_folder*."""
    frames: list[tuple[pd.DataFrame, str]] = [
        (self.document_index.rename_axis(''), 'documents.csv'),
        (self.dictionary, 'dictionary.csv'),
        (self.topic_token_weights, 'topic_token_weights.csv'),
        (self.topic_token_overview, 'topic_token_overview.csv'),
        (self.document_topic_weights, 'document_topic_weights.csv'),
        (self.topic_diagnostics, 'topic_diagnostics.csv'),
        (self.token_diagnostics, 'token_diagnostics.csv'),
    ]
    for frame, name in frames:
        # Skip frames that were never computed/loaded.
        if frame is None:
            continue
        archive_name = jj(target_folder, pu.replace_extension(name, ".zip"))
        pu.pandas_to_csv_zip(archive_name, (frame, name), extension="csv", sep='\t')
def test_pipeline_can_can_be_saved_in_feather(config: CorpusConfig):
    """Round-trips each payload of a checkpointed pipeline through a feather file."""
    tagged_corpus_source: str = os.path.join(CORPUS_FOLDER, 'legal_instrument_five_docs_test_pos_csv.zip')
    pipeline = CorpusPipeline(config=config).checkpoint(tagged_corpus_source, force_checkpoint=False)
    for payload in pipeline.resolve():
        frame: pd.DataFrame = payload.content
        target = os.path.join(OUTPUT_FOLDER, replace_extension(payload.filename, ".feather"))
        frame.reset_index(drop=True).to_feather(target, compression="lz4")
        assert os.path.isfile(target)
        roundtripped = pd.read_feather(target)
        assert roundtripped is not None
def store_feather(filename: str, co_occurrences: CoOccurrenceDataFrame) -> None:
    """Stores co-occurrences in Apache Arrow feather file format"""
    target: str = replace_extension(filename, ".feather")
    # Feather requires a default RangeIndex, hence the reset.
    co_occurrences.reset_index(drop=True).to_feather(target, compression="lz4")
def payload_exists(folder: str, payload: DocumentPayload) -> bool:
    """Return True when the payload's feather file already exists in *folder*.

    Fix: the return annotation claimed ``DocumentPayload`` but the function
    returns the bool result of ``os.path.isfile``.
    """
    filename = jj(folder, replace_extension(payload.filename, ".feather"))
    return os.path.isfile(filename)
def options_filename(folder: str, tag: str) -> str:
    """Return the JSON options filename for a co-occurrence bundle."""
    base = co_occurrence_filename(folder, tag)
    return replace_extension(base, 'json')
def process_payload(self, payload: DocumentPayload) -> DocumentPayload:
    """Delegate to the parent task, then persist the payload content as feather."""
    payload = super().process_payload(payload)
    # NOTE(review): unlike write_payload/store_feather elsewhere in this file,
    # the index is not reset before to_feather, and no compression is passed —
    # feather rejects non-default indices, so confirm upstream guarantees a
    # default RangeIndex here (or that the asymmetry is intentional).
    payload.content.to_feather(
        jj(self.folder, replace_extension(payload.filename, 'feather')))
    return payload