def create_test_source_info(filenames: List[str]) -> cr.SourceInfo:
    """Build a `cr.SourceInfo` test fixture from the given filenames.

    Args:
        filenames: full (possibly path-qualified) filenames.

    Returns:
        SourceInfo with path-stripped names, a name -> full-filename mapping,
        and per-file metadata whose 'year' is parsed from characters 2:6 of
        the basename (i.e. basenames are assumed to look like `xx1999...`).
    """
    # Strip each path exactly once and reuse the result; the original
    # re-applied strip_paths inside the mapping comprehension.
    basenames = [strip_paths(x) for x in filenames]
    return cr.SourceInfo(
        names=basenames,
        name_to_filename=dict(zip(basenames, filenames)),
        metadata=[{'filename': x, 'year': int(x[2:6])} for x in basenames],
    )
def get_info(self, opts: TextReaderOpts) -> SourceInfo:
    """Collect names, name->filename mapping and filename metadata for matching files."""
    matching = self.namelist(pattern=opts.filename_pattern)
    stripped = strip_paths(matching)
    # NOTE(review): `stripped` already holds path-stripped names, so the
    # extra strip_paths below is presumably a harmless no-op — kept for
    # fidelity with the original behavior.
    mapping = {}
    for short_name, full_name in zip(stripped, matching):
        mapping[strip_paths(short_name)] = full_name
    file_metadata = extract_filenames_metadata(
        filenames=stripped, filename_fields=opts.filename_fields
    )
    return SourceInfo(name_to_filename=mapping, names=stripped, metadata=file_metadata)
def read_payload(filename: str) -> DocumentPayload:
    """Load a tagged frame stored in feather format and wrap it in a DocumentPayload.

    The payload content is read from the ".feather" variant of `filename`,
    while the payload's filename keeps the basename with a ".csv" extension.
    """
    feather_path = replace_extension(filename, ".feather")
    csv_name = replace_extension(strip_paths(feather_path), ".csv")
    frame = pd.read_feather(feather_path)
    return DocumentPayload(
        content_type=ContentType.TAGGED_FRAME,
        content=frame,
        filename=csv_name,
    )
def create_instream(self) -> Iterable[DocumentPayload]:
    """Yield DocumentPayloads for every corpus file, optionally decoding id columns.

    For each tagged frame loaded from disk:
      * if `self.id_to_token` is set, token/lemma/POS id columns are mapped back
        to strings and the id columns are dropped;
      * frames without a 'document_id' column are yielded as a single payload;
      * frames with 'document_id' are split into one payload per document.
    POS counts are registered for every payload before it is yielded.
    """
    # Bind bound-method lookups once — these are used per-row via .apply.
    fg = self.token2id.id2token.get
    dg = self.docid2name.get
    pg = self.pipeline.payload.pos_schema.id_to_pos.get
    text_column, pos_column, lemma_column = self.pipeline.payload.tagged_columns_names2
    # Cached column set of the FIRST loaded frame; assumes all frames share
    # the same columns — TODO confirm this invariant holds for all corpora.
    loaded_frame_columns: set = None
    for filename in tqdm(self.corpus_filenames, total=len(self.corpus_filenames)):
        loaded_frame: pd.DataFrame = self.load_tagged_frame(filename)
        if self.id_to_token:
            if 'token_id' in loaded_frame.columns:
                loaded_frame[text_column] = loaded_frame.token_id.apply(fg)
            if 'lemma_id' in loaded_frame.columns:
                loaded_frame[lemma_column] = loaded_frame.lemma_id.apply(fg)
            # NOTE(review): pos_id is decoded unconditionally in this branch —
            # presumably every tagged frame has a 'pos_id' column; verify.
            loaded_frame[pos_column] = loaded_frame.pos_id.apply(pg)
            loaded_frame.drop(columns=['token_id', 'pos_id', 'lemma_id'], inplace=True, errors='ignore')
        # Walrus caches the column set on first iteration only (see note above).
        if 'document_id' not in (loaded_frame_columns or (loaded_frame_columns := set(loaded_frame.columns))):
            # Whole file is one document.
            payload: DocumentPayload = DocumentPayload(
                content_type=self.out_content_type, content=loaded_frame, filename=strip_paths(filename)
            )
            self.register_pos_counts(payload)
            yield payload
        else:
            # One payload per document within the file.
            for document_id, tagged_frame in loaded_frame.groupby('document_id'):
                tagged_frame.reset_index(drop=True, inplace=True)
                payload: DocumentPayload = DocumentPayload(
                    content_type=self.out_content_type, content=tagged_frame, filename=dg(document_id)
                )
                self.register_pos_counts(payload)
                yield payload
def store(self, filename: str) -> "Token2Id":
    """Persist the vocabulary as a tab-separated CSV inside a zip archive.

    The archive entry is named after `filename`'s basename with a ".csv"
    extension; term frequencies are stored alongside via `store_tf`.

    Returns:
        self, to allow call chaining.
    """
    entry_name = replace_extension(strip_paths(filename), ".csv")
    csv_data = self.to_dataframe().to_csv(sep='\t', header=True)
    with zipfile.ZipFile(filename, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr(entry_name, data=csv_data)
    self.store_tf(filename)
    return self
def apply_filename_fields(document_index: DocumentIndex, filename_fields: FilenameFieldSpecs):
    """Extend `document_index` with columns extracted from its filenames.

    Raises:
        DocumentIndexError: if the index has no 'filename' column.
    """
    if 'filename' not in document_index.columns:
        raise DocumentIndexError("filename not in document index")
    stripped_names = [strip_paths(name) for name in document_index.filename.tolist()]
    metadata: List[Mapping[str, Any]] = extract_filenames_metadata(
        filenames=stripped_names, filename_fields=filename_fields
    )
    columnar = list_of_dicts_to_dict_of_lists(metadata)
    # Only add columns that do not already exist; never overwrite.
    for column, values in columnar.items():
        if column not in document_index.columns:
            document_index[column] = values
    return document_index
def find_tags(folder: str) -> List[str]:
    """Return the unique dump tags present in `folder`.

    A tag is the basename of any file ending in one of the known dump
    suffixes, with that suffix removed.
    """
    known_suffixes = [
        '_vector_data.npz',
        '_vector_data.npy',
        '_vectorizer_data.pickle',
        '_document_index.csv.gz',
    ]
    unique_tags: set = set()
    for suffix in known_suffixes:
        for basename in strip_paths(glob.glob(jj(folder, f'*{suffix}'))):
            # Drop the suffix; equivalent to x[0:len(x) - len(suffix)].
            unique_tags.add(basename[:-len(suffix)])
    return list(unique_tags)
def get_info(self, opts: TextReaderOpts) -> SourceInfo:
    """Build SourceInfo merging filename-derived metadata with non-text dataframe columns."""
    full_names = self.namelist(pattern=opts.filename_pattern)
    short_names = strip_paths(full_names)
    names_metadata = extract_filenames_metadata(
        filenames=short_names, filename_fields=opts.filename_fields
    )
    # All columns except the text payload become per-document metadata.
    extra_columns = [c for c in self.filtered_data.columns.tolist() if c != self.text_column]
    frame_metadata = self.filtered_data[extra_columns].to_dict('records')
    # Merge pairwise: dataframe fields win over filename-derived fields on key clashes.
    merged = [{**a, **b} for a, b in zip(names_metadata, frame_metadata)]
    lookup = {strip_paths(name): full for name, full in zip(short_names, full_names)}
    return SourceInfo(name_to_filename=lookup, names=short_names, metadata=merged)
def metadata(self) -> Sequence[Dict[str, Any]]:
    """Metadata records for the source files, looked up by path-stripped names."""
    names = strip_paths(self._get_filenames())
    return self._get_metadata(names)
def test_strip_path():
    """strip_paths drops directory components; list input is handled element-wise."""
    for given, expected in [
        ('/tmp/hej.txt', 'hej.txt'),
        (['/tmp/hej.txt'], ['hej.txt']),
        ('/tmp/hej', 'hej'),
        ('hej.x', 'hej.x'),
    ]:
        assert strip_paths(given) == expected
def exists(self, filename: str) -> bool:
    """True if `filename` matches one of the stored, path-stripped names."""
    # NOTE(review): self.filenames appears to already hold stripped names
    # (see the class __init__), so this strip_paths is presumably a no-op
    # — kept for fidelity with the original; verify before removing.
    stripped = strip_paths(self.filenames)
    return filename in stripped
def namelist(self, *, pattern: str = '*.*') -> List[str]:
    """Return path-stripped names of stored items whose name matches the fnmatch `pattern`."""
    matching = [name for name, _unused in self.items if fnmatch(name, pattern)]
    return strip_paths(matching)
def __init__(self, items: List[StoreItemPair]):
    """Index the given (name, item) pairs by their path-stripped names."""
    self.items: List[StoreItemPair] = items
    # Lookup from stripped name to the full (name, item) pair.
    self.map = {strip_paths(pair[0]): pair for pair in self.items}
    names = [name for name, _unused in self.items]
    self.filenames = strip_paths(names)
def namelist(self, *, pattern: str = '*.*') -> List[str]:
    """Return path-stripped filenames in this source matching the glob `pattern`."""
    effective_pattern = pattern or '*.*'
    return strip_paths(glob(self.to_path(effective_pattern)))