def get_info(self, opts: TextReaderOpts) -> SourceInfo:
    """Describe the entries of this source that match `opts.filename_pattern`.

    The returned `SourceInfo` carries:
      - `names`: the result of `strip_paths` applied to the matching entries,
      - `name_to_filename`: a mapping from each stripped name back to the
        entry as returned by `namelist`,
      - `metadata`: whatever `extract_filenames_metadata` yields for the
        stripped names using `opts.filename_fields`.
    """
    stored_names = self.namelist(pattern=opts.filename_pattern)
    document_names = strip_paths(stored_names)

    # Pair each stripped name with the entry it was derived from.
    lookup = {}
    for document_name, stored_name in zip(document_names, stored_names):
        lookup[strip_paths(document_name)] = stored_name

    name_metadata = extract_filenames_metadata(
        filenames=document_names,
        filename_fields=opts.filename_fields,
    )
    return SourceInfo(
        name_to_filename=lookup,
        names=document_names,
        metadata=name_metadata,
    )
def apply_filename_fields(document_index: DocumentIndex, filename_fields: FilenameFieldSpecs):
    """Add columns derived from each document's filename to `document_index`.

    The `filename` column is passed through `strip_paths` and handed to
    `extract_filenames_metadata` together with `filename_fields`. Every
    resulting key that is not already a column is added as a new column;
    existing columns are left untouched.

    The index is modified in place and also returned.

    Raises:
        DocumentIndexError: if `document_index` has no `filename` column.
    """
    if 'filename' not in document_index.columns:
        raise DocumentIndexError("filename not in document index")

    stripped_names = list(map(strip_paths, document_index.filename.tolist()))
    parsed: List[Mapping[str, Any]] = extract_filenames_metadata(
        filenames=stripped_names,
        filename_fields=filename_fields,
    )

    # Pivot the per-document records into one list of values per field.
    columns = list_of_dicts_to_dict_of_lists(parsed)
    for column_name, column_values in columns.items():
        if column_name in document_index.columns:
            continue  # never overwrite a column that is already present
        document_index[column_name] = column_values

    return document_index
def from_filenames(filenames: List[str], filename_fields: FilenameFieldSpecs) -> "DocumentIndexHelper":
    """Build a `DocumentIndexHelper` from metadata extracted from `filenames`.

    `filename_fields` may be the field specification itself, or any object
    exposing a `filename_fields` attribute (e.g. a TextReaderOpts), in which
    case that attribute is used as the specification.

    Returns None (not a helper) when `filename_fields` is None.
    """
    if filename_fields is None:
        return None

    # An options object was passed instead of the bare specification:
    # unwrap it. Anything without the attribute is used as given.
    field_specs = getattr(filename_fields, 'filename_fields', filename_fields)

    extracted = extract_filenames_metadata(filenames=filenames, filename_fields=field_specs)
    return DocumentIndexHelper(metadata_to_document_index(extracted))
def __init__(
    self,
    filename: str,
    fields: Dict[str, int],
    filename_fields: FilenameFieldSpecs = None,
    index_field: str = None,
    sep: str = ' # ',
):
    """Simple corpus for document-per-line data.

    Each line of `filename` is one document whose columns are separated
    by `sep`.

    Args:
        filename: Path of the text file to read.
        fields: Maps a field name to the zero-based column holding it.
            Must contain at least `filename` and `text`.
        filename_fields: Optional specification passed to
            `extract_filenames_metadata`; the extracted values are merged
            into the per-document fields, overriding same-named fields
            read from the file.
        index_field: Passed to `metadata_to_document_index` as
            `document_id_field`.
        sep: Column separator.

    Raises:
        ValueError: if `fields` lacks `filename` or `text`.
    """
    # Validate first so that a bad `fields` spec is reported without
    # reading the whole file.
    if 'filename' not in fields or 'text' not in fields:
        raise ValueError(
            "Fields `filename` and `text` are not specified (required fields)"
        )

    with open(filename, 'r') as f:
        # Drop the line terminator before splitting; otherwise whichever
        # field is mapped to the last column keeps a trailing "\n".
        rows = [line.rstrip('\n').split(sep) for line in f]

    data = list_of_dicts_to_dict_of_lists(
        [{name: row[column] for name, column in fields.items()} for row in rows]
    )

    self._filenames = data['filename']
    self.iterator = None
    # str.split() without arguments never yields empty strings, so no
    # extra filtering is needed.
    self.tokens = [[token.lower() for token in text.split()] for text in data['text']]

    # Everything except the raw text becomes document-index data.
    fields_data = {k: v for k, v in data.items() if k != 'text'}

    if filename_fields is not None:
        filename_data = extract_filenames_metadata(
            filenames=self._filenames, filename_fields=filename_fields
        )
        fields_data = {
            **fields_data,
            **list_of_dicts_to_dict_of_lists(filename_data),
        }

    self._document_index = metadata_to_document_index(
        fields_data, document_id_field=index_field
    )
def __init__(self, filename: str, reader_opts: TextReaderOpts):
    """Load a corpus from a text file with one document per line.

    Every line is split on ' # ' and its first three parts are taken as
    filename, title and text. The text is additionally tokenised on
    whitespace and lower-cased. A document index is built from the
    metadata that `extract_filenames_metadata` derives from the filenames
    using `reader_opts.filename_fields`, and gets a `title` column.
    """
    filename_fields = reader_opts.filename_fields
    index_field = reader_opts.index_field

    with open(filename, 'r') as f:
        lines = f.readlines()

    documents = []
    for line in lines:
        parts = line.split(' # ')
        documents.append(
            {
                'filename': parts[0],
                'title': parts[1],
                'text': parts[2],
                'tokens': [token.lower() for token in parts[2].split() if token],
            }
        )

    self.corpus_data = documents
    self.filenames = [document['filename'] for document in documents]
    self.iterator = None

    metadata = extract_filenames_metadata(
        filenames=self.filenames,
        filename_fields=filename_fields,
    )
    self.document_index: pd.DataFrame = metadata_to_document_index(
        metadata,
        document_id_field=index_field,
    )
    self.document_index['title'] = [document['title'] for document in self.corpus_data]
def get_info(self, opts: TextReaderOpts) -> SourceInfo:
    """Describe the entries of this source that match `opts.filename_pattern`.

    Per-document metadata is the union of two sources, paired by position:
    the values `extract_filenames_metadata` derives from the stripped
    names, and the row of `self.filtered_data` restricted to every column
    except `self.text_column`. On a key clash the data-frame value wins.
    """
    stored_names = self.namelist(pattern=opts.filename_pattern)
    document_names = strip_paths(stored_names)

    name_metadata = extract_filenames_metadata(
        filenames=document_names,
        filename_fields=opts.filename_fields,
    )

    # All data-frame columns except the text itself are treated as metadata.
    extra_columns = [
        column for column in self.filtered_data.columns.tolist() if column != self.text_column
    ]
    row_metadata = self.filtered_data[extra_columns].to_dict('records')

    combined = []
    for from_name, from_row in zip(name_metadata, row_metadata):
        combined.append({**from_name, **from_row})

    # Pair each stripped name with the entry it was derived from.
    lookup = {}
    for document_name, stored_name in zip(document_names, stored_names):
        lookup[strip_paths(document_name)] = stored_name

    return SourceInfo(name_to_filename=lookup, names=document_names, metadata=combined)
def _create_all_metadata(self) -> Sequence[Dict[str, Any]]:
    """Extract metadata for `self._all_filenames`.

    Delegates to `extract_filenames_metadata` using the field
    specification in `self.reader_opts.filename_fields`.
    """
    every_filename = self._all_filenames
    field_specs = self.reader_opts.filename_fields
    return extract_filenames_metadata(filenames=every_filename, filename_fields=field_specs)
def _create_metadata(self) -> List[Mapping[str, Any]]:
    """Extract metadata for `self.filenames`.

    Delegates to `extract_filenames_metadata` using the field
    specification in `self.reader_opts.filename_fields`.
    """
    current_filenames = self.filenames
    field_specs = self.reader_opts.filename_fields
    return extract_filenames_metadata(filenames=current_filenames, filename_fields=field_specs)