Exemple #1
0
 def get_info(self, opts: TextReaderOpts) -> SourceInfo:
     """Collect names, a name-to-filename lookup and filename metadata.

     The names returned by `self.namelist` for `opts.filename_pattern` are
     passed through `strip_paths`; metadata is extracted from the stripped
     names using `opts.filename_fields`.
     """
     full_names = self.namelist(pattern=opts.filename_pattern)
     short_names = strip_paths(full_names)

     # Map each (stripped) name back to the name reported by the source.
     lookup = {}
     for short_name, full_name in zip(short_names, full_names):
         lookup[strip_paths(short_name)] = full_name

     file_metadata = extract_filenames_metadata(
         filenames=short_names, filename_fields=opts.filename_fields
     )
     return SourceInfo(name_to_filename=lookup, names=short_names, metadata=file_metadata)
Exemple #2
0
def apply_filename_fields(document_index: DocumentIndex,
                          filename_fields: FilenameFieldSpecs):
    """Add columns derived from the `filename` column to `document_index`.

    Metadata is extracted from the path-stripped filenames according to
    `filename_fields`. Each extracted key becomes a new column; keys that
    already exist as columns are left untouched. The index is modified in
    place and also returned.

    Raises:
        DocumentIndexError: if `document_index` has no `filename` column.
    """
    if 'filename' not in document_index.columns:
        raise DocumentIndexError("filename not in document index")

    stripped_names = list(map(strip_paths, document_index.filename.tolist()))
    records: List[Mapping[str, Any]] = extract_filenames_metadata(
        filenames=stripped_names, filename_fields=filename_fields)

    for column, column_values in list_of_dicts_to_dict_of_lists(records).items():
        if column in document_index.columns:
            # Never overwrite a column that is already present.
            continue
        document_index[column] = column_values

    return document_index
Exemple #3
0
    def from_filenames(
            filenames: List[str],
            filename_fields: FilenameFieldSpecs) -> "DocumentIndexHelper":
        """Create a `DocumentIndexHelper` from metadata extracted from `filenames`.

        Returns None when `filename_fields` is None. If `filename_fields` has
        a `filename_fields` attribute of its own, that attribute is used as
        the field specification instead.
        """
        if filename_fields is None:
            return None

        # The caller may pass an options object (e.g. a TextReaderOpts) rather
        # than the specs themselves; unwrap it when that is the case.
        field_specs = getattr(filename_fields, 'filename_fields', filename_fields)

        extracted = extract_filenames_metadata(filenames=filenames,
                                               filename_fields=field_specs)
        return DocumentIndexHelper(metadata_to_document_index(extracted))
Exemple #4
0
    def __init__(
        self,
        filename: str,
        fields: Dict[str, int],
        filename_fields: FilenameFieldSpecs = None,
        index_field: str = None,
        sep: str = ' # ',
    ):
        """Simple corpus for document-per-line data.

        Every line of `filename` is split on `sep`; `fields` maps a field
        name to the position of its value within the split line. The `text`
        field is tokenized (whitespace split, lower-cased); all other fields,
        plus any metadata extracted from the filenames when `filename_fields`
        is given, are passed to `metadata_to_document_index`.

        Raises:
            ValueError: if `fields` lacks `filename` or `text`.
        """
        with open(filename, 'r') as source:
            raw_lines = source.readlines()

        if not ('filename' in fields and 'text' in fields):
            raise ValueError(
                "Fields `filename` and `text` are not specified (required fields)"
            )

        # One record per line, holding only the requested fields.
        records = []
        for parts in [raw_line.split(sep) for raw_line in raw_lines]:
            records.append({name: parts[position] for name, position in fields.items()})
        columns = list_of_dicts_to_dict_of_lists(records)

        self._filenames = columns['filename']
        self.iterator = None
        # str.split() without arguments never yields empty strings, so no
        # additional length filter is required.
        self.tokens = [
            [word.lower() for word in text.split()] for text in columns['text']
        ]

        index_columns = {}
        for name, column_values in columns.items():
            if name != 'text':
                index_columns[name] = column_values

        if filename_fields is not None:
            extracted = extract_filenames_metadata(
                filenames=self._filenames, filename_fields=filename_fields)
            # Extracted filename metadata takes precedence on key clashes.
            index_columns.update(list_of_dicts_to_dict_of_lists(extracted))

        self._document_index = metadata_to_document_index(
            index_columns, document_id_field=index_field)
Exemple #5
0
    def __init__(self, filename: str, reader_opts: TextReaderOpts):
        """Load a document-per-line corpus from `filename`.

        Each line is split on ' # ' into filename, title and text (in that
        order); the text is additionally tokenized by whitespace and
        lower-cased. A document index is built from metadata extracted from
        the filenames using `reader_opts.filename_fields`, with
        `reader_opts.index_field` passed as the document id field, and a
        `title` column is added to it.
        """
        filename_fields = reader_opts.filename_fields
        index_field = reader_opts.index_field

        with open(filename, 'r') as source:
            raw_lines = source.readlines()

        documents = []
        for raw_line in raw_lines:
            parts = raw_line.split(' # ')
            documents.append({
                'filename': parts[0],
                'title': parts[1],
                'text': parts[2],
                # str.split() without arguments never yields empty strings.
                'tokens': [word.lower() for word in parts[2].split()],
            })
        self.corpus_data = documents

        self.filenames = [document['filename'] for document in documents]
        self.iterator = None

        extracted = extract_filenames_metadata(filenames=self.filenames,
                                               filename_fields=filename_fields)
        self.document_index: pd.DataFrame = metadata_to_document_index(
            extracted, document_id_field=index_field)
        self.document_index['title'] = [document['title'] for document in documents]
Exemple #6
0
    def get_info(self, opts: TextReaderOpts) -> SourceInfo:
        """Collect names, a name-to-filename lookup and per-document metadata.

        Metadata for each document merges the fields extracted from its
        filename with the corresponding record of `self.filtered_data`
        (excluding the text column); record values win on key clashes.
        Filename metadata and records are paired positionally.
        """
        full_names = self.namelist(pattern=opts.filename_pattern)
        short_names = strip_paths(full_names)
        from_filenames = extract_filenames_metadata(
            filenames=short_names, filename_fields=opts.filename_fields)

        # Every dataframe column except the one holding the text itself.
        non_text_columns = []
        for column in self.filtered_data.columns.tolist():
            if column != self.text_column:
                non_text_columns.append(column)
        from_dataframe = self.filtered_data[non_text_columns].to_dict('records')

        merged = []
        for file_fields, row_fields in zip(from_filenames, from_dataframe):
            combined = dict(file_fields)
            combined.update(row_fields)
            merged.append(combined)

        lookup = {}
        for short_name, full_name in zip(short_names, full_names):
            lookup[strip_paths(short_name)] = full_name

        return SourceInfo(name_to_filename=lookup,
                          names=short_names,
                          metadata=merged)
Exemple #7
0
 def _create_all_metadata(self) -> Sequence[Dict[str, Any]]:
     """Extract filename metadata for the names in `self._all_filenames`.

     The field specification is taken from `self.reader_opts.filename_fields`.
     """
     all_names = self._all_filenames
     field_specs = self.reader_opts.filename_fields
     return extract_filenames_metadata(filenames=all_names, filename_fields=field_specs)
Exemple #8
0
 def _create_metadata(self) -> List[Mapping[str, Any]]:
     """Extract filename metadata for the names in `self.filenames`.

     The field specification is taken from `self.reader_opts.filename_fields`.
     """
     names = self.filenames
     field_specs = self.reader_opts.filename_fields
     return extract_filenames_metadata(filenames=names, filename_fields=field_specs)