Beispiel #1
0
def read_payload(filename: str) -> DocumentPayload:
    """Read a tagged-frame payload from the feather twin of *filename*."""
    feather_path = replace_extension(filename, ".feather")
    payload = DocumentPayload(
        content_type=ContentType.TAGGED_FRAME,
        content=pd.read_feather(feather_path),
        # Payload keeps a CSV-style basename regardless of storage format.
        filename=replace_extension(strip_paths(feather_path), ".csv"),
    )
    return payload
Beispiel #2
0
    def load(folder: str, tag: str) -> "TokenWindowCountMatrix":
        """Load the documents (rows) by token (columns) window counts matrix.

        Looks for a sparse ``.npz`` file first, then a pickled ``.npy``
        file; when both exist the ``.npy`` result wins since it is
        assigned last. Returns a matrix-less instance if neither exists.
        """
        window_counts: scipy.sparse.spmatrix = None
        base = to_filename(folder=folder, tag=tag, postfix=DOCUMENT_COUNTS_POSTFIX)

        npz_path = replace_extension(base, '.npz')
        if os.path.isfile(npz_path):
            window_counts = scipy.sparse.load_npz(npz_path)

        npy_path = replace_extension(base, '.npy')
        if os.path.isfile(npy_path):
            # .npy stores a pickled object; .item() unwraps the 0-d array.
            window_counts = np.load(npy_path, allow_pickle=True).item()

        return TokenWindowCountMatrix(document_term_window_counts=window_counts)
Beispiel #3
0
 def store(self, folder: str, tag: str, compressed: bool = True) -> None:
     """Persist the documents (rows) by token (columns) window counts matrix.

     With *compressed* the matrix must be sparse and is written as ``.npz``;
     otherwise it is pickled to ``.npy`` with numpy.
     """
     target = to_filename(folder=folder, tag=tag, postfix=DOCUMENT_COUNTS_POSTFIX)
     if not compressed:
         np.save(replace_extension(target, '.npy'),
                 self.document_term_window_counts,
                 allow_pickle=True)
         return
     assert scipy.sparse.issparse(self.document_term_window_counts)
     scipy.sparse.save_npz(replace_extension(target, '.npz'),
                           self.document_term_window_counts,
                           compressed=True)
Beispiel #4
0
def load_options(filename: str) -> dict:
    """Load co-occurrence compute options from the sibling JSON file.

    Returns ``{'not_found': <path>}`` when no options file exists.
    """
    json_path = replace_extension(filename, 'json')
    if not os.path.isfile(json_path):
        return {'not_found': json_path}
    return read_json(json_path)
Beispiel #5
0
def load_feather(filename: str) -> CoOccurrenceDataFrame:
    """Read co-occurrences from the feather twin of *filename*, or None."""
    path: str = replace_extension(filename, ".feather")
    if not os.path.isfile(path):
        return None
    return pd.read_feather(path)
Beispiel #6
0
def load_co_occurrences(filename: str) -> CoOccurrenceDataFrame:
    """Load co-occurrences, preferring a feather cache over the CSV file.

    Returns None when neither the feather twin nor the CSV exists. On a
    CSV read the result is cached back to feather on a best-effort basis.
    """
    feather_filename: str = replace_extension(filename, ".feather")

    # Fast path: a cached feather file exists.
    if os.path.isfile(feather_filename):
        frame: pd.DataFrame = pd.read_feather(feather_filename)
        # Feather may persist a stale 'index' column; drop it.
        if 'index' in frame.columns:
            frame.drop(columns='index', inplace=True)
        return frame

    if not os.path.isfile(filename):
        return None

    # Slow path: tab-separated CSV using comma as decimal separator.
    frame = pd.read_csv(
        filename, sep='\t', header=0, decimal=',', index_col=0, engine='c'
    )

    # Best effort: any caching failure is deliberately ignored.
    with contextlib.suppress(Exception):
        logger.info("caching to FEATHER file")
        store_feather(feather_filename, frame)

    return frame
Beispiel #7
0
def store_co_occurrences(*,
                         filename: str,
                         co_occurrences: CoOccurrenceDataFrame,
                         store_feather: bool = True) -> None:
    """Store co-occurrence result data to CSV-file (if loaded).

    A ``zip`` target gets an inner CSV archive member named after the
    file's stem. Optionally also writes an lz4 feather twin (best effort).
    """
    if co_occurrences is None:
        return

    if not filename.endswith('zip'):
        compression = 'infer'
    else:
        compression = {
            'method': 'zip',
            'archive_name': f"{strip_path_and_extension(filename)}.csv",
        }

    logger.info("storing co-occurrences (CSV)")
    co_occurrences.to_csv(
        filename, sep='\t', header=True, compression=compression, decimal=','
    )

    if not store_feather:
        return

    # Feather write is best effort; failures are deliberately ignored.
    with contextlib.suppress(Exception):
        logger.info("storing co-occurrences (feather)")
        feather_path = replace_extension(filename, ".feather")
        co_occurrences.reset_index(drop=True).to_feather(feather_path,
                                                         compression="lz4")
Beispiel #8
0
def write_payload(folder: str, payload: DocumentPayload) -> DocumentPayload:
    """Write the payload content to *folder* as an lz4 feather file."""
    target: str = jj(folder, replace_extension(payload.filename, ".feather"))
    # Feather requires a default RangeIndex, hence the reset.
    frame = payload.content.reset_index(drop=True)
    frame.to_feather(target, compression="lz4")
    return payload
Beispiel #9
0
    def store(self, filename: str) -> "Token2Id":
        """Store the dictionary as a tab-separated CSV inside a zip archive.

        Also stores term frequencies alongside via ``store_tf``. Returns
        self to allow call chaining.
        """
        member_name = replace_extension(strip_paths(filename), ".csv")
        csv_data = self.to_dataframe().to_csv(sep='\t', header=True)
        with zipfile.ZipFile(filename, mode='w',
                             compression=zipfile.ZIP_DEFLATED) as archive:
            archive.writestr(member_name, data=csv_data)

        self.store_tf(filename)

        return self
Beispiel #10
0
def prepare_train_corpus(
    input_filename,
    pos_includes,
    pos_excludes,
    chunk_size,
    lemmatize,
    lower,
    remove_stopwords,
    min_word_length,
    keep_symbols,
    keep_numerals,
    version,
):
    """Prepares a training corpus from a Sparv XML archive.

    Extracts tagged tokens from `input_filename`, applies token
    transforms, stores the result in a timestamped zip archive, and
    writes the compute options to a sibling JSON file.
    """
    # Token-level transforms; `remove_stopwords` doubles as the language
    # selector (None disables stopword removal).
    transform_opts: TokensTransformOpts = TokensTransformOpts(
        to_lower=lower,
        remove_stopwords=remove_stopwords is not None,
        language=remove_stopwords,
        min_len=min_word_length,
        max_len=None,
        keep_numerals=keep_numerals,
        keep_symbols=keep_symbols,
    )
    extract_opts = ExtractTaggedTokensOpts(
        pos_includes=pos_includes,
        pos_excludes=pos_excludes,
        lemmatize=lemmatize,
    )
    # Output name: <input>_text_<timestamp>.zip
    output_filename = replace_extension(
        timestamp_filename(suffix_filename(input_filename, "text")), 'zip')

    reader_opts = {
        'chunk_size': chunk_size,
    }

    sparv_corpus.sparv_xml_extract_and_store(
        source=input_filename,
        target=output_filename,
        version=version,
        extract_opts=extract_opts,
        reader_opts=reader_opts,
        transform_opts=transform_opts,
    )

    # Persist the options used, for reproducibility.
    store_options_to_json_file(
        input_filename,
        output_filename,
        transform_opts,
        dict(version=version,
             extract_tokens_opts=extract_opts,
             reader_opts=reader_opts),
    )
Beispiel #11
0
def store_options_to_json_file(input_filename, output_filename,
                               transform_opts: TokensTransformOpts,
                               sparv_extract_opts):
    """Persist corpus preparation options as JSON next to the output file.

    The options file shares the output filename with a ``json`` extension.
    """
    store_options = {
        'input': input_filename,
        'output': output_filename,
        'transform_opts': transform_opts.props,
        'sparv_extract_opts': sparv_extract_opts,
    }

    store_options_filename = replace_extension(output_filename, 'json')
    # Fix: write JSON as UTF-8 explicitly; without `encoding` the bytes
    # depend on the platform's locale encoding.
    with open(store_options_filename, 'w', encoding='utf-8') as json_file:
        json.dump(store_options, json_file)
Beispiel #12
0
def smart_load(filename: str,
               *,
               missing_ok: bool = False,
               feather_pipe: Callable[[pd.DataFrame], pd.DataFrame] = None,
               **kwargs) -> pd.DataFrame:
    """Load a DataFrame, preferring the feather twin of *filename*.

    Falls back to reading the CSV with ``CSV_OPTS``. When neither file
    exists, returns None if *missing_ok*, otherwise raises
    FileNotFoundError. A loaded feather frame is piped through
    *feather_pipe* (with **kwargs) when given.
    """
    feather_filename: str = pu.replace_extension(filename, "feather")
    if isfile(feather_filename):
        data: pd.DataFrame = pd.read_feather(feather_filename)
        if feather_pipe is not None:
            data = data.pipe(feather_pipe, **kwargs)
    elif isfile(filename):
        data = pd.read_csv(filename, **CSV_OPTS)
    else:
        if missing_ok:
            return None
        # Fix: report the missing path (the f-string had no placeholder
        # and raised an unhelpful constant message).
        raise FileNotFoundError(filename)
    return data
Beispiel #13
0
    def _store_csv(self, target_folder: str) -> None:
        """Write each model frame to its own zipped CSV in *target_folder*.

        Frames that are None are skipped.
        """
        frames: list[tuple[pd.DataFrame, str]] = [
            (self.document_index.rename_axis(''), 'documents.csv'),
            (self.dictionary, 'dictionary.csv'),
            (self.topic_token_weights, 'topic_token_weights.csv'),
            (self.topic_token_overview, 'topic_token_overview.csv'),
            (self.document_topic_weights, 'document_topic_weights.csv'),
            (self.topic_diagnostics, 'topic_diagnostics.csv'),
            (self.token_diagnostics, 'token_diagnostics.csv'),
        ]

        for frame, name in frames:
            if frame is None:
                continue
            zip_path = jj(target_folder, pu.replace_extension(name, ".zip"))
            pu.pandas_to_csv_zip(zip_path, (frame, name), extension="csv", sep='\t')
Beispiel #14
0
def test_pipeline_can_can_be_saved_in_feather(config: CorpusConfig):
    """Round-trip every pipeline payload through an lz4 feather file."""
    tagged_corpus_source: str = os.path.join(
        CORPUS_FOLDER, 'legal_instrument_five_docs_test_pos_csv.zip')

    pipeline = CorpusPipeline(config=config).checkpoint(
        tagged_corpus_source, force_checkpoint=False)

    for payload in pipeline.resolve():
        tagged_frame: pd.DataFrame = payload.content

        target = os.path.join(
            OUTPUT_FOLDER, replace_extension(payload.filename, ".feather"))

        tagged_frame.reset_index(drop=True).to_feather(target,
                                                       compression="lz4")

        assert os.path.isfile(target)

        round_tripped = pd.read_feather(target)

        assert round_tripped is not None
Beispiel #15
0
def store_feather(filename: str,
                  co_occurrences: CoOccurrenceDataFrame) -> None:
    """Stores co-occurrences in Apache Arrow feather file format"""
    target: str = replace_extension(filename, ".feather")
    # Feather requires a default RangeIndex, hence the reset.
    co_occurrences.reset_index(drop=True).to_feather(target, compression="lz4")
Beispiel #16
0
def payload_exists(folder: str, payload: DocumentPayload) -> bool:
    """Return True when the payload's feather file exists in *folder*.

    Fix: the return annotation claimed ``DocumentPayload`` but the
    function has always returned ``os.path.isfile``'s bool.
    """
    filename = jj(folder, replace_extension(payload.filename, ".feather"))
    return os.path.isfile(filename)
Beispiel #17
0
def options_filename(folder: str, tag: str) -> str:
    """Return the JSON options filename for a co-occurrence bundle."""
    base = co_occurrence_filename(folder, tag)
    return replace_extension(base, 'json')
Beispiel #18
0
 def process_payload(self, payload: DocumentPayload) -> DocumentPayload:
     """Delegate to the parent, then persist the content as feather."""
     payload = super().process_payload(payload)
     target = jj(self.folder, replace_extension(payload.filename, 'feather'))
     payload.content.to_feather(target)
     return payload