Example #1
0
def test_load_and_store_bundle():
    """Load the VENUS test bundle and store it again under a new folder and tag.

    Checks the types of the loaded bundle's parts and that its `folder` and
    `tag` match the values the filename was built from. The bundle is then
    stored with tag 'MARS' in a uniquely named output folder, and the test
    asserts that the file named by `to_filename` for that folder/tag exists.
    """

    filename = co_occurrence.to_filename(folder='./tests/test_data/VENUS',
                                         tag='VENUS')
    target_folder: str = f'./tests/output/{uuid.uuid4()}'

    bundle: co_occurrence.Bundle = co_occurrence.Bundle.load(filename)

    assert bundle is not None
    assert isinstance(bundle.corpus, VectorizedCorpus)
    assert isinstance(bundle.co_occurrences, pd.DataFrame)
    assert isinstance(bundle.compute_options, dict)
    assert bundle.folder == './tests/test_data/VENUS'
    assert bundle.tag == 'VENUS'

    os.makedirs(target_folder)

    # Clean up in `finally` so that a failing store or assertion does not
    # leave the scratch folder behind in ./tests/output.
    try:
        expected_filename = co_occurrence.to_filename(folder=target_folder,
                                                      tag='MARS')

        bundle.store(folder=target_folder, tag='MARS')

        assert os.path.isfile(expected_filename)
    finally:
        shutil.rmtree(target_folder, ignore_errors=True)
Example #2
0
def test_compute_and_store_bundle():
    """Compute a bundle from a very simple corpus and verify that it is stored.

    A random UUID is used both as the bundle tag and as the name of the
    output folder. After `bundle.store()` the test asserts that the file
    named by `to_filename` for that folder/tag exists.
    """

    tag: str = f'{uuid.uuid4()}'

    target_folder: str = jj(OUTPUT_FOLDER, tag)
    target_filename: str = co_occurrence.to_filename(folder=target_folder,
                                                     tag=tag)

    os.makedirs(target_folder, exist_ok=True)

    # Clean up in `finally` so that a failure in the pipeline, the store or
    # the assertion does not leave the scratch folder behind.
    try:
        simple_corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDEFG_3DOCS)
        context_opts: co_occurrence.ContextOpts = co_occurrence.ContextOpts(
            concept={'g'}, ignore_concept=False, context_width=2)
        bundle: co_occurrence.Bundle = test_utils.create_simple_bundle_by_pipeline(
            data=simple_corpus,
            context_opts=context_opts,
            folder=target_folder,
            tag=tag,
        )

        bundle.store()

        assert os.path.isfile(target_filename)
    finally:
        shutil.rmtree(target_folder, ignore_errors=True)
Example #3
0
def test_load_options():
    """`load_options` returns a value for the VENUS test bundle's filename."""
    bundle_filename: str = co_occurrence.to_filename(
        folder='./tests/test_data/VENUS',
        tag='VENUS',
    )

    loaded = co_occurrence.load_options(bundle_filename)

    assert loaded is not None
Example #4
0
def test_folder_and_tag_to_filename():
    """`to_filename` yields '<folder>/<tag>' followed by FILENAME_POSTFIX."""
    actual: str = co_occurrence.to_filename(
        folder='./tests/test_data/dummy',
        tag='dummy',
    )

    wanted: str = f'./tests/test_data/dummy/dummy{co_occurrence.FILENAME_POSTFIX}'

    assert actual == wanted
Example #5
0
def test_load_co_occurrences():
    """The VENUS co-occurrence frame loads with the expected size and value total."""
    source: str = co_occurrence.to_filename(
        folder='./tests/test_data/VENUS',
        tag='VENUS',
    )

    frame: pd.DataFrame = co_occurrence.load_co_occurrences(source)

    assert frame is not None
    # Row count and sum of the `value` column for the VENUS test data.
    assert len(frame) == 16399
    assert frame.value.sum() == 125197
Example #6
0
    def command_line(self, script: str) -> str:
        """Build the command line that invokes `script` with this object's options.

        Each entry of `self.command_line_options()` is rendered by value type:

        - bool: the key alone, and only when the value is True
        - str: `key "value"`
        - int: `key value`
        - list/tuple/set: one `key "item"` per item
        - anything else: skipped, with a warning

        Args:
            script: Name or path of the script to run; placed first in the command.

        Returns:
            The script, the options, the config filename "doit.yml",
            `self.corpus_source`, the target filename and `self.corpus_tag`,
            separated by single spaces.
        """

        options: List[str] = []

        for key, value in self.command_line_options().items():
            # bool is tested first since it is a subclass of int.
            if isinstance(value, bool):
                # A switch belongs on the command line only when it is set;
                # emitting the key for False would turn the option on.
                if value:
                    options.append(key)
            elif isinstance(value, str):
                options.append(f"{key} \"{value}\"")
            elif isinstance(value, int):
                options.append(f"{key} {value}")
            elif isinstance(value, (list, tuple, set)):
                # Multi-valued option: repeat the key once per item.
                options.extend(f"{key} \"{v}\"" for v in value)
            else:
                logger.warning(f"skipped option {key} {value}")

        config_filename: str = "doit.yml"
        target_filename: str = to_filename(folder=self.target_folder,
                                           tag=self.corpus_tag)
        command: str = (
            f"{script} {' '.join(options)} {config_filename} {self.corpus_source} {target_filename} {self.corpus_tag}"
        )

        return command
Example #7
0
def bundle() -> co_occurrence.Bundle:
    """Return the SSI test bundle, loaded with `compute_frame=False`."""
    ssi_filename = co_occurrence.to_filename(
        folder='./tests/test_data/SSI',
        tag='SSI',
    )
    return co_occurrence.Bundle.load(ssi_filename, compute_frame=False)
def load_bundle(folder: str, tag: str):
    """Load the bundle identified by `folder` and `tag` with `compute_frame=False`."""
    return Bundle.load(
        to_filename(folder=folder, tag=tag),
        compute_frame=False,
    )
Example #9
0
def create_bundle(tag: str = 'DUMMY') -> Bundle:
    """Load the test bundle named `tag` from `./tests/test_data/<tag>`.

    The bundle is loaded with `compute_frame=False`.
    """
    source: str = to_filename(folder=f'./tests/test_data/{tag}', tag=tag)
    return Bundle.load(source, compute_frame=False)
Example #10
0
def create_bundle() -> Bundle:
    """Return the VENUS test bundle, loaded with `compute_frame=False`."""
    venus_filename: str = to_filename(
        folder='./tests/test_data/VENUS',
        tag='VENUS',
    )
    return Bundle.load(venus_filename, compute_frame=False)
Example #11
0
def bundle():
    """Return the SSI test bundle, loaded with `compute_frame=False`."""
    source = to_filename(folder='./tests/test_data/SSI', tag='SSI')
    loaded: Bundle = Bundle.load(source, compute_frame=False)
    return loaded
Example #12
0
def compute(
    args: interface.ComputeOpts,
    corpus_config: pipeline.CorpusConfig,
    tagged_corpus_source: Optional[str] = None,
) -> co_occurrence.Bundle:
    """Creates and stores a concept co-occurrence bundle using specified options.

    Args:
        args: Compute options. NOTE: `args.extract_opts` is modified in place
            (passthrough tokens, block tokens and the global TF threshold
            settings are overwritten from `args`).
        corpus_config: Corpus configuration that supplies the
            "tagged_frame_pipeline".
        tagged_corpus_source: Tagged corpus source. When falsy, defaults to
            `<corpus_tag><POS_TAGGED_FRAME_FILENAME_POSTFIX>` in the folder of
            `args.corpus_source`.

    Returns:
        The computed bundle, after it has been stored.

    Raises:
        AssertionError: If `args.is_satisfied()` returns a falsy value.
        co_occurrence.ZeroComputeError: If the pipeline yields a bundle
            without a corpus.
        Exception: Any other error is logged and re-raised unchanged.
    """

    try:

        # Validate explicitly instead of with `assert`, which is stripped when
        # Python runs with -O. AssertionError is kept so that callers see the
        # same exception type as before.
        if not args.is_satisfied():
            raise AssertionError("compute options are not satisfied")

        target_filename = co_occurrence.to_filename(folder=args.target_folder,
                                                    tag=args.corpus_tag)

        os.makedirs(args.target_folder, exist_ok=True)

        tagged_corpus_source = tagged_corpus_source or jj(
            dirname(args.corpus_source),
            f"{args.corpus_tag}{POS_TAGGED_FRAME_FILENAME_POSTFIX}")

        tagged_frame_pipeline: pipeline.CorpusPipeline = corpus_config.get_pipeline(
            "tagged_frame_pipeline",
            corpus_source=args.corpus_source,
            tagged_corpus_source=tagged_corpus_source,
            enable_checkpoint=args.enable_checkpoint,
            force_checkpoint=args.force_checkpoint,
        )

        # The extract options are adjusted before the co-occurrence pipeline
        # is built from them below.
        args.extract_opts.passthrough_tokens = args.context_opts.concept
        args.extract_opts.block_tokens = []
        # args.extract_opts.block_chars = ''
        args.extract_opts.global_tf_threshold = args.tf_threshold
        args.extract_opts.global_tf_threshold_mask = args.tf_threshold_mask

        p: pipeline.CorpusPipeline = (
            tagged_frame_pipeline +
            pipeline.wildcard_to_partition_by_document_co_occurrence_pipeline(
                transform_opts=args.transform_opts,
                extract_opts=args.extract_opts,
                context_opts=args.context_opts,
                global_tf_threshold=args.tf_threshold,
            ))

        bundle: co_occurrence.Bundle = p.value()

        if bundle.corpus is None:
            raise co_occurrence.ZeroComputeError()

        bundle.tag = args.corpus_tag
        bundle.folder = args.target_folder

        try:
            bundle.co_occurrences = bundle.corpus.to_co_occurrences(
                bundle.token2id)
        except ValueError as ex:
            # Best effort: a failure here is logged and the bundle is still
            # stored, without `co_occurrences` being assigned by this function.
            logger.error("fatal: to_co_occurrences failed (skipping)")
            logger.exception(ex)

        bundle.compute_options = compile_compute_options(args, target_filename)

        bundle.store()

        return bundle

    except Exception as ex:
        # Single boundary handler: log, then propagate unchanged to the caller.
        logger.error(ex)
        raise