def test_load_and_store_bundle():
    """Load the VENUS test bundle, check its parts, and store it under a new tag.

    The bundle is stored with tag ``MARS`` in a uniquely named folder below
    ``./tests/output``. That folder is removed again even when storing or the
    final assertion fails, so failed runs do not leave output folders behind.
    """
    filename = co_occurrence.to_filename(folder='./tests/test_data/VENUS', tag='VENUS')
    target_folder: str = f'./tests/output/{uuid.uuid4()}'

    bundle: co_occurrence.Bundle = co_occurrence.Bundle.load(filename)

    # The loaded bundle must expose all of its parts with the expected types.
    assert bundle is not None
    assert isinstance(bundle.corpus, VectorizedCorpus)
    assert isinstance(bundle.co_occurrences, pd.DataFrame)
    assert isinstance(bundle.compute_options, dict)
    assert bundle.folder == './tests/test_data/VENUS'
    assert bundle.tag == 'VENUS'

    os.makedirs(target_folder)
    try:
        expected_filename = co_occurrence.to_filename(folder=target_folder, tag='MARS')
        bundle.store(folder=target_folder, tag='MARS')
        assert os.path.isfile(expected_filename)
    finally:
        # Always clean up: without the finally, a failing store() or assert
        # would leave the uuid-named folder on disk.
        shutil.rmtree(target_folder, ignore_errors=True)
def test_compute_and_store_bundle():
    """Compute a bundle from a small corpus via the pipeline helper and store it.

    A fresh uuid is used both as tag and as the output sub-folder name. The
    folder is removed again even when the computation, the store or the final
    assertion fails, so failed runs do not leave output folders behind.
    """
    tag: str = f'{uuid.uuid4()}'
    target_folder: str = jj(OUTPUT_FOLDER, tag)
    target_filename: str = co_occurrence.to_filename(folder=target_folder, tag=tag)

    os.makedirs(target_folder, exist_ok=True)
    try:
        simple_corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDEFG_3DOCS)
        context_opts: co_occurrence.ContextOpts = co_occurrence.ContextOpts(
            concept={'g'},
            ignore_concept=False,
            context_width=2,
        )

        bundle: co_occurrence.Bundle = test_utils.create_simple_bundle_by_pipeline(
            data=simple_corpus,
            context_opts=context_opts,
            folder=target_folder,
            tag=tag,
        )

        # store() is called without arguments; the bundle was created with
        # the target folder and tag, and the file is expected at the name
        # derived from those.
        bundle.store()

        assert os.path.isfile(target_filename)
    finally:
        # Always clean up: without the finally, any failure above would leave
        # the uuid-named folder on disk.
        shutil.rmtree(target_folder, ignore_errors=True)
def test_load_options():
    """``load_options`` returns a value for the VENUS test bundle filename."""
    bundle_filename: str = co_occurrence.to_filename(
        folder='./tests/test_data/VENUS',
        tag='VENUS',
    )

    loaded_options = co_occurrence.load_options(bundle_filename)

    assert loaded_options is not None
def test_folder_and_tag_to_filename():
    """``to_filename`` yields ``<folder>/<tag><FILENAME_POSTFIX>``."""
    expected_filename: str = f'./tests/test_data/dummy/dummy{co_occurrence.FILENAME_POSTFIX}'

    actual_filename: str = co_occurrence.to_filename(
        folder='./tests/test_data/dummy',
        tag='dummy',
    )

    assert actual_filename == expected_filename
def test_load_co_occurrences():
    """The VENUS co-occurrence frame loads with the expected size and total."""
    source_filename = co_occurrence.to_filename(folder='./tests/test_data/VENUS', tag='VENUS')

    frame: pd.DataFrame = co_occurrence.load_co_occurrences(source_filename)

    assert frame is not None
    # Row count and sum of the ``value`` column of the stored test data.
    assert len(frame) == 16399
    assert frame.value.sum() == 125197
def command_line(self, script: str) -> str:
    """Build the command line string that invokes *script* with these options.

    Each entry of ``self.command_line_options()`` is rendered by value type:

    * ``bool``: the key alone (a flag)
    * ``str``: ``key "value"``
    * ``int``: ``key value``
    * ``list`` / ``tuple`` / ``set``: one ``key "item"`` per item
    * anything else: skipped with a logged warning

    The options are followed by the config filename, ``self.corpus_source``,
    the target filename derived from ``self.target_folder`` and
    ``self.corpus_tag``, and finally ``self.corpus_tag``.

    Args:
        script: The script name that starts the command line.

    Returns:
        The assembled command line.
    """
    rendered: List[str] = []

    for name, setting in self.command_line_options().items():
        # bool must be tested before int, since bool is a subclass of int.
        if isinstance(setting, bool):
            # NOTE(review): the flag is emitted whatever the boolean's value
            # is; presumably command_line_options() only reports enabled
            # flags. Confirm against its implementation.
            rendered.append(name)
            continue

        if isinstance(setting, str):
            rendered.append(f'{name} "{setting}"')
            continue

        if isinstance(setting, int):
            rendered.append(f"{name} {setting}")
            continue

        if isinstance(setting, (list, tuple, set)):
            rendered.extend(f'{name} "{item}"' for item in setting)
            continue

        logger.warning(f"skipped option {name} {setting}")

    config_file: str = "doit.yml"
    output_file: str = to_filename(folder=self.target_folder, tag=self.corpus_tag)

    return f"{script} {' '.join(rendered)} {config_file} {self.corpus_source} {output_file} {self.corpus_tag}"
def bundle() -> co_occurrence.Bundle:
    """Load the SSI test bundle from ``./tests/test_data/SSI``.

    The bundle is loaded with ``compute_frame=False``.
    """
    return co_occurrence.Bundle.load(
        co_occurrence.to_filename(folder='./tests/test_data/SSI', tag='SSI'),
        compute_frame=False,
    )
def load_bundle(folder: str, tag: str):
    """Load the bundle identified by *folder* and *tag*.

    The filename is derived with ``to_filename`` and the bundle is loaded
    with ``compute_frame=False``.
    """
    return Bundle.load(to_filename(folder=folder, tag=tag), compute_frame=False)
def create_bundle(tag: str = 'DUMMY') -> Bundle:
    """Load the test bundle stored in ``./tests/test_data/<tag>``.

    Args:
        tag: Used both as the sub-folder name and as the bundle tag.

    Returns:
        The bundle, loaded with ``compute_frame=False``.
    """
    bundle_path = to_filename(folder=f'./tests/test_data/{tag}', tag=tag)
    return Bundle.load(bundle_path, compute_frame=False)
def create_bundle() -> Bundle:
    """Load the VENUS test bundle with ``compute_frame=False``."""
    location = {'folder': './tests/test_data/VENUS', 'tag': 'VENUS'}
    loaded: Bundle = Bundle.load(to_filename(**location), compute_frame=False)
    return loaded
def bundle():
    """Return the SSI test bundle, loaded with ``compute_frame=False``."""
    ssi_filename = to_filename(folder='./tests/test_data/SSI', tag='SSI')
    return Bundle.load(ssi_filename, compute_frame=False)
def compute(
    args: interface.ComputeOpts,
    corpus_config: pipeline.CorpusConfig,
    tagged_corpus_source: Optional[str] = None,
) -> co_occurrence.Bundle:
    """Run the co-occurrence pipeline described by *args* and store the bundle.

    Args:
        args: Compute options. ``args.is_satisfied()`` must be truthy. Note
            that ``args.extract_opts`` is modified in place: its passthrough
            tokens are set to the concept tokens, its block tokens are
            cleared, and its global TF threshold fields are set from *args*.
        corpus_config: Provides the ``"tagged_frame_pipeline"`` that the
            co-occurrence pipeline is appended to.
        tagged_corpus_source: Tagged corpus source. When falsy, a filename is
            derived from the directory of ``args.corpus_source``,
            ``args.corpus_tag`` and ``POS_TAGGED_FRAME_FILENAME_POSTFIX``.

    Returns:
        The computed bundle, after it has been stored.

    Raises:
        AssertionError: If ``args.is_satisfied()`` is falsy.
        co_occurrence.ZeroComputeError: If the pipeline result has no corpus.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        assert args.is_satisfied()

        bundle_filename = co_occurrence.to_filename(folder=args.target_folder, tag=args.corpus_tag)

        os.makedirs(args.target_folder, exist_ok=True)

        if not tagged_corpus_source:
            tagged_corpus_source = jj(
                dirname(args.corpus_source),
                f"{args.corpus_tag}{POS_TAGGED_FRAME_FILENAME_POSTFIX}",
            )

        base_pipeline: pipeline.CorpusPipeline = corpus_config.get_pipeline(
            "tagged_frame_pipeline",
            corpus_source=args.corpus_source,
            tagged_corpus_source=tagged_corpus_source,
            enable_checkpoint=args.enable_checkpoint,
            force_checkpoint=args.force_checkpoint,
        )

        # Concept tokens are passed through the extraction step; no tokens
        # are blocked, and the TF threshold settings are copied from args.
        args.extract_opts.passthrough_tokens = args.context_opts.concept
        args.extract_opts.block_tokens = []
        args.extract_opts.global_tf_threshold = args.tf_threshold
        args.extract_opts.global_tf_threshold_mask = args.tf_threshold_mask

        co_occurrence_step = pipeline.wildcard_to_partition_by_document_co_occurrence_pipeline(
            transform_opts=args.transform_opts,
            extract_opts=args.extract_opts,
            context_opts=args.context_opts,
            global_tf_threshold=args.tf_threshold,
        )
        full_pipeline: pipeline.CorpusPipeline = base_pipeline + co_occurrence_step

        result: co_occurrence.Bundle = full_pipeline.value()

        if result.corpus is None:
            raise co_occurrence.ZeroComputeError()

        result.tag = args.corpus_tag
        result.folder = args.target_folder

        try:
            result.co_occurrences = result.corpus.to_co_occurrences(result.token2id)
        except ValueError as ex:
            # Best effort: a failed conversion is logged and the bundle is
            # still stored with whatever co_occurrences it already holds.
            logger.error("fatal: to_co_occurrences failed (skipping)")
            logger.exception(ex)

        result.compute_options = compile_compute_options(args, bundle_filename)
        result.store()

        return result

    except Exception as ex:
        # Every failure is logged once and then propagated to the caller.
        logger.error(ex)
        raise