def test_procurement_boundary(self):
    """A ProvenanceEntry referenced across a linked.art boundary is serialized
    as a bare reference: its embedded `classified_as` data must be stripped."""
    vocab.add_linked_art_boundary_check()
    activity = model.Activity()
    entry = vocab.ProvenanceEntry()
    activity.caused = entry
    serialized = factory.toJSON(activity)
    # The referenced entry should be a stub, without its classification embedded.
    self.assertTrue('classified_as' not in serialized['caused'][0])
def rewrite_output_files(r, update_filename=False, parallel=False, concurrency=4, path=None, files=None, **kwargs):
    """Rewrite JSON output files in place using the rewriter `r`.

    Parameters:
        r: rewriter object passed through to `_rewrite_output_files`.
        update_filename: if True, workers may rename files as they rewrite them.
        parallel: if True, fan the work out over a multiprocessing pool.
        concurrency: number of worker processes used when `parallel` is True.
        path: directory to scan for `*.json` files; defaults to the module-level
            `output_file_path` when neither `path` nor `files` is given.
        files: explicit iterable of files to rewrite; overrides directory scanning.
        **kwargs: extra options forwarded to the workers (e.g. `content_filter_re`).
    """
    print('Rewriting JSON output files')
    vocab.add_linked_art_boundary_check()
    vocab.add_attribute_assignment_check()
    if not files:
        if path is None:
            path = output_file_path
        files = Path(path).rglob('*.json')
    files = list(files)
    if 'content_filter_re' in kwargs:
        print(f'rewriting with content filter: {kwargs["content_filter_re"]}')
    if parallel:
        # Partition the file list into contiguous chunks, one batch per starmap task;
        # clamp the chunk size to [10, 25000] so tiny and huge inputs both behave.
        partition_size = max(min(25000, len(files) // concurrency), 10)
        file_partitions = list(chunks(files, partition_size))
        args = [
            (file_partition, r, update_filename, i + 1, len(file_partitions), kwargs)
            for i, file_partition in enumerate(file_partitions)
        ]
        print(f'{len(args)} worker partitions with size {partition_size}')
        # Use the pool as a context manager so worker processes are always
        # terminated and reaped, even if a worker raises (the original code
        # never closed/joined the pool, leaking its processes).
        with multiprocessing.Pool(concurrency) as pool:
            pool.starmap(_rewrite_output_files, args)
    else:
        _rewrite_output_files(files, r, update_filename, 1, 1, kwargs)
def run(self, **options):
    """Install linked.art serialization checks, run the parent pipeline, then
    generate the previous/post-sales data from the resulting post-sale map."""
    vocab.add_linked_art_boundary_check()
    vocab.add_attribute_assignment_check()
    services = self.get_services(**options)
    super().run(services=services, **options)
    self.generate_prev_post_sales_data(services['post_sale_map'])
def test_linguistic_object_boundary(self):
    """Linguistic objects cross the linked.art boundary as bare references,
    while statements attached via `referred_to_by` stay fully embedded."""
    vocab.add_linked_art_boundary_check()
    journal = vocab.JournalText(label="journal")
    issue = vocab.IssueText(label="issue")
    issue.part_of = journal
    issue.referred_to_by = vocab.MaterialStatement(content="Statement")
    serialized = factory.toJSON(issue)
    # The journal is a separate record: only a stub, no classification embedded.
    self.assertTrue('classified_as' not in serialized['part_of'][0])
    # The statement is embedded in full, including its nested classifications.
    statement = serialized['referred_to_by'][0]
    self.assertTrue('content' in statement)
    self.assertTrue('type' in statement['classified_as'][0]['classified_as'][0])
def test_boundary_setter(self):
    """Toggling `factory.linked_art_boundaries` controls whether a referenced
    Person is serialized as a bare stub (True) or embedded in full (False).

    The flag is a module-global; it is saved and restored in a try/finally so
    this test cannot leak state into other tests (the original left it False).
    """
    vocab.add_linked_art_boundary_check()
    p = model.Person()
    p2 = model.Person()
    n = model.Name()
    n.content = "Test"
    p2.identified_by = n
    p.exact_match = p2
    saved_flag = factory.linked_art_boundaries
    try:
        # Boundaries on: the name attached to p2 must NOT appear under p.
        factory.linked_art_boundaries = True
        js = factory.toJSON(p)
        self.assertTrue('identified_by' not in js['exact_match'][0])
        # Boundaries off: the full p2 record, including its name, is embedded.
        factory.linked_art_boundaries = False
        js = factory.toJSON(p)
        self.assertTrue('identified_by' in js['exact_match'][0])
    finally:
        # Restore the global so later tests see the flag they expect.
        factory.linked_art_boundaries = saved_flag
def run(self, **options):
    """Install linked.art serialization checks and delegate to the parent
    pipeline with the resolved services."""
    vocab.add_linked_art_boundary_check()
    vocab.add_attribute_assignment_check()
    resolved = self.get_services(**options)
    super().run(services=resolved, **options)
project = sys.argv[1] pipe = importlib.import_module(f'pipeline.projects.{project}') Pipeline = pipe.Pipeline sys.argv = [sys.argv[0], *sys.argv[2:]] ### Run the Pipeline if __name__ == '__main__': if settings.DEBUG: LIMIT = int(os.environ.get('GETTY_PIPELINE_LIMIT', 1)) PACK_SIZE = 1 else: LIMIT = int(os.environ.get('GETTY_PIPELINE_LIMIT', 10000000)) PACK_SIZE = 10000000 vocab.add_linked_art_boundary_check() print_dot = False if 'dot' in sys.argv[1:]: print_dot = True sys.argv[1:] = [a for a in sys.argv[1:] if a != 'dot'] parser = bonobo.get_argument_parser() with bonobo.parse_args(parser) as options: try: pipeline = Pipeline( output_path=settings.output_file_path, models=settings.arches_models, pack_size=PACK_SIZE, limit=LIMIT, debug=settings.DEBUG )