# Execute the extractor on our family_batch. xtra.execute_extractions(family_batch=event['family_batch'], input_type=str) # All metadata are held in XtractAgent's memory. Flush to disk! xtra.flush_metadata_to_files(writer='json') return xtra.get_completion_stats() mock_event = dict() test_fam_1 = Family() test_fam_2 = Family() base_path = "/Users/tylerskluzacek/xtract-sdk/tests/xtract-tabular/tests/test_files" test_fam_1.add_group(files=[{ 'path': os.path.join(base_path, 'comma_delim'), 'metadata': dict() }], parser=None) test_fam_1.download_type = "LOCAL" print(test_fam_1.to_dict()) fam_batch = FamilyBatch() fam_batch.add_family(test_fam_1) mock_event['family_batch'] = fam_batch data = extract_tabular(mock_event) print(data)
def group(self, file_ls: List[str]):
    """Given a list of file-metadata dicts, assign an extractor to each file.

    NOTE FOR THIS GROUPER :: 1 file = 1 family = 1 group = 1 file

    Each input dict must carry at least the keys 'extension', 'mimeType',
    and 'id'.  Files whose extension appears in a known mapping get that
    mapping's name as their extractor; unmatched Google-Apps mimeTypes are
    special-cased; everything else falls back to extractor=None.

    Args:
        file_ls: list of file-metadata dicts (mutated in place: an
            'extractor' key is added to each).

    Returns:
        A list of single-file Family objects serialized via to_dict().

    Raises:
        ValueError: if self.by_file is False -- this grouper only supports
            one file per group.
    """
    # Guard first: this grouper cannot batch multiple files into one group.
    if not self.by_file:
        raise ValueError(
            "Unable to process groups of more than 1 file by extension!")

    # Per-category counters for crawl statistics.
    # NOTE(review): tallied but never read or returned anywhere in this
    # method -- wire these into the crawl stats or delete them.
    crawl_tallies = {
        "text": 0,
        "tabular": 0,
        "images": 0,
        "presentation": 0,
        "other": 0,
        "hierarch": 0,
        "compressed": 0
    }

    families = []
    mappings = self.get_mappings()

    for fdict in file_ls:
        valid_mapping = False
        mimeType = None

        # First pass: match by file extension against the known mappings.
        # NOTE(review): intentionally no break -- if an extension appears in
        # several mappings, the LAST match wins and each match bumps a tally.
        for mapping in mappings:
            if fdict['extension'].lower() in mappings[mapping]:
                # TODO: this will eventually need to be a list of extractors.
                fdict['extractor'] = mapping  # mapping = extractor_name!
                valid_mapping = True
                mimeType = fdict["mimeType"]
                crawl_tallies[mapping] += 1

        # Second pass: no extension match -- special-case Google-Apps
        # mimeTypes, otherwise default to no extractor at all.
        if not valid_mapping:
            mimeType = fdict["mimeType"]
            if 'vnd.google-apps.document' in mimeType:
                fdict['extractor'] = "text"
                mimeType = "text/plain"
                crawl_tallies["text"] += 1
            elif 'vnd.google-apps.spreadsheet' in mimeType:
                fdict['extractor'] = "tabular"
                mimeType = "text/csv"
                crawl_tallies['tabular'] += 1
            elif 'vnd.google-apps.presentation' in mimeType:
                # fdict['extractor'] = "text"  # TODO: this should come back soon.
                fdict['extractor'] = None
                mimeType = None
                crawl_tallies['presentation'] += 1
                # TODO from Will: " slides: text, tabular, images, BERT...
                # order is not important"
            else:
                # Now we default to None.
                fdict['extractor'] = None
                mimeType = None
                crawl_tallies['other'] += 1

        # 1 file -> 1 group -> 1 family.
        family = Family()
        family.add_group(files=[{
            "path": fdict["id"],
            "metadata": fdict,
            "mimeType": mimeType
        }], parser=fdict["extractor"])
        families.append(family.to_dict())

    return families
import os

from xtract_sdk.xtract import XtractAgent
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch

# Smoke test: stage a three-file LOCAL family with the 'dft' parser and pull
# its files down through an XtractAgent, then show which families are ready.
xag = XtractAgent(ep_name='tyler_test',
                  xtract_dir='/Users/tylerskluzacek/.xtract')

fam_to_process = Family(download_type='LOCAL', base_url="")

# NOTE: developer-local fixture directory.
base_path = '/Users/tylerskluzacek/data_folder/413cafa0-9b43-4ffb-9c54-4834dd265a46'

# One group holding the three input files, all handled by the 'dft' parser.
fam_to_process.add_group(
    files=[{'path': os.path.join(base_path, fname), 'metadata': {}}
           for fname in ('INCAR', 'OUTCAR', 'POSCAR')],
    parser='dft')

# The agent consumes the serialized (dict) form of the family.
fam_to_process = fam_to_process.to_dict()
xag.load_family(fam_to_process)
xag.fetch_all_files()

for item in xag.ready_families:
    print(item)
    'path': 'v',
    'metadata': {}
}], parser="potato")
# NOTE(review): the three lines above are the tail of an add_group(...) call
# whose opening bracket lies before this chunk.

# add_group should hand back the new group's id as a string.
assert type(
    group_id) is str, "fam.add_group is not returning an id of type str"

print(type(fam.files))

# A family's .files must aggregate the file entries of all its groups.
assert sorted([item["path"] for item in fam.files]) == ['a', 'b', 'c', 'd', 'e'], \
    "fam.files not properly inheriting group.files"
assert sorted([item["path"] for item in fam2.files]) == ['v', 'w', 'x', 'y', 'z'], \
    "fam.files not properly inheriting group.files"

# Here we test if going to_dict and from_dict leads us to our original family object.
dict_fam = fam.to_dict()
back_to_reg_fam = Family(download_type="gdrive")
back_to_reg_fam.from_dict(dict_fam)

assert fam.family_id == back_to_reg_fam.family_id, "to_dict -> from_dict family_ids do not match"
# NOTE(review): message below says "family_ids" but the check is download_type
# (copy-paste); fix the string in a behavior-affecting change, not here.
assert fam.download_type == back_to_reg_fam.download_type, "to_dict -> from_dict family_ids do not match"

print(fam.files)
print(back_to_reg_fam.files)
assert fam.files == back_to_reg_fam.files

# Round-tripped groups must preserve id membership, metadata, parser, files.
# NOTE(review): "from_dic" in the message below is a typo in a runtime string.
for group in back_to_reg_fam.groups:
    assert group in fam.groups, "to_dict -> from_dic group_ids do not map"
    assert fam.groups[group].metadata == back_to_reg_fam.groups[group].metadata
    assert fam.groups[group].parser == back_to_reg_fam.groups[group].parser
    assert fam.groups[group].files == back_to_reg_fam.groups[group].files