def create_many_family_mock_event(files, parser=None):
    """Build a mock event containing one single-file family per entry of *files*.

    Args:
        files: Iterable of entries. Each entry is either a file path, or a
            dict with the keys ``'filename'`` (the path) and ``'family_id'``
            (stringified and assigned to that entry's family).
        parser: Value forwarded as ``parser`` to ``Family.add_group`` for
            every file.

    Returns:
        dict: ``{'family_batch': <FamilyBatch>}`` holding one family per entry.
    """
    # TODO: this will break for matio
    fam_batch = FamilyBatch()

    for entry in files:
        # Reset for every entry: previously the id of an earlier dict entry
        # leaked onto every later plain-path entry.
        family_id = None
        if isinstance(entry, dict):
            family_id = str(entry['family_id'])
            path = entry['filename']
        else:
            path = entry

        family = Family()
        family.download_type = "LOCAL"
        family.add_group(files=[{'path': path, 'metadata': dict()}],
                         parser=parser)

        # Only override the family's own id when the caller supplied one.
        if family_id is not None:
            family.family_id = family_id
        fam_batch.add_family(family)

    return {'family_batch': fam_batch}
def create_mock_event(files, parser=None):
    """Build a mock event whose batch holds one family with one group of *files*.

    Args:
        files: Iterable of file paths; all of them go into a single group.
        parser: Value forwarded as ``parser`` to ``Family.add_group``.

    Returns:
        dict: ``{'family_batch': <FamilyBatch>}`` containing the one family.
    """
    fam_batch = FamilyBatch()
    family = Family()

    # Loop-invariant, so assign it once; this also guarantees the family is
    # marked LOCAL when *files* is empty.
    family.download_type = "LOCAL"

    group_file_objs = [{'path': path, 'metadata': dict()} for path in files]
    family.add_group(files=group_file_objs, parser=parser)
    fam_batch.add_family(family)

    return {'family_batch': fam_batch}
# Execute the extractor on our family_batch. xtra.execute_extractions(family_batch=event['family_batch'], input_type=str) # All metadata are held in XtractAgent's memory. Flush to disk! xtra.flush_metadata_to_files(writer='json') return xtra.get_completion_stats() mock_event = dict() test_fam_1 = Family() test_fam_2 = Family() base_path = "/Users/tylerskluzacek/xtract-sdk/tests/xtract-tabular/tests/test_files" test_fam_1.add_group(files=[{ 'path': os.path.join(base_path, 'comma_delim'), 'metadata': dict() }], parser=None) test_fam_1.download_type = "LOCAL" print(test_fam_1.to_dict()) fam_batch = FamilyBatch() fam_batch.add_family(test_fam_1) mock_event['family_batch'] = fam_batch data = extract_tabular(mock_event) print(data)
# ---- Settings handed to XPCSExtractor.create_event() below ----
ep_name = "test_tabular_ep"
xtract_dir = "/Users/tylerskluzacek/.xtract"
# Forwarded as `sys_path_add`; points at a local repository checkout.
sys_path_add = "/Users/tylerskluzacek/PycharmProjects/xtract-xpcs"
# Name of the file containing 'execute_extractor'.
module_path = "gather_xpcs_metadata"
recursion_depth = 5000
metadata_write_path = "/Users/tylerskluzacek/Desktop/test_metadata"

# ---- Pack a local file into the same structures Xtract uses ----
base_path = '/Users/tylerskluzacek/A001_00004_Vol20_att1_Rq0_0001-100000.hdf'

test_fam_1 = Family()
test_fam_2 = Family()  # instantiated but not used in this snippet

_xpcs_group_files = [{'path': base_path, 'metadata': {}}]
test_fam_1.add_group(files=_xpcs_group_files, parser=None)
test_fam_1.download_type = "LOCAL"
print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}")

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)

# ---- Build the event for the batch ----
extractor = XPCSExtractor()
_event_kwargs = {
    'family_batch': fam_batch,
    'ep_name': ep_name,
    'xtract_dir': xtract_dir,
    'module_path': module_path,
    'sys_path_add': sys_path_add,
    'metadata_write_path': metadata_write_path,
}
event = extractor.create_event(**_event_kwargs)
# ****** JOAO LOCAL TESTS ****** ep_name = "test_images_ep" xtract_dir = "/Users/joaovictor/.xtract" # Note: this is the following to my local version of the git repo 'xtract-images' sys_path_add = "/Users/joaovictor/xtract/xtract-images" module_path = "xtract_images_main" # The file containing 'execute_extractor' recursion_depth = 5000 metadata_write_path = "/Users/joaovictor/Desktop/test_metadata" # HERE WE PACK LOCAL FAMILIES INTO SAME STRUCTURES AS USED BY XTRACT. test_fam_1 = Family() test_fam_2 = Family() base_path = "/Users/joaovictor/xtract/xtract-images/training_data/" test_fam_1.download_type = "LOCAL" test_fam_1.add_group(files=[{'path': os.path.join(base_path, 'graphics/social.png'), 'metadata': dict()}], parser=None) test_fam_1.add_group(files=[{'path': os.path.join(base_path, 'maps/Fig1.jpg.png'), 'metadata': dict()}], parser=None) test_fam_1.add_group(files=[{'path': os.path.join(base_path, 'maps/311.jpg'), 'metadata': dict()}], parser=None) print(test_fam_1) print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}") fam_batch = FamilyBatch() fam_batch.add_family(test_fam_1) # ****** TEST AND RUN THE EXTRACTOR ON YOUR FAMILYBATCH() ******* extractor = ImagesExtractor() print(dir(extractor)) event = extractor.create_event(family_batch=fam_batch,
from extractors.utils.base_extractor import base_extractor

# ****** JOAO LOCAL TESTS ******
# Settings handed to KeywordExtractor.create_event() below.
ep_name = "test_keyword_ep"
xtract_dir = "/Users/joaovictor/.xtract"
# Forwarded as `sys_path_add`; local checkout of the 'xtract-keyword' repo.
sys_path_add = "/Users/joaovictor/xtract/xtract-keyword"
# Name of the file containing 'execute_extractor'.
module_path = "xtract_keyword_main"
metadata_write_path = "/Users/joaovictor/Desktop/test_metadata"

# Pack a local file into the same structures Xtract uses.
base_path = "/Users/joaovictor/xtract/xtract-keyword/tests/test_files"

test_fam_1 = Family()
test_fam_2 = Family()  # instantiated but not used in this snippet

_keyword_file = {'path': os.path.join(base_path, 'freetext2'), 'metadata': {}}
test_fam_1.add_group(files=[_keyword_file], parser=None)
test_fam_1.download_type = "LOCAL"
# print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}")

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)

# ****** TEST AND RUN THE EXTRACTOR ON YOUR FAMILYBATCH() *******
extractor = KeywordExtractor()
event = extractor.create_event(
    family_batch=fam_batch,
    ep_name=ep_name,
    xtract_dir=xtract_dir,
    module_path=module_path,
    sys_path_add=sys_path_add,
    metadata_write_path=metadata_write_path,
)
pickle.dump(creds, token) return creds gdr = GoogleDriveDownloader(auth_creds=do_login_flow()) file_1 = "1RbSdH_nI0EHvxFswpl1Qss7CyWXBHo-o" # JPG image! file_2 = "1ecjFs55sNxBiwoAtztHcoA450Gh7ak0m9VqK0Wrm1Ms" # free text document fam_1 = Family() # TODO: Put the Google Drive arguments into a "gdrive_cfg" sub-dicitonary. fam_1.add_group(files=[{ 'path': file_1, 'is_gdoc': False, 'metadata': {}, 'mimeType': 'image/jpg' }], parser='image') fam_2 = Family() fam_2.add_group(files=[{ 'path': file_2, 'is_gdoc': True, 'metadata': {}, 'mimeType': 'text/plain' }], parser='keyword') fam_batch = FamilyBatch() fam_batch.add_family(fam_1)
ep_name = "test_python_ep" xtract_dir = "/Users/joaovictor/.xtract" # Note: this is the following to my local version of the git repo 'xtract-keyword' sys_path_add = "/Users/joaovictor/xtract/xtract-python" module_path = "xtract_python_main" # The file containing 'execute_extractor' metadata_write_path = "/Users/joaovictor/Desktop/test_metadata" # HERE WE PACK LOCAL FAMILIES INTO SAME STRUCTURES AS USED BY XTRACT. test_fam_1 = Family() test_fam_2 = Family() base_path = "/Users/joaovictor/xtract/xtract-python" test_fam_1.add_group(files=[{ 'path': os.path.join(base_path, 'tests/test_files/multi_line.py'), 'metadata': dict() }], parser=None) test_fam_1.download_type = "LOCAL" print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}") fam_batch = FamilyBatch() fam_batch.add_family(test_fam_1) # # ****** TEST AND RUN THE EXTRACTOR ON YOUR FAMILYBATCH() ******* extractor = PythonExtractor() event = extractor.create_event(family_batch=fam_batch, ep_name=ep_name, xtract_dir=xtract_dir, module_path=module_path,
import os

from xtract_sdk.xtract import XtractAgent
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch

# Agent that will load the family and fetch its files.
xag = XtractAgent(ep_name='tyler_test',
                  xtract_dir='/Users/tylerskluzacek/.xtract')

fam_to_process = Family(download_type='LOCAL', base_url="")
base_path = '/Users/tylerskluzacek/data_folder/413cafa0-9b43-4ffb-9c54-4834dd265a46'

# One group containing the three files, all handled by the 'dft' parser.
_dft_files = [
    {'path': os.path.join(base_path, file_name), 'metadata': {}}
    for file_name in ('INCAR', 'OUTCAR', 'POSCAR')
]
fam_to_process.add_group(files=_dft_files, parser='dft')

# The agent is given the dict form of the family, not the Family object.
fam_to_process = fam_to_process.to_dict()
xag.load_family(fam_to_process)
xag.fetch_all_files()

# Print every family the agent reports as ready.
for item in xag.ready_families:
    print(item)
from extractors.utils.base_extractor import base_extractor

# ****** JOAO LOCAL TESTS ******
# Settings handed to NetCDFExtractor.create_event() below.
ep_name = "test_netcdf_ep"
xtract_dir = "/Users/joaovictor/.xtract"
# Forwarded as `sys_path_add`; local checkout of the 'xtract-netcdf' repo.
sys_path_add = "/Users/joaovictor/xtract/xtract-netcdf"
# Name of the file containing 'execute_extractor'.
module_path = "xtract_netcdf_main"
metadata_write_path = "/Users/joaovictor/Desktop/test_metadata"

# Pack a local file into the same structures Xtract uses.
base_path = "/Users/joaovictor/xtract/xtract-netcdf/tests/test_files/"

test_fam_1 = Family()
test_fam_2 = Family()  # instantiated but not used in this snippet

_netcdf_file = {
    'path': os.path.join(base_path, 'sresa1b_ncar_ccsm3-example.nc'),
    'metadata': {},
}
test_fam_1.add_group(files=[_netcdf_file], parser=None)
test_fam_1.download_type = "LOCAL"
print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}")

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)

# ****** TEST AND RUN THE EXTRACTOR ON YOUR FAMILYBATCH() *******
extractor = NetCDFExtractor()
event = extractor.create_event(
    family_batch=fam_batch,
    ep_name=ep_name,
    xtract_dir=xtract_dir,
    module_path=module_path,
    sys_path_add=sys_path_add,
    metadata_write_path=metadata_write_path,
)
from funcx import FuncXClient
from extractors.xtract_tabular import tabular_extract
import time
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch

# Identifiers of the two files; each one becomes its own family.
file_id = "1XCS2Xqu35TiQgCpI8J8uu4Mss9FNnp1-AuHo-pMujb4"
file_id2 = "0B5nDSpS9a_3kUFdiTXRFdS12QUk"

family_1 = Family()
family_2 = Family()

# First family: flagged as a gdoc with a CSV mime type; base_url is blanked.
_gdoc_file = {'path': file_id, 'is_gdoc': True, 'mimeType': "text/csv"}
family_1.add_group(files=[_gdoc_file], parser='xtract-tabular')
family_1.base_url = ""

# Second family: not a gdoc; download type set to GDRIVE.
_plain_file = {'path': file_id2, 'is_gdoc': False}
family_2.add_group(files=[_plain_file], parser='xtract-tabular')
family_2.download_type = "GDRIVE"

# Collect both families, in order, into a single batch.
fam_batch = FamilyBatch()
for _family in (family_1, family_2):
    fam_batch.add_family(_family)
from extractors.utils.base_extractor import base_extractor ep_name = "test_jsonxml_ep" xtract_dir = "/Users/joaovictor/.xtract" sys_path_add = "/Users/joaovictor/xtract/xtract-jsonxml" module_path = "xtract_jsonxml_main" metadata_write_path = "/Users/joaovictor/Desktop/test_metadata" # HERE WE PACK LOCAL FAMILIES INTO SAME STRUCTURES AS USED BY XTRACT. test_fam_1 = Family() test_fam_2 = Family() base_path = "/Users/joaovictor/xtract/xtract-jsonxml/test_files" test_fam_1.add_group(files=[{ 'path': os.path.join(base_path, 'RY9405.xml'), 'metadata': dict() }], parser=None) test_fam_1.download_type = "LOCAL" # print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}") fam_batch = FamilyBatch() fam_batch.add_family(test_fam_1) # # ****** TEST AND RUN THE EXTRACTOR ON YOUR FAMILYBATCH() ******* extractor = JsonXMLExtractor() event = extractor.create_event(family_batch=fam_batch, ep_name=ep_name, xtract_dir=xtract_dir, module_path=module_path, sys_path_add=sys_path_add,
# ****** JOAO LOCAL TESTS ****** ep_name = "test_hdf_ep" xtract_dir = "/Users/joaovictor/.xtract" # Note: this is the following to my local version of the git repo 'xtract-hdf' sys_path_add = "/Users/joaovictor/xtract/xtract-hdf" module_path = "xtract_hdf_main" # The file containing 'execute_extractor' metadata_write_path = "/Users/joaovictor/Desktop/test_metadata" # HERE WE PACK LOCAL FAMILIES INTO SAME STRUCTURES AS USED BY XTRACT. test_fam_1 = Family() test_fam_2 = Family() base_path = "/Users/joaovictor/xtract/xtract-hdf/" test_fam_1.add_group(files=[{ 'path': os.path.join(base_path, 'local_test.hdf5'), 'metadata': dict() }], parser=None) test_fam_1.download_type = "LOCAL" print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}") fam_batch = FamilyBatch() fam_batch.add_family(test_fam_1) # # ****** TEST AND RUN THE EXTRACTOR ON YOUR FAMILYBATCH() ******* extractor = HDFExtractor() event = extractor.create_event(family_batch=fam_batch, ep_name=ep_name, xtract_dir=xtract_dir, module_path=module_path, sys_path_add=sys_path_add,
from funcx import FuncXClient
from extractors.xtract_images import images_extract
import time
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch

# NOTE: the family is added to the batch BEFORE its group is attached; this
# ordering is kept exactly as written.
fam_1 = Family()
fam_batch = FamilyBatch()
fam_batch.add_family(fam_1)

_image_file = {
    "path": '/home/skluzacek/i_spy.jpeg',
    "is_gdoc": False,
    "mimeType": "image/jpg",
    "metadata": {},
}
fam_1.add_group(files=[_image_file], parser='image')


def test(event):
    """Return the value of the 'container_version' environment variable.

    *event* is accepted but not read. Raises KeyError if the variable is
    not set.
    """
    import os
    return os.environ['container_version']


def main(fxc, ep_id):
    """Register the images container and function on *fxc*, then run it.

    Args:
        fxc: Client object exposing register_container, register_function
            and run.
        ep_id: Endpoint id passed to ``fxc.run``.
    """
    container_uuid = fxc.register_container('xtract-images.img', 'singularity')
    print(f"Container UUID: {container_uuid}")

    # TODO: We do not need ep id here (ep_id is deliberately not passed).
    fn_uuid = fxc.register_function(
        images_extract,
        container_uuid=container_uuid,
        description="New sum function defined without string spec",
    )
    print("FN_UUID : ", fn_uuid)

    payload = {'family_batch': fam_batch, 'creds': None, 'download_file': False}
    res = fxc.run(payload, endpoint_id=ep_id, function_id=fn_uuid)
def group(self, file_ls: List[dict]):
    """Given list of metadata dicts, output a list of family dicts.

    NOTE FOR THIS GROUPER :: 1 file = 1 family = 1 group = 1 file

    Each file dict is mutated in place: its 'extractor' key is set to the
    name of the mapping whose extension list contains the file's extension
    (the last matching mapping wins). When no mapping matches, the
    extractor is chosen from the file's 'mimeType' (document -> "text",
    spreadsheet -> "tabular") and is None otherwise.

    Args:
        file_ls: File metadata dicts. Each must provide the keys
            'extension', 'mimeType' and 'id'.

    Returns:
        list: One ``Family.to_dict()`` result per input file, in input order.

    Raises:
        ValueError: If ``self.by_file`` is falsy.
    """
    if not self.by_file:
        raise ValueError(
            "Unable to process groups of more than 1 file by extension!")

    # Per-category counters. They are only accumulated here and are not
    # part of the return value.
    crawl_tallies = {
        "text": 0,
        "tabular": 0,
        "images": 0,
        "presentation": 0,
        "other": 0,
        "hierarch": 0,
        "compressed": 0,
    }

    families = []
    mappings = self.get_mappings()

    for fdict in file_ls:
        valid_mapping = False
        mimeType = None
        extension = fdict['extension'].lower()

        for mapping in mappings:
            if extension in mappings[mapping]:
                # TODO: this will eventually need to be a list of extractors.
                fdict['extractor'] = mapping  # mapping = extractor_name!
                valid_mapping = True
                mimeType = fdict["mimeType"]
                # .get() so a mapping name missing from the seeded tallies
                # is counted instead of raising KeyError.
                crawl_tallies[mapping] = crawl_tallies.get(mapping, 0) + 1

        if not valid_mapping:
            # No extension match: fall back to the file's mime type.
            mimeType = fdict["mimeType"]
            if 'vnd.google-apps.document' in mimeType:
                fdict['extractor'] = "text"
                mimeType = "text/plain"
                crawl_tallies["text"] += 1
            elif 'vnd.google-apps.spreadsheet' in mimeType:
                fdict['extractor'] = "tabular"
                mimeType = "text/csv"
                crawl_tallies['tabular'] += 1
            elif 'vnd.google-apps.presentation' in mimeType:
                # fdict['extractor'] = "text"
                # TODO: this should come back soon.
                fdict['extractor'] = None
                mimeType = None
                crawl_tallies['presentation'] += 1
                # TODO from Will: " slides: text, tabular, images, BERT... order is not important"
            else:
                # Now we default to None
                fdict['extractor'] = None
                mimeType = None
                crawl_tallies['other'] += 1

        # One family holding one group holding this one file.
        family = Family()
        family.add_group(files=[{
            "path": fdict["id"],
            "metadata": fdict,
            "mimeType": mimeType,
        }], parser=fdict["extractor"])
        families.append(family.to_dict())

    return families
from xtract_sdk.packagers.family import Family from xtract_sdk.packagers.family_batch import FamilyBatch from xtract_sdk.downloaders.google_drive import GoogleDriveDownloader import time import pickle, os # TODO: extract from a list of families fam = Family(str(0), headers={'potato': 'tomato'}, metadata=None) fam2 = Family(str(1), headers={'potato': 'tomato'}, metadata=None) group_id = fam.add_group(files=[{ 'path': 'a', 'metadata': {} }, { 'path': 'b', 'metadata': {} }, { 'path': 'c', 'metadata': {} }], parser="camel") group_id2 = fam.add_group(files=[{ 'path': 'c', 'metadata': {} }, { 'path': 'd', 'metadata': {} }, { 'path': 'e', 'metadata': {} }],