Example #1
0
def create_many_family_mock_event(files, parser=None):
    """Build a mock event containing one single-file Family per entry.

    Args:
        files: Iterable of entries. Each entry is either a file path, or a
            dict with the keys 'family_id' and 'filename'.
        parser: Forwarded unchanged to ``Family.add_group`` for every family.

    Returns:
        A dict with the single key 'family_batch' holding the FamilyBatch.
    """
    # TODO: this will break for matio
    fam_batch = FamilyBatch()

    for entry in files:
        # Reset per entry: an explicit id supplied by one dict entry must not
        # leak onto later entries that do not carry their own id.
        family_id = None
        path = entry

        if isinstance(entry, dict):
            family_id = str(entry['family_id'])
            path = entry['filename']

        family = Family()
        family.download_type = "LOCAL"
        family.add_group(files=[{'path': path, 'metadata': dict()}],
                         parser=parser)

        # Only override the family's id when the caller supplied one.
        if family_id is not None:
            family.family_id = family_id
        fam_batch.add_family(family)

    return {'family_batch': fam_batch}
Example #2
0
def create_mock_event(files, parser=None):
    """Build a mock event with a single Family grouping all of *files*.

    Args:
        files: Iterable of file paths; all of them go into one group.
        parser: Forwarded unchanged to ``Family.add_group``.

    Returns:
        A dict with the single key 'family_batch' holding the FamilyBatch.
    """
    fam_batch = FamilyBatch()

    family = Family()
    # Set once, up front, so the family is marked LOCAL even when *files*
    # is empty (previously this was assigned inside the per-file loop).
    family.download_type = "LOCAL"

    group_file_objs = [{'path': path, 'metadata': dict()} for path in files]
    family.add_group(files=group_file_objs, parser=parser)
    fam_batch.add_family(family)

    return {'family_batch': fam_batch}
Example #3
0
    # Execute the extractor on our family_batch.
    xtra.execute_extractions(family_batch=event['family_batch'],
                             input_type=str)

    # All metadata are held in XtractAgent's memory. Flush to disk!
    xtra.flush_metadata_to_files(writer='json')

    return xtra.get_completion_stats()


# Local smoke test: pack one file into a Family / FamilyBatch, wrap the batch
# in an event dict, and run extract_tabular on it.
mock_event = dict()

test_fam_1 = Family()
# NOTE(review): test_fam_2 is created but not used in the visible script.
test_fam_2 = Family()

# Machine-specific absolute path; adjust to your own checkout.
base_path = "/Users/tylerskluzacek/xtract-sdk/tests/xtract-tabular/tests/test_files"
test_fam_1.add_group(files=[{
    'path': os.path.join(base_path, 'comma_delim'),
    'metadata': dict()
}],
                     parser=None)
test_fam_1.download_type = "LOCAL"
print(test_fam_1.to_dict())

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)
mock_event['family_batch'] = fam_batch

# Run the extractor on the mock event and print whatever it returns.
data = extract_tabular(mock_event)
print(data)
Example #4
0
# Configuration passed to extractor.create_event() below.
# All paths are machine-specific; adjust to your environment.
ep_name = "test_tabular_ep"
xtract_dir = "/Users/tylerskluzacek/.xtract"
# Note: this is the path to my local checkout of the git repo 'xtract-xpcs'
sys_path_add = "/Users/tylerskluzacek/PycharmProjects/xtract-xpcs"
module_path = "gather_xpcs_metadata"  # The file containing 'execute_extractor'
# NOTE(review): recursion_depth is not referenced in the visible script.
recursion_depth = 5000
metadata_write_path = "/Users/tylerskluzacek/Desktop/test_metadata"

# HERE WE PACK LOCAL FAMILIES INTO SAME STRUCTURES AS USED BY XTRACT.
test_fam_1 = Family()
# NOTE(review): test_fam_2 is created but not used in the visible script.
test_fam_2 = Family()

base_path = '/Users/tylerskluzacek/A001_00004_Vol20_att1_Rq0_0001-100000.hdf'
test_fam_1.add_group(files=[{
    'path': base_path,
    'metadata': dict()
}],
                     parser=None)
test_fam_1.download_type = "LOCAL"
print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}")

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)

# Build the event for the XPCS extractor from the batch and the settings above.
extractor = XPCSExtractor()
event = extractor.create_event(family_batch=fam_batch,
                               ep_name=ep_name,
                               xtract_dir=xtract_dir,
                               module_path=module_path,
                               sys_path_add=sys_path_add,
                               metadata_write_path=metadata_write_path)
Example #5
0
# ****** JOAO LOCAL TESTS ******
# All paths below are machine-specific; adjust to your environment.
ep_name = "test_images_ep"
xtract_dir = "/Users/joaovictor/.xtract"
# Note: this is the path to my local checkout of the git repo 'xtract-images'
sys_path_add = "/Users/joaovictor/xtract/xtract-images"
module_path = "xtract_images_main"  # The file containing 'execute_extractor'
# NOTE(review): recursion_depth is not referenced in the visible script.
recursion_depth = 5000
metadata_write_path = "/Users/joaovictor/Desktop/test_metadata"

# HERE WE PACK LOCAL FAMILIES INTO SAME STRUCTURES AS USED BY XTRACT.
test_fam_1 = Family()
# NOTE(review): test_fam_2 is created but not used in the visible script.
test_fam_2 = Family()
base_path = "/Users/joaovictor/xtract/xtract-images/training_data/"
test_fam_1.download_type = "LOCAL"
# Three separate single-file groups are added to the same family.
test_fam_1.add_group(files=[{'path': os.path.join(base_path, 'graphics/social.png'), 'metadata': dict()}], parser=None)
test_fam_1.add_group(files=[{'path': os.path.join(base_path, 'maps/Fig1.jpg.png'), 'metadata': dict()}], parser=None)
test_fam_1.add_group(files=[{'path': os.path.join(base_path, 'maps/311.jpg'), 'metadata': dict()}], parser=None)

print(test_fam_1)

print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}")

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)

# ****** TEST AND RUN THE EXTRACTOR ON YOUR FAMILYBATCH() *******
extractor = ImagesExtractor()
# Debug aid: list the attributes available on the extractor object.
print(dir(extractor))

event = extractor.create_event(family_batch=fam_batch,
from extractors.utils.base_extractor import base_extractor

# ****** JOAO LOCAL TESTS ******
# All paths below are machine-specific; adjust to your environment.
ep_name = "test_keyword_ep"
xtract_dir = "/Users/joaovictor/.xtract"
# Note: this is the path to my local checkout of the git repo 'xtract-keyword'
sys_path_add = "/Users/joaovictor/xtract/xtract-keyword"
module_path = "xtract_keyword_main"  # The file containing 'execute_extractor'
metadata_write_path = "/Users/joaovictor/Desktop/test_metadata"

# HERE WE PACK LOCAL FAMILIES INTO SAME STRUCTURES AS USED BY XTRACT.
test_fam_1 = Family()
# NOTE(review): test_fam_2 is created but not used in the visible script.
test_fam_2 = Family()

base_path = "/Users/joaovictor/xtract/xtract-keyword/tests/test_files"
test_fam_1.add_group(files=[{'path': os.path.join(base_path, 'freetext2'), 'metadata': dict()}], parser=None)
test_fam_1.download_type = "LOCAL"
# print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}")

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)

# # ****** TEST AND RUN THE EXTRACTOR ON YOUR FAMILYBATCH() *******
# Build the event for the keyword extractor from the batch and settings above.
extractor = KeywordExtractor()
event = extractor.create_event(family_batch=fam_batch,
                               ep_name=ep_name,
                               xtract_dir=xtract_dir,
                               module_path=module_path,
                               sys_path_add=sys_path_add,
                               metadata_write_path=metadata_write_path)
Example #7
0
            pickle.dump(creds, token)

    return creds


# Create the downloader using the credentials returned by do_login_flow().
gdr = GoogleDriveDownloader(auth_creds=do_login_flow())

# Google Drive file ids used as the 'path' of each file below.
file_1 = "1RbSdH_nI0EHvxFswpl1Qss7CyWXBHo-o"  # JPG image!
file_2 = "1ecjFs55sNxBiwoAtztHcoA450Gh7ak0m9VqK0Wrm1Ms"  # free text document

fam_1 = Family()
# TODO: Put the Google Drive arguments into a "gdrive_cfg" sub-dictionary.
fam_1.add_group(files=[{
    'path': file_1,
    'is_gdoc': False,
    'metadata': {},
    'mimeType': 'image/jpg'
}],
                parser='image')

fam_2 = Family()
fam_2.add_group(files=[{
    'path': file_2,
    'is_gdoc': True,
    'metadata': {},
    'mimeType': 'text/plain'
}],
                parser='keyword')

# NOTE(review): only fam_1 is added in the visible lines; fam_2 is built above
# but is not added to the batch here.
fam_batch = FamilyBatch()
fam_batch.add_family(fam_1)
Example #8
0
# Settings for a local run of the Python extractor.
# All paths below are machine-specific; adjust to your environment.
ep_name = "test_python_ep"
xtract_dir = "/Users/joaovictor/.xtract"
# Note: this is the path to my local checkout of the git repo 'xtract-python'
sys_path_add = "/Users/joaovictor/xtract/xtract-python"
module_path = "xtract_python_main"  # The file containing 'execute_extractor'
metadata_write_path = "/Users/joaovictor/Desktop/test_metadata"

# HERE WE PACK LOCAL FAMILIES INTO SAME STRUCTURES AS USED BY XTRACT.
test_fam_1 = Family()
# NOTE(review): test_fam_2 is created but not used in the visible script.
test_fam_2 = Family()

base_path = "/Users/joaovictor/xtract/xtract-python"
test_fam_1.add_group(files=[{
    'path':
    os.path.join(base_path, 'tests/test_files/multi_line.py'),
    'metadata':
    dict()
}],
                     parser=None)
test_fam_1.download_type = "LOCAL"
print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}")

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)

# # ****** TEST AND RUN THE EXTRACTOR ON YOUR FAMILYBATCH() *******
extractor = PythonExtractor()
event = extractor.create_event(family_batch=fam_batch,
                               ep_name=ep_name,
                               xtract_dir=xtract_dir,
                               module_path=module_path,
Example #9
0
import os

from xtract_sdk.xtract import XtractAgent
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch

# Create an agent for a local endpoint (machine-specific xtract_dir).
xag = XtractAgent(ep_name='tyler_test',
                  xtract_dir='/Users/tylerskluzacek/.xtract')

# One family with a single three-file group handled by the 'dft' parser.
fam_to_process = Family(download_type='LOCAL', base_url="")
base_path = '/Users/tylerskluzacek/data_folder/413cafa0-9b43-4ffb-9c54-4834dd265a46'
fam_to_process.add_group(files=[{
    'path': os.path.join(base_path, 'INCAR'),
    'metadata': {}
}, {
    'path': os.path.join(base_path, 'OUTCAR'),
    'metadata': {}
}, {
    'path': os.path.join(base_path, 'POSCAR'),
    'metadata': {}
}],
                         parser='dft')
# The agent is handed the dict form of the family, not the Family object.
fam_to_process = fam_to_process.to_dict()

xag.load_family(fam_to_process)
xag.fetch_all_files()

# Print every entry the agent exposes via ready_families.
for item in xag.ready_families:
    print(item)
Example #10
0
from extractors.utils.base_extractor import base_extractor

# ****** JOAO LOCAL TESTS ******
# All paths below are machine-specific; adjust to your environment.
ep_name = "test_netcdf_ep"
xtract_dir = "/Users/joaovictor/.xtract"
# Note: this is the path to my local checkout of the git repo 'xtract-netcdf'
sys_path_add = "/Users/joaovictor/xtract/xtract-netcdf"
module_path = "xtract_netcdf_main"  # The file containing 'execute_extractor'
metadata_write_path = "/Users/joaovictor/Desktop/test_metadata"

# HERE WE PACK LOCAL FAMILIES INTO SAME STRUCTURES AS USED BY XTRACT.
test_fam_1 = Family()
# NOTE(review): test_fam_2 is created but not used in the visible script.
test_fam_2 = Family()

base_path = "/Users/joaovictor/xtract/xtract-netcdf/tests/test_files/"
test_fam_1.add_group(files=[{'path': os.path.join(base_path, 'sresa1b_ncar_ccsm3-example.nc'), 'metadata': dict()}], parser=None)
test_fam_1.download_type = "LOCAL"
print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}")

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)

# # ****** TEST AND RUN THE EXTRACTOR ON YOUR FAMILYBATCH() *******
# Build the event for the NetCDF extractor from the batch and settings above.
extractor = NetCDFExtractor()
event = extractor.create_event(family_batch=fam_batch,
                               ep_name=ep_name,
                               xtract_dir=xtract_dir,
                               module_path=module_path,
                               sys_path_add=sys_path_add,
                               metadata_write_path=metadata_write_path)
from funcx import FuncXClient
from extractors.xtract_tabular import tabular_extract
import time

from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch

# Identifiers used as the 'path' of each file below.
# presumably Google Drive file ids (family_2 uses download_type "GDRIVE") — verify
file_id = "1XCS2Xqu35TiQgCpI8J8uu4Mss9FNnp1-AuHo-pMujb4"
file_id2 = "0B5nDSpS9a_3kUFdiTXRFdS12QUk"

family_1 = Family()
family_2 = Family()

family_1.add_group(files=[{
    'path': file_id,
    'is_gdoc': True,
    'mimeType': "text/csv"
}],
                   parser='xtract-tabular')
# NOTE(review): family_1 sets base_url but no download_type, while family_2
# sets download_type only — confirm this asymmetry is intended.
family_1.base_url = ""

family_2.add_group(files=[{
    'path': file_id2,
    'is_gdoc': False
}],
                   parser='xtract-tabular')
family_2.download_type = "GDRIVE"

# Both families go into one batch.
fam_batch = FamilyBatch()
fam_batch.add_family(family_1)
fam_batch.add_family(family_2)
Example #12
0
from extractors.utils.base_extractor import base_extractor

# Settings for a local run of the JSON/XML extractor.
# All paths below are machine-specific; adjust to your environment.
ep_name = "test_jsonxml_ep"
xtract_dir = "/Users/joaovictor/.xtract"
sys_path_add = "/Users/joaovictor/xtract/xtract-jsonxml"
module_path = "xtract_jsonxml_main"
metadata_write_path = "/Users/joaovictor/Desktop/test_metadata"

# HERE WE PACK LOCAL FAMILIES INTO SAME STRUCTURES AS USED BY XTRACT.
test_fam_1 = Family()
# NOTE(review): test_fam_2 is created but not used in the visible script.
test_fam_2 = Family()

base_path = "/Users/joaovictor/xtract/xtract-jsonxml/test_files"
test_fam_1.add_group(files=[{
    'path': os.path.join(base_path, 'RY9405.xml'),
    'metadata': dict()
}],
                     parser=None)
test_fam_1.download_type = "LOCAL"
# print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}")

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)

# # ****** TEST AND RUN THE EXTRACTOR ON YOUR FAMILYBATCH() *******
extractor = JsonXMLExtractor()
event = extractor.create_event(family_batch=fam_batch,
                               ep_name=ep_name,
                               xtract_dir=xtract_dir,
                               module_path=module_path,
                               sys_path_add=sys_path_add,
Example #13
0
# ****** JOAO LOCAL TESTS ******
# All paths below are machine-specific; adjust to your environment.
ep_name = "test_hdf_ep"
xtract_dir = "/Users/joaovictor/.xtract"
# Note: this is the path to my local checkout of the git repo 'xtract-hdf'
sys_path_add = "/Users/joaovictor/xtract/xtract-hdf"
module_path = "xtract_hdf_main"  # The file containing 'execute_extractor'
metadata_write_path = "/Users/joaovictor/Desktop/test_metadata"

# HERE WE PACK LOCAL FAMILIES INTO SAME STRUCTURES AS USED BY XTRACT.
test_fam_1 = Family()
# NOTE(review): test_fam_2 is created but not used in the visible script.
test_fam_2 = Family()

base_path = "/Users/joaovictor/xtract/xtract-hdf/"
test_fam_1.add_group(files=[{
    'path': os.path.join(base_path, 'local_test.hdf5'),
    'metadata': dict()
}],
                     parser=None)
test_fam_1.download_type = "LOCAL"
print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}")

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)

# # ****** TEST AND RUN THE EXTRACTOR ON YOUR FAMILYBATCH() *******
extractor = HDFExtractor()
event = extractor.create_event(family_batch=fam_batch,
                               ep_name=ep_name,
                               xtract_dir=xtract_dir,
                               module_path=module_path,
                               sys_path_add=sys_path_add,
from funcx import FuncXClient
from extractors.xtract_images import images_extract
import time

from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch

# Module-level batch that main() submits as the function payload.
fam_1 = Family()
fam_batch = FamilyBatch()
# NOTE(review): fam_1 is added to the batch before its group is attached
# below — confirm FamilyBatch.add_family does not snapshot the family's files
# at insertion time.
fam_batch.add_family(fam_1)

fam_1.add_group(files=[{"path": '/home/skluzacek/i_spy.jpeg', "is_gdoc": False, "mimeType": "image/jpg", "metadata": {}}], parser='image')


def test(event):
    """Return the value of the ``container_version`` environment variable.

    ``event`` is accepted but not used. Raises ``KeyError`` when the variable
    is not set.
    """
    # The import stays function-local, as in the original implementation.
    from os import environ
    return environ['container_version']


def main(fxc, ep_id):
    """Register the images container and extractor, then submit one run.

    Args:
        fxc: Client object providing ``register_container``,
            ``register_function`` and ``run``.
        ep_id: Endpoint id the run is submitted to.
    """
    container_uuid = fxc.register_container('xtract-images.img', 'singularity')
    print("Container UUID: {}".format(container_uuid))

    # TODO: We do not need ep id at registration time; it is only passed to
    # run() below.
    registration_kwargs = dict(
        container_uuid=container_uuid,
        description="New sum function defined without string spec",
    )
    fn_uuid = fxc.register_function(images_extract, **registration_kwargs)
    print("FN_UUID : ", fn_uuid)

    # Payload handed to the registered function: the module-level batch, no
    # credentials, and downloading disabled.
    payload = {
        'family_batch': fam_batch,
        'creds': None,
        'download_file': False,
    }
    res = fxc.run(payload, endpoint_id=ep_id, function_id=fn_uuid)
Example #15
0
    def group(self, file_ls: List[dict]):
        """Given list of metadata dicts, output one family dict per file.

        NOTE FOR THIS GROUPER :: 1 file = 1 family = 1 group = 1 file

        Each dict in ``file_ls`` is read for its 'extension', 'mimeType' and
        'id' keys and is updated in place with an 'extractor' key (an
        extractor name, or None when nothing applies).

        Returns:
            A list with one ``Family.to_dict()`` result per input dict, in
            input order.

        Raises:
            ValueError: If ``self.by_file`` is falsy.
        """
        if not self.by_file:
            raise ValueError(
                "Unable to process groups of more than 1 file by extension!")

        # Per-category counters.
        # NOTE(review): tallied but neither returned nor logged in this method.
        crawl_tallies = {
            "text": 0,
            "tabular": 0,
            "images": 0,
            "presentation": 0,
            "other": 0,
            "hierarch": 0,
            "compressed": 0
        }

        families = []

        mappings = self.get_mappings()

        for fdict in file_ls:

            valid_mapping = False
            mimeType = None
            # Lowercase once per file instead of once per mapping.
            extension = fdict['extension'].lower()

            for mapping, extensions in mappings.items():

                if extension in extensions:
                    # TODO: this will eventually need to be a list of extractors.
                    # When several mappings match, the last one wins.
                    fdict['extractor'] = mapping  # mapping = extractor_name!
                    valid_mapping = True
                    mimeType = fdict["mimeType"]

                    # .get() so an extractor name missing from the tally table
                    # is counted instead of raising KeyError.
                    crawl_tallies[mapping] = crawl_tallies.get(mapping, 0) + 1

            if not valid_mapping:
                # No extension match: fall back to the Google-apps mimeType.
                mimeType = fdict["mimeType"]
                if 'vnd.google-apps.document' in mimeType:
                    fdict['extractor'] = "text"
                    mimeType = "text/plain"
                    crawl_tallies["text"] += 1
                elif 'vnd.google-apps.spreadsheet' in mimeType:
                    fdict['extractor'] = "tabular"
                    mimeType = "text/csv"
                    crawl_tallies['tabular'] += 1
                elif 'vnd.google-apps.presentation' in mimeType:
                    # fdict['extractor'] = "text"  # TODO: this should come back soon.
                    fdict['extractor'] = None
                    mimeType = None
                    crawl_tallies['presentation'] += 1
                    # TODO from Will: " slides: text, tabular, images, BERT... order is not important"
                else:
                    # Now we default to None
                    fdict['extractor'] = None
                    mimeType = None
                    crawl_tallies['other'] += 1

            family = Family()

            # The file's own metadata dict is attached as the group metadata.
            family.add_group(files=[{
                "path": fdict["id"],
                "metadata": fdict,
                "mimeType": mimeType
            }],
                             parser=fdict["extractor"])

            families.append(family.to_dict())

        return families
Example #16
0
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch
from xtract_sdk.downloaders.google_drive import GoogleDriveDownloader
import time
import pickle, os

# TODO: extract from a list of families
# Two families with string ids "0" and "1", identical headers, no metadata.
fam = Family(str(0), headers={'potato': 'tomato'}, metadata=None)
# NOTE(review): fam2 is not used in the visible part of this script.
fam2 = Family(str(1), headers={'potato': 'tomato'}, metadata=None)

# First group on `fam`: three placeholder paths; keep whatever add_group
# returns in group_id.
group_id = fam.add_group(files=[{
    'path': 'a',
    'metadata': {}
}, {
    'path': 'b',
    'metadata': {}
}, {
    'path': 'c',
    'metadata': {}
}],
                         parser="camel")
group_id2 = fam.add_group(files=[{
    'path': 'c',
    'metadata': {}
}, {
    'path': 'd',
    'metadata': {}
}, {
    'path': 'e',
    'metadata': {}
}],