Example #1
def nothing_extract(event):

    """
    Function
    :param event (dict) -- contains auth header and list of HTTP links to extractable files:
    :return metadata (dict) -- metadata as gotten from the materials_io library:
    """

    import time

    import os
    import sys

    from xtract_sdk.packagers.family import Family
    from xtract_sdk.packagers.family_batch import FamilyBatch

    t0 = time.time()

    sys.path.insert(1, '/')

    # The family batch arrives either as a FamilyBatch object or as its dict form.
    all_families = event['family_batch']

    if isinstance(all_families, dict):
        family_batch = FamilyBatch()
        for family in all_families["families"]:
            fam = Family()
            fam.from_dict(family)
            family_batch.add_family(fam)
        all_families = family_batch

    for family in all_families.families:
        family_id = family.family_id
        fam_files = family.files
        headers = family.headers

        for file_obj in fam_files:

            # new_path = os.path.join(family_id, local_filename)

            # Open and close each file ten times; this "nothing" extractor does no real work.
            for _ in range(10):
                with open(file_obj['path'], 'r'):
                    pass

    t1 = time.time()

    return {"family_batch": all_families,
            "container_version": os.environ["container_version"],
            "transfer_time": 0,
            "import_time": 0,
            "family_fetch_time": 0,
            "file_unpack_time": 0,
            "full_extract_loop_time": 0,
            "total_time": t1 - t0
            }
Example #2
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch


def create_mock_event(files, parser=None):
    mock_event = dict()

    fam_batch = FamilyBatch()

    test_fam_1 = Family()
    group_file_objs = []

    for file in files:

        base_path = file
        group_file_objs.append({'path': base_path, 'metadata': dict()})
        test_fam_1.download_type = "LOCAL"

    test_fam_1.add_group(files=group_file_objs, parser=parser)
    fam_batch.add_family(test_fam_1)

    mock_event['family_batch'] = fam_batch
    return mock_event
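A minimal usage sketch (the file paths here are hypothetical; create_mock_event never opens them):

sample_event = create_mock_event(['/tmp/sample_a.txt', '/tmp/sample_b.txt'])
print(len(sample_event['family_batch'].families))  # -> 1, a single Family holding both files
# nothing_extract(sample_event) from Example #1 could then consume this event, provided the
# files exist locally and a "container_version" environment variable is set.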
Example #3
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch


def create_many_family_mock_event(files, parser=None):
    # TODO: this will break for matio
    mock_event = dict()

    fam_batch = FamilyBatch()
    family_id = None

    for file in files:

        if isinstance(file, dict):
            family_id = str(file['family_id'])
            file = file['filename']

        test_fam_1 = Family()
        group_file_objs = []

        base_path = file
        group_file_objs.append({'path': base_path, 'metadata': dict()})
        test_fam_1.download_type = "LOCAL"

        test_fam_1.add_group(files=group_file_objs, parser=parser)
        if family_id is not None:
            test_fam_1.family_id = family_id
        fam_batch.add_family(test_fam_1)

    mock_event['family_batch'] = fam_batch
    return mock_event
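In contrast to Example #2, this helper wraps each input file in its own single-file Family; a quick sketch (paths and family_id are hypothetical):

many_event = create_many_family_mock_event(['/tmp/a.csv',
                                            {'family_id': '42', 'filename': '/tmp/b.csv'}])
print(len(many_event['family_batch'].families))  # -> 2, one Family per input file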
Example #4
import os

from extractors.xtract_imagesort import ImagesExtractor
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch
from extractors.utils.base_extractor import base_extractor

# ****** JOAO LOCAL TESTS ******
ep_name = "test_images_ep"
xtract_dir = "/Users/joaovictor/.xtract"
# Note: this is the path to my local clone of the git repo 'xtract-images'
sys_path_add = "/Users/joaovictor/xtract/xtract-images"
module_path = "xtract_images_main"  # The file containing 'execute_extractor'
recursion_depth = 5000
metadata_write_path = "/Users/joaovictor/Desktop/test_metadata"

# HERE WE PACK LOCAL FAMILIES INTO SAME STRUCTURES AS USED BY XTRACT.
test_fam_1 = Family()
test_fam_2 = Family()
base_path = "/Users/joaovictor/xtract/xtract-images/training_data/"
test_fam_1.download_type = "LOCAL"
test_fam_1.add_group(files=[{'path': os.path.join(base_path, 'graphics/social.png'), 'metadata': dict()}], parser=None)
test_fam_1.add_group(files=[{'path': os.path.join(base_path, 'maps/Fig1.jpg.png'), 'metadata': dict()}], parser=None)
test_fam_1.add_group(files=[{'path': os.path.join(base_path, 'maps/311.jpg'), 'metadata': dict()}], parser=None)

print(test_fam_1)

print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}")

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)

# ****** TEST AND RUN THE EXTRACTOR ON YOUR FAMILYBATCH() *******
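A sketch of the step this example stops short of, following the create_event pattern shown in Example #6 (the local paths and names defined above are assumed to be valid on this machine):

extractor = ImagesExtractor()
event = extractor.create_event(family_batch=fam_batch,
                               ep_name=ep_name,
                               xtract_dir=xtract_dir,
                               sys_path_add=sys_path_add,
                               module_path=module_path,
                               metadata_write_path=metadata_write_path)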
Example #5
import os
from extractors.xtract_keyword import KeywordExtractor
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch
from extractors.utils.base_extractor import base_extractor

# ****** JOAO LOCAL TESTS ******
ep_name = "test_keyword_ep"
xtract_dir = "/Users/joaovictor/.xtract"
# Note: this is the path to my local clone of the git repo 'xtract-keyword'
sys_path_add = "/Users/joaovictor/xtract/xtract-keyword"
module_path = "xtract_keyword_main"  # The file containing 'execute_extractor'
metadata_write_path = "/Users/joaovictor/Desktop/test_metadata"

# HERE WE PACK LOCAL FAMILIES INTO SAME STRUCTURES AS USED BY XTRACT.
test_fam_1 = Family()
test_fam_2 = Family()

base_path = "/Users/joaovictor/xtract/xtract-keyword/tests/test_files"
test_fam_1.add_group(files=[{'path': os.path.join(base_path, 'freetext2'), 'metadata': dict()}], parser=None)
test_fam_1.download_type = "LOCAL"
# print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}")

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)

# # ****** TEST AND RUN THE EXTRACTOR ON YOUR FAMILYBATCH() *******
extractor = KeywordExtractor()
event = extractor.create_event(family_batch=fam_batch,
                               ep_name=ep_name,
                               xtract_dir=xtract_dir,
Example #6
    def orch_thread(self, headers):
        to_terminate = False

        print(f"ENDPOINTS TO CHECK: {self.fx_eps_to_check}")
        all_extractors = get_all_extractors(self.fx_eps_to_check)
        print(f"Fetched all extractors... {all_extractors}")

        fxc = get_fx_client(headers)

        self.cur_status = "EXTRACTING"

        while True:

            # If our accounting is complete
            # NOTE: when concurrent, will also need to check if scheduling is DONE.
            if self.counters['fx']['success'] + \
                    self.counters['fx']['failed'] + \
                    self.counters['flagged_unknown'] == self.counters['cumu_scheduled'] \
                    and self.cur_status == 'SCHEDULED':
                to_terminate = True

            if to_terminate:
                print("[ORCH] Terminating!")
                print(f"Final counters: {self.counters}")
                self.cur_status = 'COMPLETED'  # TODO: Need to push this status to DB.
                break

            print(f"[ORCH] WQ length: {self.to_xtract_q.qsize()}")

            if self.to_xtract_q.empty() and self.funcx_current_tasks.empty():
                print(f"[ORCH] Empty work thread. Sleeping...")
                time.sleep(5)

            else:
                batch = fxc.create_batch()
                batch_len = 0
                while not self.to_xtract_q.empty():  # TODO: also need max batch size here.
                    family = self.to_xtract_q.get()
                    self.counters['cumu_orch_enter'] += 1

                    extractor_id = family['first_extractor']

                    if extractor_id in extractor_map:
                        extractor = extractor_map[extractor_id]
                    else:
                        self.counters['flagged_unknown'] += 1
                        continue

                    # We should not need to repack and add an empty base_url
                    fam_batch = FamilyBatch()
                    packed_family = Family()
                    family['base_url'] = None
                    packed_family.from_dict(family)

                    fam_batch.add_family(packed_family)

                    # TODO: hardcodes galore.
                    event = extractor.create_event(
                        family_batch=fam_batch,
                        ep_name='default',
                        xtract_dir="/home/tskluzac/.xtract",
                        sys_path_add="/",
                        module_path=f"xtract_{extractor_id}_main",
                        metadata_write_path='/home/tskluzac/mdata')

                    fx_ep_id = self.fx_eps_to_check[0]  # TODO: Should not be fixed to first fx_ep.

                    print(f"Endpoint ID: {fx_ep_id}")
                    batch.add(
                        event,
                        endpoint_id=fx_ep_id,
                        function_id=all_extractors[f"xtract-{extractor_id}"][fx_ep_id])
                    batch_len += 1

                # Only want to send tasks if we retrieved tasks.
                if batch_len > 0:
                    batch_res = fxc.batch_run(batch)
                    time.sleep(1.1)
                    for item in batch_res:
                        self.funcx_current_tasks.put(item)

                poll_batch = []

                # print("Entering task loop")
                for i in range(0, 20):  # TODO: hardcode
                    if not self.funcx_current_tasks.empty():
                        tid = self.funcx_current_tasks.get()
                        poll_batch.append(tid)
                # print(f"Current length of poll_batch: {len(poll_batch)}")

                if len(poll_batch) > 0:
                    x = fxc.get_batch_result(poll_batch)
                    time.sleep(1.1)
                    # print(f"Poll result: {x}")
                    for item in x:
                        result = x[item]

                        if result['status'] == 'success':
                            self.counters['fx']['success'] += 1

                        elif result['status'] == 'failed':
                            self.counters['fx']['failed'] += 1
                            result['exception'].reraise()

                        elif result['pending']:
                            self.funcx_current_tasks.put(item)
                        else:
                            # If we haven't figured it out until here, we need some dev...
                            raise ValueError(
                                "[ORCH] CRITICAL Unrecognized funcX status...")
                    print(self.counters)
Example #7
            flow = InstalledAppFlow.from_client_secrets_file(
                '../xtract_sdk/downloaders/credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    return creds


gdr = GoogleDriveDownloader(auth_creds=do_login_flow())

file_1 = "1RbSdH_nI0EHvxFswpl1Qss7CyWXBHo-o"  # JPG image!
file_2 = "1ecjFs55sNxBiwoAtztHcoA450Gh7ak0m9VqK0Wrm1Ms"  # free text document

fam_1 = Family()
# TODO: Put the Google Drive arguments into a "gdrive_cfg" sub-dictionary.
fam_1.add_group(files=[{
    'path': file_1,
    'is_gdoc': False,
    'metadata': {},
    'mimeType': 'image/jpg'
}],
                parser='image')

fam_2 = Family()
fam_2.add_group(files=[{
    'path': file_2,
    'is_gdoc': True,
    'metadata': {},
    'mimeType': 'text/plain'
Example #8
    def preproc_fam_batches(self):

        fam_count = 0

        # Just create an empty one out here so Python doesn't yell at me.
        fam_batch = FamilyBatch()

        num_overloads = 0
        # while we have files and haven't exceeded the weak scaling threshold (file_cutoff)
        while not self.family_queue.empty() and fam_count < file_cutoff:

            fam_batch = FamilyBatch()
            total_fam_batch_size = 0

            # Keep filling this batch until it reaches map_size, the queue empties,
            #  or we hit the weak-scaling file cutoff.
            while (len(fam_batch.families) < map_size
                   and not self.family_queue.empty()
                   and fam_count < file_cutoff):

                fam_count += 1
                fam = self.family_queue.get()

                total_family_size = 0
                # First convert to the correct paths
                for file_obj in fam['files']:
                    old_path = file_obj['path']
                    new_path = self.path_converter(fam['family_id'], old_path)
                    file_obj['path'] = new_path
                    file_size = file_obj['metadata']['physical']['size']
                    total_family_size += file_size

                for group in fam['groups']:
                    for file_obj in group['files']:
                        old_path = file_obj['path']
                        new_path = self.path_converter(fam['family_id'],
                                                       old_path)
                        file_obj['path'] = new_path

                empty_fam = Family()
                empty_fam.from_dict(fam)

                # We will ONLY handle the SIZE issue in here.

                if soft_batch_bytes_max > 0:
                    # So if this last file would put us over the top,
                    if total_fam_batch_size + total_family_size > soft_batch_bytes_max:
                        num_overloads += 1
                        print(f"Num overloads {num_overloads}")
                        # then we append the old batch (if not empty),
                        if len(fam_batch.families) > 0:
                            self.fam_batches.append(fam_batch)

                        # empty the old one
                        fam_batch = FamilyBatch()
                        total_fam_batch_size = total_family_size

                        assert (len(fam_batch.families) == 0)

                # and then continue (here we either add to our prior fam_batch OR the new one).
                fam_batch.add_family(empty_fam)

            assert len(fam_batch.families) <= map_size

            self.fam_batches.append(fam_batch)

        # img_extractor = NothingExtractor()
        img_extractor = MatioExtractor()

        # TODO: ADDING TEST. Making sure we have all of our files here.

        ta = time.time()
        num_families = 0
        for item in self.fam_batches:
            num_families += len(item.families)

        print(num_families)
        tb = time.time()
        print(f"Time to move families: {tb-ta}")
        time.sleep(5)
        # exit()

        # exit()

        # This check makes sure our batches are the correct size to avoid the January 2021 disaster of having vastly
        #  incorrect numbers of batches.
        #
        #  Here we are checking that the number of families we are processing is LESS than the total number of
        #   batches times the batch size (e.g., the last batch can be full or empty), and the number of families
        #   is GREATER than the case where our last map is missing.
        #
        #
        #  This leaves a very small window for error. Could use modulus to be more exact.

        # TODO: Bring this back (but use for grouping by num. files)

        # try:
        #     assert len(self.fam_batches) * (map_size-1) <= fam_count <= len(self.fam_batches) * map_size
        # except AssertionError as e:
        #     print(f"Caught {e} after creating client batches...")
        #     print(f"Number of batches: {len(self.fam_batches)}")
        #     print(f"Family Count: {fam_count}")
        #
        #     print("Cannot continue. Exiting...")
        #     exit()

        print(f"Container type: {container_type}")
        print(f"Location: {location}")
        self.fn_uuid = img_extractor.register_function(
            container_type=container_type,
            location=location,
            ep_id=ep_id,
            group="a31d8dce-5d0a-11ea-afea-0a53601d30b5")

        # funcX batching. Here we take the 'user' FamilyBatch objects and put them into a batch we send to funcX.
        num_fx_batches = 0
        current_batch = []

        print(f"Number of family batches: {len(self.fam_batches)}")
        for fam_batch in self.fam_batches:

            # print(len(current_batch))
            # print(batch_size)

            if len(current_batch) < batch_size:
                current_batch.append(fam_batch)
            else:
                # print("Marking batch!")
                # print(len(current_batch))
                self.funcx_batches.put(current_batch)
                current_batch = [fam_batch]
                num_fx_batches += 1

        # Grab the stragglers.
        if len(current_batch) > 0:
            print("Marking batch!")
            self.funcx_batches.put(current_batch)
            num_fx_batches += 1

        # See same description as above (map example) for explanation.
        try:
            theor_full_batches = math.ceil(len(self.fam_batches) / batch_size)

            # print(f"Theoretical full batches: {}")
            assert theor_full_batches == num_fx_batches
        except AssertionError as e:
            print(f"Caught {e} after creating funcX batches...")
            print(f"Number of batches: {self.funcx_batches.qsize()}")
            print(f"Family Count: {num_fx_batches}")

            print("Cannot continue. Exiting...")
            exit()
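These scheduler excerpts rely on module-level configuration that is not shown; a hedged sketch of the kinds of values assumed:

# map_size = 16                  # max families per FamilyBatch
# batch_size = 8                 # FamilyBatches per funcX batch
# file_cutoff = 100000           # weak-scaling cap on families processed
# soft_batch_bytes_max = 0       # 0 disables size-based batch splitting
# container_type = 'singularity'
# location = '/path/to/extractor.img'
# ep_id = '<funcx-endpoint-uuid>'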
Example #9
import os
# from extractors.xtract_python import PythonExtractor
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch
from extractors.utils.base_extractor import base_extractor

# ****** JOAO LOCAL TESTS ******
ep_name = "test_python_ep"
xtract_dir = "/Users/joaovictor/.xtract"
# Note: this is the path to my local clone of the git repo 'xtract-python'
sys_path_add = "/Users/joaovictor/xtract/xtract-python"
module_path = "xtract_python_main"  # The file containing 'execute_extractor'
metadata_write_path = "/Users/joaovictor/Desktop/test_metadata"

# HERE WE PACK LOCAL FAMILIES INTO SAME STRUCTURES AS USED BY XTRACT.
test_fam_1 = Family()
test_fam_2 = Family()

base_path = "/Users/joaovictor/xtract/xtract-python"
test_fam_1.add_group(files=[{
    'path':
    os.path.join(base_path, 'tests/test_files/multi_line.py'),
    'metadata':
    dict()
}],
                     parser=None)
test_fam_1.download_type = "LOCAL"
print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}")

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)
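A sketch of the remaining steps, mirroring the create_event pattern in Example #6 (PythonExtractor is commented out in the imports above, so its availability is assumed):

# extractor = PythonExtractor()
# event = extractor.create_event(family_batch=fam_batch,
#                                ep_name=ep_name,
#                                xtract_dir=xtract_dir,
#                                sys_path_add=sys_path_add,
#                                module_path=module_path,
#                                metadata_write_path=metadata_write_path)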
Example #10
import os

from xtract_sdk.xtract import XtractAgent
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch

xag = XtractAgent(ep_name='tyler_test',
                  xtract_dir='/Users/tylerskluzacek/.xtract')

fam_to_process = Family(download_type='LOCAL', base_url="")
base_path = '/Users/tylerskluzacek/data_folder/413cafa0-9b43-4ffb-9c54-4834dd265a46'
fam_to_process.add_group(files=[{
    'path': os.path.join(base_path, 'INCAR'),
    'metadata': {}
}, {
    'path': os.path.join(base_path, 'OUTCAR'),
    'metadata': {}
}, {
    'path': os.path.join(base_path, 'POSCAR'),
    'metadata': {}
}],
                         parser='dft')
fam_to_process = fam_to_process.to_dict()

xag.load_family(fam_to_process)
xag.fetch_all_files()

for item in xag.ready_families:
    print(item)
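Extraction and metadata flushing would typically follow, per the pattern in Example #17 (the exact family_batch argument for this locally-loaded workflow is left unspecified here):

# xag.execute_extractions(family_batch=..., input_type=str)
# xag.flush_metadata_to_files(writer='json')
# print(xag.get_completion_stats())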
Example #11
import os
from extractors.xtract_netcdf import NetCDFExtractor
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch
from extractors.utils.base_extractor import base_extractor

# ****** JOAO LOCAL TESTS ******
ep_name = "test_netcdf_ep"
xtract_dir = "/Users/joaovictor/.xtract"
# Note: this is the path to my local clone of the git repo 'xtract-netcdf'
sys_path_add = "/Users/joaovictor/xtract/xtract-netcdf"
module_path = "xtract_netcdf_main"  # The file containing 'execute_extractor'
metadata_write_path = "/Users/joaovictor/Desktop/test_metadata"

# HERE WE PACK LOCAL FAMILIES INTO SAME STRUCTURES AS USED BY XTRACT.
test_fam_1 = Family()
test_fam_2 = Family()

base_path = "/Users/joaovictor/xtract/xtract-netcdf/tests/test_files/"
test_fam_1.add_group(files=[{'path': os.path.join(base_path, 'sresa1b_ncar_ccsm3-example.nc'), 'metadata': dict()}], parser=None)
test_fam_1.download_type = "LOCAL"
print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}")

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)

# # ****** TEST AND RUN THE EXTRACTOR ON YOUR FAMILYBATCH() *******
extractor = NetCDFExtractor()
event = extractor.create_event(family_batch=fam_batch,
                               ep_name=ep_name,
                               xtract_dir=xtract_dir,
Example #12
from funcx import FuncXClient
from extractors.xtract_tabular import tabular_extract
import time

from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch

file_id = "1XCS2Xqu35TiQgCpI8J8uu4Mss9FNnp1-AuHo-pMujb4"
file_id2 = "0B5nDSpS9a_3kUFdiTXRFdS12QUk"

family_1 = Family()
family_2 = Family()

family_1.add_group(files=[{
    'path': file_id,
    'is_gdoc': True,
    'mimeType': "text/csv"
}],
                   parser='xtract-tabular')
family_1.base_url = ""

family_2.add_group(files=[{
    'path': file_id2,
    'is_gdoc': False
}],
                   parser='xtract-tabular')
family_2.download_type = "GDRIVE"

fam_batch = FamilyBatch()
fam_batch.add_family(family_1)
fam_batch.add_family(family_2)
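A sketch of dispatching this batch through funcX, mirroring the register/run pattern of Example #14 (the container image name and endpoint UUID are placeholders):

fxc = FuncXClient()
container_uuid = fxc.register_container('xtract-tabular.img', 'singularity')
fn_uuid = fxc.register_function(tabular_extract,
                                container_uuid=container_uuid,
                                description="xtract-tabular extraction function")
res = fxc.run({'family_batch': fam_batch, 'creds': None, 'download_file': False},
              endpoint_id='<endpoint-uuid>', function_id=fn_uuid)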
Example #13
    def preproc_fam_batches(self):

        total_tasks = 0

        print("PREPROCESSING!")
        while not self.image_path_list.empty():

            fam_batch = FamilyBatch()
            # print(len(fam_batch.families))
            while len(fam_batch.families) < map_size:

                if self.image_path_list.empty():
                    break

                path = self.image_path_list.get()
                print(path)
                family = dict()

                family['family_id'] = None

                # TODO: CHANGE THIS FOR THETA.
                if system == 'midway2':
                    family['files'] = [{
                        'path':
                        f'/project2/chard/skluzacek/train2014/{path}'
                    }]
                elif system == 'theta':
                    family['files'] = [{
                        'path':
                        f'/projects/CSC249ADCD01/skluzacek/train2014/{path}'
                    }]
                family['metadata'] = dict()
                family['headers'] = None
                family['download_type'] = None
                family['groups'] = []

                empty_fam = Family()
                empty_fam.from_dict(family)
                print("ADDING FAMILY TO FAM BATCH")
                fam_batch.add_family(empty_fam)

            #if total_tasks > max_tasks:
            self.fam_batches.append(fam_batch)

        img_extractor = ImageExtractor()

        print(f"REGISTERING FUNCTION")
        self.fn_uuid = img_extractor.register_function(
            container_type=container_type,
            location=location,
            ep_id=ep_id,
            group="a31d8dce-5d0a-11ea-afea-0a53601d30b5")

        current_batch = []
        for fam_batch in self.fam_batches:
            if len(current_batch) < batch_size:
                current_batch.append(fam_batch)
            else:
                print(f"Length of current batch: {len(current_batch)}")
                self.funcx_batches.put(current_batch)
                current_batch = [fam_batch]

        # Grab the stragglers.
        if len(current_batch) > 0:
            self.funcx_batches.put(current_batch)

        print("Let me see")

        batch_counter = 0
Example #14
from funcx import FuncXClient
from extractors.xtract_images import images_extract
import time

from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch

fam_1 = Family()
fam_batch = FamilyBatch()
fam_batch.add_family(fam_1)

fam_1.add_group(files=[{"path": '/home/skluzacek/i_spy.jpeg', "is_gdoc": False, "mimeType": "image/jpg", "metadata": {}}], parser='image')


def test(event):
    import os
    return os.environ['container_version']


def main(fxc, ep_id):
    container_uuid = fxc.register_container('xtract-images.img', 'singularity')
    print("Container UUID: {}".format(container_uuid))
    fn_uuid = fxc.register_function(images_extract,
                                    #ep_id, # TODO: We do not need ep id here
                                    container_uuid=container_uuid,
                                    description="New sum function defined without string spec")
    print("FN_UUID : ", fn_uuid)
    res = fxc.run({'family_batch': fam_batch,
                   'creds': None,
                   'download_file': False},
                  endpoint_id=ep_id, function_id=fn_uuid)
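A hypothetical invocation of the helper above (the endpoint UUID is a placeholder):

if __name__ == "__main__":
    fxc = FuncXClient()
    main(fxc, ep_id='<endpoint-uuid>')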
Example #15
    def group(self, file_ls: List[str]):
        """Given a list of file-metadata dicts, tag each file with an extractor and
            return one single-file family per input.
            NOTE FOR THIS GROUPER :: 1 file = 1 family = 1 group = 1 file """

        crawl_tallies = {
            "text": 0,
            "tabular": 0,
            "images": 0,
            "presentation": 0,
            "other": 0,
            "hierarch": 0,
            "compressed": 0
        }
        if not self.by_file:
            raise ValueError(
                "Unable to process groups of more than 1 file by extension!")

        families = []

        mappings = self.get_mappings()

        for fdict in file_ls:

            groups = []
            valid_mapping = False
            mimeType = None
            for mapping in mappings:

                if fdict['extension'].lower() in mappings[mapping]:
                    # TODO: this will eventually need to be a list of extractors.
                    fdict['extractor'] = mapping  # mapping = extractor_name!
                    valid_mapping = True
                    mimeType = fdict["mimeType"]

                    crawl_tallies[mapping] += 1

            if not valid_mapping:
                mimeType = fdict["mimeType"]
                if 'vnd.google-apps.document' in mimeType:
                    fdict['extractor'] = "text"
                    mimeType = "text/plain"
                    crawl_tallies["text"] += 1
                elif 'vnd.google-apps.spreadsheet' in mimeType:
                    fdict['extractor'] = "tabular"
                    mimeType = "text/csv"
                    crawl_tallies['tabular'] += 1
                elif 'vnd.google-apps.presentation' in mimeType:
                    # fdict['extractor'] = "text"  # TODO: this should come back soon.
                    fdict['extractor'] = None
                    mimeType = None
                    crawl_tallies['presentation'] += 1
                    # TODO from Will: " slides: text, tabular, images, BERT... order is not important"
                else:
                    # Now we default to None
                    fdict['extractor'] = None
                    mimeType = None
                    crawl_tallies['other'] += 1

            groups.append(fdict)

            family = Family()

            family.add_group(files=[{
                "path": fdict["id"],
                "metadata": fdict,
                "mimeType": mimeType
            }],
                             parser=fdict["extractor"])

            families.append(family.to_dict())

        return families
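A hedged sketch of calling this grouper (the instance name and field values are hypothetical; the extractor chosen for each file depends on get_mappings()):

# files = [{'id': '<gdrive-file-id>', 'extension': 'csv', 'mimeType': 'text/csv'}]
# families = grouper.group(files)  # -> a list with one single-file family dict per input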
Example #16
from extractors.xtract_xpcs import XPCSExtractor
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch
from extractors.utils.base_extractor import base_extractor

# # TYLER LOCAL TESTS
ep_name = "test_tabular_ep"
xtract_dir = "/Users/tylerskluzacek/.xtract"
# Note: this is the path to my local clone of the git repo 'xtract-xpcs'
sys_path_add = "/Users/tylerskluzacek/PycharmProjects/xtract-xpcs"
module_path = "gather_xpcs_metadata"  # The file containing 'execute_extractor'
recursion_depth = 5000
metadata_write_path = "/Users/tylerskluzacek/Desktop/test_metadata"

# HERE WE PACK LOCAL FAMILIES INTO SAME STRUCTURES AS USED BY XTRACT.
test_fam_1 = Family()
test_fam_2 = Family()

base_path = '/Users/tylerskluzacek/A001_00004_Vol20_att1_Rq0_0001-100000.hdf'
test_fam_1.add_group(files=[{
    'path': base_path,
    'metadata': dict()
}],
                     parser=None)
test_fam_1.download_type = "LOCAL"
print(f"[DEBUG] JSON form of our family object: {test_fam_1.to_dict()}")

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)

extractor = XPCSExtractor()
Example #17
        recursion_depth=5000,
        metadata_write_path="/Users/tylerskluzacek/Desktop/test_metadata")

    # Execute the extractor on our family_batch.
    xtra.execute_extractions(family_batch=event['family_batch'],
                             input_type=str)

    # All metadata are held in XtractAgent's memory. Flush to disk!
    xtra.flush_metadata_to_files(writer='json')

    return xtra.get_completion_stats()


mock_event = dict()

test_fam_1 = Family()
test_fam_2 = Family()

base_path = "/Users/tylerskluzacek/xtract-sdk/tests/xtract-tabular/tests/test_files"
test_fam_1.add_group(files=[{
    'path': os.path.join(base_path, 'comma_delim'),
    'metadata': dict()
}],
                     parser=None)
test_fam_1.download_type = "LOCAL"
print(test_fam_1.to_dict())

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)
mock_event['family_batch'] = fam_batch
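# mock_event would presumably be passed to the (truncated) driver function above, which runs
# execute_extractions on event['family_batch'] and flushes the collected metadata to JSON.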
Example #18
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch
from xtract_sdk.downloaders.google_drive import GoogleDriveDownloader
import time
import pickle, os

# TODO: extract from a list of families
fam = Family(str(0), headers={'potato': 'tomato'}, metadata=None)
fam2 = Family(str(1), headers={'potato': 'tomato'}, metadata=None)

group_id = fam.add_group(files=[{
    'path': 'a',
    'metadata': {}
}, {
    'path': 'b',
    'metadata': {}
}, {
    'path': 'c',
    'metadata': {}
}],
                         parser="camel")
group_id2 = fam.add_group(files=[{
    'path': 'c',
    'metadata': {}
}, {
    'path': 'd',
    'metadata': {}
}, {
    'path': 'e',
    'metadata': {}
}],