Example #1
import os

from xtract_sdk.xtract import XtractAgent
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch


def extract_tabular(event):
    # NOTE: the original snippet omits the agent setup; the XtractAgent below is a
    # reconstruction with placeholder ep_name / xtract_dir values (see Example #3).
    xtra = XtractAgent(ep_name='tyler_test',
                       xtract_dir='/Users/tylerskluzacek/.xtract')

    # Execute the extractor on our family_batch.
    xtra.execute_extractions(family_batch=event['family_batch'],
                             input_type=str)

    # All metadata are held in XtractAgent's memory. Flush to disk!
    xtra.flush_metadata_to_files(writer='json')

    return xtra.get_completion_stats()


mock_event = dict()

test_fam_1 = Family()
test_fam_2 = Family()

base_path = "/Users/tylerskluzacek/xtract-sdk/tests/xtract-tabular/tests/test_files"
test_fam_1.add_group(files=[{
    'path': os.path.join(base_path, 'comma_delim'),
    'metadata': dict()
}],
                     parser=None)
test_fam_1.download_type = "LOCAL"
print(test_fam_1.to_dict())

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)
mock_event['family_batch'] = fam_batch

data = extract_tabular(mock_event)
print(data)
Example #2
    def group(self, file_ls: List[dict]):
        """Given a list of file metadata dicts, assign an extractor to each file
        and return a list of single-file families.

        NOTE for this grouper: 1 file = 1 family = 1 group.
        """
        # Tally how many files are routed to each extractor type.
        crawl_tallies = {
            "text": 0,
            "tabular": 0,
            "images": 0,
            "presentation": 0,
            "other": 0,
            "hierarch": 0,
            "compressed": 0
        }
        if not self.by_file:
            raise ValueError(
                "Unable to process groups of more than 1 file by extension!")

        families = []

        mappings = self.get_mappings()

        for fdict in file_ls:

            groups = []
            valid_mapping = False
            mimeType = None
            for mapping in mappings:

                if fdict['extension'].lower() in mappings[mapping]:
                    # TODO: this will eventually need to be a list of extractors.
                    fdict['extractor'] = mapping  # mapping = extractor_name!
                    valid_mapping = True
                    mimeType = fdict["mimeType"]

                    crawl_tallies[mapping] += 1

            if not valid_mapping:
                mimeType = fdict["mimeType"]
                if 'vnd.google-apps.document' in mimeType:
                    fdict['extractor'] = "text"
                    mimeType = "text/plain"
                    crawl_tallies["text"] += 1
                elif 'vnd.google-apps.spreadsheet' in mimeType:
                    fdict['extractor'] = "tabular"
                    mimeType = "text/csv"
                    crawl_tallies['tabular'] += 1
                elif 'vnd.google-apps.presentation' in mimeType:
                    # fdict['extractor'] = "text"  # TODO: this should come back soon.
                    fdict['extractor'] = None
                    mimeType = None
                    crawl_tallies['presentation'] += 1
                    # TODO from Will: " slides: text, tabular, images, BERT... order is not important"
                else:
                    # Now we default to None
                    fdict['extractor'] = None
                    mimeType = None
                    crawl_tallies['other'] += 1

            groups.append(fdict)

            family = Family()

            family.add_group(files=[{
                "path": fdict["id"],
                "metadata": fdict,
                "mimeType": mimeType
            }],
                             parser=fdict["extractor"])

            families.append(family.to_dict())

        return families
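
The grouper above returns each family as a plain dict (via family.to_dict()). Below is a minimal sketch of how that output could be rehydrated and batched for extraction, assuming the Family.from_dict and FamilyBatch APIs shown in the other examples; the `grouper` instance and `file_metadata` list are hypothetical stand-ins.

from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch

# `grouper` is an instance of the class above; `file_metadata` is a crawler's
# list of file metadata dicts (both hypothetical here).
family_dicts = grouper.group(file_metadata)

fam_batch = FamilyBatch()
for fam_dict in family_dicts:
    fam = Family()
    fam.from_dict(fam_dict)  # rehydrate the dict, as in Example #4's round-trip test
    fam_batch.add_family(fam)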
Example #3
import os

from xtract_sdk.xtract import XtractAgent
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch

xag = XtractAgent(ep_name='tyler_test',
                  xtract_dir='/Users/tylerskluzacek/.xtract')

fam_to_process = Family(download_type='LOCAL', base_url="")
base_path = '/Users/tylerskluzacek/data_folder/413cafa0-9b43-4ffb-9c54-4834dd265a46'
fam_to_process.add_group(files=[{
    'path': os.path.join(base_path, 'INCAR'),
    'metadata': {}
}, {
    'path': os.path.join(base_path, 'OUTCAR'),
    'metadata': {}
}, {
    'path': os.path.join(base_path, 'POSCAR'),
    'metadata': {}
}],
                         parser='dft')
fam_to_process = fam_to_process.to_dict()

xag.load_family(fam_to_process)
xag.fetch_all_files()

for item in xag.ready_families:
    print(item)
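
Once fetch_all_files() has staged the data, the same extraction calls from Example #1 can be applied. A minimal sketch, assuming xag.ready_families yields Family objects that FamilyBatch.add_family accepts:

# Sketch only: reuses the XtractAgent methods shown in Example #1.
fam_batch = FamilyBatch()
for fam in xag.ready_families:
    fam_batch.add_family(fam)

xag.execute_extractions(family_batch=fam_batch, input_type=str)
xag.flush_metadata_to_files(writer='json')
print(xag.get_completion_stats())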
Example #4
# NOTE: this snippet is truncated; the omitted lines above create `fam` and `fam2`
# via Family() and add_group() calls (covering paths 'a'..'e' and 'v'..'z'),
# and `group_id` captures one add_group() return value.
    'path': 'v',
    'metadata': {}
}],
                           parser="potato")

assert type(
    group_id) is str, "fam.add_group is not returning an id of type str"
print(type(fam.files))

assert sorted([item["path"] for item in fam.files]) == ['a', 'b', 'c', 'd', 'e'], \
    "fam.files not properly inheriting group.files"
assert sorted([item["path"] for item in fam2.files]) == ['v', 'w', 'x', 'y', 'z'], \
    "fam2.files not properly inheriting group.files"

# Here we test whether round-tripping through to_dict and from_dict reproduces the original Family object.
dict_fam = fam.to_dict()
back_to_reg_fam = Family(download_type="gdrive")
back_to_reg_fam.from_dict(dict_fam)

assert fam.family_id == back_to_reg_fam.family_id, "to_dict -> from_dict family_ids do not match"
assert fam.download_type == back_to_reg_fam.download_type, "to_dict -> from_dict download_types do not match"

print(fam.files)
print(back_to_reg_fam.files)
assert fam.files == back_to_reg_fam.files

for group in back_to_reg_fam.groups:
    assert group in fam.groups, "to_dict -> from_dict group_ids do not match"
    assert fam.groups[group].metadata == back_to_reg_fam.groups[group].metadata
    assert fam.groups[group].parser == back_to_reg_fam.groups[group].parser
    assert fam.groups[group].files == back_to_reg_fam.groups[group].files