def eml_data_table(tables: list) -> list:
    """Summarise a list of data-table elements as plain dictionaries.

    Args:
        tables: Element objects to summarise; each must contain an
            ``entityName`` child.

    Returns:
        One dictionary per input element, in input order. Every
        dictionary has an "Entity Name" key; "Entity Description",
        "Physical" and "Methods" are added only when the corresponding
        child elements are present.
    """
    data_table = list()
    for table in tables:
        t = dict()
        t["Entity Name"] = clean(table.find("./entityName").xpath("string()"))

        description = table.find("./entityDescription")
        if description is not None:
            # The XPath expression must be the complete call "string()";
            # a missing closing parenthesis fails at evaluation time.
            t["Entity Description"] = clean(description.xpath("string()"))

        physical = table.findall(".//physical")
        if physical:
            t["Physical"] = eml_physical(physical)

        # Resolve methods before appending so the entry is complete when
        # it is stored, and so every table (not just the last) gets them.
        methods = table.find("./methods")
        if methods is not None:
            t["Methods"] = eml_methods(methods)

        data_table.append(t)
    return data_table
def eml_responsible_party(parties: list, position: bool) -> list:
    """Collect the names found in each party element and post-process them.

    For every element in ``parties`` the individual, organization and
    position names are gathered into a dictionary; the resulting list is
    handed to ``process_responsible_party`` together with ``position``.

    Args:
        parties: Element objects to read names from.
        position: Flag forwarded unchanged to ``process_responsible_party``.

    Returns:
        Whatever ``process_responsible_party`` returns for the collected
        name groups.
    """
    collected = []
    for party in parties:
        people = []
        for name_node in party.findall("./individualName"):
            givens = [
                clean(given_node.xpath("string()")).strip()
                for given_node in name_node.findall("./givenName")
            ]
            surname_node = name_node.find("./surName")
            surname = clean(surname_node.xpath("string()")).strip()
            people.append({"sur_name": surname, "given_names": givens})

        organizations = [
            clean(org_node.xpath("string()")).strip()
            for org_node in party.findall("./organizationName")
        ]

        # Position names are passed through clean() only (no extra strip).
        positions = [
            clean(pos_node.xpath("string()"))
            for pos_node in party.findall("./positionName")
        ]

        collected.append(
            {
                "individual_names": people,
                "organization_names": organizations,
                "position_names": positions,
            }
        )

    return process_responsible_party(collected, position)
def eml_resource(element_obj) -> dict:
    """Build a dictionary of resource-level metadata from an element.

    Args:
        element_obj: Element object that contains a ``title`` child and,
            optionally, ``creator``, ``alternateIdentifier``, ``abstract``
            and ``intellectualRights`` children.

    Returns:
        A dictionary with "Title", "Creator" and "Alternate Identifier"
        keys, plus "Abstract" and "Intellectual Rights" when those
        children exist.
    """
    title_node = element_obj.find("./title")
    resource = {"Title": clean(title_node.xpath("string()"))}

    creators = element_obj.findall("./creator")
    resource["Creator"] = eml_responsible_party(creators, position=False)

    resource["Alternate Identifier"] = [
        clean(identifier.xpath("string()"))
        for identifier in element_obj.findall("./alternateIdentifier")
    ]

    # Optional rich-text sections, rendered through eml_text/html_text.
    optional_sections = (
        ("./abstract", "Abstract"),
        ("./intellectualRights", "Intellectual Rights"),
    )
    for path, label in optional_sections:
        node = element_obj.find(path)
        if node is not None:
            resource[label] = html_text(eml_text(node))

    return resource
def eml_physical(phys: list) -> list:
    """Summarise ``physical`` elements as a list of dictionaries.

    Args:
        phys: Element objects to summarise. Each must contain an
            ``objectName`` child with text and a ``dataFormat`` child.

    Returns:
        One dictionary per input element with "Object Name" and
        "Data Format" keys, plus "Size", "Checksum(s)",
        "Compression Method", "Encoding Method" and "Character Encoding"
        when the corresponding children are present. The checksum,
        compression and encoding entries are lists holding one item per
        matching child element.

    Raises:
        KeyError: If a ``size`` element lacks a ``unit`` attribute or an
            ``authentication`` element lacks a ``method`` attribute.
    """
    physical = list()
    for phy in phys:
        p = dict()
        p["Object Name"] = phy.find("./objectName").text.strip()

        size = phy.find("./size")
        if size is not None:
            value = clean(size.xpath("string()"))
            unit = size.attrib["unit"].strip()
            p["Size"] = f"{value} ({unit})"

        # Each list below is created once, outside its loop, so that every
        # matching element is kept rather than only the last one.
        checksums = phy.findall(".//authentication")
        if checksums:
            checksum_values = list()
            for checksum in checksums:
                value = clean(checksum.xpath("string()"))
                method = checksum.attrib["method"].strip()
                checksum_values.append(f"{value} ({method})")
            p["Checksum(s)"] = checksum_values

        compression_methods = phy.findall(".//compressionMethod")
        if compression_methods:
            p["Compression Method"] = [
                clean(compression_method.xpath("string()"))
                for compression_method in compression_methods
            ]

        encoding_methods = phy.findall(".//encodingMethod")
        if encoding_methods:
            p["Encoding Method"] = [
                clean(encoding_method.xpath("string()"))
                for encoding_method in encoding_methods
            ]

        character_encoding = phy.find("./characterEncoding")
        if character_encoding is not None:
            p["Character Encoding"] = clean(
                character_encoding.xpath("string()")
            )

        p["Data Format"] = eml_data_format(phy.find("./dataFormat"))
        physical.append(p)
    return physical
def eml_method_step(element_obj) -> dict:
    """Convert a method-step element (and its sub-steps) to a dictionary.

    Args:
        element_obj: Element object whose ``description`` child is
            rendered through ``eml_text`` and ``html_text``.

    Returns:
        A dictionary with a "Description" key, an "Instrumentation" list
        when ``instrumentation`` children exist, and a "Sub Step(s)" list
        of recursively converted ``subStep`` children when any exist.
    """
    description_node = element_obj.find("./description")
    method_step = {"Description": html_text(eml_text(description_node))}

    instruments = element_obj.findall("./instrumentation")
    if instruments:
        method_step["Instrumentation"] = [
            clean(instrument.xpath("string()")) for instrument in instruments
        ]

    # Sub-steps share the same structure, so they are handled recursively.
    nested_steps = element_obj.findall("./subStep")
    if nested_steps:
        method_step["Sub Step(s)"] = [
            eml_method_step(nested) for nested in nested_steps
        ]

    return method_step
def _node_text(node) -> str:
    """Return the string value of *node* after passing it through clean()."""
    return clean(node.xpath("string()"))


def _copy_optional(parent, target: dict, fields) -> None:
    """Copy the text of each optional child of *parent* into *target*.

    ``fields`` is an iterable of ``(path, label)`` pairs; a label is only
    written when ``parent.find(path)`` yields an element.
    """
    for path, label in fields:
        node = parent.find(path)
        if node is not None:
            target[label] = _node_text(node)


def _simple_delimited_format(simple_delimited) -> dict:
    """Describe a ``simpleDelimited`` element."""
    td = dict()
    td["Field Delimiter"] = [
        _node_text(node)
        for node in simple_delimited.findall("./fieldDelimiter")
    ]
    _copy_optional(
        simple_delimited,
        td,
        (("./collapseDelimiters", "Collapse Delimiters"),),
    )
    # findall() always returns a list, so these keys are always present
    # (possibly as empty lists).
    td["Quote Character"] = [
        _node_text(node)
        for node in simple_delimited.findall("./quoteCharacter")
    ]
    td["Literal Character"] = [
        _node_text(node)
        for node in simple_delimited.findall("./literalCharacter")
    ]
    return td


def _complex_format(complex_format) -> list:
    """Describe the ``textFixed``/``textDelimited`` children of ``complex``.

    Returns a list with one single-key dictionary per recognised child,
    keyed "Text Fixed" or "Text Delimited", in document order.
    """
    entries = list()
    for child in complex_format:
        # Compare the tag, not the element itself, with the element name.
        if child.tag == "textFixed":
            fixed = dict()
            fixed["Field Width"] = _node_text(child.find("./fieldWidth"))
            _copy_optional(
                child,
                fixed,
                (
                    ("./lineNumber", "Line Number"),
                    ("./fieldStartColumn", "Field Start Column"),
                ),
            )
            entries.append({"Text Fixed": fixed})
        elif child.tag == "textDelimited":
            delimited = dict()
            delimited["Field Delimiter"] = _node_text(
                child.find("./fieldDelimiter")
            )
            _copy_optional(
                child,
                delimited,
                (
                    ("./collapseDelimiters", "Collapse Delimiters"),
                    ("./lineNumber", "Line Number"),
                ),
            )
            delimited["Quote Character"] = [
                _node_text(node) for node in child.findall("./quoteCharacter")
            ]
            delimited["Literal Character"] = [
                _node_text(node)
                for node in child.findall("./literalCharacter")
            ]
            entries.append({"Text Delimited": delimited})
        # Any other child (e.g. a comment node) is ignored.
    return entries


def _text_format(f) -> dict:
    """Describe a ``textFormat`` element."""
    tf = dict()
    _copy_optional(
        f,
        tf,
        (
            ("./numHeaderLines", "Header Lines"),
            ("./numFooterLines", "Footer Lines"),
        ),
    )

    # findall() never returns None, so this key is always present, even
    # when no record delimiter is declared (empty list).
    tf["Record Delimiter(s)"] = [
        _node_text(node) for node in f.findall("./recordDelimiter")
    ]

    physical_line_delimiters = f.findall(".//physicalLineDelimiter")
    if physical_line_delimiters:
        tf["Physical Line Delimiter(s)"] = [
            _node_text(node) for node in physical_line_delimiters
        ]

    _copy_optional(
        f,
        tf,
        (
            ("./numPhysicalLinesPerRecord", "Physical Lines Per Record"),
            ("./maxRecordLength", "Maximum Record Length"),
        ),
    )

    tf["Attribute Orientation"] = _node_text(f.find("./attributeOrientation"))

    # Attach the delimiter details to the result; building them without
    # storing them would silently drop the information.
    simple_delimited = f.find("./simpleDelimited")
    if simple_delimited is not None:
        tf["Simple Delimited"] = _simple_delimited_format(simple_delimited)

    complex_format = f.find("./complex")
    if complex_format is not None:
        tf["Complex"] = _complex_format(complex_format)

    return tf


def _externally_defined_format(f) -> dict:
    """Describe an ``externallyDefinedFormat`` element."""
    edf = dict()
    edf["Format Name"] = _node_text(f.find("./formatName"))
    _copy_optional(f, edf, (("./formatVersion", "Format Version"),))
    return edf


def _binary_raster_format(f) -> dict:
    """Describe a ``binaryRasterFormat`` element."""
    brf = dict()
    brf["Row/Column Orientation"] = _node_text(
        f.find("./rowColumnOrientation")
    )

    multi_band = f.find("./multiBand")
    if multi_band is not None:
        nbands = multi_band.find("./nbands")
        layout = multi_band.find("./layout")
        brf["Multi-band"] = {
            "Number of bands": _node_text(nbands),
            "Layout": _node_text(layout),
        }

    brf["Number of bits"] = _node_text(f.find("./nbits"))
    brf["Byte Order"] = _node_text(f.find("./byteorder"))
    _copy_optional(
        f,
        brf,
        (
            ("./skipbytes", "Skip bytes"),
            ("./bandrowbytes", "Band row bytes"),
            ("./totalrowbytes", "Total row bytes"),
            ("./bandgapbytes", "Band-gap bytes"),
        ),
    )
    return brf


def eml_data_format(df) -> dict:
    """Describe the format declared by a ``dataFormat`` element.

    Only the first child of ``df`` is inspected.

    Args:
        df: Element object whose first child is ``textFormat``,
            ``externallyDefinedFormat`` or (assumed for any other tag)
            ``binaryRasterFormat``.

    Returns:
        A dictionary with exactly one key -- "Text", "Externally Defined"
        or "Binary Raster" -- mapping to a dictionary of the format's
        properties. For text formats, "Simple Delimited" and "Complex"
        entries are included when those children are present.
    """
    data_format = dict()
    f = df[0]  # Can only be one child; indexing replaces getchildren()
    if f.tag == "textFormat":
        data_format["Text"] = _text_format(f)
    elif f.tag == "externallyDefinedFormat":
        # The key must be a plain string; a set literal is unhashable.
        data_format["Externally Defined"] = _externally_defined_format(f)
    else:  # f.tag == binaryRasterFormat
        data_format["Binary Raster"] = _binary_raster_format(f)
    return data_format