# Shared imports for the converters below. The module paths are assumptions
# about the repository layout (validator, tab parser, file utilities) and
# may need adjusting to match the actual package structure.
import json
import os
import sys

from tqdm import tqdm

from ..parsers.tab_parser import parse_tab
from ..utils.file_utils import find_files
from ..utils import paths
from ..validator.schema_validator import Validator


# Converter for gw100
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "globus_subject": "http://www.west-code.org/database/gw100/index.php",  # REQ string: Unique value (should be URI if possible)
            "acl": ["public"],  # REQ list of strings: UUID(s) of users/groups allowed to access data, or ["public"]
            "mdf_source_name": "gw100",  # REQ string: Unique name for dataset
            "mdf-publish.publication.collection": "GW100",  # RCM string: Collection the dataset belongs to
#           "mdf_data_class": ,  # RCM string: Type of data in all records in the dataset (do not provide for multi-type datasets)
            "cite_as": [
                "M.J. van Setten, F. Caruso, S. Sharifzadeh, X. Ren, M. Scheffler, F. Liu, J. Lischner, L. Lin, J.R. Deslippe, S.G. Louie, C. Yang, F. Weigend, J.B. Neaton, F. Evers, and P. Rinke, GW100: Benchmarking G0W0 for Molecular Systems, J. Chem. Theory Comput. 11, 5665 (2015).",
                "M. Govoni et al., (2016). In preparation.",
                "P.J. Linstrom and W.G. Mallard, Eds., NIST Chemistry WebBook, NIST Standard Reference Database Number 69, National Institute of Standards and Technology, Gaithersburg MD, 20899, http://webbook.nist.gov."
                ],  # REQ list of strings: Complete citation(s) for this dataset.
#           "license": ,  # RCM string: License to use the dataset (preferably a link to the actual license).
            "dc.title": "Benchmark of G0W0 on 100 Molecules",  # REQ string: Title of dataset
            "dc.creator": "The University of Chicago, Argonne National Laboratory",  # REQ string: Owner of dataset
            "dc.identifier": "http://www.west-code.org/database/gw100/index.php",  # REQ string: Link to dataset (dataset DOI if available)
#           "dc.contributor.author": ,  # RCM list of strings: Author(s) of dataset
#           "dc.subject": ,  # RCM list of strings: Keywords about dataset
#           "dc.description": ,  # RCM string: Description of dataset contents
#           "dc.relatedidentifier": ,  # RCM list of strings: Link(s) to related materials (such as an article)
#           "dc.year":  # RCM integer: Year of dataset creation
            }
    elif type(metadata) is str:
        try:
            with open(metadata, 'r') as metadata_file:
                dataset_metadata = json.load(metadata_file)
        except Exception as e:
            sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
#   dataset_validator = Validator(dataset_metadata, strict=False)
    # You can also force the Validator to treat warnings as errors with strict=True
    dataset_validator = Validator(dataset_metadata, strict=True)

    # Get the data
    # Each record also needs its own metadata
    with open(input_path) as in_file:
        data = in_file.read()
    for record in tqdm(parse_tab(data), desc="Processing records",
                       disable=not verbose):
        link = "http://www.west-code.org/database/gw100/pag/" + record["cas"] + ".php"
        record_metadata = {
            "globus_subject": link,  # REQ string: Unique value (should be URI to record if possible)
            "acl": ["public"],  # REQ list of strings: UUID(s) of users/groups allowed to access data, or ["public"]
#           "mdf-publish.publication.collection": ,  # OPT string: Collection the record belongs to (if different from dataset)
#           "mdf_data_class": ,  # OPT string: Type of data in record (if not set in dataset metadata)
            "mdf-base.material_composition": record["formula"],  # RCM string: Chemical composition of material in record
#           "cite_as": ,  # OPT list of strings: Complete citation(s) for this record (if different from dataset)
#           "license": ,  # OPT string: License to use the record (if different from dataset) (preferably a link to the actual license).
            "dc.title": "GW100 - " + record["name"],  # REQ string: Title of record
#           "dc.creator": ,  # OPT string: Owner of record (if different from dataset)
            "dc.identifier": link,  # RCM string: Link to record (record webpage, if available)
#           "dc.contributor.author": ,  # OPT list of strings: Author(s) of record (if different from dataset)
#           "dc.subject": ,  # OPT list of strings: Keywords about record
#           "dc.description": ,  # OPT string: Description of record
#           "dc.relatedidentifier": ,  # OPT list of strings: Link(s) to related materials (if different from dataset)
#           "dc.year": ,  # OPT integer: Year of record creation (if different from dataset)
            "data": {  # RCM dictionary: Other record data (described below)
#               "raw": ,  # RCM string: Original data record text, if feasible
#               "files": ,  # RCM dictionary: {file_type : uri_to_file} pairs, data files (Example: {"cif" : "https://example.org/cifs/data_file.cif"})
                "name": record["name"]
#               "cas_number": record["cas"]
                # other fields  # RCM any JSON-valid type: Any other data fields you would like to include go in the "data" dictionary. Keys will be prepended with 'mdf_source_name:'
                }
            }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"], ":", result.get("invalid_metadata", ""))
        # The Validator may return warnings if strict=False, which should be noted
        if result.get("warnings", None):
            print("Warnings:", result["warnings"])

    if verbose:
        print("Finished converting")
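
# All of these converters lean on parse_tab() from the repo's tab parser.
# For reference, a minimal sketch of the behavior the call sites above and
# below assume: headers are taken from the first row unless supplied, and
# one dict is yielded per data row. This is an illustrative stand-in, not
# the repository's actual implementation.
def _parse_tab_sketch(data, sep="\t", headers=None):
    lines = [line for line in data.splitlines() if line.strip()]
    if headers is None:
        headers = lines[0].split(sep)
        lines = lines[1:]
    for line in lines:
        yield dict(zip(headers, line.split(sep)))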

# Converter for nrel_pv
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "globus_subject": "https://organicelectronics.nrel.gov",
            "acl": ["public"],
            "mdf_source_name": "nrel_pv",
            "mdf-publish.publication.collection": "NREL Organic Photovoltaic Database",
            "mdf_data_class": "csv",
            "cite_as": [
                "Gaussian 09, (Revisions B.01, C.01 and D.01), M. J. Frisch, et al., Gaussian, Inc., Wallingford CT, 2009. See gaussian.com",
                "Ross E. Larsen, J. Phys. Chem. C, 120, 9650-9660 (2016). DOI: 10.1021/acs.jpcc.6b02138"
                ],
#           "license": ,
            "dc.title": "National Renewable Energy Laboratory Organic Photovoltaic Database",
            "dc.creator": "NREL",
            "dc.identifier": "https://organicelectronics.nrel.gov",
            "dc.contributor.author": ["Ross Larsen", "Dana Olson", "Nikos Kopidakis", "Zbyslaw Owczarczyk", "Scott Hammond", "Peter Graf", "Travis Kemper", "Scott Sides", "Kristin Munch", "David Evenson", "Craig Swank"],
#           "dc.subject": ,
            "dc.description": "Welcome to the National Renewable Energy Laboratory materials discovery database for organic electronic materials. The focus is on materials for organic photovoltaic (OPV) absorber materials but materials suitable for other applications may be found here as well.",
            "dc.relatedidentifier": ["https://dx.doi.org/10.1021/acs.jpcc.6b02138"]
#           "dc.year":
            }
    elif type(metadata) is str:
        try:
            with open(metadata, 'r') as metadata_file:
                dataset_metadata = json.load(metadata_file)
        except Exception as e:
            sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    # Each record also needs its own metadata
    with open(input_path, 'r') as raw_in:
        for record in tqdm(parse_tab(raw_in.read()), desc="Processing files",
                           disable=not verbose):
            record_metadata = {
                "globus_subject": record["URL"],
                "acl": ["public"],
#               "mdf-publish.publication.collection": ,
#               "mdf_data_class": "csv",
                "mdf-base.material_composition": record["common_tag"],
#               "cite_as": ,
#               "license": ,
                "dc.title": "NREL OPV - " + record["common_tag"],
#               "dc.creator": ,
                "dc.identifier": record["URL"],
#               "dc.contributor.author": ,
#               "dc.subject": ,
#               "dc.description": ,
#               "dc.relatedidentifier": ,
#               "dc.year": ,
                "data": {
                    "raw": json.dumps(record)
#                   "files":
                    }
                }

            # Pass each individual record to the Validator
            result = dataset_validator.write_record(record_metadata)

            # Check if the Validator accepted the record, and print a message if it didn't
            # If the Validator returns "success" == True, the record was written successfully
            if result["success"] is not True:
                print("Error:", result["message"], ":", result.get("invalid_metadata", ""))

    if verbose:
        print("Finished converting")
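
# Usage sketch. The module and file names here are illustrative; in the repo
# each converter lives in its own module, all exposing the same entry point:
#
#   from converters import nrel_pv_converter
#   nrel_pv_converter.convert("raw/nrel_pv.csv", verbose=True)
#   nrel_pv_converter.convert("raw/nrel_pv.csv", metadata="meta/nrel_pv.json")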

# Converter for qsar_biodeg
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "globus_subject": "https://archive.ics.uci.edu/ml/datasets/QSAR+biodegradation",
            "acl": ["public"],
            "mdf_source_name": "qsar_biodeg",
            "mdf-publish.publication.collection": "QSAR Biodegradation Data Set",
            "mdf_data_class": "csv",
            "cite_as": [
                "Mansouri, K., Ringsted, T., Ballabio, D., Todeschini, R., Consonni, V. (2013). Quantitative Structure - Activity Relationship models for ready biodegradability of chemicals. Journal of Chemical Information and Modeling, 53, 867-878",
                "Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science."
                ],
#           "license": ,
            "mdf_version": "0.1.0",
            "dc.title": "QSAR biodegradation Data Set",
            "dc.creator": "Milano Chemometrics and QSAR Research Group",
            "dc.identifier": "https://archive.ics.uci.edu/ml/machine-learning-databases/00254/",
            "dc.contributor.author": ["Mansouri, K.", "Ringsted, T.", "Ballabio, D.", "Todeschini, R.", "Consonni, V."],
#           "dc.subject": ,
            "dc.description": "Data set containing values for 41 attributes (molecular descriptors) used to classify 1055 chemicals into 2 classes (ready and not ready biodegradable).",
#           "dc.relatedidentifier": ,
            "dc.year": 2013
            }
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
#   dataset_validator = Validator(dataset_metadata, strict=False)
    # You can also force the Validator to treat warnings as errors with strict=True
    dataset_validator = Validator(dataset_metadata, strict=True)

    # Get the data
    # Each record should be exactly one dictionary
    # It is recommended that you convert your records one at a time, but it is possible to put them all into one big list (see below)
    # It is also recommended that you use a parser to help with this process if one is available for your datatype
    i = 1
    headers = ["SpMax_L", "J_Dz(e)", "nHM", "F01[N-N]", "F04[C-N]", "NssssC",
               "nCb-", "C%", "nCp", "nO", "F03[C-N]", "SdssC", "HyWi_B(m)",
               "LOC", "SM6_L", "F03[C-O]", "Me", "Mi", "nN-N", "nArNO2",
               "nCRX3", "SpPosA_B(p)", "nCIR", "B01[C-Br]", "B03[C-Cl]",
               "N-073", "SpMax_A", "Psi_i_1d", "B04[C-Br]", "SdO", "TI2_L",
               "nCrt", "C-026", "F02[C-N]", "nHDon", "SpMax_B(m)", "Psi_i_A",
               "nN", "SM6_B(m)", "nArCOOR", "nX", "experimental class"]
    # Each record also needs its own metadata
    sep = ";"
    with open(input_path, 'r') as raw_in:
        for row_data in tqdm(parse_tab(raw_in.read(), sep=sep, headers=headers),
                             desc="Processing data", disable=not verbose):
            record = []
            for key, value in row_data.items():
                record.append(key + ": " + value)
            uri = "https://data.materialsdatafacility.org/collections/qsar_biodeg/biodeg.csv"
            record_metadata = {
                "globus_subject": uri + "#" + str(i),
                "acl": ["public"],
#               "mdf-publish.publication.collection": ,
#               "mdf_data_class": ,
#               "mdf-base.material_composition": ,
#               "cite_as": ,
#               "license": ,
                "dc.title": "qsar_biodeg - record: " + str(i),
#               "dc.creator": ,
#               "dc.identifier": ,
#               "dc.contributor.author": ,
#               "dc.subject": ,
#               "dc.description": ,
#               "dc.relatedidentifier": ,
#               "dc.year": ,
                "data": {
                    "raw": json.dumps(record)
#                   "files": ,
                    }
                }
            i += 1

            # Pass each individual record to the Validator
            result = dataset_validator.write_record(record_metadata)

            # Check if the Validator accepted the record, and print a message if it didn't
            # If the Validator returns "success" == True, the record was written successfully
            if result["success"] is not True:
                print("Error:", result["message"], ":", result.get("invalid_metadata", ""))
            # The Validator may return warnings if strict=False, which should be noted
            if result.get("warnings", None):
                print("Warnings:", result["warnings"])

    # Alternatively, if the only way you can process your data is in one large list, you can pass the list to the Validator
    # You still must add the required metadata to your records
    # It is recommended to use the previous method if possible
#   result = dataset_validator.write_dataset(your_records_with_metadata)
#   if result["success"] is not True:
#       print("Error:", result["message"])

    # You're done!
    if verbose:
        print("Finished converting")

# Converter for gdb8-15
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "globus_subject": "http://qmml.org/datasets.html#gdb8-15",
            "acl": ["public"],
            "mdf_source_name": "gdb8-15",
            "mdf-publish.publication.collection": "gdb8-15",
            "mdf_data_class": "txt",
            "cite_as": [
                "Electronic spectra of 22k molecules Raghunathan Ramakrishnan, Mia Hartmann, Enrico Tapavicza, O. Anatole von Lilienfeld, J. Chem. Phys. submitted (2015)",
                "Structures of 22k molecules Raghunathan Ramakrishnan, Pavlo Dral, Matthias Rupp, O. Anatole von Lilienfeld Scientific Data 1, Article number: 140022 (2014). doi:10.1038/sdata.2014.22"
                ],
#           "license": ,
            "mdf_version": "0.1.0",
            "dc.title": "Electronic spectra from TDDFT and machine learning in chemical space",
            "dc.creator": "University of Basel, California State University, Argonne National Laboratory",
            "dc.identifier": "http://aip.scitation.org/doi/suppl/10.1063/1.4928757",
            "dc.contributor.author": ["Raghunathan Ramakrishnan", "Mia Hartmann", "Enrico Tapavicza", "O. Anatole von Lilienfeld"],
            "dc.subject": ["Density functional theory", "Excitation energies", "Computer modeling", "Oscillators", "Molecular spectra"],
            "dc.description": "Due to its favorable computational efficiency, time-dependent (TD) density functional theory (DFT) enables the prediction of electronic spectra in a high-throughput manner across chemical space. Its predictions, however, can be quite inaccurate. We resolve this issue with machine learning models trained on deviations of reference second-order approximate coupled-cluster (CC2) singles and doubles spectra from TDDFT counterparts, or even from DFT gap. We applied this approach to low-lying singlet-singlet vertical electronic spectra of over 20 000 synthetically feasible small organic molecules with up to eight CONF atoms.",
            "dc.relatedidentifier": ["http://dx.doi.org/10.1063/1.4928757"],
            "dc.year": 2015
            }
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
#   dataset_validator = Validator(dataset_metadata, strict=False)
    # You can also force the Validator to treat warnings as errors with strict=True
    dataset_validator = Validator(dataset_metadata, strict=True)

    # Get the data
    # Each record should be exactly one dictionary
    # It is recommended that you convert your records one at a time, but it is possible to put them all into one big list (see below)
    # It is also recommended that you use a parser to help with this process if one is available for your datatype
    # Note: the source file appears to report PBE0 results in two basis sets,
    # so the PBE0 headers repeat; with a dict-based parse_tab the second set
    # will overwrite the first. Flagged here rather than renamed, since these
    # key names feed directly into the published feedstock.
    headers = ["Index", "E1-CC2", "E2-CC2", "f1-CC2", "f2-CC2",
               "E1-PBE0", "E2-PBE0", "f1-PBE0", "f2-PBE0",
               "E1-PBE0", "E2-PBE0", "f1-PBE0", "f2-PBE0",
               "E1-CAM", "E2-CAM", "f1-CAM", "f2-CAM"]
    # Each record also needs its own metadata
    with open(input_path, 'r') as raw_in:
        data = raw_in.read()
    # Start at line 29 for data (the literal below must match the file's exact spacing)
    starter = data.find(" 1 0.43295186 0.43295958")
    # Remove the spaces before the index column
    decomp = data[starter:].split("\n")
    stripped_decomp = []
    for line in decomp:
        stripped_decomp.append(line.strip())

    # Open gdb9-14 feedstock to get chemical composition
    with open(os.path.join(paths.feedstock, "gdb9-14_all.json"), 'r') as json_file:
        lines = json_file.readlines()
        full_json_data = [json.loads(line) for line in lines]
    # Composition needed doesn't begin until after record 6095
    json_data = full_json_data[6095:]
    for record in tqdm(parse_tab("\n".join(stripped_decomp), headers=headers, sep=" "),
                       desc="Processing files", disable=not verbose):
        comp = json_data[int(record["Index"])]["mdf-base.material_composition"]
        uri = ("https://data.materialsdatafacility.org/collections/"
               "gdb-8-15/gdb8_22k_elec_spec.txt#" + record["Index"])
        record_metadata = {
            "globus_subject": uri,
            "acl": ["public"],
#           "mdf-publish.publication.collection": ,
#           "mdf_data_class": ,
            "mdf-base.material_composition": comp,
#           "cite_as": ,
#           "license": ,
            "dc.title": "gdb8-15 - record: " + record["Index"],
#           "dc.creator": ,
            "dc.identifier": uri,
#           "dc.contributor.author": ,
#           "dc.subject": ,
#           "dc.description": ,
#           "dc.relatedidentifier": ,
#           "dc.year": ,
            "data": {
                "raw": json.dumps(record)
#               "files": ,
                }
            }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"], ":", result.get("invalid_metadata", ""))
        # The Validator may return warnings if strict=False, which should be noted
        if result.get("warnings", None):
            print("Warnings:", result["warnings"])

    # Alternatively, if the only way you can process your data is in one large list, you can pass the list to the Validator
    # You still must add the required metadata to your records
    # It is recommended to use the previous method if possible
#   result = dataset_validator.write_dataset(your_records_with_metadata)
#   if result["success"] is not True:
#       print("Error:", result["message"])

    # You're done!
    if verbose:
        print("Finished converting")
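
# The gdb8-15 loop assumes that 1-based record "Index" i lines up with entry
# full_json_data[6095 + i] of the gdb9-14 feedstock. A cheap sanity check in
# that spirit (an illustrative helper, not part of the original converter):
def _composition_for(json_data, record):
    entry = json_data[int(record["Index"])]
    assert "mdf-base.material_composition" in entry, \
        "gdb9-14 feedstock entry lacks a composition; the 6095 offset may be wrong"
    return entry["mdf-base.material_composition"]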

# Converter for fe_cr_al_oxidation
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "globus_subject": "http://hdl.handle.net/11256/836",
            "acl": ["public"],
            "mdf_source_name": "fe_cr_al_oxidation",
            "mdf-publish.publication.collection": "Fe-Cr-Al Oxidation Studies",
#           "mdf_data_class": ,
            "cite_as": [
                "Bunn, Jonathan K.; Fang, Randy L.; Albing, Mark R.; Mehta, Apurva; Kramer, Matt J.; Besser, Matt F.; Hattrick-Simpers, Jason R. High-throughput Diffraction and Spectroscopic Data for Fe-Cr-Al Oxidation Studies (2015-06-28)"
                ],
            "license": "http://creativecommons.org/licenses/by-sa/3.0/us/",
            "dc.title": "High-throughput Diffraction and Spectroscopic Data for Fe-Cr-Al Oxidation Studies",
            "dc.creator": "University of South Carolina, SLAC National Accelerator Laboratory, Iowa State University",
            "dc.identifier": "http://hdl.handle.net/11256/836",
            "dc.contributor.author": ["Bunn, Jonathan K.", "Fang, Randy L.", "Albing, Mark R.", "Mehta, Apurva", "Kramer, Matt J.", "Besser, Matt F.", "Hattrick-Simpers, Jason R."],
#           "dc.subject": ,
            "dc.description": "The data set was used to evaluate Fe-Cr-Al thin film samples in a narrow composition region centered on known bulk compositions. The data are composed of two individual studies. The first set of data is a low temperature oxidation study on composition spread samples performed at SLAC Beamline 1-5. Only the integrated and background subtracted 1-D spectra are included, the 2-D data and calibrations are available upon request. The second set of data was taken during high temperature oxidation of selected samples. These data are exclusively Raman data with values taken as a function of total oxidation time.",
            "dc.relatedidentifier": ["http://iopscience.iop.org/article/10.1088/0957-4484/26/27/274003/meta", "http://dx.doi.org/10.1088/0957-4484/26/27/274003"],
            "dc.year": 2015
            }
    elif type(metadata) is str:
        try:
            with open(metadata, 'r') as metadata_file:
                dataset_metadata = json.load(metadata_file)
        except Exception as e:
            sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
#   dataset_validator = Validator(dataset_metadata, strict=False)
    # You can also force the Validator to treat warnings as errors with strict=True
    dataset_validator = Validator(dataset_metadata, strict=True)

    # Get the data
    with open(os.path.join(input_path, "Fe_Cr_Al_data",
                           "Point Number to Composition.csv")) as composition_file:
        composition_list = list(parse_tab(composition_file.read()))
    compositions = {}
    for comp in composition_list:
        compositions[int(comp.pop("Sample Number"))] = comp
    # Each record also needs its own metadata
    for data_file in tqdm(find_files(input_path, ".txt"), desc="Processing files",
                          disable=not verbose):
        link = ("https://data.materialsdatafacility.org/collections/"
                + data_file["no_root_path"] + "/" + data_file["filename"])
        # The filename encodes the temperature (first token) and the point
        # number (last underscore-separated token before the extension)
        temp_k = data_file["filename"].split(" ")[0]
        point_num = int(data_file["filename"].replace("_", " ").split(" ")[-1].split(".")[0])
        record_metadata = {
            "globus_subject": link,
            "acl": ["public"],
#           "mdf-publish.publication.collection": ,
#           "mdf_data_class": ,
            "mdf-base.material_composition": "FeCrAl",
#           "cite_as": ,
#           "license": ,
            "dc.title": "Fe-Cr-Al Oxidation - " + data_file["filename"].split(".")[0],
#           "dc.creator": ,
            "dc.identifier": link,
#           "dc.contributor.author": ,
#           "dc.subject": ,
#           "dc.description": ,
#           "dc.relatedidentifier": ,
#           "dc.year": ,
            "data": {
#               "raw": ,
                "files": {"csv": link},
                "temperature_k": temp_k,
                "atomic_composition_percent": {
                    "Fe": compositions[point_num]["Fe at. %"],
                    "Cr": compositions[point_num]["Cr at. %"],
                    "Al": compositions[point_num]["Al at. %"]
                    }
                }
            }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"], ":", result.get("invalid_metadata", ""))
        # The Validator may return warnings if strict=False, which should be noted
        if result.get("warnings", None):
            print("Warnings:", result["warnings"])

    if verbose:
        print("Finished converting")

# Converter for quinary_alloys
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "globus_subject": "https://doi.org/10.17863/CAM.705",
            "acl": ["public"],
            "mdf_source_name": "quinary_alloys",
            "mdf-publish.publication.collection": "Ni-Co-Al-Ti-Cr Quinary Alloys",
#           "mdf_data_class": ,
            "cite_as": ['Christofidou, K. A., Jones, N. G., Pickering, E. J., Flacau, R., Hardy, M. C., & Stone, H. J. Research Data Supporting "The microstructure and hardness of Ni-Co-Al-Ti-Cr quinary alloys" [Dataset]. https://doi.org/10.17863/CAM.705'],
            "license": "http://creativecommons.org/licenses/by/4.0/",
            "mdf_version": "0.1.0",
            "dc.title": 'Research Data Supporting "The microstructure and hardness of Ni-Co-Al-Ti-Cr quinary alloys"',
            "dc.creator": "University of Cambridge",
            "dc.identifier": "https://doi.org/10.17863/CAM.705",
            "dc.contributor.author": ["Christofidou, K. A.", "Jones, N. G.", "Pickering, E. J.", "Flacau, R.", "Hardy, M. C.", "Stone, H. J."],
            "dc.subject": ["DSC", "SEM", "TEM", "neutron diffraction", "thermodynamics", "hardness"],
            "dc.description": "DSC files, neutron diffraction data, hardness measurements, SEM and TEM images and thermodynamic simulations are provided for all alloy compositions studied and presented in this manuscript. The naming convention is provided in the manuscript along with the composition of each alloy.",
            "dc.relatedidentifier": ["https://doi.org/10.1016/j.jallcom.2016.07.159"],
            "dc.year": 2016
            }
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
#   dataset_validator = Validator(dataset_metadata, strict=False)
    # You can also force the Validator to treat warnings as errors with strict=True
    dataset_validator = Validator(dataset_metadata, strict=True)

    # Get the data
    # Each record also needs its own metadata
    with open(os.path.join(input_path, "alloy_data.csv"), 'r') as adata:
        raw_data = adata.read()
    for record in tqdm(parse_tab(raw_data), desc="Processing records",
                       disable=not verbose):
        links = {}
        mdf_base = "https://data.materialsdatafacility.org/collections/quinary_alloys/"
        # Collect every data file whose name mentions this alloy
        for ln in find_files(input_path, record["Alloy"]):
            key = "_".join(ln["no_root_path"].split("/")).replace(" ", "_")
            links[key] = mdf_base + os.path.join(ln["no_root_path"], ln["filename"])
        record_metadata = {
            "globus_subject": mdf_base + "alloy_data.csv#" + record["Alloy"],
            "acl": ["public"],
#           "mdf-publish.publication.collection": ,
#           "mdf_data_class": ,
            "mdf-base.material_composition": "NiCoAlTiCr",
#           "cite_as": ,
#           "license": ,
            "dc.title": "Ni-Co-Al-Ti-Cr Quinary Alloys " + record["Alloy"],
#           "dc.creator": ,
            "dc.identifier": mdf_base + "alloy_data.csv",
#           "dc.contributor.author": ,
#           "dc.subject": ,
#           "dc.description": ,
#           "dc.relatedidentifier": ,
#           "dc.year": ,
            "data": {
                "raw": json.dumps(record),
                "files": links,
                "atomic_composition_percent": {
                    "Ni": record["Ni"],
                    "Co": record["Co"],
                    "Al": record["Al"],
                    "Ti": record["Ti"],
                    "Cr": record["Cr"]
                    }
                }
            }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"], ":", result.get("invalid_metadata", ""))
        # The Validator may return warnings if strict=False, which should be noted
        if result.get("warnings", None):
            print("Warnings:", result["warnings"])

    if verbose:
        print("Finished converting")
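
# Worked example of the link-key derivation above, with an invented layout:
# for alloy "A1" with a matching file at "<input_path>/DSC data/A1 trace.csv",
# find_files yields no_root_path "DSC data", so
#   key  = "DSC_data"   ("/"-separated parts joined with "_", spaces -> "_")
#   link = mdf_base + "DSC data/A1 trace.csv"
# and links becomes {"DSC_data": ".../quinary_alloys/DSC data/A1 trace.csv"}.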

# Converter for nist_th_ar_lamp_spectrum
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "globus_subject": "https://www.nist.gov/pml/spectrum-th-ar-hollow-cathode-lamps",
            "acl": ["public"],
            "mdf_source_name": "nist_th_ar_lamp_spectrum",
            "mdf-publish.publication.collection": "NIST Spectrum of Th-Ar Hollow Cathode Lamps",
#           "mdf_data_class": ,
            "cite_as": ["NIST SRD 161"],
#           "license": ,
            "dc.title": "NIST Spectrum of Th-Ar Hollow Cathode Lamps",
            "dc.creator": "NIST",
            "dc.identifier": "https://www.nist.gov/pml/spectrum-th-ar-hollow-cathode-lamps",
            "dc.contributor.author": ["Gillian Nave", "Craig J. Sansonetti", "Florian Kerber"],
            "dc.subject": ["Spectroscopy", "Reference data"],
            "dc.description": "This atlas presents observations of the infra-red (IR) spectrum of a low current Th-Ar hollow cathode lamp with the 2-m Fourier transform spectrometer (FTS) at the National Institute of Standards and Technology. These observations establish more than 2400 lines that are suitable for use as wavelength standards in the range 691 nm to 5804 nm.",
#           "dc.relatedidentifier": ,
            "dc.year": 2009
            }
    elif type(metadata) is str:
        try:
            with open(metadata, 'r') as metadata_file:
                dataset_metadata = json.load(metadata_file)
        except Exception as e:
            sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
#   dataset_validator = Validator(dataset_metadata, strict=False)
    # You can also force the Validator to treat warnings as errors with strict=True
    dataset_validator = Validator(dataset_metadata, strict=True)

    # Get the data
    # Each record also needs its own metadata
    # (headers with a "1e-3" suffix are assumed to be reported in units of 10^-3)
    headers = ["wavenumber", "wavenumber_uncertainty_1e-3", "snr", "fwhm_1e-3",
               "intensity", "species", "lower_level", "lower_j", "upper_level",
               "upper_j", "vacuum_wavelength", "vacuum_wavelength_uncertainty_1e-3"]
    with open(input_path) as in_file:
        raw = in_file.read()
    # Collapse runs of spaces to a single space so the data splits cleanly
    while "  " in raw:
        raw = raw.replace("  ", " ")
    for record in tqdm(parse_tab(raw, headers=headers, sep=" "),
                       desc="Processing records", disable=not verbose):
        link = "http://physics.nist.gov/PhysRefData/ThArLampAtlas/node9.html#" + record["wavenumber"]
        record_metadata = {
            "globus_subject": link,
            "acl": ["public"],
#           "mdf-publish.publication.collection": ,
#           "mdf_data_class": ,
            "mdf-base.material_composition": record["species"],
#           "cite_as": ,
#           "license": ,
            "dc.title": "Hollow Cathode Lamp Spectrum - " + record["wavenumber"],
#           "dc.creator": ,
            "dc.identifier": link,
#           "dc.contributor.author": ,
#           "dc.subject": ,
#           "dc.description": ,
#           "dc.relatedidentifier": ,
#           "dc.year": ,
            "data": {
                "raw": json.dumps(record)
                }
            }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"], ":", result.get("invalid_metadata", ""))
        # The Validator may return warnings if strict=False, which should be noted
        if result.get("warnings", None):
            print("Warnings:", result["warnings"])

    if verbose:
        print("Finished converting")
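
# The while-loop above collapses runs of spaces one pass at a time; a
# single-pass equivalent (same result, assuming only space runs need
# collapsing) would be:
#
#   import re
#   raw = re.sub(" +", " ", raw)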