Example #1
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    # NOTE: For fields that represent people (e.g. mdf-data_contact), other IDs can be added (ex. "github": "jgaff").
    #    It is recommended that all people listed in mdf-data_contributor have a github username listed.
    #
    # If there are other useful fields not covered here, another block (dictionary at the same level as "mdf") can be created for those fields.
    # The block must be called the same thing as the source_name for the dataset.
    if not metadata:
        ## Metadata:dataset
        dataset_metadata = {
            "mdf": {
                "title":
                "A Counterion-Directed Approach to the Diels-Alder Paradigm: Cascade Synthesis of Tricyclic Fused Cyclopropanes",
                "acl": ["public"],
                "source_name":
                "tricyclic_fused_cyclopropanes",
                "data_contact": {
                    "given_name": "Robert",
                    "family_name": "Paton",
                    "email": "*****@*****.**",
                    "institution": "University of Oxford",
                },
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78",
                }],
                "citation": [
                    "Paton, R. (2016). A Counterion-Directed Approach to the Diels-Alder Paradigm: Cascade Synthesis of Tricyclic Fused Cyclopropanes [Data set]. Zenodo. http://doi.org/10.5281/zenodo.60147"
                ],
                "author": [{
                    "given_name": "Robert",
                    "family_name": "Paton",
                    "email": "*****@*****.**",
                    "institution": "University of Oxford",
                }],
                "license":
                "https://creativecommons.org/publicdomain/zero/1.0/",
                "collection":
                "Tricyclic Fused Propanes",
                "tags": ["DFT", "computational chemistry"],
                "description":
                "An approach to the intramolecular Diels–Alder reaction has led to a cascade synthesis of complex carbocycles composed of three fused rings and up to five stereocenters with complete stereocontrol. Computational analysis reveals that the reaction proceeds by a Michael/Michael/cyclopropanation/epimerization cascade in which size and coordination of the counterion is key.",
                "year":
                2016,
                "links": {
                    "landing_page":
                    "https://doi.org/10.5281/zenodo.60147",
                    "publication": [
                        "http://onlinelibrary.wiley.com/doi/10.1002/anie.201608534/full"
                    ],
                    #"data_doi": "",
                    #"related_id": ,

                    #"data_link": {

                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #},
                },
            },

            #"mrr": {

            #},

            #"dc": {

            #},
        }
        ## End metadata
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    for data_file in tqdm(find_files(input_path, "log"),
                          desc="Processing files",
                          disable=not verbose):
        record = parse_ase(
            os.path.join(data_file["path"], data_file["filename"]),
            "gaussian-out")
        ## Metadata:record
        record_metadata = {
            "mdf": {
                "title":
                "Tricyclic Fused Cyclopropanes - " +
                record["chemical_formula"],
                "acl": ["public"],
                "composition":
                record["chemical_formula"],

                #                "tags": ,
                #                "description": ,
                # "raw": json.dumps(record),
                "links": {

                    #                    "landing_page": ,
                    #                    "publication": ,
                    #                    "data_doi": ,
                    #                    "related_id": ,
                    "log": {
                        "globus_endpoint":
                        "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host":
                        "https://data.materialsdatafacility.org",
                        "path":
                        "/collections/tricyclic_fused_cyclopropanes/" +
                        data_file["filename"],
                    },
                },

                #                "citation": ,

                #                "data_contact": {

                #                    "given_name": ,
                #                    "family_name": ,
                #                    "email": ,
                #                    "institution": ,

                #                    },

                #                "author": [{

                #                    "given_name": ,
                #                    "family_name": ,
                #                    "email": ,
                #                    "institution": ,

                #                    }],

                #                "year": ,
            },

            # "dc": {

            # },
        }
        ## End metadata

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and stop processing if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if not result["success"]:
            if not dataset_validator.cancel_validation()["success"]:
                print(
                    "Error cancelling validation. The partial feedstock may not be removed."
                )
            raise ValueError(result["message"] + "\n" +
                             result.get("details", ""))

    # You're done!
    if verbose:
        print("Finished converting")
Example #2
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    # NOTE: For fields that represent people (e.g. mdf-data_contact), other IDs can be added (ex. "github": "jgaff").
    #    It is recommended that all people listed in mdf-data_contributor have a github username listed.
    #
    # If there are other useful fields not covered here, another block (dictionary at the same level as "mdf") can be created for those fields.
    # The block must be called the same thing as the source_name for the dataset.
    if not metadata:
        ## Metadata:dataset
        dataset_metadata = {
            "mdf": {
                "title":
                "ChEMBL Database",
                "acl": ["public"],
                "source_name":
                "chembl_db",
                "data_contact": {
                    "given_name":
                    "John P.",
                    "family_name":
                    "Overington",
                    "email":
                    "*****@*****.**",
                    "institution":
                    "European Molecular Biology Laboratory European Bioinformatics Institute",
                },
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78",
                }],
                "citation": [
                    "A.P. Bento, A. Gaulton, A. Hersey, L.J. Bellis, J. Chambers, M. Davies, F.A. Krüger, Y. Light, L. Mak, S. McGlinchey, M. Nowotka, G. Papadatos, R. Santos and J.P. Overington (2014) 'The ChEMBL bioactivity database: an update.' Nucleic Acids Res., 42 1083-1090. DOI: 10.1093/nar/gkt1031 PMID: 24214965",
                    "M. Davies, M. Nowotka, G. Papadatos, F. Atkinson, G.J.P. van Westen, N Dedman, R. Ochoa and J.P. Overington  (2014) 'myChEMBL: A Virtual Platform for Distributing Cheminformatics Tools and Open Data' Challenges 5 (334-337) DOI: 10.3390/challe5020334",
                    "S. Jupp, J. Malone, J. Bolleman, M. Brandizi, M. Davies, L. Garcia, A. Gaulton, S. Gehant, C. Laibe, N. Redaschi, S.M Wimalaratne, M. Martin, N. Le Novère, H. Parkinson, E. Birney and A.M Jenkinson (2014) The EBI RDF Platform: Linked Open Data for the Life Sciences Bioinformatics 30 1338-1339 DOI: 10.1093/bioinformatics/btt765 PMID: 24413672"
                ],

                #"author": [],
                "license":
                "https://creativecommons.org/licenses/by-sa/3.0/",
                "collection":
                "ChEMBL db",
                "tags": ["SAR"],
                "description":
                "ChEMBL is a database of bioactive drug-like small molecules, it contains 2-D structures, calculated properties (e.g. logP, Molecular Weight, Lipinski Parameters, etc.) and abstracted bioactivities (e.g. binding constants, pharmacology and ADMET data).vThe data is abstracted and curated from the primary scientific literature, and cover a significant fraction of the SAR and discovery of modern drugs We attempt to normalise the bioactivities into a uniform set of end-points and units where possible, and also to tag the links between a molecular target and a published assay with a set of varying confidence levels. Additional data on clinical progress of compounds is being integrated into ChEMBL at the current time.",
                "year":
                2017,
                "links": {
                    "landing_page":
                    "https://www.ebi.ac.uk/chembl/downloads",
                    #"publication": [""],
                    "data_doi":
                    "ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/",
                    #"related_id": "",

                    #"data_link": {

                    #"globus_endpoint": ,
                    #"http_host": "",

                    #"path": "",

                    #},
                },
            },

            #"mrr": {

            #},

            #"dc": {

            #},
        }
        ## End metadata
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata

    # Set up multiprocessing
    md_files = multiprocessing.JoinableQueue()
    rc_out = multiprocessing.JoinableQueue()
    counter = multiprocessing.Value('i', 0)
    err_counter = multiprocessing.Value('i', 0)
    killswitch = multiprocessing.Value('i', 0)
    # Find all the sdf files
    sdf_list = [
        os.path.join(sdf["path"], sdf["filename"])
        for sdf in tqdm(find_files(input_path, "sdf$"),
                        desc="Finding files",
                        disable=not verbose)
    ]
    # Process to add data into queue
    adder = multiprocessing.Process(
        target=(lambda sdf_list: [md_files.put(sdf) for sdf in sdf_list]),
        args=(sdf_list, ))
    # Processes to process records from input queue to output queue
    processors = [
        multiprocessing.Process(target=process_chembl_db,
                                args=(md_files, rc_out, err_counter,
                                      killswitch))
        for i in range(NUM_PROCESSORS)
    ]
    # Processes to write data from output queue
    #    writers = [multiprocessing.Process(target=do_validation, args=(rc_out, dataset_validator, counter, killswitch)) for i in range(NUM_WRITERS)]
    w = multiprocessing.Process(target=do_validation,
                                args=(rc_out, dataset_validator, counter,
                                      killswitch))
    # Process to manage progress bar
    prog_bar = multiprocessing.Process(target=track_progress,
                                       args=(len(sdf_list), counter,
                                             err_counter, killswitch))

    # Start adder
    adder.start()
    # Start processors, writers, and progress bar after data is in queue
    while md_files.empty():
        sleep(1)
    for p in processors:
        p.start()
    #    [w.start() for w in writers]
    w.start()
    if verbose:
        prog_bar.start()

    # Wait on adder to finish
    adder.join()
    #print("ADDER JOINED")
    # Wait on both queues to be complete
    md_files.join()
    #print("MD_FILES JOINED")
    rc_out.join()
    #print("RC_OUT JOINED")
    # Trigger remote termination of processes without purpose
    killswitch.value = 1
    #print("KILLSWITCH AT 1")
    # Wait on all the processes to terminate
    for p in processors:
        p.join()
    #print("PROCESSORS JOINED")
    #    [w.join() for w in writers]
    w.join()
    #print("W JOINED")
    if prog_bar.is_alive():
        prog_bar.join()

    if verbose:
        print("Finished converting")
        print("There were", err_counter.value, "errors")
Example #3
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    # NOTE: For fields that represent people (e.g. mdf-data_contact), other IDs can be added (ex. "github": "jgaff").
    #    It is recommended that all people listed in mdf-data_contributor have a github username listed.
    #
    # If there are other useful fields not covered here, another block (dictionary at the same level as "mdf") can be created for those fields.
    # The block must be called the same thing as the source_name for the dataset.
    if not metadata:
        ## Metadata:dataset
        dataset_metadata = {
            "mdf": {

                "title": "Data for the article \"Performance of SCAN density functional method for a set of ionic liquids\"",
                "acl": ["public"],
                "source_name": "scan_ionic_liquids",

                "data_contact": {
                    
                    "given_name": "Vladislav",
                    "family_name": "Ivaništšev",
                    "email": "*****@*****.**",
                    "institution": "University of Tartu",

                },

                "data_contributor": [{
                    
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78",

                }],

                "citation": ["Karu, Karl, Ers, Heigo, Mišin, Maksim, Sun, Jianwei, & Ivaništšev, Vladislav. (2017). Data for the article \"Performance of SCAN density functional method for a set of ionic liquids\" [Data set]. Zenodo. http://doi.org/10.5281/zenodo.495089"],

                "author": [{

                    "given_name": "Karl",
                    "family_name": "Karu",
                    "institution": "University of Tartu",

                },
                {

                    "given_name": "Heigo",
                    "family_name": "Ers",
                    "institution": "University of Tartu",

                },
                {

                    "given_name": "Maksim",
                    "family_name": "Mišin",
                    "institution": "University of Tartu",

                },
                {

                    "given_name": "Jianwei",
                    "family_name": "Sun",
                    "institution": "The University of Texas at El Paso",

                },
                {

                    "given_name": "Vladislav",
                    "family_name": "Ivaništšev",
                    "email": "*****@*****.**",
                    "institution": "University of Tartu",

                }],

                #"license": "",
                "collection": "SCAN of Ionic Liquids",
                #"tags": [""],
                "description": "The repository (https://github.com/vilab-tartu/SCAN) contains the database, geometries and an illustrative ipython notebook supporting the article \"Performance of SCAN density functional method for a set of ionic liquids\". ",
                "year": 2017,

                "links": {

                    "landing_page": "https://doi.org/10.5281/zenodo.495089",
                    "publication": ["https://github.com/vilab-tartu/SCAN/tree/v.05"],
                    #"data_doi": "",
                    #"related_id": ,

                    #"data_link": {

                        #"globus_endpoint": ,
                        #"http_host": ,

                        #"path": ,
                        #},
                    },
                },

            #"mrr": {

                #},

            #"dc": {

                #},


        }
        ## End metadata
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")



    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)


    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    for data_file in tqdm(find_files(input_path, "xyz"), desc="Processing files", disable=not verbose):
        record = parse_ase(os.path.join(data_file["path"], data_file["filename"]), "xyz")
        ## Metadata:record
        record_metadata = {
            "mdf": {

                "title": "SCAN of Ionic Liquids - " + record["chemical_formula"],
                "acl": ["public"],
                "composition": record["chemical_formula"],

#                "tags": ,
#                "description": ,
               # "raw": json.dumps(record),

                "links": {

#                    "landing_page": ,
#                    "publication": ,
#                    "data_doi": ,
#                    "related_id": ,

                    "xyz": {

                        "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host": "https://data.materialsdatafacility.org",

                        "path": "/collections/scan_ionic_liquids/" + data_file["no_root_path"] + "/" + data_file["filename"],
                        },
                    
                    "json": {

                        "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host": "https://data.materialsdatafacility.org",

                        "path": "/collections/scan_ionic_liquids/database.json",
                        },
                    },

#                "citation": ,

#                "data_contact": {

#                    "given_name": ,
#                    "family_name": ,
#                    "email": ,
#                    "institution": ,

#                    },

#                "author": [{

#                    "given_name": ,
#                    "family_name": ,
#                    "email": ,
#                    "institution": ,

#                    }],

#                "year": ,

                },

           # "dc": {

           # },


        }
        ## End metadata

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and stop processing if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if not result["success"]:
            if not dataset_validator.cancel_validation()["success"]:
                print("Error cancelling validation. The partial feedstock may not be removed.")
            raise ValueError(result["message"] + "\n" + result.get("details", ""))


    # You're done!
    if verbose:
        print("Finished converting")
Example #4
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    # NOTE: For fields that represent people (e.g. mdf-data_contact), other IDs can be added (ex. "github": "jgaff").
    #    It is recommended that all people listed in mdf-data_contributor have a github username listed.
    #
    # If there are other useful fields not covered here, another block (dictionary at the same level as "mdf") can be created for those fields.
    # The block must be called the same thing as the source_name for the dataset.
    if not metadata:
        ## Metadata:dataset
        dataset_metadata = {
            "mdf": {
                "title":
                "PanDDA analysis of JMJD2D screened against Zenobia Fragment Library",
                "acl": ["public"],
                "source_name":
                "pandda_zenobia_fragment",
                "data_contact": {
                    "given_name": "Frank",
                    "family_name": "von Delft",
                    "email": "*****@*****.**",
                    "institution": "University of Oxford",
                },
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78",
                }],
                "citation": [
                    "Pearce, N., Bradley, A., Marsden, B. D., & von Delft, F. (2016). PanDDA analysis of JMJD2D screened against Zenobia Fragment Library [Data set]. Zenodo. http://doi.org/10.5281/zenodo.48770"
                ],
                "author": [{
                    "given_name": "Nicholas",
                    "family_name": "Pearce",
                    "institution": "University of Oxford",
                }, {
                    "given_name": "Anthony",
                    "family_name": "Bradley",
                    "institution": "University of Oxford",
                }, {
                    "given_name": "Brian D",
                    "family_name": "Marsden",
                    "institution": "University of Oxford",
                }, {
                    "given_name": "Frank",
                    "family_name": "von Delft",
                    "email": "*****@*****.**",
                    "institution": "University of Oxford",
                }],
                "license":
                "https://creativecommons.org/licenses/by-sa/4.0/",
                "collection":
                "PanDDA Zenobia Fragment",
                "tags": [
                    "PANDDA", "Fragment Screening by X-ray Crystallography",
                    "Structural Genomics Consortium (SGC)",
                    "Diamond Light Source I04-1"
                ],
                "description":
                "De-methylase JMJD2D screened against the Zenobia Fragment Library by X-ray Crystallography.",
                "year":
                2016,
                "links": {
                    "landing_page":
                    "https://doi.org/10.5281/zenodo.48770",
                    "publication":
                    ["https://www.nature.com/articles/ncomms15123#ref33"],
                    #"data_doi": "",
                    #"related_id": "",

                    #"data_link": {

                    #"globus_endpoint": ,
                    #"http_host": "",

                    #"path": "",

                    #},
                },
            },

            #"mrr": {

            #},

            #"dc": {

            #},
        }
        ## End metadata
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    for data_file in tqdm(find_files(input_path, "pdb"),
                          desc="Processing Files",
                          disable=not verbose):
        record = parse_ase(
            os.path.join(data_file["path"], data_file["filename"]),
            "proteindatabank")
        ## Metadata:record
        record_metadata = {
            "mdf": {
                "title":
                "PanDDA Zenobia Fragment Library - " +
                record["chemical_formula"],
                "acl": ["public"],
                "composition":
                record["chemical_formula"],

                #"tags": ,
                #"description": ,
                #"raw": ,
                "links": {

                    #"landing_page": ,
                    #"publication": ,
                    #"data_doi": ,
                    #"related_id": ,
                    "pdb": {
                        "globus_endpoint":
                        "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host":
                        "https://data.materialsdatafacility.org",
                        "path":
                        "/collections/pandda_zenobia_fragment/" +
                        data_file["no_root_path"] + "/" +
                        data_file["filename"],
                    },
                },

                #"citation": ,

                #"data_contact": {

                #"given_name": ,
                #"family_name": ,
                #"email": ,
                #"institution": ,

                #},

                #"author": [{

                #"given_name": ,
                #"family_name": ,
                #"email": ,
                #"institution": ,

                #}],

                #"year": ,
            },

            #"dc": {

            #},
        }
        ## End metadata
        if "ligand" in data_file["filename"]:
            record_metadata["mdf"]["links"]["cif"] = {
                "globus_endpoint":
                "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                "http_host":
                "https://data.materialsdatafacility.org",
                "path":
                "/collections/pandda_zenobia_fragment/" +
                data_file["no_root_path"] + "/ligand.cif",
            }
        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and stop processing if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if not result["success"]:
            if not dataset_validator.cancel_validation()["success"]:
                print(
                    "Error cancelling validation. The partial feedstock may not be removed."
                )
            raise ValueError(result["message"] + "\n" +
                             result.get("details", ""))

    # TODO: Save your converter as [mdf-source_name]_converter.py
    # You're done!
    if verbose:
        print("Finished converting")
Example #5
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title":
                "The American Mineralogist Crystal Structure Database",
                "acl": ["public"],
                "source_name":
                "amcs",
                "citation": [
                    "Downs, R.T. and Hall-Wallace, M. (2003) The American Mineralogist Crystal Structure Database. American Mineralogist 88, 247-250."
                ],
                "data_contact": {
                    "given_name": "Robert",
                    "family_name": "Downs",
                    "email": "*****@*****.**",
                    "institution": "University of Arizona"

                    # IDs
                },
                "author": [{
                    "given_name": "Robert",
                    "family_name": "Downs",
                    "email": "*****@*****.**",
                    "institution": "University of Arizona"
                }, {
                    "given_name": "Michelle",
                    "family_name": "Hall-Wallace",
                    "institution": "University of Arizona"
                }],

                #            "license": ,
                "collection":
                "AMCS",
                "tags": ["crystal structure", "minerals"],
                "description":
                "A crystal structure database that includes every structure published in the American Mineralogist, The Canadian Mineralogist, European Journal of Mineralogy and Physics and Chemistry of Minerals, as well as selected datasets from other journals.",
                "year":
                2003,
                "links": {
                    "landing_page":
                    "http://rruff.geo.arizona.edu/AMS/amcsd.php",

                    #                "publication": ,
                    #                "dataset_doi": ,

                    #                "related_id": ,

                    # data links: {

                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #}
                },

                #           "mrr": ,
                "data_contributor": [{
                    "given_name": "Jonathon",
                    "family_name": "Gaff",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "jgaff"
                }]
            }
        }
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    dataset_validator = Validator(dataset_metadata)

    # Get the data
    for cif in tqdm(find_files(root=input_path,
                               file_pattern=".cif",
                               verbose=verbose),
                    desc="Processing files",
                    disable=not verbose):
        cif_data = parse_pymatgen(os.path.join(cif["path"],
                                               cif["filename"]))["structure"]
        if cif_data:
            with open(os.path.join(cif["path"], cif["filename"])) as cif_file:
                cif_file.readline()
                mineral_name = cif_file.readline().split("'")[1]
            link = "http://rruff.geo.arizona.edu/AMS/minerals/" + mineral_name
            clink = "/AMS/xtal_data/CIFfiles/" + cif["filename"]
            dlink = "/AMS/xtal_data/DIFfiles/" + cif["filename"].replace(
                ".cif", ".txt")
            record_metadata = {
                "mdf": {
                    "title": "AMCS - " + mineral_name,
                    "acl": ["public"],
                    "tags": [mineral_name],
                    #                "description": ,
                    "composition": cif_data["material_composition"],
                    #                "raw": ,
                    "links": {
                        "landing_page": link,

                        #                    "publication": ,
                        #                    "dataset_doi": ,

                        #                    "related_id": ,
                        "cif": {
                            "http_host": "http://rruff.geo.arizona.edu",
                            "path": clink,
                        },
                        "dif": {
                            "http_host": "http://rruff.geo.arizona.edu",
                            "path": dlink
                        }
                    },

                    #                "citation": ,
                    #                "data_contact": {

                    #                    "given_name": ,
                    #                    "family_name": ,

                    #                    "email": ,
                    #                    "institution":,

                    # IDs
                    #                },

                    #                "author": ,

                    #                "license": ,
                    #                "collection": ,
                    #                "data_format": ,
                    #                "data_type": ,
                    #                "year": ,

                    #                "mrr":

                    #            "processing": ,
                    #            "structure":,
                }
            }

            # Pass each individual record to the Validator
            result = dataset_validator.write_record(record_metadata)

            # Check if the Validator accepted the record, and print a message if it didn't
            # If the Validator returns "success" == True, the record was written successfully
            if result["success"] is not True:
                print("Error:", result["message"])

    if verbose:
        print("Finished converting")
Example #6
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    # NOTE: For fields that represent people (e.g. mdf-data_contact), other IDs can be added (ex. "github": "jgaff").
    #    It is recommended that all people listed in mdf-data_contributor have a github username listed.
    #
    # If there are other useful fields not covered here, another block (dictionary at the same level as "mdf") can be created for those fields.
    # The block must be called the same thing as the source_name for the dataset.
    if not metadata:
        ## Metadata:dataset
        dataset_metadata = {
            "mdf": {

                "title": "The ‘SAR Matrix’ method and its extensions for applications in medicinal chemistry and chemogenomics",
                "acl": ["public"],
                "source_name": "sar_chemogenomics",

                "data_contact": {
                    
                    "given_name": "Jürgen",
                    "family_name": "Bajorath",
                    "email": "*****@*****.**",
                    "institution": "Rheinische Friedrich-Wilhelms-Universität",

                },

                "data_contributor": [{
                    
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78",

                }],

                "citation": ["Gupta-Ostermann, D., & Bajorath, J. (2014). The ‘SAR Matrix’ method and its extensions for applications in medicinal chemistry and chemogenomics [Data set]. F1000Research. Zenodo. http://doi.org/10.5281/zenodo.10457"],

                "author": [{

                    "given_name": "Disha",
                    "family_name": "Gupta-Ostermann",
                    "institution": "Rheinische Friedrich-Wilhelms-Universität",

                },
                {

                    "given_name": "Jürgen",
                    "family_name": "Bajorath",
                    "email": "*****@*****.**",
                    "institution": "Rheinische Friedrich-Wilhelms-Universität",

                }],

                "license": "https://creativecommons.org/publicdomain/zero/1.0/",
                "collection": "SAR Chemogenomics",
                #"tags": [""],
                "description": "We describe the ‘Structure-Activity Relationship (SAR) Matrix’ (SARM) methodology that is based upon a special two-step application of the matched molecular pair (MMP) formalism. The SARM method has originally been designed for the extraction, organization, and visualization of compound series and associated SAR information from compound data sets.",
                "year": 2014,

                "links": {

                    "landing_page": "https://doi.org/10.5281/zenodo.10457",
                    "publication": ["https://f1000research.com/articles/3-113/v2"],
                    #"data_doi": "",
                    #"related_id": ,

                    "zip": {

                        #"globus_endpoint": ,
                        "http_host": "https://zenodo.org",

                        "path": "/record/10457/files/Cpd_data_sets.zip",
                        },
                    },
                },

            #"mrr": {

                #},

            #"dc": {

                #},


        }
        ## End metadata
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")



    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)


    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    errors = 0
    for data_file in tqdm(find_files(input_path, "sdf"), desc="Processing files", disable=not verbose):
        try:
            record = parse_ase(os.path.join(data_file["path"], data_file["filename"]), "sdf")
        except Exception:
            errors += 1
            continue  # skip unparseable files; "record" would otherwise be stale or undefined below
        ## Metadata:record
        record_metadata = {
            "mdf": {

                "title": "SAR Chemogenomics - " + record["chemical_formula"],
                "acl": ["public"],
                "composition": record["chemical_formula"],

#                "tags": ,
#                "description": ,
                #"raw": json.dumps(record),

                "links": {

#                    "landing_page": ,
#                    "publication": ,
#                    "data_doi": ,
#                    "related_id": ,

                    "sdf": {

                        "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host": "https://data.materialsdatafacility.org",

                        "path": "/collections/sar_chemogenoomics/" + data_file["no_root_path"] + "/" + data_file["filename"],
                        },
                    },

#                "citation": ,

#                "data_contact": {

#                    "given_name": ,
#                    "family_name": ,
#                    "email": ,
#                    "institution": ,

#                    },

#                "author": [{

#                    "given_name": ,
#                    "family_name": ,
#                    "email": ,
#                    "institution": ,

#                    }],

#                "year": ,

                },

           # "dc": {

           # },


        }
        ## End metadata

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and stop processing if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if not result["success"]:
            if not dataset_validator.cancel_validation()["success"]:
                print("Error cancelling validation. The partial feedstock may not be removed.")
            raise ValueError(result["message"] + "\n" + result.get("details", ""))


    # You're done!
    if verbose:
        print("ERRORS: " + str(errors))
        print("Finished converting")
Example #7
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    # NOTE: For fields that represent people (e.g. mdf-data_contact), other IDs can be added (ex. "github": "jgaff").
    #    It is recommended that all people listed in mdf-data_contributor have a github username listed.
    #
    # If there are other useful fields not covered here, another block (dictionary at the same level as "mdf") can be created for those fields.
    # The block must be called the same thing as the source_name for the dataset.
    if not metadata:
        ## Metadata:dataset
        dataset_metadata = {
            "mdf": {
                "title":
                "Cys-Scanning Disulfide Crosslinking and Bayesian Modeling Probe the Transmembrane Signaling Mechanism of the Histidine Kinase, PhoQ",
                "acl": ["public"],
                "source_name":
                "cys_scanning_phoq",
                "data_contact": {
                    "given_name": "William F",
                    "family_name": "DeGrado",
                    "email": "*****@*****.**",
                    "institution": "University of California, San Francisco",
                },
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78",
                }],
                "citation": [
                    "Molnar, K. S., Bonomi, M., Pellarin, R., Clinthorne, G. D., Gonzalez, G., Goldberg, S. D., … DeGrado, W. F. (2014). Multi-state modeling of the PhoQ two-component system [Data set]. Structure. Zenodo. http://doi.org/10.5281/zenodo.46600"
                ],
                "author": [{
                    "given_name": "Kathleen S",
                    "family_name": "Molnar",
                    "institution": "University of Pennsylvania",
                }, {
                    "given_name":
                    "Massimiliano",
                    "family_name":
                    "Bonomi",
                    "institution":
                    "University of California, San Francisco",
                }, {
                    "given_name":
                    "Riccardo",
                    "family_name":
                    "Pellarin",
                    "institution":
                    "University of California, San Francisco",
                }, {
                    "given_name": "Graham D",
                    "family_name": "Clinthorne",
                    "institution": "University of Pennsylvania",
                }, {
                    "given_name": "Gabriel",
                    "family_name": "Gonzalez",
                    "institution": "University of Pennsylvania",
                }, {
                    "given_name": "Shalom D",
                    "family_name": "Goldberg",
                    "institution": "University of Pennsylvania",
                }, {
                    "given_name": "Mark",
                    "family_name": "Goulian",
                    "institution": "University of Pennsylvania",
                }, {
                    "given_name":
                    "Andrej",
                    "family_name":
                    "Sali",
                    "institution":
                    "University of California, San Francisco",
                }, {
                    "given_name":
                    "William F",
                    "family_name":
                    "DeGrado",
                    "email":
                    "*****@*****.**",
                    "institution":
                    "University of California, San Francisco",
                }],
                "license":
                "http://www.opensource.org/licenses/LGPL-2.1",
                "collection":
                "Cys-Scanning PhoQ",
                "tags": [
                    "Integrative Modeling Platform (IMP)",
                    "Cysteine crosslinks", "Multi-state"
                ],
                "description":
                "Bacteria transduce signals across the membrane using two-component systems (TCSs), consisting of a membrane-spanning sensor histidine kinase and a cytoplasmic response regulator. In gram-negative bacteria, the PhoPQ TCS senses cations and antimicrobial peptides, yet little is known about the structural changes involved in transmembrane signaling. We construct a model of PhoQ signal transduction using Bayesian inference, based on disulfide crosslinking data and homologous crystal structures.",
                "year":
                2014,
                "links": {
                    "landing_page": "https://doi.org/10.5281/zenodo.46600",
                    "publication":
                    ["https://doi.org/10.1016/j.str.2014.04.019"],
                    #"data_doi": "",
                    #"related_id": "",
                    "zip": {

                        #"globus_endpoint": ,
                        "http_host": "https://zenodo.org",
                        "path": "/record/46600/files/phoq-v1.0.zip",
                    },
                },
            },

            #"mrr": {

            #},

            #"dc": {

            #},
        }
        ## End metadata
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    for data_file in tqdm(find_files(input_path, "pdb"),
                          desc="Processing Files",
                          disable=not verbose):
        if "data" not in data_file[
                "no_root_path"]:  #frame files are under pqr format which currently we do not have a file reader
            continue
        record = parse_ase(
            os.path.join(data_file["path"], data_file["filename"]),
            "proteindatabank")
        ## Metadata:record
        record_metadata = {
            "mdf": {
                "title": "Cys-Scanning PhoQ - " + record["chemical_formula"],
                "acl": ["public"],
                "composition": record["chemical_formula"],

                #"tags": ,
                #"description": ,
                #"raw": ,
                "links": {

                    #"landing_page": ,
                    #"publication": ,
                    #"data_doi": ,
                    #"related_id": ,
                    "pdb": {
                        "globus_endpoint":
                        "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host":
                        "https://data.materialsdatafacility.org",
                        "path":
                        "/collections/cys_scanning_phoq/" +
                        data_file["no_root_path"] + "/" +
                        data_file["filename"],
                    },
                },

                #"citation": ,

                #"data_contact": {

                #"given_name": ,
                #"family_name": ,
                #"email": ,
                #"institution": ,

                #},

                #"author": [{

                #"given_name": ,
                #"family_name": ,
                #"email": ,
                #"institution": ,

                #}],

                #"year": ,
            },

            #"dc": {

            #},
        }
        ## End metadata

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and stop processing if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if not result["success"]:
            if not dataset_validator.cancel_validation()["success"]:
                print(
                    "Error cancelling validation. The partial feedstock may not be removed."
                )
            raise ValueError(result["message"] + "\n" +
                             result.get("details", ""))

    # You're done!
    if verbose:
        print("Finished converting")
Example #8
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title":
                "NIST X-Ray Transition Energies Database",
                "acl": ["public"],
                "source_name":
                "nist_xray_tran_en_db",
                "citation": [
                    "http://physics.nist.gov/PhysRefData/XrayTrans/Html/refs.html"
                ],
                "data_contact": {
                    "given_name": "Lawrence",
                    "family_name": "Hudson",
                    "email": "*****@*****.**",
                    "institution":
                    "National Institute of Standards and Technology"
                },

                #            "author": ,

                #            "license": ,
                "collection":
                "NIST X-Ray Transition Energies",
                "tags": ["Radiation", "Spectroscopy", "Reference data"],
                "description":
                "This x-ray transition table provides the energies for K transitions connecting the K shell (n = 1) to the shells with principal quantum numbers n = 2 to 4 and L transitions connecting the L1, L2, and L3 shells (n = 2) to the shells with principal quantum numbers n = 3 and 4. The elements covered include Z = 10, neon to Z = 100, fermium. There are two unique features of this database: (1) all experimental values are on a scale consistent with the International System of measurement (the SI) and the numerical values are determined using constants from the Recommended Values of the Fundamental Physical Constants: 1998 [115] and (2) accurate theoretical estimates are included for all transitions. The user will find that for many of the transitions, the experimental and theoretical values are very consistent. It is our hope that the theoretical values will provide a useful estimate for missing or poorly measured experimental values.",
                "year":
                2003,
                "links": {
                    "landing_page":
                    "https://www.nist.gov/pml/x-ray-transition-energies-database",

                    #                "publication": ,
                    #                "dataset_doi": ,

                    #                "related_id": ,

                    # data links: {

                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #}
                },

                #            "mrr": ,
                "data_contributor": {
                    "given_name": "Jonathon",
                    "family_name": "Gaff",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "jgaff"
                }
            }
        }
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    dataset_validator = Validator(dataset_metadata)

    # Get the data
    headers = [
        'element', 'A', 'transition', 'theory_(eV)', 'theory_uncertainty_(eV)',
        'direct_(eV)', 'direct_uncertainty_(eV)', 'combined_(eV)',
        'combined_uncertainty_(eV)', 'vapor_(eV)', 'vapor_uncertainty_(eV)',
        'blend', 'reference'
    ]
    with open(os.path.join(input_path, "xray_tran_en_db.txt")) as in_file:
        raw_data = in_file.read()
    for record in tqdm(parse_tab(raw_data, sep="\t", headers=headers),
                       desc="Processing data",
                       disable=not verbose):
        record_metadata = {
            "mdf": {
                "title": "X-Ray Transition - " + record["element"],
                "acl": ["public"],

                #            "tags": ,
                #            "description": ,
                "composition": record["element"],
                "raw": json.dumps(record),
                "links": {
                    "landing_page":
                    "http://physics.nist.gov/PhysRefData/XrayTrans/Html/search.html",

                    #                "publication": ,
                    #                "dataset_doi": ,

                    #                "related_id": ,

                    # data links: {

                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #},
                },

                #            "citation": ,
                #            "data_contact": {

                #                "given_name": ,
                #                "family_name": ,

                #                "email": ,
                #                "institution":,

                # IDs
                #                },

                #            "author": ,

                #            "license": ,
                #            "collection": ,
                #            "data_format": ,
                #            "data_type": ,
                #            "year": ,

                #            "mrr":

                #            "processing": ,
                #            "structure":,
            }
        }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"])

    if verbose:
        print("Finished converting")
Example #9
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title":
                "A compilation of ab-initio calculations of embrittling potencies in binary metallic alloys",
                "acl": ['public'],
                "source_name":
                "binary_metallic_alloys_ab_initio",
                "citation": [
                    "Gibson, Michael A., and Christopher A. Schuh. “A Compilation of Ab-Initio Calculations of Embrittling Potencies in Binary Metallic Alloys.” Data in Brief 6 (2016): 143–148. PMC. Web. 29 June 2017."
                ],
                "data_contact": {
                    "given_name": "Michael A.",
                    "family_name": "Gibson",
                    "email": "*****@*****.**",
                    "instituition": "Massachusetts Institute of Technology"
                },
                "author": [{
                    "given_name":
                    "Michael A.",
                    "family_name":
                    "Gibson",
                    "email":
                    "*****@*****.**",
                    "instituition":
                    "Massachusetts Institute of Technology"
                }, {
                    "given_name":
                    "Christopher A.",
                    "family_name":
                    "Schuh",
                    "email":
                    "*****@*****.**",
                    "instituition":
                    "Massachusetts Institute of Technology"
                }],
                "license":
                "http://creativecommons.org/licenses/by/4.0/",
                "collection":
                "Binary Metallic Alloys Ab-initio Calculations",
                "tags": [
                    "Grain Boundary Segregation", "Embrittlement",
                    "Ab-Initio Calculation", "Surface", "Segregation",
                    "Fracture"
                ],
                "description":
                "Segregation-induced changes in interfacial cohesion often control the mechanical properties of metals. The change in the work of separation of an interface upon segregation of a solute to the interface, termed the embrittling potency, is an atomic-level quantity used to predict and understand embrittlement phenomena. We present a compilation of calculations of embrittling potencies, along with references for these calculations.",
                "year":
                2015,
                "links": {
                    "landing_page":
                    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4706572/",

                    #"publication": ,
                    # "data_doi": ,

                    # "related_id": ,

                    # data links: {

                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #}
                },

                #            "mrr": ,
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78"
                }]
            }
        }

    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    for data_file in find_files(input_path, "csv"):
        with open(os.path.join(data_file["path"], data_file["filename"]),
                  'r') as raw_in:
            total_data_lst = raw_in.readlines()
            #remove first line descriptions
            total_data = "".join(total_data_lst[1:])
        for record in tqdm(parse_tab(total_data),
                           desc="Processing file: " + data_file["filename"],
                           disable=not verbose):
            comp = record["Solvent"] + record["Solute"]
            record_metadata = {
                "mdf": {
                    "title": "Binary Metallic Alloys Ab-initio - " + comp,
                    "acl": ['public'],

                    #            "tags": ,
                    #            "description": ,
                    "composition": comp,
                    #            "raw": ,
                    "links": {
                        #                "landing_page": ,

                        #                "publication": ,
                        #                "data_doi": ,

                        #                "related_id": ,
                        "csv": {
                            "globus_endpoint":
                            "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                            "http_host":
                            "https://data.materialsdatafacility.org",
                            "path":
                            "/collections/binary_metallic_alloys_ab_initio/" +
                            data_file["filename"],
                        },
                    },

                    #            "citation": ,
                    #            "data_contact": {

                    #                "given_name": ,
                    #                "family_name": ,

                    #                "email": ,
                    #                "institution":,

                    #                },

                    #            "author": ,

                    #            "license": ,
                    #            "collection": ,
                    #            "year": ,

                    #            "mrr":

                    #            "processing": ,
                    #            "structure":,
                }
            }

            # Pass each individual record to the Validator
            result = dataset_validator.write_record(record_metadata)

            # Check if the Validator accepted the record, and print a message if it didn't
            # If the Validator returns "success" == True, the record was written successfully
            if result["success"] is not True:
                print("Error:", result["message"])

    # You're done!
    if verbose:
        print("Finished converting")
Example #10
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    # TODO: Make sure the metadata is present in some form.
    # Fields can be:
    #    REQ (Required, must be present)
    #    RCM (Recommended, should be present if possible)
    #    OPT (Optional, can be present if useful)
    # NOTE: For fields that represent people (e.g. mdf-data_contact), other IDs can be added (ex. "github": "jgaff").
    #    It is recommended that all people listed in mdf-data_contributor have a github username listed.
    #
    # If there are other useful fields not covered here, another block (dictionary at the same level as "mdf") can be created for those fields.
    # The block must be called the same thing as the source_name for the dataset.
    if not metadata:
        ## Metadata:dataset
        dataset_metadata = {
            # REQ dictionary: MDF-format dataset metadata
            "mdf": {

                # REQ string: The title of the dataset
                "title": ,

                # REQ list of strings: The UUIDs allowed to view this metadata, or 'public'
                "acl": ,

                # REQ string: A short version of the dataset name, for quick reference. Spaces and dashes will be replaced with underscores, and other non-alphanumeric characters will be removed.
                "source_name": ,

                # REQ dictionary: The contact person/steward/custodian for the dataset
                "data_contact": {

                    # REQ string: The person's given (or first) name
                    "given_name": ,

                    # REQ string: The person's family (or last) name
                    "family_name": ,

                    # REQ string: The person's email address
                    "email": ,

                    # RCM string: The primary affiliation for the person
                    "institution": ,

                },

                # REQ list of dictionaries: The person/people contributing the tools (harvester, this converter) to ingest the dataset
                "data_contributor": [{

                    # REQ string: The person's given (or first) name
                    "given_name": ,

                    # REQ string: The person's family (or last) name
                    "family_name": ,

                    # REQ string: The person's email address
                    "email": ,

                    # RCM string: The primary affiliation for the person
                    "institution": ,

                    # RCM string: The person's GitHub username
                    "github": ,


                }],

                # RCM list of strings: The full bibliographic citation(s) for the dataset
                "citation": ,

                # RCM list of dictionaries: A list of the authors of this dataset
                "author": [{

                    # REQ string: The person's given (or first) name
                    "given_name": ,

                    # REQ string: The person's family (or last) name
                    "family_name": ,

                    # RCM string: The person's email address
                    "email": ,

                    # RCM string: The primary affiliation for the person
                    "institution": ,


                }],

                # RCM string: A link to the license for distribution of the dataset
                "license": ,

                # RCM string: The repository (that should already be in MDF) holding the dataset
                "repository": ,

                # RCM string: The collection for the dataset, commonly a portion of the title
                "collection": ,

                # RCM list of strings: Tags, keywords, or other general descriptors for the dataset
                "tags": ,

                # RCM string: A description of the dataset
                "description": ,

                # RCM integer: The year of dataset creation
                "year": ,

                # REQ dictionary: Links relating to the dataset
                "links": {

                    # REQ string: The human-friendly landing page for the dataset
                    "landing_page": ,

                    # RCM list of strings: The DOI(s) (in link form, ex. 'https://dx.doi.org/10.12345') for publications connected to the dataset
                    "publication": ,

                    # RCM string: The DOI of the dataset itself (in link form)
                    "data_doi": ,

                    # OPT list of strings: The mdf-id(s) of related entries, not including records from this dataset
                    "related_id": ,

                    # RCM dictionary: Links to raw data files from the dataset (multiple allowed, field name should be data type)
                    "data_link": {

                        # RCM string: The ID of the Globus Endpoint hosting the file
                        "globus_endpoint": ,

                        # RCM string: The fully-qualified HTTP hostname, including protocol, but without the path (for example, 'https://data.materialsdatafacility.org')
                        "http_host": ,

                        # REQ string: The full path to the data file on the host
                        "path": ,

                    },

                },

            },

            # OPT dictionary: DataCite-format metadata
            "dc": {

            },


        }
        ## End metadata
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")



    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)


    # Get the data
    # TODO: Write the code to convert your dataset's records into JSON-serializable Python dictionaries
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    for record in your_records:
        # Fields can be:
        #    REQ (Required, must be present)
        #    RCM (Recommended, should be present if possible)
        #    OPT (Optional, can be present if useful)
        ## Metadata:record
        record_metadata = {
            # REQ dictionary: MDF-format record metadata
            "mdf": {

                # REQ string: The title of the record
                "title": ,

                # RCM list of strings: The UUIDs allowed to view this metadata, or 'public' (defaults to the dataset ACL)
                "acl": ,

                # RCM string: Subject material composition, expressed in a chemical formula (ex. Bi2S3)
                "composition": ,

                # RCM list of strings: Tags, keywords, or other general descriptors for the record
                "tags": ,

                # RCM string: A description of the record
                "description": ,

                # RCM string: The record as a JSON string (see json.dumps())
                "raw": ,

                # REQ dictionary: Links relating to the record
                "links": {

                    # RCM string: The human-friendly landing page for the record (defaults to the dataset landing page)
                    "landing_page": ,

                    # RCM list of strings: The DOI(s) (in link form, ex. 'https://dx.doi.org/10.12345') for publications specific to this record
                    "publication": ,

                    # RCM string: The DOI of the record itself (in link form)
                    "data_doi": ,

                    # OPT list of strings: The mdf-id(s) of related entries, not including the dataset entry
                    "related_id": ,

                    # RCM dictionary: Links to raw data files from the dataset (multiple allowed, field name should be data type)
                    "data_link": {

                        # RCM string: The ID of the Globus Endpoint hosting the file
                        "globus_endpoint": ,

                        # RCM string: The fully-qualified HTTP hostname, including protocol, but without the path (for example, 'https://data.materialsdatafacility.org')
                        "http_host": ,

                        # REQ string: The full path to the data file on the host
                        "path": ,

                    },

                },

                # OPT list of strings: The full bibliographic citation(s) for the record, if different from the dataset
                "citation": ,

                # OPT dictionary: The contact person/steward/custodian for the record, if different from the dataset
                "data_contact": {

                    # REQ string: The person's given (or first) name
                    "given_name": ,

                    # REQ string: The person's family (or last) name
                    "family_name": ,

                    # REQ string: The person's email address
                    "email": ,

                    # RCM string: The primary affiliation for the person
                    "institution": ,

                },

                # OPT list of dictionaries: A list of the authors of this record, if different from the dataset
                "author": [{

                    # REQ string: The person's given (or first) name
                    "given_name": ,

                    # REQ string: The person's family (or last) name
                    "family_name": ,

                    # RCM string: The person's email address
                    "email": ,

                    # RCM string: The primary affiliation for the person
                    "institution": ,


                }],

                # OPT integer: The year of dataset creation, if different from the dataset
                "year": ,

            },

            # OPT dictionary: DataCite-format metadata
            "dc": {

            },


        }
        ## End metadata

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"])

    # You're done!
    if verbose:
        print("Finished converting")
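
A hedged usage sketch for the template above, once its blank REQ/RCM/OPT fields are filled in. The paths and metadata values below are hypothetical; convert() accepts None (use the built-in metadata), a JSON string, a path to a JSON file, or an already-parsed dict.

convert("/path/to/dataset")                                      # built-in metadata
convert("/path/to/dataset", metadata='{"mdf": {"title": "T"}}')  # JSON string
convert("/path/to/dataset", metadata="metadata.json")            # path to a JSON file
convert("/path/to/dataset", metadata={"mdf": {"title": "T"}}, verbose=True)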
Example #11
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf-title":
            "QSAR biodegradation Data Set",
            "mdf-acl": ["public"],
            "mdf-source_name":
            "qsar_biodeg",
            "mdf-citation": [
                "Mansouri, K., Ringsted, T., Ballabio, D., Todeschini, R., Consonni, V. (2013). Quantitative Structure - Activity Relationship models for ready biodegradability of chemicals. Journal of Chemical Information and Modeling, 53, 867-878",
                "Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science."
            ],
            "mdf-data_contact": {
                "given_name": "Davide",
                "family_name": "Ballabio",
                "email": "*****@*****.**",
                "institution": "Università degli Studi di Milano-Bicocca",
            },
            "mdf-author": [{
                "given_name":
                "Davide",
                "family_name":
                "Ballabio",
                "email":
                "*****@*****.**",
                "institution":
                "Università degli Studi di Milano-Bicocca",
            }, {
                "given_name":
                "Kamel",
                "family_name":
                "Mansouri",
                "institution":
                "Università degli Studi di Milano-Bicocca",
            }, {
                "given_name":
                "Tine",
                "family_name":
                "Ringsted",
                "institution":
                "Università degli Studi di Milano-Bicocca",
            }, {
                "given_name":
                "Roberto",
                "family_name":
                "Todeschini",
                "institution":
                "Università degli Studi di Milano-Bicocca",
            }, {
                "given_name":
                "Viviana",
                "family_name":
                "Consonni",
                "institution":
                "Università degli Studi di Milano-Bicocca",
            }],

            #            "mdf-license": ,
            "mdf-collection":
            "QSAR Biodegradation Data Set",
            "mdf-data_format":
            "csv",
            "mdf-data_type":
            "Biodegradation",
            "mdf-tags": ["biodegredation", "Chemometrics"],
            "mdf-description":
            "Data set containing values for 41 attributes (molecular descriptors) used to classify 1055 chemicals into 2 classes (ready and not ready biodegradable).",
            "mdf-year":
            2013,
            "mdf-links": {
                "mdf-landing_page":
                "https://archive.ics.uci.edu/ml/datasets/QSAR+biodegradation",

                #                "mdf-publication": ,
                #                "mdf-dataset_doi": ,

                #                "mdf-related_id": ,

                # data links: {

                #"globus_endpoint": ,
                #"http_host": ,

                #"path": ,
                #}
            },

            #           "mdf-mrr": ,
            "mdf-data_contributor": [{
                "given_name": "Evan",
                "family_name": "Pike",
                "email": "*****@*****.**",
                "institution": "The University of Chicago",
                "github": "dep78"
            }, {
                "given_name": "Jonathon",
                "family_name": "Gaff",
                "email": "*****@*****.**",
                "institution": "The University of Chicago",
                "github": "jgaff"
            }]
        }
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    dataset_validator = Validator(dataset_metadata)

    # Get the data
    i = 1
    headers = [
        "SpMax_L", "J_Dz(e)", "nHM", "F01[N-N]", "F04[C-N]", "NssssC", "nCb-",
        "C%", "nCp", "nO", "F03[C-N]", "SdssC", "HyWi_B(m)", "LOC", " SM6_L",
        "F03[C-O]", "Me", "Mi", "nN-N", "nArNO2", "nCRX3", "SpPosA_B(p)",
        "nCIR", "B01[C-Br]", "B03[C-Cl]", "N-073", "SpMax_A", "Psi_i_1d",
        "B04[C-Br]", "SdO", "TI2_L", "nCrt", "C-026", "F02[C-N]", "nHDon",
        "SpMax_B(m)", "Psi_i_A", "nN", "SM6_B(m)", " nArCOOR", "nX",
        "experimental class"
    ]
    with open(os.path.join(input_path, "biodeg.csv"), 'r') as raw_in:
        for row_data in tqdm(parse_tab(raw_in.read(), sep=";",
                                       headers=headers),
                             desc="Processing data",
                             disable=not verbose):
            record = []
            for key, value in row_data.items():
                record.append(key + ": " + value)
            record_metadata = {
                "mdf-title": "QSAR Biodegradation #" + str(i),
                "mdf-acl": ["public"],

                #            "mdf-tags": ,
                #            "mdf-description": ,

                #            "mdf-composition": ,
                "mdf-raw": json.dumps(record),
                "mdf-links": {
                    #                "mdf-landing_page": ,

                    #                "mdf-publication": ,
                    #                "mdf-dataset_doi": ,

                    #                "mdf-related_id": ,
                    "csv": {
                        "globus_endpoint":
                        "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host": "https://data.materialsdatafacility.org",
                        "path": "/collections/qsar_biodeg/biodeg.csv",
                    },
                },

                #            "mdf-citation": ,
                #            "mdf-data_contact": {

                #                "given_name": ,
                #                "family_name": ,

                #                "email": ,
                #                "institution":,

                # IDs
                #                },

                #            "mdf-author": ,

                #            "mdf-license": ,
                #            "mdf-collection": ,
                #            "mdf-data_format": ,
                #            "mdf-data_type": ,
                #            "mdf-year": ,

                #            "mdf-mrr":

                #            "mdf-processing": ,
                #            "mdf-structure":,
            }
            i += 1

            # Pass each individual record to the Validator
            result = dataset_validator.write_record(record_metadata)

            # Check if the Validator accepted the record, and print a message if it didn't
            # If the Validator returns "success" == True, the record was written successfully
            if result["success"] is not True:
                print("Error:", result["message"])

    if verbose:
        print("Finished converting")
Example #12
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
        "mdf": {
            "title": "Computation-Ready Experimental Metal-Organic Frameworks Database",
            "acl": ["public"],
            "source_name": "core_mof",
            "citation": ['D. Nazarian, J. Camp, D.S. Sholl, "A Comprehensive Set of High-Quality Point Charges for Simulations of Metal-Organic Frameworks," Chemistry of Materials, 2016, 28 (3), pp 785–793'],
            "data_contact": {

                "given_name": "Yongchul",
                "family_name": "Chung",

                "email": "*****@*****.**",
                "institution": "Pusan National University"
                },

            "author": [{
                "given_name": "Dalar",
                "family_name": "Nazarian",
                "institution": "Georgia Institute of Technology"
                },
                {
                "given_name": "Jeffrey",
                "family_name": "Camp",
                "institution": "Georgia Institute of Technology"
                },
                {
                "given_name": "David",
                "family_name": "Sholl",
                "email": "*****@*****.**",
                "institution": "Georgia Institute of Technology"
                }],

#            "license": ,

            "collection": "CoRE-MOF",
            "tags": ["simulation", "metallic-organic", "framework"],

            "description": "High-throughput computational screening of metal-organic frameworks rely on the availability of disorder-free atomic coordinate files which can be used as input to simulation software packages. We have created CoRE MOF database and its variants which contains almost all MOFs that have been reported in the literature.",
            "year": 2014,

            "links": {

                "landing_page": "http://gregchung.github.io/CoRE-MOFs/",

                "publication": ["https://dx.doi.org/10.1021/acs.chemmater.5b03836", "https://dx.doi.org/10.1021/cm502594j"],
#                "dataset_doi": ,

#                "related_id": ,

                # data links: {

                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #}
                },

#            "mrr": ,

            "data_contributor": [{
                "given_name": "Jonathon",
                "family_name": "Gaff",
                "email": "*****@*****.**",
                "institution": "The University of Chicago",
                "github": "jgaff"
                }]
            }
        }
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")


    dataset_validator = Validator(dataset_metadata)


    # Get the data
    doi_path = os.path.join(input_path, "structure-doi-CoRE-MOFsV2.0.csv")
    cif_path = os.path.join(input_path, "core-mof-v1.0-ddec")
    # Get DOIs
    doi_dict = {}
    with open(doi_path) as dois:
        for line in dois:
            values = line.split(",")
            if values[1] != "-":
                doi_dict[values[0]] = values[1]
    # Get CIFs
    for cif in tqdm(find_files(cif_path, file_pattern=".cif", verbose=verbose), desc="Processing CIFs", disable=not verbose):
        with open(os.path.join(cif["path"], cif["filename"])) as cif_in:
            # Discard non-CIF, duplicate metadata in first line
            cif_in.readline()
            file_data = parse_ase(file_path=cif_in, data_format="cif", verbose=False)

        record_metadata = {
        "mdf": {
            "title": "CoRE-MOF - " + file_data["chemical_formula"] + " (" + cif["filename"].split("_")[0] + ")",
            "acl": ["public"],

#            "tags": ,
#            "description": ,
            
            "composition": file_data["chemical_formula"],
#            "raw": ,

            "links": {
                "landing_page": "https://github.com/gregchung/gregchung.github.io/blob/master/CoRE-MOFs/core-mof-v1.0-ddec/" + cif["filename"],

#                "publication": ,
#                "dataset_doi": ,

#                "related_id": ,

                "cif": {
 
#                    "globus_endpoint": ,
                    "http_host": "https://raw.githubusercontent.com",

                    "path": "/gregchung/gregchung.github.io/master/CoRE-MOFs/core-mof-v1.0-ddec/" + cif["filename"],
                    },

#            "citation": ,
#            "data_contact": {

#                "given_name": ,
#                "family_name": ,

#                "email": ,
#                "institution":,

                # IDs
            },

#            "author": ,

#            "license": ,
#            "collection": ,
#            "data_format": ,
#            "data_type": ,
#            "year": ,

#            "mrr":

#            "processing": ,
#            "structure":,
            }
        }
        pubs = [doi_dict[key] for key in doi_dict.keys() if cif["filename"].startswith(key)]
        if pubs:
            record_metadata["mdf"]["links"]["publication"] = pubs

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"])


    if verbose:
        print("Finished converting")
Example #13
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title": "The MPI-Mainz UV/VIS Spectral Atlas of Gaseous Molecules",
                "acl": ['public'],
                "source_name": "mpi_mainz",
                "citation": ["Keller-Rudek, H. M.-P. I. for C. M. G., Moortgat, G. K. M.-P. I. for C. M. G., Sander, R. M.-P. I. for C. M. G., & Sörensen, R. M.-P. I. for C. M. G. (2013). The MPI-Mainz UV/VIS Spectral Atlas of Gaseous Molecules [Data set]. Zenodo. http://doi.org/10.5281/zenodo.6951,"],
                "data_contact": {
        
                    "given_name": "Keller-Rudek",
                    "family_name": "Hannelore",
                    
                    "email": "*****@*****.**",
                    "instituition": "Max-Planck Institute for Chemistry"
        
                    },
        
                "author": [{
                    
                    "given_name": "Keller-Rudek",
                    "family_name": "Hannelore",
                    
                    "email": "*****@*****.**",
                    "instituition": "Max-Planck Institute for Chemistry"
                    
                    },
                    {
                        
                    "given_name": "Geert K.",
                    "family_name": "Moortgat",
                    
                    "institution": "Max-Planck Institute for Chemistry"
                    
                    },
                    {
                    
                    "given_name": "Sander",
                    "family_name": "Rolf",
                    
                    "email": "*****@*****.**",
                    "instituition": "Max-Planck Institute for Chemistry",
                    
                    },
                    {
                    
                    "given_name": "Sörensen",
                    "family_name": "Rüdiger",
                    
                    "instituition": "Max-Planck Institute for Chemistry",
                    
                    }],
        
                "license": "https://creativecommons.org/licenses/by/4.0/",
        
                "collection": "UV/VIS Spectral Atlas",
                "tags": ["cross sections", "quantum yields"],
        
                "description": "This archive contains a frozen snapshot of all cross section and quantum yield data files from the MPI-Mainz UV/VIS Spectral Atlas of Gaseous Molecules.",
                "year": 2013,
        
                "links": {
        
                    "landing_page": "https://doi.org/10.5281/zenodo.6951",
        
                    "publication": ["http://www.earth-syst-sci-data.net/5/365/2013/essd-5-365-2013.pdf"],
                    "data_doi": "https://doi.org/10.5281/zenodo.6951",
        
        #            "related_id": ,
        
                    # data links: {
                    
                        #"globus_endpoint": ,
                        #"http_host": ,
        
                        #"path": ,
                        #}
                    },
        
        #            "mrr": ,
        
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78"
                    }]
                }
            }
        
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")



    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)


    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    for data_file in tqdm(find_files(input_path, ".txt"), desc="Processing files", disable=not verbose):
        with open(os.path.join(data_file["path"], data_file["filename"]), 'r', errors='ignore') as raw_in:
            record = raw_in.read()
        #Get the composition
        comp = data_file["filename"].split("_")[0]
        #Get the temperature
        temp = data_file["filename"].split("_")[2][:-1]
        record_metadata = {
            "mdf": {
                "title": "mpi_mainz - " + comp,
                "acl": ['public'],
    
    #            "tags": ,
    #            "description": ,
                
                "composition": comp,
    #            "raw": ,
    
                "links": {
                    #"landing_page": ,
    
    #                "publication": ,
    #                "data_doi": ,
    
    #                "related_id": ,
    
                    "txt": {
                        "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host": "https://data.materialsdatafacility.org",
    
                        "path": "/collections/mpi_mainz/" + data_file["no_root_path"] + "/" + data_file["filename"],
                        },
                    },
    
    #            "citation": ,
    #            "data_contact": {
    
    #                "given_name": ,
    #                "family_name": ,
    
    #                "email": ,
    #                "institution":,
    
    #                },
    
    #            "author": ,
    
    #            "license": ,
    #            "collection": ,
    #            "year": ,
    
    #            "mrr":
    
    #            "processing": ,
    #            "structure":,
                },
                "mpi_mainz": {
                    "temperature": {
                        "value": temp,
                        "unit" : "K"
                        }
                    }
            }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"])

    # You're done!
    if verbose:
        print("Finished converting")
Example #14
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    # NOTE: For fields that represent people (e.g. mdf-data_contact), other IDs can be added (ex. "github": "jgaff").
    #    It is recommended that all people listed in mdf-data_contributor have a github username listed.
    #
    # If there are other useful fields not covered here, another block (dictionary at the same level as "mdf") can be created for those fields.
    # The block must be called the same thing as the source_name for the dataset.
    if not metadata:
        ## Metadata:dataset
        dataset_metadata = {
            "mdf": {
                "title":
                "BaCrO3-x JSSC 2015 (High-pressure BaCrO3 polytypes and the 5H–BaCrO2.8 phase)",
                "acl": ["public"],
                "source_name":
                "high_pressure_ba_cro3",
                "data_contact": {
                    "given_name": "Attfield J.",
                    "family_name": "Paul",
                    "email": "*****@*****.**",
                    "institution":
                    "University of Edinburgh School of Chemistry",
                },
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78",
                }],
                "citation": [
                    "Attfield, J. Paul. (2015). BaCrO3-x JSSC 2015, 2014-2015 [dataset]. University of Edinburgh School of Chemistry. http://dx.doi.org/10.7488/ds/305."
                ],
                "author": [{
                    "given_name":
                    "Attfield J.",
                    "family_name":
                    "Paul",
                    "email":
                    "*****@*****.**",
                    "institution":
                    "University of Edinburgh School of Chemistry",
                }],
                "license":
                "http://creativecommons.org/licenses/by/4.0/legalcode",
                "collection":
                "High Pressure Ba-CrO3",
                "tags": [
                    "Reduced oxides", "Perovskites", "High pressure synthesis",
                    "Vacancyordering", "Magnetic structure"
                ],
                "description":
                "Polytypism of BaCrO3 perovskites has been investigated at 900–1100 °C and pressures up to 22 GPa. Hexagonal 5H, 4H, and 6H perovskites are observed with increasing pressure, and the cubic 3C perovskite (a=3.99503(1) Å) is observed in bulk form for the first time at 19–22 GPa. An oxygen-deficient material with limiting composition 5H–BaCrO2.8 is synthesised at 1200 °C under ambient pressure. This contains double tetrahedral Cr4+ layers and orders antiferromagnetically below 260 K with a (0 0 1/2) magnetic structure.",
                "year":
                2015,
                "links": {
                    "landing_page":
                    "http://www.research.ed.ac.uk/portal/en/datasets/bacro3x-jssc-2015-highpressure-bacro3-polytypes-and-the-5hbacro28-phase(17dcd792-2bb9-43d9-b244-a1d3a3ea7c15).html",
                    "publication":
                    ["http://dx.doi.org/10.1016/j.jssc.2015.09.029"],
                    "data_doi": "http://dx.doi.org/10.7488/ds/305",
                    #"related_id": ,
                    "zip": {

                        #"globus_endpoint": ,
                        "http_host":
                        "http://datashare.is.ed.ac.uk",
                        "path":
                        "/bitstream/handle/10283/862/BaCrO3Data.zip?sequence=1&isAllowed=y",
                    },
                },
            },

            #"mrr": {

            #},

            #"dc": {

            #},
        }
        ## End metadata
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    for data_file in tqdm(find_files(input_path, "cif"),
                          desc="Processing files",
                          disable=not verbose):
        record = parse_ase(
            os.path.join(data_file["path"], data_file["filename"]), "cif")
        ## Metadata:record
        record_metadata = {
            "mdf": {
                "title":
                "High Pressure Ba-CrO3 - " + record["chemical_formula"],
                "acl": ["public"],
                "composition": record["chemical_formula"],

                #                "tags": ,
                #                "description": ,
                #"raw": json.dumps(record),
                "links": {

                    #                    "landing_page": ,
                    #                    "publication": ,
                    #                    "data_doi": ,
                    #                    "related_id": ,
                    "cif": {
                        "globus_endpoint":
                        "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host":
                        "https://data.materialsdatafacility.org",
                        "path":
                        "/collections/high_pressure_ba_cro3/" +
                        data_file["no_root_path"] + "/" +
                        data_file["filename"],
                    },
                },

                #                "citation": ,

                #                "data_contact": {

                #                    "given_name": ,
                #                    "family_name": ,
                #                    "email": ,
                #                    "institution": ,

                #                    },

                #                "author": [{

                #                    "given_name": ,
                #                    "family_name": ,
                #                    "email": ,
                #                    "institution": ,

                #                    }],

                #                "year": ,
            },

            # "dc": {

            # },
        }
        ## End metadata
        if "X-ray" in data_file["path"]:
            if "oxidised" in data_file["filename"]:
                ext = ".XY"
            else:
                ext = ".xye"
            name = data_file["filename"].split(".")[0] + ext
            record_metadata["mdf"]["links"][ext[1:]] = {
                "globus_endpoint":
                "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                "http_host":
                "https://data.materialsdatafacility.org",
                "path":
                "/collections/high_pressure_ba_cro3/" +
                data_file["no_root_path"] + "/" + name,
            }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and stop processing if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if not result["success"]:
            if not dataset_validator.cancel_validation()["success"]:
                print(
                    "Error cancelling validation. The partial feedstock may not be removed."
                )
            raise ValueError(result["message"] + "\n" +
                             result.get("details", ""))

    # You're done!
    if verbose:
        print("Finished converting")
Example #15
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title": "Mechanical Properties and Phase Stability of Monoborides using Density Functional Theory Calculations",
                "acl": ['public'],
                "source_name": "monoborides_dft",
                "citation": ["Kim, Hyojung; Trinkle, Dallas R., \"Mechanical Properties and Phase Stability of Monoborides using Density Functional Theory Calculations,\" 2017, http://dx.doi.org/doi:10.18126/M24S3J"],
                "data_contact": {
    
                    "given_name": "Dallas R.",
                    "family_name": "Trinkle",
    
                    "email": "*****@*****.**",
                    "institution": "University of Illinois at Urbana-Champaign",
                    
                    },
    
                "author": [{
                    
                    "given_name": "Dallas R.",
                    "family_name": "Trinkle",
    
                    "email": "*****@*****.**",
                    "institution": "University of Illinois at Urbana-Champaign",
                    
                    },
                    {
                    
                    "given_name": "Kim",
                    "family_name": "Hyojung",
                                    
                    }],
    
    #            "license": ,
    
                "collection": "Monoborides DFT",
                "tags": ["ab-initio", "special quasirandom structure", "DFT", "polycrystalline mechanical properties", "stacking fault energy", "solubility limit", "monoboride", "B27 structure", "Bf structure", "Vegard's law"],
    
                "description": "This data demonstrates the Ti-monoborides with improved polycrystalline elastic properties such as Young's modulus and Pugh's ratio, and stacking fault energies. The lattice parameters, total energies and elastic constants of monoborides are computed using density functional theory",
                "year": 2017,
    
                "links": {
    
                    "landing_page": "http://dx.doi.org/doi:10.18126/M24S3J",
    
    #                "publication": [""],
    #                "data_doi": "",
    
    #                "related_id": ,
    
                    # data links: {
                    
                        #"globus_endpoint": ,
                        #"http_host": ,
    
                        #"path": ,
                        #}
                    },
    
    #            "mrr": ,
    
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78"
                    }]
                }
            }
        
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")



    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)


    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    for data_file in tqdm(find_files(input_path, "OUTCAR"), desc="Processing files", disable=not verbose):
        record = parse_ase(os.path.join(data_file["path"], data_file["filename"]), "vasp-out")
        record_metadata = {
            "mdf": {
                "title": "Monoborides DFT - " + record["chemical_formula"],
                "acl": ['public'],
    
    #            "tags": ,
    #            "description": ,
                
                "composition": record["chemical_formula"],
    #            "raw": ,
    
                "links": {
                    #"landing_page": ,
    
    #                "publication": ,
    #                "data_doi": ,
    
    #                "related_id": ,
    
                    "outcar": {
                        "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host": "https://data.materialsdatafacility.org",
    
                        "path": "/published/publication_232/data/" + data_file["no_root_path"] + "/" + data_file["filename"],
                        },
                    },
    
    #            "citation": ,
    #            "data_contact": {
    
    #                "given_name": ,
    #                "family_name": ,
    
    #                "email": ,
    #                "institution":,
    
    #                },
    
    #            "author": ,
    
    #            "license": ,
    #            "collection": ,
    #            "year": ,
    
    #            "mrr":
    
    #            "processing": ,
    #            "structure":,
                }
            }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"])

    # You're done!
    if verbose:
        print("Finished converting")
Example #16
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title":
                "Harvard Organic Photovoltaic Dataset",
                "acl": ["public"],
                "source_name":
                "hopv",
                "citation": [
                    "Aspuru-Guzik, Alan (2016): The Harvard Organic Photovoltaics 2015 (HOPV) dataset: An experiment-theory calibration resource.. Figshare. https://doi.org/10.6084/m9.figshare.1610063.v4"
                ],
                "data_contact": {
                    "given_name": "Alan",
                    "family_name": "Aspuru-Guzik",
                    "email": "*****@*****.**",
                    "institution": "Harvard University"
                },
                "author": [{
                    "given_name": "Alan",
                    "family_name": "Aspuru-Guzik",
                    "email": "*****@*****.**",
                    "institution": "Harvard University"
                }],
                "license":
                "https://creativecommons.org/licenses/by/4.0/",
                "collection":
                "Harvard Organic Photovoltaic Dataset",
                "tags": [
                    "Organic Photovoltaic Cells", "quantum chemistry",
                    "density functional theory", "calibration"
                ],
                "description":
                "The Harvard Organic Photovoltaic Dataset (HOPV15) presented in this work is a collation of experimental photovoltaic data from the literature, and corresponding quantum-chemical calculations performed over a range of geometries, each with quantum chemical results using a variety of density functionals and basis sets.",
                "year":
                2016,
                "links": {
                    "landing_page":
                    "https://figshare.com/articles/HOPV15_Dataset/1610063/4",
                    "publication":
                    ["https://dx.doi.org/10.1038/sdata.2016.86"],
                    "data_doi":
                    "https://dx.doi.org/10.6084/m9.figshare.1610063.v4"

                    #                "related_id": ,

                    # data links: {

                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #}
                },

                #            "mrr": ,
                "data_contributor": [{
                    "given_name": "Jonathon",
                    "family_name": "Gaff",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "jgaff"
                }]
            }
        }
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    dataset_validator = Validator(dataset_metadata)

    # Get the data
    with open(os.path.join(input_path, "HOPV_15_revised_2.data"),
              'r') as in_file:
        index = 0
        eof = False
        smiles = in_file.readline()  # Priming read
        if not smiles:
            eof = True
        while not eof:
            index += 1
            filename = "hopv_" + str(index) + ".txt"
            #Molecule level
            molecule = {}
            molecule["smiles"] = smiles.strip()
            molecule["inchi"] = in_file.readline().strip()
            exp_data = in_file.readline().strip().split(',')
            molecule["experimental_data"] = {
                "doi": exp_data[0],
                "inchikey": exp_data[1],
                "construction": exp_data[2],
                "architecture": exp_data[3],
                "complement": exp_data[4],
                "h**o": float(exp_data[5]),
                "lumo": float(exp_data[6]),
                "electrochemical_gap": float(exp_data[7]),
                "optical_gap": float(exp_data[8]),
                "pce": float(exp_data[9]),
                "voc": float(exp_data[10]),
                "jsc": float(exp_data[11]),
                "fill_factor": float(exp_data[12])
            }
            molecule["pruned_smiles"] = in_file.readline().strip()
            molecule["num_conformers"] = int(in_file.readline().strip())
            #Conformer level
            list_conformers = []
            for i in range(molecule["num_conformers"]):
                conformer = {}
                conformer["conformer_number"] = int(
                    in_file.readline().strip("\n Cconformer"))
                conformer["num_atoms"] = int(in_file.readline().strip())
                #Atom level
                list_atoms = []
                for j in range(conformer["num_atoms"]):
                    atom_data = in_file.readline().strip().split(' ')
                    atom = {
                        "element": atom_data[0],
                        "x_coordinate": float(atom_data[1]),
                        "y_coordinate": float(atom_data[2]),
                        "z_coordinate": float(atom_data[3])
                    }
                    list_atoms.append(atom)
                conformer["atoms"] = list_atoms
                #Four sets of calculated data
                list_calc = []
                for k in range(4):
                    calc_data = in_file.readline().strip().split(",")
                    calculated = {
                        "set_description": calc_data[0],
                        "h**o": float(calc_data[1]),
                        "lumo": float(calc_data[2]),
                        "gap": float(calc_data[3]),
                        "scharber_pce": float(calc_data[4]),
                        "scharber_voc": float(calc_data[5]),
                        "scharber_jsc": float(calc_data[6])
                    }
                    list_calc.append(calculated)
                conformer["calculated_data"] = list_calc
                list_conformers.append(conformer)
            molecule["conformers"] = list_conformers

            uri = "/collections/hopv/" + filename

            experimental = {}
            for key, value in molecule["experimental_data"].items():
                if value == value:  # NaN != NaN, so this drops missing values
                    experimental[key] = value

            record_metadata = {
                "mdf": {
                    "title": "HOPV - " + molecule["smiles"],
                    "acl": ["public"],

                    #                "tags": ,
                    #                "description": ,
                    "composition": molecule["smiles"],
                    #                "raw": ,
                    "links": {
                        #                    "landing_page": ,

                        #                    "publication": ,
                        #                    "dataset_doi": ,

                        #                    "related_id": ,
                        "molecule": {
                            "globus_endpoint":
                            "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                            "http_host":
                            "https://data.materialsdatafacility.org",
                            "path": uri
                        },
                        "original": {
                            "globus_endpoint":
                            "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                            "http_host":
                            "https://data.materialsdatafacility.org",
                            "path": "/collections/hopv/HOPV_15_revised_2.data"
                        }
                    }

                    #                "citation": ,
                    #                "data_contact": {

                    #                    "given_name": ,
                    #                    "family_name": ,

                    #                    "email": ,
                    #                    "institution":,

                    # IDs
                    #                },

                    #                "author": ,

                    #                "license": ,
                    #                "collection": ,
                    #                "data_format": ,
                    #                "data_type": ,
                    #                "year": ,

                    #                "mrr":

                    #            "processing": ,
                    #            "structure":,
                },
                "hopv": {
                    "experimental_data": experimental
                }
            }

            # Pass each individual record to the Validator
            result = dataset_validator.write_record(record_metadata)

            # Check if the Validator accepted the record, and print a message if it didn't
            # If the Validator returns "success" == True, the record was written successfully
            if result["success"] is not True:
                print("Error:", result["message"])
                if verbose:
                    print(result["details"])
            else:
                with open(os.path.join(input_path, filename), 'w') as outfile:
                    json.dump(molecule, outfile)

            smiles = in_file.readline()  # Next molecule
            if not smiles:  # Empty line is EOF
                eof = True

    if verbose:
        print("Finished converting")
Example #17
0
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    # NOTE: For fields that represent people (e.g. mdf-data_contact), other IDs can be added (ex. "github": "jgaff").
    #    It is recommended that all people listed in mdf-data_contributor have a github username listed.
    #
    # If there are other useful fields not covered here, another block (dictionary at the same level as "mdf") can be created for those fields.
    # The block must be called the same thing as the source_name for the dataset.
    if not metadata:
        ## Metadata:dataset
        dataset_metadata = {
            "mdf": {
                "title":
                "Benzonitrile on Si(001). XPS, NEXAFS, and STM data. Accepted for publication in PCCP Sept. 2016",
                "acl": ["public"],
                "source_name":
                "benzonitrile_si",
                "data_contact": {
                    "given_name": "Steven",
                    "family_name": "Schofield",
                    "email": "*****@*****.**",
                    "institution": "University College London",
                },
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78",
                }],
                "citation": [
                    "O'Donnell, Kane, Hedgeland, Holly, Moore, Gareth (2016) Benzonitrile on Si(001). XPS, NEXAFS, and STM data. Accepted for publication in PCCP Sept. 2016."
                ],
                "author": [{
                    "given_name": "Kane",
                    "family_name": "O'Donnell",
                    "institution": "Curtin University",
                }, {
                    "given_name": "Holly",
                    "family_name": "Hedgeland",
                    "institution": "The Open University",
                }, {
                    "given_name": "Gareth",
                    "family_name": "Moore",
                    "institution": "University College London",
                }, {
                    "given_name": "Asif",
                    "family_name": "Suleman",
                    "institution": "University College London",
                }, {
                    "given_name": "Manuel",
                    "family_name": "Siegl",
                    "institution": "University College London",
                }, {
                    "given_name": "Lars",
                    "family_name": "Thomsen",
                    "institution": "The Australian Synchrotron",
                }, {
                    "given_name": "Oliver",
                    "family_name": "Warschkow",
                    "institution": "The University of Sydney",
                }, {
                    "given_name": "Steven",
                    "family_name": "Schofield",
                    "email": "*****@*****.**",
                    "institution": "University College London",
                }],
                "license":
                "https://creativecommons.org/licenses/by/4.0/",
                "collection":
                "Benzonitrile on Si",
                "tags":
                ["benzonitrile", "Si(001)", "adsorption", "XPS", "NEXAFS"],
                "description":
                "This data set contains original XPS and NEXAFS data collected at the Australian Synchrotron.  The data are the results of experiments investigating benzonitrile adsorption to the Si(001) surface.  The results were written up and have been accepted for publication in Physical Chemistry Chemical Physics in Sept. 2016. The publication date is not yet known.",
                "year":
                2016,
                "links": {
                    "landing_page":
                    "https://doi.org/10.5281/zenodo.154112",
                    "publication": [
                        "http://pubs.rsc.org/en/content/articlepdf/2016/CP/C6CP04328C"
                    ],
                    #"data_doi": "",
                    #"related_id": "",

                    #"data_link": {

                    #"globus_endpoint": ,
                    #"http_host": "",

                    #"path": "",

                    #},
                },
            },

            #"mrr": {

            #},

            #"dc": {

            #},
        }
        ## End metadata
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    errors = 0
    for data_file in tqdm(find_files(input_path, "(pdb$|qe$)"),
                          desc="Processing Files",
                          disable=not verbose):
        dtype = data_file["filename"].split(".")[-1]
        if dtype == "pdb":
            ftype = "proteindatabank"
        else:
            ftype = "espresso-in"
        try:
            record = parse_ase(
                os.path.join(data_file["path"], data_file["filename"]), ftype)
        except Exception:
            errors += 1
            continue  # 'record' would be undefined below if the parse failed
        ## Metadata:record
        record_metadata = {
            "mdf": {
                "title": "Benzonitrile on Si - " + record["chemical_formula"],
                "acl": ["public"],
                "composition": record["chemical_formula"],

                #"tags": ,
                #"description": ,
                #"raw": ,
                "links": {

                    #"landing_page": ,
                    #"publication": ,
                    #"data_doi": ,
                    #"related_id": ,
                    dtype: {
                        "globus_endpoint":
                        "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host":
                        "https://data.materialsdatafacility.org",
                        "path":
                        "/collections/benzonitrile_si/" +
                        data_file["no_root_path"] + "/" +
                        data_file["filename"],
                    },
                },

                #"citation": ,

                #"data_contact": {

                #"given_name": ,
                #"family_name": ,
                #"email": ,
                #"institution": ,

                #},

                #"author": [{

                #"given_name": ,
                #"family_name": ,
                #"email": ,
                #"institution": ,

                #}],

                #"year": ,
            },

            #"dc": {

            #},
        }
        ## End metadata

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and stop processing if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if not result["success"]:
            if not dataset_validator.cancel_validation()["success"]:
                print(
                    "Error cancelling validation. The partial feedstock may not be removed."
                )
            raise ValueError(result["message"] + "\n" +
                             result.get("details", ""))

    # You're done!
    if verbose:
        print("ERRORS: " + str(errors))
        print("Finished converting")
Example #18
0
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title": "NanoMine",
                "acl": ["public"],
                "source_name": "nanomine",
                "citation": ["Publication pending"],
                "data_contact": {
                    "given_name": "L. Catherine",
                    "family_name": "Brinson",
                    "email": "*****@*****.**",
                    "institution": "Northwestern University",

                    # IDs
                },
                "author": {
                    "given_name": "Yixing",
                    "family_name": "Wang",
                    "email": "yixingwang2014_at_u.northwestern.edu",
                    "institution": "Northwestern University"
                },

                #            "license": ,
                "collection": "NanoMine",
                "tags": ["polymer", "nanocomposites"],
                "description":
                "Material Informatics for Polymer Nanocomposites",
                "year": 2014,
                "links": {
                    "landing_page": "http://nanomine.northwestern.edu:8000/",

                    #                "publication": ,
                    #                "dataset_doi": ,

                    #                "related_id": ,

                    # data links: {

                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #}
                },

                #            "mrr": ,
                "data_contributor": {
                    "given_name": "Jonathon",
                    "family_name": "Gaff",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "jgaff"
                }
            }
        }
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    dataset_validator = Validator(dataset_metadata)

    # Get the data
    with open(os.path.join(input_path, "nanomine.dump"), 'r') as dump_file:
        for line in tqdm(dump_file,
                         desc="Processing files",
                         disable=not verbose):
            record = json.loads(line)
            # Try extracting required data
            try:
                citation = record["content"]["PolymerNanocomposite"][
                    "DATA_SOURCE"]["Citation"]["CommonFields"]  # Shortcut
                uri = "http://nanomine.northwestern.edu:8000/explore/detail_result_keyword?id=" + record[
                    "_id"]["$oid"]
                record_metadata = {
                    "mdf": {
                        "title": citation["Title"],
                        "acl": ["public"],

                        #                    "tags": ,
                        #                    "description": ,

                        #                    "composition": ,
                        "raw": json.dumps(record),
                        "links": {
                            "landing_page": uri,

                            #                        "publication": ,
                            #                        "dataset_doi": ,

                            #                        "related_id": ,

                            # data links: {

                            #"globus_endpoint": ,
                            #"http_host": ,

                            #"path": ,
                            #},
                        },

                        #                    "citation": ,
                        #                    "data_contact": {

                        #                        "given_name": ,
                        #                        "family_name": ,

                        #                        "email": ,
                        #                        "institution":,

                        # IDs
                        #    },

                        #                    "author": ,

                        #                    "license": ,
                        #                    "collection": ,
                        #                    "data_format": ,
                        #                    "data_type": ,
                        #                    "year": ,

                        #                    "mrr":

                        #            "processing": ,
                        #            "structure":,
                    }
                }
            except Exception as e:
                # Something required failed. Skip record.
                #                print(repr(e))
                continue

            # Now extract non-required data (which can be missing)
            # Material composition
            mat_comp = get_nanomine_materials(record)
            if mat_comp:
                record_metadata["mdf"]["composition"] = mat_comp

            # Related identifiers (DOI, URL, and image links)
            image_list = record["content"]["PolymerNanocomposite"].get(
                "MICROSTRUCTURE", {}).get("ImageFile", [])
            if type(image_list) is not list:
                image_list = [image_list]
            related_list = [
                citation.get("DOI", "").replace("doi:", "http://dx.doi.org/"),
                citation.get("URL", "")
            ] + [
                image["File"]
                for image in image_list if image.get("File", None)
            ]
            if related_list:
                record_metadata["mdf"]["links"]["publication"] = [
                    rel for rel in related_list if rel
                ]

            # Year
            year = citation.get("PublicationYear")
            if year:
                record_metadata["mdf"]["year"] = int(year)

            # Pass each individual record to the Validator
            result = dataset_validator.write_record(record_metadata)

            # Check if the Validator accepted the record, and print a message if it didn't
            # If the Validator returns "success" == True, the record was written successfully
            if result["success"] is not True:
                print("Error:", result["message"])

    if verbose:
        print("Finished converting")
Example #19
0
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title": "Dynamic behaviour of the silica-water-bio electrical double layer in the presence of a divalent electrolyte",
                "acl": ['public'],
                "source_name": "silica_water_edl",
                "citation": ["Lowe, B.M., Maekawa, Y., Shibuta, Y., Sakata, T., Skylaris, C.-K. and Green, N.G. (2016) Dynamic Behaviour of the Silica-Water-Bio Electrical Double Layer in the Presence of a Divalent Electrolyte. Physical Chemistry Chemical Physics, https://doi.org/10.1039/C6CP04101A"],
                "data_contact": {
    
                    "given_name": "Benjamin",
                    "family_name": "Lowe",
                    
                    "email": "*****@*****.**",
                    "instituition": "University of Southampton"
    
                    },
    
                "author": [{
                    
                    "given_name": "Benjamin",
                    "family_name": "Lowe",
                    
                    "email": "*****@*****.**",
                    "instituition": "University of Southampton"
                    
                    },
                    {
                        
                    "given_name": "Toshiya",
                    "family_name": "Sakata",
                    
                    "email": "*****@*****.**",
                    "institution": "The University of Tokyo"
                    
                    },
                    {
                    
                    "given_name": "Nicolas",
                    "family_name": "Green",
                    
                    "email": "*****@*****.**",
                    "instituition": "University of Southampton",
                    
                    },
                    {
                    
                    "given_name": "Yuki",
                    "family_name": "Maekawa",
                    
                    "instituition": "The University of Tokyo",
                    
                    },
                    {
                    
                    "given_name": "Yasushi",
                    "family_name": "Shibuta",
                    
                    "instituition": "The University of Tokyo",
                    
                    },
                    {
                    
                    "given_name": "Chris",
                    "family_name": "Skylaris",
                    
                    "instituition": "University of Southampton",
                    
                    }],
    
                "license": "http://creativecommons.org/licenses/by/4.0/",
    
                "collection": "Silica Water EDL",
                "tags": ["BioFET", "BioFETs", "BioFED", "molecular dynamics", "MD"],
    
                "description": "Explicit-solvent atomistic calculations of this electric field are presented and the structure and dynamics of the interface are investigated in different ionic strengths using molecular dynamics simulations. Novel results from simulation of the addition of DNA molecules and divalent ions are also presented, the latter of particular importance in both physiological solutions and biosensing experiments",
                "year": 2016,
    
                "links": {
    
                    "landing_page": "https://eprints.soton.ac.uk/401018/",
    
                    "publication": ["http://eprints.soton.ac.uk/401017", "http://dx.doi.org/10.1039/C6CP04101A"],
                    "data_doi": "https://eprints.soton.ac.uk/401018/",
    
    #                "related_id": ,
    
                    # data links: {
                    
                        #"globus_endpoint": ,
                        #"http_host": ,
    
                        #"path": ,
                        #}
                    },
    
    #            "mrr": ,
    
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78"
                    }]
                }
            }
        
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")



    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)


    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    for data_file in tqdm(find_files(input_path, "record-0.xyz"), desc="Processing files", disable=not verbose):
        record = parse_ase(os.path.join(data_file["path"], data_file["filename"]), "xyz")
        record_metadata = {
            "mdf": {
                "title": "Silica Water EDL - " + record["chemical_formula"],
                "acl": ['public'],
    
    #            "tags": ,
    #            "description": ,
                
                "composition": record["chemical_formula"],
    #            "raw": ,
    
                "links": {
                    #"landing_page": ,
    
    #                "publication": ,
    #                "data_doi": ,
    
    #                "related_id": ,
    
                    "xyz": {
                        "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host": "https://data.materialsdatafacility.org",
    
                        "path": "/collections/silica_water_edl/" + data_file["no_root_path"] + "/" + data_file["filename"],
                        },
                    },
    
    #            "citation": ,
    #            "data_contact": {
    
    #                "given_name": ,
    #                "family_name": ,
    
    #                "email": ,
    #                "institution":,
    
    #                },
    
    #            "author": ,
    
    #            "license": ,
    #            "collection": ,
    #            "year": ,
    
    #            "mrr":
    
    #            "processing": ,
    #            "structure":,
                },
            "silica_water_edl": {
                "number_of_frames": 3000
                }
            }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"])

    # You're done!
    if verbose:
        print("Finished converting")
Example #20
0
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    # Fields can be:
    #    REQ (Required, must be present)
    #    RCM (Recommended, should be present if possible)
    #    OPT (Optional, can be present if useful)
    # NOTE: For fields that represent people (e.g. mdf-data_contact), other IDs can be added (ex. "github": "jgaff").
    #    It is recommended that all people listed in mdf-data_contributor have a github username listed.
    #
    # If there are other useful fields not covered here, another block (dictionary at the same level as "mdf") can be created for those fields.
    # The block must be called the same thing as the source_name for the dataset.
    if not metadata:
        ## Metadata:dataset
        dataset_metadata = {
            # REQ dictionary: MDF-format dataset metadata
            "mdf": {

                # REQ string: The title of the dataset
                "title":
                "Crystallography Open Database",

                # REQ list of strings: The UUIDs allowed to view this metadata, or 'public'
                "acl": ["public"],

                # REQ string: A short version of the dataset name, for quick reference. Spaces and dashes will be replaced with underscores, and other non-alphanumeric characters will be removed.
                "source_name":
                "cod",

                # REQ dictionary: The contact person/steward/custodian for the dataset
                "data_contact": {

                    # REQ string: The person's given (or first) name
                    "given_name": "Daniel",

                    # REQ string: The person's family (or last) name
                    "family_name": "Chateigner",

                    # REQ string: The person's email address
                    "email": "*****@*****.**",

                    # RCM string: The primary affiliation for the person
                    #                    "institution": ,
                },

                # REQ list of dictionaries: The person/people contributing the tools (harvester, this converter) to ingest the dataset (i.e. you)
                "data_contributor": [{

                    # REQ string: The person's given (or first) name
                    "given_name": "Jonathon",

                    # REQ string: The person's family (or last) name
                    "family_name": "Gaff",

                    # REQ string: The person's email address
                    "email": "*****@*****.**",

                    # RCM string: The primary affiliation for the person
                    "institution": "The University of Chicago",

                    # RCM string: The person's GitHub username
                    "github": "jgaff",
                }],

                # RCM list of strings: The full bibliographic citation(s) for the dataset
                "citation": [
                    'Merkys, A., Vaitkus, A., Butkus, J., Okulič-Kazarinas, M., Kairys, V. & Gražulis, S. (2016) "COD::CIF::Parser: an error-correcting CIF parser for the Perl language". Journal of Applied Crystallography 49.',
                    'Gražulis, S., Merkys, A., Vaitkus, A. & Okulič-Kazarinas, M. (2015) "Computing stoichiometric molecular composition from crystal structures". Journal of Applied Crystallography 48, 85-91.',
                    'Gražulis, S., Daškevič, A., Merkys, A., Chateigner, D., Lutterotti, L., Quirós, M., Serebryanaya, N. R., Moeck, P., Downs, R. T. & LeBail, A. (2012) "Crystallography Open Database (COD): an open-access collection of crystal structures and platform for world-wide collaboration". Nucleic Acids Research 40, D420-D427.',
                    'Grazulis, S., Chateigner, D., Downs, R. T., Yokochi, A. T., Quiros, M., Lutterotti, L., Manakova, E., Butkus, J., Moeck, P. & Le Bail, A. (2009) "Crystallography Open Database – an open-access collection of crystal structures". J. Appl. Cryst. 42, 726-729.',
                    'Downs, R. T. & Hall-Wallace, M. (2003) "The American Mineralogist Crystal Structure Database". American Mineralogist 88, 247-250.'
                ],

                # RCM list of dictionaries: A list of the authors of this dataset
                #                "author": [{

                # REQ string: The person's given (or first) name
                #                    "given_name": ,

                # REQ string: The person's family (or last) name
                #                    "family_name": ,

                # RCM string: The person's email address
                #                    "email": ,

                # RCM string: The primary affiliation for the person
                #                    "institution": ,

                #                }],

                # RCM string: A link to the license for distribution of the dataset
                "license":
                "Public Domain",

                # RCM string: The collection for the dataset, commonly a portion of the title
                "collection":
                "COD",

                # RCM list of strings: Tags, keywords, or other general descriptors for the dataset
                "tags": ["Crystallography"],

                # RCM string: A description of the dataset
                "description":
                "Open-access collection of crystal structures of organic, inorganic, metal-organic compounds and minerals, excluding biopolymers.",

                # RCM integer: The year of dataset creation
                "year":
                2003,

                # REQ dictionary: Links relating to the dataset
                "links": {

                    # REQ string: The human-friendly landing page for the dataset
                    "landing_page": "http://www.crystallography.net/cod/",

                    # RCM list of strings: The DOI(s) (in link form, ex. 'https://dx.doi.org/10.12345') for publications connected to the dataset
                    #                    "publication": ,

                    # RCM string: The DOI of the dataset itself (in link form)
                    #                    "data_doi": ,

                    # OPT list of strings: The mdf-id(s) of related entries, not including records from this dataset
                    #                    "related_id": ,

                    # RCM dictionary: Links to raw data files from the dataset (multiple allowed, field name should be data type)
                    #                    "data_link": {

                    # RCM string: The ID of the Globus Endpoint hosting the file
                    #                        "globus_endpoint": ,

                    # RCM string: The fully-qualified HTTP hostname, including protocol, but without the path (for example, 'https://data.materialsdatafacility.org')
                    #                        "http_host": ,

                    # REQ string: The full path to the data file on the host
                    #                        "path": ,

                    #                    },
                },
            },

            # OPT dictionary: MRR-format metadata
            "mrr": {},

            # OPT dictionary: DataCite-format metadata
            "dc": {},
        }
        ## End metadata
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    # Set up multiprocessing
    md_files = multiprocessing.JoinableQueue()
    rc_out = multiprocessing.JoinableQueue()
    counter = multiprocessing.Value('i', 0)
    err_counter = multiprocessing.Value('i', 0)
    killswitch = multiprocessing.Value('i', 0)
    # Find all the cif files
    cif_list = [
        os.path.join(cif["path"], cif["filename"])
        for cif in tqdm(find_files(input_path, "cif$"),
                        desc="Finding files",
                        disable=not verbose)
    ]
    # Process to add data into queue
    adder = multiprocessing.Process(
        target=(lambda cif_list: [md_files.put(cif) for cif in cif_list]),
        args=(cif_list, ))
    # Processes to process records from input queue to output queue
    processors = [
        multiprocessing.Process(target=process_cod,
                                args=(md_files, rc_out, err_counter,
                                      killswitch))
        for i in range(NUM_PROCESSORS)
    ]
    # Processes to write data from output queue
    #    writers = [multiprocessing.Process(target=do_validation, args=(rc_out, dataset_validator, counter, killswitch)) for i in range(NUM_WRITERS)]
    w = multiprocessing.Process(target=do_validation,
                                args=(rc_out, dataset_validator, counter,
                                      killswitch))
    # Process to manage progress bar
    prog_bar = multiprocessing.Process(target=track_progress,
                                       args=(len(cif_list), counter,
                                             err_counter, killswitch))

    # Start adder
    adder.start()
    # Start processors, writers, and progress bar after data is in queue
    while md_files.empty():
        sleep(1)
    [p.start() for p in processors]
    #    [w.start() for w in writers]
    w.start()
    if verbose:
        prog_bar.start()

    # Wait on adder to finish
    adder.join()
    if verbose:
        print("Adder has completed.")
    # Wait on both queues to be complete
    md_files.join()
    if verbose:
        print("Input Queue is empty.")
    rc_out.join()
    if verbose:
        print("Output Queue is empty.")
    # Trigger remote termination of processes without purpose
    killswitch.value = 1
    if verbose:
        print("Terminating remaining processes.")
    # Wait on all the processes to terminate
    [p.join() for p in processors]
    if verbose:
        print("All processors terminated.")
#    [w.join() for w in writers]
    w.join()
    if verbose:
        print("Writer terminated")
    if prog_bar.is_alive():
        prog_bar.join()
        print("Progress bar terminated.")

    if verbose:
        print("Finished converting")
        print("There were", err_counter.value, "errors")
Example #21
0
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title":
                "NIST Atomic Weights and Isotopic Compositions with Relative Atomic Masses",
                "acl": ["public"],
                "source_name": "nist_atom_weight_iso_comp",
                "citation": ["NIST Standard Reference Database 144"],
                "data_contact": {
                    "given_name":
                    "Karen",
                    "family_name":
                    "Olsen",
                    "email":
                    "*****@*****.**",
                    "institution":
                    "National Institute of Standards and Technology",
                },

                #            "author": ,

                #            "license": ,
                "collection": "NIST Atomic Weights and Isotopic Compositions",
                "tags": ["atomic weight", "isotopic composition"],
                "description":
                "The atomic weights are available for elements 1 through 118 and isotopic compositions or abundances are given when appropriate.",
                "year": 1999,
                "links": {
                    "landing_page":
                    "https://www.nist.gov/pml/atomic-weights-and-isotopic-compositions-relative-atomic-masses",
                    "publication": [
                        "http://www.ciaaw.org/atomic-weights.htm",
                        "http://www.iupac.org/publications/pac/83/2/0397/",
                        "http://amdc.impcas.ac.cn/evaluation/data2012/ame.html"
                    ],
                    #                "dataset_doi": ,

                    #                "related_id": ,

                    # data links: {

                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #}
                },

                #           "mrr": ,
                "data_contributor": {
                    "given_name": "Jonathon",
                    "family_name": "Gaff",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "jgaff"
                }
            }
        }
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    dataset_validator = Validator(dataset_metadata)

    # Get the data
    with open(os.path.join(input_path, "notes.json")) as notes:
        note_lookup = json.load(notes)
    with open(os.path.join(input_path,
                           "atom_weight_iso_comp.txt")) as raw_file:
        raw = raw_file.read()
    raw = raw.strip()
    record_list = raw.split("\n\n")
    for raw_record in tqdm(record_list,
                           desc="Processing records",
                           disable=not verbose):
        record = {}
        for line in raw_record.split("\n"):
            data_list = line.split("=")
            if len(data_list) > 1 and data_list[1].strip():
                record[data_list[0].strip().lower().replace(
                    " ", "_")] = data_list[1].strip()

        record_metadata = {
            "mdf": {
                "title":
                "NIST Atomic Weights - " + record["atomic_symbol"] +
                record["mass_number"],
                "acl": ["public"],

                #            "tags": ,
                "description":
                ",".join([
                    note_lookup[n] for n in record.get("notes", "").split(",")
                    if record.get("notes", "")
                ]),
                "composition":
                record["atomic_symbol"],
                "raw":
                json.dumps(record),
                "links": {
                    #                "landing_page": ,

                    #                "publication": ,
                    #                "dataset_doi": ,

                    #                "related_id": ,

                    # data links: {

                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #},
                },

                #            "citation": ,
                #            "data_contact": {

                #                "given_name": ,
                #                "family_name": ,

                #                "email": ,
                #                "institution":,

                # IDs
                #                },

                #            "author": ,

                #            "license": ,
                #            "collection": ,
                #            "data_format": ,
                #            "data_type": ,
                #            "year": ,

                #            "mrr":

                #            "processing": ,
                #            "structure":,
            }
        }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"])

    if verbose:
        print("Finished converting")
Example #22
0
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title":
                "NIST Interatomic Potentials Repository Project",
                "acl": ["public"],
                "source_name":
                "nist_ip",
                "citation": [
                    'C.A. Becker, et al., "Considerations for choosing and using force fields and interatomic potentials in materials science and engineering," Current Opinion in Solid State and Materials Science, 17, 277-283 (2013). https://www.ctcms.nist.gov/potentials'
                ],
                "data_contact": {
                    "given_name": "Lucas",
                    "family_name": "Hale",
                    "email": "*****@*****.**",
                    "institution":
                    "National Institute of Standards and Technology"
                },
                "author": [{
                    "given_name":
                    "Lucas",
                    "family_name":
                    "Hale",
                    "institution":
                    "National Institute of Standards and Technology"
                }, {
                    "given_name":
                    "Zachary",
                    "family_name":
                    "Trautt",
                    "institution":
                    "National Institute of Standards and Technology"
                }, {
                    "given_name":
                    "Chandler",
                    "family_name":
                    "Becker",
                    "institution":
                    "National Institute of Standards and Technology"
                }],

                #            "license": ,
                "collection":
                "NIST Interatomic Potentials",
                "tags": ["interatomic potential", "forcefield"],
                "description":
                "This repository provides a source for interatomic potentials (force fields), related files, and evaluation tools to help researchers obtain interatomic models and judge their quality and applicability.",
                "year":
                2013,
                "links": {
                    "landing_page": "https://www.ctcms.nist.gov/potentials/",

                    #                "publication": ,
                    #                "data_doi": ,

                    #                "related_id": ,

                    # data links: {

                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #}
                },

                #            "mrr": ,
                "data_contributor": {
                    "given_name": "Jonathon",
                    "family_name": "Gaff",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "jgaff"
                }
            }
        }
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    dataset_validator = Validator(dataset_metadata)

    # Get the data
    for file_data in tqdm(find_files(input_path, r"\.json$"),
                          desc="Processing files",
                          disable=not verbose):
        try:
            with open(os.path.join(file_data["path"], file_data["filename"]),
                      'r') as ip_file:
                ip_data = json.load(ip_file)["interatomic-potential"]
            if not ip_data:
                raise ValueError("No data in file")
        except Exception as e:
            if verbose:
                print("Error reading '" +
                      os.path.join(file_data["path"], file_data["filename"]) +
                      "'")
            continue
        url_list = []
        link_texts = []
        for artifact in ip_data["implementation"]:
            for web_link in artifact["artifact"]:
                url = web_link.get("web-link", {}).get("URL", None)
                if url:
                    if not url.startswith("http"):
                        url = "http://" + url
                    url_list.append(url)
                link_text = web_link.get("web-link", {}).get("link-text", None)
                if link_text:
                    link_texts.append(link_text)

        record_metadata = {
            "mdf": {
                "title":
                "NIST Interatomic Potential - " + ", ".join(link_texts),
                "acl": ["public"],

                #            "tags": ,
                #            "description": ,
                "composition": "".join(ip_data["element"]),
                "raw": json.dumps(ip_data),
                "links": {
                    #                "landing_page": ,
                    "publication": url_list,
                    #                "data_doi": ,

                    #                "related_id": ,

                    # data links: {

                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #},
                },

                #            "citation": ,
                #            "data_contact": {

                #                "given_name": ,
                #                "family_name": ,

                #                "email": ,
                #                "institution":,

                # IDs
                #                },

                #            "author": ,

                #            "license": ,
                #            "collection": ,
                #            "data_format": ,
                #            "data_type": ,
                #            "year": ,

                #            "mrr":

                #            "processing": ,
                #            "structure":,
            }
        }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"])

    if verbose:
        print("Finished converting")
Example #23
0
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title": "Derivation and Validation of Toxicophores for Mutagenicity Prediction",
                "acl": ['public'],
                "source_name": "bursi_toxicophores",
                "citation": ["Bursi, Roberta (2005/01/01). Derivation and Validation of Toxicophores for Mutagenicity Prediction. Journal of Medicinal Chemistry, 48, 312-320. doi: 10.1021/jm040835a"],
                "data_contact": {
    
                    "given_name": "Roberta",
                    "family_name": "Bursi",
                    
                    "email": "*****@*****.**",
                    "instituition": "Molecular Design & Informatics Department, N.V. Organon"
    
                    },
    
                "author": [{
                    
                    "given_name": "Roberta",
                    "family_name": "Bursi",
                    
                    "email": "*****@*****.**",
                    "instituition": "Molecular Design & Informatics Department, N.V. Organon"
                    
                    },
                    {
                    
                    "given_name": "Ross",
                    "family_name": "McGuire",
                    
                    "instituition": "Molecular Design & Informatics Department, N.V. Organon"
                    
                    },
                    {
                    
                    "given_name": "Jeroen",
                    "family_name": "Kazius",
                    
                    "instituition": "Universiteit Leiden"
                    
                    }],
    
                #"license": "",
    
                "collection": "Toxicophores for Mutagenicity Prediction",
                #"tags": ,
    
                "description": "Mutagenicity is one of the numerous adverse properties of a compound that hampers its potential to become a marketable drug. Toxic properties can often be related to chemical structure, more specifically, to particular substructures, which are generally identified as toxicophores. A number of toxicophores have already been identified in the literature. This study aims at increasing the current degree of reliability and accuracy of mutagenicity predictions by identifying novel toxicophores from the application of new criteria for toxicophore rule derivation and validation to a considerably sized mutagenicity dataset.",
                "year": 2005,
    
                "links": {
    
                    "landing_page": "http://pubs.acs.org/doi/full/10.1021/jm040835a",
    
                    #"publication": [""],
                    #"data_doi": ,
    
                    #"related_id": ,
    
                    "sdf": {
                    
                        #"globus_endpoint": ,
                        "http_host": "ftp://ftp.ics.uci.edu",
    
                        "path": "/pub/baldig/learning/Bursi/source/cas_4337.sdf",
                        }
                    },
    
    #            "mrr": ,
    
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78"
                    }]
                }
            }
        
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")



    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    for data_file in tqdm(find_files(input_path, "sdf"), desc="Processing files", disable=not verbose):
        record = parse_ase(os.path.join(data_file["path"], data_file["filename"]), "sdf")
        record_metadata = {
            "mdf": {
                "title": "Bursi Toxicophores - " + record["chemical_formula"],
                "acl": ['public'],
    
    #            "tags": ,
    #            "description": ,
                
                "composition": record["chemical_formula"],
    #            "raw": ,
    
                "links": {
    #                "landing_page": ,
    
    #                "publication": ,
    #                "data_doi": ,
    
    #                "related_id": ,
    
                    "sdf": {
                        "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host": "https://data.materialsdatafacility.org",
    
                        "path": "/collections/bursi_toxicophores/" + data_file["filename"],
                        },
                    },
    
    #            "citation": ,
    #            "data_contact": {
    
    #                "given_name": ,
    #                "family_name": ,
    
    #                "email": ,
    #                "institution":,
    
    #                },
    
    #            "author": ,
    
    #            "license": ,
    #            "collection": ,
    #            "year": ,
    
    #            "mrr":
    
    #            "processing": ,
    #            "structure":,
                }
            }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"])

    # You're done!
    if verbose:
        print("Finished converting")
Example #24
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    # NOTE: For fields that represent people (e.g. mdf-data_contact), other IDs can be added (ex. "github": "jgaff").
    #    It is recommended that all people listed in mdf-data_contributor have a github username listed.
    #
    # If there are other useful fields not covered here, another block (dictionary at the same level as "mdf") can be created for those fields.
    # The block must be called the same thing as the source_name for the dataset.
    if not metadata:
        ## Metadata:dataset
        dataset_metadata = {
            "mdf": {
                "title":
                "ESOL: Estimating Aqueous Solubility Directly from Molecular Structure",
                "acl": ["public"],
                "source_name":
                "delaney_esol",
                "data_contact": {
                    "given_name":
                    "John S.",
                    "family_name":
                    "Delaney",
                    "email":
                    "*****@*****.**",
                    "institution":
                    "Syngenta, Jealott's Hill International Research Centre",
                },
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78",
                }],
                "citation": [
                    "Delaney, John S. (2004/05/01). ESOL:  Estimating Aqueous Solubility Directly from Molecular Structure. Journal of Chemical Information and Computer Sciences, 44, 1000-1005. doi: 10.1021/ci034243x"
                ],
                "author": [{
                    "given_name":
                    "John S.",
                    "family_name":
                    "Delaney",
                    "email":
                    "*****@*****.**",
                    "institution":
                    "Syngenta, Jealott's Hill International Research Centre",
                }],

                #"license": "",
                "collection":
                "ESOL",
                #"tags": [""],
                "description":
                "This paper describes a simple method for estimating the aqueous solubility (ESOL − Estimated SOLubility) of a compound directly from its structure. The model was derived from a set of 2874 measured solubilities using linear regression against nine molecular properties. The most significant parameter was calculated logPoctanol, followed by molecular weight, proportion of heavy atoms in aromatic systems, and number of rotatable bonds. The model performed consistently well across three validation sets, predicting solubilities within a factor of 5−8 of their measured values, and was competitive with the well-established “General Solubility Equation” for medicinal/agrochemical sized molecules.",
                "year":
                2004,
                "links": {
                    "landing_page":
                    "http://pubs.acs.org/doi/abs/10.1021/ci034243x#ci034243xAF1",
                    "publication": [
                        "http://pubs.acs.org/doi/full/10.1021/ci034243x#ci034243xAF1"
                    ],
                    #"data_doi": "",
                    #"related_id": "",

                    #"data_link": {

                    #"globus_endpoint": ,
                    #"http_host": "",

                    #"path": "",

                    #},
                },
            },

            #"mrr": {

            #},

            #"dc": {

            #},
        }
        ## End metadata
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata

    with open(os.path.join(input_path, "delaney_esol.txt"), 'r') as raw_in:
        headers = raw_in.readline().strip("\n").split(",")
        data = raw_in.readlines()

    for line in data:
        line_data = line.strip("\n").split(",")
        record = {}
        # The compound name in the first column may itself contain commas, so
        # rebuild it from everything before the last three fields, then assign
        # those three fields to the remaining headers in order.
        indx = -3
        record[headers[0]] = ",".join(line_data[:indx])

        for head in headers[1:]:
            record[head] = line_data[indx]
            indx += 1
        ## Metadata:record
        record_metadata = {
            "mdf": {
                "title": "ESOL - " + record["SMILES"],
                "acl": ["public"],
                "composition": record["SMILES"],

                #"tags": ,
                #"description": ,
                "raw": json.dumps(record),
                "links": {

                    #"landing_page": ,
                    #"publication": ,
                    #"data_doi": ,
                    #"related_id": ,
                    "txt": {
                        "globus_endpoint":
                        "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host": "https://data.materialsdatafacility.org",
                        "path": "/collections/delaney_esol/delaney_esol.txt",
                    },
                },

                #"citation": ,

                #"data_contact": {

                #"given_name": ,
                #"family_name": ,
                #"email": ,
                #"institution": ,

                #},

                #"author": [{

                #"given_name": ,
                #"family_name": ,
                #"email": ,
                #"institution": ,

                #}],

                #"year": ,
            },

            #"dc": {

            #},
        }
        ## End metadata

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and stop processing if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if not result["success"]:
            if not dataset_validator.cancel_validation()["success"]:
                print(
                    "Error cancelling validation. The partial feedstock may not be removed."
                )
            raise ValueError(result["message"] + "\n" +
                             result.get("details", ""))

    # You're done!
    if verbose:
        print("Finished converting")
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title":
                "On the effect of hydrogen on the elastic moduli and acoustic loss behaviour of Ti-6Al-4V",
                "acl": ['public'],
                "source_name":
                "ti64_acoustic_loss",
                "citation": [
                    "Driver, S. L., Jones, N. G., Stone, H. J., Rugg, D., & Carpenter, M. A. Research Data Supporting \"On the effect of hydrogen on the elastic moduli and acoustic loss behaviour of Ti-6Al-4V\" [Dataset]. https://doi.org/10.17863/CAM.90"
                ],
                "data_contact": {
                    "given_name": "Sarah L.",
                    "family_name": "Driver",
                    "email": "*****@*****.**",
                    "instituition": "University of Cambridge"
                },
                "author": [{
                    "given_name": "Sarah L.",
                    "family_name": "Driver",
                    "email": "*****@*****.**",
                    "instituition": "University of Cambridge"
                }, {
                    "given_name": "Nicholas G.",
                    "family_name": "Jones",
                    "instituition": "University of Cambridge"
                }, {
                    "given_name": "Howard J.",
                    "family_name": "Stone",
                    "instituition": "University of Cambridge"
                }, {
                    "given_name": "David",
                    "family_name": "Rugg",
                    "instituition": "Rolls-Royce plc., Derby, UK"
                }, {
                    "given_name": "Michael A.",
                    "family_name": "Carpenter",
                    "instituition": "University of Cambridge"
                }],
                "license":
                "http://creativecommons.org/licenses/by/4.0/",
                "collection":
                "Ti-6Al-4V Acoustic Loss",
                "tags": [
                    "Titanium alloys", "resonant ultrasound spectroscopy",
                    "microstructure", "mobility", "hydrogen in metals",
                    "internal friction"
                ],
                "description":
                "Resonant Ultrasound Spectroscopy data of a sample of Ti-6Al-4V alloy. Response of the sample due to applied frequency is recorded at set temperatures.",
                "year":
                2016,
                "links": {
                    "landing_page":
                    "https://doi.org/10.17863/CAM.90",
                    "publication":
                    ["https://doi.org/10.1080/14786435.2016.1198054"],
                    # "data_doi": "",

                    #"related_id": ,
                    "txt": {

                        #"globus_endpoint": ,
                        "http_host":
                        "https://www.repository.cam.ac.uk",
                        "path":
                        "/bitstream/handle/1810/256150/ti64-rawdata.txt?sequence=1&isAllowed=y",
                    }
                },

                #            "mrr": ,
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78"
                }]
            }
        }

    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata

    for data_file in tqdm(find_files(input_path, "txt"),
                          desc="Processing records",
                          disable=not verbose):
        temperature = data_file["filename"].split("<")[1].split(">")[0]
        record_metadata = {
            "mdf": {
                "title": "Ti64 Acoustic Loss at " + temperature + "K",
                "acl": ['public'],

                #        "tags": ,
                #        "description": ,
                "composition": "Ti-6Al-4V",
                #        "raw": ,
                "links": {
                    #            "landing_page": ,

                    #            "publication": ,
                    #            "data_doi": ,

                    #            "related_id": ,
                    "txt": {
                        "globus_endpoint":
                        "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host":
                        "https://data.materialsdatafacility.org",
                        "path":
                        "/collections/ti64_acoustic_loss/" +
                        data_file["filename"],
                    },
                },

                #        "citation": ,
                #        "data_contact": {

                #            "given_name": ,
                #            "family_name": ,

                #            "email": ,
                #            "institution":,

                #            },

                #        "author": ,

                #        "license": ,
                #        "collection": ,
                #        "year": ,

                #        "mrr":

                #        "processing": ,
                #        "structure":,
            }
        }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"])

    # You're done!
    if verbose:
        print("Finished converting")
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    # NOTE: For fields that represent people (e.g. mdf-data_contact), other IDs can be added (ex. "github": "jgaff").
    #    It is recommended that all people listed in mdf-data_contributor have a github username listed.
    #
    # If there are other useful fields not covered here, another block (dictionary at the same level as "mdf") can be created for those fields.
    # The block must be called the same thing as the source_name for the dataset.
    if not metadata:
        ## Metadata:dataset
        dataset_metadata = {
            "mdf": {
                "title":
                "Comparative Molecular Moment Analysis (CoMMA):  3D-QSAR without Molecular Superposition",
                "acl": ["public"],
                "source_name":
                "silverman_qsar_comma",
                "data_contact": {
                    "given_name": "Daniel. E.",
                    "family_name": "Platt",
                    "email": "*****@*****.**",
                    "institution": "Thomas J. Watson Research Center",
                },
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78",
                }],
                "citation": [
                    "Platt, Daniel. E. (1996/01/01). Comparative Molecular Moment Analysis (CoMMA):  3D-QSAR without Molecular Superposition. Journal of Medicinal Chemistry, 39, 2129-2140. doi: 10.1021/jm950589q"
                ],
                "author": [{
                    "given_name": "B. D.",
                    "family_name": "Silverman",
                    "institution": "Thomas J. Watson Research Center",
                }, {
                    "given_name": "Daniel. E.",
                    "family_name": "Platt",
                    "email": "*****@*****.**",
                    "institution": "Thomas J. Watson Research Center",
                }],

                #"license": "",
                "collection":
                "Silverman QSAR CoMMA",
                #"tags": [""],
                "description":
                "3d-QSAR procedures utilize descriptors that characterize molecular shape and charge distributions responsible for the steric and electrostatic nonbonding interactions intimately involved in ligand−receptor binding. Comparative molecular moment analysis (CoMMA) utilizes moments of the molecular mass and charge distributions up to and including second order in the development of molecular similarity descriptors. As a consequence, two Cartesian reference frames are then defined with respect to each molecular structure.",
                "year":
                1996,
                "links": {
                    "landing_page":
                    "ftp://ftp.ics.uci.edu/pub/baldig/learning/Silverman/",
                    "publication":
                    ["http://pubs.acs.org/doi/full/10.1021/jm950589q"],
                    #"data_doi": "",
                    #"related_id": ,

                    #"data_link": {

                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #},
                },
            },

            #"mrr": {

            #},

            #"dc": {

            #},
        }
        ## End metadata
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    for data_file in tqdm(find_files(input_path, "sdf"),
                          desc="Processing files",
                          disable=not verbose):
        record = parse_ase(
            os.path.join(data_file["path"], data_file["filename"]), "sdf")
        ## Metadata:record
        record_metadata = {
            "mdf": {
                "title":
                "Silverman QSAR CoMMA - " + record["chemical_formula"],
                "acl": ["public"],
                "composition": record["chemical_formula"],

                #                "tags": ,
                #                "description": ,
                # "raw": json.dumps(record),
                "links": {

                    #                    "landing_page": ,
                    #                    "publication": ,
                    #                    "data_doi": ,
                    #                    "related_id": ,
                    "sdf": {
                        "globus_endpoint":
                        "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host":
                        "https://data.materialsdatafacility.org",
                        "path":
                        "/collections/silverman_qsar_comma/" +
                        data_file["no_root_path"] + "/" +
                        data_file["filename"],
                    },
                    "txt": {
                        "globus_endpoint":
                        "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host": "https://data.materialsdatafacility.org",
                        "path":
                        "/collections/silverman_qsar_comma/activity.txt"
                    }
                },

                #                "citation": ,

                #                "data_contact": {

                #                    "given_name": ,
                #                    "family_name": ,
                #                    "email": ,
                #                    "institution": ,

                #                    },

                #                "author": [{

                #                    "given_name": ,
                #                    "family_name": ,
                #                    "email": ,
                #                    "institution": ,

                #                    }],

                #                "year": ,
            },

            # "dc": {

            # },
        }
        ## End metadata

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and stop processing if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if not result["success"]:
            if not dataset_validator.cancel_validation()["success"]:
                print(
                    "Error cancelling validation. The partial feedstock may not be removed."
                )
            raise ValueError(result["message"] + "\n" +
                             result.get("details", ""))

    # You're done!
    if verbose:
        print("Finished converting")
Example #27
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    # NOTE: For fields that represent people (e.g. mdf-data_contact), other IDs can be added (ex. "github": "jgaff").
    #    It is recommended that all people listed in mdf-data_contributor have a github username listed.
    #
    # If there are other useful fields not covered here, another block (dictionary at the same level as "mdf") can be created for those fields.
    # The block must be called the same thing as the source_name for the dataset.
    if not metadata:
        ## Metadata:dataset
        dataset_metadata = {
            "mdf": {
                "title":
                "DESO-water force fields and MD systems",
                "acl": ["public"],
                "source_name":
                "deso_water_md",
                "data_contact": {
                    "given_name": "Vitaly V.",
                    "family_name": "Chaban",
                    "email": "*****@*****.**",
                    "institution": "University of Rochester",
                },
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78",
                }],
                "citation": [
                    "Vitaly V. Chaban. (2017). DESO-water force fields and MD systems [Data set]. Zenodo. http://doi.org/10.5281/zenodo.254273"
                ],
                "author": [{
                    "given_name": "Vitaly V.",
                    "family_name": "Chaban",
                    "email": "*****@*****.**",
                    "institution": "Federal University of Sǎo Paulo",
                }],
                "license":
                "https://creativecommons.org/licenses/by/4.0/",
                "collection":
                "DESO Water MD",
                #"tags": [""],
                "description":
                "DESO-water force fields and MD systems",
                "year":
                2017,
                "links": {
                    "landing_page":
                    "https://doi.org/10.5281/zenodo.254273",
                    "publication": [
                        "http://pubs.rsc.org/-/content/articlelanding/2016/cp/c5cp08006a/unauth#!divAbstract"
                    ],
                    #"data_doi": "",
                    #"related_id": ,

                    #"data_link": {

                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #},
                },
            },

            #"mrr": {

            #},

            #"dc": {

            #},
        }
        ## End metadata
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    for data_file in tqdm(find_files(input_path, "(gro$|pdb$)"),
                          desc="Processing files",
                          disable=not verbose):
        dtype = data_file["filename"].split(".")[-1]
        if dtype == "gro":
            ftype = "gromacs"
        else:
            ftype = "proteindatabank"
        record = parse_ase(
            os.path.join(data_file["path"], data_file["filename"]), ftype)
        record_metadata = {
            "mdf": {
                "title": "DESO Water MD - " + record["chemical_formula"],
                "acl": ["public"],
                "composition": record["chemical_formula"],

                #                "tags": ,
                #                "description": ,
                #"raw": json.dumps(record),
                "links": {

                    #                    "landing_page": ,
                    #                    "publication": ,
                    #                    "data_doi": ,
                    #                    "related_id": ,
                    dtype: {
                        "globus_endpoint":
                        "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host":
                        "https://data.materialsdatafacility.org",
                        "path":
                        "/collections/deso_water_md/" +
                        data_file["no_root_path"] + "/" +
                        data_file["filename"],
                    },
                },

                #                "citation": ,

                #                "data_contact": {

                #                    "given_name": ,
                #                    "family_name": ,
                #                    "email": ,
                #                    "institution": ,

                #                    },

                #                "author": [{

                #                    "given_name": ,
                #                    "family_name": ,
                #                    "email": ,
                #                    "institution": ,

                #                    }],

                #                "year": ,
            },

            # "dc": {

            # },
        }
        ## End metadata

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and stop processing if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if not result["success"]:
            if not dataset_validator.cancel_validation()["success"]:
                print(
                    "Error cancelling validation. The partial feedstock may not be removed."
                )
            raise ValueError(result["message"] + "\n" +
                             result.get("details", ""))

    # You're done!
    if verbose:
        print("Finished converting")
Example #28
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title":
                "Fast and Accurate Modeling of Molecular Atomization Energies with Machine Learning",
                "acl": ['public'],
                "source_name":
                "gdb7_12",
                "citation": [
                    "Matthias Rupp, Alexandre Tkatchenko, Klaus-Robert Müller, O. Anatole von Lilienfeld: Fast and Accurate Modeling of Molecular Atomization Energies with Machine Learning, Physical Review Letters 108(5): 058301, 2012. DOI: 10.1103/PhysRevLett.108.058301",
                    "Gr\'egoire Montavon, Katja Hansen, Siamac Fazli, Matthias Rupp, Franziska Biegler, Andreas Ziehe, Alexandre Tkatchenko, O. Anatole von Lilienfeld, Klaus-Robert M\"uller: Learning Invariant Representations of Molecules for Atomization Energy Prediction, Advances in Neural Information Processing Systems 25 (NIPS 2012), Lake Tahoe, Nevada, USA, December 3-6, 2012."
                ],
                "data_contact": {
                    "given_name": "O. Anatole",
                    "family_name": "von Lilienfeld",
                    "email": "*****@*****.**",
                    "institution": "Argonne National Laboratory",
                },
                "author": [{
                    "given_name": "O. Anatole",
                    "family_name": "von Lilienfeld",
                    "email": "*****@*****.**",
                    "instituition": "Argonne National Laboratory"
                }, {
                    "given_name":
                    "Alexandre",
                    "family_name":
                    "Tkatchenko",
                    "institution":
                    "University of California Los Angeles, Fritz-Haber-Institut der Max-Planck-Gesellschaft"
                }, {
                    "given_name":
                    "Matthias",
                    "family_name":
                    "Rupp",
                    "instituition":
                    "Technical University of Berlin, University of California Los Angeles",
                }, {
                    "given_name":
                    "Klaus-Robert",
                    "family_name":
                    "Müller",
                    "email":
                    "*****@*****.**",
                    "instituition":
                    "Technical University of Berlin, University of California Los Angeles",
                }],

                #            "license": ,
                "collection":
                "gdb7_12",
                #            "tags": ,
                "description":
                "7k small organic molecules, close to their ground states, with DFT atomization energies. 7,165 small organic molecules composed of H, C, N, O, S, saturated with H, and up to 7 non-H atoms. Molecules relaxed with an empirical potential. Atomization energies calculated using DFT with hybrid PBE0 functional.",
                "year":
                2012,
                "links": {
                    "landing_page":
                    "http://qmml.org/datasets.html#gdb7-12",
                    "publication":
                    ["https://doi.org/10.1103/PhysRevLett.108.058301"],
                    #"data_doi": "",

                    #                "related_id": ,
                    "zip": {

                        #"globus_endpoint": ,
                        "http_host": "http://qmml.org",
                        "path": "/Datasets/gdb7-12.zip",
                    }
                },

                #            "mrr": ,
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78"
                }]
            }
        }

    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    for data_file in tqdm(find_files(input_path, "xyz"),
                          desc="Processing files",
                          disable=not verbose):
        record = parse_ase(
            os.path.join(data_file["path"], data_file["filename"]), "xyz")
        record_metadata = {
            "mdf": {
                "title": "gdb7_12 " + data_file["filename"],
                "acl": ['public'],

                #            "tags": ,
                #            "description": ,
                "composition": record["chemical_formula"],
                #            "raw": ,
                "links": {
                    #"landing_page": ,

                    #                "publication": ,
                    #                "data_doi": ,

                    #                "related_id": ,
                    "xyz": {
                        "globus_endpoint":
                        "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host": "https://data.materialsdatafacility.org",
                        "path":
                        "/collections/gdb7_12/" + data_file["filename"],
                    },
                },

                #            "citation": ,
                #            "data_contact": {

                #                "given_name": ,
                #                "family_name": ,

                #                "email": ,
                #                "institution":,

                #                },

                #            "author": ,

                #            "license": ,
                #            "collection": ,
                #            "year": ,

                #            "mrr":

                #            "processing": ,
                #            "structure":,
            }
        }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"])

    # You're done!
    if verbose:
        print("Finished converting")
Example #29
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title":
                "The Coherent X-ray Imaging Data Bank",
                "acl": ["public"],
                "source_name":
                "cxidb",
                "citation": [
                    "Maia, F. R. N. C. The Coherent X-ray Imaging Data Bank. Nat. Methods 9, 854–855 (2012)."
                ],
                "data_contact": {
                    "given_name": "Filipe",
                    "family_name": "Maia",
                    "email": "*****@*****.**",
                    "institution": "Lawrence Berkeley National Laboratory",

                    # IDs
                },
                "author": {
                    "given_name": "Filipe",
                    "family_name": "Maia",
                    "institution": "Lawrence Berkeley National Laboratory",

                    # IDs
                },

                #            "license": ,
                "collection":
                "CXIDB",
                "tags": ["x-ray", "coherent"],
                "description":
                "A new database which offers scientists from all over the world a unique opportunity to access data from Coherent X-ray Imaging (CXI) experiments.",
                "year":
                2012,
                "links": {
                    "landing_page": "http://www.cxidb.org/",

                    #                "publication": ,
                    #                "dataset_doi": ,

                    #                "related_id": ,

                    # data links: {

                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #}
                },

                #            "mrr": ,
                "data_contributor": {
                    "given_name": "Jonathon",
                    "family_name": "Gaff",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "jgaff"
                }
            }
        }
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    dataset_validator = Validator(dataset_metadata)

    # Get the data
    for dir_data in tqdm(find_files(input_path,
                                    file_pattern="json",
                                    verbose=verbose),
                         desc="Processing metadata",
                         disable=not verbose):
        with open(os.path.join(dir_data["path"],
                               dir_data["filename"])) as file_data:
            cxidb_data = json.load(file_data)
        record_metadata = {
            "mdf": {
                "title": cxidb_data["citation_title"],
                "acl": ["public"],

                #            "tags": ,
                #            "description": ,

                #            "composition": ,
                "raw": json.dumps(cxidb_data),
                "links": {
                    "landing_page":
                    cxidb_data["url"],
                    "publication": [
                        cxidb_data.get("citation_DOI", None),
                        cxidb_data.get("entry_DOI", None)
                    ],
                    #                "dataset_doi": ,

                    #                "related_id": ,

                    # data links: {

                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #},
                },

                #            "citation": ,
                #            "data_contact": {

                #                "given_name": ,
                #                "family_name": ,

                #                "email": ,
                #                "institution":,

                # IDs
                #                },

                #            "author": ,

                #            "license": ,
                #            "collection": ,
                #            "data_format": ,
                #            "data_type": ,
                #            "year": ,

                #            "mrr":

                #            "processing": ,
                #            "structure":,
            }
        }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"])

    if verbose:
        print("Finished converting")
Example #30
def convert(input_path, metadata=None, verbose=False):
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
        "mdf": {
            "title": "JCAP Benchmarking Database",
            "acl": ["public"],
            "source_name": "jcap_benchmarking_db",
            "citation": ["McCrory, C. C. L., Jung, S. H., Peters, J. C. & Jaramillo, T. F. Benchmarking Heterogeneous Electrocatalysts for the Oxygen Evolution Reaction. Journal of the American Chemical Society 135, 16977-16987, DOI: 10.1021/ja407115p (2013)", "McCrory, C. C. L. et al. Benchmarking HER and OER Electrocatalysts for Solar Water Splitting Devices. Journal of the American Chemical Society, 137, 4347–4357, DOI: 10.1021/ja510442p (2015)"],
            "data_contact": {

                "given_name": "Charles",
                "family_name": "McCrory",

                "email": "*****@*****.**",
                "institution": "Joint Center for Artificial Photosynthesis",

                },

            "author": [{

                "given_name": "Charles",
                "family_name": "McCrory",

                "institution": "Joint Center for Artificial Photosynthesis",

                },
                {

                "given_name": "Suho",
                "family_name": "Jung",

                "institution": "Joint Center for Artificial Photosynthesis",

                },
                {

                "given_name": "Jonas",
                "family_name": "Peters",

                "institution": "Joint Center for Artificial Photosynthesis",

                },
                {

                "given_name": "Thomas",
                "family_name": "Jaramillo",

                "institution": "Joint Center for Artificial Photosynthesis",

                }],

#            "license": ,

            "collection": "JCAP Benchmarking DB",
            "tags": ["benchmarking", "catalyst"],

            "description": "The JCAP Benchmarking scientists developed and implemented uniform methods and protocols for characterizing the activities of electrocatalysts under standard operating conditions for water-splitting devices. They have determined standard measurement protocols that reproducibly quantify catalytic activity and stability. Data for several catalysts studied are made available in this database.",
            "year": 2013,

            "links": {

                "landing_page": "http://solarfuelshub.org/benchmarking-database",

                "publication": ["https://dx.doi.org/10.1021/ja407115p", "https://dx.doi.org/10.1021/ja510442p"],
#                "dataset_doi": ,

#                "related_id": ,

                # data links: {

                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #}
                },

#            "mrr": ,

            "data_contributor": {
                "given_name": "Jonathon",
                "family_name": "Gaff",
                "email": "*****@*****.**",
                "institution": "The University of Chicago",
                "github": "jgaff"
                }
            }
        }
    elif type(metadata) is str:
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")


    dataset_validator = Validator(dataset_metadata)


    # Get the data
    for data_file in tqdm(find_files(input_path, ".txt"), desc="Processing files", disable=not verbose):
        with open(os.path.join(data_file["path"], data_file["filename"])) as in_file:
            record = {}
            key = ""
            for line in in_file:
                clean_line = line.strip()
                if clean_line.endswith(":"):
                    # Header lines ("Catalyst:") name the field for the value below
                    key = clean_line.strip(": ").lower().replace(" ", "_")
                elif key and clean_line:
                    record[key] = clean_line
        record_metadata = {
        "mdf": {
            "title": "JCAP Benchmark - " + record["catalyst"],
            "acl": ["public"],

#            "tags": ,
#            "description": ,
            
            "composition": record["catalyst"],
            "raw": json.dumps(record),

            "links": {
                "landing_page": "https://internal.solarfuelshub.org/jcapresources/benchmarking/catalysts_for_iframe/view/jcapbench_catalyst/" + data_file["filename"][:-4],

#                "publication": ,
#                "dataset_doi": ,

#                "related_id": ,

                # data links: {
 
                    #"globus_endpoint": ,
                    #"http_host": ,

                    #"path": ,
                    #},
                },

            "citation": record["publication"],
#            "data_contact": {

#                "given_name": ,
#                "family_name": ,

#                "email": ,
#                "institution":,

                # IDs
#                },

#            "author": ,

#            "license": ,
#            "collection": ,
#            "data_format": ,
#            "data_type": ,
            "year": int(record["release_date"][:4]),

#            "mrr":

#            "processing": ,
#            "structure":,
            }
        }
#        record_metadata["jcap_benchmarking_db"] = record

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if result["success"] is not True:
            print("Error:", result["message"])


    if verbose:
        print("Finished converting")