Code example #1
File: klh_1_converter.py  Project: maxhutch/forge
def convert(input_path, metadata=None, verbose=False):
    """Convert the KLH Dataset I (cryoEM micrographs) into MDF feedstock.

    Arguments:
    input_path (str): Path to the root directory of the dataset files.
    metadata (str or dict): Dataset metadata as a JSON string, a path to a
        JSON file, or an already-parsed dict. Default None, which uses the
        built-in dataset metadata defined below.
    verbose (bool): If True, print progress messages. Default False.
    """
    if verbose:
        print("Begin converting")

    # Collect the metadata
    # NOTE: For fields that represent people (e.g. mdf-data_contact), other IDs can be added (ex. "github": "jgaff").
    #    It is recommended that all people listed in mdf-data_contributor have a github username listed.
    #
    # If there are other useful fields not covered here, another block (dictionary at the same level as "mdf") can be created for those fields.
    # The block must be called the same thing as the source_name for the dataset.
    if not metadata:
        ## Metadata:dataset
        dataset_metadata = {
            "mdf": {

                "title": "KLH Dataset I",
                "acl": ["public"],
                "source_name": "klh_1",

                "data_contact": {

                    "given_name": "Clinton S",
                    "family_name": "Potter",
                    "email": "*****@*****.**",
                    "institution": "The Scripps Research Institute",

                },

                "data_contributor": [{

                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78",

                }],

                #"citation": [""],

                "author": [{

                    "given_name": "Yuanxin",
                    "family_name": "Zhu",
                    "institution": "The Scripps Research Institute",

                },
                {

                    "given_name": "Bridget",
                    "family_name": "Carragher",
                    "institution": "The Scripps Research Institute",

                },
                {

                    "given_name": "Robert M",
                    "family_name": "Glaeser",
                    "institution": "University of California, Berkeley",

                },
                {

                    "given_name": "Denis",
                    "family_name": "Fellmann",
                    "institution": "The Scripps Research Institute",

                },
                {

                    "given_name": "Chandrajit",
                    "family_name": "Bajaj",
                    "institution": "University of Texas at Austin,",

                },
                {

                    "given_name": "Marshall",
                    "family_name": "Bern",
                    "institution": "Palo Alto Research Center",

                },
                {

                    "given_name": "Fabrice",
                    "family_name": "Mouche",
                    "institution": "The Scripps Research Institute",

                },
                {

                    "given_name": "Felix",
                    "family_name": "de Haas",
                    "institution": "FEI Company, Eindhoven",

                },
                {

                    "given_name": "Richard J",
                    "family_name": "Hall",
                    "institution": "Imperial College London",

                },
                {

                    "given_name": "David J",
                    "family_name": "Kriegman",
                    "institution": "University of California, San Diego",

                },
                {

                    "given_name": "Steven J",
                    "family_name": "Ludtke",
                    "institution": "Baylor College of Medicine",

                },
                {

                    "given_name": "Satya P",
                    "family_name": "Mallick",
                    "institution": "University of California, San Diego",

                },
                {

                    "given_name": "Pawel A",
                    "family_name": "Penczek",
                    "institution": "University of Texas-Houston Medical School",

                },
                {

                    "given_name": "Alan M",
                    "family_name": "Roseman",
                    "institution": "MRC Laboratory of Molecular Biology",

                },
                {

                    "given_name": "Fred J",
                    "family_name": "Sigworth",
                    "institution": "Yale University School of Medicine",

                },
                {

                    "given_name": "Niels",
                    "family_name": "Volkmann",
                    "institution": "The Burnham Institute",

                },
                {

                    "given_name": "Clinton S",
                    "family_name": "Potter",
                    "email": "*****@*****.**",
                    "institution": "The Scripps Research Institute",

                }],

                #"license": "",
                "collection": "Keyhole Limpet Hemocyanin",
                "tags": ["Electron microscopy", "Single-particle reconstruction", "Automatic particle selection", "Image processing", "Pattern recognition"],
                "description": "Manual selection of single particles in images acquired using cryo-electron microscopy (cryoEM) will become a significant bottleneck when datasets of a hundred thousand or even a million particles are required for structure determination at near atomic resolution. Algorithm development of fully automated particle selection is thus an important research objective in the cryoEM field. A number of research groups are making promising new advances in this area. Evaluation of algorithms using a standard set of cryoEM images is an essential aspect of this algorithm development. With this goal in mind, a particle selection \"bakeoff\" was included in the program of the Multidisciplinary Workshop on Automatic Particle Selection for cryoEM. Twelve groups participated by submitting the results of testing their own algorithms on a common dataset. The dataset consisted of 82 defocus pairs of high-magnification micrographs, containing keyhole limpet hemocyanin particles, acquired using cryoEM.",
                "year": 2004,

                "links": {

                    "landing_page": "http://emg.nysbc.org/redmine/projects/public-datasets/wiki/KLH_dataset_I",
                    "publication": ["http://www.sciencedirect.com/science/article/pii/S1047847703002004#!"],
                    #"data_doi": "",
                    #"related_id": "",

                    #"data_link": {

                        #"globus_endpoint": ,
                        #"http_host": "",

                        #"path": "",

                    #},

                },

            },

            #"mrr": {

            #},

            #"dc": {

            #},


        }
        ## End metadata
    elif type(metadata) is str:
        # A string may be raw JSON or a path to a JSON file; try both in turn.
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")



    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)


    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    # Each file ending in "map" is a space-separated table whose rows pair an
    # image file with its particle-coordinate file.
    for data_file in tqdm(find_files(input_path, "map$"), desc="Processing Files", disable=not verbose):
        with open(os.path.join(data_file["path"], data_file["filename"]), 'r') as raw_in:
            map_data = raw_in.read()
        headers = ["index", "image", "coordinate"]
        for line in parse_tab(map_data, headers=headers, sep=" "):
            # The map lists the exposure2 image (suffix ".002"); the matching
            # exposure1 (near-to-focus) image swaps that suffix for ".001".
            ifile_1 = line["image"].replace(".002", ".001")
            ifile_2 = line["image"]
            cfile = line["coordinate"]
            # Particle coordinates for this image pair (whitespace-delimited table).
            df = pd.read_csv(os.path.join(data_file["path"], cfile), delim_whitespace=True)
            ## Metadata:record
            record_metadata = {
                "mdf": {

                    "title": "Keyhole Limpet Hemocyanin 1 - " + cfile,
                    "acl": ["public"],
                    #"composition": ,

                    #"tags": ,
                    "description": "Images under exposure1 are near-to-focus (NTF). Images under exposure2 are far-from-focus (FFF).",
                    #"raw": ,

                    "links": {

                        #"landing_page": ,
                        #"publication": ,
                        #"data_doi": ,
                        #"related_id": ,

                        "klh": {

                            "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                            "http_host": "https://data.materialsdatafacility.org",

                            "path": "/collections/klh_1/" + data_file["no_root_path"] + "/" + cfile,

                            },

                        "jpg": {

                                "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                                "http_host": "https://data.materialsdatafacility.org",

                                "path": "/collections/klh_1/exposure1_jpeg/" + ifile_1.replace(".mrc", ".jpg"),

                            },


                        "mrc": {

                                "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                                "http_host": "https://data.materialsdatafacility.org",

                                "path": "/collections/klh_1/exposure1_mrc/" + ifile_1,

                            },

                        "jpg2": {

                                "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                                "http_host": "https://data.materialsdatafacility.org",

                                "path": "/collections/klh_1/exposure2_jpeg/" + ifile_2.replace(".mrc", ".jpg"),

                            },


                        "mrc2": {

                                "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                                "http_host": "https://data.materialsdatafacility.org",

                                "path": "/collections/klh_1/exposure2_mrc/" + ifile_2,

                            },
                    },

                    #"citation": ,

                    #"data_contact": {

                        #"given_name": ,
                        #"family_name": ,
                        #"email": ,
                        #"institution": ,

                    #},

                    #"author": [{

                        #"given_name": ,
                        #"family_name": ,
                        #"email": ,
                        #"institution": ,

                    #}],

                    #"year": ,

                },

                #"dc": {

                #},


            }
            ## End metadata

            # Pass each individual record to the Validator
            result = dataset_validator.write_record(record_metadata)

            # Check if the Validator accepted the record, and stop processing if it didn't
            # If the Validator returns "success" == True, the record was written successfully
            if not result["success"]:
                if not dataset_validator.cancel_validation()["success"]:
                    print("Error cancelling validation. The partial feedstock may not be removed.")
                raise ValueError(result["message"] + "\n" + result.get("details", ""))


    # You're done!
    if verbose:
        print("Finished converting")
Code example #2
def convert(input_path, metadata=None, verbose=False):
    """Convert the carbonyl sulfide fluxes dataset into MDF feedstock.

    Arguments:
    input_path (str): Path to the directory containing
        "Kooijmans_et_al_2017_ACPD_20170516.txt".
    metadata (str or dict): Dataset metadata as a JSON string, a path to a
        JSON file, or an already-parsed dict. Default None, which uses the
        built-in dataset metadata defined below.
    verbose (bool): If True, print progress messages. Default False.
    """
    if verbose:
        print("Begin converting")

    # Collect the metadata
    # NOTE: For fields that represent people (e.g. mdf-data_contact), other IDs can be added (ex. "github": "jgaff").
    #    It is recommended that all people listed in mdf-data_contributor have a github username listed.
    #
    # If there are other useful fields not covered here, another block (dictionary at the same level as "mdf") can be created for those fields.
    # The block must be called the same thing as the source_name for the dataset.
    if not metadata:
        ## Metadata:dataset
        dataset_metadata = {
            "mdf": {
                "title":
                "Dataset for \"Canopy uptake dominates nighttime carbonyl sulfide fluxes in a boreal forest\"",
                "acl": ["public"],
                "source_name":
                "carbonyl_sulfide_fluxes",
                "data_contact": {
                    "given_name":
                    "Huilin",
                    "family_name":
                    "Chen",
                    "email":
                    "*****@*****.**",
                    "institution":
                    "University of Groningen, University of Colorado"
                },
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78",
                }],
                "citation": [
                    "Linda M.J. Kooijmans, Kadmiel Maseyk, Ulli Seibt, Wu Sun, Timo Vesala, Ivan Mammarella, … Huilin Chen. (2017). Dataset for \"Canopy uptake dominates nighttime carbonyl sulfide fluxes in a boreal forest\" [Data set]. Zenodo. http://doi.org/10.5281/zenodo.580303"
                ],
                "author": [{
                    "given_name": "Linda M.J.",
                    "family_name": "Kooijmans",
                    "institution": "University of Groningen",
                }, {
                    "given_name": "Kadmiel",
                    "family_name": "Maseyk",
                    "institution": "The Open University",
                }, {
                    "given_name": "Ulli",
                    "family_name": "Seibt",
                    "institution": "University of California",
                }, {
                    "given_name": "Wu",
                    "family_name": "Sun",
                    "institution": "University of California",
                }, {
                    "given_name": "Timo",
                    "family_name": "Vesala",
                    "institution": "University of Helsinki",
                }, {
                    "given_name": "Ivan",
                    "family_name": "Mammarella",
                    "institution": "University of Helsinki",
                }, {
                    "given_name": "Pasi",
                    "family_name": "Kolari",
                    "institution": "University of Helsinki",
                }, {
                    "given_name": "Juho",
                    "family_name": "Aalto",
                    "institution": "University of Helsinki",
                }, {
                    "given_name":
                    "Alessandro",
                    "family_name":
                    "Franchin",
                    "institution":
                    "University of Helsinki, University of Colorado",
                }, {
                    "given_name": "Roberta",
                    "family_name": "Vecchi",
                    "institution": "University of Milan",
                }, {
                    "given_name": "Gianluigi",
                    "family_name": "Valli",
                    "institution": "University of Milan",
                }, {
                    "given_name":
                    "Huilin",
                    "family_name":
                    "Chen",
                    "email":
                    "*****@*****.**",
                    "institution":
                    "University of Groningen, University of Colorado",
                }],
                "license":
                "https://creativecommons.org/licenses/by/4.0/",
                "collection":
                "Carbonyl Sulfide Fluxes",
                #"tags": [""],
                "description":
                "Nighttime averaged ecosystem fluxes of COS and CO2 obtained through the radon-tracer and eddy-covariance method as presented in \"Canopy uptake dominates nighttime carbonyl sulfide fluxes in a boreal forest\" submitted to Atmospheric Chemistry and Physics.",
                "year":
                2017,
                "links": {
                    "landing_page":
                    "https://doi.org/10.5281/zenodo.580303",
                    "publication":
                    ["https://www.atmos-chem-phys-discuss.net/acp-2017-407/"],
                    #"data_doi": "",
                    #"related_id": "",
                    "txt": {

                        #"globus_endpoint": ,
                        "http_host":
                        "https://zenodo.org",
                        "path":
                        "/record/580303/files/Kooijmans_et_al_2017_ACPD_20170516.txt",
                    },
                },
            },

            #"mrr": {

            #},

            #"dc": {

            #},
        }
        ## End metadata
    elif type(metadata) is str:
        # A string may be raw JSON or a path to a JSON file; try both in turn.
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    It is recommended that you use a parser to help with this process if one is available for your datatype
    #    Each record also needs its own metadata
    with open(
            os.path.join(input_path, "Kooijmans_et_al_2017_ACPD_20170516.txt"),
            "r") as raw_in:
        data = raw_in.read()
    # The record description is the second blank-line-separated paragraph of the file.
    description = "".join(data.split("\n\n")[1:2])
    # The comma-separated data rows follow the last delimiter line of '#' characters.
    start = "##########################################\n"
    for line in tqdm(parse_tab(data.split(start)[-1], sep=","),
                     desc="Processing Data",
                     disable=not verbose):
        ## Metadata:record
        record_metadata = {
            "mdf": {
                "title": "Carbonyl Sulfide Fluxes doy: " + line["doy"],
                "acl": ["public"],
                #"composition": ,

                #"tags": ,
                "description": description,
                "raw": json.dumps(line),
                "links": {

                    #"landing_page": ,
                    #"publication": ,
                    #"data_doi": ,
                    #"related_id": ,
                    "txt": {
                        "globus_endpoint":
                        "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host":
                        "https://data.materialsdatafacility.org",
                        "path":
                        "/collections/carbonyl_sulfide_fluxes/Kooijmans_et_al_2017_ACPD_20170516.txt",
                    },
                },

                #"citation": ,

                #"data_contact": {

                #"given_name": ,
                #"family_name": ,
                #"email": ,
                #"institution": ,

                #},

                #"author": [{

                #"given_name": ,
                #"family_name": ,
                #"email": ,
                #"institution": ,

                #}],

                #"year": ,
            },

            #"dc": {

            #},
        }
        ## End metadata

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and stop processing if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        if not result["success"]:
            if not dataset_validator.cancel_validation()["success"]:
                print(
                    "Error cancelling validation. The partial feedstock may not be removed."
                )
            raise ValueError(result["message"] + "\n" +
                             result.get("details", ""))

    # You're done!
    if verbose:
        print("Finished converting")
Code example #3
File: ft_icr_ms_converter.py  Project: maxhutch/forge
def convert(input_path, metadata=None, verbose=False):
    """Convert the FT-ICR MS complex-mixtures dataset into MDF feedstock.

    Arguments:
    input_path (str): Path to the directory containing "ft_icr_ms_data.txt".
    metadata (str or dict): Dataset metadata as a JSON string, a path to a
        JSON file, or an already-parsed dict. Default None, which uses the
        built-in dataset metadata defined below.
    verbose (bool): If True, print progress messages. Default False.
    """
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title": "Assigned formula of complex mixture FT-ICR MS datasets",
                "acl": ["public"],
                "source_name": "ft_icr_ms",
                "citation": ["Blackburn, John; Uhrin, Dusan. (2017). Assigned formula of complex mixture FT-ICR MS datasets, [dataset]. University of Edinburgh. School of Chemistry. http://dx.doi.org/10.7488/ds/1984"],
                "data_contact": {

                    "given_name": "Dusan",
                    "family_name": "Uhrin",

                    "email": "*****@*****.**",
                    # Fixed key: was misspelled "instituition" (the schema key
                    # used by the other converters in this file is "institution")
                    "institution": "University of Edinburgh"

                    },

                "author": [{

                    "given_name": "John",
                    "family_name": "Blackburn",

                    "institution": "University of Edinburgh"

                    },
                    {

                    "given_name": "Dusan",
                    "family_name": "Uhrin",

                    "email": "*****@*****.**",
                    "institution": "University of Edinburgh"

                    }],

                "license": "http://creativecommons.org/licenses/by/4.0/legalcode",

                "collection": "FT-ICR MS Complex Mixtures",
                "tags": ["ESI", "MALDI", "LDI"],

                "description": "The dataset included is of formula assigned from FT-ICR MS data for samples of Suwannee River fulvic acid (SRFA) and Suwannee River natural organic matter (SRNOM) (both are standards from the International Humic Substances Society) using a variety of ionisation sources. This includes electrospray ionisation (ESI), matrix assisted laser desorption/ionisation (MALDI) and matrix free laser desorption/ionisation (LDI).",
                "year": 2017,

                "links": {

                    "landing_page": "http://datashare.is.ed.ac.uk/handle/10283/2640",

                    "publication": ["http://dx.doi.org/10.1021/acs.analchem.6b04817"],

                    "zip": {

                        "http_host": "http://datashare.is.ed.ac.uk",

                        "path": "/download/10283/2640/Assigned_formula_of_complex_mixture_FT-ICR_MS_datasets.zip",
                        }
                    },

                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78"
                    }]
                }
            }

    elif isinstance(metadata, str):
        # A string may be raw JSON or a path to a JSON file; try both in turn.
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif isinstance(metadata, dict):
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")



    # Make a Validator to help write the feedstock
    # You must pass the metadata to the constructor
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception and the program will exit
    dataset_validator = Validator(dataset_metadata)


    # Get the data
    #    Each record should be exactly one dictionary
    #    You must write your records using the Validator one at a time
    #    Each record also needs its own metadata
    # Each semicolon-separated row in the data file describes one assigned
    # molecular formula and becomes one record.
    with open(os.path.join(input_path, "ft_icr_ms_data.txt")) as raw_in:
        all_data = raw_in.read()
    for record in tqdm(parse_tab(all_data, sep=";"), desc="Processing files", disable=not verbose):
        record_metadata = {
            "mdf": {
                "title": "FT_ICR_MS " + record["Molecular Formula"],
                "acl": ['public'],

                "composition": record["Molecular Formula"],
                "raw": json.dumps(record),

                "links": {

                    "txt": {
                        "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host": "https://data.materialsdatafacility.org",

                        "path": "/collections/ft_icr_ms/ft_icr_ms_data.txt",
                        },
                    },
                }
            }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # Check if the Validator accepted the record, and print a message if it didn't
        # If the Validator returns "success" == True, the record was written successfully
        # NOTE: unlike the other converters in this file, this one continues
        # processing after a failed record instead of cancelling validation.
        if result["success"] is not True:
            print("Error:", result["message"])

    # You're done!
    if verbose:
        print("Finished converting")
Code example #4
def convert(input_path, metadata=None, verbose=False):
    """Convert the irradiated_pyrochlores_ml dataset into MDF feedstock.

    Arguments:
    input_path (str): Directory containing irradiated_pyrochlores_ml_data.txt.
    metadata (dict or str): Dataset metadata as a dict, a JSON string, or a
                            path to a JSON file. If falsy, the built-in
                            metadata below is used. Default None.
    verbose (bool): Print status messages when True. Default False.
    """
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title": "Using Machine Learning To Identify Factors That Govern Amorphization of Irradiated Pyrochlores",
                "acl": ['public'],
                "source_name": "irradiated_pyrochlores_ml",
                "citation": [
                    "Using Machine Learning To Identify Factors That Govern Amorphization of Irradiated Pyrochlores Ghanshyam Pilania, Karl R. Whittle, Chao Jiang, Robin W. Grimes, Christopher R. Stanek, Kurt E. Sickafus, and Blas Pedro Uberuaga Chemistry of Materials 2017 29 (6), 2574-2583 DOI: 10.1021/acs.chemmater.6b04666"
                ],
                # Key fixed from misspelled "instituition" to "institution",
                # matching the MDF schema and the other converters in this file.
                "data_contact": {
                    "given_name": "Blas Pedro",
                    "family_name": "Uberuaga",
                    "email": "*****@*****.**",
                    "institution": "Los Alamos National Laboratory"
                },
                "author": [{
                    "given_name": "Blas Pedro",
                    "family_name": "Uberuaga",
                    "email": "*****@*****.**",
                    "institution": "Los Alamos National Laboratory"
                }, {
                    "given_name": "Ghanshyam",
                    "family_name": "Pilania",
                    "institution": "Los Alamos National Laboratory"
                }, {
                    "given_name": "Karl R.",
                    "family_name": "Whittle",
                    "institution": "University of Liverpool"
                }, {
                    "given_name": "Chao",
                    "family_name": "Jiang",
                    "institution": "Idaho National Laboratory"
                }, {
                    "given_name": "Robin W.",
                    "family_name": "Grimes",
                    "institution": "Imperial College London"
                }, {
                    "given_name": "Christopher R.",
                    "family_name": "Stanek",
                    "institution": "Los Alamos National Laboratory"
                }, {
                    "given_name": "Kurt E.",
                    "family_name": "Sickafus",
                    "institution": "University of Tennessee"
                }],
                "license": "https://creativecommons.org/licenses/by-nc/4.0/",
                "collection": "ML for Amorphization of Irradiated Pyrochlores",
                "tags": ["ML"],
                "description": "Here, we use a machine learning model to examine the factors that govern amorphization resistance in the complex oxide pyrochlore (A2B2O7) in a regime in which amorphization occurs as a consequence of defect accumulation. We examine the fidelity of predictions based on cation radii and electronegativities, the oxygen positional parameter, and the energetics of disordering and amorphizing the material.",
                "year": 2017,
                "links": {
                    "landing_page": "http://pubs.acs.org/doi/full/10.1021/acs.chemmater.6b04666#showFigures",
                    "pdf": {
                        "http_host": "https://ndownloader.figshare.com",
                        "path": "/files/7712131",
                    }
                },
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78"
                }]
            }
        }

    elif isinstance(metadata, str):
        # Accept either a JSON string or a path to a JSON file.
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif isinstance(metadata, dict):
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock.
    # Each Validator instance can only be used for a single dataset;
    # invalid metadata makes the constructor raise and the program exit.
    dataset_validator = Validator(dataset_metadata)

    # Parse the data file and write one record per row.
    # The first line of the file holds the "; "-separated column headers.
    with open(os.path.join(input_path, "irradiated_pyrochlores_ml_data.txt"),
              'r') as raw_in:
        headers = raw_in.readline().split("; ")
        for record in parse_tab(raw_in.read(), headers=headers, sep=" "):
            record_metadata = {
                "mdf": {
                    "title": "ML for Amorphization of Irradiated Pyrochlores - " + record["Compound"],
                    "acl": ['public'],
                    "composition": record["Compound"],
                    "links": {
                        "txt": {
                            "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                            "http_host": "https://data.materialsdatafacility.org",
                            "path": "/collections/irradiated_pyrochlores_ml/irradiated_pyrochlores_ml_data.txt",
                        },
                    },
                }
            }

            # Pass each individual record to the Validator; it reports
            # {"success": bool, "message": str} for failures.
            result = dataset_validator.write_record(record_metadata)
            if result["success"] is not True:
                print("Error:", result["message"])

    # You're done!
    if verbose:
        print("Finished converting")
コード例 #5
0
def convert(input_path, metadata=None, verbose=False):
    """Convert the quinary_alloys (Ni-Co-Al-Ti-Cr) dataset into MDF feedstock.

    Arguments:
    input_path (str): Directory containing alloy_data.csv and the per-alloy
                      data files discovered via find_files().
    metadata (dict or str): Dataset metadata as a dict, a JSON string, or a
                            path to a JSON file. If falsy, the built-in
                            metadata below is used. Default None.
    verbose (bool): Print status messages when True. Default False.
    """
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title": 'Research Data Supporting "The microstructure and hardness of Ni-Co-Al-Ti-Cr quinary alloys"',
                "acl": ["public"],
                "source_name": "quinary_alloys",
                "citation": [
                    'Christofidou, K. A., Jones, N. G., Pickering, E. J., Flacau, R., Hardy, M. C., & Stone, H. J. Research Data Supporting "The microstructure and hardness of Ni-Co-Al-Ti-Cr quinary alloys" [Dataset]. https://doi.org/10.17863/CAM.705'
                ],
                "data_contact": {
                    "given_name": "Howard",
                    "family_name": "Stone",
                    "email": "*****@*****.**",
                    "institution": "University of Cambridge"
                },
                "author": [{
                    "given_name": "Howard",
                    "family_name": "Stone",
                    "email": "*****@*****.**",
                    "institution": "University of Cambridge"
                }, {
                    "given_name": "Katerina",
                    "family_name": "Christofidou",
                    "institution": "University of Cambridge",
                    "orcid": "https://orcid.org/0000-0002-8064-5874"
                }, {
                    "given_name": "Nicholas",
                    "family_name": "Jones",
                    "institution": "University of Cambridge"
                }, {
                    "given_name": "Edward",
                    "family_name": "Pickering",
                    "institution": "University of Cambridge"
                }, {
                    "given_name": "Roxana",
                    "family_name": "Flacau",
                    "institution": "University of Cambridge"
                }, {
                    "given_name": "Mark",
                    "family_name": "Hardy",
                    "institution": "University of Cambridge"
                }],
                "license": "http://creativecommons.org/licenses/by/4.0/",
                "collection": "Ni-Co-Al-Ti-Cr Quinary Alloys",
                "tags": ["alloys"],
                "description": "DSC files, neutron diffraction data, hardness measurements, SEM and TEM images and thermodynamic simulations are provided for all alloy compositions studied and presented in this manuscript.",
                "year": 2016,
                "links": {
                    "landing_page": "https://www.repository.cam.ac.uk/handle/1810/256771",
                    "publication": "https://doi.org/10.1016/j.jallcom.2016.07.159",
                    "data_doi": "https://doi.org/10.17863/CAM.705",
                },
                # Wrapped in a list: every other converter in this file (and
                # the MDF convention) supplies data_contributor as a list of
                # dicts, not a bare dict.
                "data_contributor": [{
                    "given_name": "Jonathon",
                    "family_name": "Gaff",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "jgaff"
                }]
            }
        }
    elif isinstance(metadata, str):
        # Accept either a JSON string or a path to a JSON file.
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif isinstance(metadata, dict):
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    dataset_validator = Validator(dataset_metadata)

    # Get the data: one record per row of alloy_data.csv, with links to all
    # auxiliary files whose names contain the alloy identifier.
    with open(os.path.join(input_path, "alloy_data.csv"), 'r') as adata:
        raw_data = adata.read()
    for record in tqdm(parse_tab(raw_data),
                       desc="Processing records",
                       disable=not verbose):
        links = {}
        for ln in find_files(input_path, record["Alloy"]):
            # Build a flat link key from the relative path, e.g. "a/b" -> "a_b".
            key = "_".join(ln["no_root_path"].split("/")).replace(" ", "_")
            links[key] = {
                "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                "http_host": "https://data.materialsdatafacility.org",
                "path": os.path.join("/collections/quinary_alloys",
                                     ln["no_root_path"], ln["filename"])
            }
        links["csv"] = {
            "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
            "http_host": "https://data.materialsdatafacility.org",
            "path": "/collections/quinary_alloys/alloy_data.csv"
        }
        record_metadata = {
            "mdf": {
                "title": "Ni-Co-Al-Ti-Cr Quinary Alloys " + record["Alloy"],
                "acl": ["public"],
                "composition": "NiCoAlTiCr",
                "raw": json.dumps(record),
                "links": links,
            },
            # Dataset-specific block, keyed by source_name.
            "quinary_alloys": {
                "atomic_composition_percent": {
                    "Ni": record["Ni"],
                    "Co": record["Co"],
                    "Al": record["Al"],
                    "Ti": record["Ti"],
                    "Cr": record["Cr"]
                }
            }
        }

        # Pass each individual record to the Validator; it reports
        # {"success": bool, "message": str} for failures.
        result = dataset_validator.write_record(record_metadata)
        if result["success"] is not True:
            print("Error:", result["message"])

    if verbose:
        print("Finished converting")
コード例 #6
0
ファイル: gdb8_15_converter.py プロジェクト: maxhutch/forge
def convert(input_path, metadata=None, verbose=False):
    """Convert the gdb8_15 electronic-spectra dataset into MDF feedstock.

    Arguments:
    input_path (str): Directory containing gdb8_22k_elec_spec.txt.
    metadata (dict or str): Dataset metadata as a dict, a JSON string, or a
                            path to a JSON file. If falsy, the built-in
                            metadata below is used. Default None.
    verbose (bool): Print status messages when True. Default False.

    Note: also reads ~/mdf/feedstock/gdb9_14_all.json to look up chemical
    compositions for the records.
    """
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title": "Electronic spectra from TDDFT and machine learning in chemical space",
                "acl": ['public'],
                "source_name": "gdb8_15",
                "citation": [
                    "Electronic spectra of 22k molecules Raghunathan Ramakrishnan, Mia Hartmann, Enrico Tapavicza, O. Anatole von Lilienfeld, J. Chem. Phys. submitted (2015)",
                    "Structures of 22k molecules Raghunathan Ramakrishnan, Pavlo Dral, Matthias Rupp, O. Anatole von Lilienfeld Scientific Data 1, Article number: 140022 (2014). doi:10.1038/sdata.2014.22"
                ],
                "data_contact": {
                    "given_name": "O. Anatole",
                    "family_name": "von Lilienfeld",
                    "email": "*****@*****.**",
                    "institution": "Argonne National Laboratory",
                },
                # Keys fixed from misspelled "instituition" to "institution",
                # matching the MDF schema and the other converters in this file.
                "author": [{
                    "given_name": "O. Anatole",
                    "family_name": "von Lilienfeld",
                    "email": "*****@*****.**",
                    "institution": "Argonne National Laboratory"
                }, {
                    "given_name": "Raghunathan",
                    "family_name": "Ramakrishnan",
                    "institution": "University of Basel"
                }, {
                    "given_name": "Mia",
                    "family_name": "Hartmann",
                    "institution": "California State University",
                }, {
                    "given_name": "Enrico",
                    "family_name": "Tapavicza",
                    "email": "*****@*****.**",
                    "institution": "California State University",
                }],
                "collection": "gdb8_15",
                "tags": [
                    "Density functional theory", "Excitation energies",
                    "Computer modeling", "Oscillators", "Molecular spectra"
                ],
                "description": "Due to its favorable computational efficiency, time-dependent (TD) density functional theory (DFT) enables the prediction of electronic spectra in a high-throughput manner across chemical space. Its predictions, however, can be quite inaccurate. We resolve this issue with machine learning models trained on deviations of reference second-order approximate coupled-cluster (CC2) singles and doubles spectra from TDDFT counterparts, or even from DFT gap. We applied this approach to low-lying singlet-singlet vertical electronic spectra of over 20 000 synthetically feasible small organic molecules with up to eight CONF atoms.",
                "year": 2015,
                "links": {
                    "landing_page": "http://qmml.org/datasets.html#gdb8-15",
                    # Fixed: the URL was accidentally doubled
                    # ("...4928757http://dx.doi.org/10.1063/1.4928757").
                    "publication": [
                        "http://dx.doi.org/10.1063/1.4928757"
                    ],
                    "zip": {
                        "http_host": "http://qmml.org",
                        "path": "/Datasets/gdb8-15.zip",
                    }
                },
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78"
                }]
            }
        }

    elif isinstance(metadata, str):
        # Accept either a JSON string or a path to a JSON file.
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif isinstance(metadata, dict):
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock.
    # Each Validator instance can only be used for a single dataset;
    # invalid metadata makes the constructor raise and the program exit.
    dataset_validator = Validator(dataset_metadata)

    # Column headers for the whitespace-separated spectra table.
    # NOTE(review): the PBE0 names appear twice; parse_tab will keep only the
    # last occurrence of each duplicate key, dropping the first PBE0 column
    # set (the source data has two PBE0 basis sets) — confirm whether the
    # first set should be given distinct names.
    headers = [
        "Index", "E1-CC2", "E2-CC2", "f1-CC2", "f2-CC2", "E1-PBE0", "E2-PBE0",
        "f1-PBE0", "f2-PBE0", "E1-PBE0", "E2-PBE0", "f1-PBE0", "f2-PBE0",
        "E1-CAM", "E2-CAM", "f1-CAM", "f2-CAM"
    ]
    with open(os.path.join(input_path, "gdb8_22k_elec_spec.txt"),
              'r') as raw_in:
        data = raw_in.read()
    # Skip the file header: the data proper starts at this known first row.
    starter = data.find("       1      0.43295186     0.43295958")
    # Strip leading/trailing whitespace from every data line so the index
    # column aligns for parse_tab.
    stripped_decomp = [line.strip() for line in data[starter:].split("\n")]

    # Open gdb9-14 feedstock to get chemical composition.
    # Composition needed doesn't begin until after record 6095.
    with open(os.path.expanduser("~/mdf/feedstock/gdb9_14_all.json"),
              'r') as json_file:
        full_json_data = [json.loads(line) for line in json_file]
        json_data = full_json_data[6095:]

    for record in tqdm(parse_tab("\n".join(stripped_decomp),
                                 headers=headers,
                                 sep="     "),
                       desc="Processing files",
                       disable=not verbose):
        # Look up the molecule's composition by its 1-based table index.
        comp = json_data[int(record["Index"])]["mdf"]["composition"]
        record_metadata = {
            "mdf": {
                "title": "gdb8_15 - " + "record: " + record["Index"],
                "acl": ['public'],
                "composition": comp,
                "raw": json.dumps(record),
                "links": {
                    "txt": {
                        "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host": "https://data.materialsdatafacility.org",
                        "path": "/collections/gdb8_15/gdb8_22k_elec_spec.txt",
                    },
                },
            }
        }

        # Pass each individual record to the Validator; it reports
        # {"success": bool, "message": str} for failures.
        result = dataset_validator.write_record(record_metadata)
        if result["success"] is not True:
            print("Error:", result["message"])

    # You're done!
    if verbose:
        print("Finished converting")
コード例 #7
0
def convert(input_path, metadata=None, verbose=False):
    """Convert the malaria_drug_discovery (Open Access Malaria Box) dataset
    into MDF feedstock.

    Arguments:
    input_path (str): Directory containing Table_S1.csv.
    metadata (dict or str): Dataset metadata as a dict, a JSON string, or a
                            path to a JSON file. If falsy, the built-in
                            metadata below is used. Default None.
    verbose (bool): Print status messages when True. Default False.
    """
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title": "The Open Access Malaria Box: A Drug Discovery Catalyst for Neglected Diseases",
                "acl": ['public'],
                "source_name": "malaria_drug_discovery",
                "citation": [
                    "Spangenberg T, Burrows JN, Kowalczyk P, McDonald S, Wells TNC, Willis P (2013) The Open Access Malaria Box: A Drug Discovery Catalyst for Neglected Diseases. PLoS ONE 8(6): e62906. https://doi.org/10.1371/journal.pone.0062906"
                ],
                # Keys fixed from misspelled "instituition" to "institution",
                # matching the MDF schema and the other converters in this file.
                "data_contact": {
                    "given_name": "Thomas",
                    "family_name": "Spangenberg",
                    "email": "*****@*****.**",
                    "institution": "Medicines for Malaria Venture"
                },
                "author": [{
                    "given_name": "Thomas",
                    "family_name": "Spangenberg",
                    "email": "*****@*****.**",
                    "institution": "Medicines for Malaria Venture"
                }, {
                    "given_name": "Jeremy N.",
                    "family_name": "Burrows",
                    "institution": "Medicines for Malaria Venture"
                }, {
                    "given_name": "Paul",
                    "family_name": "Kowalczyk",
                    "institution": "SCYNEXIS Inc."
                }, {
                    "given_name": "Simon",
                    "family_name": "McDonald",
                    "institution": "Medicines for Malaria Venture"
                }, {
                    "given_name": "Timothy N. C.",
                    "family_name": "Wells",
                    "institution": "Medicines for Malaria Venture"
                }, {
                    "given_name": "Paul",
                    "family_name": "Willis",
                    "email": "*****@*****.**",
                    "institution": "Medicines for Malaria Venture"
                }],
                "license": "https://creativecommons.org/licenses/by/4.0/",
                "collection": "Open Access Malaria Box",
                "tags": [
                    "Malaria", "Malarial parasites", "Antimalarials",
                    "Plasmodium", "Parasitic diseases", "Drug discovery",
                    "Plasmodium falciparum"
                ],
                "description": "In most cases it is a prerequisite to be able to obtain physical samples of the chemical compounds for further study, and the groups responsible for screening did not originally plan to provide these molecules. In addition, many of the biological systems in which these compounds would be tested are not suitable for testing such large numbers of compounds. There needs to therefore be some simplification of the collection. To overcome these barriers, a diverse collection of anti-malarial compounds has been designed and assembled.",
                "year": 2013,
                "links": {
                    "landing_page": "https://doi.org/10.1371/journal.pone.0062906",
                    "data_doi": "https://ndownloader.figshare.com/files/1090667",
                },
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78"
                }]
            }
        }

    elif isinstance(metadata, str):
        # Accept either a JSON string or a path to a JSON file.
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif isinstance(metadata, dict):
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock.
    # Each Validator instance can only be used for a single dataset;
    # invalid metadata makes the constructor raise and the program exit.
    dataset_validator = Validator(dataset_metadata)

    # Parse Table_S1.csv and write one record per compound (keyed by SMILES).
    with open(os.path.join(input_path, "Table_S1.csv"), 'r') as raw_in:
        data_records = raw_in.read()
    for record in tqdm(parse_tab(data_records),
                       desc="Processing Data",
                       disable=not verbose):
        record_metadata = {
            "mdf": {
                "title": "Malaria Drug Discovery - " + record["Smiles"],
                "acl": ['public'],
                "composition": record["Smiles"],
                "links": {
                    "csv": {
                        "globus_endpoint": "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host": "https://data.materialsdatafacility.org",
                        "path": "/collections/malaria_drug_discovery/Table_S1.csv",
                    },
                },
            }
        }

        # Pass each individual record to the Validator; it reports
        # {"success": bool, "message": str} for failures.
        result = dataset_validator.write_record(record_metadata)
        if result["success"] is not True:
            print("Error:", result["message"])

    # You're done!
    if verbose:
        print("Finished converting")
コード例 #8
0
def convert(input_path, metadata=None, verbose=False):
    """Convert the binary_metallic_alloys_ab_initio dataset into MDF feedstock.

    Arguments:
    input_path (str): Path to the directory holding the dataset's CSV files.
    metadata (str or dict): Dataset metadata as a JSON string, a path to a
                            JSON file, or a pre-parsed dict. If falsy, the
                            built-in default metadata below is used.
    verbose (bool): If True, print progress messages. Default False.
    """
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title":
                "A compilation of ab-initio calculations of embrittling potencies in binary metallic alloys",
                "acl": ['public'],
                "source_name":
                "binary_metallic_alloys_ab_initio",
                "citation": [
                    "Gibson, Michael A., and Christopher A. Schuh. “A Compilation of Ab-Initio Calculations of Embrittling Potencies in Binary Metallic Alloys.” Data in Brief 6 (2016): 143–148. PMC. Web. 29 June 2017."
                ],
                # NOTE: this key was previously misspelled "instituition"
                # (here and in "author" below); corrected to "institution"
                # to match the schema used by the other converters.
                "data_contact": {
                    "given_name": "Michael A.",
                    "family_name": "Gibson",
                    "email": "*****@*****.**",
                    "institution": "Massachusetts Institute of Technology"
                },
                "author": [{
                    "given_name": "Michael A.",
                    "family_name": "Gibson",
                    "email": "*****@*****.**",
                    "institution": "Massachusetts Institute of Technology"
                }, {
                    "given_name": "Christopher A.",
                    "family_name": "Schuh",
                    "email": "*****@*****.**",
                    "institution": "Massachusetts Institute of Technology"
                }],
                "license":
                "http://creativecommons.org/licenses/by/4.0/",
                "collection":
                "Binary Metallic Alloys Ab-initio Calculations",
                "tags": [
                    "Grain Boundary Segregation", "Embrittlement",
                    "Ab-Initio Calculation", "Surface", "Segregation",
                    "Fracture"
                ],
                "description":
                "Segregation-induced changes in interfacial cohesion often control the mechanical properties of metals. The change in the work of separation of an interface upon segregation of a solute to the interface, termed the embrittling potency, is an atomic-level quantity used to predict and understand embrittlement phenomena. We present a compilation of calculations of embrittling potencies, along with references for these calculations.",
                "year":
                2015,
                "links": {
                    "landing_page":
                    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4706572/",
                },
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78"
                }]
            }
        }

    elif isinstance(metadata, str):
        # Accept either a JSON string or a path to a JSON file
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif isinstance(metadata, dict):
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Make a Validator to help write the feedstock
    # Each Validator instance can only be used for a single dataset
    # If the metadata is incorrect, the constructor will throw an exception
    # and the program will exit
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    # Each record is one row of a CSV file, written to the Validator one at
    # a time with its own metadata
    for data_file in find_files(input_path, "csv"):
        with open(os.path.join(data_file["path"], data_file["filename"]),
                  'r') as raw_in:
            total_data_lst = raw_in.readlines()
            # Remove the first line (column descriptions)
            total_data = "".join(total_data_lst[1:])
        for record in tqdm(parse_tab(total_data),
                           desc="Processing file: " + data_file["filename"],
                           disable=not verbose):
            # Composition is the solvent element followed by the solute
            comp = record["Solvent"] + record["Solute"]
            record_metadata = {
                "mdf": {
                    "title": "Binary Metallic Alloys Ab-initio - " + comp,
                    "acl": ['public'],
                    "composition": comp,
                    "links": {
                        "csv": {
                            "globus_endpoint":
                            "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                            "http_host":
                            "https://data.materialsdatafacility.org",
                            "path":
                            "/collections/binary_metallic_alloys_ab_initio/" +
                            data_file["filename"],
                        },
                    },
                }
            }

            # Pass each individual record to the Validator
            result = dataset_validator.write_record(record_metadata)

            # If the Validator returns "success" == True, the record was
            # written successfully; otherwise report the error
            if result["success"] is not True:
                print("Error:", result["message"])

    # You're done!
    if verbose:
        print("Finished converting")
コード例 #9
0
def convert(input_path, metadata=None, verbose=False):
    """Convert the fe_cr_al_oxidation dataset into MDF feedstock.

    Arguments:
    input_path (str): Path to the dataset root; must contain the
                      "Fe_Cr_Al_data" directory with the composition CSV
                      and the per-point .txt spectra.
    metadata (str or dict): Dataset metadata as a JSON string, a path to a
                            JSON file, or a pre-parsed dict. If falsy, the
                            built-in default metadata below is used.
    verbose (bool): If True, print progress messages. Default False.
    """
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title":
                "High-throughput Diffraction and Spectroscopic Data for Fe-Cr-Al Oxidation Studies",
                "acl": ["public"],
                "source_name":
                "fe_cr_al_oxidation",
                "citation": [
                    "Bunn, Jonathan K.; Fang, Randy L.; Albing, Mark R.; Mehta, Apurva; Kramer, Matt J.; Besser, Matt F.; Hattrick-Simpers, Jason R High-throughput Diffraction and Spectroscopic Data for Fe-Cr-Al Oxidation Studies (2015-06-28)"
                ],
                "data_contact": {
                    "given_name": "Jason",
                    "family_name": "Hattrick-Simpers",
                    "email": "*****@*****.**",
                    "institution": "University of South Carolina Columbia",
                },
                "collection":
                "Fe-Cr-Al Oxidation Studies",
                "description":
                "The data set was used to evaluate a Fe-Cr-Al thin film samples in a narrow composition region centered on known bulk compositions. The data are composed of two individual studies. The first set of data is a low temperature oxidation study on composition spread sampled performed at SLAC Beamline 1-5. Only the integrated and background subtracted 1-D spectra are included, the 2-D data and calibrations are available upon request. The second set of data was taken during high temperature oxidation of selected samples. These data are exclusively Raman data with values taken as a function of total oxidation time.",
                "year":
                2015,
                "links": {
                    "landing_page":
                    "https://materialsdata.nist.gov/dspace/xmlui/handle/11256/836",
                    "publication":
                    "http://dx.doi.org/10.1088/0957-4484/26/27/274003",
                    "data_doi": "http://hdl.handle.net/11256/836",
                },
                "data_contributor": {
                    "given_name": "Jonathon",
                    "family_name": "Gaff",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "jgaff"
                }
            }
        }
    elif isinstance(metadata, str):
        # Accept either a JSON string or a path to a JSON file
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif isinstance(metadata, dict):
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # The Validator will throw an exception and exit if the metadata is bad
    dataset_validator = Validator(dataset_metadata)

    # Get the data
    # Build a lookup from sample point number -> atomic composition row
    with open(
            os.path.join(
                input_path, "Fe_Cr_Al_data",
                "Point Number to Composition.csv")) as composition_file:
        composition_list = list(parse_tab(composition_file.read()))
        compositions = {}
        for comp in composition_list:
            compositions[int(comp.pop("Sample Number"))] = comp
    # Each record also needs its own metadata
    for data_file in tqdm(find_files(input_path, ".txt"),
                          desc="Processing files",
                          disable=not verbose):
        # Filenames look like "<temp> ... _<point>.txt"; pull out both parts
        temp_k = data_file["filename"].split(" ")[0]
        point_num = int(data_file["filename"].replace(
            "_", " ").split(" ")[-1].split(".")[0])
        record_metadata = {
            "mdf": {
                "title":
                "Fe-Cr-Al Oxidation - " + data_file["filename"].split(".")[0],
                "acl": ["public"],
                "composition":
                "FeCrAl",
                "links": {
                    "csv": {
                        "globus_endpoint":
                        "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host":
                        "https://data.materialsdatafacility.org",
                        "path":
                        "/collections/" + data_file["no_root_path"] + "/" +
                        data_file["filename"],
                    },
                },
            },
            "fe_cr_al_oxidation": {
                "temperature_k":
                float(temp_k) if temp_k != "Room" else 293.15,  # Avg room temp
                "atomic_composition_percent": {
                    "Fe": float(compositions[point_num]["Fe at. %"]),
                    "Cr": float(compositions[point_num]["Cr at. %"]),
                    "Al": float(compositions[point_num]["Al at. %"])
                }
            }
        }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # If the Validator returns "success" == True, the record was written
        # successfully; otherwise abort the whole conversion
        if result["success"] is not True:
            dataset_validator.cancel_validation()
            raise ValueError(result["message"] + "\n" +
                             result.get("details", ""))

    if verbose:
        print("Finished converting")
コード例 #10
0
def convert(input_path, metadata=None, verbose=False):
    """Convert the qsar_biodeg dataset into MDF feedstock.

    Uses the older flat "mdf-*" metadata schema.

    Arguments:
    input_path (str): Path to the directory containing "biodeg.csv".
    metadata (str or dict): Dataset metadata as a JSON string, a path to a
                            JSON file, or a pre-parsed dict. If falsy, the
                            built-in default metadata below is used.
    verbose (bool): If True, print progress messages. Default False.
    """
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf-title":
            "QSAR biodegradation Data Set",
            "mdf-acl": ["public"],
            "mdf-source_name":
            "qsar_biodeg",
            "mdf-citation": [
                "Mansouri, K., Ringsted, T., Ballabio, D., Todeschini, R., Consonni, V. (2013). Quantitative Structure - Activity Relationship models for ready biodegradability of chemicals. Journal of Chemical Information and Modeling, 53, 867-878",
                "Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science."
            ],
            "mdf-data_contact": {
                "given_name": "Davide",
                "family_name": "Ballabio",
                "email": "*****@*****.**",
                "institution": "Università degli Studi di Milano-Bicocca",
            },
            "mdf-author": [{
                "given_name": "Davide",
                "family_name": "Ballabio",
                "email": "*****@*****.**",
                "institution": "Università degli Studi di Milano-Bicocca",
            }, {
                "given_name": "Kamel",
                "family_name": "Mansouri",
                "institution": "Università degli Studi di Milano-Bicocca",
            }, {
                "given_name": "Tine",
                "family_name": "Ringsted",
                "institution": "Università degli Studi di Milano-Bicocca",
            }, {
                "given_name": "Roberto",
                "family_name": "Todeschini",
                "institution": "Università degli Studi di Milano-Bicocca",
            }, {
                "given_name": "Viviana",
                "family_name": "Consonni",
                "institution": "Università degli Studi di Milano-Bicocca",
            }],
            "mdf-collection":
            "QSAR Biodegradation Data Set",
            "mdf-data_format":
            "csv",
            "mdf-data_type":
            "Biodegradation",
            "mdf-tags": ["biodegredation", "Chemometrics"],
            "mdf-description":
            "Data set containing values for 41 attributes (molecular descriptors) used to classify 1055 chemicals into 2 classes (ready and not ready biodegradable).",
            "mdf-year":
            2013,
            "mdf-links": {
                "mdf-landing_page":
                "https://archive.ics.uci.edu/ml/datasets/QSAR+biodegradation",
            },
            "mdf-data_contributor": [{
                "given_name": "Evan",
                "family_name": "Pike",
                "email": "*****@*****.**",
                "institution": "The University of Chicago",
                "github": "dep78"
            }, {
                "given_name": "Jonathon",
                "family_name": "Gaff",
                "email": "*****@*****.**",
                "institution": "The University of Chicago",
                "github": "jgaff"
            }]
        }
    elif isinstance(metadata, str):
        # Accept either a JSON string or a path to a JSON file
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif isinstance(metadata, dict):
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    dataset_validator = Validator(dataset_metadata)

    # Get the data
    # The CSV has no header row, so supply the 41 descriptor names plus the
    # class column explicitly (some names intentionally carry a leading space
    # to match the source data).
    headers = [
        "SpMax_L", "J_Dz(e)", "nHM", "F01[N-N]", "F04[C-N]", "NssssC", "nCb-",
        "C%", "nCp", "nO", "F03[C-N]", "SdssC", "HyWi_B(m)", "LOC", " SM6_L",
        "F03[C-O]", "Me", "Mi", "nN-N", "nArNO2", "nCRX3", "SpPosA_B(p)",
        "nCIR", "B01[C-Br]", "B03[C-Cl]", "N-073", "SpMax_A", "Psi_i_1d",
        "B04[C-Br]", "SdO", "TI2_L", "nCrt", "C-026", "F02[C-N]", "nHDon",
        "SpMax_B(m)", "Psi_i_A", "nN", "SM6_B(m)", " nArCOOR", "nX",
        "experimental class"
    ]
    with open(os.path.join(input_path, "biodeg.csv"), 'r') as raw_in:
        # Number the records starting at 1 for the record titles
        for i, row_data in enumerate(tqdm(parse_tab(raw_in.read(), sep=";",
                                                    headers=headers),
                                          desc="Processing data",
                                          disable=not verbose),
                                     start=1):
            # Flatten the row into "key: value" strings for the raw field
            record = [key + ": " + value for key, value in row_data.items()]
            record_metadata = {
                "mdf-title": "QSAR Biodegradation #" + str(i),
                "mdf-acl": ["public"],
                "mdf-raw": json.dumps(record),
                "mdf-links": {
                    "csv": {
                        "globus_endpoint":
                        "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                        "http_host": "https://data.materialsdatafacility.org",
                        "path": "/collections/qsar_biodeg/biodeg.csv",
                    },
                },
            }

            # Pass each individual record to the Validator
            result = dataset_validator.write_record(record_metadata)

            # If the Validator returns "success" == True, the record was
            # written successfully; otherwise report the error
            if result["success"] is not True:
                print("Error:", result["message"])

    if verbose:
        print("Finished converting")
コード例 #11
0
def convert(input_path, metadata=None, verbose=False):
    """Convert the nist_xray_tran_en_db dataset into MDF feedstock.

    Arguments:
    input_path (str): Path to the directory containing "xray_tran_en_db.txt"
                      (tab-separated, no header row).
    metadata (str or dict): Dataset metadata as a JSON string, a path to a
                            JSON file, or a pre-parsed dict. If falsy, the
                            built-in default metadata below is used.
    verbose (bool): If True, print progress messages. Default False.
    """
    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title":
                "NIST X-Ray Transition Energies Database",
                "acl": ["public"],
                "source_name":
                "nist_xray_tran_en_db",
                "citation": [
                    "http://physics.nist.gov/PhysRefData/XrayTrans/Html/refs.html"
                ],
                "data_contact": {
                    "given_name": "Lawrence",
                    "family_name": "Hudson",
                    "email": "*****@*****.**",
                    "institution":
                    "National Institute of Standards and Technology"
                },
                "collection":
                "NIST X-Ray Transition Energies",
                "tags": ["Radiation", "Spectroscopy", "Reference data"],
                "description":
                "This x-ray transition table provides the energies for K transitions connecting the K shell (n = 1) to the shells with principal quantum numbers n = 2 to 4 and L transitions connecting the L1, L2, and L3 shells (n = 2) to the shells with principal quantum numbers n = 3 and 4. The elements covered include Z = 10, neon to Z = 100, fermium. There are two unique features of this database: (1) all experimental values are on a scale consistent with the International System of measurement (the SI) and the numerical values are determined using constants from the Recommended Values of the Fundamental Physical Constants: 1998 [115] and (2) accurate theoretical estimates are included for all transitions. The user will find that for many of the transitions, the experimental and theoretical values are very consistent. It is our hope that the theoretical values will provide a useful estimate for missing or poorly measured experimental values.",
                "year":
                2003,
                "links": {
                    "landing_page":
                    "https://www.nist.gov/pml/x-ray-transition-energies-database",
                },
                "data_contributor": {
                    "given_name": "Jonathon",
                    "family_name": "Gaff",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "jgaff"
                }
            }
        }
    elif isinstance(metadata, str):
        # Accept either a JSON string or a path to a JSON file
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif isinstance(metadata, dict):
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    dataset_validator = Validator(dataset_metadata)

    # Get the data
    # The source file has no header row; supply the column names explicitly
    headers = [
        'element', 'A', 'transition', 'theory_(eV)', 'theory_uncertainty_(eV)',
        'direct_(eV)', 'direct_uncertainty_(eV)', 'combined_(eV)',
        'combined_uncertainty_(eV)', 'vapor_(eV)', 'vapor_uncertainty_(eV)',
        'blend', 'reference'
    ]
    with open(os.path.join(input_path, "xray_tran_en_db.txt")) as in_file:
        raw_data = in_file.read()
    for record in tqdm(parse_tab(raw_data, sep="\t", headers=headers),
                       desc="Processing data",
                       disable=not verbose):
        record_metadata = {
            "mdf": {
                "title": "X-Ray Transition - " + record["element"],
                "acl": ["public"],
                "composition": record["element"],
                "raw": json.dumps(record),
                "links": {
                    "landing_page":
                    "http://physics.nist.gov/PhysRefData/XrayTrans/Html/search.html",
                },
            }
        }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # If the Validator returns "success" == True, the record was written
        # successfully; otherwise report the error
        if result["success"] is not True:
            print("Error:", result["message"])

    if verbose:
        print("Finished converting")
コード例 #12
0
def convert(input_path, metadata=None, verbose=False):
    """Convert the nist_th_ar_lamp_spectrum dataset into MDF feedstock.

    Arguments:
    input_path (str): Path to the directory containing
                      "nist_th_ar_lamp_spectrum.txt" (space-aligned columns).
    metadata (str or dict): Dataset metadata as a JSON string, a path to a
                            JSON file, or a pre-parsed dict. If falsy, the
                            built-in default metadata below is used.
    verbose (bool): If True, print progress messages. Default False.
    """
    import re  # local import: collapses the column-alignment padding below

    if verbose:
        print("Begin converting")

    # Collect the metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title":
                "NIST Spectrum of Th-Ar Hollow Cathode Lamps",
                "acl": ["public"],
                "source_name":
                "nist_th_ar_lamp_spectrum",
                "citation": ["NIST SRD 161"],
                "data_contact": {
                    "given_name": "Gillian",
                    "family_name": "Nave",
                    "email": "*****@*****.**",
                    "institution":
                    "National Institute of Standards and Technology",
                },
                "author": [{
                    "given_name": "Gillian",
                    "family_name": "Nave",
                    "email": "*****@*****.**",
                    "institution":
                    "National Institute of Standards and Technology",
                }, {
                    "given_name": "Craig",
                    "family_name": "Sansonetti",
                    "institution":
                    "National Institute of Standards and Technology",
                }, {
                    "given_name": "Florian",
                    "family_name": "Kerber",
                    "institution": "European Southern Observatory",
                }],
                "collection":
                "NIST Spectrum of Th-Ar Hollow Cathode Lamps",
                "tags": ["Spectroscopy", "Reference data"],
                "description":
                "This atlas presents observations of the infra-red (IR) spectrum of a low current Th-Ar hollow cathode lamp with the 2-m Fourier transform spectrometer (FTS) at the National Institute of Standards and Technology. These observations establish more than 2400 lines that are suitable for use as wavelength standards in the range 691 nm to 5804 nm.",
                "year":
                2009,
                "links": {
                    "landing_page":
                    "https://www.nist.gov/pml/spectrum-th-ar-hollow-cathode-lamps",
                },
                "data_contributor": {
                    "given_name": "Jonathon",
                    "family_name": "Gaff",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "jgaff"
                }
            }
        }
    elif isinstance(metadata, str):
        # Accept either a JSON string or a path to a JSON file
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif isinstance(metadata, dict):
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    dataset_validator = Validator(dataset_metadata)

    # Get the data
    # The source file has no header row; supply the column names explicitly
    headers = [
        "wavenumber", "wavenumber_uncertainty_le-3", "snr", "fwhm_le-3",
        "intensity", "species", "lower_level", "lower_j", "upper_level",
        "upper_j", "vacuum_wavelength", "vacuum_wavelength_uncertainty_le-3"
    ]
    with open(os.path.join(input_path,
                           "nist_th_ar_lamp_spectrum.txt")) as in_file:
        raw = in_file.read()
    # Collapse each run of spaces to a single space in one pass so the
    # space-aligned columns can be parsed with sep=" " (replaces the old
    # repeated replace("  ", " ") loop, which rescanned the whole string
    # on every pass; newlines and tabs are untouched either way)
    raw = re.sub(" +", " ", raw)
    for record in tqdm(parse_tab(raw, headers=headers, sep=" "),
                       desc="Processing records",
                       disable=not verbose):
        record_metadata = {
            "mdf": {
                "title":
                "Hollow Cathode Lamp Spectrum - " + record["wavenumber"],
                "acl": ["public"],
                "composition": record["species"],
                "raw": json.dumps(record),
                "links": {
                    "landing_page":
                    "http://physics.nist.gov/PhysRefData/ThArLampAtlas/node9.html",
                },
            }
        }

        # Pass each individual record to the Validator
        result = dataset_validator.write_record(record_metadata)

        # If the Validator returns "success" == True, the record was written
        # successfully; otherwise report the error
        if result["success"] is not True:
            print("Error:", result["message"])

    if verbose:
        print("Finished converting")
0
def convert(input_path, metadata=None, verbose=False):
    """Convert the "Dilute Mg Alloys DFT" dataset into MDF feedstock.

    Walks input_path for data files (tab-separated .txt and comma-separated
    .csv), parses each row, and writes one MDF record per row through a
    Validator.

    Arguments:
        input_path (str): Root directory containing the dataset files.
        metadata (dict or str, optional): Dataset metadata as a dict, a JSON
            string, or a path to a JSON file. If falsy, the built-in
            metadata block below is used. Default None.
        verbose (bool, optional): Print status messages and show a progress
            bar. Default False.
    """
    if verbose:
        print("Begin converting")

    # Collect the dataset-level metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title":
                "Data set for diffusion coefficients of alloying elements in dilute Mg alloys from first-principles",
                "acl": ['public'],
                "source_name":
                "dilute_mg_alloys_dft",
                "citation": [
                    "Zhou, Bi-Cheng et al. “Data Set for Diffusion Coefficients of Alloying Elements in Dilute Mg Alloys from First-Principles.” Data in Brief 5 (2015): 900–912. PMC. Web. 7 July 2017."
                ],
                "data_contact": {
                    "given_name": "Bi-Cheng",
                    "family_name": "Zhou",
                    "email": "*****@*****.**",
                    # Fixed misspelled key "instituition" -> "institution"
                    # to match the other converters in this project.
                    "institution": "The Pennsylvania State University"
                },
                "author": [{
                    "given_name": "Bi-Cheng",
                    "family_name": "Zhou",
                    "email": "*****@*****.**",
                    "institution": "The Pennsylvania State University"
                }, {
                    "given_name": "Shun-Li",
                    "family_name": "Shang",
                    "institution": "The Pennsylvania State University"
                }, {
                    "given_name": "Yi",
                    "family_name": "Wang",
                    "institution": "The Pennsylvania State University"
                }, {
                    "given_name": "Zi-Kui",
                    "family_name": "Liu",
                    "institution": "The Pennsylvania State University"
                }],
                "license":
                "http://creativecommons.org/licenses/by/4.0/",
                "collection":
                "Dilute Mg Alloys DFT",
                "description":
                "Diffusion coefficients of alloying elements in Mg are critical for the development of new Mg alloys for lightweight applications. Here we present the data set of the temperature-dependent dilute tracer diffusion coefficients for 47 substitutional alloying elements in hexagonal closed packed (hcp) Mg calculated from first-principles calculations based on density functional theory (DFT) by combining transition state theory and an 8-frequency model.",
                "year":
                2015,
                "links": {
                    "landing_page":
                    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4669471/",
                    "publication":
                    ["http://dx.doi.org/10.1016/j.dib.2015.10.024"],
                    # Fixed misspelled file-type key "xslx" -> "xlsx"
                    # (the linked file is mmc1.xlsx).
                    "xlsx": {
                        "http_host": "https://www.ncbi.nlm.nih.gov",
                        "path": "/pmc/articles/PMC4669471/bin/mmc1.xlsx",
                    }
                },
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78"
                }]
            }
        }

    elif type(metadata) is str:
        # Accept either a JSON string or a path to a JSON file.
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Each Validator instance can only be used for a single dataset;
    # invalid metadata raises in the constructor.
    dataset_validator = Validator(dataset_metadata)

    # Parse each data file and write one record per parsed row.
    for data_file in tqdm(find_files(input_path, r"^[^\.]"),
                          desc="Processing files",
                          disable=not verbose):
        # Skip the info file before bothering to open and read it.
        if data_file["filename"] == "csv_info.txt":
            continue
        with open(os.path.join(data_file["path"], data_file["filename"]),
                  'r',
                  encoding="utf-8") as raw_in:
            # strip() removes any UTF-8 BOM characters at the text's ends.
            all_data = raw_in.read().strip("\ufeff")
        if "txt" in data_file["filename"]:
            sep = "\t"
            file_type = "txt"
            path = data_file["no_root_path"] + "/"
        else:
            sep = ","
            file_type = "csv"
            path = ""
        for record in parse_tab(all_data, sep):
            # Diffusion files encode the element in the filename prefix;
            # other files carry it in the "X" column.
            if "diffusion" in data_file["filename"]:
                comp = data_file["filename"].split("_")[0]
            else:
                comp = record["X"]
            record_metadata = {
                "mdf": {
                    # Fixed typo "Diliute" -> "Dilute" in the record title.
                    "title": "Dilute Mg Alloys DFT - " + comp,
                    "acl": ['public'],
                    "composition": comp,
                    "raw": json.dumps(record),
                    "links": {
                        file_type: {
                            "globus_endpoint":
                            "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                            "http_host":
                            "https://data.materialsdatafacility.org",
                            "path":
                            "/collections/dilute_mg_alloys_dft/" + path +
                            data_file["filename"],
                        },
                    },
                }
            }

            # Pass each individual record to the Validator.
            # "success" == True means the record was written successfully.
            result = dataset_validator.write_record(record_metadata)
            if result["success"] is not True:
                print("Error:", result["message"])

    # You're done!
    if verbose:
        print("Finished converting")
# Code example #14 (コード例 #14) — 0 votes
# File: nrel_pv_converter.py  Project: maxhutch/forge
def convert(input_path, metadata=None, verbose=False):
    """Convert the NREL Organic Photovoltaic Database into MDF feedstock.

    Reads the polymer export CSV under input_path and writes one MDF record
    per row through a Validator.

    Arguments:
        input_path (str): Directory containing
            "polymer_export_0501179397151.csv".
        metadata (dict or str, optional): Dataset metadata as a dict, a JSON
            string, or a path to a JSON file. If falsy, the built-in
            metadata block below is used. Default None.
        verbose (bool, optional): Print status messages and show a progress
            bar. Default False.
    """
    if verbose:
        print("Begin converting")

    # Collect the dataset-level metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title":
                "National Renewable Energy Laboratory Organic Photovoltaic Database",
                "acl": ["public"],
                "source_name":
                "nrel_pv",
                "citation": [
                    "Gaussian 09, (Revisions B.01, C.01 and D.01), M. J. Frisch, et al., Gaussian, Inc., Wallingford CT, 2009. See gaussian.com",
                    "Ross E. Larsen, J. Phys. Chem. C, 120, 9650-9660 (2016). DOI: 10.1021/acs .jpcc.6b02138"
                ],
                "data_contact": {
                    "given_name": "Ross",
                    "family_name": "Larsen",
                    "email": "*****@*****.**",
                    "institution": "National Renewable Energy Laboratory"
                },
                # "author" is a list of person dicts, matching the
                # convention used by the other converters in this project.
                "author": [{
                    "given_name": "Ross",
                    "family_name": "Larsen",
                    "email": "*****@*****.**",
                    "institution": "National Renewable Energy Laboratory"
                }],
                "collection":
                "NREL Organic Photovoltaic Database",
                "tags": ["dft", "simulation", "organic photovoltaics"],
                "description":
                "Welcome to the National Renewable Energy Laboratory materials discovery database for organic electronic materials. The focus is on materials for organic photovoltaic (OPV) absorber materials but materials suitable for other applications may be found here as well.",
                "links": {
                    "landing_page": "https://organicelectronics.nrel.gov",
                    "publication":
                    ["https://dx.doi.org/10.1021/acs.jpcc.6b02138"],
                },
                "data_contributor": [{
                    "given_name": "Jonathon",
                    "family_name": "Gaff",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "jgaff"
                }]
            }
        }
    elif type(metadata) is str:
        # Accept either a JSON string or a path to a JSON file.
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    dataset_validator = Validator(dataset_metadata)

    # Get the data: one MDF record per CSV row.
    with open(os.path.join(input_path, "polymer_export_0501179397151.csv"),
              'r') as raw_in:
        for record in tqdm(parse_tab(raw_in.read()),
                           desc="Processing files",
                           disable=not verbose):
            record_metadata = {
                "mdf": {
                    "title": "NREL OPV - " + record["common_tag"],
                    "acl": ["public"],
                    "composition": record["common_tag"],
                    "raw": json.dumps(record),
                    "links": {
                        "landing_page": record["URL"],
                    },
                }
            }

            # Pass each individual record to the Validator.
            # "success" == True means the record was written successfully.
            result = dataset_validator.write_record(record_metadata)
            if result["success"] is not True:
                print("Error:", result["message"])

    if verbose:
        print("Finished converting")
# Code example #15 (コード例 #15) — 0 votes
def convert(input_path, metadata=None, verbose=False):
    """Convert the GW100 benchmark dataset into MDF feedstock.

    Reads gw100.csv under input_path and writes one MDF record per parsed
    row through a Validator.

    Arguments:
        input_path (str): Directory containing "gw100.csv".
        metadata (dict or str, optional): Dataset metadata as a dict, a JSON
            string, or a path to a JSON file. If falsy, the built-in
            metadata block below is used. Default None.
        verbose (bool, optional): Print status messages and show a progress
            bar. Default False.
    """
    if verbose:
        print("Begin converting")

    # Collect the dataset-level metadata
    if not metadata:
        dataset_metadata = {
            "mdf": {
                "title":
                "Benchmark of G0W0 on 100 Molecules",
                "acl": ["public"],
                "source_name":
                "gw100",
                "citation": [
                    "M.J. van Setten, F. Caruso, S. Sharifzadeh, X. Ren, M. Scheffler, F. Liu, J. Lischner, L. Lin, J.R. Deslippe, S.G. Louie, C. Yang, F. Weigend, J.B. Neaton, F. Evers, and P. Rinke, GW100: Benchmarking G0W0 for Molecular Systems, J. Chem. Theory Comput. 11, 5665 (2015).",
                    "M. Govoni et al., (2016). In preparation.",
                    "P.J. Linstrom and W.G. Mallard, Eds., NIST Chemistry WebBook, NIST Standard Reference Database Number 69, National Institute of Standards and Technology, Gaithersburg MD, 20899, http://webbook.nist.gov."
                ],
                "data_contact": {
                    "given_name": "Michiel",
                    "family_name": "van Setten",
                    "email": "*****@*****.**",
                    "institution": "Université catholique de Louvain",
                },
                "collection":
                "GW100",
                "description":
                "This is a benchmark of G0W0 on 100 molecules.",
                "year":
                2015,
                "links": {
                    "landing_page":
                    "http://www.west-code.org/database/gw100/index.php",
                    # "publication" is a list of URLs, matching the other
                    # converters in this project.
                    "publication":
                    ["https://dx.doi.org/10.1021/acs.jctc.5b00453"],
                },
                # "data_contributor" is a list of person dicts, matching the
                # other converters in this project.
                "data_contributor": [{
                    "given_name": "Jonathon",
                    "family_name": "Gaff",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "jgaff"
                }]
            }
        }
    elif type(metadata) is str:
        # Accept either a JSON string or a path to a JSON file.
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    dataset_validator = Validator(dataset_metadata)

    # Get the data: one MDF record per CSV row.
    with open(os.path.join(input_path, "gw100.csv")) as in_file:
        data = in_file.read()
    for record in tqdm(parse_tab(data),
                       desc="Processing records",
                       disable=not verbose):
        record_metadata = {
            "mdf": {
                "title": "GW100 - " + record["name"],
                "acl": ["public"],
                "composition": record["formula"],
                "links": {
                    # Each molecule has its own landing page keyed by CAS number.
                    "landing_page":
                    "http://www.west-code.org/database/gw100/pag/" +
                    record["cas"] + ".php",
                },
            }
        }

        # Pass each individual record to the Validator.
        # "success" == True means the record was written successfully.
        result = dataset_validator.write_record(record_metadata)
        if result["success"] is not True:
            print("Error:", result["message"])

    if verbose:
        print("Finished converting")
# Code example #16 (コード例 #16) — 0 votes
def convert(input_path, metadata=None, verbose=False):
    """Convert the second-order magnetic properties dataset into MDF feedstock.

    Walks input_path for .xyz, .csv, and .cube files; xyz/cube files are
    parsed with ASE, csv files with the tab parser, and each parsed record
    becomes one MDF record written through a Validator. Unlike the other
    converters, a failed record cancels validation and raises.

    Arguments:
        input_path (str): Root directory containing the dataset files.
        metadata (dict or str, optional): Dataset metadata as a dict, a JSON
            string, or a path to a JSON file. If falsy, the built-in
            metadata block below is used. Default None.
        verbose (bool, optional): Print status messages and show a progress
            bar. Default False.

    Raises:
        ValueError: If the Validator rejects a record.
    """
    if verbose:
        print("Begin converting")

    # Collect the dataset-level metadata
    if not metadata:
        ## Metadata:dataset
        dataset_metadata = {
            "mdf": {
                "title":
                "On the calculation of second-order magnetic properties using subsystem approaches in the relativistic framework",
                "acl": ["public"],
                "source_name":
                "second_order_magnetic_prop",
                "data_contact": {
                    "given_name": "Andre Severo Pereira",
                    "family_name": "Gomes",
                    "email": "*****@*****.**",
                    # Fixed mis-encoded "Universit´e" -> "Université"
                    "institution": "Université de Lille",
                },
                "data_contributor": [{
                    "given_name": "Evan",
                    "family_name": "Pike",
                    "email": "*****@*****.**",
                    "institution": "The University of Chicago",
                    "github": "dep78",
                }],
                "citation": [
                    "Olejniczak, Małgorzata, Bast, Radovan, & Gomes, Andre Severo Pereira. (2016). On the calculation of second-order magnetic properties using subsystem approaches in the relativistic framework - supplementary information [Data set]. Zenodo. http://doi.org/10.5281/zenodo.179720"
                ],
                "author": [{
                    "given_name": "Małgorzata",
                    "family_name": "Olejniczak",
                    "email": "*****@*****.**",
                    "institution": "Université de Lille",
                }, {
                    "given_name": "Radovan",
                    "family_name": "Bast",
                    "email": "*****@*****.**",
                    "institution": "UiT The Arctic University of Norway",
                }, {
                    "given_name": "Andre Severo Pereira",
                    "family_name": "Gomes",
                    "email": "*****@*****.**",
                    "institution": "Université de Lille",
                }],
                "license":
                "https://creativecommons.org/licenses/by/4.0/",
                "collection":
                "Second-Order Magnetic Properties",
                "description":
                "We report an implementation of the nuclear magnetic resonance (NMR) shielding (σ), isotopeindependent indirect spin-spin coupling (K) and the magnetizability (ξ) tensors in the frozen density embedding scheme using the four-component (4c) relativistic Dirac–Coulomb (DC) Hamiltonian and the non-collinear spin density functional theory.",
                "year":
                2016,
                "links": {
                    "landing_page": "https://doi.org/10.5281/zenodo.179720",
                    "publication": ["https://arxiv.org/abs/arXiv:1610.04280"],
                    "tar.gz": {
                        "http_host":
                        "https://zenodo.org",
                        "path":
                        "/record/179720/files/supplementary_info_fde_mag.tar.gz",
                    },
                },
            },
        }
        ## End metadata
    elif type(metadata) is str:
        # Accept either a JSON string or a path to a JSON file.
        try:
            dataset_metadata = json.loads(metadata)
        except Exception:
            try:
                with open(metadata, 'r') as metadata_file:
                    dataset_metadata = json.load(metadata_file)
            except Exception as e:
                sys.exit("Error: Unable to read metadata: " + repr(e))
    elif type(metadata) is dict:
        dataset_metadata = metadata
    else:
        sys.exit("Error: Invalid metadata parameter")

    # Each Validator instance can only be used for a single dataset;
    # invalid metadata raises in the constructor.
    dataset_validator = Validator(dataset_metadata)

    # Parse each data file and write one record per parsed entry.
    for data_file in tqdm(find_files(input_path, "(xyz$|csv$|cube$)"),
                          desc="Processing files",
                          disable=not verbose):
        ftype = data_file["filename"].split(".")[-1]
        if ftype == "xyz" or ftype == "cube":
            # Structure files: one record per file, parsed with ASE.
            records = [
                parse_ase(
                    os.path.join(data_file["path"], data_file["filename"]),
                    ftype)
            ]
        else:
            # CSV files: first line is the header row, rest is data.
            with open(os.path.join(data_file["path"], data_file["filename"]),
                      'r') as raw_in:
                headers = [
                    head.strip() for head in raw_in.readline().split(",")
                ]
                # [1:] drops one leading character after the header line —
                # presumably a stray newline in the source files; TODO confirm
                data = raw_in.read()[1:]
            records = list(parse_tab(data, headers=headers))

        for rec in records:
            # Derive the composition/identifier for the record from
            # whichever kind of file this is.
            if "xyz" in data_file["filename"] or "cube" in data_file[
                    "filename"]:
                comp = rec["chemical_formula"]
            elif "shielding" in data_file["filename"]:
                comp = rec["nucleus"]
            elif "spinspin" in data_file["filename"]:
                comp = rec["At1"] + rec["At2"]
            elif "magnetizability" in data_file["filename"]:
                comp = rec["mol"]
                if comp == "sum":
                    continue
            else:
                # Unrecognized csv name: skip the record instead of hitting
                # a NameError on the unbound "comp" below.
                continue

            record_metadata = {
                "mdf": {
                    "title": "Second-Order Magnetic Properties - " + comp,
                    "acl": ["public"],
                    "composition": comp,
                    "links": {
                        ftype: {
                            "globus_endpoint":
                            "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec",
                            "http_host":
                            "https://data.materialsdatafacility.org",
                            "path":
                            "/collections/second_order_magnetic_prop/" +
                            data_file["no_root_path"] + "/" +
                            data_file["filename"],
                        },
                    },
                },
            }
            ## End metadata

            # Pass each individual record to the Validator; a rejected
            # record cancels validation and aborts the conversion.
            result = dataset_validator.write_record(record_metadata)
            if not result["success"]:
                if not dataset_validator.cancel_validation()["success"]:
                    print(
                        "Error cancelling validation. The partial feedstock may not be removed."
                    )
                raise ValueError(result["message"] + "\n" +
                                 result.get("details", ""))

    # You're done!
    if verbose:
        print("Finished converting")