Esempio n. 1
0
def test_inorganic_db():
    '''Load the inorganic database on its own and check that each of its
    indexes resolves through `CAS_from_any`, and that the stored formulas,
    CAS numbers and molecular weights are self-consistent.
    '''
    inorganic_path = os.path.join(folder, 'Inorganic db.tsv')
    db = ChemicalMetadataDB(elements=False, main_db=inorganic_path,
                            user_dbs=[])

    # A stored CAS string must look itself up
    for record in db.CAS_index.values():
        assert CAS_from_any(record.CASs) == record.CASs

    # Formula lookups, skipping the formulas which are not unique by design
    for formula, record in db.formula_index.items():
        if formula not in ('H2MgO2', 'F2N2'):
            assert CAS_from_any(formula) == record.CASs

    # SMILES must be unique / searchable; blank SMILES entries are skipped
    for smi, record in db.smiles_index.items():
        if smi:
            assert CAS_from_any('smiles=' + smi) == record.CASs

    records = list(db.CAS_index.values())

    # Stored formulas must already be in serialized form
    for record in records:
        assert record.formula == serialize_formula(record.formula)

    # Every stored CAS number must pass the validity check
    for record in records:
        assert checkCAS(record.CASs)

    # The stored MW must agree with the MW computed from the formula
    for record in records:
        atoms = nested_formula_parser(serialize_formula(record.formula),
                                      check=False)
        assert_allclose(molecular_weight(atoms), record.MW, atol=0.05)
Esempio n. 2
0
def test_inorganic_db():
    '''Load the inorganic database on its own and check that each of its
    indexes resolves through `CAS_from_any`, and that the stored formulas,
    CAS numbers and molecular weights are self-consistent.
    '''
    from thermo.identifiers import ChemicalMetadataDB, folder
    from thermo.elements import nested_formula_parser, serialize_formula, molecular_weight
    db = ChemicalMetadataDB(elements=False,
                            main_db=os.path.join(folder, 'Inorganic db.tsv'),
                            user_dbs=[])

    # A stored CAS string must look itself up
    for CAS, d in db.CAS_index.items():
        assert CAS_from_any(d.CASs) == d.CASs

    # Formulas which are not unique by design; built once, not per iteration
    non_unique_formulas = {'H2MgO2', 'F2N2'}
    for formula, d in db.formula_index.items():
        if formula in non_unique_formulas:
            continue
        assert CAS_from_any(formula) == d.CASs

    # SMILES must be searchable; blank SMILES entries are skipped
    for smi, d in db.smiles_index.items():
        if not smi:
            continue
        assert CAS_from_any('smiles=' + smi) == d.CASs

    # Stored formulas must already be in serialized form
    assert all([
        i.formula == serialize_formula(i.formula)
        for i in db.CAS_index.values()
    ])
    # Every stored CAS number must pass the validity check
    assert all([checkCAS(i.CASs) for i in db.CAS_index.values()])

    # Compare the stored MW with the MW computed from the formula. All
    # mismatches are gathered first so one run reports every bad entry, and
    # then the test fails - previously the failure was printed and swallowed,
    # so this check could never fail.
    mw_mismatches = []
    for i in db.CAS_index.values():
        formula = serialize_formula(i.formula)
        atoms = nested_formula_parser(formula, check=False)
        mw_calc = molecular_weight(atoms)
        try:
            assert_allclose(mw_calc, i.MW, atol=0.05)
        except AssertionError:
            mw_mismatches.append(i)
    assert not mw_mismatches, (
        'Molecular weight disagrees with formula for: %r' % (mw_mismatches,))
Esempio n. 3
0
def test_organic_user_db():
    '''Load the example user database on its own and check that every record
    can be found again through each identifier type, and that the stored
    formulas, CAS numbers and molecular weights are self-consistent.
    '''
    user_db_path = os.path.join(folder,
                                'chemical identifiers example user db.tsv')
    db = ChemicalMetadataDB(elements=False, main_db=user_db_path, user_dbs=[])

    # A stored CAS string must look itself up
    for record in db.CAS_index.values():
        assert CAS_from_any(record.CASs) == record.CASs

    # Guard against an empty or truncated load
    assert len(db.CAS_index) > 100

    # SMILES must be unique / searchable; blank SMILES entries are skipped
    for smi, record in db.smiles_index.items():
        if smi:
            assert CAS_from_any('smiles=' + smi) == record.CASs

    records = list(db.CAS_index.values())

    # Stored formulas must already be in serialized form
    for record in records:
        assert record.formula == serialize_formula(record.formula)

    # Every stored CAS number must pass the validity check
    for record in records:
        assert checkCAS(record.CASs)

    # The stored MW must agree with the MW computed from the formula
    for record in records:
        atoms = nested_formula_parser(serialize_formula(record.formula),
                                      check=False)
        assert_allclose(molecular_weight(atoms), record.MW, atol=0.05)

    # InChI, InChI key and PubChem lookups must all lead back to the index key
    for CAS, record in db.CAS_index.items():
        expected = int2CAS(CAS)
        assert CAS_from_any('InChI=1S/' + record.InChI) == expected
        assert CAS_from_any('InChIKey=' + record.InChI_key) == expected
        # A PubChem id of -1 is excluded from the lookup check
        if record.pubchemid != -1:
            assert CAS_from_any('PubChem=' + str(record.pubchemid)) == expected

    # Each secondary index must hold exactly one entry per CAS
    n_CAS = len(db.CAS_index)
    for index in (db.smiles_index, db.InChI_index, db.InChI_key_index):
        assert n_CAS == len(index)
Esempio n. 4
0
def test_db_vs_ChemSep():
    '''The CAS numbers are checked, as are most of the chemical formulas.
    Some chemical structural formulas aren't supported by the current formula
    parser and are ignored; otherwise it is a very effective test.
    '''
    import xml.etree.ElementTree as ET
    folder = os.path.join(os.path.dirname(__file__), 'Data')

    tree = ET.parse(os.path.join(folder, 'chemsep1.xml'))
    root = tree.getroot()

    # Map CAS -> {'name', 'smiles', 'formula'} for every compound in the file
    data = {}
    for child in root:
        CAS = [i.attrib['value'] for i in child if i.tag == 'CAS'][0]
        name = [i.attrib['value'] for i in child if i.tag == 'CompoundID'][0]
        smiles = [i.attrib['value'] for i in child if i.tag == 'Smiles']
        formula = [i.attrib['value'] for i in child if i.tag == 'StructureFormula'][0]

        if '-' in formula:
            # Structural formulas containing dashes are not checked at all
            formula = None
        else:
            try:
                formula = serialize_formula(formula)
            except Exception:
                # The parser could not handle this formula; keep the raw
                # text. Only real errors are caught here so that
                # KeyboardInterrupt/SystemExit still propagate.
                pass

        # The Smiles tag is optional
        smiles = smiles[0] if smiles else None

        data[CAS] = {'name': name, 'smiles': smiles, 'formula': formula}

    # Every ChemSep CAS number must be present in the main database
    for CAS in data:
        hit = pubchem_db.search_CAS(CAS)
        assert hit.CASs == CAS

    # ... and must resolve to itself through the generic lookup
    for CAS in data:
        assert CAS_from_any(CAS) == CAS

    # Formulas must agree, except for the entries known not to be comparable
    for CAS, d in data.items():
        f = d['formula']
        if f is None or f == '1,4-COOH(C6H4)COOH' or d['name'] == 'Air':
            continue
        assert pubchem_db.search_CAS(CAS).formula == f

    # In an ideal world, the names would match too but ~22 don't. Adding more synonyms
    # might help.
    # Some of them are straight disagreements however
#    for CAS, d in data.items():
#        try:
#            assert CAS_from_any(d['name']) == CAS
#        except:
#            print(CAS, d['name'])
##

    # In an ideal world we could also validate against their smiles
    # but that's proving difficult due to things like 1-hexene - 
    # is it 'CCCCC=C' or 'C=CCCCC'?
#test_db_vs_ChemSep() 
Esempio n. 5
0
def test_organic_user_db():
    '''Load the example user database on its own and check that every record
    can be found again through each identifier type, and that the stored
    formulas, CAS numbers and molecular weights are self-consistent.
    '''
    db = ChemicalMetadataDB(
        elements=False,
        main_db=os.path.join(folder, 'chemical identifiers example user db.tsv'),
        user_dbs=[])

    # A stored CAS string must look itself up
    for entry in db.CAS_index.values():
        assert CAS_from_any(entry.CASs) == entry.CASs

    # Guard against an empty or truncated load
    assert len(db.CAS_index) > 100

    # SMILES must be unique / searchable; blank SMILES entries are skipped
    for smi, entry in db.smiles_index.items():
        if smi:
            assert CAS_from_any('smiles=' + smi) == entry.CASs

    # Stored formulas must already be in serialized form
    for entry in db.CAS_index.values():
        assert entry.formula == serialize_formula(entry.formula)

    # Every stored CAS number must pass the validity check
    for entry in db.CAS_index.values():
        assert checkCAS(entry.CASs)

    # The stored MW must agree with the MW computed from the formula
    for entry in db.CAS_index.values():
        parsed = nested_formula_parser(serialize_formula(entry.formula),
                                       check=False)
        assert_allclose(molecular_weight(parsed), entry.MW, atol=0.05)

    # InChI lookups must lead back to the index key
    for CAS, entry in db.CAS_index.items():
        assert CAS_from_any('InChI=1S/' + entry.InChI) == int2CAS(CAS)

    # InChI key lookups must lead back to the index key
    for CAS, entry in db.CAS_index.items():
        assert CAS_from_any('InChIKey=' + entry.InChI_key) == int2CAS(CAS)

    # PubChem id lookups, skipping entries whose id is -1
    for CAS, entry in db.CAS_index.items():
        if entry.pubchemid == -1:
            continue
        assert CAS_from_any('PubChem=' + str(entry.pubchemid)) == int2CAS(CAS)

    # Each secondary index must hold exactly one entry per CAS
    total = len(db.CAS_index)
    assert total == len(db.smiles_index)
    assert total == len(db.InChI_index)
    assert total == len(db.InChI_key_index)
Esempio n. 6
0
def test_inorganic_db():
    '''Load the inorganic database on its own and check that each of its
    indexes resolves through `CAS_from_any`, and that the stored formulas,
    CAS numbers and molecular weights are self-consistent.
    '''
    db = ChemicalMetadataDB(
        elements=False,
        main_db=os.path.join(folder, 'Inorganic db.tsv'),
        user_dbs=[])

    # A stored CAS string must look itself up
    for entry in db.CAS_index.values():
        assert CAS_from_any(entry.CASs) == entry.CASs

    # Formula lookups; these formulas are not unique by design and are skipped
    non_unique = ('H2MgO2', 'F2N2')
    for formula, entry in db.formula_index.items():
        if formula in non_unique:
            continue
        assert CAS_from_any(formula) == entry.CASs

    # SMILES must be unique / searchable; blank SMILES entries are skipped
    for smi, entry in db.smiles_index.items():
        if smi:
            assert CAS_from_any('smiles=' + smi) == entry.CASs

    # Stored formulas must already be in serialized form
    assert all(
        entry.formula == serialize_formula(entry.formula)
        for entry in db.CAS_index.values())

    # Every stored CAS number must pass the validity check
    assert all(checkCAS(entry.CASs) for entry in db.CAS_index.values())

    # The stored MW must agree with the MW computed from the formula
    for entry in db.CAS_index.values():
        serialized = serialize_formula(entry.formula)
        composition = nested_formula_parser(serialized, check=False)
        assert_allclose(molecular_weight(composition), entry.MW, atol=0.05)
Esempio n. 7
0
def test_database_formulas():
    '''Check that every formula stored in `pubchem_db` is already in the form
    produced by `serialize_formula`.
    '''
    # This worked until isotopes were added to formulas
    for record in pubchem_db.CAS_index.values():
        assert record.formula == serialize_formula(record.formula)
Esempio n. 8
0
def test_db_vs_ChemSep():
    '''The CAS numbers are checked, as are most of the chemical formulas.
    Some chemical structural formulas aren't supported by the current formula
    parser and are ignored; otherwise it is a very effective test.
    '''
    import xml.etree.ElementTree as ET
    folder = os.path.join(os.path.dirname(__file__), 'Data')

    tree = ET.parse(os.path.join(folder, 'chemsep1.xml'))
    root = tree.getroot()

    # Map CAS -> {'name', 'smiles', 'formula'} for every compound in the file
    data = {}
    for child in root:
        CAS = [i.attrib['value'] for i in child if i.tag == 'CAS'][0]
        name = [i.attrib['value'] for i in child if i.tag == 'CompoundID'][0]
        smiles = [i.attrib['value'] for i in child if i.tag == 'Smiles']
        formula = [
            i.attrib['value'] for i in child if i.tag == 'StructureFormula'
        ][0]

        if '-' in formula:
            # Structural formulas containing dashes are not checked at all
            formula = None
        else:
            try:
                formula = serialize_formula(formula)
            except Exception:
                # The parser could not handle this formula; keep the raw
                # text. Only real errors are caught here so that
                # KeyboardInterrupt/SystemExit still propagate.
                pass

        # The Smiles tag is optional
        smiles = smiles[0] if smiles else None

        data[CAS] = {'name': name, 'smiles': smiles, 'formula': formula}

    # Every ChemSep CAS number must be present in the main database
    for CAS in data:
        hit = pubchem_db.search_CAS(CAS)
        assert hit.CASs == CAS

    # ... and must resolve to itself through the generic lookup
    for CAS in data:
        assert CAS_from_any(CAS) == CAS

    # Formulas must agree, except for the entries known not to be comparable
    for CAS, d in data.items():
        f = d['formula']
        if f is None or f == '1,4-COOH(C6H4)COOH' or d['name'] == 'Air':
            continue
        assert pubchem_db.search_CAS(CAS).formula == f

    # In an ideal world, the names would match too but ~22 don't. Adding more synonyms
    # might help.
    # Some of them are straight disagreements however


#    for CAS, d in data.items():
#        try:
#            assert CAS_from_any(d['name']) == CAS
#        except:
#            print(CAS, d['name'])
##

# In an ideal world we could also validate against their smiles
# but that's proving difficult due to things like 1-hexene -
# is it 'CCCCC=C' or 'C=CCCCC'?
#test_db_vs_ChemSep()
Esempio n. 9
0
def test_database_formulas():
    '''Check that every formula stored in `pubchem_db` is already in the form
    produced by `serialize_formula`.
    '''
    # This worked until isotopes were added to formulas
    stored = [entry.formula for entry in pubchem_db.CAS_index.values()]
    for formula in stored:
        assert formula == serialize_formula(formula)
Esempio n. 10
0
def CAS_from_any(ID, autoload=False):
    '''Looks up the CAS number of a chemical by searching and testing for the
    string being any of the following types of chemical identifiers:
    
    * Name, in IUPAC form or common form or a synonym registered in PubChem
    * InChI name, prefixed by 'InChI=1S/' or 'InChI=1/'
    * InChI key, prefixed by 'InChIKey='
    * PubChem CID, prefixed by 'PubChem='
    * SMILES (prefix with 'SMILES=' to ensure smiles parsing; ex.
      'C' will return Carbon as it is an element whereas the SMILES 
      interpretation for 'C' is methane)
    * CAS number (obsolete numbers may point to the current number)    

    If the input is an ID representing an element, the following additional 
    inputs may be specified as 
    
    * Atomic symbol (ex 'Na')
    * Atomic number (as a string)

    Parameters
    ----------
    ID : str
        One of the name formats described above
    autoload : bool, optional
        Passed through to every `pubchem_db` search call. When a search made
        with `autoload` False finds nothing, the whole lookup is retried once
        with `autoload` True before an exception is raised. (Presumably this
        lets the database load additional data on demand - confirm against
        the database class.)

    Returns
    -------
    CASRN : string
        A three-piece, dash-separated set of numbers

    Raises
    ------
    Exception
        If the identifier cannot be matched, even after the retry with
        `autoload` True.

    Notes
    -----
    An exception is raised if the name cannot be identified. The PubChem 
    database includes a wide variety of other synonyms, but these may not be
    present for all chemicals.

    Examples
    --------
    >>> CAS_from_any('water')
    '7732-18-5'
    >>> CAS_from_any('InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3')
    '64-17-5'
    >>> CAS_from_any('CCCCCCCCCC')
    '124-18-5'
    >>> CAS_from_any('InChIKey=LFQSCWFLJHTTHZ-UHFFFAOYSA-N')
    '64-17-5'
    >>> CAS_from_any('pubchem=702')
    '64-17-5'
    >>> CAS_from_any('O') # only elements can be specified by symbol
    '17778-80-2'
    '''
    ID = ID.strip()
    ID_lower = ID.lower()
    if ID in periodic_table:
        element = periodic_table[ID]
        if element.number not in homonuclear_elemental_gases:
            return element.CAS
        # For the homonuclear elemental gases only a symbol, an atomic number
        # or a CAS number is answered with the element's own CAS number; any
        # other kind of match falls through to the database searches below.
        if ID in periodic_table.symbol_to_elements:
            return element.CAS
        if ID in periodic_table.number_to_elements:
            return periodic_table[int(ID)].CAS
        if ID in periodic_table.CAS_to_elements:
            return element.CAS

    if checkCAS(ID):
        CAS_lookup = pubchem_db.search_CAS(ID, autoload)
        if CAS_lookup:
            return CAS_lookup.CASs

        # handle the case of synonyms
        CAS_alternate_lookup = pubchem_db.search_name(ID, autoload)
        if CAS_alternate_lookup:
            return CAS_alternate_lookup.CASs
        if not autoload:
            return CAS_from_any(ID, autoload=True)
        raise Exception('A valid CAS number was recognized, but is not in the database')

    # Prefixed identifiers. The prefix is matched case-insensitively, but the
    # payload is taken from the original string so its case is preserved.
    ID_len = len(ID)
    if ID_len > 9:
        inchi_search = False
        # normal upper case is 'InChI=1S/'
        if ID_lower.startswith('inchi=1s/'):
            inchi_search = ID[9:]
        elif ID_lower.startswith('inchi=1/'):
            inchi_search = ID[8:]
        if inchi_search:
            inchi_lookup = pubchem_db.search_InChI(inchi_search, autoload)
            if inchi_lookup:
                return inchi_lookup.CASs
            if not autoload:
                return CAS_from_any(ID, autoload=True)
            raise Exception('A valid InChI name was recognized, but it is not in the database')
        if ID_lower.startswith('inchikey='):
            inchi_key_lookup = pubchem_db.search_InChI_key(ID[9:], autoload)
            if inchi_key_lookup:
                return inchi_key_lookup.CASs
            if not autoload:
                return CAS_from_any(ID, autoload=True)
            raise Exception('A valid InChI Key was recognized, but it is not in the database')
    if ID_len > 8 and ID_lower.startswith('pubchem='):
        pubchem_lookup = pubchem_db.search_pubchem(ID[8:], autoload)
        if pubchem_lookup:
            return pubchem_lookup.CASs
        if not autoload:
            return CAS_from_any(ID, autoload=True)
        raise Exception('A PubChem integer identifier was recognized, but it is not in the database.')
    if ID_len > 7 and ID_lower.startswith('smiles='):
        smiles_lookup = pubchem_db.search_smiles(ID[7:], autoload)
        if smiles_lookup:
            return smiles_lookup.CASs
        if not autoload:
            return CAS_from_any(ID, autoload=True)
        raise Exception('A SMILES identifier was recognized, but it is not in the database.')

    # Try the smiles lookup anyway
    # Parsing SMILES is an option, but this is faster
    # Pybel API also prints messages to console on failure
    smiles_lookup = pubchem_db.search_smiles(ID, autoload)
    if smiles_lookup:
        return smiles_lookup.CASs

    # Best-effort formula lookup: an ID which is not a parseable formula is
    # simply not a formula, so errors are ignored. Only a single
    # ChemicalMetadata result is accepted; any other kind of result is
    # skipped.
    try:
        formula_query = pubchem_db.search_formula(serialize_formula(ID), autoload)
        if formula_query and isinstance(formula_query, ChemicalMetadata):
            return formula_query.CASs
    except Exception:
        pass

    # Try a direct lookup with the name - the fastest
    name_lookup = pubchem_db.search_name(ID, autoload)
    if name_lookup:
        return name_lookup.CASs

    # Permutate through various name options
    ID_no_space = ID.replace(' ', '')
    ID_no_space_dash = ID_no_space.replace('-', '')

    for name in [ID, ID_no_space, ID_no_space_dash]:
        for name2 in [name, name.lower()]:
            name_lookup = pubchem_db.search_name(name2, autoload)
            if name_lookup:
                return name_lookup.CASs

    # endswith, unlike ID[-1], is safe when ID is empty
    if ID.endswith(')') and '(' in ID:
        # Try to match in the form 'water (H2O)'
        first_identifier, second_identifier = ID[0:-1].split('(', 1)
        try:
            CAS1 = CAS_from_any(first_identifier)
            CAS2 = CAS_from_any(second_identifier)
        except Exception:
            # One of the halves could not be identified; not this form
            pass
        else:
            # Explicit comparison rather than `assert`, which is stripped
            # under -O and would then accept disagreeing identifiers
            if CAS1 == CAS2:
                return CAS1

    if not autoload:
        return CAS_from_any(ID, autoload=True)

    raise Exception('Chemical name not recognized')
Esempio n. 11
0
def CAS_from_any(ID, autoload=False):
    '''Looks up the CAS number of a chemical by searching and testing for the
    string being any of the following types of chemical identifiers:
    
    * Name, in IUPAC form or common form or a synonym registered in PubChem
    * InChI name, prefixed by 'InChI=1S/' or 'InChI=1/'
    * InChI key, prefixed by 'InChIKey='
    * PubChem CID, prefixed by 'PubChem='
    * SMILES (prefix with 'SMILES=' to ensure smiles parsing; ex.
      'C' will return Carbon as it is an element whereas the SMILES 
      interpretation for 'C' is methane)
    * CAS number (obsolete numbers may point to the current number)    

    If the input is an ID representing an element, the following additional 
    inputs may be specified as 
    
    * Atomic symbol (ex 'Na')
    * Atomic number (as a string)

    Parameters
    ----------
    ID : str
        One of the name formats described above
    autoload : bool, optional
        Passed through to every `pubchem_db` search call. When a search made
        with `autoload` False finds nothing, the whole lookup is retried once
        with `autoload` True before an exception is raised. (Presumably this
        lets the database load additional data on demand - confirm against
        the database class.)

    Returns
    -------
    CASRN : string
        A three-piece, dash-separated set of numbers

    Raises
    ------
    Exception
        If the identifier cannot be matched, even after the retry with
        `autoload` True.

    Notes
    -----
    An exception is raised if the name cannot be identified. The PubChem 
    database includes a wide variety of other synonyms, but these may not be
    present for all chemicals.

    Examples
    --------
    >>> CAS_from_any('water')
    '7732-18-5'
    >>> CAS_from_any('InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3')
    '64-17-5'
    >>> CAS_from_any('CCCCCCCCCC')
    '124-18-5'
    >>> CAS_from_any('InChIKey=LFQSCWFLJHTTHZ-UHFFFAOYSA-N')
    '64-17-5'
    >>> CAS_from_any('pubchem=702')
    '64-17-5'
    >>> CAS_from_any('O') # only elements can be specified by symbol
    '17778-80-2'
    '''
    ID = ID.strip()
    ID_lower = ID.lower()
    if ID in periodic_table:
        element = periodic_table[ID]
        if element.number not in homonuclear_elemental_gases:
            return element.CAS
        # For the homonuclear elemental gases only a symbol, an atomic number
        # or a CAS number is answered with the element's own CAS number; any
        # other kind of match falls through to the database searches below.
        if ID in periodic_table.symbol_to_elements:
            return element.CAS
        if ID in periodic_table.number_to_elements:
            return periodic_table[int(ID)].CAS
        if ID in periodic_table.CAS_to_elements:
            return element.CAS

    if checkCAS(ID):
        CAS_lookup = pubchem_db.search_CAS(ID, autoload)
        if CAS_lookup:
            return CAS_lookup.CASs

        # handle the case of synonyms
        CAS_alternate_lookup = pubchem_db.search_name(ID, autoload)
        if CAS_alternate_lookup:
            return CAS_alternate_lookup.CASs
        if not autoload:
            return CAS_from_any(ID, autoload=True)
        raise Exception('A valid CAS number was recognized, but is not in the database')

    # Prefixed identifiers. The prefix is matched case-insensitively, but the
    # payload is taken from the original string so its case is preserved.
    ID_len = len(ID)
    if ID_len > 9:
        inchi_search = False
        # normal upper case is 'InChI=1S/'
        if ID_lower.startswith('inchi=1s/'):
            inchi_search = ID[9:]
        elif ID_lower.startswith('inchi=1/'):
            inchi_search = ID[8:]
        if inchi_search:
            inchi_lookup = pubchem_db.search_InChI(inchi_search, autoload)
            if inchi_lookup:
                return inchi_lookup.CASs
            if not autoload:
                return CAS_from_any(ID, autoload=True)
            raise Exception('A valid InChI name was recognized, but it is not in the database')
        if ID_lower.startswith('inchikey='):
            inchi_key_lookup = pubchem_db.search_InChI_key(ID[9:], autoload)
            if inchi_key_lookup:
                return inchi_key_lookup.CASs
            if not autoload:
                return CAS_from_any(ID, autoload=True)
            raise Exception('A valid InChI Key was recognized, but it is not in the database')
    if ID_len > 8 and ID_lower.startswith('pubchem='):
        pubchem_lookup = pubchem_db.search_pubchem(ID[8:], autoload)
        if pubchem_lookup:
            return pubchem_lookup.CASs
        if not autoload:
            return CAS_from_any(ID, autoload=True)
        raise Exception('A PubChem integer identifier was recognized, but it is not in the database.')
    if ID_len > 7 and ID_lower.startswith('smiles='):
        smiles_lookup = pubchem_db.search_smiles(ID[7:], autoload)
        if smiles_lookup:
            return smiles_lookup.CASs
        if not autoload:
            return CAS_from_any(ID, autoload=True)
        raise Exception('A SMILES identifier was recognized, but it is not in the database.')

    # Try the smiles lookup anyway
    # Parsing SMILES is an option, but this is faster
    # Pybel API also prints messages to console on failure
    smiles_lookup = pubchem_db.search_smiles(ID, autoload)
    if smiles_lookup:
        return smiles_lookup.CASs

    # Best-effort formula lookup: an ID which is not a parseable formula is
    # simply not a formula, so errors are ignored. Only a single
    # ChemicalMetadata result is accepted; any other kind of result is
    # skipped.
    try:
        formula_query = pubchem_db.search_formula(serialize_formula(ID), autoload)
        if formula_query and isinstance(formula_query, ChemicalMetadata):
            return formula_query.CASs
    except Exception:
        pass

    # Try a direct lookup with the name - the fastest
    name_lookup = pubchem_db.search_name(ID, autoload)
    if name_lookup:
        return name_lookup.CASs

    # Permutate through various name options
    ID_no_space = ID.replace(' ', '')
    ID_no_space_dash = ID_no_space.replace('-', '')

    for name in [ID, ID_no_space, ID_no_space_dash]:
        for name2 in [name, name.lower()]:
            name_lookup = pubchem_db.search_name(name2, autoload)
            if name_lookup:
                return name_lookup.CASs

    # endswith, unlike ID[-1], is safe when ID is empty
    if ID.endswith(')') and '(' in ID:
        # Try to match in the form 'water (H2O)'
        first_identifier, second_identifier = ID[0:-1].split('(', 1)
        try:
            CAS1 = CAS_from_any(first_identifier)
            CAS2 = CAS_from_any(second_identifier)
        except Exception:
            # One of the halves could not be identified; not this form
            pass
        else:
            # Explicit comparison rather than `assert`, which is stripped
            # under -O and would then accept disagreeing identifiers
            if CAS1 == CAS2:
                return CAS1

    if not autoload:
        return CAS_from_any(ID, autoload=True)

    raise Exception('Chemical name not recognized')