예제 #1
0
def test_organic_user_db():
    """Validate the example organic user database when loaded in isolation.

    A ``ChemicalMetadataDB`` is built from the example user TSV file only
    (no elements, no main database), and every entry is then checked for
    round-trip lookups by CAS, SMILES, InChI, InChI key and PubChem id, for
    formula formatting, CAS validity, and molecular-weight consistency.
    """
    user_db_path = os.path.join(folder,
                                'chemical identifiers example user db.tsv')
    db = ChemicalMetadataDB(elements=False,
                            main_db=None,
                            user_dbs=[user_db_path])

    # A lookup by an entry's own CAS string must return that same CAS string
    for entry in db.CAS_index.values():
        assert CAS_from_any(entry.CASs) == entry.CASs

    # Make sure the file actually contributed entries
    assert len(db.CAS_index) > 100

    # Every non-empty SMILES string must resolve to the entry indexed under it
    for smiles, entry in db.smiles_index.items():
        if smiles:
            assert CAS_from_any('smiles=' + smiles) == entry.CASs

    # Stored formulas must already be in serialized form
    formulas_serialized = [
        entry.formula == serialize_formula(entry.formula)
        for entry in db.CAS_index.values()
    ]
    assert all(formulas_serialized)

    # Every stored CAS string must pass the CAS validity check
    cas_valid = [check_CAS(entry.CASs) for entry in db.CAS_index.values()]
    assert all(cas_valid)

    # The stored MW must agree with the MW computed from the formula
    for entry in db.CAS_index.values():
        serialized = serialize_formula(entry.formula)
        composition = nested_formula_parser(serialized, check=False)
        assert_allclose(molecular_weight(composition), entry.MW, atol=0.05)

    # Index keys are converted with int_to_CAS before comparison
    for cas_key, entry in db.CAS_index.items():
        assert CAS_from_any('InChI=1S/' + entry.InChI) == int_to_CAS(cas_key)

    for cas_key, entry in db.CAS_index.items():
        assert CAS_from_any('InChIKey=' +
                            entry.InChI_key) == int_to_CAS(cas_key)

    # PubChem ids of -1 are skipped; all others must resolve
    for cas_key, entry in db.CAS_index.items():
        if entry.pubchemid == -1:
            continue
        assert CAS_from_any('PubChem=' +
                            str(entry.pubchemid)) == int_to_CAS(cas_key)

    # All identifier indexes must hold the same number of entries
    entry_count = len(db.CAS_index)
    assert entry_count == len(db.smiles_index)
    assert entry_count == len(db.InChI_index)
    assert entry_count == len(db.InChI_key_index)
예제 #2
0
def test_database_formulas():
    """Check that every formula stored in ``pubchem_db`` is already serialized.

    Each entry's ``formula`` must be unchanged by ``serialize_formula``.
    """
    # Known failures are things like 3He, C2D4Br2, C14H18N3NaO10[99Tc], [1H]I.
    # The fix is to add an isotope database and make the formula parser
    # handle isotopes as well.
    # This check passed until isotopes were added to formulas.
    is_serialized = [
        entry.formula == serialize_formula(entry.formula)
        for entry in pubchem_db.CAS_index.values()
    ]
    assert all(is_serialized)
예제 #3
0
def test_inorganic_db():
    """Validate the inorganic database when loaded in isolation.

    A ``ChemicalMetadataDB`` is built from ``'Inorganic db.tsv'`` only (no
    elements, no main database). Lookups by CAS, formula and SMILES are
    checked, as are formula formatting, CAS validity and molecular weights.
    """
    inorganic_path = os.path.join(folder, 'Inorganic db.tsv')
    db = ChemicalMetadataDB(elements=False,
                            main_db=None,
                            user_dbs=[inorganic_path])

    # A lookup by an entry's own CAS string must return that same CAS string
    for entry in db.CAS_index.values():
        assert CAS_from_any(entry.CASs) == entry.CASs

    # Formula lookups; these formulas are not unique by design and are skipped
    ambiguous_formulas = {'H2MgO2', 'F2N2'}
    for formula, entry in db.formula_index.items():
        if formula not in ambiguous_formulas:
            assert CAS_from_any(formula) == entry.CASs

    # Every non-empty SMILES string must resolve to the entry indexed under it
    for smiles, entry in db.smiles_index.items():
        if smiles:
            assert CAS_from_any('smiles=' + smiles) == entry.CASs

    # Stored formulas must already be in serialized form
    formulas_serialized = [
        entry.formula == serialize_formula(entry.formula)
        for entry in db.CAS_index.values()
    ]
    assert all(formulas_serialized)

    # Every stored CAS string must pass the CAS validity check
    cas_valid = [check_CAS(entry.CASs) for entry in db.CAS_index.values()]
    assert all(cas_valid)

    # The stored MW must agree with the MW computed from the formula
    for entry in db.CAS_index.values():
        serialized = serialize_formula(entry.formula)
        composition = nested_formula_parser(serialized, check=False)
        assert_allclose(molecular_weight(composition), entry.MW, atol=0.05)
예제 #4
0
def test_db_vs_ChemSep():
    """Cross-check ``pubchem_db`` against the ChemSep component XML file.

    The CAS numbers are checked, as are most of the chemical formulas. Some
    chemical structural formulas aren't supported by the current formula
    parser and are ignored; otherwise it is a very effective test.

    Note from the original author: do not try to optimize this function; it
    has already been tried without success. The time is taken only by the
    XML parse step.

    ``xml.etree.cElementTree`` was removed in Python 3.9, so the plain
    ``xml.etree.ElementTree`` module is used; it picks up the C accelerator
    automatically when one is available.
    """
    import xml.etree.ElementTree as ET

    folder = os.path.join(os.path.dirname(__file__), 'Data')
    root = ET.parse(os.path.join(folder, 'chemsep1.xml')).getroot()

    data = {}
    for child in root:
        # Only the first occurrence of each tag within a component is kept.
        CAS, name, smiles, formula = None, None, None, None
        for i in child:
            tag = i.tag
            if CAS is None and tag == 'CAS':
                CAS = i.attrib['value']
            elif name is None and tag == 'CompoundID':
                name = i.attrib['value']
            elif smiles is None and tag == 'Smiles':
                smiles = i.attrib['value']
            elif formula is None and tag == 'StructureFormula':
                formula = i.attrib['value']

        if formula is not None:
            if '-' in formula:
                # Structural formulas containing dashes are not comparable
                formula = None
            else:
                try:
                    formula = serialize_formula(formula)
                except Exception:
                    # Best effort: keep the raw formula when the parser
                    # cannot handle it.
                    pass

        # Keep the whole SMILES string (previously only its first character
        # was stored); a missing or empty value is normalized to None.
        smiles = smiles or None

        data[CAS] = {'name': name, 'smiles': smiles, 'formula': formula}

    # Every ChemSep CAS must be found in the database under the same CAS
    for CAS in data:
        hit = pubchem_db.search_CAS(CAS)
        assert hit.CASs == CAS

    for CAS in data:
        assert CAS_from_any(CAS) == CAS

    # Formulas must agree, apart from the explicitly skipped cases
    for CAS, d in data.items():
        f = d['formula']
        if f is None or f == '1,4-COOH(C6H4)COOH' or d['name'] == 'Air':
            continue
        assert pubchem_db.search_CAS(CAS).formula == f
예제 #5
0
def _search_chemical(ID, autoload):
    """Resolve a chemical identifier string to its database record.

    The identifier is tried, in order, as: a key of ``periodic_table``, a
    CAS number, a prefixed InChI (``InChI=1S/`` or ``InChI=1/``), a prefixed
    InChI key (``InChIKey=``), a prefixed PubChem id (``PubChem=``), a
    prefixed SMILES (``smiles=``), a bare SMILES, a formula, a name (with a
    few spelling permutations), and finally the ``'name (identifier)'`` form.
    Prefixes are matched case-insensitively.

    Parameters
    ----------
    ID : str
        Identifier to search for; surrounding whitespace is stripped.
    autoload : bool
        Passed through to the ``pubchem_db`` search methods. When falsy and
        nothing is found, the search is retried once with ``autoload=True``.

    Returns
    -------
    object
        Whatever the matching ``pubchem_db.search_*`` call (or the nested
        ``search_chemical`` call) returned.

    Raises
    ------
    ValueError
        If the identifier was recognized as a CAS / InChI / InChI key /
        PubChem id / SMILES but is not in the database, or if it could not
        be recognized at all.
    """
    ID = ID.strip()
    ID_lower = ID.lower()
    if ID in periodic_table:
        # Special handling for homonuclear elements. Searching '1' -> H,
        # 'H' -> H, monatomic CAS -> H, but "Hydrogen" -> H2.
        # pubchem_db does not contain atomic numbers, so searching in the
        # periodic table is necessary.
        if (ID in periodic_table._symbol_to_elements
                or ID in periodic_table._number_to_elements
                or ID in periodic_table._CAS_to_elements):
            return pubchem_db.search_CAS(periodic_table[ID].CAS)
        return pubchem_db.search_CAS(periodic_table[ID].CAS_standard)

    if check_CAS(ID):
        CAS_lookup = pubchem_db.search_CAS(ID, autoload)
        if CAS_lookup:
            return CAS_lookup
        # Handle the case of synonyms
        CAS_alternate_lookup = pubchem_db.search_name(ID, autoload)
        if CAS_alternate_lookup:
            return CAS_alternate_lookup

        if not autoload:
            return search_chemical(ID, autoload=True)
        raise ValueError(
            'A valid CAS number (%s) was recognized, but is not in the database'
            % (ID))

    ID_len = len(ID)
    if ID_len > 9:
        inchi_search = False
        # Normal upper case is 'InChI=1S/'
        if ID_lower[0:9] == 'inchi=1s/':
            inchi_search = ID[9:]
        elif ID_lower[0:8] == 'inchi=1/':
            inchi_search = ID[8:]
        if inchi_search:
            inchi_lookup = pubchem_db.search_InChI(inchi_search, autoload)
            if inchi_lookup:
                return inchi_lookup
            if not autoload:
                return search_chemical(ID, autoload=True)
            raise ValueError(
                'A valid InChI name (%s) was recognized, but it is not in the database'
                % (inchi_search))
        if ID_lower[0:9] == 'inchikey=':
            inchi_key = ID[9:]
            inchi_key_lookup = pubchem_db.search_InChI_key(inchi_key, autoload)
            if inchi_key_lookup:
                return inchi_key_lookup
            if not autoload:
                return search_chemical(ID, autoload=True)
            # Report the key that was searched for, not the (falsy) result
            raise ValueError(
                'A valid InChI Key (%s) was recognized, but it is not in the database'
                % (inchi_key))
    if ID_len > 8:
        if ID_lower[0:8] == 'pubchem=':
            pubchem_lookup = pubchem_db.search_pubchem(ID[8:], autoload)
            if pubchem_lookup:
                return pubchem_lookup
            if not autoload:
                return search_chemical(ID, autoload=True)
            raise ValueError(
                'A PubChem integer (%s) identifier was recognized, but it is not in the database.'
                % (ID[8:]))
    if ID_len > 7:
        if ID_lower[0:7] == 'smiles=':
            smiles_lookup = pubchem_db.search_smiles(ID[7:], autoload)
            if smiles_lookup:
                return smiles_lookup
            if not autoload:
                return search_chemical(ID, autoload=True)
            raise ValueError(
                'A SMILES identifier (%s) was recognized, but it is not in the database.'
                % (ID[7:]))

    # Try the smiles lookup anyway
    # Parsing SMILES is an option, but this is faster
    # Pybel API also prints messages to console on failure
    smiles_lookup = pubchem_db.search_smiles(ID, autoload)
    if smiles_lookup:
        return smiles_lookup

    # Formula lookup is best effort: an ID that is not a parsable formula
    # simply falls through to the name searches.
    try:
        formula_query = pubchem_db.search_formula(serialize_formula(ID),
                                                  autoload)
    except Exception:
        formula_query = None
    if formula_query and isinstance(formula_query, ChemicalMetadata):
        return formula_query

    # Name lookups: the ID as given first (a direct lookup is the fastest),
    # then permutations without spaces and without dashes, each also in
    # lower case. Spellings already tried are not searched again.
    ID_no_space = ID.replace(' ', '')
    ID_no_space_dash = ID_no_space.replace('-', '')
    tried = set()
    for name in (ID, ID_no_space, ID_no_space_dash):
        for candidate in (name, name.lower()):
            if candidate in tried:
                continue
            tried.add(candidate)
            name_lookup = pubchem_db.search_name(candidate, autoload)
            if name_lookup:
                return name_lookup

    # endswith() instead of ID[-1] so an empty ID cannot raise IndexError here
    if ID.endswith(')') and '(' in ID:
        # Try to match in the form 'water (H2O)'
        first_identifier, second_identifier = ID[0:-1].split('(', 1)
        try:
            CAS1 = search_chemical(first_identifier, autoload)
            CAS2 = search_chemical(second_identifier, autoload)
        except Exception:
            # Either half failed to resolve; fall through to the final error
            pass
        else:
            # Explicit comparison rather than `assert`, which is stripped
            # under `python -O` and would let a mismatch be returned.
            if CAS1 == CAS2:
                return CAS1

    if not autoload:
        return _search_chemical(ID, autoload=True)

    raise ValueError('Chemical name (%s) not recognized' % (ID))
예제 #6
0
    def _search(self, ID):
        """Resolve an identifier string using this object's search methods.

        Underscores in ``ID`` are first replaced by spaces. The identifier is
        then tried, in order, as: a prefixed InChI (``InChI=1S/`` or
        ``InChI=1/``), a prefixed InChI key (``InChIKey=``), a prefixed
        PubChem id (``PubChem=``), a prefixed SMILES (``smiles=``), a name
        (lower-cased, and lower-cased after ``spaceout_words``), a CAS
        number, and finally a formula. Prefixes are matched
        case-insensitively.

        Parameters
        ----------
        ID : str
            Identifier to search for; must be non-empty.

        Returns
        -------
        object
            Whatever the first successful ``self.search_*`` call returned.

        Raises
        ------
        ValueError
            If ``ID`` is empty.
        LookupError
            If a prefixed identifier or a CAS number was recognized but is
            not in the database, or if nothing matched at all.
        """
        if not ID:
            raise ValueError('ID cannot be empty')
        ID = ID.replace('_', ' ')
        ID_lower = ID.lower()

        ID_len = len(ID)
        if ID_len > 9:
            inchi_search = False
            # Normal upper case is 'InChI=1S/'
            if ID_lower.startswith('inchi=1s/'):
                inchi_search = ID[9:]
            elif ID_lower.startswith('inchi=1/'):
                inchi_search = ID[8:]
            if inchi_search:
                inchi_lookup = self.search_InChI(inchi_search)
                if inchi_lookup:
                    return inchi_lookup
                raise LookupError(
                    'A valid InChI name was recognized, but it is not in the database'
                )
            if ID_lower.startswith('inchikey='):
                inchi_key_lookup = self.search_InChI_key(ID[9:])
                if inchi_key_lookup:
                    return inchi_key_lookup
                raise LookupError(
                    'A valid InChI Key was recognized, but it is not in the database'
                )
        if ID_len > 8:
            if ID_lower.startswith('pubchem='):
                pubchem_lookup = self.search_pubchem(ID[8:])
                if pubchem_lookup:
                    return pubchem_lookup
                raise LookupError(
                    'A PubChem integer identifier was recognized, but it is not in the database.'
                )
        if ID_len > 7:
            if ID_lower.startswith('smiles='):
                smiles_lookup = self.search_smiles(ID[7:])
                if smiles_lookup:
                    return smiles_lookup
                raise LookupError(
                    'A SMILES identifier was recognized, but it is not in the database.'
                )

        # Permutate through various name options
        ID_search = spaceout_words(ID).lower()
        for name in (ID_lower, ID_search):
            name_lookup = self.search_name(name)
            if name_lookup:
                return name_lookup

        if check_CAS(ID):
            CAS_lookup = self.search_CAS(ID)
            if CAS_lookup:
                return CAS_lookup

            # Handle the case of synonyms
            CAS_alternate_lookup = self.search_name(ID)
            if CAS_alternate_lookup:
                return CAS_alternate_lookup

            raise LookupError(
                'a valid CAS number was recognized, but its not in the database'
            )

        # Formula lookup is best effort: an ID that cannot be parsed as a
        # formula falls through to the final error. Only ordinary exceptions
        # are swallowed, so KeyboardInterrupt/SystemExit still propagate.
        try:
            formula = serialize_formula(ID)
        except Exception:
            pass
        else:
            formula_query = self.search_formula(formula)
            if formula_query:
                return formula_query

        raise LookupError(f'chemical {repr(ID)} not recognized')