Esempio n. 1
0
def main():
    """
    Extract data from databases and store into pickle files of sklearn-ready lists

    Runs the currently selected query against the CatApp sqlite database
    ('catapp/catapp.db'), passes the result to `get_data_from_query`, and
    pickles the three returned lists into 'datasets/' as
    data_<dataset>.pckl, energetics_<dataset>.pckl and key_<dataset>.pckl.

    Returns:
    0 -- after the three pickle files have been written

    TODO: abstract out SQL queries
    """

    check_species_in_dict = False
    mol_dict = bond_order_dict()

    # alloy_formation = pd.read_table('alloys_formation.txt', index_col=0)

    # label used in the names of the output pickle files
    dataset = 'CO2+NO_larger'

    # select chemisorption energies
    # e.g. NO --> N* + O*
    # reaction = ['NO*','N*','O*']
    # table = c.execute('SELECT * FROM catapp WHERE ab = \"'+reaction[0]+'\" AND a = \"'+reaction[1]+'\" AND b = \"'+reaction[2]+'\"')

    # CO2 intermediates
    # table = c.execute('SELECT * FROM catapp WHERE ((ab LIKE "%C%O%" AND a LIKE "%C%O%") OR (ab LIKE "CH_*" AND a LIKE "CH_") OR  (ab LIKE "O%" AND a LIKE "O%") )AND (ab NOT LIKE "%Si%") AND (ab NOT LIKE "%Cl%") AND (termination LIKE "___") AND (surface like "%3%" or surface like "__") AND (surface <> "ZnO");')

    # CO2+NO intermediates
    # reaction = ['NO*','N*','O*']
    # table = c.execute('SELECT * FROM catapp WHERE ' +
    #                   '(((ab LIKE "%C%O%" AND a LIKE "%C%O%") OR ' +
    #                       '(ab LIKE "CH_*" AND a LIKE "CH_") OR  ' +
    #                       '(ab LIKE "O%" AND a LIKE "O%")) ' +
    #                   'AND (ab NOT LIKE "%Si%") ' +
    #                   'AND (ab NOT LIKE "%Cl%") ' +
    #                   'AND (termination LIKE "___") ' +
    #                   'AND (surface like "%3%" or surface like "__") ' +
    #                   'AND (surface <> "ZnO")) OR ' +
    #                       '(ab = \"'+reaction[0]+'\" AND a = \"'+reaction[1]+'\" AND b = \"'+reaction[2]+'\");'
    #                   )

    # A3B alloys with atomic adsorption energies of C, O, H, CHO, and CO
    # table = c.execute('select * from catapp where (ab like "_*" or ab = "CO*" or ab = "CHO*") and (termination like "___") and (surface like "%3%" or surface like "__") ;')

    # table = c.execute('SELECT * FROM catapp WHERE (ab like "%*") AND (a NOT LIKE "%*" OR b NOT LIKE "%*") AND (ab NOT LIKE "%Si%") AND (termination LIKE "___") AND (surface like "%3%" or surface like "__");')

    # A3B alloys and pure metals with most atomic adsorption energies
    # table = c.execute('SELECT * FROM catapp WHERE (ab NOT LIKE "%Si%") AND (ab NOT LIKE "%Cl%") AND (termination LIKE "___") AND (surface like "%3%" or surface like "__");')

    # CO2+NO intermediates + more (the query that is actually run)
    # The three reaction species are bound through '?' placeholders instead
    # of being concatenated into the SQL text.
    reaction = ['NO*', 'N*', 'O*']
    query = ('SELECT * FROM catapp WHERE '
             '(((ab LIKE "%C%O%") OR '
             '(ab LIKE "CH_*") OR  '
             '(ab LIKE "O%")) '
             'AND (ab NOT LIKE "%Si%") '
             'AND (ab NOT LIKE "%Cl%") '
             'AND (termination LIKE "___") '
             'AND (surface like "%3%" or surface like "__") '
             'AND (surface <> "ZnO")) OR '
             '(ab = ? AND a = ? AND b = ?);')

    # feature names kept for reference (not used by this function):
    # ['h', 'k', 'l', 'stoichiometry_M1', 'stoichiometry_M2', 'E_form',
    #  'density', 'a', 'b', 'c', 'alpha', 'beta', 'gamma', 's_M1', 'p_M1',
    #  'd_M1', 'f_M1', 's_M2', 'p_M2', 'd_M2', 'f_M2', 'max_bonds_central',
    #  'molecule_bonds', 'surface_bonds', 'C_count', 'H_count', 'O_count',
    #  'N_count']

    # load the CatApp database
    conn = sqlite3.connect('catapp/catapp.db')
    try:
        if check_species_in_dict:
            # Use a dedicated cursor: execute() hands back the cursor it was
            # called on, so running these checks on the cursor that holds the
            # main query would replace and drain its pending result set.
            check_cursor = conn.cursor()
            species = set()
            for spec in ['AB', 'A', 'B']:
                result = check_cursor.execute('SELECT '+spec+' FROM catapp WHERE (ab NOT LIKE "%Si%") AND (ab NOT LIKE "%Cl%") AND (termination LIKE "___") AND (surface like "%3%" or surface like "__");')
                species.update(unicode_convert(s[0]) for s in result)

            # report every species that has no entry in mol_dict
            for spec in species:
                if spec not in mol_dict:
                    print(spec)

        c = conn.cursor()
        table = c.execute(query, reaction)

        # the rows are consumed here, while the connection is still open
        data, energetics, data_key = get_data_from_query(table)
    finally:
        conn.close()

    # Binary mode plus a context manager: pickle output is a byte stream and
    # each file is flushed and closed before the next one is written.
    outputs = (('data', data), ('energetics', energetics), ('key', data_key))
    for prefix, payload in outputs:
        with open('datasets/' + prefix + '_' + dataset + '.pckl', 'wb') as handle:
            pickle.dump(payload, handle)

    return 0
Esempio n. 2
0
def get_data_from_query(table):
    """Returns the matrix of data, the response, and an array form of the table

    Input:
    table -- Table from SQL query e.g. `c.execute(...)`

    Returns:
    data -- data in the design matrix
    energetics -- response variable, the energetics of the reaction
    data_key -- corresponding row from SQL table
    """

    mol_dict = bond_order_dict()
    
        # set up data and response lists
    data = []
    energetics = []
    data_key = []
    
    # populate design matrix and response vector
    for row in table:
        # h|k|l|stoichiometry_M1|stoichiometry_M2|E_form|density|a|b|c|alpha|beta|gamma|s_M1|p_M1|d_M1|f_M1|s_M2|p_M2|d_M2|f_M2|max bonds of the central atom|bonds in the molecule|bonds in the surface|C count|H count|O count|N count

        # surface termination - determined by Miller indices
        observation = [int(row[3][0]), int(row[3][1]), int(row[3][2])]

        # Split formula and determine stoichiometry
        formula = split_formula(row[2])
        if len(set(formula)) > 1:
            stoich = [s for s in formula if s.isdigit()][0]
            
            if formula.index(stoich) == 1:
                observation += [float(stoich),1]
                material = ''.join([formula[2],formula[0],formula[1]])
            else:
                observation += [1,float(stoich)]
                material = row[2]
            
            formula.remove(stoich)

        else:
            observation += [1,3]
            material = formula[0]
        
        # make sure MP has the required data
        mp_data = get_mp_crystal_data(material)
        
        if mp_data:
            for bulk_prop in mp_data:
                observation.append(bulk_prop)

            # repeat for pure elements to match columns for binary alloys
            if len(formula) == 1:
                formula *= 2

            for element in formula:
                for elemental_prop in get_outer_elec_data(element):
                    observation.append(elemental_prop)       
            
            # bond orders of adsorbate
            for state in row[4:7]:
                for bond_order in mol_dict[state]['b-order']:
                    observation.append(bond_order)

                # element counts for each adsorbate
                for element_count in mol_dict[state]['e-count']:
                    observation.append(element_count)
            
            # add all columns as an entry
            data.append(observation)
            
            # output: E_rxn
            energetics.append(float(row[0]))

            # add data row to key
            data_key += [row]