def gen_atomic_df(self, composition_df):
    """Convert compound amounts into row-normalized element amounts.

    Every column label of ``composition_df`` is parsed with
    ``parse_formula``. For each row, the amount of each compound is
    multiplied by that compound's per-element counts and accumulated per
    element; each row is then divided by its own sum.

    Args:
        composition_df: DataFrame whose column labels are compound formulas
            and whose values are the amounts of each compound per row.

    Returns:
        DataFrame with the index of ``composition_df`` and one column per
        entry of ``self.chemical_attributes.columns``. A row whose
        accumulated amounts sum to zero is divided by zero.

    Raises:
        ValueError: If a formula contains an element that is not a column
            of ``self.chemical_attributes``.
    """
    all_elements = self.chemical_attributes.columns.tolist()

    # Constant-time lookup instead of list.index per element; setdefault
    # keeps the first position for a repeated label, as list.index would.
    element_position = {}
    for position, symbol in enumerate(all_elements):
        element_position.setdefault(symbol, position)

    totals = np.zeros((len(composition_df), len(all_elements)))
    for compound in composition_df.columns.tolist():
        amounts = composition_df[compound].values
        # Parse each formula once and accumulate straight away.
        for symbol, count in parse_formula(compound).items():
            try:
                column = element_position[symbol]
            except KeyError:
                raise ValueError(
                    f"element {symbol!r} from formula {compound!r} is not "
                    "a column of chemical_attributes"
                ) from None
            # float64 weight, matching the zeros-initialised accumulator.
            totals[:, column] += amounts * np.float64(count)

    atomic_df = pd.DataFrame(
        totals,
        columns=all_elements,
        index=composition_df.index,
    )
    return atomic_df.div(atomic_df.sum(axis=1), axis=0)
def preprocess_data():
    """Train a small dense binary classifier on element-count vectors.

    Reads ``training_data.csv`` (columns ``Formula`` and ``Is_Candidate``),
    shuffles the rows, and converts every string formula into a list of
    per-element amounts with one slot per symbol of
    ``periodictable.elements`` (the first entry is skipped). The first
    50000 usable rows are used for training; the remaining rows are held
    out for validation.

    Returns:
        None. The parsed first row, the feature count, the model summary
        and the training log are printed.
    """
    reader = pd.read_csv('training_data.csv')

    # Shuffle formulas and labels together so the pairs stay intact.
    formulas = list(reader['Formula'].to_numpy())
    labels = list(reader['Is_Candidate'].to_numpy().astype(int))
    data = list(zip(formulas, labels))
    random.shuffle(data)

    elements = [str(element.symbol) for element in periodictable.elements]
    elements = elements[1:]
    empty_counts = dict.fromkeys(elements, 0)

    parsed_formulas = []
    kept_labels = []
    for formula, label in data:
        # A non-string formula (e.g. a missing cell) cannot be parsed. Its
        # label is dropped with it so features and labels stay aligned.
        if not isinstance(formula, str):
            continue
        counts = dict(empty_counts)
        counts.update(chemparse.parse_formula(formula))
        # NOTE(review): a parsed symbol missing from `elements` adds an
        # extra slot and makes this row longer than the others - confirm
        # whether such formulas can occur in the data.
        parsed_formulas.append(list(counts.values()))
        kept_labels.append(label)

    print(parsed_formulas[0])

    training_size = 50000
    training_formulas = np.array(parsed_formulas[:training_size])
    training_labels = np.array(kept_labels[:training_size])
    # Hold out the rows *after* the training slice; validating on the
    # training rows themselves says nothing about generalisation.
    testing_formulas = np.array(parsed_formulas[training_size:])
    testing_labels = np.array(kept_labels[training_size:])

    feature_count = len(training_formulas[0])
    print(feature_count)

    model = tf.keras.Sequential([
        # input_shape must be a tuple; `(n)` without the comma is an int.
        tf.keras.layers.InputLayer(input_shape=(feature_count,)),
        tf.keras.layers.Dense(24, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(
        loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']
    )
    model.summary()

    # With no more than `training_size` usable rows nothing is held out;
    # train without validation instead of passing empty arrays.
    validation_data = None
    if len(testing_formulas) > 0:
        validation_data = (testing_formulas, testing_labels)

    num_epochs = 30
    model.fit(
        training_formulas,
        training_labels,
        epochs=num_epochs,
        validation_data=validation_data,
        verbose=2,
    )
def parse_formula(formula):
    """Parse a formula with chemparse and re-split over-long keys.

    Any key of the chemparse result that is longer than two characters is
    passed to ``parse_segment``. That key is removed and every entry of the
    ``parse_segment`` result is written into the dictionary, overwriting an
    existing entry with the same key. The amount that was attached to the
    removed key is not carried over.

    Args:
        formula: Formula string handed to ``chemparse.parse_formula``.

    Returns:
        The dictionary returned by chemparse, modified in place.
    """
    elements = chemparse.parse_formula(formula)

    # Collect the offending keys first so the dict can be edited safely.
    oversized = [symbol for symbol in elements if len(symbol) > 2]
    for symbol in oversized:
        replacement = parse_segment(symbol)
        del elements[symbol]
        for name in replacement:
            elements[name] = replacement[name]

    return elements
def parseFormulaNorm(formula, sum_total=1, round_=False):
    """Return the parsed amounts of ``formula`` rescaled to ``sum_total``.

    Args:
        formula: Formula string handed to ``parse_formula``.
        sum_total: Value the returned amounts should add up to (before any
            rounding). Defaults to 1, i.e. fractions.
        round_: Number of digits passed to ``round()``. Any falsy value
            (``False``, ``0``, ``None``) disables rounding.

    Returns:
        dict mapping every key of the parsed formula to its rescaled
        amount. An empty parse result yields an empty dict.

    Raises:
        ZeroDivisionError: If the parse result is non-empty and its
            amounts sum to zero.
    """
    dic = parse_formula(formula)
    sum_values = sum(dic.values())

    # Apply sum_total in both branches; previously it was silently ignored
    # whenever rounding was disabled.
    norm_dic = {k: v * sum_total / sum_values for k, v in dic.items()}
    if round_:
        norm_dic = {k: round(v, round_) for k, v in norm_dic.items()}
    return norm_dic
def main():
    """Report IMA formulas that contain dashes or other unusual characters.

    Reads the ``IMA Chemistry (plain)`` column of
    ``Data/IMA_abiotic_labels.csv`` and classifies each formula:

    * formulas containing ``+`` are neither printed nor counted;
    * otherwise, formulas containing ``-`` are printed and counted as
      "dash";
    * otherwise, formulas containing any character outside the allowed set
      are printed together with their chemparse result and counted as
      "extra".

    Cells that are not strings (e.g. missing values) are skipped. The two
    counters are printed at the end.
    """
    # Inside a character class `\b` is a backspace, so this matches any run
    # of characters other than digits, ASCII letters, parentheses and
    # backspace ("o" and "x" are already covered by a-z).
    # NOTE(review): possibly meant to tolerate a literal "\box" token -
    # confirm the intended character set.
    unusual_chars = re.compile(r'[^\da-zA-Z()\box]+')

    df = pd.read_csv("Data/IMA_abiotic_labels.csv")
    formulas = df["IMA Chemistry (plain)"].tolist()

    dash_count = 0
    extra_count = 0
    for f in formulas:
        if not isinstance(f, str):
            # A missing cell is read as a float NaN; `'+' in f` would raise.
            continue
        if '+' in f:
            # Charged formulas are deliberately left out of both counters.
            continue
        if '-' in f:
            print(f)
            dash_count += 1
        elif unusual_chars.search(f):
            print(f)
            print(chemparse.parse_formula(f))
            extra_count += 1

    print("Dash count:", dash_count)
    print("Extra count:", extra_count)
# Crystal System Prediction Integrating Concepts Of Chemistry & Computer Science ''') chem_data = data.Formula.apply(chemparse.parse_formula) chem_data = pd.json_normalize(chem_data) chem_data = chem_data.fillna(0) data = data.join(chem_data) st.subheader("Data Information") st.dataframe(data) st.write(data.describe()) user_input = st.sidebar.text_input("Enter The Formula Of The Compound") user_input = chemparse.parse_formula(user_input) element_list = [ 'K', 'S', 'O', 'Al', 'Fe', 'H', 'N', 'Ce', 'C', 'Cl', 'B', 'Cu', 'Ba', 'Ca', 'Co', 'Pb', 'Mn', 'Mg', 'Hg', 'Ni', 'Cr', 'Sr', 'Na', 'Zn', 'Ag', 'I', 'P' ] user_data = {} melting_point = { "Melting Point": int(st.sidebar.slider("Melting Point (in K)", 273.0, 2000.0, 664.1)) } solubility = { "Solubility":
from mendeleev import element
import chemparse

# Interactive loop: read a formula, print the sum over its elements of
# (element.nvalence() * amount in the formula). Entering "0" quits.
while True:
    try:
        inputThing = input("Enter Formula:")
    except EOFError:
        # stdin was closed; without this the loop would spin forever
        # printing the error message.
        break

    if inputThing == "0":
        # The quit sentinel is not a formula, so do not try to parse it.
        break

    try:
        formula = chemparse.parse_formula(inputThing)
        # Computed in one expression so a failure part-way through cannot
        # leak a partial sum into the next formula.
        totalValence = sum(element(k).nvalence() * v for k, v in formula.items())
    except Exception:
        # Top-level boundary of an interactive tool: report and re-prompt.
        # KeyboardInterrupt is not an Exception and still stops the script.
        print("Error! Try Again.")
    else:
        print(totalValence)
def initialize_properties_file(a, ai, id, d, ma):
    """Create the properties file of a material and write its header.

    Writes ``property_calculations/properties_<id>.txt`` with, in order:
    the material ID, the chemical formula of ``a`` (full and empirical
    Hill form), one title per site of ``ai``, three rows holding the
    first, second and third coordinate of every site, and the two header
    rows (column names, then units) of the properties table.

    Parameters:
    a (obj): atoms object of a class defined in ase describing the
        material; only its chemical formula is read.
    ai (obj): initial unit-cell atoms object of a class defined in ase;
        supplies the site titles and the site positions.
    id (str): identifier of the material system; used in the file name
        and in the first line of the file.
    d (int): number of decimals used for the site positions; also sets the
        default column width (``d + 6``).
    ma (boolean): when true, the DebyeT and Lindemann columns are added to
        the properties header.

    Returns:
    None
    """

    # Help function for formatting: one space, then the text left-justified
    # in a column of width k + 6.
    def lj(text, k=d):
        return " " + text.ljust(k + 6)

    # NOTE: `id` shadows the builtin but is part of the public signature.
    path = "property_calculations/properties_" + id + ".txt"

    # The context manager closes the file even if one of the writes raises.
    with open(path, "w+") as out:
        out.write("Material ID: " + id + "\n")
        out.write("Unit cell composition: " + a.get_chemical_formula() + "\n")
        chem_formula = a.get_chemical_formula(mode='hill', empirical=True)
        out.write("Material: " + chem_formula + "\n")

        # Write the elements as title: one entry per atom, e.g.
        # ["Al", "Mg", "Mg", "Mg"] for "AlMg3".
        out.write("Site positions of initial unit cell:" + "\n")
        counts = chemparse.parse_formula(ai.get_chemical_formula())
        # NOTE(review): titles follow the element order of the formula,
        # which may differ from the atom order of get_positions() - confirm.
        site_titles = [
            symbol
            for symbol, amount in counts.items()
            for _ in range(int(amount))
        ]
        for title in site_titles:
            out.write(lj(title))

        # Write the site positions, one row per Cartesian component.
        positions = ai.get_positions()
        number_format = "." + str(d) + "f"  # d decimals
        for axis in range(3):
            out.write("\n")
            for site in range(len(positions)):
                out.write(lj(format(positions[site, axis], number_format)))
        out.write("\n")

        # Column names of the properties table.
        out.write("Properties:\n")
        out.write(
            lj("Time") + lj("Epot") + lj("Ekin") + lj("Etot")
            + lj("Temp", 2) + lj("MSD")
        )
        out.write(
            lj("Self_diff") + lj("LC_a", 3) + lj("LC_b", 3) + lj("LC_c", 3)
        )
        out.write(lj("Volume") + lj("Pressure"))
        if ma:
            out.write(lj("DebyeT", 2) + lj("Lindemann"))
        out.write("\n")

        # Units row, aligned with the column names above.
        out.write(
            lj("fs") + lj("eV/atom") + lj("eV/atom") + lj("eV/atom")
            + lj("K", 2) + lj("Å^2")
        )
        out.write(lj("mm^2/s") + lj("Å", 3) + lj("Å", 3) + lj("Å", 3))
        out.write(lj("Å^3/atom") + lj("GPa"))
        if ma:
            out.write(lj("K", 2) + lj("1"))
        out.write("\n")
    return