def modelfit(alg, X, y, X_test, y_test, name1, name2, name3, name4,
             useTrainCV=True, cv_folds=5, early_stopping_rounds=80):
    if useTrainCV:
        # Tune the number of boosting rounds with xgboost's built-in cross-validation.
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X.values, label=y.values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=500)
        alg.set_params(n_estimators=cvresult.shape[0])
        print(cvresult.shape[0])
        n_estimators_optimal = cvresult.shape[0]

    # Fit the algorithm on the training data and predict on the train and test sets.
    alg.fit(X, y, eval_metric='auc')
    dtrain_predictions = alg.predict(X)
    dtrain_predprob = alg.predict_proba(X)[:, 1]
    dtest_predictions = alg.predict(X_test)
    dtest_predprob = alg.predict_proba(X_test)[:, 1]

    model_report(y, dtrain_predictions, dtrain_predprob,
                 y_test, dtest_predictions, dtest_predprob)
    save_results(name3, name4, dtrain_predictions, dtrain_predprob)
    save_results(name1, name2, dtest_predictions, dtest_predprob)
    print(mean_squared_error(y, dtrain_predprob))

    # Plot feature importances from the fitted booster.
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
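# Illustrative usage sketch for modelfit above (an assumption, not the original
# driver code): it presumes X/X_test are pandas DataFrames, y/y_test are pandas
# Series, and the helper functions model_report/save_results defined elsewhere
# in this project are available. The hyperparameters and output file names are
# placeholders.
#
# xgb_clf = xgb.XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5,
#                             subsample=0.8, colsample_bytree=0.8,
#                             objective='binary:logistic', random_state=27)
# modelfit(xgb_clf, X_train, y_train, X_test, y_test,
#          "xgb_test_preds.csv", "xgb_test_probs.csv",
#          "xgb_train_preds.csv", "xgb_train_probs.csv")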
def stack_results_3(Y, name1, name2, name3, nameout1, nameout2):
    print("Loading first result...")
    res1 = load_results(name1, True)
    print("Loading second result...")
    res2 = load_results(name2, True)
    print("Loading third result...")
    res3 = load_results(name3, True)

    max_acc = 0
    max_auc = 0
    max_frac1 = 0
    max_frac2 = 0
    # Grid-search the blending weights (frac1, frac2, 1 - frac1 - frac2) in steps of 0.01.
    for j in range(101):
        frac1 = j / 100.0
        for k in range(101 - j):
            frac2 = k / 100.0
            res_1 = np.array([i * frac1 for i in res1])
            res_2 = np.array([i * frac2 for i in res2])
            res_3 = np.array([i * (1 - frac1 - frac2) for i in res3])
            # np.add(a, b, c) treats c as the output buffer and would drop the
            # third model, so the three arrays are summed explicitly.
            result = res_1 + res_2 + res_3
            result_pred = np.round(result, decimals=0)
            (acc, auc) = model_report_get(Y, result_pred, result)
            if acc > max_acc:
                max_acc = acc
                max_auc = auc
                max_frac1 = frac1
                max_frac2 = frac2

    print("The max fractions are: %f, %f" % (max_frac1, max_frac2))
    res_1 = np.array([i * max_frac1 for i in res1])
    res_2 = np.array([i * max_frac2 for i in res2])
    res_3 = np.array([i * (1 - max_frac1 - max_frac2) for i in res3])
    result = res_1 + res_2 + res_3
    result_pred = np.round(result, decimals=0)
    model_report_test(Y, result_pred, result)
    save_results(nameout1, nameout2, result_pred, result)
def stack_results_2(Y, Yt, name1, name2, name3, name4, nameout1, nameout2):
    print("Loading first result...")
    res1 = load_results(name1, True)
    print("Loading second result...")
    res2 = load_results(name2, True)

    max_acc = 0
    max_auc = 0
    max_frac = 0
    # Grid-search the blending weight frac in steps of 0.01 on the first pair of results.
    for j in range(101):
        frac = j / 100.0
        res_1 = np.array([i * frac for i in res1])
        res_2 = np.array([i * (1 - frac) for i in res2])
        result = np.add(res_1, res_2)
        result_pred = np.round(result, decimals=0)
        (acc, auc) = model_report_get(Y, result_pred, result)
        if acc > max_acc:
            max_acc = acc
            max_auc = auc
            max_frac = frac

    print("The max fraction is: %f" % max_frac)
    # Apply the best weight to the second pair of results (the test-set predictions).
    print("Loading first result...")
    res1 = load_results(name3, True)
    print("Loading second result...")
    res2 = load_results(name4, True)
    res_1 = np.array([i * max_frac for i in res1])
    res_2 = np.array([i * (1 - max_frac) for i in res2])
    result = np.add(res_1, res_2)
    result_pred = np.round(result, decimals=0)
    model_report_test(Yt, result_pred, result)
    save_results(nameout1, nameout2, result_pred, result)
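# Illustrative call for stack_results_2 (the file names here are assumptions,
# not the project's actual outputs): Y_val/Y_test are the true labels, the
# first two files hold each model's validation-set probabilities, and the last
# two the corresponding test-set probabilities saved earlier via save_results.
#
# stack_results_2(Y_val, Y_test,
#                 "xgb_val_probs.csv", "NN_val_probs.csv",
#                 "xgb_test_probs.csv", "NN_results_probs2.csv",
#                 "stacked_results.csv", "stacked_probs.csv")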
def modelfit(alg, X, y, X_test, y_test, name1, name2, name3, name4):
    # Fit the estimator and evaluate it on both the training and test sets.
    alg.fit(X, y)
    dtrain_predictions = alg.predict(X)
    dtrain_predprob = alg.predict_proba(X)[:, 1]
    dtest_predictions = alg.predict(X_test)
    dtest_predprob = alg.predict_proba(X_test)[:, 1]
    model_report(y, dtrain_predictions, dtrain_predprob,
                 y_test, dtest_predictions, dtest_predprob)
    print(mean_squared_error(y_test, dtest_predprob))
    save_results(name1, name2, dtest_predictions, dtest_predprob)
model.add(Dropout(0.30))
model.add(Dense(600))
model.add(Activation('relu'))
model.add(Dropout(0.20))
model.add(Dense(50))
model.add(Activation('relu'))
model.add(Dropout(0.20))
model.add(Dense(1))
#model.add(Activation('softmax'))

model.compile(optimizer='rmsprop', loss='mse', metrics=['accuracy'])
# nb_epoch is the legacy Keras 1.x keyword; newer Keras versions use epochs.
fit = model.fit(X_train, Y_train, batch_size=170, nb_epoch=25)
model.summary()

# Predict probabilities, round them to hard 0/1 labels, and flatten to 1-D arrays.
train_predprob = model.predict(X_train)
test_predprob = model.predict(X_test)
train_pred = np.round(train_predprob, decimals=0)
test_pred = np.round(test_predprob, decimals=0)
train_predprob = train_predprob.transpose()[0]
test_predprob = test_predprob.transpose()[0]
train_pred = train_pred.transpose()[0]
test_pred = test_pred.transpose()[0]

Ydf = pd.DataFrame(Y_train, columns=['id'])
Ytdf = pd.DataFrame(Y_test, columns=['id'])
model_report(Ydf, train_pred, train_predprob, Ytdf, test_pred, test_predprob)
save_results("NN_results.csv", "NN_results_probs.csv", test_pred, test_predprob)
def scrap_n_crawl(url, df, current_indx, session, depth=2):
    # If the url has already been visited or the depth limit is reached, stop here.
    if url in ALREADY_VISITED_URLS or depth == 0:
        # Increment the reference count of the url if it has already been visited.
        if url in ALREADY_VISITED_URLS:
            df.loc[ALREADY_VISITED_URLS[url], ['References']] += 1
        return current_indx - 1

    try:
        # Fetch the url.
        response = session.get(url)
    except ConnectionError as e:
        # If the url blocks the request, wait for a while and try once more.
        print(e)
        time.sleep(2)
        try:
            response = session.get(url)
        except:
            return current_indx - 1
    except MissingSchema as e:
        print(e)
        print("Invalid URL - " + url)
        return current_indx - 1
    except Exception as e:
        print(e)
        return current_indx - 1

    try:
        response.raise_for_status()
    except HTTPError as e:
        print(e)
        return current_indx - 1

    # Get the raw text of the response and parse it with BeautifulSoup's html parser.
    raw_text = response.text
    html_soup = BeautifulSoup(raw_text, "html.parser")

    # Extract word frequencies/positions and the clean body text from the html soup.
    words_freq_and_pos, clean_text_body = get_clean_words_from_html(html_soup)
    title = get_title(html_soup)
    meta_description = get_meta_description(html_soup)

    # Normalize the url by removing unnecessary slashes.
    url = clean_url(url)

    # Store the document (webpage) details; the last field is the reference count.
    df.loc[current_indx] = [
        url, title,
        meta_description.replace(u'\xa0', u' '), 1
    ]
    # Store the whole clean body of the webpage.
    TEXT_OF_WEBPAGES[current_indx] = clean_text_body
    print(df.loc[current_indx]['Link'])
    ALREADY_VISITED_URLS[url] = current_indx
    map_keyword_to_url(words_freq_and_pos, current_indx)

    global SAVED_UPTO
    # If the number of unsaved documents exceeds the threshold, persist them first.
    if (current_indx - SAVED_UPTO) >= THRESHOLD:
        save_results(KEYWORD_TO_URL, ALREADY_VISITED_URLS, TEXT_OF_WEBPAGES, df)
        clear_text_dict()
        SAVED_UPTO = current_indx

    # From here on, traverse the outgoing links.
    links = html_soup.find_all('a')
    for link in links:
        next_page = link.get("href")
        if next_page is not None:
            next_page = next_page.strip()
            if next_page != '':
                if not next_page.startswith("#"):
                    next_page = get_absolute_next_page_url(url, next_page)
                    current_indx = scrap_n_crawl(next_page, df, current_indx + 1,
                                                 session, depth - 1)
    return current_indx
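# The crawler relies on a get_absolute_next_page_url helper defined elsewhere in
# this project. A minimal sketch of what such a helper could look like, using the
# standard library's urljoin (an assumption about its behaviour, not the original
# implementation):
#
# from urllib.parse import urljoin
#
# def get_absolute_next_page_url(base_url, href):
#     # Resolve a possibly relative href against the page it was found on.
#     return urljoin(base_url, href)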
if __name__ == "__main__":
    if not os.path.exists('data'):
        os.makedirs('data')
    if not os.path.exists('data/doc_pages'):
        os.makedirs('data/doc_pages')

    df = pd.DataFrame(
        columns=['Link', 'Title', 'Meta Description', 'References'])

    # Use a proxy-authenticated session for all requests.
    session = requests.Session()
    auth = get_auth()
    session.auth = HTTPProxyAuth(auth[0], auth[1])
    session.trust_env = False

    index = scrap_n_crawl("https://dmoz-odp.org/Computers/", df, 0, session, 2)
    print("This many urls crawled: " + str(index + 1))
    save_results(KEYWORD_TO_URL, ALREADY_VISITED_URLS, TEXT_OF_WEBPAGES, df)
# Let the user choose which experiment folder to post-process.
inventory = dict(enumerate(os.listdir(stornext_folder)))
question = "Choose the index of the appropriate experiment?\n%s" % (inventory)
response = int(raw_input(question))
experiment = inventory[response]
get_directory = stornext_folder + experiment + "/"
save_directory = hume_folder + results_folder + experiment + "/"

# Which arrays?
loa_array_strings = []
# nb_nsm, rate_nsm
loa_array_strings.append("num_nsm")
# Elemental & isotopic yield + ISM arrays.
iso_list = ["Re-185", "Re-187", "Os-187", "Os-188", "Os-186", "W-184"]
elem_list = ["Re", "Os", "W"]
for iso in iso_list:
    loa_array_strings.append("ism_iso_%s" % (iso))
    loa_array_strings.append("yield_%s" % (iso))
for elem in elem_list:
    loa_array_strings.append("ism_elem_%s" % (elem))

# Which timepoints? (9.5 Gyr, 14 Gyr)
loa_timepoints = [9.5e+9, 14e+9]

save_results(get_directory=get_directory,
             save_directory=save_directory,
             experiment=experiment,
             loa_array_strings=loa_array_strings,
             loa_timepoints=loa_timepoints)
model.add(Dense(300))
model.add(Activation('relu'))
model.add(Dropout(0.20))
model.add(Dense(50))
model.add(Activation('relu'))
model.add(Dropout(0.10))
model.add(Dense(1))
#model.add(Activation('softmax'))

model.compile(optimizer='rmsprop', loss='mse', metrics=['accuracy'])
# nb_epoch is the legacy Keras 1.x keyword; newer Keras versions use epochs.
fit = model.fit(X_train, Y_train, batch_size=170, nb_epoch=100)
model.summary()

# Predict probabilities, round them to hard 0/1 labels, and flatten to 1-D arrays.
train_predprob = model.predict(X_train)
test_predprob = model.predict(X_test)
train_pred = np.round(train_predprob, decimals=0)
test_pred = np.round(test_predprob, decimals=0)
train_predprob = train_predprob.transpose()[0]
test_predprob = test_predprob.transpose()[0]
train_pred = train_pred.transpose()[0]
test_pred = test_pred.transpose()[0]

Ydf = pd.DataFrame(Y_train, columns=['id'])
Ytdf = pd.DataFrame(Y_test, columns=['id'])
model_report(Ydf, train_pred, train_predprob, Ytdf, test_pred, test_predprob)
print(mean_squared_error(Ydf, train_predprob))
save_results("NN_train_results2.csv", "NN_train_probs2.csv", train_pred, train_predprob)
save_results("NN_results2.csv", "NN_results_probs2.csv", test_pred, test_predprob)