Example #1
def modelfit(alg, X, y, X_test, y_test, name1, name2, name3, name4, useTrainCV=True, cv_folds=5, early_stopping_rounds=80):

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X.values, label=y.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval = 500)
        alg.set_params(n_estimators=cvresult.shape[0])
        print(cvresult.shape[0])
        n_estimators_optimal = cvresult.shape[0]

    alg.fit(X, y, eval_metric='auc')

    dtrain_predictions = alg.predict(X)
    dtrain_predprob = alg.predict_proba(X)[:,1]

    dtest_predictions = alg.predict(X_test)
    dtest_predprob = alg.predict_proba(X_test)[:,1]

    model_report(y, dtrain_predictions, dtrain_predprob, y_test, dtest_predictions, dtest_predprob)
    save_results(name3, name4, dtrain_predictions, dtrain_predprob)
    save_results(name1, name2, dtest_predictions, dtest_predprob)

    print(mean_squared_error(y, dtrain_predprob))

    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
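For orientation, here is a minimal usage sketch for modelfit, not part of the original example: the classifier settings, the data splits, and the output file names are all assumed placeholders. The xgb.cv step shrinks n_estimators to the early-stopped round count before the final fit.

import xgboost as xgb
from xgboost import XGBClassifier

# Hypothetical setup; X_train, y_train, X_test, y_test and the CSV names are placeholders.
clf = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5,
                    objective='binary:logistic')
modelfit(clf, X_train, y_train, X_test, y_test,
         "xgb_test_preds.csv", "xgb_test_probs.csv",
         "xgb_train_preds.csv", "xgb_train_probs.csv")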
Example #2
def stack_results_3(Y, name1, name2, name3, nameout1, nameout2):
    print("Loading first result...")
    res1 = load_results(name1, True)
    print("Loading second result...")
    res2 = load_results(name2, True)
    print("Loading third result...")
    res3 = load_results(name3, True)
    max_acc = 0
    max_auc = 0
    max_frac1 = 0
    max_frac2 = 0
    for j in range(101):
        frac1 = j / 100.0
        for k in range(101 - j):
            frac2 = k / 100.0
            res_1 = np.array([i * frac1 for i in res1])
            res_2 = np.array([i * frac2 for i in res2])
            res_3 = np.array([i * (1 - frac1 - frac2) for i in res3])
            result = res_1 + res_2 + res_3
            result_pred = np.round(result, decimals=0)
            (acc, auc) = model_report_get(Y, result_pred, result)
            if (acc > max_acc):
                max_acc = acc
                max_auc = auc
                max_frac1 = frac1
                max_frac2 = frac2
    print("The max fractions are: %f, %f" % (max_frac1, max_frac2))
    res_1 = np.array([i * max_frac1 for i in res1])
    res_2 = np.array([i * max_frac2 for i in res2])
    res_3 = np.array([i * (1 - max_frac1 - max_frac2) for i in res3])
    result = res_1 + res_2 + res_3
    result_pred = np.round(result, decimals=0)
    model_report_test(Y, result_pred, result)
    save_results(nameout1, nameout2, result_pred, result)
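As a hedged, self-contained sketch (assumed names, not from the source), the same two-weight grid search can be written without the loader and reporting helpers; it returns the best (frac1, frac2) pair by accuracy, exactly as the loop above does.

import numpy as np

def best_three_way_blend(y_true, p1, p2, p3, step=0.01):
    # Grid-search two weights; the third weight is 1 - frac1 - frac2.
    y_true = np.asarray(y_true)
    p1, p2, p3 = np.asarray(p1), np.asarray(p2), np.asarray(p3)
    best_acc, best_f1, best_f2 = 0.0, 0.0, 0.0
    for f1 in np.arange(0.0, 1.0 + step, step):
        for f2 in np.arange(0.0, 1.0 - f1 + step, step):
            blended = f1 * p1 + f2 * p2 + (1.0 - f1 - f2) * p3
            acc = np.mean(np.round(blended) == y_true)
            if acc > best_acc:
                best_acc, best_f1, best_f2 = acc, f1, f2
    return best_f1, best_f2, best_acc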
Example #3
def stack_results_2(Y, Yt, name1, name2, name3, name4, nameout1, nameout2):
    print("Loading first result...")
    res1 = load_results(name1, True)
    print("Loading second result...")
    res2 = load_results(name2, True)
    max_acc = 0
    max_auc = 0
    max_frac = 0
    for j in range(101):
        frac = j / 100.0
        res_1 = np.array([i * frac for i in res1])
        res_2 = np.array([i * (1 - frac) for i in res2])
        result = np.add(res_1, res_2)
        result_pred = np.round(result, decimals=0)
        (acc, auc) = model_report_get(Y, result_pred, result)
        if (acc > max_acc):
            max_acc = acc
            max_auc = auc
            max_frac = frac
    print("The max fraction is: %f" % max_frac)
    print("Loading first result...")
    res1 = load_results(name3, True)
    print("Loading second result...")
    res2 = load_results(name4, True)
    res_1 = np.array([i * max_frac for i in res1])
    res_2 = np.array([i * (1 - max_frac) for i in res2])
    result = np.add(res_1, res_2)
    result_pred = np.round(result, decimals=0)
    model_report_test(Yt, result_pred, result)
    save_results(nameout1, nameout2, result_pred, result)
Example #4
def modelfit(alg, X, y, X_test, y_test, name1, name2, name3, name4):

    alg.fit(X, y)

    dtrain_predictions = alg.predict(X)
    dtrain_predprob = alg.predict_proba(X)[:,1]

    dtest_predictions = alg.predict(X_test)
    dtest_predprob = alg.predict_proba(X_test)[:,1]

    model_report(y, dtrain_predictions, dtrain_predprob, y_test, dtest_predictions, dtest_predprob)
    print(mean_squared_error(y_test, dtest_predprob))
    save_results(name1, name2, dtest_predictions, dtest_predprob)
Example #5
model.add(Dropout(0.30))
model.add(Dense(600))
model.add(Activation('relu'))
model.add(Dropout(0.20))
model.add(Dense(50))
model.add(Activation('relu'))
model.add(Dropout(0.20))
model.add(Dense(1))
#model.add(Activation('softmax'))

model.compile(optimizer='rmsprop', loss='mse', metrics=['accuracy'])

fit = model.fit(X_train, Y_train, batch_size=170, nb_epoch=25)
model.summary()
train_predprob = model.predict(X_train)
test_predprob = model.predict(X_test)
train_pred = np.round(train_predprob, decimals=0)
test_pred = np.round(test_predprob, decimals=0)

train_predprob = train_predprob.transpose()[0]
test_predprob = test_predprob.transpose()[0]
train_pred = train_pred.transpose()[0]
test_pred = test_pred.transpose()[0]

Ydf = pd.DataFrame(Y_train, columns=['id'])
Ytdf = pd.DataFrame(Y_test, columns=['id'])

model_report(Ydf, train_pred, train_predprob, Ytdf, test_pred, test_predprob)
save_results("NN_results.csv", "NN_results_probs.csv", test_pred,
             test_predprob)
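Note that this snippet starts mid-model, so the input layer is not shown, and nb_epoch is the Keras 1 spelling of what Keras 2 calls epochs. A plausible head for such a network is sketched below; the layer width and input_dim are assumptions, not the author's values.

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

model = Sequential()
model.add(Dense(600, input_dim=X_train.shape[1]))  # width and input size are assumed
model.add(Activation('relu'))
# ...the Dropout(0.30) and later layers shown above would follow from here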
Example #6
def scrap_n_crawl(url, df, current_indx, session, depth=2):
    #if url is already visited or if depth is zero then return
    if url in ALREADY_VISITED_URLS or depth == 0:
        #increment the reference count of the url if it is already visited
        if url in ALREADY_VISITED_URLS:
            df.loc[ALREADY_VISITED_URLS[url], ['References']] += 1
        return current_indx - 1
    try:
        #get the response of the url
        response = session.get(url)
    except ConnectionError as e:
        #if the server blocks or drops the connection, wait briefly and retry
        print(e)
        time.sleep(2)
        try:
            response = session.get(url)
        except:
            return current_indx - 1
    except MissingSchema as e:
        print(e)
        print("Invalid URL - " + url)
        return current_indx - 1
    except Exception as e:
        print(e)
        return current_indx - 1
    try:
        response.raise_for_status()
    except HTTPError as e:
        print(e)
        return current_indx - 1
    #get the raw text of response
    raw_text = response.text
    #parse the raw text with html parser of beautifulSoup
    html_soup = BeautifulSoup(raw_text, "html.parser")
    #get the words_freq_and_pos and clean_text from html soup
    words_freq_and_pos, clean_text_body = get_clean_words_from_html(html_soup)
    title = get_title(html_soup)
    meta_description = get_meta_description(html_soup)
    #get clean url with removed unnecessary slashes
    url = clean_url(url)
    #store the document (webpage) details; the last field is the reference count
    df.loc[current_indx] = [
        url, title, meta_description.replace(u'\xa0', u' '), 1
    ]
    #store the whole clean body of webpage
    TEXT_OF_WEBPAGES[current_indx] = clean_text_body
    print(df.loc[current_indx]['Link'])

    ALREADY_VISITED_URLS[url] = current_indx

    map_keyword_to_url(words_freq_and_pos, current_indx)
    global SAVED_UPTO
    #if the number of unsaved docs exceeds the threshold, save their data first
    if (current_indx - SAVED_UPTO) >= THRESHOLD:
        save_results(KEYWORD_TO_URL, ALREADY_VISITED_URLS, TEXT_OF_WEBPAGES,
                     df)
        clear_text_dict()
        SAVED_UPTO = current_indx
    #from here on start traversing other outgoing links
    links = html_soup.find_all('a')
    for link in links:
        next_page = link.get("href")
        if next_page is not None:
            next_page = next_page.strip()
            if next_page != '':
                if not next_page.startswith("#"):
                    next_page = get_absolute_next_page_url(url, next_page)
                    current_indx = scrap_n_crawl(next_page, df,
                                                 current_indx + 1, session,
                                                 depth - 1)
    return current_indx
Example #7
if __name__ == "__main__":
    if not os.path.exists('data'):
        os.makedirs('data')
    if not os.path.exists('data/doc_pages'):
        os.makedirs('data/doc_pages')
    df = pd.DataFrame(
        columns=['Link', 'Title', 'Meta Description', 'References'])
    session = requests.Session()
    auth = get_auth()
    session.auth = HTTPProxyAuth(auth[0], auth[1])
    session.trust_env = False
    index = scrap_n_crawl("https://dmoz-odp.org/Computers/", df, 0, session, 2)
    print("This many urls crawled: " + str(index + 1))
    save_results(KEYWORD_TO_URL, ALREADY_VISITED_URLS, TEXT_OF_WEBPAGES, df)
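get_absolute_next_page_url is referenced by scrap_n_crawl but not defined in this listing; a minimal sketch of such a helper (an assumption, not the author's implementation) resolves relative hrefs against the current page with urljoin.

from urllib.parse import urljoin

def get_absolute_next_page_url(base_url, next_page):
    # Relative hrefs ("/about", "page2.html") are resolved against the current page;
    # absolute URLs pass through unchanged.
    return urljoin(base_url, next_page)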
Example #8
    inventory = dict(enumerate(os.listdir(stornext_folder)))
    question = "Choose the index of the appropriate experiment?\n%s" % (
        inventory)
    response = int(raw_input(question))

    experiment = inventory[response]
    get_directory = stornext_folder + experiment + "/"
    save_directory = hume_folder + results_folder + experiment + "/"

    #which arrays?
    loa_array_strings = []
    #nb_nsm, rate_nsm
    loa_array_strings.append("num_nsm")
    #elem & iso yield+ism
    iso_list = ["Re-185", "Re-187", "Os-187", "Os-188", "Os-186", "W-184"]
    elem_list = ["Re", "Os", "W"]
    for iso in iso_list:
        loa_array_strings.append("ism_iso_%s" % (iso))
        loa_array_strings.append("yield_%s" % (iso))
    for elem in elem_list:
        loa_array_strings.append("ism_elem_%s" % (elem))

    #which timepoints? (9.5Gyr, 14Gyr)
    loa_timepoints = [9.5e+9, 14e+9]

    save_results(get_directory=get_directory,
                 save_directory=save_directory,
                 experiment=experiment,
                 loa_array_strings=loa_array_strings,
                 loa_timepoints=loa_timepoints)
Example #9
model.add(Dense(300))
model.add(Activation('relu'))
model.add(Dropout(0.20))
model.add(Dense(50))
model.add(Activation('relu'))
model.add(Dropout(0.10))
model.add(Dense(1))
#model.add(Activation('softmax'))

model.compile(optimizer='rmsprop', loss='mse', metrics=['accuracy'])

fit = model.fit(X_train, Y_train, batch_size=170, nb_epoch=100)
model.summary()
train_predprob = model.predict(X_train)
test_predprob = model.predict(X_test)
train_pred = np.round(train_predprob, decimals=0)
test_pred = np.round(test_predprob, decimals=0)

train_predprob = train_predprob.transpose()[0]
test_predprob = test_predprob.transpose()[0]
train_pred = train_pred.transpose()[0]
test_pred = test_pred.transpose()[0]

Ydf = pd.DataFrame(Y_train, columns=['id'])
Ytdf = pd.DataFrame(Y_test, columns=['id'])

model_report(Ydf, train_pred, train_predprob, Ytdf, test_pred, test_predprob)
print(mean_squared_error(Ydf, train_predprob))
save_results("NN_train_results2.csv", "NN_train_probs2.csv", train_pred, train_predprob)
save_results("NN_results2.csv", "NN_results_probs2.csv", test_pred, test_predprob)