Example #1
def load_url(time):
    result = dict()
    num = '100'
    purl = NewsHomeURL()
    channels = ['news_china_suda', 'news_world_suda', 'news_society_suda']
    out_path = "F:/scrapy/sina_data1.1.0/news_detail_url/" + time + "/"
    for c in channels:
        # other channel builders: finance_url(time), entertainment_url(time),
        # military_url(time), technology_url(time), sports_url(time)
        url = purl.news_3url(c, time, top_show_num=100)
        news_detail_url = get_news_detail_url(url, out_path)
        result.update(news_detail_url)

    in_url = purl.integrated_channel(time, num)
    sports = purl.sports_url(time, num)
    cj = purl.finance_url(time, num)
    yl = purl.entertainment_url(time, num)
    kj = purl.technology_url(time, num)
    jc = purl.military_url(time, num)

    in_url = get_news_detail_url(in_url, out_path)
    sports = get_news_detail_url(sports, out_path)
    cj = get_news_detail_url(cj, out_path)
    yl = get_news_detail_url(yl, out_path)
    kj = get_news_detail_url(kj, out_path)
    jc = get_news_detail_url(jc, out_path)

    # result.update(in_url)
    result.update(sports)
    result.update(cj)
    result.update(yl)
    result.update(kj)
    result.update(jc)
    path = "F:/scrapy/sina_data1.1.0/news_detail_url/" + time + "/"
    name = "all_parsed.csv"
    # 数据保存 pd.Series(reslut).to_csv(file, index=False) pd.Series(in_url).to_csv(int_file, index=False)
    tmp = []
    for url, channel in result.values():
        tmp.append([url, channel])
    to_csv(path, name, tmp)
    tmp = []
    for url, channel in in_url.values():
        tmp.append([url, channel])
    to_csv(path, "integrated_parsed.csv", tmp)  # 新闻综合保存
    return result
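
The to_csv helper used above is project-local and not shown; whatever its definition, it has to match the (path, name, rows) call signature seen here. A minimal sketch consistent with that usage (the body is an assumption, not the project's actual code):

import csv
import os

def to_csv(path, name, rows):
    # Hypothetical sketch: write a list of rows into path/name as CSV.
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, name), 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(rows)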
Example #2
def make_submission():
    submission_dataset = pd.read_csv('test.csv')
    X_submission = submission_dataset.iloc[:, [i-1 for i in selected_features]].values
    ids = submission_dataset.iloc[:, 0].values

    print("replacing missing values")
    print("number of examples in test: "+str(len(X_submission[:, 0])))
    for i in range(len(X[0, :])):
        if i <= categorical_features_count:
            # for a categorical variable, the chosen strategy is to
            # replace missing values with the most frequent value
            (values, counts) = np.unique(X[:, i], return_counts=True)
            counts = [counts[k] if values[k] >= 0 else 0 for k in range(len(values))]
            ind = np.argmax(counts)
            column_ranges.append(max(values))
            replacement_value = values[ind]
        else:
            # otherwise, simply take the mean
            replacement_value = np.mean(X[:, i])

        for j in range(len(X_submission[:, i])):
            if X_submission[j, i] < -0.5:
                X_submission[j, i] = replacement_value

    y_submission = gbm.predict(X_submission, num_iteration=gbm.best_iteration)

    from tools import to_csv

    # Initialized past the expected [0, 1] range so the first comparison
    # always updates them.
    minimum = 1
    maximum = 0
    epsilon = 0.01

    for y_i in y_submission:
        if y_i < minimum:
            minimum = y_i
        if y_i > maximum:
            maximum = y_i

    y_submission = y_submission - minimum + epsilon
    y_submission = y_submission/(maximum - minimum)
    y_submission = y_submission/2

    to_csv(y_submission, ids)
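
The scan-and-rescale above can be written more compactly with NumPy. A vectorized equivalent (same arithmetic, assuming the raw predictions lie in [0, 1] as the minimum = 1 / maximum = 0 initializers suggest):

import numpy as np

minimum = float(np.min(y_submission))
maximum = float(np.max(y_submission))
y_submission = (y_submission - minimum + epsilon) / (maximum - minimum) / 2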
Example #3
def report(d, hypothesis, ppln_name, grid):

    row_range = list(range(len(list(grid.cv_results_.values())[0])))
    rows = [[] for _ in row_range]
    keys = []
    csv_name = '.'.join([hypothesis, d.__class__.__name__, ppln_name])
    for key, col_array in grid.cv_results_.items():
        if len(keys) < len(grid.cv_results_.keys()):
            keys.append(key)
        for i, cell in enumerate(col_array):
            rows[i].append(cell)

    rows = sorted(rows,
                  reverse=True,
                  key=lambda x: x[keys.index('mean_test_score')])

    to_pickle(grid, 'models/%s.p' % csv_name)

    to_csv([keys] + rows, 'reports/%s.csv' % csv_name)
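
grid.cv_results_ is a dict of parallel arrays (one element per parameter setting), so the loop above is transposing it into per-setting rows. For illustration, a compact equivalent using pandas (not what this project does, just the same result):

import pandas as pd

df = pd.DataFrame(grid.cv_results_).sort_values('mean_test_score', ascending=False)
rows = [list(df.columns)] + df.values.tolist()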
Example #4
def get_news_detail_url(index_url, out_path=None):
    '''
    :param index_url: URL of a channel index page
    :param out_path: directory for the raw HTML and parsed CSV (optional)
    :return: dict mapping news_id to (url, channel)
    '''
    channel = get_channel(url=index_url)  # get the news channel
    home_html = get_html(index_url).strip()  # fetch the channel index page
    news_data = parse_home_data(home_html)  # parse the news entries
    if out_path is not None:
        mkdir(out_path)
        save_data_txt(out_path, channel + "_resource.txt", home_html)  # save raw data
        to_csv(out_path, channel + "_parsed.csv", news_data)
    result = dict()
    for news in news_data:
        news_id = news[0]
        url = news[2]
        # tmp = {"news_id":news_id, "url":url, "channel":channel}
        result[str(news_id)] = (url, channel)

    return result
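
The returned mapping is what load_url in Example #1 merges across channels. A hypothetical call (the URL and output directory are invented for illustration):

detail = get_news_detail_url("https://example.com/news_index", out_path="out/2020-01-01/")
for news_id, (url, channel) in detail.items():
    print(news_id, channel, url)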
Example #5
def report(k, hypothesis, classifier, rows, keys, arg_val, grid):
    cls_name = classifier.__class__.__name__
    _rows = [[] for _ in range(len(list(grid.cv_results_.values())[0]))]
    cls_row = [cls_name for _ in range(len(list(grid.cv_results_.values())[0]))]
    k_row = [k for _ in range(len(list(grid.cv_results_.values())[0]))]
    for key, col_array in (
        [('k', k_row)]
        + [('classifier', cls_row)]
        + list(grid.cv_results_.items())
    ):
        if len(keys) < len(grid.cv_results_.keys()) + 2:
            keys.append(key)
        for i, cell in enumerate(col_array):
            _rows[i].append(cell)
    # Note: `rows += _rows` extends the caller's list in place, while the
    # `rows = sorted(...)` below rebinds only the local name, so the
    # caller's list stays unsorted.
    rows += _rows

    rows = sorted(
        rows,
        reverse=True,
        key=lambda x: x[keys.index('mean_test_score')]
    )
    
    best_f = rows[0][keys.index('mean_test_score')]
    if best_f > arg_val:
        print('>>>> NEW BEST:', rows[0])
        to_pickle(grid, 'models/%s.p' % hypothesis)
        arg_val = best_f
    
    to_csv(
        [keys] + rows,
        'reports/%s.csv' % hypothesis
    )
    to_csv(
        summarize_csv([keys] + rows, columns),
        'reports/%s.summary.csv' % hypothesis
    )
    
    return arg_val
Example #6
                _Xr = reducer.transform(_X)

                # Train stacked classifier
                stacked = stack()
                stacked.fit(_Xr, Y)

                # Predict with stacked classifier
                __X, __Y = list(zip(*d.test()))
                Y_ = stacked.predict(reducer.transform(vec.transform(__X)))

                p = precision(__Y, Y_)
                r = recall(__Y, Y_)
                f = f1(__Y, Y_)
                a = accuracy(__Y, Y_)
                row = (d.__class__.__name__, ppln_name, vec.__class__.__name__,
                       cls.__class__.__name__, stacked.__class__.__name__, p,
                       r, f, a)
                rows.append(row)
                to_csv([keys] +
                       sorted(rows, reverse=True, key=lambda x: x[-2]),  # x[-2] is f1
                       'reports/%s.csv' % hypothesis)

                if f > arg_val:
                    arg_val = f
                    model = {
                        'grid': grid,
                        'reducer': reducer,
                        'stacked': stacked
                    }
                    to_pickle(model, 'models/%s.p' % hypothesis)
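
precision, recall, f1 and accuracy here are thin project-local wrappers. With scikit-learn they could be defined as below (a sketch; the averaging mode is an assumption the project may set differently):

from sklearn.metrics import (accuracy_score, f1_score,
                             precision_score, recall_score)

def precision(y_true, y_pred):
    return precision_score(y_true, y_pred, average='macro')

def recall(y_true, y_pred):
    return recall_score(y_true, y_pred, average='macro')

def f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average='macro')

def accuracy(y_true, y_pred):
    return accuracy_score(y_true, y_pred)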
Example #7
                           reg_lambda=reg_lambda
                           #eval_metric=eval_metric
                           )
classifier.fit(X_train, y_train)
t2 = time.time()
print(t2 - t1)

# Predicting the Test set results
y_pred = classifier.predict_proba(X_test)[:, 1]
y_pred_train = classifier.predict_proba(X_train)[:, 1]

print("gini normalized score (train): ")
log_score = log_loss(y_train, y_pred_train)
print(log_score)

print("gini normalized score (test): ")
log_score = log_loss(y_test, y_pred)
print(log_score)

print("mean de y pred")
print(np.mean(y_pred))

evaluation_dataset = pd.read_csv('testing.csv')

X_eval = evaluation_dataset.iloc[:, 2:].values  # TODO: adjust this column slice!
y_pred_eval = classifier.predict_proba(X_eval)[:, 1]

msno = evaluation_dataset.iloc[:, 0].values

to_csv(y_pred_eval, msno)
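
Here to_csv is called as to_csv(predictions, ids), a different signature from Example #1; the snippets come from different projects. A minimal sketch matching this usage (file and column names are assumptions, e.g. for a Kaggle-style submission):

import pandas as pd

def to_csv(predictions, ids, filename='submission.csv'):
    # Hypothetical helper: write one (id, prediction) row per example.
    pd.DataFrame({'id': ids, 'target': predictions}).to_csv(filename, index=False)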
Example #8
y_train_pred = (y_train_pred_1 + y_train_pred_2 + y_train_pred_3) / 3

print("Gini train number 1: ")
print(gini_normalized(y_train, y_train_pred_1))
print("Gini train number 2: ")
print(gini_normalized(y_train, y_train_pred_2))
print("Gini train number 3: ")
print(gini_normalized(y_train, y_train_pred_3))
print("Gini train mean on all trees: ")
print(gini_normalized(y_train, y_train_pred))


y_test_pred = (clf_1.predict(X_test, raw_score=True) +
               clf_2.predict(X_test, raw_score=True) +
               clf_3.predict(X_test, raw_score=True)
               ) / 3

clf_1.save_model('clf_1.txt')
clf_2.save_model('clf_2.txt')
clf_3.save_model('clf_3.txt')

print(y_test_pred)
np.savetxt("y_test_pred", y_test_pred)

ids = test.iloc[:, 0].values

from tools import to_csv

to_csv(y_test_pred, ids)
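
gini_normalized is imported from elsewhere in the project. A widely used reference implementation (common in Kaggle competition kernels; shown here as an assumption about what the project uses):

import numpy as np

def gini(actual, pred):
    # Order by prediction descending (ties broken by original index),
    # then accumulate actual losses along the Lorenz curve.
    a = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=float)
    a = a[np.lexsort((a[:, 2], -1 * a[:, 1]))]
    total = a[:, 0].sum()
    g = a[:, 0].cumsum().sum() / total
    g -= (len(actual) + 1) / 2.0
    return g / len(actual)

def gini_normalized(actual, pred):
    # Normalize by the Gini of a perfect ordering.
    return gini(actual, pred) / gini(actual, actual)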

Example #9
            param_grid = param_grids[ppln_name]

            # 'GridSearchCV' defaults to stratified k-fold
            grid = GridSearchCV(ppln,
                                cv=CV,
                                n_jobs=N_JOBS,
                                verbose=VERBOSITY,
                                param_grid=param_grid,
                                refit=True)
            grid.fit(X, Y)

            #print(grid.best_estimator_.named_steps.items())

            report(d, hypothesis, ppln_name, grid)

            # Predict on the held-out split (note: X and Y are rebound
            # from the training data to the test data here)
            vec = grid.best_estimator_['vec']
            cls = grid.best_estimator_['cls']
            X, Y = list(zip(*d.test()))
            Y_ = cls.predict(vec.transform(X))

            p = precision(Y, Y_)
            r = recall(Y, Y_)
            f = f1(Y, Y_)
            a = accuracy(Y, Y_)
            row = (d.__class__.__name__, ppln_name, vec.__class__.__name__,
                   cls.__class__.__name__, p, r, f, a)
            rows.append(row)
            to_csv(rows, 'reports/%s.csv' % hypothesis)
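
param_grids maps each pipeline name to its parameter grid. Since the best estimator is indexed by the step names 'vec' and 'cls' above, a hypothetical entry could look like:

param_grids = {
    'tfidf_logreg': {  # hypothetical pipeline name
        'vec__ngram_range': [(1, 1), (1, 2)],
        'cls__C': [0.1, 1.0, 10.0],
    },
}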