Example #1
0
                                         binary=True,
                                         lowercase=True,
                                         stop_words=None,
                                         ngram_range=(1, ngram))
            X1 = count_vect.fit_transform(dataset1["data"])
            y1 = dataset1["target"]
            # print("finish", "transform")

            # feature-level
            X2 = dataset2["data"]
            y2 = dataset2["target"]

            y = y1
            if feature < X1.shape[1]:
                X1 = SelectKBest(chi2, k=feature).fit_transform(X1, y)
            X1 = X1.todense()
            # print("finish", "Kbest")
            X = np.concatenate((X1, np.matrix(X2)), axis=1)
            # print("finish", "append")

            #for c in Cs:
            key = " ".join(
                ["feature",
                 str(feature), "c",
                 str(10), "ngram",
                 str(ngram)])
            try:
                clf = tree.DecisionTreeClassifier()
                # clf = LogisticRegression(multi_class='ovr', C=10)
                # clf = svm.SVC(C=c, kernel='linear')
                scores = cross_val_score(clf, X, y, cv=10, n_jobs=1, verbose=0)
    # Cross-category evaluation: train a Ridge regressor on the origin
    # category, then report its R^2 on every category in data_text_x.
    #
    # The vectorizer, the feature selector and the model depend only on
    # origin_category, so they are fitted once, before the loop, instead of
    # being rebuilt on every iteration.
    count_vect = CountVectorizer(min_df=0,
                                 max_df=9999,
                                 binary=True,
                                 lowercase=True,
                                 stop_words=None,
                                 ngram_range=(1, 5))
    y = data_y[origin_category]
    x_text = count_vect.fit_transform(data_text_x[origin_category])

    # Cap k at the vocabulary size so a small corpus does not ask
    # SelectKBest for more columns than exist.
    selector = SelectKBest(chi2, k=min(5000, x_text.shape[1]))
    x_text = selector.fit_transform(x_text, y)

    x_stat = data_stat_x[origin_category]

    # Plain ndarrays rather than np.matrix: np.matrix is deprecated in NumPy
    # and rejected as estimator input by recent scikit-learn releases.
    x = np.concatenate((x_text.toarray(), np.asarray(x_stat)), axis=1)

    # regression model
    reg = linear_model.Ridge(alpha=1.0)
    reg.fit(x, y)

    for target_category in data_text_x:
        y2 = data_y[target_category]

        # Only *transform* the target text with the vectorizer and selector
        # fitted on the origin category. Re-fitting them here would build a
        # different vocabulary / column selection, so column i of x2 would no
        # longer mean the same n-gram as column i of the training matrix and
        # the score below would be meaningless.
        x_text2 = count_vect.transform(data_text_x[target_category])
        x_text2 = selector.transform(x_text2)
        x_stat2 = data_stat_x[target_category]
        x2 = np.concatenate((x_text2.toarray(), np.asarray(x_stat2)), axis=1)

        # Coefficient of determination of the origin-trained model on the
        # target category (the training-set score when both are the same).
        r_square = reg.score(x2, y2)

        print(origin_category, target_category, str(r_square))