Example #1
def create_file(key, binary=False, filter=True, nwords=50):
	from sklearn.feature_selection import chi2
	if key in tag_count.keys():
		labels = [(key in row) for row in tag_index]
	elif key in substance_count.keys():
		labels = [(key in row) for row in substance_index]
	
	if binary==False:
		chisq, p = chi2(data, labels)
	else:
		chisq, p = chi2(data>0, labels)
	ranking = np.argsort(chisq)[::-1]
	values = []
	for rank in ranking:
		values.append((chisq[rank],vocab[rank],p[rank]))
	
	filename = key
	if binary==True:
		filename+="_bin"
	if filter==False:
		filename+="_nof"
	filename+=".txt"
	
	print "Building " +  filename + ":"
	with open(path+"output/" + filename,"w") as file:
		j = 0
		for value in values:
			if j > nwords:
				return
				
			if filter==True:
				response = input("Use " + str(value) + "? (" + str(j) + " words so far) (y/n/x)")
			else:
				response = "y"
			
			if response == "y":	
				if binary==True:
					r = int(value[0]/10)
				else:
					r = int(value[0]/100)
				for i in range(r):
					file.write(value[1])
					file.write(" ")
				print "Wrote " + str(value[1]) + " " + str(r) + " times."
				j+=1
			elif response == "x":
				print "Finished " + filename + "."
				return
			else:
				continue
 def chi2(self):
     """ Compute chi-squared stats and P-value between each non-negative feature and class (target variable).
     The chi-square test measures dependence between stochastic variables,
     so using this function "weeds out" the features that are most likely to be independent of class
     and therefore irrelevant for classification.
     
     Parameters
     ----------
     X : (N,) array_like
     Sample vectors.
     y : (N,) array_like
     Target vector (class labels)
     
     Returns
     -------
     chi2_score - chi2 statistics of each feature
     pval_score - P-values of each feature
     
     """
     
     print "Compute Chi-Squared stats..." + str(time.now())
     
     score = chi2(self.X,self.y)
     # Features with the highest values for the test chi-squared statistic from X
     chi2_score = [0 if np.isnan(s) else s for s in score[0] ]
     
     # 'P-value'
     pval_score = score[1]
     return chi2_score, pval_score
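
# A minimal stand-alone sketch of what the wrapper above computes, using
# sklearn's chi2 directly on a tiny made-up count matrix (data and variable
# names here are illustrative only).
import numpy as np
from sklearn.feature_selection import chi2

X_demo = np.array([[1, 0, 3], [0, 2, 1], [2, 0, 4], [0, 3, 0]])
y_demo = np.array([1, 0, 1, 0])
chi2_score, pval_score = chi2(X_demo, y_demo)
print(chi2_score)   # higher scores -> stronger dependence on the class
print(pval_score)   # lower p-values -> less likely to be independent of the class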
def allele_cooccurence_pValue(all_1, all_2, y_drg, X_all, association_test):
    """
    Takes in two alleles, creates a cooccurence column, and computes the association of that
    column with the AMR phenotype.
    """
    two_allele_df = X_all.loc[:, [all_1, all_2]].copy()
    two_allele_df["cooccurence"] = two_allele_df[all_1] + two_allele_df[all_2]
    two_allele_df["cooccurence"][two_allele_df["cooccurence"] < 2] = 0
    two_allele_df["cooccurence"][two_allele_df["cooccurence"] == 2] = 1

    if association_test == "chi2":
        test_stats, pVals = chi2(two_allele_df, y_drg)
    elif association_test == "f_classif":
        test_stats, pVals = f_classif(two_allele_df, y_drg)

    allele_cooccurence_pVal_dict = {}
    for col_ind in range(len(two_allele_df.columns)):
        allele_cooccurence_pVal_dict.update({
            two_allele_df.columns[col_ind]: {
                "test_stat": test_stats[col_ind],
                "pVal": pVals[col_ind]
            }
        })

    return allele_cooccurence_pVal_dict
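
# A hypothetical call to allele_cooccurence_pValue above with a tiny made-up
# presence/absence table; assumes pandas and chi2/f_classif from
# sklearn.feature_selection are imported as in the snippet.
import pandas as pd

X_demo = pd.DataFrame({"alleleA": [1, 0, 1, 1], "alleleB": [1, 1, 0, 1]})
y_demo = pd.Series([1, 0, 0, 1])  # made-up AMR phenotype labels
result = allele_cooccurence_pValue("alleleA", "alleleB", y_demo, X_demo, "chi2")
print(result["cooccurence"])  # test statistic and p-value for the joint column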
Example #4
    def _fit_cooccurrence_vectorizer(self, x, classes, none_class, dataset):
        non_null_x = (d for d, c in zip(x, classes) if c != none_class)
        self.cooccurrence_vectorizer = CooccurrenceVectorizer(
            config=self.config.cooccurrence_vectorizer_config,
            builtin_entity_parser=self.builtin_entity_parser,
            custom_entity_parser=self.custom_entity_parser,
            resources=self.resources,
            random_state=self.random_state,
        )
        x_cooccurrence = self.cooccurrence_vectorizer.fit(non_null_x,
                                                          dataset).transform(x)
        if not self.cooccurrence_vectorizer.word_pairs:
            return self
        _, pval = chi2(x_cooccurrence, classes)

        top_k = int(self.config.added_cooccurrence_feature_ratio *
                    len(self.tfidf_vectorizer.idf_diag))

        # No selection if k is greater than or equal to the number of word pairs
        if top_k >= len(self.cooccurrence_vectorizer.word_pairs):
            return self

        top_k_cooccurrence_ix = np.argpartition(pval, top_k - 1,
                                                axis=None)[:top_k]
        top_k_cooccurrence_ix = set(top_k_cooccurrence_ix)
        top_word_pairs = [
            pair
            for pair, i in iteritems(self.cooccurrence_vectorizer.word_pairs)
            if i in top_k_cooccurrence_ix
        ]

        self.cooccurrence_vectorizer.limit_word_pairs(top_word_pairs)
        return self
Example #5
    def _fit_transform_tfidf_vectorizer(self, x, y, dataset):
        self.tfidf_vectorizer = TfidfVectorizer(
            config=self.config.tfidf_vectorizer_config,
            builtin_entity_parser=self.builtin_entity_parser,
            custom_entity_parser=self.custom_entity_parser,
            resources=self.resources,
            random_state=self.random_state,
        )
        x_tfidf = self.tfidf_vectorizer.fit_transform(x, dataset)

        if not self.tfidf_vectorizer.vocabulary:
            raise _EmptyDatasetUtterancesError(
                "Dataset is empty or with empty utterances")
        _, tfidf_pval = chi2(x_tfidf, y)
        best_tfidf_features = set(i for i, v in enumerate(tfidf_pval)
                                  if v < self.config.pvalue_threshold)
        if not best_tfidf_features:
            best_tfidf_features = set(idx for idx, val in enumerate(tfidf_pval)
                                      if val == tfidf_pval.min())

        best_ngrams = [
            ng for ng, i in iteritems(self.tfidf_vectorizer.vocabulary)
            if i in best_tfidf_features
        ]
        self.tfidf_vectorizer.limit_vocabulary(best_ngrams)
        # We can't return x_tfidf[:best_tfidf_features] because of the
        # normalization in the tfidf_vectorizer's transform; that would lead to
        # inconsistent results between fit_transform(x, y) and
        # fit(x, y).transform(x)
        return self.tfidf_vectorizer.transform(x)
Example #6
def pvals(request):
    from sklearn.feature_selection import chi2
    dataPath = request.GET.get('path')
    featureName = request.GET.get('feature')
    # print path
    # print feature_name
    df = pd.read_csv(dataPath)
    df.set_index('ID', inplace=True)
    df.fillna(value=0, inplace=True)
    cols = list(df.columns)
    cols.remove(featureName)
    # chi2 requires non-negative features, so shift any column that has negative values
    for col in cols:
        if df[col].min() < 0:
            df.loc[:, col] += -df[col].min()
    chi2val, pval = chi2(df[cols], df[featureName])

    PvalDict = [{
        'key': cols[i],
        'pvalue': (1.0 - pval[i]) * 100.0,
        'selected': False
    } for i in range(0, len(cols))]
    # print json.dumps(PvalDict)
    return HttpResponse(json.dumps(PvalDict))
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
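
# How the expected statistic of 4.0 for the first feature arises (a hand
# computation of sklearn's chi2 on the boundary-case data above):
#   class sums of feature 0: class 1 -> 10, class 0 -> 40, total 50
#   expected under independence: 50 * 1/3 = 16.67 (class 1), 50 * 2/3 = 33.33 (class 0)
#   chi2 = (10 - 16.67)**2 / 16.67 + (40 - 33.33)**2 / 33.33 = 2.67 + 1.33 = 4.0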
Example #8
def print_common_unigrams_bigrams():
    df['category_id'] = df['category'].factorize()[0]
    category_id_df = df[['category', 'category_id'
                         ]].drop_duplicates().sort_values('category_id')
    category_to_id = dict(category_id_df.values)

    tfidf = TfidfVectorizer(sublinear_tf=True,
                            min_df=5,
                            norm='l2',
                            encoding='latin-1',
                            ngram_range=(1, 2),
                            stop_words='english')

    labels = df.category
    features = tfidf.fit_transform(df.content).toarray()
    print(features.shape)
    N = 5
    Number = 1
    for category in df['category'].unique():
        features_chi2 = chi2(features, df['category'] == category)
        indices = np.argsort(features_chi2[0])
        feature_names = np.array(tfidf.get_feature_names())[indices]
        unigrams = [x for x in feature_names if len(x.split(' ')) == 1]
        bigrams = [x for x in feature_names if len(x.split(' ')) == 2]
        print(Number, "# '{}':".format(category))
        print("  . Most correlated unigrams:\n. {}".format('\n. '.join(
            unigrams[-N:])))
        print("  . Most correlated bigrams:\n. {}".format('\n. '.join(
            bigrams[-N:])))
        Number += 1
Example #9
    def fit(self, X, y):
        '''Calculates chi2 statistics for all features in X'''

        # Warnings are suppressed here because some features have zero occurrence
        # due to the train/test split (the feature matrix is generated for the
        # complete dataset, but the precompute vectorizer only returns rows for
        # documents in the training/test set). These features are taken care of
        # explicitly later.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            c2 = chi2(X, y)

        # Set p-value of nan feature to 1 and chi2 statistic to 0
        new_pval = [1.0 if np.isnan(x) else x for x in c2[1]]

        sorted_features = sorted([(pval, idx)
                                  for idx, pval in enumerate(new_pval)],
                                 key=lambda x: x[0],
                                 reverse=False)

        self.p_values = [
            x[0] for x in sorted(sorted_features, key=lambda x: x[1])
        ]
        top_features = sorted_features[:self.max_n_features]
        self.top_idxs = list(map(lambda x: x[1], top_features))
        return self
Example #10
    def compute_pvals(self, X, y):
        # TODO: export to stats_utils?
        is_y_binary = (len(np.unique(y)) == 2)
        # is_binary_feature = np.sum(((X != np.nanmin(X, axis=0)[np.newaxis, :]) &
        #                             (X != np.nanmax(X, axis=0)[np.newaxis, :])), axis=0) == 0
        is_binary_feature = areColumnsBinary(X)
        p_vals = np.zeros(X.shape[1])
        if is_y_binary:
            # Process non-binary columns:
            for i in np.where(~is_binary_feature)[0]:
                x0 = X.loc[y == 0, i]
                x1 = X.loc[y == 1, i]
                if self.is_linear:
                    _, p_vals[i] = stats.ttest_ind(x0, x1)
                else:
                    _, p_vals[i] = stats.ks_2samp(x0, x1)

            # Process binary features:
            _, p_vals[is_binary_feature] = feature_selection.chi2(X.loc[:, is_binary_feature], y)

        else:
            # Process non-binary features:
            _, p_vals[~is_binary_feature] = feature_selection.f_regression(X.loc[:, ~is_binary_feature], y)

            # Process binary features:
            y_mat = np.row_stack(y)
            for i in np.where(is_binary_feature)[0]:
                _, p_vals[i] = feature_selection.f_regression(y_mat, X.loc[:, i])
        return p_vals
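
# The dispatch above pairs each (target type, feature type) combination with a
# suitable test. A minimal stand-alone sketch of the same idea with made-up
# data (not the method itself), assuming scipy and sklearn are available:
import numpy as np
from scipy import stats
from sklearn import feature_selection

rng = np.random.RandomState(0)
y_bin = rng.randint(0, 2, 100)
x_cont = rng.randn(100)                 # continuous feature
x_bin = rng.randint(0, 2, 100)          # binary feature

_, p_cont = stats.ttest_ind(x_cont[y_bin == 0], x_cont[y_bin == 1])   # binary y, continuous x
_, p_bin = feature_selection.chi2(x_bin.reshape(-1, 1), y_bin)        # binary y, binary x
print(p_cont, p_bin[0])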
def logRegression():  # logistic regression worked on the other data set, but rimapc is too small a dataset, so it needs more entries for regression to go through
    ''' labels = terms, category = text '''
    keepTrack = 0
    trainText = []
    trainTerms = []
    df1 = pd.read_csv('trainingSet.csv', encoding="cp1252")
    df1['terms_id'] = df1['ClusterTerms'].factorize()[0]
    print(df1['terms_id'])
    terms_id_df = df1[['ClusterTerms','terms_id']].drop_duplicates().sort_values('terms_id')
    terms_to_id = dict(terms_id_df.values)
    id_to_category = dict(terms_id_df[['terms_id', 'ClusterTerms']].values)
    tfidfReg = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
    features = tfidfReg.fit_transform(df1['FullText']).toarray()
    labels = df1.terms_id
    print(features)
    features.shape
    N = 3
    for ClusterTerms, terms_id in sorted(terms_to_id.items()):
        features_chi2 = chi2(features, labels == terms_id)
        indices = np.argsort(features_chi2[0])
        feature_names = np.array(tfidfReg.get_feature_names())[indices]
        unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
        bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    models = [
        RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
        MultinomialNB(),
        LogisticRegression(random_state=0),
    ]
    model = LogisticRegression(random_state=0)
    X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df1.index, test_size=0.33, random_state=0)
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)
    y_pred = model.predict(X_test)
    conf_mat = confusion_matrix(y_test, y_pred)
Example #12
def logRegTrainAndPrintScore(trainer, labels, test, test_labels, c_range=[1], penalties=['l2'], chi2_features=None):
  if chi2_features:
    len_training=len(labels)
    test_train = scipy.sparse.vstack([trainer, test])
    test_train = scipy.sparse.csr_matrix(test_train)
    all_labels = np.append(labels, test_labels)
    chi2 = fs.chi2(test_train, all_labels)
    strong = []
    weak = []
    for i, p in enumerate(chi2[1]):
      if p < 0.1:
        weak.append(i)
        if p < .05:
          strong.append(i)
    strongFeatures = test_train[:,strong]
    weakFeatures = test_train[:,weak]
    if chi2_features == 'weak':
      trainer = weakFeatures[:len_training]
      test = weakFeatures[len_training:]
    else:
      trainer = strongFeatures[:len_training]
      test = strongFeatures[len_training:]

  for c in c_range:
    for penalty in penalties:
      clf = linear_model.LogisticRegression(C=c, penalty=penalty)
      clf.fit(trainer, labels)
      logging.info("-------- LogisticRegression via linear_model.LogisticRegression C =%f, Penalty = %s ------------" % (c, penalty))
      logging.info("--------PERFORMANCE ON TRAINING DATA------------")
      show_score(clf, trainer, labels)
      logging.info("----------PERFORMANCE ON TEST DATA--------------")
      show_score(clf, test, test_labels)
Example #13
def split_snp_chi2(X, y, snp, thresh=0.05):
    """
        input:  X, y datasets and the indices of informative SNPs,
                which are partitioned into weak and strong groups by p-value
        output: two groups of feature indices
    """
    def split(pval, snp_idx, thresh):
        """
            split index into 2 groups
        """
        weak_idx = []
        strong_idx = []

        for i in range(len(pval)):
            if pval[i] <= thresh:
                strong_idx.append(snp_idx[i])
            else:
                weak_idx.append(snp_idx[i])
        return (weak_idx, strong_idx)

    X_test = X[:, snp]
    _, pval = chi2(X_test, y)

    weak, strong = split(pval, snp, thresh)
    return (weak, strong)
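
# A hypothetical call to split_snp_chi2 above with made-up genotype data;
# assumes numpy and sklearn.feature_selection.chi2 are imported as in the snippet.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.randint(0, 3, size=(100, 50))   # 100 samples, 50 SNPs coded 0/1/2
y_demo = rng.randint(0, 2, size=100)
snp_demo = [3, 7, 11, 20]                    # indices of candidate informative SNPs
weak, strong = split_snp_chi2(X_demo, y_demo, snp_demo, thresh=0.05)
print("weakly associated:", weak, "strongly associated:", strong)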
Example #14
def do_chi_squared(X, y, chisq_f):
    scaler = preprocessing.MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    chisq_res, pval = feature_selection.chi2(X_scaled, y)

    features = [str(x) for x in range(len(chisq_res))]

    atts_chisq = []
    i = 0
    for att in features:
        atts_chisq.append((att, chisq_res[i], pval[i]))
        i += 1

    atts_chisq = sorted(atts_chisq, key=lambda tup: tup[1], reverse=True)
    sorted_atts = [x for x, y, z in atts_chisq]

    for att_i in range(len(atts_chisq) - 1):

        att, chisq_val, pval_val = atts_chisq[att_i]
        chisq_f.write("%s (%.2f,%.2f), " % (att, chisq_val, pval_val))

    att_i = len(features) - 1
    att, chisq_val, pval_val = atts_chisq[att_i]
    chisq_f.write("%s (%.2f,%.2f)\n" % (att, chisq_val, pval_val))

    return chisq_res, pval, sorted_atts
Example #15
def chi2_select(X, y):
    '''
    Feature selection using the chi-squared test.
    X: raw data
    y: labels
    dim: number of dimensions to select

    return: feature indices and the transformed training matrix X
    '''
    text_vecs = CountVectorizer(dtype=np.uint8)
    X = text_vecs.fit_transform(X)

    # apply TF-IDF weighting
    transformer = TfidfTransformer()
    X = transformer.fit_transform(X)

    vocabulary = text_vecs.vocabulary_
    index2word = dict(zip(vocabulary.values(), vocabulary.keys()))  # map feature index to word

    scores, p_val = chi2(X, y)
    scores = as_float_array(scores, copy=True)
    scores[np.isnan(scores)] = np.finfo(scores.dtype).min
    # feature indices sorted by chi2 score, from smallest to largest
    indexs = np.argsort(scores, kind="mergesort")

    bow = []
    for index in indexs:
        bow.append(index2word[index])
    return indexs, bow, X
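
# A hypothetical call to chi2_select above on a few toy documents; assumes the
# imports used by the snippet (CountVectorizer, TfidfTransformer, chi2,
# as_float_array, numpy) are available.
docs_demo = ["cheap pills online", "meeting agenda attached",
             "win cash now", "lunch next week"]
labels_demo = [1, 0, 1, 0]          # 1 = spam, 0 = ham (made-up labels)
indexs_demo, bow_demo, X_demo = chi2_select(docs_demo, labels_demo)
print(bow_demo[-5:])                # words most strongly associated with the labels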
Example #16
 def chi2(self, thread):
     """
     Return the chisquare test statistic and p-value for each word in thread
     """
     df = self.term_freq_dataframe(thread)
     x = chi2(df.T, list(df.columns))
     return x
Example #17
    def fit(self, x, y):
        chi_scores = chi2(x,y)
        p_values = pd.Series(chi_scores[1], index=range(x.shape[1]))
        p_values.sort_values(ascending = False , inplace = True)
        feature_names = p_values.index

        self.idx_sel =[v for i, v in enumerate(p_values.index) if p_values.iloc[i] >= self.th]
Example #18
def select_feature_chi(x_datavec,
                       label2index,
                       y_index,
                       countvec,
                       k,
                       if_print=False):
    '''the chi square method to select the feature, refer to:
    https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f'''
    #x_datavec, y_index=np.array(x_datavec),np.array(y_index).reshape(-1,1)
    k = min(x_datavec.shape[1], k)
    logging.info("Chi-square feature: %s" % k)
    skb = SelectKBest(chi2, k=k)
    x_datachi = skb.fit_transform(x_datavec, y_index)

    feature_ids = skb.get_support(indices=True)
    feature_names = countvec.get_feature_names()
    vocab_chi = {}

    for new_fid, old_fid in enumerate(feature_ids):
        feature_name = feature_names[old_fid]
        vocab_chi[feature_name] = new_fid

    for label, index in sorted(label2index.items()):
        features_chi2 = chi2(x_datavec,
                             [1 if i == index else 0 for i in y_index])
        indices = np.argsort(features_chi2[0])
        feature_names = np.array(countvec.get_feature_names())[indices]
        unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
        if if_print:
            print("# '{}':".format(label))
            print("  . Most correlated unigrams:\n. {}".format('\n. '.join(
                unigrams[-3:])))
    return x_datachi, vocab_chi
Example #20
def extract_keywords(wordcounts_per_document, selector, n=20):
    document_index = {}    # Maps int -> fileid (str).
    vocabulary = {}    # Maps int -> word (str).
    lookup = {}    # Reverse map for vocabulary (word (str) -> int).

    I = []
    J = []
    data = []
    labels = []

    for i, (key, counts) in enumerate(wordcounts_per_document.items()):
        document_index[i] = key
        for token, count in counts.items():
            if count < 3:
                continue
            j = lookup.get(token, len(vocabulary))
            vocabulary[j] = token
            lookup[token] = j

            I.append(i)
            J.append(j)
            data.append(count)
        labels.append('focal' if selector(key) else 'other')

    sparse_matrix = sparse.coo_matrix((data, (I, J)))

    keyness, _ = chi2(sparse_matrix, labels)
    ranking = np.argsort(keyness)[::-1]
    _, words = zip(*sorted(vocabulary.items(), key=lambda i: i[0]))
    words = np.array(words)
    keywords = words[ranking]
    return keywords[:n]
Example #21
    def getKbest(self, K=5):
        """
        Takes train_df as input and returns df with K best ngrams for each class
        """

        features, labels = self.do_tfidf()

        chi2_dict = dict()

        for intent, intent_id in sorted(self.intent_to_id.items()):
            features_chi2 = chi2(features, labels == intent_id)
            indices = np.argsort(features_chi2[0])
            feat_names = np.array(self.tfidf.get_feature_names())[indices]

            unigrams = [f for f in feat_names if len(f.split(' ')) == 1]
            bigrams = [f for f in feat_names if len(f.split(' ')) == 2]

            chi2_dict[intent] = dict()
            chi2_dict[intent]['top unigrams'] = str(
                unigrams[-K:]).strip("[]").replace("'", "")
            chi2_dict[intent]['top bigrams'] = str(
                bigrams[-K:]).strip("[]").replace("'", "")

        return pd.DataFrame.from_dict(chi2_dict).reindex(
            ['top unigrams', 'top bigrams'])
Example #23
    def divide_and_generate(self):
        q_all_list = []
        stop_words = self.stop_words_list("./chineseStopWords.txt")
        qa = self.qa.copy(deep=True)
        qa["Q_Clean"] = qa["QUESTION"].apply(self.remove_punctuation)
        qa["Q_D"] = qa["Q_Clean"].apply(lambda x: " ".join(
            [w for w in jb.lcut_for_search(x) if w not in stop_words]))
        qa["Q_Tag"] = qa["CLINIC"].apply(lambda x: self.clinic_code[x]
                                         if x in self.clinic_code else 0)
        # generate a word cloud
        self.word_cloud(qa)

        tf_idf = TfidfVectorizer(norm='l2', ngram_range=(1, 2))
        features = tf_idf.fit_transform(qa.Q_D)
        labels = qa.Q_Tag
        alpha_logger.info(features.shape)
        alpha_logger.info(features)

        N = 2
        for cli, cli_tag in self.clinic_code.items():
            features_chi2 = chi2(features, labels == cli_tag)
            indices = np.argsort(features_chi2[0])
            feature_names = np.array(tf_idf.get_feature_names())[indices]
            uni_grams = [v for v in feature_names if len(v.split(' ')) == 1]
            bi_grams = [v for v in feature_names if len(v.split(' ')) == 2]
            print("# '{}':".format(cli))
            print("  . Most correlated uni-grams:\n       . {}".format(
                '\n       . '.join(uni_grams[-N:])))
            print("  . Most correlated bi-grams:\n       . {}".format(
                '\n       . '.join(bi_grams[-N:])))

        alpha_logger.info("相关性展示")
        return qa, features, labels
Example #24
def preprocess():
    df = corpus_train.copy()
    df['comments'] = df['comments'].map(lambda x: preprocess_text(x))
    y_train = df["subreddits"].to_numpy()
    global vectorizer
    x_train = vectorizer.fit_transform(df['comments'])

    print(vectorizer.get_feature_names())
    global feature_names
    feature_names = vectorizer.get_feature_names()
    featname = vectorizer.get_feature_names()
    chi_squared, pval = chi2(x_train, y_train)
    featname = pd.DataFrame(featname)
    chi_2 = pd.DataFrame(chi_squared)
    pval = pd.DataFrame(pval)

    data = pd.concat([featname, chi_2, pval], axis=1)
    data.columns = ["word", "chi_squared", "pval"]
    data = data.sort_values("pval", axis=0)

    global to_delete
    to_delete = list(data.index[15000:])  # unigrams: keep the 10000-15000 highest-ranked
    all_cols = np.arange(x_train.shape[1])
    cols_to_keep = np.where(np.logical_not(np.in1d(all_cols, to_delete)))[0]
    x_train = x_train[:, cols_to_keep]

    scalar = MaxAbsScaler()
    x_train = scalar.fit_transform(x_train)
    return x_train, y_train
Example #25
def _weasel_fit(X, y, sfa_kwargs, chi2_threshold, window_size, window_step):
    n_samples, n_timestamps = X.shape

    n_windows = ((n_timestamps - window_size + window_step) // window_step)

    X_windowed = windowed_view(X,
                               window_size=window_size,
                               window_step=window_step)
    X_windowed = X_windowed.reshape(n_samples * n_windows, window_size)

    sfa = SymbolicFourierApproximation(**sfa_kwargs)

    y_repeated = np.repeat(y, n_windows)
    X_sfa = sfa.fit_transform(X_windowed, y_repeated)

    X_word = np.asarray(
        [''.join(X_sfa[i]) for i in range(n_samples * n_windows)])
    X_word = X_word.reshape(n_samples, n_windows)

    X_bow = np.asarray([' '.join(X_word[i]) for i in range(n_samples)])
    vectorizer = CountVectorizer(ngram_range=(1, 2))
    X_counts = vectorizer.fit_transform(X_bow)
    chi2_statistics, _ = chi2(X_counts, y)
    relevant_features = np.where(chi2_statistics > chi2_threshold)[0]

    return relevant_features, sfa, vectorizer, X_counts[:, relevant_features]
Example #26
def filter_feature_Chi2(feature_column, target_column, threshold=None):
    """Filter feature using Chi2 test
    :param: feature_column: numpy array containing feature
    :param: target_column: numpy array containing target variable
    :param: threshold: threshold to filter using Chi2.
                        threshold=None means all features with non Nan Chi2 pval will be returned.
    :return: feature_selected: features that are signficant
    :return: pval_significant: list of bool to indicate which features are significant
    """

    # Perform Chi2 test
    chi2_stats, pval = chi2(feature_column, target_column)

    # select only significant pvals
    pval_result = pval.copy()
    if threshold:
        pval_result[np.isnan(pval_result)] = 100  # replace Nan with any large value
        pval_significant = pval_result <= threshold
    else:
        pval_significant = np.logical_not(np.isnan(pval))

    # select features with significant pvals
    feature_selected = feature_column[:, pval_significant]

    return feature_selected, pval_significant
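
# A hypothetical call to filter_feature_Chi2 above with made-up count features;
# assumes numpy and sklearn.feature_selection.chi2 are imported as in the snippet.
import numpy as np

rng = np.random.RandomState(0)
feature_demo = rng.poisson(2.0, size=(50, 6))    # non-negative features
target_demo = rng.randint(0, 2, size=50)
selected, significant = filter_feature_Chi2(feature_demo, target_demo, threshold=0.05)
print("kept %d of %d features" % (selected.shape[1], feature_demo.shape[1]))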
def chi_square(vectors,d_labels,top_num):
	ans5,pval = chi2(vectors.toarray(),d_labels)
	sl =  heapq.nlargest(int(top_num*0.01*vectors.shape[1]), zip(ans5, itertools.count()))
	ans = []
	for item in sl:
		ans.append(item[1])
	return ans
Example #28
def score_features(X, y, V, args):
    """ FIXME: allow different options. """
    chis, pvals = chi2(X, y)
    # chis /= sum(chis)
    signs = compute_signs(X, y, V)
    signs = np.multiply(chis, signs)
    return signs
Example #29
def get_features_ranking(data, print_for_latex=False):
    scores, p_values = chi2(data.samples_x_features, data.class_labels)

    ranking = [(f_id, scores[f_id], p_values[f_id])
               for f_id in range(len(scores))]  # f_id = feature id (number)
    ranking.sort(key=lambda x: x[1], reverse=True)

    print('Ranking:')
    if not print_for_latex:
        print('No.\tchi2\tFeature name')

    i = 0
    for feature in ranking:
        (feature_number, score, p_value) = feature
        rounded_score = str(round(score, 3)).replace('.', ',')
        name = leukaemia_features[feature_number]
        i += 1
        if print_for_latex:
            print(f'{i}. & {feature_number+1} & {name} & {rounded_score} \\\\')
        else:
            print(f'{feature_number+1}\t{rounded_score}\t{name}')

    feature_ranking = Ranking(
        feature_ids=[ranking[i][0] for i in range(len(ranking))],
        chi2_scores=[ranking[i][1] for i in range(len(ranking))],
        p_values=[ranking[i][2] for i in range(len(ranking))],
        sorted_samples_x_features=__sort_by_score(
            data.samples_x_features, scores
        ),  # 'scores' is the unsorted list of chi2 results for every feature
        feature_count=len(ranking))

    return feature_ranking  # object containing sorted lists of features (an index in one list corresponds to the same index in the others)
def features_selection(X_train, y_train, X_test, y_test, sel_type="FI", count=30):    
    if sel_type == "FI":
        fitted_clf = xgb_fit(X_train, y_train, X_test, y_test)
        feature_importance = plot_importance(fitted_clf.feature_importances_, X_train.columns, 'Features Importance')
        feats = feature_importance[0][:count]
    elif sel_type == "HI":
        chi2_test = chi2(X_train, y_train)
        feature_importance = plot_importance(chi2_test[0], X_train.columns, 'Chi2')
        feats = feature_importance[0][:count]
    elif sel_type == "MI":
        mi = mutual_info_classif(X_train, y_train)
        feature_importance = plot_importance(mi, X_train.columns, 'Mutual_Info')
        feats = feature_importance[0][:count]
    elif sel_type == "RFE":
        # By optimal, not by count
        logit = LogisticRegression(random_state=42)
        selector = RFECV(estimator=logit, step=5, cv=StratifiedKFold(2), scoring='f1')
        selector.fit(X_train, y_train)
        feats = X_train.columns[selector.support_]
    elif sel_type == "PI":
        perm = PermutationImportance(fitted_clf, random_state=42).fit(X_train, y_train)
        res = pd.DataFrame(X_train.columns, columns=['feature'])
        res['score'] = perm.feature_importances_
        res['std'] = perm.feature_importances_std_
        feature_importance = res.sort_values(by='score', ascending=False).reset_index(drop=True)
        feats = feature_importance["feature"][0:count]
    else:
        print(f"Feature selection type {sel_type} not implemented.")
        return

    X_train = pd.DataFrame(X_train, columns=X_train.columns)[feats]
    X_test = pd.DataFrame(X_test, columns=X_test.columns)[feats]

    return X_train, y_train, X_test, y_test
Example #31
def Chi2(data, label, times, fold, cnt, tag, geneOrSite):
    select = "Chi2"
    print('Chi2 selection start...')
    X = data.T  # transpose so that rows correspond to the labelled samples
    # scale to the (0, 1) range
    minMaxScaler = preprocessing.MinMaxScaler()
    minMax = minMaxScaler.fit_transform(X)
    X = pd.DataFrame(minMax, columns=X.columns.values)
    print('after standardized...')
    print(X.head())
    (chi, pval) = chi2(X, label)
    res = pd.DataFrame({
        geneOrSite: data.index.tolist(),
        'chi2': chi
    }).sort_values(by='chi2', ascending=False)
    print('Chi2 result.head()')
    print(res.head())
    col = [geneOrSite, 'chi2']
    res.to_csv(tag + '_Chi2_rank.txt', sep="\t", index=None, columns=col)
    print('Chi2 selection finish...')
    print(select + " IFS validation start...")
    result = res.iloc[list(range(cnt))]
    cur_X = data.loc[result[geneOrSite]]
    cur_X.to_csv(tag + "_" + select + "_" + str(cnt) + "_result.csv")
    cur_X = cur_X.T  # transpose so that rows correspond to the labelled samples
    IFS_validation(cur_X, label, times, fold, select, tag)
    print(select + ' IFS validation finish...')
    print("------------------------")
def main():
    newsgoups = fetch_20newsgroups(subset='train', categories=['sci.crypt', 'talk.politics.guns'])

    vectorizer = CountVectorizer()
    vector = vectorizer.fit_transform(newsgoups.data, newsgoups.target)
    vocab = np.array(vectorizer.get_feature_names())
    print "number of positive examples:", np.sum(newsgoups.target)

    t0 = time.time()
    ig_scores, _ = ig(vector, newsgoups.target)
    print "Information Gain top 50  scored terms:"
    print vocab[np.argsort(ig_scores)][-50:]
    print "time: %.4f secs" % (time.time()-t0)

    t0 = time.time()
    bns_scores, _ = bns(vector, newsgoups.target)
    print "Bi-Normal Separation top 50  scored terms:"
    print vocab[np.argsort(bns_scores)][-50:]
    print "time: %.4f secs" % (time.time()-t0)

    t0 = time.time()
    chi2_scores, _ = chi2(vector, newsgoups.target)
    print "Chi Squared top 50  scored terms:"
    print vocab[np.argsort(chi2_scores)][-50:]
    print "time: %.4f secs" % (time.time()-t0)
def findFeatures(X,y,fileStorePath,fileName):

  print "Inside finding important features for the "+fileName+" data set"

  inputFile = pd.read_csv(X,header=0)
  input1 = np.array(inputFile)
  
  label = pd.read_csv(y,header=0)
  Y = np.array(label)
  chiVal,pVal = chi2(input1, Y)#Calculate the chi2 value

  indices = chiVal
  chi2List = []
  chi2Index = []

  for i, val in enumerate(pVal):
    if val<0.05:
      chi2List.append(chiVal[i])
      chi2Index.append(i)

  print(chi2Index)

  i=0
  if(fileName == "cad" or fileName == "health" or fileName == "science"):
    i=12
  elif(fileName == "grade"):
    i=16
  elif(fileName == "graduated"):
    i=14
  else:
    i=13
  if not os.path.exists(fileStorePath):
    os.makedirs(fileStorePath)
  if not os.path.exists(fileStorePath+'/FeatureImportancewith-P-value/'):
    os.makedirs(fileStorePath+'/FeatureImportancewith-P-value/')
  # Plot the feature importances of the forest
  plt.figure()
  plt.title("Feature importance - chi2 analysis")
  plt.bar(range(i), indices,
         color="r", align="center")
  plt.xticks(range(i))
  plt.xlim([-1, i])
  plt.ylabel('chi2 value')
  plt.xlabel('Features numbered(0 through %d)' % (i-1))
  plt.savefig(fileStorePath+'/'+fileName+'.png')
  plt.close()
  plt.clf()#clear the figure for next loop

  #Plot figures with p-values
  plt.figure()
  plt.title("chi2 analysis considering features with p-value less than 0.05")
  plt.bar(chi2Index, chi2List,
         color="r", align="center")
  plt.xticks(chi2Index)
  plt.xlim([-1, i])
  plt.ylabel('chi2 value')
  plt.xlabel('Only relevant features displayed')
  plt.savefig(fileStorePath+'/FeatureImportancewith-P-value/'+fileName+'.png')
  plt.close()
  plt.clf()#clear the figure for next loop
Example #34
def train_sentiment():
    print("training start")
    df = pd.read_csv(os.path.join(os.path.dirname(__file__), 'sentiments.csv'))
    df.head()

    col = ['Emotion', 'Review']
    df = df[col]
    df = df[pd.notnull(df['Review'])]
    df.columns = ['Emotion', 'Review']
    df['category_id'] = df['Emotion'].factorize()[0]
    category_id_df = df[['Emotion', 'category_id'
                         ]].drop_duplicates().sort_values('category_id')
    category_to_id = dict(category_id_df.values)
    id_to_category = dict(category_id_df[['category_id', 'Emotion']].values)
    df.head()

    tfidf = TfidfVectorizer(sublinear_tf=True,
                            min_df=1,
                            norm='l2',
                            encoding='latin-1',
                            ngram_range=(1, 2),
                            stop_words='english')
    features = tfidf.fit_transform(df.Review).toarray()
    labels = df.category_id
    features.shape

    N = 2
    for Product, category_id in sorted(category_to_id.items()):
        features_chi2 = chi2(features, labels == category_id)
        indices = np.argsort(features_chi2[0])
        feature_names = np.array(tfidf.get_feature_names())[indices]
        unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
        bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
        print("# '{}':".format(Product))
        print("  . Most correlated unigrams:\n. {}".format('\n. '.join(
            unigrams[-N:])))
        print("  . Most correlated bigrams:\n. {}".format('\n. '.join(
            bigrams[-N:])))

    X_train, X_test, y_train, y_test = train_test_split(df['Review'],
                                                        df['Emotion'],
                                                        random_state=0)

    #vector initialise
    count_vect = CountVectorizer()

    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    clf = MultinomialNB().fit(X_train_tfidf, y_train)

    # Save the vectorizer
    vec_file = 'amex/services/vectorizer.pickle'
    pickle.dump(count_vect, open(vec_file, 'wb'))

    with open('amex/services/sentiment_custom_classifier', 'wb') as picklefile:
        pickle.dump(clf, picklefile)

    return True
Example #35
def PreProcess3():

    trainBase = csv_io.read_data(
        "PreProcessData/training_PreProcess2_temp.csv", False)
    test = csv_io.read_data("PreProcessData/test_PreProcess2_temp.csv", False)

    target = [x[0] for x in trainBase]
    train = [x[1:] for x in trainBase]

    NumFeatures = 200

    #clf = RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini',compute_importances=True)
    chi = chi2(train, target)
    print "Training"
    #clf.fit(train, target)

    chi = SelectKBest(chi2, k=NumFeatures).fit(train, target)
    print(chi.get_support(indices=True))
    print(chi.transform(train), np.array(train)[:, [0]])

    return

    trainNew = []
    testNew = []

    print "Computing Importances"
    importances = clf.feature_importances_
    #print importances
    importancesTemp = sorted(importances, reverse=True)
    print(len(importancesTemp), "importances")

    if (len(importancesTemp) > NumFeatures):
        threshold = importancesTemp[NumFeatures]
        #print "Sorted and deleted importances"
        #print importancesTemp

        rowIndex = 0
        for row in train:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (impIndex == 0):
                    newRow.append(target[rowIndex])
                if (importance > threshold):
                    newRow.append(row[impIndex])
            trainNew.append(newRow)
            rowIndex += 1

        for row in test:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (importance > threshold):
                    #print impIndex, len(importances)
                    newRow.append(row[impIndex])
            testNew.append(newRow)

    csv_io.write_delimited_file("PreProcessData/training_PreProcess2_chi.csv",
                                trainNew)
    csv_io.write_delimited_file("PreProcessData/test_PreProcess2_chi.csv",
                                testNew)
Example #36
def kNNeighbours(x_train, y_train, x_test, k, r, w, v):
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_test = np.array(x_test)
    n_classes = len(np.unique(y_train))
    n_feat = x_train.shape[1]
    n_test = x_test.shape[0]

    # Set distance weights
    if w == 'eq':
        d_weights = np.ones([n_feat, 1])
    elif w == 'mi':
        d_weights = mutual_info_classif(x_train, y_train).reshape([n_feat, 1])
    elif w == 'chi':
        d_weights = chi2(x_train, y_train)[0].reshape([n_feat, 1])
    else:
        raise Exception(
            "Invalid distance weighting, choose eq for equal, mi for mutual information or chi for chi2"
        )
    # Calculate distances
    distances = np.zeros([n_test, k])
    knn_labels = np.zeros([n_test, k])
    indexes = np.zeros([n_test, k], dtype=np.int64)

    for i in range(n_test):
        # Get deltas for each feature
        delta = (abs(x_train - x_test[i]))**r
        # Get weighted sum and take r root
        wdelta = (np.matmul(delta, d_weights))**(1 / r)
        # Get indices of k lowest distances
        indexes[i, :] = np.squeeze(np.argsort(np.transpose(wdelta)))[:k]
        # Store distances and labels
        distances[i, :] = np.squeeze(wdelta[indexes[i, :]])
        knn_labels[i, :] = np.squeeze(y_train[indexes[i, :]])

    # Set voting weights
    if v == 'maj':
        v_weights = np.ones([n_test, k])
    elif v == 'inv':
        v_weights = 1 / np.maximum(distances, np.finfo(float).eps)
    elif v == 'shep':
        v_weights = np.exp(-distances)
    else:
        raise Exception(
            "Invalid voting weighting, choose maj for majority, inv for inverse distances or shep for Sheppard's work"
        )

    # Get voting scores
    scores = np.zeros([n_test, n_classes])
    for i in range(n_test):
        for j in range(n_classes):
            v_index = np.where(knn_labels[i, :] == j)
            scores[i, j] = np.sum(v_weights[i, v_index])

    # To break ties choose the lowest index. We do this like sklearn's implementation.
    preds = np.argmax(scores, axis=1)

    return distances, indexes, preds
def calc_chi2(X, Y, feature_type, level_type):
    name = "chi2_%s_%s" % (feature_type, level_type)
    if os.path.exists(name+".pkl"):
        return pickle.load(open(name+".pkl", 'rb'))
    else:
        res = chi2(X, Y)
        pickle.dump(res, open(name+".pkl", 'wb'))
        return res
 def _chi_select(self):
     """
     The chi-squared test checks whether two events are independent, ref: http://blog.csdn.net/yihucha166/article/details/50646615
     The larger the test statistic, the stronger the association.
     """
     from sklearn.feature_selection import chi2
     chi2_array = chi2(self.x, self.y)[0]
     self._get_top_k_ids(chi2_array)
Example #39
 def test_chi2(self):
     iris_dataset = load_iris()
     X = iris_dataset.data
     y = iris_dataset.target
     X = X.astype(int)
     res = chi2_measure(X, y)
     true_res = chi2(X, y)[0]
     np.testing.assert_allclose(res, true_res)
Example #40
 def feature_selection(self,mode='F'):
     
     print('Feature Selection...')
     print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
     
     X=self.train.copy()
     y=self.train_label['label'].values.copy()
     
     test=self.test.copy()
     
     if mode.upper()=='M':
         mi=mutual_info_classif(X,y)
     elif mode.upper()=='F':
         F,pval=f_classif(X,y)
     elif mode.upper()=='C':
         chi,pval=chi2(X,y)
     
     features=self.train.columns.copy()
     
     fs_features=features.copy().tolist()
     
     if mode.upper()=='M':
         fs_V=mi.copy().tolist()
     elif mode.upper()=='F':
         fs_V=F.copy().tolist()
     elif mode.upper()=='C':
         fs_V=chi.copy().tolist()
     
     if mode.upper()=='M':
         selector=SelectPercentile(mutual_info_classif,percentile=80)
     elif mode.upper()=='F':
         selector=SelectPercentile(f_classif,percentile=80)
     elif mode.upper()=='C':
         selector=SelectPercentile(chi2,percentile=80)
         
     X_new=selector.fit_transform(X,y)
     
     selected=selector.get_support()
     
     for i in range(len(features)):
         if selected[i]==False:
             t=features[i]
             fs_features.remove(t)
             
     fs_V=np.array(fs_V)
     fs_features=np.array(fs_features)
     
     self.train=pd.DataFrame(X_new,columns=fs_features.tolist())
     self.test=test[fs_features]
     
     self.fs_features=fs_features
     
     feas=pd.DataFrame()
     feas['feature']=fs_features
     
     print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
     
     return X_new,feas
Example #41
    def build(self, documents):
        X, Y = super(ChiSquaredDecorator, self).build(documents)
        x2, pval = feature_selection.chi2(X, Y)

        self.remove_lst = [
            i for i, val in enumerate(x2) if val < self.threshold]
        X_x2 = np.delete(X, self.remove_lst, axis=1)

        return X_x2, Y
Example #42
def chi_sq_filter(s, sense_ids, context_vectors):
	chi2_values, p_values = chi2(context_vectors, sense_ids)
	s_copy = s
	new_s = []

	for p_value, word in sorted(zip(p_values, s_copy), key=lambda d: d[0]):
		if p_value < 0.9:
			new_s.append(word)

	return new_s
def sorted_features(ufc_val, V, X, y, topN):
  iv = {v:k for k, v in V.items()}
  chi2_scores = chi2(X, y)[0]
  top_features = [(x[1], iv[x[0]], x[0]) 
    for x in sorted(enumerate(chi2_scores), 
    key=operator.itemgetter(1), reverse=True)]
  print "TOP 10 FEATURES FOR RATING:", ufc_val
  for top_feature in top_features[0:10]:
    print "%7.3f  %s (%d)" % (top_feature[0], top_feature[1], top_feature[2])
  return [x[1] for x in top_features]
def chi(vect,labels,benchmark=1):
    a, b = chi2(vect, labels)           # 'a' holds chi values, 'b' holds p values
    colnum = 0
    chivector = np.zeros((vect.shape[0], 1))
    for i in a:
        if i > benchmark:               # The term is selected only if it exceeds a benchmark  
            chivector = np.column_stack((chivector, vect[:, colnum]))
        colnum = colnum + 1             # The benchmark is not arbitrary
    chivector = chivector[:, 1:]        # A chi2 lookup table tells how much % independence is achieved
    return chivector                    # with what benchmark
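
# A hypothetical call to chi() above with a tiny dense term matrix (a benchmark
# of 1 keeps only terms whose chi2 statistic exceeds 1); the data is made up.
import numpy as np

vect_demo = np.array([[3, 0, 1], [0, 2, 1], [4, 0, 0], [0, 3, 1]])
labels_demo = np.array([1, 0, 1, 0])
print(chi(vect_demo, labels_demo, benchmark=1).shape)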
def compute_chi(dataset):
    dataset.normalize()
    # chi2() requires non-negative values
    features = np.rollaxis(dataset.data, 1)
    for i, feature in enumerate(features):
        features[i] = feature - feature.min()
    chi2s, ps = chi2(dataset.data, dataset.target)
    ps = [1 - x for x in ps]
    chis = list(map(np.sqrt, chi2s))
    return ps, chis
    def test_chi2(self):
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        result = df.feature_selection.chi2()
        expected = fs.chi2(iris.data, iris.target)

        self.assertEqual(len(result), 2)
        self.assert_numpy_array_almost_equal(result[0], expected[0])
        self.assert_numpy_array_almost_equal(result[1], expected[1])
Example #47
def get_n_top_features(X, c, n, feature_names):
    chi, F = chi2(X, c)
    clf = LogisticRegression(class_weight="balanced")
    clf.fit(X, c)
    coef_sign = clf.coef_[0] / np.abs(clf.coef_[0])
    signed_chi = chi * coef_sign
    counts = X.sum(0).tolist()[0]
    top_feats = [i for i in np.argsort(signed_chi)[::-1] if counts[i] > 1][:n]
    if feature_names is not None:
        print('top_feats=', feature_names[top_feats])
    return top_feats
def get_chi_scores(TDM, label_matrix, first_n):
    best_terms = []

    for i in range(0, len(label_matrix)):
        v = chi2(TDM, label_matrix[i])
        v = zip(range(0, len(v[0])), v[0], v[1])
        v = filter(lambda x: not(math.isnan(x[1])), v)

        best_terms.append(sorted(v, key=lambda x: x[1], reverse=True)[:first_n])

    return best_terms
Example #49
def test_chi2_unused_feature():
    # Unused feature should evaluate to NaN
    # and should issue no runtime warning
    clean_warning_registry()
    with warnings.catch_warnings(record=True) as warned:
        warnings.simplefilter('always')
        chi, p = chi2([[1, 0], [0, 0]], [1, 0])
        for w in warned:
            if 'divide by zero' in repr(w):
                raise AssertionError('Found unexpected warning %s' % w)
    assert_array_equal(chi, [1, np.nan])
    assert_array_equal(p[1], np.nan)
Example #50
def npfs_chi2(X, y, fpr=0.05, alpha=.01, n_bootstraps=100):
  """
  Parameters
  ----------
  X : array-like, shape = (n_samples, n_features_in)
      Sample vectors.
  
  y : array-like, shape = (n_samples,)
      Target vector (class labels).
  
  fpr : double
      False positive rate for the Chi2-test feature selection approach
  
  alpha : double
      Size of the hypothesis test for NPFS 
  
  n_bootstraps : int
      Number of bootstraps

  Returns
  -------
  selections : array 
      Vector of selected features. Length is variable.  
  """
  n_samp, n_feat = X.shape
  
  X = bin_data(X, n_bins=np.sqrt(n_samp))

  if n_samp != len(y):
    raise ValueError('len(y) and X.shape[0] must be equal.')

  bern_matrix = np.zeros((n_feat,n_bootstraps))

  for n in range(n_bootstraps):
    # generate a random sample
    idx = np.random.randint(0, n_samp, n_samp)
    chi, pval = chi2(1.0*X[idx], y[idx])
    sels = np.where(pval <= fpr)
    b_sels = np.zeros((n_feat,))
    b_sels[sels] = 1.
    bern_matrix[:, n] = b_sels

  delta = binom.ppf(1-alpha, n_bootstraps, fpr)
  z = np.sum(bern_matrix, axis=1)

  selections = []
  for k in range(n_feat):
    if z[k] > delta:
      selections.append(k)

  return selections, bern_matrix, delta  
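
# A minimal usage sketch for npfs_chi2 above with made-up data; assumes the
# helper bin_data and the numpy/scipy imports used by the snippet are available.
import numpy as np

rng = np.random.RandomState(0)
X_demo = np.abs(rng.randn(200, 20))      # 200 samples, 20 non-negative features
y_demo = rng.randint(0, 2, size=200)     # binary class labels
selections, bern_matrix, delta = npfs_chi2(X_demo, y_demo, fpr=0.05,
                                           alpha=0.01, n_bootstraps=50)
print("features retained by NPFS:", selections)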
Example #51
	def word_chisq(	key,
					reference=None,
					data=ndata,
					vocab=vocab,
					n=10,
					minwords=0,
					maxwords=1000,
					stops=[]):
		from sklearn.feature_selection import chi2
		if type(key) == str:
				key = [key]

		if stops == "custom":
			collect = []
			for k in key:
				collect += customstops[k]
			stops = collect

		if reference is None:
			refdata = data
			refs = [True for row in all_index]
			refs = np.array(refs)
		else:
			if type(reference) == str:
				reference = [reference]

			refs = [(True in [(k in row) for k in reference]) for row in all_index]
			refs = np.array(refs)
			refdata = data[refs,:]

		# restrict the label vector to the rows kept in refdata
		labels = [(True in [(k in row) for k in key]) for row in all_index]
		labels = np.array(labels)
		labels = labels[refs]

		chisq, p = chi2(refdata, labels)
		ranking = np.argsort(chisq)[::-1]
		values = []
		freqs = (refdata > 0)[labels,:].sum(axis=0)

		i = 0
		for rank in ranking:
			if i >= n:
				break

			if not np.isnan(chisq[rank]) and not freqs[:,rank]<minwords and not freqs[:,rank]>maxwords and vocab[rank] not in stops:
				values.append((chisq[rank],vocab[rank],p[rank],freqs[:,rank][0,0]))
				i+=1

		return values[0:n]
    def fit(self, dataset, utterances, classes):
        utterances_to_features = _get_utterances_to_features_names(
            dataset, self.language)
        normalized_utterances_to_features = defaultdict(set)
        for k, v in iteritems(utterances_to_features):
            normalized_utterances_to_features[
                _normalize_stem(k, self.language)].update(v)
        if self.unknown_words_replacement_string is not None \
                and self.unknown_words_replacement_string in \
                normalized_utterances_to_features:
            normalized_utterances_to_features.pop(
                self.unknown_words_replacement_string)
        self.entity_utterances_to_feature_names = dict(
            normalized_utterances_to_features)

        if all(not "".join(tokenize_light(q, self.language)) for q in
               utterances):
            return None
        preprocessed_utterances = self.preprocess_utterances(utterances)
        # pylint: disable=C0103
        X_train_tfidf = self.tfidf_vectorizer.fit_transform(
            preprocessed_utterances)
        # pylint: enable=C0103
        list_index_words = {
            self.tfidf_vectorizer.vocabulary_[word]: word
            for word in self.tfidf_vectorizer.vocabulary_
        }

        stop_words = get_stop_words(self.language)

        _, pval = chi2(X_train_tfidf, classes)
        self.best_features = [i for i, v in enumerate(pval) if
                              v < self.config.pvalue_threshold]
        if not self.best_features:
            self.best_features = [idx for idx, val in enumerate(pval) if
                                  val == pval.min()]

        feature_names = {}
        for utterance_index in self.best_features:
            feature_names[utterance_index] = {
                "word": list_index_words[utterance_index],
                "pval": pval[utterance_index]}

        for feat in feature_names:
            if feature_names[feat]["word"] in stop_words:
                if feature_names[feat]["pval"] > \
                        self.config.pvalue_threshold / 2.0:
                    self.best_features.remove(feat)

        return self
def chi2_filter(_df, features_to_test):
    ret_val = []
    X = _df[features_to_test].values
    Y = _df.Vote.values
    v = chi2(X, Y)[1]
    i = 0

    for c in features_to_test:
        if v[i] < alpha:
            print c + " selected by chi2 with p-value: " + str(v[i])
            ret_val.append(c)
        i += 1

    return ret_val
Example #54
    def rank_by_chi2(self, X, y):
        
        self.feature_count = self.count_features(X, y)
        
        chi2_scores = chi2(X, y)
                
        self.feature_scores = chi2_scores[0]
        
        nan_entries = np.nonzero(np.isnan(self.feature_scores))
        
        self.feature_scores[nan_entries] = 0
        
        feature_rank = np.argsort(self.feature_scores)[::-1]

        return self.classify_features(feature_rank)
def chi2_feature_test(X,y,feature_index):
	"""
	Performs the chi square test on the desired feature

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector
	feature_index - The selected feature (a zero-based index)
	"""

	feature_column=X[:,feature_index].reshape(-1,1)
	min_val=feature_column.min()
	if min_val<0:
		feature_column=feature_column+min_val*-1+1
	return chi2(feature_column,y)
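
# A minimal usage sketch for chi2_feature_test above with made-up data; assumes
# numpy and sklearn.feature_selection.chi2 are imported as in the snippet.
import numpy as np

X_demo = np.array([[1.0, -2.0], [3.0, 0.5], [2.0, -1.5], [0.0, 4.0]])
y_demo = np.array([0, 1, 0, 1])
score, pval = chi2_feature_test(X_demo, y_demo, feature_index=1)
print("chi2 score:", score, "p-value:", pval)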
Example #56
def feature_select(X, y, c, rand, feature_names):
    """ Find the highest chi2 feature for class c and remove it from the classifier."""
    #X = scale_X(X)
    chi, F = chi2(X, c)
    clf = LogisticRegression(class_weight="balanced")
    clf.fit(X, c)
    coef_sign = clf.coef_[0] / np.abs(clf.coef_[0])
    signed_chi = chi * coef_sign
    counts = X.sum(0).tolist()[0]
    top_feats = [i for i in np.argsort(signed_chi)[::-1] if counts[i] > 1][:1]
    if feature_names is not None:
        print('top_feats=', feature_names[top_feats])
    X2 = copy.copy(X)
    X2[:,top_feats] = 0.  # Set top feature to 0
    clf.fit(X2, y)
    return clf
Example #57
def slice_chisq(d, tindex, sindex, key, n=10, stops=True):
	from sklearn.feature_selection import chi2
	if key in tag_count.keys():
		labels = [(key in row) for row in tindex]
	elif key in substance_count.keys():
		labels = [(key in row) for row in sindex]
		
	chisq, p = chi2(d, labels)
	ranking = np.argsort(chisq)[::-1]
	values = []
	for rank in ranking:
		if key in substance_count.keys() and vocab[rank] in substops and stops==True:
			continue
		elif not np.isnan(chisq[rank]):
			values.append((chisq[rank],vocab[rank],p[rank]))
	return values[0:n]
def test_NB(ds_name='enron1'):
	'''
	test naive bayes
	'''

	start = time()
	ratio = 0.7
	spam, ham = get_words_list(ds_name)
	random.shuffle(spam)
	random.shuffle(ham)

	train_spam_div = int(ratio*len(spam))
	train_ham_div = int(ratio*len(ham))

	train_set = [(i, 1) for i in spam[:train_spam_div]] + [(j, 0) for j in ham[:train_ham_div]]
	test_set = [(i, 1) for i in spam[train_spam_div:]] + [(j, 0) for j in ham[train_ham_div:]]
	
	words_list = spam[:train_spam_div] + ham[:train_ham_div]

	vocab_list = get_feature_dict(words_list)
	test_vocab_list = get_feature_dict(spam[train_spam_div:]+ham[train_ham_div:])

	train_vec, train_class = get_files_vec(vocab_list, array(train_set))

	# use the chi-square feature selection method to rank features;
	# sklearn's chi2() returns (chi2 statistics, p-values)
	chi_scores, p_values = feature_selection.chi2(train_vec, train_class)

	updated_vocab_list = [i[1] for i in sorted(zip(chi_scores, vocab_list), reverse=True)][:2000]

	updated_train_vec, train_class = get_files_vec(updated_vocab_list, array(train_set))

	updated_test_vec, test_class = get_files_vec(updated_vocab_list, array(test_set))

	spam_vec, ham_vec = train_NB(array(updated_train_vec), array(train_spam_div))
        
	p_abusive = float(train_spam_div)/(train_spam_div+train_ham_div)

	classify_NB(array(updated_test_vec), array(test_class), array(spam_vec), array(ham_vec), p_abusive)
	print('feature_lose = ', 1 - float(len(vocab_list))/len(set(vocab_list+test_vocab_list)))
	print(time() - start, 'seconds')
Example #59
def do_c_study(c_range, filter_corr_diff, data, ntrials, rand, size, n):
    test_biases = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
    train_biases = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
    
    x2, pval = chi2(data.train_x, data.train_c)
    top_ft_idx = np.argsort(x2)[::-1][:n]
    print('%d top correlated features: %s' % (n, data.feature_names[top_ft_idx]))
    top_fts = list(zip(np.hstack([data.feature_names[top_ft_idx],
                                  ['c=0', 'c=1']]),
                       np.hstack([top_ft_idx,
                                  [-2, -1]])))
    c_values = []
    accuracies = defaultdict(list)
    coefs = defaultdict(lambda:defaultdict(list))
    
    for train_bias in train_biases:
        for test_bias in test_biases:
            for ti in range(ntrials):
                # Sample training and testing indices.
                test_idx = make_confounding_data(X=data.test_x, y=data.test_y, c=data.test_c,
                                                pos_prob=.5, bias=test_bias, size=size, rand=rand)  
                test_corr = pearsonr(data.test_y[test_idx], data.test_c[test_idx])[0]
                train_idx = make_confounding_data(X=data.train_x, y=data.train_y, c=data.train_c,
                                                  pos_prob=.5, bias=train_bias, size=size, rand=rand)   
                train_corr = pearsonr(data.train_y[train_idx], data.train_c[train_idx])[0]
                corr_diff = round(train_corr - test_corr, 1)
                if not filter_corr_diff(corr_diff):
                    continue
                if ti == 0:
                    #corr_diffs.append(corr_diff)
                    print('train_bias=', train_bias, 'train_corr=', train_corr,
                          'test_bias=', test_bias, 'test_corr=', test_corr,
                          'corr_diff=', corr_diff)
                    
                # Train and test each model.
                for c_val in c_range:
                    name = 'BA C=%f' % c_val
                    ba = backdoor_adjustment_var_C(data.train_x[train_idx], data.train_y[train_idx],
                                                   data.train_c[train_idx], rand, data.feature_names, c_val)
                    y_pred = ba.predict(data.test_x[test_idx])
                    y_true = data.test_y[test_idx]
                    accuracies[c_val].append(accuracy_score(y_true, y_pred))
                    ba_coefs = scale(ba.coef_[0])
                    for ft_name, ft_idx in top_fts:
                        coefs[ft_name][c_val].append(ba_coefs[ft_idx])
    return accuracies, coefs
Example #60
def select_feature(trainfilename, testfilename):
    def returnCHI(X, y):
        return chivalue
    X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename))
    
    featureNum = X_train.get_shape()[1]
    chivalue = chi2(X_train, y_train)

    step = featureNum // 20
    for i in range(1, 21):
        selectNum = step * i
        print "selecting", selectNum, "features"
        selector = SelectKBest(chi2, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new= selector.transform(X_test)
        sklearn.datasets.dump_svmlight_file(X_train_new, y_train, trainfilename + '_' + str(selectNum), zero_based = False)
        sklearn.datasets.dump_svmlight_file(X_test_new, y_test, testfilename + '_' + str(selectNum), zero_based = False)