Example 1
def test_set_feature_union_steps():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ["x2"]
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ["x3"]
    mult5 = Mult(5)
    mult5.get_feature_names = lambda: ["x5"]

    ft = FeatureUnion([("m2", mult2), ("m3", mult3)])
    assert_array_equal([[2, 3]], ft.transform(np.asarray([[1]])))
    assert_equal(["m2__x2", "m3__x3"], ft.get_feature_names())

    # Directly setting attr
    ft.transformer_list = [("m5", mult5)]
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(["m5__x5"], ft.get_feature_names())

    # Using set_params
    ft.set_params(transformer_list=[("mock", mult3)])
    assert_array_equal([[3]], ft.transform(np.asarray([[1]])))
    assert_equal(["mock__x3"], ft.get_feature_names())

    # Using set_params to replace single step
    ft.set_params(mock=mult5)
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(["mock__x5"], ft.get_feature_names())
Example 2
def test_set_feature_union_step_none():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    ft.set_params(m2=None)
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert_equal(['m3__x3'], ft.get_feature_names())

    ft.set_params(m3=None)
    assert_array_equal([[]], ft.fit(X).transform(X))
    assert_array_equal([[]], ft.fit_transform(X))
    assert_equal([], ft.get_feature_names())

    # check we can change back
    ft.set_params(m3=mult3)
    assert_array_equal([[3]], ft.fit(X).transform(X))
Example 3
def test_set_feature_union_steps():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    mult5 = Mult(5)
    mult5.get_feature_names = lambda: ['x5']

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.transform(np.asarray([[1]])))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    # Directly setting attr
    ft.transformer_list = [('m5', mult5)]
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(['m5__x5'], ft.get_feature_names())

    # Using set_params
    ft.set_params(transformer_list=[('mock', mult3)])
    assert_array_equal([[3]], ft.transform(np.asarray([[1]])))
    assert_equal(['mock__x3'], ft.get_feature_names())

    # Using set_params to replace single step
    ft.set_params(mock=mult5)
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(['mock__x5'], ft.get_feature_names())
Example 4
def test_set_feature_union_steps():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    mult5 = Mult(5)
    mult5.get_feature_names = lambda: ['x5']

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.transform(np.asarray([[1]])))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    # Directly setting attr
    ft.transformer_list = [('m5', mult5)]
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(['m5__x5'], ft.get_feature_names())

    # Using set_params
    ft.set_params(transformer_list=[('mock', mult3)])
    assert_array_equal([[3]], ft.transform(np.asarray([[1]])))
    assert_equal(['mock__x3'], ft.get_feature_names())

    # Using set_params to replace single step
    ft.set_params(mock=mult5)
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(['mock__x5'], ft.get_feature_names())
Example 5
def test_set_feature_union_step_none():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    ft.set_params(m2=None)
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert_equal(['m3__x3'], ft.get_feature_names())

    ft.set_params(m3=None)
    assert_array_equal([[]], ft.fit(X).transform(X))
    assert_array_equal([[]], ft.fit_transform(X))
    assert_equal([], ft.get_feature_names())

    # check we can change back
    ft.set_params(m3=mult3)
    assert_array_equal([[3]], ft.fit(X).transform(X))
Example 6
def test_set_feature_union_step_drop(drop):
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert ['m2__x2', 'm3__x3'] == ft.get_feature_names()

    ft.set_params(m2=drop)
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert ['m3__x3'] == ft.get_feature_names()

    ft.set_params(m3=drop)
    assert_array_equal([[]], ft.fit(X).transform(X))
    assert_array_equal([[]], ft.fit_transform(X))
    assert [] == ft.get_feature_names()

    # check we can change back
    ft.set_params(m3=mult3)
    assert_array_equal([[3]], ft.fit(X).transform(X))

    # Check 'drop' step at construction time
    ft = FeatureUnion([('m2', drop), ('m3', mult3)])
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert ['m3__x3'] == ft.get_feature_names()
Example 7
def test_feature_union_get_feature_names_deprecated():
    """Check that get_feature_names is deprecated"""
    msg = "get_feature_names is deprecated in 1.0"
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ["x2"]

    ft = FeatureUnion([("m2", mult2)])
    with pytest.warns(FutureWarning, match=msg):
        ft.get_feature_names()
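get_feature_names was deprecated in scikit-learn 1.0 and removed in later releases; recent code uses get_feature_names_out instead. A minimal sketch of the replacement call (the corpus here is illustrative):

from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the pizza beer", "the coke burger"]
ft = FeatureUnion([("words", CountVectorizer())]).fit(docs)
# Names are still prefixed with the transformer name, e.g. "words__pizza".
print(ft.get_feature_names_out())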
Example 8
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert "chars__" in feat or "words__" in feat
    assert len(feature_names) == 35

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])

    msg = re.escape("Transformer tr1 (type Transf) does not provide get_feature_names")
    with pytest.raises(AttributeError, match=msg):
        ft.get_feature_names()
Example 9
def train_model(trainset):
	word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2,2), binary = False, max_features= 2000,min_df=1,decode_error="ignore")
#	print word_vector	
	print "works fine"
	char_vector = TfidfVectorizer(ngram_range=(2,3), analyzer="char", binary = False, min_df = 1, max_features = 2000,decode_error= "ignore")
	vectorizer =FeatureUnion([ ("chars", char_vector),("words", word_vector) ])
	corpus = []
	classes = []

	for item in trainset:
		corpus.append(item['text'])
		classes.append(item['label'])

	print "Training instances : ", 0.8*len(classes)
	print "Testing instances : ", 0.2*len(classes) 
	
	matrix = vectorizer.fit_transform(corpus)
	print "feature count : ", len(vectorizer.get_feature_names())
	print "training model"
	X = matrix.toarray()
	y = numpy.asarray(classes)
	model =LinearSVC()
	X_train, X_test, y_train, y_test= train_test_split(X,y,train_size=0.8,test_size=.2,random_state=0)
	y_pred = OneVsRestClassifier(model).fit(X_train, y_train).predict(X_test)
	#y_prob = OneVsRestClassifier(model).fit(X_train, y_train).decision_function(X_test)
	#print y_prob
	#con_matrix = []
	#for row in range(len(y_prob)):
	#	temp = [y_pred[row]]	
	#	for prob in y_prob[row]:
	#		temp.append(prob)
	#	con_matrix.append(temp)
	#for row in con_matrix:
	#	output.write(str(row)+"\n")
	#print y_pred		
	#print y_test
	
	res1=[i for i, j in enumerate(y_pred) if j == 'anonEdited']
	res2=[i for i, j in enumerate(y_test) if j == 'anonEdited']
	reset=[]
	for r in res1:
		if y_test[r] != "anonEdited":
			reset.append(y_test[r])
	for r in res2:
		if y_pred[r] != "anonEdited":
			reset.append(y_pred[r])
	
	
	output=open(sys.argv[2],"w")
	for suspect in reset:
		output.write(str(suspect)+"\n")	
	cm = confusion_matrix(y_test, y_pred)
	print(cm)
	pl.matshow(cm)
	pl.title('Confusion matrix')
	pl.colorbar()
	pl.ylabel('True label')
	pl.xlabel('Predicted label')
	pl.show()
	print accuracy_score(y_pred,y_test)
Example 10
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
Example 11
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
Example 12
class FeatureUnionDataFrame(TransformerMixin):
    def __init__(self, *args, **kwargs):
        self.fu = FeatureUnion(*args, **kwargs)

    def fit(self, X, y=None, **kwargs):
        self.fu.fit(X, y, **kwargs)
        return self

    def transform(self, X, y=None, **fit_params):
        return pd.DataFrame(self.fu.transform(X), columns=self.fu.get_feature_names())

    def get_feature_names(self):
        return self.fu.get_feature_names()

    def set_params(self, **kwargs):
        self.fu.set_params(**kwargs)

    def get_params(self, deep=False):
        return self.fu.get_params(deep)
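A hypothetical usage of this wrapper, assuming a scikit-learn version where get_feature_names is still available and a transformer with dense output (DictVectorizer(sparse=False) here, because pandas cannot build a DataFrame directly from a sparse matrix):

import pandas as pd
from sklearn.feature_extraction import DictVectorizer

records = [{"a": 1.0, "b": 2.0}, {"a": 3.0, "b": 0.5}]
fudf = FeatureUnionDataFrame([("dv", DictVectorizer(sparse=False))])
frame = fudf.fit(records).transform(records)
print(frame.columns.tolist())  # ['dv__a', 'dv__b']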
Example 13
def test_set_feature_union_step_drop():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ["x2"]
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ["x3"]
    X = np.asarray([[1]])

    ft = FeatureUnion([("m2", mult2), ("m3", mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert ["m2__x2", "m3__x3"] == ft.get_feature_names()

    with pytest.warns(None) as record:
        ft.set_params(m2="drop")
        assert_array_equal([[3]], ft.fit(X).transform(X))
        assert_array_equal([[3]], ft.fit_transform(X))
    assert ["m3__x3"] == ft.get_feature_names()
    assert not record

    with pytest.warns(None) as record:
        ft.set_params(m3="drop")
        assert_array_equal([[]], ft.fit(X).transform(X))
        assert_array_equal([[]], ft.fit_transform(X))
    assert [] == ft.get_feature_names()
    assert not record

    with pytest.warns(None) as record:
        # check we can change back
        ft.set_params(m3=mult3)
        assert_array_equal([[3]], ft.fit(X).transform(X))
    assert not record

    with pytest.warns(None) as record:
        # Check 'drop' step at construction time
        ft = FeatureUnion([("m2", "drop"), ("m3", mult3)])
        assert_array_equal([[3]], ft.fit(X).transform(X))
        assert_array_equal([[3]], ft.fit_transform(X))
    assert ["m3__x3"] == ft.get_feature_names()
    assert not record
Example 14
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert "chars__" in feat or "words__" in feat
    assert_equal(len(feature_names), 35)

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])
    assert_raise_message(
        AttributeError, 'Transformer tr1 (type Transf) does not provide '
        'get_feature_names', ft.get_feature_names)
Example 15
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])
    assert_raise_message(
        AttributeError, 'Transformer tr1 (type Transf) does not provide '
        'get_feature_names', ft.get_feature_names)
Example 16
def run():
    df = pd.read_csv("../input/train.csv", usecols=["description", "title"])
    df_test = pd.read_csv("../input/test.csv",
                          usecols=["description", "title"])
    df = pd.concat([df, df_test], axis=0)

    cleanup = Cleanup()

    df["title"] = df["title"].fillna("").apply(lambda x: cleanup.process2(x))
    df["description"] = (
        df["description"].fillna("").apply(lambda x: cleanup.process2(x)))

    tfidf_para = {
        "stop_words": set(stopwords.words("russian")),
        "analyzer": "word",
        "token_pattern": r"\w{1,}",
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": "l2",
        # "min_df": .05,
        # "max_df": .9,
        "smooth_idf": False,
    }

    vectorizer = FeatureUnion([
        (
            "description",
            TfidfVectorizer(ngram_range=(1, 2),
                            max_features=17000,
                            **tfidf_para,
                            preprocessor=get_col("description")),
        ),
        (
            "title",
            CountVectorizer(
                ngram_range=(1, 2),
                stop_words=set(stopwords.words("russian")),
                preprocessor=get_col("title"),
            ),
        ),
    ])

    vectorizer.fit(df.to_dict("records"))
    out_df = vectorizer.transform(df.to_dict("records"))
    vocab = vectorizer.get_feature_names()

    with open("../cache/feature_tfidf_names.pkl", "wb") as f:
        pickle.dump(vocab, f)

    return out_df
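The run() function above assumes a get_col helper used as each vectorizer's preprocessor; it is defined in the next example, but a minimal version is repeated here for reference:

def get_col(col_name):
    # Pick a single field out of each record dict produced by
    # df.to_dict("records"), so each sub-vectorizer only sees its column.
    return lambda x: x[col_name]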
Example 17
def data_vectorize(df):
    russian_stop = set(stopwords.words("russian"))
    tfidf_para = {
        "stop_words": russian_stop,
        "analyzer": "word",
        "token_pattern": r"\w{1,}",
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": "l2",
        #"min_df":5,
        #"max_df":.9,
        "smooth_idf": False
    }

    def get_col(col_name):
        return lambda x: x[col_name]

    vectorizer = FeatureUnion([
        ("description",
         TfidfVectorizer(ngram_range=(1, 2),
                         max_features=36000,
                         **tfidf_para,
                         preprocessor=get_col("description"))),
        ("title_description",
         TfidfVectorizer(ngram_range=(1, 2),
                         max_features=200000,
                         **tfidf_para,
                         preprocessor=get_col("title_description"))),
        ("text_feature",
         CountVectorizer(ngram_range=(1, 2),
                         preprocessor=get_col("text_feature"))),
        ("title",
         TfidfVectorizer(ngram_range=(1, 2),
                         **tfidf_para,
                         preprocessor=get_col("title"))),
    ])
    vectorizer.fit(df.to_dict("records"))
    ready_full_df = vectorizer.transform(df.to_dict("records"))

    tfvocab = vectorizer.get_feature_names()

    df.drop([
        "text_feature", "text_feature_2", "description", "title",
        "title_description"
    ],
            axis=1,
            inplace=True)
    df.fillna(-1, inplace=True)
    return df, ready_full_df, tfvocab
Example 18
    def test_same_result(self):
        X, Z = self.make_text_rdd(2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ])
        dist_union = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ])
        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        assert_equal(
            loc_union.get_feature_names(),
            dist_union.get_feature_names()
        )
        # test same results
        X_transformed = loc_union.transform(X)
        Z_transformed = sp.vstack(dist_union.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results with fit_transform
        X_transformed = loc_union.fit_transform(X)
        Z_transformed = sp.vstack(dist_union.fit_transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results in parallel
        loc_union_par = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ], n_jobs=2)
        dist_union_par = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ], n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        X_transformed = loc_union_par.transform(X)
        Z_transformed = sp.vstack(dist_union_par.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
Example 19
    def test_same_result(self):
        X, Z = self.make_text_rdd(2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ])
        dist_union = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ])
        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        assert_equal(
            loc_union.get_feature_names(),
            dist_union.get_feature_names()
        )
        # test same results
        X_transformed = loc_union.transform(X)
        Z_transformed = sp.vstack(dist_union.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results with fit_transform
        X_transformed = loc_union.fit_transform(X)
        Z_transformed = sp.vstack(dist_union.fit_transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results in parallel
        loc_union_par = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ], n_jobs=2)
        dist_union_par = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ], n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        X_transformed = loc_union_par.transform(X)
        Z_transformed = sp.vstack(dist_union_par.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
Example 20
def train_model(trainset):

  # create 2 blocks of features, word and character ngrams, size of 2 (using TF-IDF method)
  # we can also append here multiple other features in general

  word_vector = TfidfVectorizer( analyzer="word" , ngram_range=(2,2), binary = False, max_features= 2000 )
  char_vector = TfidfVectorizer(ngram_range=(2, 3), analyzer="char", binary=False, min_df=0 , max_features=2000 )

  # our vectors are the feature union of word/char ngrams
  vectorizer = FeatureUnion([  ("chars", char_vector),("words", word_vector)  ] )

  corpus, classes = [], []
    

  for item in trainset:    
    corpus.append( item['text'] )
    classes.append( item['label'] )

  print "num of training instances: ", len(classes)    
  print "num of training classes: ", len(set(classes))

  # fit the model of tfidf vectors for the corpus
  matrix = vectorizer.fit_transform(corpus)
 
  print "num of features: " , len(vectorizer.get_feature_names())
  print "training model"
  X = matrix.toarray()
  y = np.asarray(classes)

  print X[0]

  # Here are results of several different models for Law corpus:

  # model  = SVC(kernel='sigmoid') # ->                       0.38
  # model  = KNeighborsClassifier(algorithm = 'kd_tree') # -> 0.41
  # model = AdaBoostClassifier() #->                            0.46
  # model  = RandomForestClassifier() # ->                    0.52
  # model  = LogisticRegression() # ->                        0.65 
  model  = LinearSVC( loss='l1', dual=True) # ->              0.70
  # Results of several different models for Enron corpus:
  # model  = LinearSVC( loss='l1', dual=True) # ->              0.6

  scores = cross_validation.cross_val_score(estimator=model,
                                            X=matrix.toarray(),
                                            y=np.asarray(classes), cv=10)

  print "10-fold cross-validation results:", "mean score = ", scores.mean(), "std=", scores.std(), ", num folds =", len(scores)
Example 21
def test_feature_union_feature_names():
    JUNK_FOOD_DOCS = (
        "the pizza pizza beer copyright",
        "the pizza burger beer copyright",
        "the the pizza beer beer copyright",
        "the burger beer beer copyright",
        "the coke burger coke copyright",
        "the coke burger burger",
    )
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
Example 22
def test_feature_stacker_feature_names():
    JUNK_FOOD_DOCS = (
        "the pizza pizza beer copyright",
        "the pizza burger beer copyright",
        "the the pizza beer beer copyright",
        "the burger beer beer copyright",
        "the coke burger coke copyright",
        "the coke burger burger",
    )
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
Example 23
def make_tfidf(train, test):
    russian_stop = set(stopwords.words('russian'))
    tfidf_para = {
        "stop_words": russian_stop,
        "analyzer": 'word',
        "token_pattern": r'\w{1,}',
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": 'l2',
        # "min_df":5,
        # "max_df":.9,
        "smooth_idf": False
    }

    def get_col(col_name):
        return lambda x: x[col_name]

    vectorizer = FeatureUnion([
        ('description',
         TfidfVectorizer(ngram_range=(1, 2),
                         max_features=100,
                         **tfidf_para,
                         preprocessor=get_col('description'))),
        # ('text_feat', CountVectorizer(
        #     ngram_range=(1, 2),
        #     # max_features=7000,
        #     preprocessor=get_col('text_feat'))),
        ('title',
         TfidfVectorizer(ngram_range=(1, 2),
                         **tfidf_para,
                         max_features=70,
                         preprocessor=get_col('title')))
    ])
    vectorizer.fit(train)
    ret_df = vectorizer.transform(train)
    feature_names = vectorizer.get_feature_names()
    return ret_df, feature_names


# vectorizer.fit(df.loc[traindex, :].to_dict('records'))
# ready_df = vectorizer.transform(df.to_dict('records'))
# tfvocab = vectorizer.get_feature_names()
#
#
# # get char count
# length_of_words = len(df["len"])
Example 24
def extract_features(dataset_dir):
    print_message("Extracting Features")

    X, y = get_data(dataset_dir)
    glbs.LABELS = y
    glbs.DATASET_DATA = X

    ########################################
    # X, y = zip(*list(zip(X, y))[:160])
    # from help_functions import get_fetuer_by_DF

    # get_fetuer_by_DF(X)
    ########################################

    feature_lst = []
    # add all the N-Gram features to the list
    for feature in glbs.FEATURES:
        if is_ngrams(feature):
            vectorizer = get_vectorizer(feature)
            feature_lst = add_feature(feature_lst, feature, vectorizer)
    # add all the stylistic features to the list
    for feature in glbs.STYLISTIC_FEATURES:
        vectorizers = get_stylistic_features_vectorizer(feature)
        for i in range(len(vectorizers)):
            feature_lst = add_feature(feature_lst, feature + str(i),
                                      vectorizers[i])

    # convert the list to one vectorizer using FeatureUnion
    if glbs.MULTIPROCESSING:
        n_jobs = -1
    else:
        n_jobs = None
    all_features = FeatureUnion(feature_lst, n_jobs=n_jobs)

    glbs.FEATURE_MODEL.append(all_features)

    all_features.fit(X, y)
    glbs.NUM_OF_FEATURE = len(all_features.get_feature_names())

    if glbs.SELECTION:
        from feature_selction import get_selected_features

        get_selected_features(X, y, all_features)

    return X, y
Example 25
def main():
    qtrain = read_set()
    # X_train = gen_features(qtrain)
    Y_train = get_ans(qtrain)
    qtest = read_set()
    # X_test = gen_features(qtest)
    # (X_train, X_test), featkeys = dictVec(X_train, X_test)

    # tfidf_word = TfidfVectorizer(preprocessor=lambda x: x['question_text'].lower(), ngram_range=(1, 3), analyzer="word", binary=False, min_df=3)
    tfidf_word = TfidfVectorizer(preprocessor=exa, ngram_range=(1, 3), analyzer="word", binary=False, min_df=0.05)
    # feat_select = SelectPercentile(score_func=f_regression_, percentile=0.15)
    feat_select = SelectKBest(score_func=f_regression_, k=QN_PARAMS[QUESTION]['features_select'])
    cf = CustomFeat()
    feat = FeatureUnion([('word_counts', tfidf_word), ('custom', cf)])
    # feat = FeatureUnion([('custom', cf)])
    # feat = FeatureUnion([('word_counts', tfidf_word)])
    # est = ESTIMATOR(**params[SETTINGS['EST']])
    w_model = Pipeline([('funion', feat), ('feat_select', feat_select)])  # , ('est', est)]
    # w_X_train = tfidf_word.fit_transform(qtrain)
    # w_X_test = tfidf_word.transform(qtest)
    # print_err(w_X_train[0])
    # X_train = w_X_train
    # X_test = w_X_test
    # featkeys = tfidf_word.get_feature_names()
    # feat_select
    # f_regression_(X_train[:,0], Y_train)
    # print_err('fitting')
    # w_model.fit(qtrain, Y_train)
    # print_err(feat_select.get_support(indices=True))
    X_train = w_model.fit_transform(qtrain, Y_train).toarray()
    X_test = w_model.transform(qtest).toarray()
    featkeys = np.asarray(feat.get_feature_names())[feat_select.get_support(indices=True)]
    # featkeys = []
    # Y_test = classify(w_model, qtest)
    # print_err(est.coef_.nonzero())

    clf = get_clf(X_train, Y_train, feat_indices=featkeys, clf_used=SETTINGS['EST'], grid_search=SETTINGS['GRIDSEARCH'])
    Y_test = classify(clf, X_test)
    for qn, pans in zip(qtest, Y_test):
        print json.dumps({
            'question_key': qn['question_key'].encode('ascii'),
            '__ans__': pans
        })
Example 26
class POSTagsTransformer(TransformerMixin, BaseEstimator):
    def __init__(self):
        self.init_transformer = None

    def fit(self, X, y=None):
        """All SciKit-Learn compatible transformers and classifiers have the
        same interface. `fit` always returns the same object."""
        feature_lst = []
        all_posts_pos, all_pos = get_corpus_pos_tags(X)
        for i, pos in enumerate(all_pos):
            feature_lst += [("pos" + pos + str(i),
                             InitTransformer(pos, all_posts_pos))]
        self.init_transformer = FeatureUnion(feature_lst)
        return self

    def transform(self, X):
        return self.init_transformer.transform(X)

    def get_feature_names(self):
        """Array mapping from feature integer indices to feature name"""
        return self.init_transformer.get_feature_names()
Example 27
def _transform(ds_url: str) -> pd.DataFrame:
    normal_list, anomalous_list = data_sets.get(ds_url)

    X_normal_list = []
    X_anomalous_list = []

    for tf_list in (NUMBERS_TF_LIST, RAW_DATA_TF_LIST):
        # make and fit feature union
        fu = FeatureUnion(
            [(class_.__name__.replace('Transformer', ''), class_())
             for class_ in tf_list],
            n_jobs=-1)
        fu.fit(normal_list)

        # create column MultiIndex
        col_tuples = []
        for s in fu.get_feature_names():
            s = '{}__{}'.format(s[:2], s[2:])
            source_0, tf_name, source_1, tf_part = s.split('__')
            col_tuples.append((tf_name, tf_part, source_0, source_1))
        idx = pd.MultiIndex.from_tuples(col_tuples, names=COLUMN_NAMES)

        # transform requests
        X_normal = pd.DataFrame(fu.transform(normal_list), columns=idx)
        X_anomalous = pd.DataFrame(fu.transform(anomalous_list), columns=idx)
        X_normal_list.append(X_normal)
        X_anomalous_list.append(X_anomalous)

    # concatenate all features
    X_normal = pd.concat(X_normal_list, axis=1)  # type: pd.DataFrame
    X_anomalous = pd.concat(X_anomalous_list, axis=1)  # type: pd.DataFrame

    # add meta
    X_normal[META_ID] = X_normal.index
    X_normal[META_TRUE_LABEL] = 'normal'
    X_anomalous[META_ID] = X_anomalous.index
    X_anomalous[META_TRUE_LABEL] = 'anomalous'

    return pd.concat([X_normal, X_anomalous], ignore_index=True)
Example 28
def dump_train():
    _, _, _, train_gray_data, test_gray_data, _, labels = i_p.load_data()

    train_df = f.make_data_df(train_gray_data, labels)
    test_df = f.make_test_df(test_gray_data)

    train_df = train_df.reset_index()
    test_df = test_df.reset_index()

    train_df.columns = ["pngname", "input", "label"]
    test_df.columns = ["pngname", "input"]

    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    feature_name_list = [s.split("__")[1] for s in fu.get_feature_names()]
    feature_name_list.append("target")
    train_X = fu.fit_transform(train_df)
    train_y = np.concatenate(train_df["label"].apply(lambda x: x.flatten()))
    train_X, train_y = cl.downsampling_data(train_X, train_y, 0.2)
    train_dump = pd.DataFrame(np.c_[train_X, train_y], columns=feature_name_list)
    dump_path = os.path.abspath(os.path.dirname(__file__)) +\
        "/../tmp/train_dump"
    train_dump.to_csv(dump_path + "/train_dump.csv", index=False)
Example 29
def test_feature_union():
    df = _create_test_data()
    features = FeatureUnion(transformer_list=[
        ('scale_A', series_pipeline('col_A', [ScalingTransformer(2.0)])),
        ('scale_B', series_pipeline('col_B', [ScalingTransformer(-5.0)])),
        ('null_A', series_pipeline('col_A', [NullTransformer()])),
        ('daysofweek',
         series_pipeline('dates', [DateAttributeTransformer('dayofweek')])),
        ('is_some_day',
         series_pipeline('dates', [
             MultiDateTransformer([
                 date(2014, 10, 4),
                 date(2016, 5, 30),
             ])
         ])),
        ('linear_date', series_pipeline('dates', [LinearDateTransformer()])),
        ('label_A', series_pipeline('col_A', [LabelEncoderWithUnknown()])),
    ])
    features.fit(df)
    ret = features.transform(df)
    feature_names = features.get_feature_names()
    assert ret.shape[0] == df.shape[0]
    assert ret.shape[1] == len(feature_names)
Example 30
     TfidfVectorizer(max_features=20000,
                     **tfidf_para,
                     preprocessor=get_col('title'))),
])

dftrl.index = df.index
dftrl.fillna('', inplace=True)
dftrl.head()
start_vect = time.time()
text_cols = ['text', 'text_feat', 'title']
vectorizer1.fit(df[text_cols].loc[traindex, :].to_dict('records'))
text_cols = ['text', 'title']
vectorizer2.fit(dftrl[text_cols].loc[traindex, :].to_dict('records'))
ready_df = vectorizer1.transform(df.to_dict('records'))
ready_dftrl = vectorizer2.transform(dftrl.to_dict('records'))
tfvocab1 = vectorizer1.get_feature_names()
tfvocab2 = vectorizer2.get_feature_names()

print('[{}] Vectorisation completed'.format(time.time() - start_time))
# Drop Text Cols
df.drop(textfeats + ['text', 'all_titles'], axis=1, inplace=True)
del dftrl
gc.collect()

print('[{}] Drop all the categorical'.format(time.time() - start_time))
df.drop(categorical, axis=1, inplace=True)
gc.collect()

ready_df.shape
ready_dftrl.shape
Example 31
})

X_train = dft["text"]
stop_word_lib = set(stopwords.words('english'))

tfidf_param = {
    "stop_words": stop_word_lib,
    "analyzer": 'word',
    "token_pattern": r'\w{1,}',
    "sublinear_tf": True,
    "dtype": np.float32,
    "norm": 'l2',
    # "min_df":5,
    # "max_df":.9,
    "smooth_idf": False
}

unioned = FeatureUnion([
    ("colA", TfidfVectorizer(ngram_range=(1, 2),
                             max_features=50,
                             **tfidf_param)),
    ("colB", CountVectorizer(min_df=0, stop_words=stop_word_lib))
])
print("here")
unioned.fit(dft)
res_df = unioned.transform(dft)
f_names = unioned.get_feature_names()
print(f_names)
print(res_df.toarray())
print("here")
Example 32
unigram_vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df=1)
temp_uni_tfidf = unigram_vectorizer.fit_transform(X_train).toarray()
# n_features = len(unigram_vectorizer.get_feature_names())
n_features = 9000
multigrams_vectorizer = TfidfVectorizer(ngram_range=(2, 2),
                                        min_df=2,
                                        max_features=n_features)
comb_vectorizer = FeatureUnion([("uni_vec", unigram_vectorizer),
                                ("multi_vec", multigrams_vectorizer)])
# comb_vectorizer.set_params(multi_vec=None)
X_tfidf = comb_vectorizer.fit_transform(X_train).toarray()
with open("vectorizer_NEALTA_Binary.pk", "wb") as vect_file:
    pickle.dump(comb_vectorizer, vect_file)

feature_names = comb_vectorizer.get_feature_names()
print("num_features: " + str(len(feature_names)))
# print(feature_names[:50])
print("features extracted & tfidf transformed")
# Transform documents to document-term matrix. (.transform) - No learning involved as it is test data
# For test data
# X_test_tfidf = vectorizer.transform(X_test).toarray()
X_test_tfidf = comb_vectorizer.transform(X_test).toarray()

print('Creating Linear SVC Model...')
# model=svm.LinearSVC(C=1000)
model = svm.SVC(kernel="linear", C=1000, cache_size=5000, probability=True)
# model = svm.SVC(kernel="rbf", C=1, cache_size=5000,probability=True)

print('Linear SVC Model created!')
# .fit(X, y[, sample_weight]): Fit the model according to the given training data
Example 33
def preparTotalData(y, df, predictors, len_train, len_test, frm, to,
                    tot_filename):

    y, df, predictors, len_train, categorical, textfeats = preparBaseData(
        y, df, predictors, len_train, len_test, frm, to, tot_filename)

    print("\n[TF-IDF] Term Frequency Inverse Document Frequency Stage")
    russian_stop = set(stopwords.words('russian'))

    tfidf_para = {
        "stop_words": russian_stop,
        "analyzer": 'word',
        "token_pattern": r'\w{1,}',
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": 'l2',
        #"min_df":5,
        #"max_df":.9,
        "smooth_idf": False
    }

    def get_col(col_name):
        return lambda x: x[col_name]

    vectorizer = FeatureUnion([
        ('description',
         TfidfVectorizer(ngram_range=(1, 2),
                         max_features=17000,
                         **tfidf_para,
                         preprocessor=get_col('description'))),
        (
            'title',
            TfidfVectorizer(
                ngram_range=(1, 2),
                **tfidf_para,
                #max_features=7000,
                preprocessor=get_col('title')))
    ])

    start_vect = time.time()
    #vectorizer.fit(df.loc[traindex,:].to_dict('records'))
    vectorizer.fit(df[:len_train].to_dict('records'))
    ready_df = vectorizer.transform(df.to_dict('records'))
    tfvocab = vectorizer.get_feature_names()
    print("Vectorization Runtime: %0.2f Minutes" %
          ((time.time() - start_vect) / 60))

    # Drop Text Cols
    df.drop(textfeats, axis=1, inplace=True)

    #from sklearn.metrics import mean_squared_error
    from math import sqrt

    kf = KFold(len_train, n_folds=NFOLDS, shuffle=True, random_state=SEED)
    ridge_params = {
        'alpha': 30.0,
        'fit_intercept': True,
        'normalize': False,
        'copy_X': True,
        'max_iter': None,
        'tol': 0.001,
        'solver': 'auto',
        'random_state': SEED
    }
    ridge = SklearnWrapper(clf=Ridge, seed=SEED, params=ridge_params)
    ridge_oof_train, ridge_oof_test = get_oof(ridge, ready_df[:len_train], y,
                                              ready_df[len_train:], len_train,
                                              len_test, kf)
    #rms = sqrt(mean_squared_error(y, ridge_oof_train))
    ridge_preds = np.concatenate([ridge_oof_train, ridge_oof_test])
    df['ridge_preds'] = ridge_preds
    predictors.append('ridge_preds')

    df = kaggle_util.reduce_mem_usage(df)
    return y, df, ready_df, tfvocab, predictors, len_train, categorical
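After vectorisation, the vocabulary returned by get_feature_names is typically concatenated with the dense column names so the downstream model gets one name per column. A short sketch of that pattern, mirroring the hstack used in a later example (variable names follow the snippet above and are otherwise illustrative):

from scipy.sparse import hstack, csr_matrix

# Dense engineered columns plus the sparse TF-IDF block, with matching names.
X_full = hstack([csr_matrix(df.values), ready_df]).tocsr()
feature_names = df.columns.tolist() + tfvocab
assert X_full.shape[1] == len(feature_names)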
Example 34
vectorizer_param = {'preprocessor': preprocessor, 'ngram_range': parameters['ngram_range'], 'analyzer': 'word',
                    'min_df': parameters['min_df'], 'max_df': parameters['max_df'],
                    'binary': parameters['TF_binary'], 'norm': parameters['norm'],
                    'sublinear_tf': parameters['sublinear_tf'], 'max_features': parameters['max_features']}

if __name__ == "__main__":
    unigram = StemmedTfidfVectorizer(**vectorizer_param)
    anew = anew_vectorizer()
    pct = punctuation_estimator()
    strength = strength_vectorizer()
    avg_strength = avg_affective_vectorizer()
    log_state('combine unigram and avg strength features')
    combined_features = FeatureUnion([('unigram', unigram), ('avg_strength', avg_strength)])
    # log_state('combine unigram and strength features')
    # combined_features =FeatureUnion([('unigram',unigram),('strength',strength)])
    # log_state('combine unigram and anew features')
    # combined_features =FeatureUnion([('unigram',unigram),('anew',anew)])
    # log_state('combine unigram and punctuation features')
    # combined_features =FeatureUnion([('unigram',unigram),('pct',pct)])
    texts, _ = load_train_data('Sentiment140')

    transformed_train = combined_features.fit_transform(texts)

    testdata, _ = load_test_data()
    transformed_test = combined_features.transform(testdata)

    dump_picle(combined_features.get_feature_names(), './data/features/feature_names.p')
    dump_picle(transformed_train, "./data/transformed_data/transformed_train.p")
    dump_picle(transformed_test, "./data/transformed_data/transformed_test.p")
Example 35
    from Utils import load_test_data
    X_test, Y_test = load_test_data()

    X_test, Y_test = np.array(X_test), np.array(Y_test)

    if parameters['combine_feature']==True:
        from vectorizer_estimator import StatisticVectorizer
        from sklearn.pipeline import FeatureUnion
        statistic_vec=StatisticVectorizer()
        combined_features =FeatureUnion([('ngrams',vectorizer),('statistic_vec',statistic_vec)])
    else:
        combined_features=vectorizer

    trian_vec = combined_features.fit_transform(X_train)
    pickle.dump(combined_features.get_feature_names(), open('./debug/feature_names.p', 'wb'))
    test_vec = combined_features.transform(X_test)  # use transform for test data, instead of fit_transform

    # clf = pickle.load(open("./acc_tmp/clf_all_data_noclustering.p", "rb"))

    if parameters['classifier']=='svm':
        from sklearn import svm
        clf = svm.SVC()
        clf.fit(trian_vec.toarray(), Y_train)
        pickle.dump(trian_vec, open("./debug/trian_vec.p", "wb"))
        pickle.dump(clf, open("./acc_tmp/clf.p", "wb"))
        true_labels=Y_test
        predict_labels=np.array(clf.predict(test_vec.toarray()))
        precision,recall,fbeta_score,support=precision_recall_fscore_support(true_labels, predict_labels, average='binary')
        print('Precision: %.3f\nRecall: %.3f\nF-score: %.3f' % (precision, recall, fbeta_score))
    else:
Example 36
def create_text_feature(df, todir, ext, language, max_features):
    print('\n>> doing Text Features')
    if language == 'russian':
        df = remove_english(df)
    else:
        df = remove_russian(df)
        df = rename_en_to_ru(df)

    if language == 'russian':
        filename = todir + 'text_feature_kernel_' + str(max_features) + ext
        filename_len = todir + 'len_feature_kernel' + ext
        filename_vocab = todir + 'vocab_' + str(max_features) + ext
        suffix_feature = ''
    else:
        filename = todir + 'text_feature_kernel_' + str(
            max_features) + '_en' + ext
        filename_len = todir + 'len_feature_kernel_en' + ext
        filename_vocab = todir + 'vocab_' + str(max_features) + '_en' + ext
        suffix_feature = '_en'

    if os.path.exists(filename):
        print('done already...')
        df = load_file(filename_len, ext)
        ready_df = load_file(filename, ext)
    else:
        df = get_original_data(
            df, ['param_1', 'param_2', 'param_3', 'description', 'title'])

        df['text_feat'] = df.apply(lambda row: ' '.join(
            [str(row['param_1']),
             str(row['param_2']),
             str(row['param_3'])]),
                                   axis=1)  # Group Param Features

        print(df[['text_feat', 'param_1', 'param_2', 'param_3']].head())
        print(df[['text_feat', 'param_1', 'param_2', 'param_3']].tail())
        df.drop(["param_1", "param_2", "param_3"], axis=1, inplace=True)
        textfeats = ["description", "text_feat", "title"]
        for cols in textfeats:
            df[cols] = df[cols].astype(str)
            df[cols] = df[cols].astype(str).fillna('n/a')  # FILL NA
            df[cols] = df[cols].str.lower()  # Lowercase all text, so that capitalized words don't get treated differently
            df[cols + '_num_chars' + suffix_feature] = df[cols].apply(
                len)  # Count number of Characters
            df[cols + '_num_words' + suffix_feature] = df[cols].apply(
                lambda comment: len(comment.split()))  # Count number of Words
            df[cols + '_num_unique_words' + suffix_feature] = df[cols].apply(
                lambda comment: len(set(w for w in comment.split())))
            df[cols + '_words_vs_unique' + suffix_feature] =   \
                    df[cols+'_num_unique_words'+ suffix_feature] /   \
                    df[cols+'_num_words'+ suffix_feature] * 100 # Count Unique Words
        print_memory()

        print("\n[TF-IDF] Term Frequency Inverse Document Frequency Stage")
        language_stop = set(stopwords.words(language))
        tfidf_para = {
            "stop_words": language_stop,
            "analyzer": 'word',
            "token_pattern": r'\w{1,}',
            "sublinear_tf": True,
            "dtype": np.float32,
            "norm": 'l2',
            #"min_df":5,
            #"max_df":.9,
            "smooth_idf": False
        }
        if max_features > 0:
            vectorizer = FeatureUnion([
                ('description',
                 TfidfVectorizer(ngram_range=(1, 2),
                                 max_features=max_features,
                                 **tfidf_para,
                                 preprocessor=get_col('description'))),
                ('text_feat',
                 CountVectorizer(ngram_range=(1, 2),
                                 preprocessor=get_col('text_feat'))),
                ('title',
                 TfidfVectorizer(ngram_range=(1, 2),
                                 **tfidf_para,
                                 preprocessor=get_col('title')))
            ])
        else:
            vectorizer = FeatureUnion([
                ('description',
                 TfidfVectorizer(ngram_range=(1, 2),
                                 **tfidf_para,
                                 preprocessor=get_col('description'))),
                ('text_feat',
                 CountVectorizer(ngram_range=(1, 2),
                                 preprocessor=get_col('text_feat'))),
                ('title',
                 TfidfVectorizer(ngram_range=(1, 2),
                                 **tfidf_para,
                                 preprocessor=get_col('title')))
            ])

        vectorizer.fit(df.to_dict('records'))
        ready_df = vectorizer.transform(df.to_dict('records'))
        tfvocab = vectorizer.get_feature_names()
        print_memory()

        # Drop Text Cols
        df.drop(textfeats, axis=1, inplace=True)

        print("Modeling Stage")
        # Combine Dense Features with Sparse Text Bag of Words Features
        # ready_df = ready_df.astype(np.float32)
        X = hstack([csr_matrix(df.values), ready_df])
        for shape in [X]:
            print("{} Rows and {} Cols".format(*shape.shape))
        print_memory()

        save_file(df, filename_len, ext)

        print('>> saving to', filename)
        with open(filename, "wb") as f:
            pickle.dump((ready_df, tfvocab), f)

        del ready_df, tfvocab
        gc.collect()

        with open(filename, "rb") as f:
            ready_df, tfvocab = pickle.load(f)
    return df, ready_df
Example 37
            X_test, X_test_labels = clustering_test_data_method(
                X_test, X_train, cluster_size)

    X_test, Y_test = np.array(X_test), np.array(Y_test)

    if parameters['combine_feature'] == True:
        from vectorizer_estimator import StatisticVectorizer
        from sklearn.pipeline import FeatureUnion
        statistic_vec = StatisticVectorizer()
        combined_features = FeatureUnion([('ngrams', vectorizer),
                                          ('statistic_vec', statistic_vec)])
    else:
        combined_features = vectorizer

    trian_vec = combined_features.fit_transform(X_train)
    pickle.dump(combined_features.get_feature_names(),
                open('./debug/feature_names.p', 'wb'))
    test_vec = combined_features.transform(
        X_test)  # use transform for test data, instead of fit_transform

    # clf = pickle.load(open("./acc_tmp/clf_all_data_noclustering.p", "rb"))

    if parameters['classifier'] == 'svm':
        from sklearn import svm
        clf = svm.SVC()
        clf.fit(trian_vec.toarray(), Y_train)
        pickle.dump(trian_vec, open("./debug/trian_vec.p", "wb"))
        pickle.dump(clf, open("./acc_tmp/clf.p", "wb"))
        true_labels = Y_test
        predict_labels = np.array(clf.predict(test_vec.toarray()))
        precision, recall, fbeta_score, support = precision_recall_fscore_support(
Example 38
def train_model(trainset):

  # create two blocks of features, word and character ngrams, size of 2
  # we can also append here multiple other features in general
  word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2), binary=False, max_features=2000)
  char_vector = TfidfVectorizer(ngram_range=(2, 3), analyzer="char", binary=False, min_df=0, max_features=2000)

  # our vectors are the feature union of word/char ngrams
  vectorizer = FeatureUnion([("chars", char_vector), ("words", word_vector)])

  # corpus is a list with the n-word chunks
  corpus = []
  # classes is the labels of each chunk
  classes = []

  # load training sets, for males & females
  for item in trainset:
    corpus.append(item['text'])
    classes.append(item['label'])

  print "num of training instances: ", len(classes)
  print "num of training classes: ", len(set(classes))

  # fit the model of tfidf vectors for the corpus
  matrix = vectorizer.fit_transform(corpus)

  print "num of features: ", len(vectorizer.get_feature_names())

  print "training model"
  X = matrix.toarray()
  y = np.asarray(classes)

  model = LinearSVC(loss='l1', dual=True)

  # scores = cross_validation.cross_val_score(estimator=model,
  #                                           X=matrix.toarray(),
  #                                           y=np.asarray(classes), cv=10)

  # http://scikit-learn.org/stable/auto_examples/plot_confusion_matrix.html
  # print scores
  # print "10-fold cross validation results:", "mean score = ", scores.mean(), "std=", scores.std(), ", num folds =", len(scores)

  # model.fit(X=matrix.toarray(), y=np.asarray(classes))
  # predicted = model.predict(matrix.toarray())
  # cm = confusion_matrix(classes, predicted)
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
  y_pred = model.fit(X_train, y_train).predict(X_test)
  cm = confusion_matrix(y_test, y_pred)

  print(cm)

  pl.matshow(cm)
  pl.title('Confusion matrix')
  pl.colorbar()
  pl.ylabel('True label')
  pl.xlabel('Predicted label')
  pl.show()
Example 39
            **countv_para,
            preprocessor=get_col('text_feat'))),
        ('title',CountVectorizer(
            **countv_para,
            preprocessor=get_col('title'))),
        ('translation',TfidfVectorizer(
            #ngram_range=(1, 2),
            max_features=40000,
            **tfidf_para,
            preprocessor=get_col('translation'))),
    ])
    
start_vect=time.time()
vectorizer.fit(df.loc[traindex,:].to_dict('records'))
ready_df = vectorizer.transform(df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
tfvocab[:50]
print('[{}] Vectorisation completed'.format(time.time() - start_time))
# Drop Text Cols
df.drop(textfeats+['text', 'all_titles', 'translation'], axis=1,inplace=True)
gc.collect()

print('[{}] Drop all the categorical'.format(time.time() - start_time))
df.drop(categorical, axis=1,inplace=True)

# Training and Validation Set
lgbm_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective' : 'regression',
    'metric' : 'rmse',
Example 40
class FeatureExtractor(object):
    """Class for exracting features from dataframe object.
    
    Attributes
    ----------
    
    _transform_only: bool
        True if the extractor was initialized from a previously serialized vectorizer and labelencoder.
    
    """

    def __init__(self, settings, dataframe, vectorizer=None, labelencoder=None):
        """Initiate a featureextractor.
        
        Parameters
        ----------
        settings: Settings
            Settings instance of the classification task.
        dataframe: Dataframe
            Pandas Dataframe object containing the data compatible with the settings.

        Keyword parameters
        ------------------
        vectorizer: str
            Pickled and base64-encoded vectorizer from previous serialization.
        labelencoder: str
            Pickled and base64-encoded labelencoder from previous serialization.
        """
        assert isinstance(settings, Settings)

        self._settings = settings
        self._dataframe = dataframe
        self._cache = {}
        self._transform_only = False

        if vectorizer is None:
            self._vectorizer = FeatureUnion(
                [
                    ("lemma", CountVectorizer(analyzer=SimpleTextAnalyzer(settings.unifier), binary=True, min_df=5)),
                    ("word", TfidfVectorizer(analyzer="word", min_df=5, use_idf=False, ngram_range=(2, 3))),
                ],
                n_jobs=NUM_CORES,
            )
        else:
            self._vectorizer = pickle.loads(base64.b64decode(vectorizer))
            self._transform_only = True

        if labelencoder is None:
            self._labelencoder = preprocessing.LabelEncoder()
        else:
            self._labelencoder = pickle.loads(base64.b64decode(labelencoder))
            self._transform_only = True

    def export(self):
        return {
            "settings": self.settings.export(),
            "vectorizer": base64.b64encode(pickle.dumps(self._vectorizer)).decode("ascii"),
            "labelencoder": base64.b64encode(pickle.dumps(self._labelencoder)).decode("ascii"),
        }

    @property
    def settings(self):
        return self._settings

    @property
    def vectorizer(self):
        return self._vectorizer

    @property
    def dataframe(self):
        return self._dataframe

    @property
    def labelencoder(self):
        return self._labelencoder

    @property
    def strings(self):
        """Get feature strings.
        
        Returns
        -------
        list[unicode]
            Dataframe columns concatenated to a single string.
        """
        if "strings" not in self._cache:
            self._cache["strings"] = [
                " ".join(row) for row in self._dataframe[self._settings.features].fillna("").values
            ]
        return self._cache["strings"]

    @property
    def X(self):
        """Returns
        -------
        scipy.sparse
            Sparse matrix containing textual features for classification.
        """
        if "X" not in self._cache:
            X = self.strings
            if self._transform_only:
                self._cache["X"] = self._vectorizer.transform(X)
            else:
                self._cache["X"] = self._vectorizer.fit_transform(X)
        return self._cache["X"]

    @property
    def y(self):
        """Returns
        -------
        numpy.array
            Labels encoded as integer values. Use get_labels() for mapping them back to strings.
        """
        if "y" not in self._cache:
            y = list(self._dataframe[self._settings.label].fillna(""))
            if self._transform_only:
                self._cache["y"] = np.array(self._labelencoder.transform(y))
            else:
                self._cache["y"] = np.array(self._labelencoder.fit_transform(y))
        return self._cache["y"]

    @property
    def feature_names(self):
        """Returns
        -------
        list[unicode]
            Meaningful feature names for vectorized feature matrix columns.
        """
        return self._vectorizer.get_feature_names()

    @property
    def labels(self):
        """Returns
        -------
        list[unicode]
            Labels for the encoded labels (y).
        """
        self.y
        return [l for l in self._labelencoder.classes_]
Example 41
def train_model(train_set, mode='linear'):
    """
    Train the models, using 10-fold-cv and different classifications.
    :param train_set: The set that is used for training
    :param mode: The mode that gets used for training
    """
    global used_countries

    # Create two blocks of features, word and character n-grams, size of 2
    # We can also append here multiple other features in general
    word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2), binary=False, max_features=2000)
    char_vector = TfidfVectorizer(ngram_range=(2, 3), analyzer="char", binary=False, min_df=0, max_features=2000)

    # Our vectors are the feature union of word/char n-grams
    inner_vectorizer = FeatureUnion([("chars", char_vector), ("words", word_vector)])

    # Corpus is a list with the n-word chunks
    corpus = []
    # Classes is the labels of each chunk
    classes = []

    # Load training set
    for key, country_list in train_set.items():
        # print("Processing " + '"' + key + '"')

        for item in country_list:
            corpus.append(item['text'])
            classes.append(item['label'])

    print()
    print("Size of corpus: " + str(sys.getsizeof(corpus)))
    print("Number of training instances: ", len(classes))
    print("Number of training classes: ", len(set(classes)))
    print("Processed Countries:\n" + str(used_countries))

    # Fit the model of tf-idf vectors for the corpus
    x1 = inner_vectorizer.fit_transform(corpus)

    print("Number of features: ", len(inner_vectorizer.get_feature_names()))

    x = x1.toarray()
    y = np.asarray(classes)

    print()
    print("Training model...")

    if mode == 'kernel_rbf':
        inner_model = SVC(kernel="rbf")
        inner_model.fit(x, y)
    elif mode == 'kernel_poly':
        inner_model = SVC(kernel="poly")
        inner_model.fit(x, y)
    elif mode == 'kernel_linear':
        inner_model = SVC(kernel="linear")
        inner_model.fit(x, y)
    else:
        inner_model = LinearSVC(loss='hinge', dual=True)
        inner_model.fit(x, y)
        mode = 'linear'

    print("Saving model...")

    # Create the model-folder if it does not exist
    if not isdir("../data/model/"):
        makedirs("../data/model/")
    pickle.dump(inner_model, open('../data/model/trained_model_' + mode, 'wb'))

    print("Saving tfidf-vectorizer...")
    pickle.dump(inner_vectorizer, open('../data/model/vectorizer_' + mode, 'wb'))
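A minimal sketch of how the two pickle files saved above could be reloaded for prediction. The relative paths and the 'linear' suffix mirror the snippet; the helper name predict_texts is hypothetical:

def predict_texts(texts, mode='linear'):
    import pickle
    # Load the fitted FeatureUnion of tf-idf vectorizers and the trained classifier.
    with open('../data/model/vectorizer_' + mode, 'rb') as f:
        saved_vectorizer = pickle.load(f)
    with open('../data/model/trained_model_' + mode, 'rb') as f:
        saved_model = pickle.load(f)
    # The classifier above was fit on dense arrays, so densify here as well.
    features = saved_vectorizer.transform(texts).toarray()
    return saved_model.predict(features)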
Esempio n. 42
0
             .format(args.H_matrix))
    nmf_model.fit(tfidf)
    W_matrix = nmf_model.components_
    sparse_H = tfidf.dot(csr_matrix(W_matrix).transpose())
    H_matrix = np.asarray(sparse_H.todense())
else:
    LOG.info("Using provided H matrix ({})".format(args.H_matrix))
    H_matrix = nmf_model.fit_transform(tfidf)
    W_matrix = nmf_model.components_

reconstruction_err = nmf_model.reconstruction_err_

LOG.info("done in %0.3fs." % (time() - t0))

LOG.info("Labeling topics...")
feature_names = preprocess.get_feature_names()

topic_names = []
max_words = max(2, args.show_top_words) \
    if args.show_top_words is not None \
    else 2

for j, topic in enumerate(W_matrix):
    weight_words = [(topic[i], feature_names[i])
                    for i in topic.argsort()[:-max_words - 1:-1]]
    word1, word2 = weight_words[:2]
    prefix = str(j) + '_'
    topic_name = prefix + word1[1] \
        if safe_div(abs(word1[0]), abs(word2[0])) >= args.word_ratio \
        else prefix + word1[1] + '-' + word2[1]
    topic_names.append(topic_name)
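A toy illustration of the naming rule above, using hypothetical weights, vocabulary, and word_ratio (safe_div is assumed to behave like plain division when the denominator is non-zero):

feature_names = ["price", "market", "growth"]   # hypothetical vocabulary
topic = [0.9, 0.1, 0.05]                        # hypothetical weights for topic j = 0
word_ratio = 2.0                                # hypothetical threshold

word1, word2 = sorted(zip(topic, feature_names), reverse=True)[:2]
# 0.9 / 0.1 >= 2.0, so only the dominant word is kept and the name is "0_price".
topic_name = "0_" + word1[1] if word1[0] / word2[0] >= word_ratio else "0_" + word1[1] + "-" + word2[1]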
Esempio n. 43
0
	print ''
	word_count = SpecialWordCounter()
	word_count.fit(t)
	print word_count.get_feature_names()
	print word_count.transform(t)

	combined_features = FeatureUnion([
		('stats', TextStats())
		, ('special_word_stats', SpecialWordCounter())
		])

	# Use combined features to transform dataset:
	X_features = combined_features.fit(t).transform(t)
	print '\nfeature union'
	print 'X:', X_features
	print 'names:', combined_features.get_feature_names()
	print 

	pipeline = Pipeline([
	    # Use FeatureUnion to combine the features from subject and body
	    ('union', FeatureUnion(
	        transformer_list=[
	        ('scaled_text_stats', Pipeline([
                ('stats', TextStats())
               , ('scaling',  StandardScaler())
            ])
            )
	        , ('special_word_stats', SpecialWordCounter())
	        ]
	        )
	    )
Esempio n. 44
0
def data_vectorize(df):
    russian_stop1 = set(stopwords.words("russian"))
    russian_stop2 = read_stopwords()
    russian_stop = list(set(russian_stop1).intersection(set(russian_stop2)))
    tfidf_para = {
        "stop_words": russian_stop,
        "analyzer": "word",
        "token_pattern": r"\w{1,}",
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": "l2",
        # "min_df":5,
        # "max_df":.9,
        "smooth_idf": False
    }

    tfidf_para2 = {
        "stop_words": russian_stop,
        "analyzer": "char",
        "token_pattern": r"\w{1,}",
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": "l2",
        # "min_df":5,
        # "max_df":.9,
        "smooth_idf": False
    }

    def get_col(col_name):
        return lambda x: x[col_name]

    vectorizer = FeatureUnion([
        # ("description", TfidfVectorizer(
        #     ngram_range=(1, 2),
        #     max_features=40000,  # 40000,18000
        #     **tfidf_para,
        #     preprocessor=get_col("description"))
        #  ),
        #         ("title_description", TfidfVectorizer(
        #              ngram_range=(1, 2),#(1,2)
        #              max_features=1800,#40000,18000
        #              **tfidf_para,
        #              preprocessor=get_col("title_description"))
        #           ),
        # ("text_feature", CountVectorizer(
        #     ngram_range=(1, 2),
        #     preprocessor=get_col("text_feature"))
        #  ),

        # ("title", TfidfVectorizer(
        #     ngram_range=(1, 2),
        #     **tfidf_para,
        #     preprocessor=get_col("title"))
        #  ),
        # Newly added two text features: title2, title_char
        ("title2",
         TfidfVectorizer(ngram_range=(1, 1),
                         **tfidf_para,
                         preprocessor=get_col("title"))),

        # ("title", TfidfVectorizer(
        #     ngram_range=(1, 2),
        #     **tfidf_para,
        #     preprocessor=get_col("title_ru"))
        #  ),
        # # Newly added two text features: title2, title_char
        # ("title2", TfidfVectorizer(
        #     ngram_range=(1, 1),
        #     **tfidf_para,
        #     preprocessor=get_col("title_ru"))
        #  ),

        # Doubles the running time
        # ("title_char", TfidfVectorizer(
        #
        #     ngram_range=(1, 4),  # (1, 4),(1,6)
        #     max_features=16000,  # 16000
        #     **tfidf_para2,
        #     preprocessor=get_col("title"))
        #  ),

        # # Newly added 2018-6-3, very slow
        # ("description_feature", CountVectorizer(
        #     ngram_range=(1, 2),
        #     stop_words= russian_stop,
        #  max_features=8000,
        #     preprocessor=get_col("description"))
        #  ),
    ])
    vectorizer.fit(df.to_dict("records"))
    ready_full_df = vectorizer.transform(df.to_dict("records"))
    tfvocab = vectorizer.get_feature_names()
    df.drop(
        [
            "text_feature",
            "text_feature_2",
            "description",
            "title",
            "title_ru",
            # "title_description"
        ],
        axis=1,
        inplace=True)
    df.fillna(-1, inplace=True)
    return df, ready_full_df, tfvocab
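A hedged usage sketch of the three return values, following the hstack pattern used in the later examples; it assumes df held the text columns dropped above and that the remaining columns are numeric:

from scipy.sparse import csr_matrix, hstack

df, ready_full_df, tfvocab = data_vectorize(df)
feature_names = df.columns.tolist() + tfvocab      # dense columns first, then the tf-idf vocabulary
full_X = hstack([csr_matrix(df.values), ready_full_df]).tocsr()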
Esempio n. 45
0
    #('text_feat',CountVectorizer(
    #    **countv_para,
    #    preprocessor=get_col('text_feat'))),
    ('name_bi', CountVectorizer(**countv_para,
                                preprocessor=get_col('name_bi'))),
    #('translation',TfidfVectorizer(
    #    #ngram_range=(1, 2),
    #    max_features=50000,
    #    **tfidf_para,
    #    preprocessor=get_col('translation'))),
])

start_vect = time.time()
vectorizer.fit(df.loc[traindex, :].to_dict('records'))
ready_df = vectorizer.transform(df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
tfvocab[:50]  # notebook-style peek at the first 50 feature names (has no effect in a plain script)
print('[{}] Vectorisation completed'.format(time.time() - start_time))
# Drop Text Cols
df.drop(textfeats + ['text', 'all_titles', 'translation', 'name_bi'],
        axis=1,
        inplace=True)
#drop_cols= [c for c in textfeats+['text', 'all_titles', 'translation'] if c in df.columns]
#df.drop(drop_cols, axis=1,inplace=True)

gc.collect()

print('[{}] Drop all the categorical'.format(time.time() - start_time))
df.drop(categorical, axis=1, inplace=True)

# Training and Validation Set
Esempio n. 46
0
print("Extracting features from the test dataset using the same vectorizer")
t0 = time()
X_test = preprocess.transform(data_test.data)
duration = time() - t0
print("X_test: n_samples: %d, n_features: %d" % X_test.shape)
print()


y_train, y_test = data_train.target, data_test.target


# mapping from integer feature name to original token string
if opts.vectorizer == "hashing":
    feature_names = None
elif opts.vectorizer == "tfidf":
    feature_names = np.asarray(preprocess.get_feature_names())
    assert feature_names.shape[0] == X_train.shape[1] == X_test.shape[1], \
        ("feature_names-len: %d, X-train-len:%d, X-test-len: %d" %
         (feature_names.shape[0], X_train.shape[1], X_test.shape[1]))


if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    print("done in %fs" % (time() - t0))
    print()
    if feature_names is not None:
Esempio n. 47
0
class Orchestrator(object):
    """ Main singleton, implementing Model part on Model-View-Presenter
    """

    def __init__(self, mainPfcsamrApp):
        """ Constructor
        :param mainPfcsamrApp: singleton of MainPfcsamr2App
        :return: None
        """
        self.file_path = None
        """ Current file path.

        :type: str"""

        self.headings = []
        """ List of column headings

        :type: list[str]"""

        self.rows = []
        """ Full data table

        :type: np.ndarray"""

        self.preprocessed_rows = None
        """ Preproc data table.

        :type: np.ndarray"""

        self.main_pfcsamr_app = mainPfcsamrApp
        """:type: MainPfcsamr2App"""

        self.featured_rows = []
        """ Featured data table.

        :type: np.ndarray"""

        self.featured_rows_train = []
        """ Featured data table (train subset)

        :type: np.ndarray"""

        self.featured_rows_test = []
        """ Featured data table (test subset)

        :type: np.ndarray"""

        self.train_y = []
        """ Vector of classes

        :type: np.ndarray"""

        self.train_y_train = []
        """ Vector of classes  (train subset)

        :type: np.ndarray"""

        self.train_y_test = []
        """ Vector of classes (test subset)

        :type: np.ndarray"""

        self.feature_union = None
        """:type: FeatureUnion"""

        self.featured_headings = []
        """ heading column names for features data table.

        :type: list[str]"""

        self.estimators = {}
        """ Map of trained estimators indexed by name.

         :type: dict"""

        self.already_splitted = False
        """ Boolean flag indicating if train set has been sub-split on train and test.

        :type: bool"""

        self.featured_support = []
        """ Support vector of selected features.

        :type: list[bool]"""

        self.featured_selected_headings = []
        """ Selected subset of feature headings.

        :type: list[str]"""

        self.classify_headings = []
        """ Heading of classify stage

        :type: list[str]"""

        self.classify_rows = []
        """ Data table of input classify stage

        :type: np.ndarray"""

        self.classify_preprocessed_rows = []
        """ Data table of preprocessed data on classify stage

        :type: np.ndarray"""

        self.classify_featured_rows = []
        """ Data table of vectorized data to features on classify stage

        :type: np.ndarray"""

        self.predictions_headings = []
        """ List of heading names for predictions

        :type: list[str]"""

        self.predictions_rows = []
        """ Data date of predictions to be saved and uploaded to kaggle.

        :type: np.ndarray"""

    def do_load_train_tsv(self, file_path: str = None, max_rows=None):
        """ Read and load ``train.tsv``

        :param file_path: str - path to file
        :param max_rows: int or None - max number of rows to read, or None to read them all
        :return:
        """
        if file_path.startswith("file:///"):
            file_path = file_path[7:]
        self.file_path = file_path
        with open(file_path, "rt") as file:
            rdr = csv.reader(file, dialect="excel-tab")
            self.headings = next(rdr)
            no = 0
            self.main_pfcsamr_app.status_count = no
            for no, row in enumerate(rdr, 1):
                if no % 32 == 0:
                    self.main_pfcsamr_app.status_count = no
                self.rows.append(row)
                if max_rows is not None and no >= max_rows:
                    break
            self.main_pfcsamr_app.status_count = no

        self.main_pfcsamr_app.status_text = "Read {0} train samples".format(len(self.rows))
        self.main_pfcsamr_app.preproc_tab_enabled = True
        self.main_pfcsamr_app.features_tab_enabled = True
        self.main_pfcsamr_app.current_model = MyTableModel(self.headings, self.rows)
        return self

    def do_classify_test_tsv(self, file_path: str = None, max_rows=None):
        """ Read and load ``test.tsv``.

        :param file_path: str - path to file
        :param max_rows: int or None - max number of rows to read, or None to read them all
        :return:
        """
        if file_path.startswith("file:///"):
            file_path = file_path[7:]
        self.file_path = file_path
        self.classify_rows = []
        with open(file_path, "rt") as file:
            rdr = csv.reader(file, dialect="excel-tab")
            self.classify_headings = next(rdr)
            no = 0
            self.main_pfcsamr_app.status_count = no
            for no, row in enumerate(rdr, 1):
                if no % 32 == 0:
                    self.main_pfcsamr_app.status_count = no
                self.classify_rows.append(row)
                if max_rows is not None and no >= max_rows:
                    break
            self.main_pfcsamr_app.status_count = no

        self.main_pfcsamr_app.status_text = "Read {0} test samples".format(len(self.classify_rows))
        self.main_pfcsamr_app.current_model = MyTableModel(self.classify_headings, self.classify_rows)
        return self

    def do_preprocess(self):
        """ Do preprocess (train stage).

        :return:
        """
        from .replacers import RegexpReplacer as ContractionsExpander

        expander = ContractionsExpander()
        ws_tokenizer = nltk.WhitespaceTokenizer()
        stemmer = nltk.PorterStemmer()
        lemmatizer = nltk.WordNetLemmatizer()
        self.preprocessed_rows = []
        no = 0
        self.main_pfcsamr_app.status_count = no
        for no, row in enumerate(self.rows, 1):
            new_row = []
            for column in row:
                if is_text(column):
                    if self.main_pfcsamr_app.config["preproc_unsplit_contractions"]:
                        column = unsplit_contractions(column)
                    if self.main_pfcsamr_app.config["preproc_expand_contractions"]:
                        column = expander.replace(column)
                    if self.main_pfcsamr_app.config["preproc_remove_stopwords"]:
                        column = remove_stopwords(column)
                    if (
                        self.main_pfcsamr_app.config["preproc_word_replacement"]
                        and self.main_pfcsamr_app.config["preproc_stemmize"]
                    ):
                        column = " ".join([stemmer.stem(w) for w in ws_tokenizer.tokenize(column)])
                    if (
                        self.main_pfcsamr_app.config["preproc_word_replacement"]
                        and self.main_pfcsamr_app.config["preproc_lemmatize"]
                    ):
                        column = " ".join([lemmatizer.lemmatize(w) for w in ws_tokenizer.tokenize(column)])
                    if self.main_pfcsamr_app.config["preproc_pos_tag_words"]:
                        column = postag(column)

                new_row.append(column)

            self.preprocessed_rows.append(new_row)
            if no % 32 == 0:
                self.main_pfcsamr_app.status_count = no

        self.main_pfcsamr_app.status_count = no
        self.main_pfcsamr_app.status_text = "Preprocessed done"
        self.main_pfcsamr_app.features_tab_enabled = True
        self.main_pfcsamr_app.current_model = MyTableModel(self.headings, self.preprocessed_rows)
        return self

    def do_classify_preprocess(self):
        """ Do preprocess (classify stage).

        :return:
        """
        from .replacers import RegexpReplacer as ContractionsExpander

        expander = ContractionsExpander()
        ws_tokenizer = nltk.WhitespaceTokenizer()
        stemmer = nltk.PorterStemmer()
        lemmatizer = nltk.WordNetLemmatizer()
        self.classify_preprocessed_rows = []
        no = 0
        self.main_pfcsamr_app.status_count = no
        for no, row in enumerate(self.classify_rows, 1):
            new_row = []
            for column in row:
                if is_text(column):
                    if self.main_pfcsamr_app.config["preproc_unsplit_contractions"]:
                        column = unsplit_contractions(column)
                    if self.main_pfcsamr_app.config["preproc_expand_contractions"]:
                        column = expander.replace(column)
                    if self.main_pfcsamr_app.config["preproc_remove_stopwords"]:
                        column = remove_stopwords(column)
                    if (
                        self.main_pfcsamr_app.config["preproc_word_replacement"]
                        and self.main_pfcsamr_app.config["preproc_stemmize"]
                    ):
                        column = " ".join([stemmer.stem(w) for w in ws_tokenizer.tokenize(column)])
                    if (
                        self.main_pfcsamr_app.config["preproc_word_replacement"]
                        and self.main_pfcsamr_app.config["preproc_lemmatize"]
                    ):
                        column = " ".join([lemmatizer.lemmatize(w) for w in ws_tokenizer.tokenize(column)])
                    if self.main_pfcsamr_app.config["preproc_pos_tag_words"]:
                        column = postag(column)

                new_row.append(column)

            self.classify_preprocessed_rows.append(new_row)
            if no % 32 == 0:
                self.main_pfcsamr_app.status_count = no

        self.main_pfcsamr_app.status_count = no
        self.main_pfcsamr_app.status_text = "Preprocessed done"
        self.main_pfcsamr_app.current_model = MyTableModel(self.classify_headings, self.classify_preprocessed_rows)
        return self

    def do_features_countvectorizer(self, variance_threshold=None, **kwargs):
        """ Do feature extract and selection (train stage).

        :param variance_threshold: float or None - threshold for feature selection
        :param kwargs: dict - options passed to CountVectorizer
        :return:
        """
        self.main_pfcsamr_app.variance_warn_message = ""
        if not self.preprocessed_rows:
            self.preprocessed_rows = copy(self.rows)

        # TODO: do not hardcode these column definitions
        columns_names = self.headings
        columns_is_text = [False, False, True, False]
        columns_is_class = [False, False, False, True]

        train_y = []

        steps = []
        # steps.append(('numeric_feats', MyPipeline([
        #     ('selector', SelectNumerics(columns_is_text, columns_names, columns_is_class)),
        #     ('dict', DictVectorizer()),
        # ])))
        for column_i, column_is_text in enumerate(columns_is_text):
            if columns_is_class[column_i]:
                train_y = map(lambda x: float(x[column_i]), self.preprocessed_rows)
                train_y = np.array(list(train_y))
            else:
                if column_is_text:
                    steps.append(
                        (
                            columns_names[column_i],
                            MyPipeline(
                                [
                                    ("selector", SelectText(column_i=column_i)),
                                    ("count_vector", CountVectorizer(**kwargs)),
                                ]
                            ),
                        )
                    )

        self.feature_union = FeatureUnion(steps)
        self.featured_rows = self.feature_union.fit_transform(self.preprocessed_rows, train_y)
        self.featured_headings = deepcopy(self.feature_union.get_feature_names())
        self.train_y = train_y

        variance_too_high = False
        if variance_threshold is not None:
            thresholder = VarianceThreshold(threshold=variance_threshold)
            try:
                self.featured_rows = thresholder.fit_transform(self.featured_rows)
                self.featured_support = thresholder.get_support()
                self.featured_selected_headings = [
                    self.featured_headings[i] for i, v in enumerate(self.featured_support) if v
                ]
                self.main_pfcsamr_app.variance_warn_message = ""
            except ValueError:
                traceback.print_exc()
                self.featured_rows = np.empty_like(self.featured_rows)
                self.featured_support = []
                self.featured_selected_headings = []
                self.main_pfcsamr_app.variance_warn_message = "threshold too high!!!"
                variance_too_high = True
        else:
            self.main_pfcsamr_app.variance_warn_message = ""
            self.featured_support = [True] * self.featured_rows.shape[1]
            self.featured_selected_headings = deepcopy(self.featured_headings)

        if not variance_too_high:
            self.main_pfcsamr_app.learn_tab_enabled = True
            self.main_pfcsamr_app.current_model = MyTableModel(self.featured_selected_headings, self.featured_rows)
            self.main_pfcsamr_app.status_text = "Feature extraction done. Shape of useful features: %s. Removed %d." % (
                str(self.featured_rows.shape),
                len(self.featured_headings) - len(self.featured_selected_headings),
            )

        return self

    def do_classify_features_countvectorizer(self):
        """ Do feature extract and selection (classify stage).
        :return:
        """
        if not self.classify_preprocessed_rows:
            self.classify_preprocessed_rows = copy(self.classify_rows)

        self.classify_featured_rows = self.feature_union.transform(self.classify_preprocessed_rows)

        # apply feature support
        self.classify_featured_rows = self.classify_featured_rows[:, self.featured_support]

        self.main_pfcsamr_app.current_model = MyTableModel(self.featured_selected_headings, self.classify_featured_rows)
        self.main_pfcsamr_app.status_text = "Feature extraction done. Shape of useful features: %s." % (
            str(self.classify_featured_rows.shape),
        )

        return self

    def do_learn(self, estimator_klazz: object, train_split: float = 0.75, **estimator_klazz_params):
        """ Do learn (train stage).

        :param estimator_klazz: class - class object of sklearn estimator
        :param train_split: float - percentage for train and test subsets split
        :param estimator_klazz_params: kwargs passed to estimator_klazz constructor
        :return:
        """
        if self.main_pfcsamr_app.learn_train_split_resplit and train_split is not None:
            self.already_splitted = False
            logger.debug("Re-Splitting")

            def gui_callback():
                print("callback has been called")
                self.main_pfcsamr_app.learn_train_split_resplit = False

            self.main_pfcsamr_app.queue.put_nowait(gui_callback)

        if not self.already_splitted:
            if train_split is not None:
                self.featured_rows_train, self.featured_rows_test, self.train_y_train, self.train_y_test = train_test_split(
                    self.featured_rows, self.train_y, train_size=train_split
                )
                logger.debug("Splitted")
            else:
                self.featured_rows_train, self.featured_rows_test, self.train_y_train, self.train_y_test = (
                    self.featured_rows,
                    [],
                    self.train_y,
                    [],
                )
                logger.debug("Not Splitted")
            self.already_splitted = True

        if estimator_klazz in [GaussianNB, LDA, QDA]:
            x_train = self.featured_rows_train.toarray()
            x_test = self.featured_rows_test.toarray()
        else:
            x_train = self.featured_rows_train
            x_test = self.featured_rows_test

        self.estimators[estimator_klazz.__name__] = estimator_klazz(**estimator_klazz_params)
        self.estimators[estimator_klazz.__name__].fit(x_train, self.train_y_train)

        if train_split:
            score_name = "selftest_score_" + estimator_klazz.__name__.lower()

            def gui_callback():
                self.main_pfcsamr_app.config[score_name] = self.estimators[estimator_klazz.__name__].score(
                    x_test, self.train_y_test
                )
                self.main_pfcsamr_app.config = {score_name: str(self.main_pfcsamr_app.config[score_name])}
                self.main_pfcsamr_app.classify_tab_enabled = True
                self.main_pfcsamr_app.status_text = "Learned using {0}".format(estimator_klazz.__name__)
                print("{0}: {1}".format(score_name, self.main_pfcsamr_app.config[score_name]))

            self.main_pfcsamr_app.queue.put_nowait(gui_callback)

    def do_classify_classify(self, evaluate_using: str):
        """ Compute predictions (classify stage).

        :param evaluate_using: str - fqcn of trained estimator
        :return:
        """
        my_estimator = self.estimators[evaluate_using]
        """:type: LinearClassifierMixin"""
        predictions = my_estimator.predict(self.classify_featured_rows.toarray())
        # PhraseId, SentenceId, Phrase, *Sentiment*, **Features**
        self.predictions_headings = self.headings + self.featured_headings
        self.predictions_rows = np.c_[np.array(self.classify_rows), predictions.T.astype(int)]
        self.main_pfcsamr_app.current_model = MyTableModel(self.predictions_headings, self.predictions_rows)
        self.main_pfcsamr_app.status_text = "Predictions done using {0}".format(my_estimator.__class__.__name__)
        return self

    def classify_save_csv(self, filename: str):
        """ Save ``submission.csv`` for uploading it to kaggle

        :param filename: str - path to file save to
        :return:
        """
        with open(filename, "wt") as file:
            writer = csv.writer(file)
            writer.writerow(["PhraseId", "Sentiment"])
            writer.writerows(self.predictions_rows[:, (0, 3)].astype(int))
        self.main_pfcsamr_app.status_text = "Saved predictions to {0}".format(filename)
Esempio n. 48
0
    def test_same_result_withdictrdd(self):
        X, X_rdd = self.make_text_rdd(2)
        Y_rdd = ArrayRDD(self.sc.parallelize([None] * len(X), 4), bsize=2)
        Z = DictRDD([X_rdd, Y_rdd], columns=("X", "y"), bsize=2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        loc_word_2 = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")
        dist_word_2 = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word),
            ("words2", loc_word_2)
        ])
        dist_union = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word),
            ("words2", dist_word_2)
        ])
        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        converted_union = dist_union.to_scikit()

        # assert_equal compares only its first two arguments (a third positional
        # argument is treated as the failure message), so check the converted
        # union in a separate assertion.
        assert_equal(loc_union.get_feature_names(),
                     dist_union.get_feature_names())
        assert_equal(loc_union.get_feature_names(),
                     converted_union.get_feature_names())

        # test same results
        Z_transformed = sp.vstack(dist_union.transform(Z)[:, 'X'].collect())
        assert_array_equal(loc_union.transform(X).toarray(), Z_transformed.toarray())
        assert_array_equal(loc_union.transform(X).toarray(),
                           converted_union.transform(X).toarray())
        # test same results with fit_transform
        X_transformed = loc_union.fit_transform(X)
        X_converted_transformed = converted_union.fit_transform(X)
        Z_transformed = sp.vstack(dist_union.fit_transform(Z)[:, 'X'].collect())

        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        assert_array_equal(X_transformed.toarray(),
                           X_converted_transformed.toarray())
        # test same results in parallel
        loc_union_par = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ], n_jobs=2)
        dist_union_par = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ], n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        converted_union = dist_union_par.to_scikit()
        X_transformed = loc_union_par.transform(X)
        Z_transformed = sp.vstack(dist_union_par.transform(Z)[:, 'X'].collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        assert_array_equal(X_transformed.toarray(),
                           converted_union.transform(X).toarray())
Esempio n. 49
0
        preprocessor=get_col('description'))),
    ('title', CountVectorizer(
        ngram_range=(1, 2),
        stop_words=russian_stop,
        # max_features=7000,
        preprocessor=get_col('title')))
])

start_vect = time.time()

# Fit my vectorizer on the entire dataset instead of the training rows
# Score improved by .0001
vectorizer.fit(df.to_dict('records'))

ready_df = vectorizer.transform(df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
print("Vectorization Runtime: %0.2f Minutes" % ((time.time() - start_vect) / 60))

# Drop Text Cols
textfeats = ["description", "title"]
df.drop(textfeats, axis=1, inplace=True)

from sklearn.metrics import mean_squared_error
from math import sqrt

ridge_params = {'alpha': 30.0, 'fit_intercept': True, 'normalize': False, 'copy_X': True,
                'max_iter': None, 'tol': 0.001, 'solver': 'auto', 'random_state': SEED}

# Ridge oof method from Faron's kernel
# I was using this to analyze my vectorization, but figured it would be interesting to add the results back into the dataset
# It doesn't really add much to the score, but it does help lightgbm converge faster
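The ens_ridge helper invoked in the next example is not defined in this excerpt; a minimal out-of-fold sketch under an assumed name, signature, and fold count:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

def ens_ridge(train_X, train_y, train_row, test_X, test_row, n_folds=5, seed=42):
    oof_train = np.zeros(train_row)
    oof_test = np.zeros(test_row)
    folds = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for fit_idx, val_idx in folds.split(np.arange(train_row)):
        model = Ridge(alpha=30.0, random_state=seed)
        model.fit(train_X[fit_idx], train_y[fit_idx])
        # Out-of-fold predictions for the training rows, averaged predictions for the test rows.
        oof_train[val_idx] = model.predict(train_X[val_idx])
        oof_test += model.predict(test_X) / n_folds
    return oof_train, oof_test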
Esempio n. 50
0
    # Ridge Feature Processing
    ridge_train, ridge_test = ens_ridge(ready_df[:train_row],
                                        train_y, train_row,
                                        ready_df[train_row:], test_row)
    ridge_preds = np.concatenate([ridge_train, ridge_test])
    df['ridge_preds'] = ridge_preds

    # NN Feature Processing
    # nn_train, nn_test = ens_nn(ready_df[:train_row],
    #                           train_y, train_row,
    #                           ready_df[train_row:], test_row)
    # nn_preds = np.concatenate([nn_train, nn_test])
    # df['nn_preds'] = nn_preds

    # Feature Stack
    tfvocab = vectorizer.get_feature_names()
    tfvocab = df.columns.tolist() + tfvocab
    logger.info('Feature Names Length:{}'.format(len(tfvocab)))

    csr_train_X = csr_matrix(df.loc[train_index, :].values)
    csr_test_X = csr_matrix(df.loc[test_index, :].values)
    train_X = hstack([csr_train_X, ready_df[:train_row]])
    test_X = hstack([csr_test_X, ready_df[train_row:]])
    # train_X = hstack([csr_matrix(df.loc[train_index, :].values), ready_df[:train_row]])
    # test_X = hstack([csr_matrix(df.loc[test_index, :].values), ready_df[train_row:]])
    
    del df
    gc.collect()

    # Train Data Split
    train_X, valid_X, train_y, valid_y = train_test_split(train_X,
Esempio n. 51
0
def data_vectorize(df):
    russian_stop = set(stopwords.words("russian"))
    tfidf_para = {
        "stop_words": russian_stop,
        "analyzer": "word",
        "token_pattern": r"\w{1,}",
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": "l2",
        # "min_df": 5,
        # "max_df": .9,
        "smooth_idf": False
    }

    tfidf_para2 = {
        "stop_words": russian_stop,
        "analyzer": "char",
        "token_pattern": r"\w{1,}",
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": "l2",
        # "min_df":5,
        # "max_df":.9,
        "smooth_idf": False
    }

# mean rmse is: 0.23865288181138436
    
    def get_col(col_name): return lambda x: x[col_name]
    vectorizer = FeatureUnion([
        ("description", TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=40000,#40000,18000
            **tfidf_para,
            preprocessor=get_col("description"))
         ),
#         ("title_description", TfidfVectorizer(
#              ngram_range=(1, 2),#(1,2)
#              max_features=1800,#40000,18000
#              **tfidf_para,
#              preprocessor=get_col("title_description"))
#           ), 
        ("text_feature", CountVectorizer(
            ngram_range=(1, 2),
            preprocessor=get_col("text_feature"))
         ),
      
        ("title", TfidfVectorizer(
            ngram_range=(1, 2),
            **tfidf_para,
            preprocessor=get_col("title"))
         ),
        # Newly added two text features: title2, title_char
        ("title2", TfidfVectorizer(
            ngram_range=(1, 1),
            **tfidf_para,
            preprocessor=get_col("title"))
         ),

        ("title_char", TfidfVectorizer(

            ngram_range=(1, 4),#(1, 4),(1,6)
            max_features=16000,#16000
            **tfidf_para2,
            preprocessor=get_col("title"))
         ),
    ])
    vectorizer.fit(df.to_dict("records"))
    ready_full_df = vectorizer.transform(df.to_dict("records"))    
    tfvocab = vectorizer.get_feature_names()
    df.drop(["text_feature", "text_feature_2", "description","title", "title_description"], axis=1, inplace=True)
    df.fillna(-1, inplace=True)     
    return df, ready_full_df, tfvocab