def __init__(self,
                 n_tfidf_node=None,
                 t_ngrams_node=None,
                 b_tfidf_node_lc=None,
                 n_tfidf_edge=None,
                 t_ngrams_edge=None,
                 b_tfidf_edge_lc=None,
                 bMirrorPage=True,
                 bMultiPage=True):
        FeatureDefinition.__init__(self)

        self.n_tfidf_node, self.t_ngrams_node, self.b_tfidf_node_lc = n_tfidf_node, t_ngrams_node, b_tfidf_node_lc
        self.n_tfidf_edge, self.t_ngrams_edge, self.b_tfidf_edge_lc = n_tfidf_edge, t_ngrams_edge, b_tfidf_edge_lc
        self.bMirrorPage = bMirrorPage
        self.bMultiPage = bMultiPage
        tfidfNodeTextVectorizer = TfidfVectorizer(
            lowercase=self.b_tfidf_node_lc,
            max_features=self.n_tfidf_node,
            analyzer='char',
            ngram_range=self.t_ngrams_node,  # e.g. (2, 6)
            dtype=np.float64)

        node_transformer = FeatureUnion(
            [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
                (
                    "text",
                    Pipeline([
                        ('selector', NodeTransformerTextEnclosed()),
                        #                                                         ('tfidf', TfidfVectorizer(lowercase=self.b_tfidf_node_lc, max_features=self.n_tfidf_node
                        #                                                                                   , analyzer = 'char', ngram_range=self.tNODE_NGRAMS #(2,6)
                        #                                                                                   , dtype=np.float64)),
                        (
                            'tfidf', tfidfNodeTextVectorizer
                        ),  #we can use it separately from the pipeline once fitted
                        ('todense', SparseToDense()
                         )  #pystruct needs an array, not a sparse matrix
                    ])),
                (
                    "textlen",
                    Pipeline([
                        ('selector', NodeTransformerTextLen()),
                        ('textlen',
                         QuantileTransformer(n_quantiles=self.n_QUANTILES,
                                             copy=False)
                         )  #use in-place scaling
                    ])),
                (
                    "xywh",
                    Pipeline([
                        ('selector', NodeTransformerXYWH()),
                        #v1 ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                        ('xywh',
                         QuantileTransformer(n_quantiles=self.n_QUANTILES,
                                             copy=False)
                         )  #use in-place scaling
                    ])),
                (
                    "neighbors",
                    Pipeline([
                        ('selector', NodeTransformerNeighbors()),
                        #v1 ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                        ('neighbors',
                         QuantileTransformer(n_quantiles=self.n_QUANTILES,
                                             copy=False)
                         )  #use in-place scaling
                    ])),
                (
                    "1hot",
                    Pipeline([('1hot', Node1HotFeatures()
                               )  #does the 1-hot encoding directly
                              ]))
                #, ("sem", Pipeline([
                #                     ('sem', NodeSemanticLabels())  #add semantic labels
                #                     ])
                #  )  # Added  by Animesh
                #                                     , ('ocr' , Pipeline([
                #                                                          ('ocr', NodeOCRFeatures())
                #                                                          ])
                #                                        )
                #                                     , ('pnumre' , Pipeline([
                #                                                          ('pnumre', NodePNumFeatures())
                #                                                          ])
                #                                        )
                #                                     , ("doc_tfidf", Pipeline([
                #                                                          ('zero', Zero2Features())
#                                                          #THIS ONE MUST BE LAST, because it includes a placeholder column for the document-level tfidf
                #                                                          ])
                #                                        )
            ])

        lEdgeFeature = [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
            ("1hot",
             Pipeline([('1hot',
                        Edge1HotFeatures(PageNumberSimpleSequenciality()))])),
            ("boolean", Pipeline([('boolean', EdgeBooleanFeatures())])),
            (
                "numerical",
                Pipeline([
                    ('selector', EdgeNumericalSelector()),
                    #v1 ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                    ('numerical',
                     QuantileTransformer(n_quantiles=self.n_QUANTILES,
                                         copy=False))  #use in-place scaling
                ])),
            (
                "sourcetext0",
                Pipeline([
                    ('selector',
                     EdgeTransformerSourceText(0,
                                               bMirrorPage=bMirrorPage,
                                               bMultiPage=bMultiPage)),
                    (
                        'tfidf',
                        TfidfVectorizer(
                            lowercase=self.b_tfidf_edge_lc,
                            max_features=self.n_tfidf_edge,
                            analyzer='char',
                            ngram_range=self.t_ngrams_edge,  # e.g. (2, 6)
                            dtype=np.float64)),
                    ('todense', SparseToDense()
                     )  #pystruct needs an array, not a sparse matrix
                ])),
            (
                "targettext0",
                Pipeline([
                    ('selector',
                     EdgeTransformerTargetText(0,
                                               bMirrorPage=bMirrorPage,
                                               bMultiPage=bMultiPage)),
                    (
                        'tfidf',
                        TfidfVectorizer(
                            lowercase=self.b_tfidf_edge_lc,
                            max_features=self.n_tfidf_edge,
                            analyzer='char',
                            ngram_range=self.t_ngrams_edge,
                            #, analyzer = 'word', ngram_range=self.tEDGE_NGRAMS
                            dtype=np.float64)),
                    ('todense', SparseToDense()
                     )  #pystruct needs an array, not a sparse matrix
                ])),
            (
                "sourcetext1",
                Pipeline([
                    ('selector',
                     EdgeTransformerSourceText(1,
                                               bMirrorPage=bMirrorPage,
                                               bMultiPage=bMultiPage)),
                    (
                        'tfidf',
                        TfidfVectorizer(
                            lowercase=self.b_tfidf_edge_lc,
                            max_features=self.n_tfidf_edge,
                            analyzer='char',
                            ngram_range=self.t_ngrams_edge,  # e.g. (2, 6)
                            dtype=np.float64)),
                    ('todense', SparseToDense()
                     )  #pystruct needs an array, not a sparse matrix
                ])),
            (
                "targettext1",
                Pipeline([
                    ('selector',
                     EdgeTransformerTargetText(1,
                                               bMirrorPage=bMirrorPage,
                                               bMultiPage=bMultiPage)),
                    (
                        'tfidf',
                        TfidfVectorizer(
                            lowercase=self.b_tfidf_edge_lc,
                            max_features=self.n_tfidf_edge,
                            analyzer='char',
                            ngram_range=self.t_ngrams_edge,
                            #, analyzer = 'word', ngram_range=self.tEDGE_NGRAMS
                            dtype=np.float64)),
                    ('todense', SparseToDense()
                     )  #pystruct needs an array, not a sparse matrix
                ]))
        ]
        if bMultiPage:
            lEdgeFeature.extend([
                (
                    "sourcetext2",
                    Pipeline([
                        ('selector',
                         EdgeTransformerSourceText(2,
                                                   bMirrorPage=bMirrorPage,
                                                   bMultiPage=bMultiPage)),
                        (
                            'tfidf',
                            TfidfVectorizer(
                                lowercase=self.b_tfidf_edge_lc,
                                max_features=self.n_tfidf_edge,
                                analyzer='char',
                                ngram_range=self.t_ngrams_edge,  # e.g. (2, 6)
                                dtype=np.float64)),
                        ('todense', SparseToDense()
                         )  #pystruct needs an array, not a sparse matrix
                    ])),
                (
                    "targettext2",
                    Pipeline([
                        ('selector',
                         EdgeTransformerTargetText(2,
                                                   bMirrorPage=bMirrorPage,
                                                   bMultiPage=bMultiPage)),
                        (
                            'tfidf',
                            TfidfVectorizer(
                                lowercase=self.b_tfidf_edge_lc,
                                max_features=self.n_tfidf_edge,
                                analyzer='char',
                                ngram_range=self.t_ngrams_edge,
                                #, analyzer = 'word', ngram_range=self.tEDGE_NGRAMS
                                dtype=np.float64)),
                        ('todense', SparseToDense()
                         )  #pystruct needs an array, not a sparse matrix
                    ]))
            ])

        edge_transformer = FeatureUnion(lEdgeFeature)

        #return _node_transformer, _edge_transformer, tfidfNodeTextVectorizer
        self._node_transformer = node_transformer
        self._edge_transformer = edge_transformer
        self.tfidfNodeTextVectorizer = tfidfNodeTextVectorizer
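# Note: SparseToDense is a project-specific helper, not part of scikit-learn.
# A minimal sketch of what such a transformer presumably does (dense output is
# needed because pystruct cannot consume sparse matrices):
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class SparseToDense(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self                         # stateless
    def transform(self, X):
        return np.asarray(X.todense())      # scipy sparse matrix -> dense ndarray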
Example #2
         "age", "balance", "day", "campaign", "pdays", "previous", "duration"
     ])),
    ("std_scaler", StandardScaler()),
])

categorical_pipeline = Pipeline([
    ("select_cat",
     DataFrameSelector([
         "job", "education", "marital", "default", "housing", "loan",
         "contact", "month", "poutcome"
     ])), ("cat_encoder", CategoricalEncoder(encoding='onehot-dense'))
])

from sklearn.pipeline import FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
    ("numerical_pipeline", numerical_pipeline),
    ("categorical_pipeline", categorical_pipeline),
])
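# Note: DataFrameSelector is not a scikit-learn class but the usual hand-rolled
# column picker used in this style of notebook; CategoricalEncoder only existed in
# a scikit-learn pre-release (OneHotEncoder with dense output is the current
# replacement). A sketch of the selector, under those assumptions:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values   # numpy array of the chosen columns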

# In[103]:

X_train = preprocess_pipeline.fit_transform(train_data)
X_train

# In[42]:

y_train = train_data['deposit']
y_test = test_data['deposit']
y_train.shape

# In[43]:
Example #3
        return X


# In[ ]:

num_pipeline = Pipeline([('selector', DataFrameSelector(num_attri)),
                         ('imputer',
                          sklearn.preprocessing.Imputer(strategy="median")),
                         ('standard_scaler', StandardScaler())])

cat_pipeline = Pipeline([('selector', DataFrameSelector(cat_attri)),
                         ('fillna', Fillna_for_cat(["Embarked"])),
                         ('1hot_encoder', MultiLabelBinarizer())])
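# Caveat: MultiLabelBinarizer.fit takes only y, not the (X, y) pair a Pipeline
# passes to its steps, so the cat_pipeline above typically raises a TypeError.
# A thin wrapper (a sketch, not part of the original code) is the usual workaround:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer

class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mlb = MultiLabelBinarizer()
    def fit(self, X, y=None):
        self.mlb.fit(X)              # ignore y to satisfy the Pipeline interface
        return self
    def transform(self, X):
        return self.mlb.transform(X)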

full_pipeline = FeatureUnion(
    transformer_list=[("num_pipeline",
                       num_pipeline), ("cat_pipeline", cat_pipeline)])

train_prepared = full_pipeline.fit_transform(train_attri_dropped)

# In[ ]:

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

X = train_prepared
y = train_labels.values

clf = SVC()

clf.fit(X, y)
Example #4
# using One Hot Encoder (need to integer encode first)
# using get_dummy (so allowing for k-1 levels)

### (6) Initiate Pipelines ----------------------------------------------------

# pipeline to extract ad-hoc features (saved as dataframe columns) from data
pipe_adhoc_features = Pipeline([(
    'adhoc',
    FeatureUnion([

        # pipeline for categorical features
        ('cat',
         Pipeline([('selector', ColumnSelector(columns=['is_reply'])),
                   ('todictionary', CatToDictTransformer()),
                   ('dv', DictVectorizer())])),

        # pipeline for numerical features
        ('num',
         Pipeline([('selector',
                    ColumnSelector(columns=[
                        'count_punkt', 'count_ADJ', 'count_ADV',
                        'subjectivity_text', 'VDR_polarity_text'
                    ])), ('scaler', StandardScaler())]))
    ]))])

# Pipeline for text-features (bag-of-words)

vec = CountVectorizer(analyzer="word",
                      ngram_range=(1, 3),
                      stop_words='english',
                      tokenizer=word_tokenize,
                      max_features=10000)
Example #5
def main(logger=None):
    ''' Main routine to call the entire process flow ''';

    # Load_Dataset --- Process starts

    logger.info(f'');
    logger.info(f'{"-"*20} Load dataset starts here {"-"*20}');
    logger.info(f'');

    # TODO: DONE; Load Cancer dataset;

    cancer_data_dict = datasets.load_breast_cancer();
    cancer_data_pd = convert2pandas_df(x_array=cancer_data_dict['data'],
                      y=[ cancer_data_dict['target_names'][i] for i in cancer_data_dict['target'] ],
                      # feature_names=iris_dict['feature_names'],
                      feature_names=list(cancer_data_dict['feature_names']),
                      target_name='Target');

    # logger.info(f'{cancer_data_pd.head()}');

    sns.lmplot( x="area error", y="compactness error", data=cancer_data_pd, fit_reg=False, hue='Target', legend=False,
               palette=dict(malignant="#BF0C2B", benign="#02173E")); # , versicolor="#F5900E"));
    plt.legend(loc='lower right');
    chart_save_image(plt=plt, f_size=(8, 8), left=0.125, right=0.9, bottom=0.125, top=0.9, wspace=0.0, hspace=0.0, fileName='./Cancer_Data_Plot.png');

    selected_columns = ['Target', 'mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity',
                        'mean concave points', 'mean symmetry'];

    g = sns.pairplot(cancer_data_pd[selected_columns], hue="Target", diag_kind="kde",  palette=dict(malignant="#BF0C2B", benign="#02173E"), diag_kws=dict(shade=True));
    for i, j in zip(*np.triu_indices_from(g.axes, 1)):
        g.axes[i, j].set_visible(False);
    chart_save_image(plt=plt, f_size=(16, 16), left=0.05, right=0.97, bottom=0.05, top=0.97, wspace=0.02, hspace=0.02, fileName='./Cancer_Data_PairPlot.png');

    logger.info(f'');
    logger.info(f'{"-"*20}  Load dataset ends here {"-"*20}');
    logger.info(f'');

    # Load_Dataset --- Process ends

    # __Placeholder__ --- Process Starts

    # TODO: DONE; 001; Train test split; stratified;
    X_train, X_test, y_train, y_test = train_test_split(cancer_data_pd[cancer_data_dict.feature_names],
                                                        # cancer_data_pd['Target'],
                                                        cancer_data_dict['target'], # Has to be binary for scorer F1 and Percision;
                                                        test_size=0.20,
                                                        # stratify=cancer_data_pd['Target'],
                                                        stratify=cancer_data_dict['target'],
                                                        random_state=111,
                                                        shuffle=True);

    logger.info(f'X_train.shape : {X_train.shape}');
    logger.info(f'X_test.shape  : {X_test.shape}');
    logger.info(f'Y_train.shape : {y_train.shape}');
    logger.info(f'Y_test.shape  : {y_test.shape}');

    # TODO: DONE; 002; Dummy Classifier ;

    # dummy_classifier = DummyClassifier(strategy="stratified");
    dummy_classifier = DummyClassifier(strategy="most_frequent");

    # TODO: DONE; 003; Cross_over_score and predict and Metrics (make_scorer)

    accuracy_scorer = make_scorer(cost_accuracy, greater_is_better=True);
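    # Note: cost_accuracy is defined elsewhere in this script; make_scorer only
    # needs a plain metric function, so a minimal stand-in (an assumption, not the
    # original implementation) would be:
    #     def cost_accuracy(y_true, y_pred):
    #         return metrics.accuracy_score(y_true, y_pred);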

    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=111);  # shuffle is required for random_state to take effect
    # results = model_selection.cross_val_score(dummy_classifier, X_train, y_train, cv=kfold, scoring='accuracy');
    # logger.info(f'{results} {np.mean(results)} {np.var(results)} {np.std(results)}');

    results = model_selection.cross_val_score(dummy_classifier, X_train, y_train, cv=kfold, scoring=accuracy_scorer);
    logger.info(f'{results} {np.mean(results)} {np.var(results)} {np.std(results)}');

    DummyClassifier_mean = np.mean(results);

    # TODO: DONE; 004; Standardization ;

    # std_scaler = preprocessing.StandardScaler();  # Contains the negative values
    std_scaler = preprocessing.MinMaxScaler(); # Range between 0 to 1; No negative terms;
    std_scaler = std_scaler.fit(X_train);
    scaled_X_train = pd.DataFrame(std_scaler.transform(X_train), columns=X_train.columns);

    logger.info(f'{X_train["mean radius"].describe()}');
    logger.info(f'{scaled_X_train["mean radius"].describe()}');

    # TODO: DONE; 005; SelectKBest; Feature selection ;

    # selectKbest_est = SelectKBest(chi2, k=4); f_classif
    selectKbest_est = SelectKBest(f_classif, k=8);
    selectKbest_X_train = selectKbest_est.fit_transform(X_train, y_train);

    logger.info(f'{selectKbest_est.get_params(deep=True)}');
    logger.info(f'{selectKbest_est.get_support(indices=False)}');
    logger.info(f'{selectKbest_est.get_support(indices=True)}');
    logger.info(f'{X_train.columns[selectKbest_est.get_support(indices=True)]}');

    # TODO: DONE; 006; Polynomial Features ;

    poly = preprocessing.PolynomialFeatures(degree=2, include_bias=False, interaction_only=False);
    X_train_poly = poly.fit_transform(X_train);
    X_train_p2 = pd.DataFrame(X_train_poly, columns=poly.get_feature_names(X_train.columns));

    lr = linear_model.LogisticRegression(fit_intercept=False, random_state=111);
    results = model_selection.cross_val_score(lr, X_train_p2, y_train, cv=kfold, scoring=accuracy_scorer); # , verbose=True);

    imp_percentage = round((np.mean(results) - DummyClassifier_mean) / DummyClassifier_mean, 4);

    logger.info(f'DummyClassifier accuracy : {DummyClassifier_mean}');
    logger.info(f'LogisticRegression accuracy : {np.mean(results)}');

    logger.info(f'The improvement over the DummyClassifier is : {imp_percentage}');

    # TODO: DONE; 007; Kernel PCA ;

    # kernel_param = ('rbf', 0.25);
    kernel_param = ('rbf', 1);

    kpca = KernelPCA(n_components=4, kernel=kernel_param[0], gamma=kernel_param[1], fit_inverse_transform=True, random_state=111) # n_jobs=-1,
    kpca.fit(scaled_X_train);   # The data has to be scaled;
    kpca_X_train = kpca.transform(scaled_X_train);

    lr = linear_model.LogisticRegression(fit_intercept=False, random_state=111);
    results = model_selection.cross_val_score(lr, kpca_X_train, y_train, cv=kfold, scoring=accuracy_scorer); # , verbose=True);

    imp_percentage = round((np.mean(results) - DummyClassifier_mean) / DummyClassifier_mean, 4);

    logger.info(f'DummyClassifier accuracy : {DummyClassifier_mean}');
    logger.info(f'LogisticRegression accuracy : {np.mean(results)}');

    logger.info(f'The improvement over the DummyClassifier is : {imp_percentage}');

    # TODO: DONE; 008; Grid-Search ;

    # tuned_parameters = [{
    #                      'n_estimators' : [1, 10, 100, 500, 1000, 2000],
    #                      'max_depth' : [10, 20],
    #                      'max_features' : [0.80, 0.40],
    #                      'random_state' : [111]
    #                      }];

    tuned_parameters = [{
                         'n_estimators' : [1, 10],
                         'max_depth' : [10, 20],
                         'max_features' : [0.80, 0.40],
                         'random_state' : [111]
                         }];

    clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5, scoring=accuracy_scorer);
    clf.fit(X_train, y_train);

    logger.info(f'Best parameters set found on development set: {clf.best_score_} {clf.best_params_}');
    logger.info('');
    logger.info('Grid scores on development set:');
    logger.info('');
    means = clf.cv_results_['mean_test_score'];
    stds = clf.cv_results_['std_test_score'];
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        logger.info(f'{round(mean,3)} (+/-{round(std*2,2)}) for {params}');
    logger.info('');

    logger.info('Detailed classification report:');
    logger.info('');
    logger.info('The model is trained on the full development set.');
    logger.info('The scores are computed on the full evaluation set.');
    logger.info('');
    y_true, y_pred = y_test, clf.predict(X_test);
    logger.info(f'{metrics.classification_report(y_true, y_pred)}');
    logger.info('');

    imp_percentage = round((clf.best_score_ - DummyClassifier_mean) / DummyClassifier_mean, 4);
    logger.info(f'DummyClassifier accuracy : {DummyClassifier_mean}');
    logger.info(f'GridSearchCV RandomForestClassifier accuracy : {clf.best_score_}');
    logger.info(f'The improvement over the DummyClassifier is : {imp_percentage}');

    # logger.info(f'{clf.best_estimator_}');

    # TODO: DONE; 009; Custom Transformer for the pipeline ;
    # reference : https://ramhiser.com/post/2018-04-16-building-scikit-learn-pipeline-with-pandas-dataframe/
    # http://philipmgoddard.com/modeling/sklearn_pipelines

    ctf = ColumnTypeFilter(np.number);
    ctf.fit_transform(X_train).head();
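    # Note: ColumnTypeFilter is the custom transformer described in the two links
    # above; a minimal version (a sketch, not the original) keeps columns of a dtype:
    #     class ColumnTypeFilter(BaseEstimator, TransformerMixin):
    #         def __init__(self, dtype):
    #             self.dtype = dtype
    #         def fit(self, X, y=None):
    #             return self
    #         def transform(self, X):
    #             return X.select_dtypes(include=[self.dtype])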

    # TODO: YTS; 010; Pipeline ;

    custom_pipeline = make_pipeline(
            FeatureUnion(transformer_list=[
                ('StdScl', make_pipeline(
                    ColumnTypeFilter(np.number),
                    preprocessing.StandardScaler()
                )),
                ('MMScl', make_pipeline(
                    ColumnTypeFilter(np.number),
                    preprocessing.MinMaxScaler()
                ))
            ])
    );

    custom_pipeline.fit(X_train);
    X_test_transformed = custom_pipeline.transform(X_test);

    logger.info(f'{X_test.shape} {type(X_test_transformed)} {X_test_transformed.shape}');

    # TODO: DONE; 011; Ensemble (VotingClassifier) and BaseClone;

    ensemble_clf = VotingClassifier(estimators=[
                            ('dummy', dummy_classifier),
                            ('logistic', lr),
                            # ('supportvector', SVC(probability=True)),
                            ('randomforest', RandomForestClassifier())],
                            voting='soft');

    ensemble_clf.fit(X_train, y_train);
    ensemble_clf_accuracy_ = cost_accuracy(y_test, ensemble_clf.predict(X_test));

    imp_percentage = round((ensemble_clf_accuracy_ - DummyClassifier_mean) / DummyClassifier_mean, 4);
    logger.info(f'DummyClassifier accuracy : {DummyClassifier_mean}');
    logger.info(f'GridSearchCV RandomForestClassifier accuracy : {ensemble_clf_accuracy_}');
    logger.info(f'The improvement over the DummyClassifier is : {imp_percentage}');

    # TODO: DONE; 012; One-hot encoder; Label Encoder; Binary Encoder;

    baby_names = ['Ava', 'Lily', 'Noah', 'Jacob', 'Mia', 'Sophia'];
    X_train_list = [ np.random.choice(baby_names) for i in range(40) ];
    X_test_list = [ np.random.choice(baby_names) for i in range(6) ];

    bb_labelencoder = preprocessing.LabelEncoder();
    bb_labelencoder.fit(X_train_list);
    bb_encoded = bb_labelencoder.transform(X_test_list);

    bb_onehotencoder = preprocessing.OneHotEncoder(sparse=False);
    bb_encoded = bb_encoded.reshape(len(bb_encoded), 1);
    bb_onehot = bb_onehotencoder.fit_transform(bb_encoded);

    for i, v in enumerate(X_test_list):
        logger.info(f'Actual : {v} \t | LabelEncoded : {bb_encoded[i][0]} \t | OneHot : {bb_onehot[i]}');
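    # Note: recent scikit-learn versions let OneHotEncoder consume string categories
    # directly, so the LabelEncoder round-trip is optional. An alternative sketch
    # (not the original flow):
    #     enc = preprocessing.OneHotEncoder(handle_unknown='ignore');
    #     enc.fit(np.array(X_train_list).reshape(-1, 1));
    #     bb_onehot = enc.transform(np.array(X_test_list).reshape(-1, 1)).toarray();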

    # TODO: DONE; 013; Feature Extraction from image and text;

    corpus = [  'This is the first document.',
                'This document is the second document.',
                'And this is the third one.',
                'Is this the first document?', ]

    vectorizer = CountVectorizer();
    X = vectorizer.fit_transform(corpus);

    cntvector_out = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names());

    for i, v in enumerate(corpus):
        logger.info(f'Input text : {v}');
        logger.info('Output count vector :');
        logger.info(f'{cntvector_out.iloc[i]}');
Example #6
def extract_features(docs_train, docs_test, perform_dimensionality_reduction):
    """ 
    We will extract features from the dataset, preprocess it and return the X_train and X_test
    
    @return:
        1. X_train: Feature matrix for training data
        2. X_test: Feature matrix for test data


    @Regions of improvement:
        1. Get more features and use them to get more accurate predictions 
   
    """
    word_ngram_range = (1, 4)
    char_ngram_range = (2, 5)
    '''
    Build a char_vectorizer and combine word_vectorizer and char_vectorizer to make an n_grams vectorizer
    '''

    word_vectorizer = TfidfVectorizer(preprocessor=preprocess_tweet,
                                      analyzer='word',
                                      ngram_range=word_ngram_range,
                                      min_df=2,
                                      use_idf=True,
                                      sublinear_tf=True)
    print(f'Created a word vectorizer')
    char_vectorizer = TfidfVectorizer(preprocessor=preprocess_tweet,
                                      analyzer='char',
                                      ngram_range=char_ngram_range,
                                      min_df=2,
                                      use_idf=True,
                                      sublinear_tf=True)
    print(f'Created a char vectorizer')

    ###############################################################################################
    ################## Count vectorizer -> which just computes the count of tokens ################
    '''
    Merge the two vectorizers using a pipeline
    '''
    ngrams_vectorizer = Pipeline([
        ('feats',
         FeatureUnion([('word_ngram', word_vectorizer),
                       ('char_ngram', char_vectorizer)])),
        # ('clff', LinearSVC(random_state=42))
    ])

    # fitTransform this thing
    X_train = ngrams_vectorizer.fit_transform(
        docs_train)  # this step can take a while on a large corpus
    X_test = ngrams_vectorizer.transform(docs_test)
    print("Performed fitting of data")
    ############ perform dimensionality reduction ################

    if perform_dimensionality_reduction:

        print("Performing dimensionality reduction")
        # use TruncatedSVD to reduce dimensionality of our dataset
        svd = TruncatedSVD(n_components=300, random_state=42)

        X_train = svd.fit_transform(X_train)
        X_test = svd.transform(X_test)
        print("Performed dimensionality reduction")

    # print(docs_train[0])
    return X_train, X_test
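# Note: preprocess_tweet is referenced above but not shown. A minimal placeholder
# with the same role (lower-casing and stripping URLs/mentions -- an assumption
# about what the original does) could be:
import re

def preprocess_tweet(text):
    text = text.lower()
    return re.sub(r'https?://\S+|@\w+', ' ', text)   # drop URLs and @mentions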
Example #7
    def _get_pipeline(nominal_indices, num_features, sklearn_model):
        if len(nominal_indices) > 0:
            with_mean = False
        else:
            with_mean = True
        numerical_indices = [
            i for i in range(num_features) if i not in nominal_indices
        ]

        numerical_preprocessing = ('NumericalPreprocessing',
                                   Pipeline([
                                       ('SelectNumerical',
                                        NumericalFeatureSelector(
                                            numerical_indices, )),
                                       ('NumericImputationFU',
                                        FeatureUnion([(
                                            'MissingIndicator',
                                            MissingIndicator(
                                                error_on_new=False),
                                        ),
                                                      (
                                                          'NumericalImputer',
                                                          NumericalImputer(),
                                                      )])),
                                       ('Scaler',
                                        StandardScaler(with_mean=with_mean)),
                                   ]))

        categorical_preprocessing = ('CategoricalPreprocessing',
                                     Pipeline([
                                         ('SelectCategorical',
                                          CategoricalFeatureSelector(
                                              nominal_indices, )),
                                         ('CategoricalImputer',
                                          CategoricalImputer()),
                                         ('OneHotEncoder',
                                          OneHotEncoder(
                                              categories='auto',
                                              sparse=True,
                                              handle_unknown='ignore',
                                          )),
                                     ]))

        joint_preprocessing = ('Preprocessing',
                               FeatureUnion([
                                   numerical_preprocessing,
                                   categorical_preprocessing,
                               ]))

        if len(nominal_indices) > 0 and len(numerical_indices) > 0:
            steps = [joint_preprocessing]
        elif len(nominal_indices) > 0:
            steps = [categorical_preprocessing]
        else:
            steps = [numerical_preprocessing]
        steps.extend([
            ('VarianceThreshold', VarianceThreshold()),
            ('Estimator', sklearn_model),
        ])

        return Pipeline(steps=steps)
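# Note: NumericalFeatureSelector, CategoricalFeatureSelector and the two imputers
# are project-specific. As an illustration of the selector half (an assumption,
# not the original code), an index-based column selector can be as small as:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class NumericalFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, indices):
        self.indices = indices
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return np.asarray(X)[:, self.indices]   # keep only the numerical columns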
Example #8
def main():

    xgb_params = {
        #'tree_method' : 'gpu_hist',
        'n_estimators': 1500,
        'objective': 'reg:linear',
        'booster': 'gbtree',
        'learning_rate': 0.02,
        'max_depth': 10,  #22,
        'min_child_weight': 57,
        'gamma': 1.45,
        'alpha': 0.0,
        'lambda': 0.0,
        'subsample': 0.67,
        'colsample_bytree': 0.054,
        'colsample_bylevel': 0.50,
        'n_jobs': -1,
        'random_state': 456
    }

    lgb_params = {
        'n_estimators': 1000,
        'boosting_type': 'gbdt',
        'learning_rate': 0.007,
        'num_leaves': 4000,
        'max_depth': 22,
        'n_jobs': -1,
        'random_state': 456
    }

    fit_params = {
        'early_stopping_rounds': 50,
        'eval_metric': 'rmse',
        'verbose': False,
    }

    xgb_cv = XGBRegressorCV(
        regressor=xgb.XGBRegressor,
        xgb_params=xgb_params,
        fit_params=fit_params,
        cv=10,
    )

    pipe = Pipeline(
        [
            #('vt', VarianceThreshold(threshold=0.0)),
            #('ut', UniqueTransformer()),
            ('fu',
             FeatureUnion([
                 ('pca1', PCA(n_components=100)),
                 ('spca1', SparsePCA(n_components=100)),
                 ('lda', LatentDirichletAllocation(n_components=100)),
                 ('fa1', FactorAnalysis(n_components=100)),
                 ('tsvd', TruncatedSVD(n_components=100)),
                 ('ct-2', ClassifierTransformer(get_rfc(50), n_classes=2,
                                                cv=5)),
                 ('ct-3', ClassifierTransformer(get_rfc(75), n_classes=3,
                                                cv=5)),
                 ('ct-4', ClassifierTransformer(
                     get_rfc(100), n_classes=4, cv=5)),
                 ('ct-5', ClassifierTransformer(
                     get_rfc(125), n_classes=5, cv=5)),
                 ('ct-10',
                  ClassifierTransformer(get_rfc(150), n_classes=10, cv=5)),
                 ('ct-20',
                  ClassifierTransformer(get_rfc(175), n_classes=20, cv=5)),
                 ('ct-100',
                  ClassifierTransformer(get_rfc(200), n_classes=100, cv=5)),
                 ('st', StatsTransformer(stat_funs=get_stat_funs(), verbose=2))
             ])),
            #('skb', SelectKBest(score_func=lambda X, y: score_features(X, y, estimator=xgb_cv), k=120)),
            ('xgb', xgb_cv),
        ],
        #memory = '.pipeline'
    )

    X_train, y_train_log, X_test, id_test = get_data()

    param_grid = [dict(xgb__max_depth=[11, 22, 44])]

    #pipe = GridSearchCV(_pipe, param_grid=param_grid)

    pipe.fit(X_train, y_train_log)
    #print(pipe.named_steps['xgb-cv'])
    print(pipe.named_steps['xgb'].cv_scores_)
    cv_score = pipe.named_steps['xgb'].cv_score_
    print(cv_score)

    #assert False

    y_pred_log = pipe.predict(X_test)
    y_pred = np.expm1(y_pred_log)

    filename = f'pipeline_kernel_xgb_fe_cv{cv_score}.csv'

    submission = pd.DataFrame()
    submission['ID'] = id_test
    submission['target'] = y_pred
    submission.to_csv(filename, index=None)

    print(
        f"kaggle competitions submit -c santander-value-prediction-challenge -f {filename} -m 'na'"
    )
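# Note: ClassifierTransformer, StatsTransformer, UniqueTransformer, XGBRegressorCV
# and get_rfc come from the original kernel and are not shown here. The idea behind
# ClassifierTransformer -- bin the continuous target into quantile classes and feed
# the classifier's predictions back in as an extra feature -- can be sketched
# roughly like this (an approximation, not the original implementation):
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class ClassifierTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, estimator, n_classes=2, cv=3):
        self.estimator = estimator
        self.n_classes = n_classes
        self.cv = cv                     # kept only to mirror the calls above
    def _bin_target(self, y):
        edges = np.percentile(y, np.linspace(0, 100, self.n_classes + 1)[1:-1])
        return np.digitize(y, edges)
    def fit(self, X, y=None):
        self.estimator.fit(X, self._bin_target(np.asarray(y)))
        return self
    def transform(self, X):
        return self.estimator.predict(X).reshape(-1, 1)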
Example #9
    def enhance_transactions(self):  # load training data
        self.training_data = ml.load_training_data(
            self.training_data,
            known_account=self.account,
            existing_entries=self.existing_entries)

        # convert training data to a list of TxnPostingAccounts
        self.converted_training_data = [
            ml.TxnPostingAccount(t, p, pRef.account)
            for t in self.training_data for pRef in t.postings
            for p in t.postings if p.account != pRef.account
        ]

        # train the machine learning model
        self._trained = False
        if not self.converted_training_data:
            logger.warning("Cannot train the machine learning model "
                           "because the training data is empty.")
        elif len(self.converted_training_data) < 2:
            logger.warning(
                "Cannot train the machine learning model "
                "because the training data consists of less than two elements."
            )
        else:
            transformers = []
            transformer_weights = {}
            transformers.append(
                ('narration',
                 Pipeline([
                     ('getNarration', ml.GetNarration()),
                     ('vect', CountVectorizer(ngram_range=(1, 3))),
                 ])))
            transformer_weights['narration'] = 0.8
            transformers.append(
                ('account',
                 Pipeline([
                     ('getReferencePostingAccount',
                      ml.GetReferencePostingAccount()),
                     ('vect', CountVectorizer(ngram_range=(1, 3))),
                 ])))
            transformer_weights['account'] = 0.8

            distinctPayees = set(
                map(lambda trx: trx.txn.payee, self.converted_training_data))
            if len(distinctPayees) > 1:
                transformers.append(
                    ('payee',
                     Pipeline([
                         ('getPayee', ml.GetPayee()),
                         ('vect', CountVectorizer(ngram_range=(1, 3))),
                     ])))
                transformer_weights['payee'] = 0.5

            transformers.append((
                'dayOfMonth',
                Pipeline([
                    ('getDayOfMonth', ml.GetDayOfMonth()),
                    ('caster',
                     ml.ArrayCaster()),  # need for issue with data shape
                ])))
            transformer_weights['dayOfMonth'] = 0.1

            self.pipeline = Pipeline([
                ('union',
                 FeatureUnion(transformer_list=transformers,
                              transformer_weights=transformer_weights)),
                ('svc', SVC(kernel='linear')),
            ])
            logger.debug("About to train the machine learning model...")
            self.pipeline.fit(
                self.converted_training_data,
                ml.GetPostingAccount().transform(self.converted_training_data))
            logger.info("Finished training the machine learning model.")
            self._trained = True

        if not self._trained:
            logger.warning(
                "Cannot generate predictions or suggestions "
                "because there is no trained machine learning model.")
            return self.imported_transactions

        # predict missing second postings
        self.transactions = self.imported_transactions
        if self.predict_second_posting:
            logger.debug(
                "About to generate predictions for missing second postings...")
            predicted_accounts: List[str]
            predicted_accounts = self.pipeline.predict(
                self.imported_transactions)
            self.transactions = [
                ml.add_posting_to_transaction(*t_a)
                for t_a in zip(self.transactions, predicted_accounts)
            ]
            logger.debug(
                "Finished adding predicted accounts to the transactions to be imported."
            )

        # suggest accounts that are likely involved in the transaction
        if self.suggest_accounts:
            # get values from the SVC decision function
            logger.debug(
                "About to generate suggestions about related accounts...")
            decision_values = self.pipeline.decision_function(
                self.imported_transactions)

            # add a human-readable class label (i.e., account name) to each value, and sort by value:
            suggestions = [[
                account for _, account in sorted(list(
                    zip(distance_values, self.pipeline.classes_)),
                                                 key=lambda x: x[0],
                                                 reverse=True)
            ] for distance_values in decision_values]

            # add the suggested accounts to each transaction:
            self.transactions = [
                ml.add_suggested_accounts_to_transaction(*t_s)
                for t_s in zip(self.transactions, suggestions)
            ]
            logger.debug(
                "Finished adding suggested accounts to the transactions to be imported."
            )

        return self.transactions
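# Note: the ml.* helpers belong to the importer's own machine-learning module. For
# orientation only, GetNarration and ArrayCaster are roughly of this shape (sketches
# under that assumption, not the actual implementations):
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class GetNarration(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, txns):
        return [t.txn.narration for t in txns]           # strings for CountVectorizer

class ArrayCaster(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return np.array(X, dtype=float).reshape(-1, 1)   # 2-D column for sklearn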
    "sublinear_tf": True,
    "dtype": np.float32,
    "norm": 'l2',
    #"min_df":5,
    #"max_df":.9,
    "smooth_idf":False
}
def get_col(col_name): return lambda x: x[col_name]
vectorizer = FeatureUnion([
        ('description',TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=16000,
            **tfidf_para,
            preprocessor=get_col('description'))),
        ('text_feat',CountVectorizer(
            ngram_range=(1, 2),
            #max_features=7000,
            preprocessor=get_col('text_feat'))),
        ('title',TfidfVectorizer(
            ngram_range=(1, 2),
            **tfidf_para,
            #max_features=7000,
            preprocessor=get_col('title')))
    ])
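# Each sub-vectorizer above receives the whole row dict and uses
# preprocessor=get_col(...) to pull out just the field it should vectorize, which
# is why fit/transform below are fed DataFrame rows via .to_dict('records').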
    
start_vect=time.time()
vectorizer.fit(df.loc[traindex,:].to_dict('records'))
ready_df = vectorizer.transform(df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
print("Vectorization Runtime: %0.2f Minutes"%((time.time() - start_vect)/60))

# Drop Text Cols
Example #11
        return X

if flag == 1:
    nn_arch = nn_arch_madelon
grid = {'union__km__n_clusters': clusters, 'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch}
mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True,
                    random_state=5)
km = kmeans(random_state=5)
# pipe = Pipeline([('km', km), ('NN', mlp)])

pipe = Pipeline([

    ('union', FeatureUnion(
        transformer_list=[
            ("OrgFeatures", OrgFeatures()),
            ('km', km)
        ])
     ),
    ('NN', mlp)
])
gs = GridSearchCV(pipe, grid, n_jobs=num_jobs, verbose=10)

gs.fit(madelonX, madelonY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'Madelon cluster Kmeans.csv')


if flag == 1:
    nn_arch = nn_arch_madelon
grid = {'union__gmm__n_components': clusters, 'NN__alpha': nn_reg,
Example #12
def data_vectorize(df):
    russian_stop = set(stopwords.words("russian"))
    tfidf_para = {
        "stop_words": russian_stop,
        "analyzer": "word",
        "token_pattern": r"\w{1,}",
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": "l2",
        #"min_df":5,
        #"max_df":.9,
        "smooth_idf": False
    }

    tfidf_para2 = {
        "stop_words": russian_stop,
        "analyzer": "char",
        "token_pattern": r"\w{1,}",
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": "l2",
        # "min_df":5,
        # "max_df":.9,
        "smooth_idf": False
    }

    def get_col(col_name):
        return lambda x: x[col_name]

    vectorizer = FeatureUnion([
        (
            "description",
            TfidfVectorizer(
                ngram_range=(1, 2),
                max_features=40000,  #40000,18000
                **tfidf_para,
                preprocessor=get_col("description"))),
        #         ("title_description", TfidfVectorizer(
        #              ngram_range=(1, 2),#(1,2)
        #              max_features=1800,#40000,18000
        #              **tfidf_para,
        #              preprocessor=get_col("title_description"))
        #           ),
        ("text_feature",
         CountVectorizer(ngram_range=(1, 2),
                         preprocessor=get_col("text_feature"))),
        ("title",
         TfidfVectorizer(ngram_range=(1, 2),
                         **tfidf_para,
                         preprocessor=get_col("title"))),
        # add two new text features: title2 and title_char
        ("title2",
         TfidfVectorizer(ngram_range=(1, 1),
                         **tfidf_para,
                         preprocessor=get_col("title"))),

        #        ("title_char", TfidfVectorizer(
        #
        #            ngram_range=(1, 4),#(1, 4),(1,6)
        #            max_features=16000,#16000
        #            **tfidf_para2,
        #            preprocessor=get_col("title"))
        #         ),
    ])
    vectorizer.fit(df.to_dict("records"))
    ready_full_df = vectorizer.transform(df.to_dict("records"))
    tfvocab = vectorizer.get_feature_names()
    df.drop([
        "text_feature", "text_feature_2", "description", "title",
        "title_description"
    ],
            axis=1,
            inplace=True)
    df.fillna(-1, inplace=True)
    return df, ready_full_df, tfvocab
Example #13
# Apply categorical imputer
categorical_imputation_mapper = DataFrameMapper(
                                                [(category_feature, CategoricalImputer()) for category_feature in categorical_columns],
                                                input_df=True,
                                                df_out=True
                                               )

--------------------------------------------------
# Exercise_7 
# Import FeatureUnion
from sklearn.pipeline import FeatureUnion

# Combine the numeric and categorical transformations
numeric_categorical_union = FeatureUnion([
                                          ("num_mapper", numeric_imputation_mapper),
                                          ("cat_mapper", categorical_imputation_mapper)
                                         ])

--------------------------------------------------
# Exercise_8 
# Create full pipeline
pipeline = Pipeline([
                     ("featureunion", numeric_categorical_union),
                     ("dictifier", Dictifier()),
                     ("vectorizer", DictVectorizer(sort=False)),
                     ("clf", xgb.XGBClassifier(max_depth=3))
                    ])

# Perform cross-validation
cross_val_scores = cross_val_score(pipeline, kidney_data, y, scoring="roc_auc", cv=3)
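# Note: Dictifier is the custom transformer from the same exercise set; it turns the
# mapped DataFrame back into a list of row dicts so DictVectorizer can re-encode
# everything. A sketch under that assumption:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class Dictifier(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        return X.to_dict('records')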
Example #14
# -*- encoding: utf-8 -*-
'''
Created on 2016-05-22

@author: LuoPei
'''
from numpy import log1p
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion

# create the object that one-hot encodes the categorical part of the feature matrix
step2_1 = ('OneHotEncoder', OneHotEncoder(sparse=False))

# create the object that applies a log (log1p) transform to part of the feature matrix
step2_2 = ('ToLog', FunctionTransformer(log1p))

# create the object that binarizes part of the feature matrix
step2_3 = ('ToBinary', Binarizer())

# create the object that processes parts of the feature matrix in parallel
# transformer_list: the objects to run in parallel, as a list of (name, transformer) 2-tuples
# idx_list: for each transformer, the columns of the feature matrix it should read

step2 = ('FeatureUnionExt',
         FeatureUnion(transformer_list=[step2_1, step2_2, step2_3],
                      idx_list=[[0], [1, 2, 3], [4]]))
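# Note: idx_list is not a parameter of scikit-learn's FeatureUnion; it belongs to the
# custom FeatureUnionExt class this step is named after, which routes the listed
# column indices to each transformer. With the stock FeatureUnion used here, the
# extra keyword would raise a TypeError.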

if __name__ == "__main__":
    pass
Example #15
import pandas as pd
df = pd.read_csv('./data/train.csv')
df.head()

X = df['title']
Y = df['category']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)

classifier = Pipeline([
    ('features', FeatureUnion([
        ('title', Pipeline([
            # ('colext', TextSelector('title')),
            ('tfidf', TfidfVectorizer(tokenizer=textblob_tokenizer, stop_words='english',
                     min_df=.0025, max_df=0.25, ngram_range=(1,3))),
            ('svd', TruncatedSVD(algorithm='randomized', n_components=300)), #for XGB
        ])),
        # ('words', Pipeline([
        #     ('wordext', NumberSelector('TotalWords')),
        #     ('wscaler', StandardScaler()),
        # ])),
    ])),
    ('clf', XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.1)),
#    ('clf', RandomForestClassifier()),
    ])

classifier.fit(X_train, y_train)
preds = classifier.predict(X_test)

from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix
print ("Accuracy:", accuracy_score(y_test, preds))
#print ("Precision:", precision_score(y_test, preds))
Example #16
def perform_classification(data, labels):
    result_frame = pd.DataFrame(
        columns=['feature', 'class', 'f1', 'precision', 'recall'])
    conf_dict = defaultdict(lambda: np.zeros(
        (len(labels), len(labels)), dtype=np.integer))

    feature_unions = [
        FeatureUnion([
            ('full_paths_original_4_aip',
             make_pipeline(
                 PathEmbeddingFeature('./data/full_paths_original_4.csv')))
        ]),
        FeatureUnion([('middle_paths_unrestricted_16',
                       make_pipeline(
                           PathEmbeddingFeature(
                               './data/middle_paths_unrestricted_16.csv')))]),
        FeatureUnion([('Bag-Of-Words',
                       make_pipeline(ExtractMiddlePart(),
                                     CountVectorizer()))]),
        FeatureUnion([
            ('InferSent',
             make_pipeline(SelectDataFrameColumn('embedding_middle_part')))
        ]),
        FeatureUnion([('Word Embedding',
                       make_pipeline(ExtractMiddlePart(),
                                     MeanWordEmbedding()))]),
        FeatureUnion([
            ('POS n-grams',
             make_pipeline(
                 ExtractMiddlePart(), POSTransformer(),
                 TfidfVectorizer(max_features=500, ngram_range=(2, 4)))),
        ]),
        FeatureUnion([('Contains JJR',
                       make_pipeline(ExtractMiddlePart(),
                                     ContainsPos('JJR')))]),
    ]
    miss = pd.DataFrame(columns=[
        'id', 'caption', 'sentence', 'object_a', 'object_b', 'predicted',
        'gold'
    ])
    binary = labels == ['ARG', 'NONE']
    idx_mc = 1
    idx = 1
    logger.info("====== {} =====".format(labels))
    for i, f in enumerate(feature_unions):
        caption = f.transformer_list[0][0]
        logger.info('{}/{} {}'.format(i, len(feature_unions), caption))
        logger.info(f)
        folds_results = []
        try:
            for train, test in k_folds(5, data, random_state=42):
                pipeline = make_pipeline(
                    f, XGBClassifier(n_jobs=8, n_estimators=1000))
                fitted = pipeline.fit(train, train[LABEL].values)
                predicted = fitted.predict(test)
                folds_results.append((test[LABEL].values, predicted))
                logger.info(
                    classification_report(test[LABEL].values,
                                          predicted,
                                          labels=labels,
                                          digits=2))
                matrix = confusion_matrix(test[LABEL].values,
                                          predicted,
                                          labels=labels)
                logger.info(matrix)
                conf_dict[caption] += matrix

                result_frame.loc[idx] = [
                    caption, 'Overall',
                    f1_score(test[LABEL].values, predicted,
                             average='weighted'),
                    precision_score(test[LABEL].values,
                                    predicted,
                                    average='weighted'),
                    recall_score(test[LABEL].values,
                                 predicted,
                                 average='weighted')
                ]
                idx += 1
                for label in labels:
                    result_frame.loc[idx] = [
                        caption, label,
                        f1_score(test[LABEL].values,
                                 predicted,
                                 average='weighted',
                                 labels=[label]),
                        precision_score(test[LABEL].values,
                                        predicted,
                                        average='weighted',
                                        labels=[label]),
                        recall_score(test[LABEL].values,
                                     predicted,
                                     average='weighted',
                                     labels=[label])
                    ]
                    idx += 1

                for _id, sentence, a, b, predicted, gold in get_misclassified(
                        predicted, test):
                    miss.loc[idx_mc] = [
                        _id, caption, sentence, a, b, predicted, gold
                    ]
                    idx_mc += 1

            der = get_std_derivations(folds_results, labels=labels)
            best = get_best_fold(folds_results)
            best_per_feat.append((f1_score(best[0],
                                           best[1],
                                           average='weighted'), caption))
            print(
                pformat(sorted(best_per_feat, key=lambda k: k[0],
                               reverse=True)))
            logger.info(
                latex_classification_report(best[0],
                                            best[1],
                                            derivations=der,
                                            labels=labels,
                                            caption=caption))

        except Exception as ex:
            logger.error(ex)
            raise ex
        logger.info(conf_dict[caption])
        print_confusion_matrix('{}_{}'.format(caption, binary),
                               conf_dict[caption], labels)
        logger.info("\n\n=================\n\n")
    logger.info(
        pformat(sorted(best_per_feat, key=lambda k: k[0], reverse=True)))
    miss.to_csv('missclassified/binary_{}.csv'.format(binary), index=False)
    result_frame.to_csv('graphics/data/results_{}.csv'.format(binary),
                        index=False)
    plot(result_frame)
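# Note: PathEmbeddingFeature, ExtractMiddlePart and the other feature classes are
# project-specific, as is k_folds. The latter can be approximated with
# StratifiedKFold (a sketch, assuming data is a DataFrame carrying the LABEL column
# used above):
from sklearn.model_selection import StratifiedKFold

def k_folds(n_splits, data, random_state=None):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_idx, test_idx in skf.split(data, data[LABEL]):
        yield data.iloc[train_idx], data.iloc[test_idx]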
Example #17
def model_words():
    '''
    The model + pipeline for features extracted from the text
    '''
    clfs = [
        LinearSVC(),
        svm.SVC(kernel='linear', C=1.0),
        PassiveAggressiveClassifier(C=1,
                                    max_iter=1000,
                                    tol=1e-3,
                                    n_jobs=-1,
                                    class_weight="balanced"),
        PassiveAggressiveClassifier(C=0.1,
                                    max_iter=1500,
                                    tol=0.01,
                                    n_jobs=-1,
                                    class_weight="balanced",
                                    fit_intercept=False,
                                    loss="squared_hinge"),
        AdaBoostClassifier(n_estimators=200),
        MultinomialNB(),
    ]

    classifier = Pipeline([
        # Extract the features
        ('features', FeaturesExtractor()),
        # Use FeatureUnion to combine the features from subject and body
        (
            'union',
            FeatureUnion(
                transformer_list=[
                    ('text_high',
                     Pipeline([
                         ('selector', ItemSelector(key='text_high')),
                         ('tfidf',
                          TfidfVectorizer(preprocessor=identity,
                                          tokenizer=identity,
                                          max_df=.2)),
                     ])),
                    ('word_n_grams',
                     Pipeline([('selector', ItemSelector(key='sentence')),
                               ('tfidf',
                                TfidfVectorizer(analyzer='word',
                                                ngram_range=(1, 5)))])),
                    ('char_n_grams',
                     Pipeline([('selector', ItemSelector(key='sentence')),
                               ('tfidf',
                                TfidfVectorizer(analyzer='char',
                                                ngram_range=(2, 5)))])),
                    ('sentiment',
                     Pipeline([('selector', ItemSelector(key='sentiment')),
                               ('tfidf', TfidfVectorizer(analyzer='char'))])),
                    ('opinion_towards',
                     Pipeline([
                         ('selector', ItemSelector(key='opinion')),
                     ])),
                    ('target',
                     Pipeline([
                         ('selector', ItemSelector(key='target')),
                     ])),

                    #### FEATURES THAT DO NOT WORK ####

                    # ('sentiment_cont', Pipeline([
                    #     ('selector', ItemSelector(key='sentence')),
                    #     ('feature', SentimentContinuous())
                    # ])),

                    # ('glove', Pipeline([
                    #     ('selector', ItemSelector(key='sentence')),
                    #     ('tfidf', TfidfEmbeddingVectorizer(glove))
                    # ])),

                    # ('sentence_length', Pipeline([
                    #     ('selector', ItemSelector(key='sentence_length')),
                    #     ('scaler', MinMaxScaler())
                    # ])),
                ],

                # weight components in FeatureUnion
                transformer_weights={
                    'text_high': 1,
                    'word_n_grams': .8,
                    'char_n_grams': .8,
                    'sentiment': .8,
                    'opinion_towards': 1,
                    'target': 1,
                },
            )),
        # Use a classifier on the combined features
        ('clf', clfs[2]),
    ])
    return classifier
#     return Pipeline([
#         ('transformation', get_filters()),
#         ('estimator', get_estimation_pipeline())
#     ])

nonlinearity = lambda x: np.sqrt(x)

if __name__ == '__main__':
    orig_dataset = pd.read_csv(settings.TRAIN_FILE)
    # sample_mask = np.zeros((orig_dataset.shape[0],), dtype=np.bool_)
    # sample_idx = sample_without_replacement(orig_dataset.shape[0], orig_dataset.shape[0] * 1.0, random_state=42)
    # sample_mask[sample_idx] = True

    before = time.time()
    fcols = [col for col in orig_dataset.columns if col in settings.FEATURES]
    catconversion = FeatureUnion([feature_sets.CATEGORICAL_CONVERSION],
                                 n_jobs=1)

    dataset = pd.DataFrame(data=catconversion.fit_transform(orig_dataset),
                           columns=fcols,
                           index=orig_dataset.index)
    target = FeatureColumnsExtractor(
        settings.TARGET).fit_transform(orig_dataset).apply(nonlinearity)

    print('original dataset shape:', dataset.shape)

    # union = get_feature_union()
    # dataset = union.fit_transform(dataset, target)

    print('preprocessed dataset shape:', dataset.shape)
    print('preprocessing time: ', time.time() - before)
Example #19
X_test = np.array(["ga suka, sabun ini keras",
                   "suka baunya, mewah."
                   ])   
target_names = ['Class 1', 'Class 2']


def get_text_length(x):
    return np.array([len(t) for t in x]).reshape(-1, 1)

# using only tf-idf vector features
classifier1 = Pipeline([
    ('vectorizer', CountVectorizer(min_df=1,max_df=2)),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC()))])

# using combined tf-idf vector and text-length features
classifier2 = Pipeline([
    ('features', FeatureUnion([
        ('text', Pipeline([
            ('vectorizer', CountVectorizer(min_df=1,max_df=2)),
            ('tfidf', TfidfTransformer()),
        ])),
        ('length', Pipeline([
            ('count', FunctionTransformer(get_text_length, validate=False)),
        ]))
    ])),
    ('clf', OneVsRestClassifier(LinearSVC()))])
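
# Hypothetical toy training data (added for illustration only) so that the
# fit/predict calls below can run standalone; in the original script X_train
# and y_train are defined elsewhere.
X_train = np.array(["suka sekali, wanginya enak",
                    "tidak suka, terlalu mahal",
                    "produk bagus, sangat puas",
                    "kecewa, tidak sesuai harapan"])
y_train = np.array(['Class 2', 'Class 1', 'Class 2', 'Class 1'])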

classifier2.fit(X_train, y_train)
predicted = classifier2.predict(X_test)
print(predicted)
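
# For comparison (added sketch, not in the original): the tf-idf-only
# pipeline defined above, trained and evaluated on the same data.
classifier1.fit(X_train, y_train)
print(classifier1.predict(X_test))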
        data_dict[name]['from_poi_ratio'] = data_dict[name][
            'from_poi_to_this_person'] / data_dict[name]['to_messages']
    except:
        data_dict[name]['from_poi_ratio'] = 'NaN'
    try:
        data_dict[name]['to_poi_ratio'] = data_dict[name][
            'from_this_person_to_poi'] / data_dict[name]['from_messages']
    except:
        data_dict[name]['to_poi_ratio'] = 'NaN'

features_list += ['from_poi_ratio', 'to_poi_ratio']

### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
my_dataset = data_dict
minmax = MinMaxScaler()
combined_features = FeatureUnion([("pca", PCA(n_components=18)),
                                  ("univ_select", SelectKBest(k=3))])
lg = LogisticRegression(
    C=1e20,
    tol=1e-10,
    class_weight='balanced',
)
clf = Pipeline(steps=[("minmax", minmax), ("features",
                                           combined_features), ("lg", lg)])
test_classifier(clf, my_dataset, features_list, folds=1000)
dump_classifier_and_data(clf, my_dataset, features_list)
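
# A minimal hold-out sanity check added for illustration; it is not part of
# the original tester.py workflow and simply reuses the features/labels
# extracted above.
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

feat_train, feat_test, lab_train, lab_test = train_test_split(
    features, labels, test_size=0.3, random_state=42, stratify=labels)
clf.fit(feat_train, lab_train)
print(classification_report(lab_test, clf.predict(feat_test)))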
}


def get_col(col_name):
    return lambda x: x[col_name]


vectorizer = FeatureUnion([
    ('text',
     TfidfVectorizer(ngram_range=(1, 2),
                     max_features=100000,
                     **tfidf_para,
                     preprocessor=get_col('text'))),
    ('text_feat',
     CountVectorizer(**countv_para, preprocessor=get_col('text_feat'))),
    ('title', CountVectorizer(**countv_para, preprocessor=get_col('title'))),
    (
        'translation',
        TfidfVectorizer(
            #ngram_range=(1, 2),
            max_features=50000,
            **tfidf_para,
            preprocessor=get_col('translation'))),
])

start_vect = time.time()
vectorizer.fit(df.loc[traindex, :].to_dict('records'))
ready_df = vectorizer.transform(df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
print(tfvocab[:50])
print('[{}] Vectorisation completed'.format(time.time() - start_time))
Example #22
            train_chunk, test_chunk = dataset_manager.split_val(train, val_ratio=0.33, split="random", seed=22)
        else:
            train_chunk, test_chunk = dataset_manager.split_val(train, 0.2, split="random", seed=22)
        
        class_ratios.append(dataset_manager.get_class_ratio(train_chunk))

        # generate prefixes
        if nr_events is not None:
            dt_train_prefixes = dataset_manager.generate_prefix_data(train_chunk, nr_events, nr_events)
            dt_test_prefixes = dataset_manager.generate_prefix_data(test_chunk, nr_events, nr_events)
        else:
            dt_train_prefixes = dataset_manager.generate_prefix_data(train_chunk, min_prefix_length, max_prefix_length)
            dt_test_prefixes = dataset_manager.generate_prefix_data(test_chunk, min_prefix_length, max_prefix_length)

        # encode data for classifier
        feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(bucket_method, method, cls_method, **cls_encoder_args)) for method in methods])
        # if cls_method == "svm" or cls_method == "logit":
        #     feature_combiner = Pipeline([('encoder', feature_combiner), ('scaler', MinMaxScaler())])

        dt_train_encoded = feature_combiner.fit_transform(dt_train_prefixes)

        pd.DataFrame(dt_train_encoded).to_csv(os.path.join(folds_dir, "fold%s_train.csv" % cv_iter), sep=";", index=False)
        del dt_train_encoded

        dt_test_encoded = feature_combiner.transform(dt_test_prefixes)
        pd.DataFrame(dt_test_encoded).to_csv(os.path.join(folds_dir, "fold%s_test.csv" % cv_iter), sep=";", index=False)
        del dt_test_encoded

        # labels
        train_y = dataset_manager.get_label_numeric(dt_train_prefixes)
        with open(os.path.join(folds_dir, "fold%s_train_y.csv" % cv_iter), "wb") as fout:
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC

iris = load_iris()

X, y = iris.data, iris.target

# This dataset is way too high-dimensional. Better do PCA:
pca = PCA(n_components=2)

# Maybe some original features where good, too?
selection = SelectKBest(k=1)

# Build estimator from PCA and Univariate selection:

combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

# Use combined features to transform dataset:
X_features = combined_features.fit(X, y).transform(X)
print("Combined space has", X_features.shape[1], "features")

svm = SVC(kernel="linear")

# Do grid search over k, n_components and C:

pipeline = Pipeline([("features", combined_features), ("svm", svm)])

param_grid = dict(features__pca__n_components=[1, 2, 3],
                  features__univ_select__k=[1, 2],
                  svm__C=[0.1, 1, 10])
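
# A short completion of the grid search set up above (the cv and verbose
# settings are illustrative choices).
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, verbose=1)
grid_search.fit(X, y)
print(grid_search.best_estimator_)
print("Best CV score:", grid_search.best_score_)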
Example #24
 Create a Pipeline
'''
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import Imputer, StandardScaler

num_pipeline = Pipeline([('selector', DataFrameSelector(isNumerical=True)),
                         ('imputer',
                          Imputer(missing_values="NaN", strategy='median')),
                         ('std_scaler', StandardScaler())])

cat_pipeline = Pipeline([('selector', DataFrameSelector(isNumerical=False)),
                         ('imputer', MyCatImputer()),
                         ('label_Binarizer', MyLabelBinarizer())])

full_pipeline = FeatureUnion(
    transformer_list=[('num_pipeline',
                       num_pipeline), ('cat_pipeline', cat_pipeline)])

houses_prepared = full_pipeline.fit_transform(X)
'''
 Test the model on the test set 
'''


def print_scores(scores):
    print('Scores: ', scores)
    print('Mean: ', scores.mean())
    print('SD: ', scores.std())
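
# A minimal sketch of how print_scores could be used with cross-validation on
# the prepared matrix; the target vector y and the RandomForestRegressor are
# assumptions, not taken from the original script.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rmse_scores = np.sqrt(-cross_val_score(forest_reg, houses_prepared, y,
                                       scoring='neg_mean_squared_error',
                                       cv=10))
print_scores(rmse_scores)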


def remove_from_list(a, elements):
Example #25
                                    max_depth=4,
                                    verbosity=3),
                      max_features=60)

cb = CustomFeatureSelection(CustomCatBoostClassifier(
    n_estimators=100,
    class_weights=[1, 885 / 115],
    max_depth=4,
    cat_features=categorical_features + ordinal_features + boolean_features),
                            max_features=60)

feat_select = FeatureUnion([("pca",
                             make_pipeline(preprocess_mapper1,
                                           PCA(n_components=60, whiten=True))),
                            ("xgb", make_pipeline(preprocess_mapper1, xgb)),
                            ("cb", make_pipeline(preprocess_mapper2, cb)),
                            ("kbest",
                             make_pipeline(preprocess_mapper1,
                                           VarianceThreshold(),
                                           SelectKBest(chi2, k=60)))])

svm = LinearSVC(class_weight='balanced', C=0.1, max_iter=10000, dual=False)

estimator = BaggingClassifier(base_estimator=svm, n_estimators=150)

model = make_pipeline(feat_select, StandardScaler(),
                      PCA(n_components=30, whiten=True), estimator)

print('Preparing cross-validation')

# Evaluate the model and report the mean performance.
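
# A minimal evaluation sketch; X and y (the full training matrix and labels)
# are assumed to be defined earlier in the script, and the fold/repeat counts
# and F1 scoring are illustrative choices.
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
scores = cross_val_score(model, X, y, scoring='f1', cv=cv, n_jobs=-1)
print('Mean F1: %.3f (%.3f)' % (scores.mean(), scores.std()))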
Example #26
def test():

    df = pd.read_csv('Train.csv',encoding='latin-1')
################################################################
    df['symbols'] = symbols(df['SentimentText'].values)
    df['length'] = df['SentimentText'].str.len()
    word_train,word_test=word2vec.exe()
    df['processedtext'] = np.array(pre.getData(df['SentimentText'].array))
    target = df['Sentiment']

################################################################

    X_train, X_test, y_train, y_test = train_test_split(df['processedtext'], target, test_size=0.20, random_state=100)
    X_train2, X_test2, y_train2, y_test2 = train_test_split(df['symbols'], target, test_size=0.20, random_state=100)
    X_train3, X_test3, y_train3, y_test3 = train_test_split(word_train, target, test_size=0.20, random_state=100)

    # 3 types of vectorizer
    vectorizer_tfidf = TfidfVectorizer(stop_words='english', max_df=0.75, ngram_range=(1, 2),analyzer='word',strip_accents="ascii")
    vectorizer_count = CountVectorizer(analyzer='word')
    vectorizer_hash = HashingVectorizer(ngram_range=(1, 2),strip_accents="ascii")

    # print(type(vectorizer_hash))
    from sklearn.pipeline import FeatureUnion
    # custom_vect = YourCustomVectorizer()
    combined_features = FeatureUnion([("hash", vectorizer_hash),
                                      ("count", vectorizer_count),
                                      ("tfidf", vectorizer_tfidf)])

    train_tfIdf = vectorizer_hash.fit_transform(X_train.values.astype('U'))
    test_tfIdf = vectorizer_hash.transform(X_test.values.astype('U'))

    # print("features names: ",vectorizer_tfidf.get_feature_names()[:10])

    print("train tf idf shape: ",train_tfIdf.shape)
    print("test tf idf shape: ",test_tfIdf.shape,"\n")

    from scipy.sparse import hstack
    train_tfIdf = hstack((train_tfIdf, np.array(X_train2)[:, None]))
    test_tfIdf = hstack((test_tfIdf, np.array(X_test2)[:, None]))

    print("train tf idf shape: ",train_tfIdf.shape)
    print("test tf idf shape: ",test_tfIdf.shape,"\n")

    num_feats = X_train3.values
    num_feats2 = X_test3.values

    train_tfIdf = hstack((train_tfIdf, num_feats))
    test_tfIdf = hstack((test_tfIdf, num_feats2))

    # train_tfIdf = hstack((train_tfIdf, np.array(X_train3)[:, None]))
    # test_tfIdf = hstack((test_tfIdf, np.array(X_test3)[:, None]))

    print("train tf idf shape: ", train_tfIdf.shape)
    print("test tf idf shape: ", test_tfIdf.shape, "\n")
    ################################################################
    scoring = 'accuracy'
    seed = 7
    # 10 cross validation to check wich model to choose

    models = []
    models.append(('LR', LogisticRegression(solver='liblinear')))
    # models.append(('KNN', KNeighborsClassifier()))
    models.append(('RFC', RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=100)))

    results = []
    names = []
    for name, model in models:
        kfold = model_selection.KFold(n_splits=10, random_state=None)
        cv_results = model_selection.cross_val_score(model, train_tfIdf, y_train, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)

    ######################### KNN ###########################

    # print("KNN:")
    # knn_classifier = KNeighborsClassifier()
    # knn_classifier.fit(train_tfIdf, y_train)
    # pred2 = knn_classifier.predict(test_tfIdf)

    # # Calculate the accuracy score: score
    # accuracy_tfidf = metrics.accuracy_score(y_test, pred2)
    # print(accuracy_tfidf)
    # print(confusion_matrix(y_test, pred2))
    # print(classification_report(y_test, pred2))

    ######################### LR ###########################
    print("LR:")
    lr_classifier = LogisticRegression(solver='liblinear')
    lr_classifier.fit(train_tfIdf, y_train)
    pred3 = lr_classifier.predict(test_tfIdf)

    # Calculate the accuracy score: score
    accuracy_tfidf = metrics.accuracy_score(y_test, pred3)
    print(accuracy_tfidf)
    print(confusion_matrix(y_test, pred3))
    print(classification_report(y_test, pred3))

    ######################### RFC ###########################
    # print("Random Forest Classifier:")
    # classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=100)
    # classifier.fit(train_tfIdf, y_train)
    # predRF = classifier.predict(test_tfIdf)
    #
    # # Calculate the accuracy score
    # accuracy_RF = metrics.accuracy_score(y_test, predRF)
    # print(accuracy_RF)
    #
    # Conf_metrics_RF = metrics.confusion_matrix(y_test, predRF, labels=[1, 0])
    # print(Conf_metrics_RF)
    # print(confusion_matrix(y_test, predRF))
    # print(classification_report(y_test, predRF))


    #################### EXECUTE ###########################

    print("***test***")
    df2 = pd.read_csv('Test.csv',encoding='latin-1')
    df2['symbols'] = symbols(df2['SentimentText'].values)
    temp = np.array(pre.getData(df2['SentimentText'].array))
    test_tfIdf2 = vectorizer_hash.transform(temp.astype('U'))

    # df2['length'] = df2['SentimentText'].str.len()
    test_tfIdf2 = hstack((test_tfIdf2, np.array(df2['symbols'])[:, None]))
    # test_tfIdf2 = hstack((test_tfIdf2, np.array(df2['length'])[:, None]))

    num_feats3 = word_test.values
    test_tfIdf2 = hstack((test_tfIdf2, num_feats3))

    # test_x = temp
    # test_tfIdf2 = vectorizer_tfidf.transform(test_x.values.astype('U'))
    pred_test = lr_classifier.predict(test_tfIdf2)
    df2['SentimentText'] = pred_test
    del df2['symbols']
    df2.columns = ['ID','Sentiment']
    df2.to_csv("results15.csv",index=False)

    Conf_metrics_tfidf = metrics.confusion_matrix(y_test, pred3, labels=[1, 0])
    print(Conf_metrics_tfidf,"\n")
    #('counter', countvectorizer)
])

pos = Pipeline([('selector', TextSelector(key='pos')),
                ('counter', countvectorizer)])

avgsent = Pipeline([
    ('selector', NumberSelector(key='avg_sent')),
    #('standard', StandardScaler())
    ('normalize', Normalizer())
])

#create a feature union of the individual pipelines
features = FeatureUnion([
    ('tokens', tokens),
    #('pos', pos),
    #('avgsent',avgsent)
])

#creating a pipeline for the feature creation to see if it works
#feature_processing = Pipeline([('feats', features)])
#feature_processing.fit_transform(X_train)

#creating pipeline for modelling.

clf0 = CalibratedClassifierCV(LinearSVC(), cv=10)

model = Pipeline([('features', features), ('classifier', clf0)
                  #('NB',MultinomialNB())
                  ])
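
# A minimal usage sketch (added, commented out): X_train/X_test are assumed
# to be DataFrames with the columns the selectors above expect, and y_train
# the matching labels -- none of them are defined in this snippet.
# model.fit(X_train, y_train)
# predictions = model.predict(X_test)
# probabilities = model.predict_proba(X_test)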
def train():    
	docBiasLabel,docTruthLabel = readLabel(trainLabelPath)
	title,article,numCitation = readData(trainDataPath)
	traindata = list(zip(title,numCitation,article))
	dataExtractor = Pipeline([('TitleArticleExtractor', TitleArticleExtractor()),])
	TfidfTitle = Pipeline([
						('selector', ItemSelector(key='title')),
						('vect', TfidfVectorizer(min_df = 0.01)),
						('to_dense', DenseTransformer()),
			     ])
	TfidfArticle = Pipeline([
						('selector', ItemSelector(key='article')),
						('vect', TfidfVectorizer(min_df = 0.01)),
						('to_dense', DenseTransformer()),
					])
	textStatsTitle = Pipeline([
					('selector', ItemSelector(key='title')),
					('stats', Text_Stats()),  
					('to_dense', DenseTransformer()),
					
				])
	textStatsArticle = Pipeline([
					('selector', ItemSelector(key='article')),
					('stats', Text_Stats()),  
					('to_dense', DenseTransformer()),					
				])

	matchNgrams =  Pipeline([
					('selector', ItemSelector(key='ngram')),
					('func', extractFeature()), 
					('to_dense', DenseTransformer()),
										
				])

	
	bias_clf = Pipeline([
			('TitleArticleExtractor', dataExtractor),
			('union', FeatureUnion(
				transformer_list=[
									('tfidf_title', TfidfTitle),
									('tfidf_article', TfidfArticle),
									('text_stats_title', textStatsTitle),
									('text_stats_body', textStatsArticle),
									('matchngrams', matchNgrams),
								],
							)),
					('clf', MultinomialNB()),
			])

	bias_clf.fit(traindata, docBiasLabel)

	with open(biasPKLfile,"wb") as f_pk:
		pickle.dump(bias_clf,f_pk,pickle.HIGHEST_PROTOCOL)
	

	truth_clf = Pipeline([			
			('TitleArticleExtractor', dataExtractor),
			('union', FeatureUnion(
				transformer_list=[
									('tfidf_title', TfidfTitle),
									('tfidf_article',TfidfArticle),
									('text_stats_headline',textStatsTitle),
									('text_stats_body', textStatsArticle),
									('matchngrams', matchNgrams),
								],
							)),
					('clf', GaussianNB()),
				])
	truth_clf.fit(traindata, docTruthLabel)

	with open(truthPKLfile,"wb") as f_pk:
		pickle.dump(truth_clf,f_pk,pickle.HIGHEST_PROTOCOL)
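
# A short sketch (added, commented out) of reloading the pickled bias model
# and scoring new documents; `testdata` must have the same
# (title, numCitation, article) structure built in train().
# with open(biasPKLfile, "rb") as f_pk:
#     bias_clf = pickle.load(f_pk)
# bias_predictions = bias_clf.predict(testdata)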
Example #29
    "token_pattern": r'\w{1,}',
    "lowercase": True,
    "min_df": 5  #False
}


def get_col(col_name):
    return lambda x: x[col_name]


vectorizer = FeatureUnion([
    ('text',
     TfidfVectorizer(ngram_range=(1, 2),
                     max_features=50000,
                     **tfidf_para,
                     preprocessor=get_col('description'))),
    ('text_feat',
     CountVectorizer(**countv_para, preprocessor=get_col('text_feat'))),
    ('title', CountVectorizer(**countv_para, preprocessor=get_col('title'))),
    ('all_titles', CountVectorizer(**countv_para,
                                   preprocessor=get_col('all_titles')))
])

start_vect = time.time()
vectorizer.fit(df.loc[traindex, :].to_dict('records'))
ready_df = vectorizer.transform(df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
print(tfvocab[:50])
print('[{}] Vectorisation completed'.format(time.time() - start_time))
# Drop Text Cols
df.drop(textfeats + ['text', 'all_titles'], axis=1, inplace=True)
gc.collect()
Example #30
def get_titanic_data(file_name):
    titanic_path = os.path.join("datasets", "titanic", file_name)
    data = pd.read_csv(titanic_path)

    # shuffled_indices = np.random.permutation(len(data))
    # data = data.iloc[shuffled_indices]

    #    data_num = data.drop(labels=["PassengerId", "Name", "Survived", "Embarked", "Sex", "Ticket", "Cabin"], axis=1)
    #   y_train = data["Survived"].copy()

    #data["Embarked"].fillna(value='U', inplace=True)
    #data["Sex"].fillna(value='unknow', inplace=True)
    #data["Cabin"].fillna(value='unknow', inplace=True)

    data["CabinCat"] = data["Cabin"].str.get(0).fillna('N')
    data["AgeBucket"] = data["Age"] // 15 * 15
    data["RelativesOnboard"] = data["SibSp"] + data["Parch"]
    data["Title"] = data.Name.str.extract('([A-Za-z]+)\.', expand=False)
    data["FirstName"] = data.Name.str.extract('([A-Za-z]+),', expand=False)

    #data["Fare_cat"] = np.ceil(data["Fare"] / 50)

    # split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    # for train_index, test_index in split.split(data, data["Pclass"]):
    #     strat_train_set = data.loc[train_index]
    #     strat_test_set = data.loc[test_index]

    if 'Survived' in data.columns:
        X_data = data.drop("Survived", axis=1)
        y_data = data["Survived"].copy()
    else:
        X_data = data
        y_data = None

    num_attribs = ["Fare", "Parch", "RelativesOnboard", "Age", "SibSp"]
    print(num_attribs)
    cat_attribs = ["Pclass", "Sex", "Embarked"]
    cat_encoder1 = CategoricalEncoder(encoding="onehot-dense")

    #
    cat_encoder2 = CategoricalEncoder(categories=[[
        'Capt', 'Col', 'Countess', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major',
        'Master', 'Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms', 'Rev', 'Sir',
        'Dona'
    ]],
                                      encoding="onehot-dense")

    cat_encoder1.fit_transform(X_data[cat_attribs].dropna())
    cat_encoder2.fit_transform(X_data[['Title']].dropna())

    attributes = np.concatenate(
        (np.array(num_attribs),
         np.concatenate(np.array(cat_encoder1.categories_))))
    attributes = np.concatenate(
        (attributes, np.concatenate(np.array(cat_encoder2.categories_))))
    print(attributes)

    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        # ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

    cat_pipeline = Pipeline([
        ('cat_selector', DataFrameSelector(cat_attribs)),
        ("imputer", MostFrequentImputer()),
        ('cat_encoder', cat_encoder1),
    ])

    cat_pipeline2 = Pipeline([
        ('cat_selector', DataFrameSelector(['Title'])),
        ("imputer", MostFrequentImputer()),
        ('cat_encoder', cat_encoder2),
    ])

    full_pipeline = FeatureUnion(transformer_list=[(
        "num_pipeline",
        num_pipeline), ("cat_pipeline",
                        cat_pipeline), ("cat_pipeline2", cat_pipeline2)])

    # cat_pipeline3 = Pipeline([
    #     ('cat_selector', DataFrameSelector(['Fare','female','male','Age',3,'RelativesOnboard','Master','Miss',1,'Mrs','Parch']))
    # ])

    prepare_select_and_pipeline = Pipeline([
        ('preparation', full_pipeline),
        ('feature_selection', TopFeatureSelector2([0, 2, 3, 4, 7, 8, 9, 26]))
    ])

    #X_data_prepared = full_pipeline.fit_transform(X_data)

    X_data_prepared = prepare_select_and_pipeline.fit_transform(X_data)

    return X_data_prepared, y_data, data
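
# A minimal usage sketch; the file name "train.csv" and the RandomForest
# model are illustrative assumptions, not from the original script.
from sklearn.ensemble import RandomForestClassifier

X_train_prepared, y_train, train_data = get_titanic_data("train.csv")
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train_prepared, y_train)
print("Training accuracy:", rf_clf.score(X_train_prepared, y_train))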