Example #1
    def __init__(self, env):

        # Feature preprocessing: Normalize to zero mean and unit variance
        # We use a few samples from the observation space to do this
        observation_examples = np.array(
            [env.observation_space.sample() for _ in range(10000)],
            dtype='float64')
        self.scaler = preprocessing.StandardScaler()
        self.scaler.fit(observation_examples)

        # Used to convert a state to a featurized representation.
        # We use RBF kernels with different variances to cover different parts of the space
        self.featurizer = pipeline.FeatureUnion([
            ('rbf1', RBFSampler(gamma=5.0, n_components=100)),
            ('rbf2', RBFSampler(gamma=2.0, n_components=100)),
            ('rbf3', RBFSampler(gamma=1.0, n_components=100)),
            ('rbf4', RBFSampler(gamma=0.5, n_components=100)),
        ])
        self.featurizer.fit(self.scaler.transform(observation_examples))

        self.models = []
        for _ in range(env.action_space.n):
            model = SGDRegressor(learning_rate='constant')
            model.partial_fit([self.featurize_state(env.reset())], [0])
            self.models.append(model)
    def __init__(self, env):
        """
        SGD function approximator, with preprocessing steps from:
        https://github.com/dennybritz/reinforcement-learning/blob/master/FA/Q-Learning%20with%20Value%20Function%20Approximation%20Solution.ipynb
        """
        # Feature preprocessing: Normalize to zero mean and unit variance
        # We use a few samples from the observation space to do this
        observation_examples = np.array(
            [env.observation_space.sample() for _ in range(10000)], dtype="float64"
        )
        self.scaler = preprocessing.StandardScaler()
        self.scaler.fit(observation_examples)

        # Used to convert a state to a featurized representation.
        # We use RBF kernels with different variances to cover different parts of the space
        self.featurizer = pipeline.FeatureUnion(
            [
                ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
                ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
                ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
                ("rbf4", RBFSampler(gamma=0.5, n_components=100)),
            ]
        )
        self.featurizer.fit(self.scaler.transform(observation_examples))

        self.models = []
        for _ in range(env.action_space.n):
            model = SGDRegressor(learning_rate="constant")
            model.partial_fit([self.featurize_state(env.reset())], [0])
            self.models.append(model)
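Both constructors above call self.featurize_state, which is not shown in these snippets. A minimal sketch of what that method typically looks like in this Q-learning-with-function-approximation setup (scale the raw observation, then concatenate the RBF features) follows; the body is an assumption, only the attribute names come from the snippets.

    def featurize_state(self, state):
        """Return the RBF feature vector for a single raw observation (assumed helper)."""
        scaled = self.scaler.transform([state])          # shape (1, n_obs_dims)
        featurized = self.featurizer.transform(scaled)   # shape (1, 4 * 100)
        return featurized[0]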
Example #3
def create_estimator(ml_obj, numeric_features, cat_features, date_features):
    estimator = pipeline.Pipeline(steps=[
        ('Feature_processing',
         pipeline.FeatureUnion(transformer_list=[
             ('Numeric_features',
              pipeline.Pipeline(steps=[(
                  'selecting',
                  preprocessing.FunctionTransformer(
                      lambda data: data[:, numeric_features], validate=True)),
                                       ('scaling',
                                        preprocessing.StandardScaler(
                                            with_mean=0., with_std=1))])),
             ('Categorical_features',
              pipeline.Pipeline(steps=[(
                  'selecting',
                  preprocessing.FunctionTransformer(
                      lambda data: data[:, cat_features], validate=True)),
                                       ('hot_encoding',
                                        preprocessing.OneHotEncoder(
                                            handle_unknown='ignore'))])),
             ('Date_features',
              pipeline.Pipeline(steps=[(
                  'selecting',
                  preprocessing.FunctionTransformer(
                      lambda data: data[:, date_features], validate=True)),
                                       ('hot_encoding',
                                        preprocessing.OneHotEncoder(
                                            handle_unknown='ignore'))]))
         ])), ('Model_fitting', ml_obj)
    ])
    return estimator


#TODO:
#make custom score
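A hedged usage sketch for create_estimator, including the custom scorer mentioned in the TODO above. The toy data, the Ridge model choice, and the MAE-based scorer are assumptions for illustration, not part of the original snippet.

import numpy as np
from sklearn import linear_model, metrics, model_selection

# assumed toy data: two numeric columns, one categorical code, one date code
X = np.column_stack([np.random.randn(100), np.random.randn(100),
                     np.random.randint(0, 3, 100), np.random.randint(0, 12, 100)])
y = np.random.randn(100)

estimator = create_estimator(linear_model.Ridge(), [0, 1], [2], [3])

# custom score (the TODO above): negated mean absolute error
mae_scorer = metrics.make_scorer(metrics.mean_absolute_error, greater_is_better=False)
scores = model_selection.cross_val_score(estimator, X, y, scoring=mae_scorer, cv=5)
print(scores.mean())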
Example #4
    def get_default_features(self):

        tfidfvect = TfidfVectorizer()

        #features = skpipeline.FeatureUnion([('word_tfidf', tfidfvect)])
        features = skpipeline.FeatureUnion([
            ('word_tfidf', tfidfvect),
        ])
        return features
def main():
    env = LunarLander()

    # Add transformers for tile coding or extra features.
    transformer = pipeline.FeatureUnion([
        # ('scaler', preprocessing.StandardScaler()),
        # ('square', preprocessing.FunctionTransformer(lambda x: x**2, validate=False)),
        # ('dummy', DummyTransformer()),
        # ('poly', preprocessing.PolynomialFeatures(2)),
        # ('cos', preprocessing.FunctionTransformer(np.cos, validate=False)),
        # ('inverter', preprocessing.FunctionTransformer(lambda x: 1. /(x + 1.), validate=False)),
        # ('quantile', preprocessing.KBinsDiscretizer(strategy='uniform', n_bins=20, encode='onehot')),
        # ('quantile-poly', pipeline.Pipeline([
        #     ('poly', preprocessing.PolynomialFeatures(2, interaction_only=True)),
        #     ('quantile', preprocessing.KBinsDiscretizer(strategy='quantile', n_bins=20, encode='onehot-dense', )),
        #     ])),
        # ('quantile', pipeline.Pipeline([
        #     # ('poly', preprocessing.PolynomialFeatures(2)),
        #     ('quantile', preprocessing.KBinsDiscretizer(strategy='uniform', n_bins=200, encode='ordinal', )),
        #     # ('ohe', preprocessing.OneHotEncoder(sparse=False, categories='auto'))
        #
        #     ])),
        # ('power', preprocessing.PowerTransformer()),
    ])

    s = env.reset()

    # a = 1/(1000*s.T@s)
    print('Learning rate:', LR)

    agent = GMCAgent(lr=LR,
                     init_epsilon=EPS,
                     max_steps=MAX_STEPS,
                     gamma=GAMMA,
                     threshold=0.0,
                     transformer=None,
                     success_count=SUCCESS_COUNT,
                     success_criteria=SUCCESS_CRITERIA)
    # agent = SemiGradientAgent(lr=LR, init_epsilon=EPS, max_steps=800, gamma=GAMMA, threshold=0.0,
    #                transformer=None, success_count=3, success_criteria=220)
    # agent = EpisodicSemiGradient(lr=a, init_epsilon=0.3, max_steps=800, gamma=0.9999, threshold=0.0,
    #                transformer=None, success_count=3, success_criteria=220)

    agent.fit(env, render_train=False, verbose=True, episodes=10000)

    agent.land(env, verbose=False)
    now = datetime.now().strftime("%Y-%m-%d")
    filename = f'weights/trained_agent_weights_{agent.alpha}_{agent.max_steps}_{now}'
    np.save(filename, agent.get_weights())
    print("Weights saved to", filename)
def main():
    books = datasets.load_files("data/Book/",
                                shuffle=True,
                                encoding="ISO-8859-1",
                                random_state=1337)

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        books.data, books.target, test_size=0.10)

    model = pipeline.Pipeline(
        [('union',
          pipeline.FeatureUnion(transformer_list=[
              ('other_features',
               util_q1.AddOtherFeatures(feature_to_add="pos_neg_count")),
              ('text_data',
               pipeline.Pipeline([
                   ('remove_words',
                    util_q1.RemoveWords(words_to_remove="none")),
                   ("normalisation",
                    util_q1.NormaliseWords(normalise_type="lemmatize")),
                   ("preprocess",
                    util_q1.PreprocessData(attribute="frequency_filtering",
                                           attribute_values="tf-idf")),
               ]))
          ])),
         ("classifier",
          GradientBoostingClassifier(n_estimators=60,
                                     max_features="sqrt",
                                     subsample=0.8))])

    scoring = {
        "accuracy": "accuracy",
        "recall": "recall",
        "precision": "precision"
    }

    grid_search_model = model_selection.GridSearchCV(
        model, {
            "classifier__max_depth": range(5, 11, 5),
            "classifier__min_samples_split": range(5, 11, 5)
        },
        n_jobs=-1,
        verbose=10,
        scoring=scoring,
        refit=False)

    grid_search_model.fit(X_train, y_train)
    joblib.dump(grid_search_model, "outputs/gridsearch_xgboost_aws.pkl")
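Because the grid search above uses several scoring metrics with refit=False, no best_estimator_ is stored; the results have to be read from cv_results_ after loading the dump. A minimal sketch (the file path follows the dump call above):

import joblib
import pandas as pd

search = joblib.load("outputs/gridsearch_xgboost_aws.pkl")
results = pd.DataFrame(search.cv_results_)
print(results[["params", "mean_test_accuracy", "mean_test_recall", "mean_test_precision"]])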
Example #7
def main():
    model = pipeline.Pipeline([
        ("features",
         pipeline.FeatureUnion(
             transformer_list=[("other_features",
                                AddOtherFeatures(feature_to_add="None")),
                               ("text_data",
                                pipeline.Pipeline((
                                    "remove_words",
                                    RemoveWords(words_to_remove="None"),
                                    "normalize_words",
                                    NormalizeWords(normalize_type="None"),
                                    "vectorize_text",
                                    VectorizeText(vectorize_type="None"),
                                    "reduce_dimension",
                                    ReduceDimension(reduction_type="None"),
                                    "normalize_features",
                                    NormalizeFeatures(
                                        normalize_type="None"))))])),
        ("classifier", CurrentModel(model_name="knn"))
    ])

    scoring = calculate_score()

    grid_search_model = model_selection.GridSearchCV(model, {
        "features__other_features__feature_to_add": ["None"],
        "features__text_data__remove_words":
        ["None", "tool_words", "closed_class", "tool_words_and_closed_class"],
        "features__text_data__normalize_words":
        ["None", "Stemming", "Lemmatization"],
        "features__text_data__vectorize_text":
        ["None", "Presence", "Frequency", "td_idf"],
        "features__text_data__reduce_dimension": ["None", "PCA"],
        "features__text_data__normalize_features": ["None", "min_max_scale"]
    },
                                                     n_jobs=-1,
                                                     verbose=10,
                                                     scoring=scoring,
                                                     refit=False)

    grid_search_model.fit(X_train, y_train)
    pickle.dump(grid_search_model, open("fitted_pipeline", "wb"))
def build_model():
    """
    Build an NLP pipeline for multi-label text classification.
    """
    # text processing and model pipeline
    pipeline = skpipe.Pipeline([
        ('nlp',
         skpipe.FeatureUnion([
             ('tfif',
              skpipe.Pipeline([('feat',
                                skfet.TfidfVectorizer(strip_accents='unicode',
                                                      tokenizer=tokenize)),
                               ('lsa',
                                skdec.TruncatedSVD(n_components=200,
                                                   algorithm='arpack'))])),
             ('uppr', RatioUpperExtractor()), ('verb', CountVerbExtractor()),
             ('noun', RatioNounExtractor())
         ])), ('norm', skprep.StandardScaler()),
        ('clf',
         MLPClassifier(activation='logistic',
                       learning_rate='adaptive',
                       early_stopping=True,
                       random_state=RANDOM_SEED,
                       verbose=1))
    ])

    # define grid search parameters
    params = {
        'clf__learning_rate_init': [5e-3, 7.5e-3, 1e-2],
        'clf__hidden_layer_sizes': [(100,), (200,), (300,)]
    }
    # instantiate GridSearchCV object
    cv = skms.GridSearchCV(estimator=pipeline,
                           param_grid=params,
                           n_jobs=-1,
                           refit=True,
                           return_train_score=True)

    return cv
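A hedged usage sketch for build_model; the X_train / Y_train names and the multi-label indicator-matrix target are assumptions.

# X_train: iterable of raw message strings
# Y_train: binary indicator matrix of shape (n_samples, n_labels)
cv = build_model()
cv.fit(X_train, Y_train)
print(cv.best_params_)
Y_pred = cv.best_estimator_.predict(X_train)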
Example #9
    def get_estimator(self):
        binary = ('binary_variables_processing',
                  preprocessing.FunctionTransformer(
                      lambda data: data[:, Model.binary_data_indices],
                      validate=True))
        categorial = (
            'categorical_variables_processing',
            pipeline.Pipeline(
                steps=[(
                    'selecting',
                    preprocessing.FunctionTransformer(
                        lambda data: data[:, Model.categorical_data_indices],
                        validate=True)),
                       ('hot_encoding',
                        preprocessing.OneHotEncoder(handle_unknown='ignore',
                                                    sparse=False))]))
        estimator = pipeline.Pipeline(
            steps=[('feature_processing',
                    pipeline.FeatureUnion(
                        transformer_list=[binary, categorial])
                    ), ('model_fitting', self.regressor)])
        return estimator
    def __init__(self, env, use_kernel=False, **agent_params):
        self.env = env
        self.use_kernel = use_kernel

        if use_kernel:
            # Sample feature space and define scaler to detrend data
            observation_samples = np.array(
                [env.observation_space.sample() for x in range(10000)])
            self.detrend = preprocessing.StandardScaler()
            self.detrend.fit(observation_samples)

            # Use detrended data to generate feature space with RBF kernels
            self.featurizer = pipeline.FeatureUnion([
                ("rbf1", RBFSampler(gamma=3.0, n_components=100)),
                ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
                ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
                ("rbf4", RBFSampler(gamma=0.5, n_components=100))
            ])
            self.featurizer.fit(self.detrend.transform(observation_samples))

        # Generate linear value function model for each action in our action space
        self.models = []
        initReward = np.array(0)
        for k in range(env.action_space.n):
            self.models.append(
                linear_model.SGDRegressor(learning_rate="constant"))
            random_features = self.map_to_features(self.env.reset())
            self.models[k].partial_fit(random_features.reshape(1, -1),
                                       initReward.ravel())

        self.agent_params = {
            "epsilon_min": 0.01,
            "decay_rate": 0.02,
            "discount": 0.99,
            "iter": 1000
        }
        self.agent_params.update(agent_params)
Example #11
])
#transformed_data=cat_feature_pipeline.fit_transform(X_train[['ENRL_CERT_NBR']])
num_feature_pipeline = pipeline.Pipeline([
    ('imputation', impute.SimpleImputer()),
    ('standardscalar', preprocessing.StandardScaler())
])

#transformed_data=num_feature_pipeline.fit_transform(X_train[['TOT_BLNG_AMT']])
feature_preprocessing = compose.ColumnTransformer(
    [('cat_feature_pipeline', cat_feature_pipeline, cat_features_list),
     ('num_feature_pipeline', num_feature_pipeline, num_features_list)],
    n_jobs=10)

features_pipeline = pipeline.FeatureUnion(
    [('pca_selector', decomposition.PCA(n_components=0.90)),
     ('et_selector',
      feature_selection.SelectFromModel(ensemble.ExtraTreesClassifier()))],
    n_jobs=20)

classifier = tree.DecisionTreeClassifier()
#build complete pipeline with feature selection and ml algorithms
complete_pipeline = PMMLPipeline([('preprocess', feature_preprocessing),
                                  ('zv_filter',
                                   feature_selection.VarianceThreshold()),
                                  ('features', features_pipeline),
                                  ('tree', classifier)])

pipeline_grid = {}
grid_estimator = model_selection.GridSearchCV(complete_pipeline,
                                              pipeline_grid,
                                              scoring="accuracy",
    def _generate_feature_extraction_pipeline(self):

        lang = self.feature_config.lang
        feature_weights = self.feature_config.weights
        prep_params = self.feature_config.prepchoice

        # features found in the processed tokens
        preprocessor = prep.Preprocessor(
            lang=lang,
            stopword=prep_params.stopword,
            more_stopwords=prep_params.more_stopwords,
            spellcheck=prep_params.spellcheck,
            stemming=prep_params.stemming,
            remove_numbers=prep_params.remove_numbers,
            deasciify=prep_params.deasciify,
            remove_punkt=prep_params.remove_punkt,
            lowercase=prep_params.lowercase)

        tfidfvect = sktext.TfidfVectorizer(
            tokenizer=prep.identity,
            preprocessor=None,
            lowercase=False,
            use_idf=prep_params.use_idf,
            ngram_range=prep_params.wordngramrange,
            max_features=prep_params.nmaxfeature)

        polpipe3 = toktrans.get_lexicon_count_pipeline(tokenizer=prep.identity,
                                                       lexicontype=lang)

        token_weights = dict(tfidfvect=feature_weights["word_tfidf"],
                             polpipe3=feature_weights["lexicon_count"])
        token_transformers_dict = dict(
            tfidfvect=
            tfidfvect,  # not to lose above integrity if we change variable names
            polpipe3=polpipe3)
        token_transformers = [(k, v)
                              for k, v in token_transformers_dict.items()]

        tokenbasedpipe = skpipeline.Pipeline([
            ('preprocessor', preprocessor),
            # ('nadropper', tbt.DropNATransformer()),
            ('union1',
             skpipeline.FeatureUnion(transformer_list=token_transformers,
                                     transformer_weights=token_weights)),
        ])

        charngramvect = sktext.TfidfVectorizer(
            analyzer='char_wb',
            ngram_range=prep_params.charngramrange,
            lowercase=False)

        polpipe1 = txtrans.get_polylglot_polarity_count_pipe(lang)
        polpipe2 = txtrans.get_polylglot_polarity_value_pipe(lang)

        text_weights = dict(
            charngramvect=feature_weights["char_tfidf"],  # @TODO hardcoded
            polpipe1=feature_weights["polyglot_count"],
            polpipe2=feature_weights["polyglot_value"])
        text_transformers_dict = dict(charngramvect=charngramvect,
                                      polpipe1=polpipe1,
                                      polpipe2=polpipe2)
        text_transformers = [(k, v) for k, v in text_transformers_dict.items()]
        '''
        textpipes = [('charngramvect', charngramvect),]
        textpweights = {'charngramvect' : 1.5}
        textpweights = dict(charngramvect = 1 if charngramvect else 0)
        '''
        textbasedpipe = skpipeline.Pipeline([(
            'union2',
            skpipeline.FeatureUnion(transformer_list=text_transformers,
                                    transformer_weights=text_weights),
        )])

        print(text_weights)

        final_transformers_dict = dict(tokenbasedpipe=tokenbasedpipe,
                                       textbasedpipe=textbasedpipe)
        final_transformers = [(k, v)
                              for k, v in final_transformers_dict.items()]
        '''        
        #tweights = {k : 1 if v else 0 for k,v in final_transformers.items()}
        check_zero = lambda x : 1 if sum(x) > 0 else 0
        x = list(tokenbasedpipe.get_params(False).values())
        print(len(x), x[0])
        print(x[0][1])   # convert x[0] tuple to dict, then get transformer weights
        print("**")
        print(x,"\n--")
        print(list(textbasedpipe.get_params(False).values()))
        tweights = {k : check_zero(list(k.get_params(False).values())[0][0][1].get_params(False)["transformer_weights"].values())
                          for _, k in final_transformers_dict.items()}
        '''

        feature_union = skpipeline.FeatureUnion(
            transformer_list=final_transformers,
            # transformer_weights=tweights   # weight assignment is not necessary as the number of features is small
        )

        return feature_union
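The returned FeatureUnion is meant to be dropped into a classification pipeline. A minimal sketch, assuming an instance called extractor, a linear classifier, and raw docs / labels arrays:

from sklearn import pipeline as skpipeline
from sklearn.linear_model import SGDClassifier

feature_union = extractor._generate_feature_extraction_pipeline()
clf_pipeline = skpipeline.Pipeline([
    ('features', feature_union),
    ('classifier', SGDClassifier(loss='hinge')),
])
clf_pipeline.fit(docs, labels)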
def run_prep():

    classifier = sklinear.SGDClassifier(loss='hinge',
                                        penalty='l2',
                                        alpha=1e-3,
                                        n_iter=5,
                                        random_state=42)

    lang = "tr"
    stopword_choice = True
    more_stopwords_list = None
    spellcheck_choice = False
    stemming_choice = False
    number_choice = False
    deasc_choice = True
    punct_choice = True
    case_choice = True

    ngramrange = (1, 2)  # tuple
    nmaxfeature = 10000  # int or None
    norm = "l2"
    use_idf = True

    preprocessor = Preprocessor(lang=lang,
                                stopword=stopword_choice,
                                more_stopwords=more_stopwords_list,
                                spellcheck=spellcheck_choice,
                                stemming=stemming_choice,
                                remove_numbers=number_choice,
                                deasciify=deasc_choice,
                                remove_punkt=punct_choice,
                                lowercase=case_choice)
    tfidfvect = TfidfVectorizer(tokenizer=identity,
                                preprocessor=None,
                                lowercase=False,
                                use_idf=use_idf,
                                ngram_range=ngramrange,
                                max_features=nmaxfeature)

    keyword = "arıza"
    apipe = tbt.get_keyword_pipeline(keyword)
    keyword2 = "pstn"
    pstnpipe = tbt.get_keyword_pipeline(keyword2)
    polpipe1 = tbt.get_polylglot_polarity_count_pipe(lang)
    polpipe2 = tbt.get_polylglot_polarity_value_pipe(lang)
    polpipe3 = obt.get_lexicon_count_pipeline(tokenizer=identity)

    tokenizedpipe = skpipeline.Pipeline([
        ('preprocessor', preprocessor),
        ('union1',
         skpipeline.FeatureUnion(transformer_list=[
             ('vect', tfidfvect),
             ('polarity3', polpipe3),
         ])),
    ])

    textbasedpipe = skpipeline.Pipeline([(
        'union2',
        skpipeline.FeatureUnion([
            ('has_ariza', apipe),
            ('has_pstn', pstnpipe),
            ('polarity1', polpipe1),
            ('polarity2', polpipe2),
        ]),
    )])

    model = skpipeline.Pipeline([

        # ('preprocessor', preprocessor),
        ("union",
         skpipeline.FeatureUnion(transformer_list=[
             ('tfidf', tokenizedpipe),
             ('txtpipe', textbasedpipe),
         ])),
        ('classifier', classifier),
    ])

    t0 = time()
    print("Read data")
    instances, labels = get_data.get_data()

    N = 100
    instances, labels = corpus_io.select_N_instances(N, instances, labels)
    # instances_train, instances_test, ytrain, ytest = cv.train_test_split(instances, labels, test_size=0.30, random_state=20)

    print("Start classification\n..")
    nfolds = 5
    ypred = cv.cross_val_predict(model, instances, labels, cv=nfolds)
    tc_utils.get_performance(labels, ypred, verbose=True)
    t1 = time()
    print("Classification took ", round(t1 - t0, 2), "sec.")
def _email_features_pipeline(lang,
                                stopword_choice=True,
                                more_stopwords_list=None,
                                spellcheck_choice=False,
                                stemming_choice=False,
                                number_choice=False,
                                deasc_choice=True,
                                punct_choice=True,
                                case_choice=True,
                                
                                ngramrange=(1, 2),  # tuple
                                nmaxfeature=10000,  # int or None  
                                norm="l2",
                                use_idf=True,
                                keywords=[],  # ["arıza", "pstn"]
                                final_weights=dict(text_based=1, token_based=1)
                                ):
        

    # use a list of (pipeline, pipeline_name, weight)

    # features found in the processed tokens
    token_features = []
    token_weights = {}
    preprocessor = prep.Preprocessor(lang=lang,
                                 stopword=stopword_choice, more_stopwords=more_stopwords_list,
                                 spellcheck=spellcheck_choice,
                                 stemming=stemming_choice,
                                 remove_numbers=number_choice,
                                 deasciify=deasc_choice,
                                 remove_punkt=punct_choice,
                                 lowercase=case_choice
                                )
    
    tfidfvect = TfidfVectorizer(tokenizer=prep.identity, preprocessor=None, lowercase=False,
                                use_idf=use_idf, ngram_range=ngramrange, max_features=nmaxfeature)

    tfidfvect_name = 'word_tfidfvect'
    token_features.append((tfidfvect_name, tfidfvect))
    token_weights[tfidfvect_name] = 1
       
    
    
        # features found in the whole raw text
    text_features = []
    text_weights = {}
    # charngramvect = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 2), lowercase=False)
    # keyword presence features
    if keywords:
        for keyword in keywords:
            keywordpipe = txbt.get_keyword_pipeline(keyword)
            feature_name = "has_" + keyword
            text_features.append((feature_name, keywordpipe))
            text_weights[feature_name] = 1
            
    
    
    
    tokenbasedpipe = skpipeline.Pipeline([('preprocessor', preprocessor),
                                          # ('nadropper', tbt.DropNATransformer()),                                       
                                          ('union1', skpipeline.FeatureUnion(
                                                transformer_list=token_features ,
                                                transformer_weights=token_weights                                                
                                                )),
                                        ])
    
    textbasedpipe = skpipeline.Pipeline([('union2', skpipeline.FeatureUnion(
                                            transformer_list=text_features,
                                            transformer_weights=text_weights
                                            ),
                                          )
                                        ])
    
    
    #######
    # add the feature pipes to final_features if all the component weights are non-zero.
    ########
    check_zero_list = lambda x : 1 if sum(x) > 0 else 0
    #  l = [0,0,0] => check_zero(l) gives 0 and l=[0,0,1] => check_zero(l) gives 1.
    final_features_dict = {}     
            
    tkweights = list(token_weights.values())
    if(check_zero_list(tkweights) != 0):
        final_features_dict["token_based"] = tokenbasedpipe
    else:
        final_weights["token_based"] = 0
      
    txweights = list(text_weights.values())
    if(check_zero_list(txweights) != 0):
        final_features_dict["text_based"] = textbasedpipe
    else:
        final_weights["text_based"] = 0  
                                        
    final_features = list(final_features_dict.items())    
    
    fweights = list(final_weights.values())
    if((check_zero_list(fweights) == 0) or (len(final_features) == 0)):
        return None
    
    '''
    features = skpipeline.FeatureUnion(transformer_list=[
                                        ('tokenbasedfeatures', tokenbasedpipe),
                                        ('textbasedfeatures', textbasedpipe),                                          
                                       ],
                                       transformer_weights=final_weights)
    '''
    features = skpipeline.FeatureUnion(transformer_list=final_features,
                                       transformer_weights=final_weights)
    return features
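The weight-gating logic above means the helper can return a reduced union, or None. A short sketch of the cases, with made-up keyword and weight values; the behaviour itself follows directly from the code above.

# keyword features enabled -> FeatureUnion with both the token and text branches
features = _email_features_pipeline("tr", keywords=["arıza", "pstn"],
                                    final_weights=dict(text_based=1, token_based=1))

# no keywords -> the text branch has no members, so only the token branch survives;
# if the token weights were all zero as well, the function would return None
features = _email_features_pipeline("tr", keywords=[],
                                    final_weights=dict(text_based=1, token_based=1))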
def _tr_sentiment_features_pipeline(
        lang="tr",
        feature_weights={
            "word_tfidf": 1,
            "polyglot_value": 0,
            "polyglot_count": 0,
            "lexicon_count": 0,
            "char_tfidf": 1
        },
        stopword_choice=True,
        more_stopwords_list=None,
        spellcheck_choice=False,
        stemming_choice=False,
        number_choice=False,
        deasc_choice=True,
        punct_choice=True,
        case_choice=True,
        word_ngramrange=(1, 2),  # tuple
        char_ngramrange=(2, 2),
        nmaxfeature=10000,  # int or None  
        norm="l2",
        use_idf=True):

    preprocessor = prep.Preprocessor(lang=lang,
                                     stopword=stopword_choice,
                                     more_stopwords=more_stopwords_list,
                                     spellcheck=spellcheck_choice,
                                     stemming=stemming_choice,
                                     remove_numbers=number_choice,
                                     deasciify=deasc_choice,
                                     remove_punkt=punct_choice,
                                     lowercase=case_choice)
    tfidfvect = TfidfVectorizer(tokenizer=prep.identity,
                                preprocessor=None,
                                lowercase=False,
                                use_idf=use_idf,
                                ngram_range=word_ngramrange,
                                max_features=nmaxfeature)
    polpipe3 = obt.get_lexicon_count_pipeline(tokenizer=prep.identity)

    token_weights = dict(tfidfvect=feature_weights["word_tfidf"],
                         polpipe3=feature_weights["lexicon_count"])
    token_transformers_dict = dict(
        tfidfvect=
        tfidfvect,  # not to lose above integrity if we change variable names
        polpipe3=polpipe3)
    token_transformers = [(k, v) for k, v in token_transformers_dict.items()]

    tokenbasedpipe = skpipeline.Pipeline([
        ('preprocessor', preprocessor),
        # ('nadropper', tbt.DropNATransformer()),
        ('union1',
         skpipeline.FeatureUnion(transformer_list=token_transformers,
                                 transformer_weights=token_weights)),
    ])

    charngramvect = TfidfVectorizer(analyzer='char_wb',
                                    ngram_range=char_ngramrange,
                                    lowercase=False)

    polpipe1 = tbt.get_polylglot_polarity_count_pipe(lang)
    polpipe2 = tbt.get_polylglot_polarity_value_pipe(lang)

    text_weights = dict(charngramvect=feature_weights["char_tfidf"],
                        polpipe1=feature_weights["polyglot_count"],
                        polpipe2=feature_weights["polyglot_value"])
    text_transformers_dict = dict(charngramvect=charngramvect,
                                  polpipe1=polpipe1,
                                  polpipe2=polpipe2)
    text_transformers = [(k, v) for k, v in text_transformers_dict.items()]
    '''
    textpipes = [('charngramvect', charngramvect),]
    textpweights = {'charngramvect' : 1.5}
    textpweights = dict(charngramvect = 1 if charngramvect else 0)
    '''
    textbasedpipe = skpipeline.Pipeline([(
        'union2',
        skpipeline.FeatureUnion(transformer_list=text_transformers,
                                transformer_weights=text_weights),
    )])

    final_transformers_dict = dict(tokenbasedpipe=tokenbasedpipe,
                                   textbasedpipe=textbasedpipe)
    final_transformers = [(k, v) for k, v in final_transformers_dict.items()]
    '''        
    #tweights = {k : 1 if v else 0 for k,v in final_transformers.items()}
    check_zero = lambda x : 1 if sum(x) > 0 else 0
    x = list(tokenbasedpipe.get_params(False).values())
    print(len(x), x[0])
    print(x[0][1])   # convert x[0] tuple to dict, then get transformer weights
    print("**")
    print(x,"\n--")
    print(list(textbasedpipe.get_params(False).values()))
    tweights = {k : check_zero(list(k.get_params(False).values())[0][0][1].get_params(False)["transformer_weights"].values())
                      for _, k in final_transformers_dict.items()}
    '''

    features = skpipeline.FeatureUnion(
        transformer_list=final_transformers,
        # transformer_weights=tweights   # weight assignment is not necessary as the number of features is small
    )
    '''
    tokenbasedpipe = skpipeline.Pipeline([('preprocessor', preprocessor),
                                          #('nadropper', tbt.DropNATransformer()),                                       
                                          ('union1', skpipeline.FeatureUnion(
                                                transformer_list=[                                                   
                                                 ('tfidfvect', tfidfvect),
                                                 #('polarity3', polpipe3),
                                        ])),]
                                        )
    
    
    textbasedpipe = skpipeline.Pipeline([('union2', skpipeline.FeatureUnion([                                
                                         #('polarity1', polpipe1),
                                         #('polarity2', polpipe2),
                                         ('charngramvect', charngramvect),
                                         ]),)])
    
    features = skpipeline.FeatureUnion(transformer_list=[
                                        ('tokenbasedfeatures', tokenbasedpipe),
                                        ('textbasedfeatures', textbasedpipe),
                                       ])
    '''
    return features
Example #16
def _ar_txt_clf_features_pipeline2(
    feature_params_config_dict  #  {feature_params: {lang: .., weights : .., prep : {}, keywords : []}} see EMAIL_CONF for an example.
):

    lang = feature_params_config_dict[conf.lang_key]
    feature_weights = feature_params_config_dict[conf.weights_key]
    prep_params = feature_params_config_dict[conf.prep_key]

    #print(feature_weights)

    # features found in the processed tokens

    preprocessor = prep.Preprocessor(
        lang=lang,
        stopword=prep_params[conf.stopword_key],
        more_stopwords=prep_params[conf.more_stopwords_key],
        spellcheck=prep_params[conf.spellcheck_key],
        stemming=prep_params[conf.stemming_key],
        remove_numbers=prep_params[conf.remove_numbers_key],
        deasciify=prep_params[conf.deasciify_key],
        remove_punkt=prep_params[conf.remove_punkt_key],
        lowercase=prep_params[conf.lowercase_key])

    tfidfvect = TfidfVectorizer(
        tokenizer=prep.identity,
        preprocessor=None,
        lowercase=False,
        use_idf=prep_params[conf.use_idf_key],
        ngram_range=prep_params[conf.wordngramrange_key],
        max_features=prep_params[conf.nmaxfeature_key])

    token_weights = dict(tfidfvect=feature_weights["word_tfidf"], )
    token_transformers_dict = dict(
        tfidfvect=
        tfidfvect,  # not to lose above integrity if we change variable names
    )
    token_transformers = [(k, v) for k, v in token_transformers_dict.items()]

    tokenbasedpipe = skpipeline.Pipeline([
        ('preprocessor', preprocessor),
        # ('nadropper', tbt.DropNATransformer()),
        ('union1',
         skpipeline.FeatureUnion(transformer_list=token_transformers,
                                 transformer_weights=token_weights)),
    ])

    charngramvect = TfidfVectorizer(
        analyzer='char_wb',
        ngram_range=prep_params[conf.charngramrange_key],
        lowercase=False)

    # stylistic
    '''
    # BUG
    named_entity_pipe = tbt.get_named_entity_weight_pipeline(lang)
    
    text_weights = dict(charngramvect=feature_weights["char_tfidf"],   # @TODO hardcoded
                             polpipe1=feature_weights["polyglot_count"],
                             polpipe2=feature_weights["polyglot_value"],
                             named_entity_pipe=feature_weights["named_entity_rate"])
                             
    text_transformers_dict = dict(charngramvect=charngramvect,
                             polpipe1=polpipe1,
                             polpipe2=polpipe2,
                             named_entity_pipe=named_entity_pipe)
    '''

    text_weights = dict(
        charngramvect=feature_weights["char_tfidf"],  # @TODO hardcoded
    )

    text_transformers_dict = dict(charngramvect=charngramvect, )

    text_transformers = [(k, v) for k, v in text_transformers_dict.items()]
    '''
    textpipes = [('charngramvect', charngramvect),]
    textpweights = {'charngramvect' : 1.5}
    textpweights = dict(charngramvect = 1 if charngramvect else 0)
    '''
    textbasedpipe = skpipeline.Pipeline([(
        'union2',
        skpipeline.FeatureUnion(transformer_list=text_transformers,
                                transformer_weights=text_weights),
    )])

    final_transformers_dict = dict(tokenbasedpipe=tokenbasedpipe,
                                   textbasedpipe=textbasedpipe)
    final_transformers = [(k, v) for k, v in final_transformers_dict.items()]

    #print(textbasedpipe.named_steps)
    '''        
    #tweights = {k : 1 if v else 0 for k,v in final_transformers.items()}
    check_zero = lambda x : 1 if sum(x) > 0 else 0
    x = list(tokenbasedpipe.get_params(False).values())
    print(len(x), x[0])
    print(x[0][1])   # convert x[0] tuple to dict, then get transformer weights
    print("**")
    print(x,"\n--")
    print(list(textbasedpipe.get_params(False).values()))
    tweights = {k : check_zero(list(k.get_params(False).values())[0][0][1].get_params(False)["transformer_weights"].values())
                      for _, k in final_transformers_dict.items()}
    '''

    features = skpipeline.FeatureUnion(
        transformer_list=final_transformers,
        # transformer_weights=tweights   # weight assignment is not necessary as the number of features is small
    )

    #print("0000000000", feature_params_config_dict)

    return features
Example #17
        return self

    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)


cat_pipeline = pipeline.Pipeline([
    ("select_cat", DataFrameSelector(["Pclass", "Sex", "Embarked"])),
    ("imputer", MostFrequentImputer()),
    ("cat_encoder", OneHotEncoder(sparse=False)),
])

cat_pipeline.fit_transform(train_data)

preprocess_pipeline = pipeline.FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

X_train = preprocess_pipeline.fit_transform(train_data)
print('X_train')
print(X_train[:5])

y_train = train_data["Survived"]

# SVC
svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)
print('svm_clf')
print(svm_clf)

# CHECK PREDICTION
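The snippet breaks off right after the CHECK PREDICTION comment; a minimal sketch of such a check, using only the X_train, y_train, and svm_clf objects defined above plus cross-validation, could be:

from sklearn import model_selection

scores = model_selection.cross_val_score(svm_clf, X_train, y_train, cv=10)
print('SVC cross-validated accuracy: {:.3f}'.format(scores.mean()))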
    INPUT: Dataframe with features (X), target variable dataframe (y), polynomial degree (parameter)
    OUTPUT: Score of XGB Regressor
    '''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model_xgb = xgb.XGBRegressor(n_estimators=n_est, learning_rate=0.15, max_depth=2)
    model_xgb.fit(X_train, y_train, eval_metric='rmse')

    return model_xgb.score(X_test, y_test)

def regression_pipeline(x, y, degree, pca_comps):
    '''
    INPUT: Dataframe with features (X), target variable dataframe (y), polynomial degree, PCA components
    OUTPUT: Score of regression pipeline using polynomial features and PCA
    '''
    # let's try applying some feature transforms to our original X
    combined_features = pipeline.FeatureUnion([
        ('poly', PolynomialFeatures(degree=degree)),
        ('pca', decomposition.PCA(n_components=pca_comps))])
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.3)

    # now lay out the steps of the model:
    # first, the feature transforms with the feature union defined above;
    # second, feature selection with the built-in SelectFromModel;
    # third, training the actual model
    steps = [
        ('features', combined_features),
        ('feature_selection', feature_selection.SelectFromModel(Lasso(alpha=.5))),
        ('model', LinearRegression())]

    # at this point the pipeline is only defined; no training has happened yet
    regression_pipeline = pipeline.Pipeline(steps)
    # calling fit here fits the entire pipeline, which in turn fits all of its steps
    regression_pipeline.fit(X_train, y_train)
    return regression_pipeline.score(X_test, y_test)
clf = XGBClassifier(max_depth=5, n_estimators=100)

estimator = pipeline.Pipeline(steps=[
    (
        'feature_preprocessing',
        pipeline.FeatureUnion(transformer_list=[
            #real
            ('numeric_variables_processing',
             pipeline.Pipeline(
                 steps=[('selecting',
                         preprocessing.FunctionTransformer(
                             lambda data: data[:, real_data_indices])),
                        ('scaling',
                         preprocessing.StandardScaler(with_mean=0))])),

            #categorical
            ('categorical_variables_processing',
             pipeline.Pipeline(
                 steps=[('selecting',
                         preprocessing.FunctionTransformer(
                             lambda data: data[:, cat_data_indices])),
                        ('hot_encoding',
                         preprocessing.OneHotEncoder(
                             handle_unknown='ignore'))])),
        ])),
    ('model_fitting', clf)
])

estimator.fit(train_data, y)
pred = estimator.predict(test_data)
write_to_csv('check.csv', ['PassengerId', 'Survived'], pred)
Example #20
counters_pipe = pipeline.FeatureUnion(
    n_jobs=-1,
    transformer_list=[
        ('chars_features',
         pipeline.Pipeline([('chars_counter',
                             TfidfVectorizer(analyzer=u'char',
                                             ngram_range=(2, 5),
                                             tokenizer=None,
                                             max_features=config.max_features,
                                             strip_accents=None,
                                             max_df=0.9,
                                             min_df=2,
                                             lowercase=False)),
                            ('chars_tsvd',
                             TruncatedSVD(n_components=config.svd_n_components,
                                          n_iter=25,
                                          random_state=42))])),
        ('words_features',
         pipeline.Pipeline([('words_counter',
                             TfidfVectorizer(analyzer=u'word',
                                             ngram_range=(1, 3),
                                             tokenizer=None,
                                             use_idf=True,
                                             max_features=config.max_features,
                                             strip_accents=None,
                                             max_df=0.9,
                                             min_df=2,
                                             lowercase=False)),
                            ('words_tsvd',
                             TruncatedSVD(n_components=config.svd_n_components,
                                          n_iter=25,
                                          random_state=42))])),
    ])
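A hedged usage sketch for counters_pipe; the config values and the list of raw documents are assumptions.

# train_texts is assumed to be a list of raw document strings
text_features = counters_pipe.fit_transform(train_texts)

# both branches (char TF-IDF + SVD, word TF-IDF + SVD) run in parallel (n_jobs=-1)
# and their reduced outputs are concatenated column-wise
print(text_features.shape)   # (len(train_texts), 2 * config.svd_n_components)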
    def _generate_feature_extraction_pipeline(self):

        lang = self.feature_config.lang
        final_weights = self.feature_config.weights
        prep_params = self.feature_config.prepchoice

        # features found in the processed tokens
        token_features = []
        token_weights = {}

        preprocessor = prep.Preprocessor(
            lang=lang,
            stopword=prep_params.stopword,
            more_stopwords=prep_params.more_stopwords,
            spellcheck=prep_params.spellcheck,
            stemming=prep_params.stemming,
            remove_numbers=prep_params.remove_numbers,
            deasciify=prep_params.deasciify,
            remove_punkt=prep_params.remove_punkt,
            lowercase=prep_params.lowercase)

        tfidfvect = TfidfVectorizer(tokenizer=prep.identity,
                                    preprocessor=None,
                                    lowercase=False,
                                    use_idf=prep_params.use_idf,
                                    ngram_range=prep_params.wordngramrange,
                                    max_features=prep_params.nmaxfeature)

        tfidfvect_name = 'word_tfidfvect'
        token_features.append((tfidfvect_name, tfidfvect))
        token_weights[tfidfvect_name] = 1

        # features found in the whole raw text
        text_features = []
        text_weights = {}
        # charngramvect = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 2), lowercase=False)

        tokenbasedpipe = skpipeline.Pipeline([
            ('preprocessor', preprocessor),
            # ('nadropper', tbt.DropNATransformer()),
            ('union1',
             skpipeline.FeatureUnion(transformer_list=token_features,
                                     transformer_weights=token_weights)),
        ])
        if text_weights:
            textbasedpipe = skpipeline.Pipeline([(
                'union2',
                skpipeline.FeatureUnion(transformer_list=text_features,
                                        transformer_weights=text_weights),
            )])
        '''
        features = skpipeline.FeatureUnion(transformer_list=[
                                            ('tokenbasedfeatures', tokenbasedpipe),
                                            ('textbasedfeatures', textbasedpipe),                                          
                                           ],
                                           transformer_weights=final_weights)
        '''
        #######
        # add the feature pipes to final_features if all the component weights are non-zero.
        ########
        check_zero_list = lambda x: 1 if sum(x) > 0 else 0
        #  l = [0,0,0] => check_zero(l) gives 0 and l=[0,0,1] => check_zero(l) gives 1.
        final_features_dict = {}

        tkweights = list(token_weights.values())
        if (check_zero_list(tkweights) != 0):
            final_features_dict["token_based"] = tokenbasedpipe
        else:
            final_weights["token_based"] = 0

        txweights = list(text_weights.values())
        if (check_zero_list(txweights) != 0):
            final_features_dict["text_based"] = textbasedpipe
        else:
            final_weights["text_based"] = 0

        final_features = list(final_features_dict.items())

        fweights = list(final_weights.values())

        #print(final_weights)

        if ((check_zero_list(fweights) == 0) or (len(final_features) == 0)):
            return None

        features = skpipeline.FeatureUnion(transformer_list=final_features,
                                           transformer_weights=final_weights)
        return features
Example #22
    def _build_feature_pipeline(self,
                                sample_mode='rollouts',
                                num_components=50,
                                gammas=None,
                                num_obs=10000,
                                use_standard_scaler=True,
                                featurizer_max_env_steps=10000):
        """Build the feature pipeline.

    Args:
      sample_mode: A string representing how to collect data from the
        environment to build features. Must be {'rollouts', 'reset', 'random'}.
        - `rollouts` will collect observations by executing a random policy in
          the env.
        - `reset` will collect rollouts by repeatedly resetting the env.
        - `random` will just sample the env observation space randomly.
      num_components: The number of components in each RBF.
      gammas: A list containing the frequency of each RBF. If None will default
        to `[0.5, 1.0, 2.5, 5.0]`.
      num_obs: The integer number of observations to use to fit the Kernels.
      use_standard_scaler: Boolean indicating if the observations should be
        normalized.
      featurizer_max_env_steps: Maximum number of steps to be taken in each
        rollout to estimate the kernels in the featurizer.

    Raises:
      ValueError: If the `sample_mode` is unknown.
    """
        env = self._env._envs[0]  # pylint: disable=protected-access
        if gammas is None:
            gammas = [0.5, 1.0, 2.5, 5.0]

        features = []
        for i, gamma in enumerate(gammas):
            features.append(
                ('rbf{}'.format(i),
                 kernel_approximation.RBFSampler(gamma=gamma,
                                                 n_components=num_components)))
        self.featurizer = pipeline.FeatureUnion(features)
        if use_standard_scaler:
            self.scaler = skl_preprocessing.StandardScaler()

        if sample_mode == 'random':
            # Randomly sample from the observation space to fit the featurizers.
            observation_examples = np.array([env.observation_space.sample() for _ in range(num_obs)])  # pylint: disable=line-too-long
        elif sample_mode == 'reset':
            # Just reset the environment to obtain the observations.
            observation_examples = np.array(
                [env.reset() for _ in range(num_obs)])
        elif sample_mode == 'rollouts':
            # Rollout mode.
            observations = []
            while True:
                observations.append(env.reset())
                done = False
                t = 0
                while not done and t < featurizer_max_env_steps:
                    action = env.action_space.sample()
                    obs, _, done, _ = env.step(action)
                    observations.append(obs)
                    t += 1
                if len(observations) > num_obs:
                    break  # Collected enough observations.
            observation_examples = np.array(observations)
        else:
            raise ValueError('Unknown `sample_mode`!')

        if use_standard_scaler:
            self.scaler.fit(observation_examples)
            observation_examples = self.scaler.transform(observation_examples)
        self.featurizer.fit(observation_examples)
        self.use_standard_scaler = use_standard_scaler
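A hedged sketch of selecting the three sample_mode options described in the docstring above (assuming agent is an instance of the surrounding class):

# randomly sample the observation space
agent._build_feature_pipeline(sample_mode='random', num_obs=5000)

# repeatedly reset the environment
agent._build_feature_pipeline(sample_mode='reset', num_obs=5000)

# collect observations with a random policy (the default behaviour shown above)
agent._build_feature_pipeline(sample_mode='rollouts',
                              gammas=[0.5, 1.0, 2.5, 5.0],
                              featurizer_max_env_steps=200)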
Example #23
def main():
	'''
	The main function reads in data, sets up the feature union, fits the model, and writes out the predictions.
	'''
	
	train_file = 'train_data.csv'
	blind_file = 'test_features_2013-03-07.csv'

	#read csv files into a dataframe
	train_df = pd.read_csv(train_file)
	blind_df  = pd.read_csv(blind_file)

	#do print-out checks of the file
	print(train_df.shape)
	print(blind_df.shape)
	print(train_df.head(5))

	#fix column names in the blind_df column to be all lowercase to match with train_df
	for col in blind_df.columns:
		blind_df.rename(columns={col: col.lower()}, inplace=True)

	print(blind_df.head(5))

	#take out the zero-target values from the train data
	X_total = train_df[train_df['target'] > 0]

	#set the predicted y values to be the 'target' column from the train dataframe
	y_total = X_total['target']

	print(X_total.shape)
	print(y_total.shape)

	#divide train data into 'train' set and validation test set, where validation set is 20% of data from the training dataframe 
	X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_total, y_total, test_size=0.2)

	#make a dictionary of weights for each step in the model pipeline
	transform_dict = {'job type':1, 'years exp': 1, 'location': 1,
                  'degree': 1, 'major':1, 'industry':1}

	#combine features into one large set 
	all_features = pipeline.FeatureUnion([
     ('job type', FullModelTransformer(CategTransformer('jobtype'))),
     ('years exp', FullModelTransformer(LinFitTransformer('yearsexperience'))),
     ('location', FullModelTransformer(LinFitTransformer('milesfrommetropolis'))),
     ('degree', FullModelTransformer(CategTransformer('degree'))),
     ('major', FullModelTransformer(CategTransformer('major'))),
     ('industry', FullModelTransformer(CategTransformer('industry')))
        ],
     transformer_weights=transform_dict)

	#make a pipeline which fits all the features and then feeds those predictions to an overall model (k nearest neighbors)
	k_union = pipeline.Pipeline([
       ("features", all_features),
     ('modelfit', KNeighborsRegressor(n_neighbors=3))
 #("linreg", LinearRegression(fit_intercept=True))
    ])
	
	#fit the train data
	k_union.fit(X_train, y_train.values.reshape(-1,1))
	#print the R^2 score of the fit
	print(k_union.score(X_train, y_train.values.reshape(-1,1)))

	#fit the validation test data
	k_union.fit(X_test, y_test.values.reshape(-1,1))
	#print the R^2 score of the fit
	print(k_union.score(X_test, y_test.values.reshape(-1,1)))

	#predict on the blind data
	result = k_union.predict(blind_df)

	#add the prediction result as a column in the blind dataframe
	blind_df['target'] = result
	#write out resulting dataframe to a new csv file
	header = ["jobid", "target"]
	#blind_df.to_csv('test_target.csv', columns = header, index=False)

	#predict on the entire input dataset
	result = k_union.predict(X_total)
	X_total['target_pred'] = result

	#write resulting dataframe to a new csv file
	header = ["jobid", "target_pred"]
	X_total.to_csv('train_target_pred.csv', columns = header, index=False)

	#send results to plot
	make_plot(X_total, blind_df)

	return
Example #24
    # specify cross-validation
    k = 10  # number of folds
    cvsplitter = ms.KFold(n_splits=k, shuffle=True,
                          random_state=0)  # cross-validation splitter
    score = ms.cross_val_score(model, X, y, cv=cvsplitter)
    print('Standardized linear discriminant analysis mean accuracy: {0:.4f}'.
          format(score.mean()))

    # define steps in a feature selection pipeline
    features = list()
    features.append(('pca', decomp.PCA(
        n_components=3)))  # use PCA to select 3 of the best features
    features.append(('select_best', fs.SelectKBest(
        k=6)))  # use chi-squared test to select 6 of the best features
    feature_union = pipeline.FeatureUnion(
        features)  # create the feature selection pipeline
    estimators = list()
    estimators.append((
        'feature_union',
        feature_union))  # add the feature selection pipleine to a new pipeline
    estimators.append(('logistic', sl.LogisticRegression(
        max_iter=1000)))  # use logistic regression as the model
    model = pipeline.Pipeline(
        estimators
    )  # logistic regression with automatic feature selection by pca and chi-squared test

    # specify cross-validation
    score = ms.cross_val_score(model, X, y, cv=cvsplitter)
    print(
        'Logistic regression with automatic feature selection by PCA and chi2 test mean accuracy: {0:.4f}'
        .format(score.mean()))
 pipeline.FeatureUnion(
     n_jobs=-1,
     transformer_list=[
         ('standard', cust_regression_vals()),
         ('pi1',
          pipeline.Pipeline([
              ('Gene', cust_txt_col('Gene')),
              ('count_Gene',
               feature_extraction.text.CountVectorizer(analyzer=u'char',
                                                       ngram_range=(1,
                                                                    8))),
              ('tsvd1',
               decomposition.TruncatedSVD(n_components=20,
                                          n_iter=25,
                                          random_state=12))
          ])),
         ('pi2',
          pipeline.Pipeline([
              ('Variation', cust_txt_col('Variation')),
              ('count_Variation',
               feature_extraction.text.CountVectorizer(analyzer=u'char',
                                                       ngram_range=(1,
                                                                    8))),
              ('tsvd2',
               decomposition.TruncatedSVD(n_components=20,
                                          n_iter=25,
                                          random_state=12))
          ])),
         ('pi3',
          pipeline.Pipeline([
              ('Text', cust_txt_col('Text')),
              ('hv',
               feature_extraction.text.HashingVectorizer(
                   decode_error='ignore',
                   n_features=2**16,
                   # 'non_negative=True' was removed from scikit-learn;
                   # alternate_sign=False is the closest modern equivalent
                   alternate_sign=False,
                   ngram_range=(1, 3))),
              ('tfidf_Text', feature_extraction.text.TfidfTransformer()),
              ('tsvd3',
               decomposition.TruncatedSVD(n_components=300,
                                          n_iter=25,
                                          random_state=12))
          ]))
     ]))
Exemple #26
0
    'NumUniqueBadges', 'NumPosts', 'MeanPostScore', 'MeanPostViews',
    'MeanPostFavorites', 'MeanPostComments', 'NumComments', 'MeanCommentScore'
]
feature_cols = numeric_cols + ['AboutMe']

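# Numeric branch: select the numeric columns, expand them with polynomial
# features and standardize; text branch: TF-IDF over the free-text 'AboutMe'
# column; the combined features feed an ElasticNet regressor.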
model = skl_pipeline.Pipeline([
    ('feat',
     skl_pipeline.FeatureUnion([
         ('num',
          skl_pipeline.Pipeline([
              ('get',
               skl_preproc.FunctionTransformer(itemgetter(numeric_cols),
                                               validate=False)),
              ('poly', skl_preproc.PolynomialFeatures()),
              ('std', skl_preproc.StandardScaler()),
          ])),
         ('text',
          skl_pipeline.Pipeline([
              ('get',
               skl_preproc.FunctionTransformer(itemgetter('AboutMe'),
                                               validate=False)),
              ('tfidf', skl_featext.text.TfidfVectorizer()),
          ]))
     ])), ('reg', skl_linear.ElasticNet(alpha=1.0, l1_ratio=1.0))
])

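# Hyper-parameter grid: the double-underscore names address parameters of
# nested pipeline steps (e.g. feat__num__poly__degree targets the
# PolynomialFeatures step inside the numeric branch).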
param_grid = [
    dict(
        feat__num__poly__degree=[1, 2, 3],
        feat__num__poly__interaction_only=[True, False],
        feat__text__tfidf__max_df=[0.25, 0.5, 1.0],
Exemple #27
0
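# Each entry pairs a plain scikit-learn estimator (or union) with its
# frame-aware pd_* counterpart; the trailing boolean is a flag consumed by the
# surrounding test harness (its exact meaning is not shown in this snippet).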
_estimators.append(
    (cluster.KMeans(random_state=42),
     pickle.loads(pickle.dumps(pd_cluster.KMeans(random_state=42))), True))
_estimators.append((neighbors.KNeighborsClassifier(),
                    pd_neighbors.KNeighborsClassifier(), True))
_estimators.append(
    (ensemble.GradientBoostingClassifier(random_state=42),
     pd_ensemble.GradientBoostingClassifier(random_state=42), True))
_estimators.append((pipeline.make_union(decomposition.PCA(n_components=2),
                                        feature_selection.SelectKBest(k=1)),
                    pd_decomposition.PCA(n_components=2) +
                    pd_feature_selection.SelectKBest(k=1), True))
_estimators.append(
    (pipeline.FeatureUnion([('pca', decomposition.PCA(n_components=2)),
                            ('kbest', feature_selection.SelectKBest(k=1))],
                           transformer_weights={
                               'pca': 3,
                               'kbest': 4
                           }),
     pd_pipeline.FeatureUnion(
         [('pca', pd_decomposition.PCA(n_components=2)),
          ('kbest', pd_feature_selection.SelectKBest(k=1))],
         transformer_weights={
             'pca': 3,
             'kbest': 4
         }), True))
_estimators.append((pipeline.make_union(decomposition.PCA(n_components=1),
                                        decomposition.PCA(n_components=2),
                                        feature_selection.SelectKBest(k=1)),
                    pd_decomposition.PCA(n_components=1) +
                    pd_decomposition.PCA(n_components=2) +
                    pd_feature_selection.SelectKBest(k=1), True))
def _email_features_pipeline2(feature_params_config_dict):
    # feature_params_config_dict: {lang: .., weights: .., prep: {..}, keywords: [..]}
    # (see EMAIL_CONF for an example)
    lang = feature_params_config_dict[conf.lang_key]
    final_weights = feature_params_config_dict[conf.weights_key]
    prep_params = feature_params_config_dict[conf.prep_key]
    keywords = feature_params_config_dict[conf.keyword_key]

    # features found in the processed tokens
    token_features = []
    token_weights = {}

    preprocessor = prep.Preprocessor(lang=lang,
                                     stopword=prep_params[conf.stopword_key], more_stopwords=prep_params[conf.more_stopwords_key],
                                     spellcheck=prep_params[conf.spellcheck_key],
                                     stemming=prep_params[conf.stemming_key],
                                     remove_numbers=prep_params[conf.remove_numbers_key],
                                     deasciify=prep_params[conf.deasciify_key],
                                     remove_punkt=prep_params[conf.remove_punkt_key],
                                     lowercase=prep_params[conf.lowercase_key]
                                )
    
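    # TF-IDF over the preprocessed token lists: the identity tokenizer passes the
    # already-tokenized documents through unchanged, with no extra lowercasing here.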
    tfidfvect = TfidfVectorizer(tokenizer=prep.identity, preprocessor=None, lowercase=False,
                                use_idf=prep_params[conf.use_idf_key],
                                ngram_range=prep_params[conf.ngramrange_key],
                                max_features=prep_params[conf.nmaxfeature_key])

    tfidfvect_name = 'word_tfidfvect'
    token_features.append((tfidfvect_name, tfidfvect))
    token_weights[tfidfvect_name] = 1

    # features found in the whole raw text
    text_features = []
    text_weights = {}
    # charngramvect = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 2), lowercase=False)
    # keyword presence features
    if keywords:
        for keyword in keywords:
            keywordpipe = txbt.get_keyword_pipeline(keyword)
            feature_name = "has_" + keyword
            text_features.append((feature_name, keywordpipe))
            text_weights[feature_name] = 1

    # Two parallel feature pipelines: one runs the preprocessor and a TF-IDF
    # union over the resulting tokens, the other applies the keyword-presence
    # features to the raw text.
    tokenbasedpipe = skpipeline.Pipeline([
        ('preprocessor', preprocessor),
        # ('nadropper', tbt.DropNATransformer()),
        ('union1', skpipeline.FeatureUnion(
            transformer_list=token_features,
            transformer_weights=token_weights)),
    ])

    textbasedpipe = skpipeline.Pipeline([
        ('union2', skpipeline.FeatureUnion(
            transformer_list=text_features,
            transformer_weights=text_weights)),
    ])

    '''
    features = skpipeline.FeatureUnion(transformer_list=[
                                        ('tokenbasedfeatures', tokenbasedpipe),
                                        ('textbasedfeatures', textbasedpipe),                                          
                                       ],
                                       transformer_weights=final_weights)
    '''
    # Add a feature pipe to the final features only if at least one of its
    # component weights is non-zero, e.g. check_zero_list([0, 0, 0]) -> 0
    # and check_zero_list([0, 0, 1]) -> 1.
    check_zero_list = lambda x: 1 if sum(x) > 0 else 0
    final_features_dict = {}

    tkweights = list(token_weights.values())
    if check_zero_list(tkweights) != 0:
        final_features_dict["token_based"] = tokenbasedpipe
    else:
        final_weights["token_based"] = 0

    txweights = list(text_weights.values())
    if check_zero_list(txweights) != 0:
        final_features_dict["text_based"] = textbasedpipe
    else:
        final_weights["text_based"] = 0
                                        
    final_features = list(final_features_dict.items())
    fweights = list(final_weights.values())

    # print(final_weights)

    if check_zero_list(fweights) == 0 or len(final_features) == 0:
        return None

    features = skpipeline.FeatureUnion(transformer_list=final_features,
                                       transformer_weights=final_weights)
    return features
Exemple #29
0
    def __init__(self, env, use_kernel=False, **agent_params):
        self.env = env
        self.use_kernel = use_kernel
        self.agent_params = {
            "epsilon_min": 0.01,
            "decay_rate": 0.01,
            "discount": 0.99,
            "iter": 200,
        }
        self.agent_params.update(agent_params)

        # Generating feature space of RBF kernels
        if self.use_kernel:
            observation_samples = np.array(
                [env.observation_space.sample() for x in range(10000)])
            self.detrend = preprocessing.StandardScaler()
            self.detrend.fit(observation_samples)
            self.featurizer = pipeline.FeatureUnion([
                ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
                ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
                ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
                ("rbf4", RBFSampler(gamma=0.5, n_components=100))
            ])
            self.featurizer.fit(self.detrend.transform(observation_samples))
            self.n_features = len(
                self.featurizer.transform(env.observation_space.sample())[0])
        else:
            self.n_features = len(env.observation_space.sample())

        print(self.n_features)
        # Generating linear model approximation for value function
        with tf.variable_scope("value_function"):
            self.value_features = tf.placeholder(tf.float32, [self.n_features],
                                                 name="value_features")
            self.value_reward_target = tf.placeholder(
                tf.float32, name="value_reward_target")
            value_output_layer = tf.contrib.layers.fully_connected(
                inputs=tf.expand_dims(self.value_features, 0),
                num_outputs=1,
                activation_fn=None,
                weights_initializer=tf.zeros_initializer)

            self.value_estimate = tf.squeeze(value_output_layer)
            self.value_loss = tf.squared_difference(self.value_estimate,
                                                    self.value_reward_target)
            self.value_optimizer = tf.train.AdamOptimizer()
            self.value_train_op = self.value_optimizer.minimize(
                self.value_loss)

        # Generating linear model approximation for policy function
        with tf.variable_scope("policy_function"):
            self.action = tf.placeholder(tf.int32, name="action")
            self.policy_features = tf.placeholder(tf.float32,
                                                  [self.n_features],
                                                  name="policy_features")
            self.policy_reward_target = tf.placeholder(
                tf.float32, name="policy_reward_target")
            policy_output_layer = tf.contrib.layers.fully_connected(
                inputs=tf.expand_dims(self.policy_features, 0),
                num_outputs=env.action_space.n,
                activation_fn=None,
                weights_initializer=tf.zeros_initializer)

            self.action_probabilities = tf.squeeze(
                tf.nn.softmax(policy_output_layer))
            self.max_action_probability = tf.gather(self.action_probabilities,
                                                    self.action)

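            # Policy-gradient loss: negative log-probability of the chosen
            # action, weighted by the supplied reward/advantage target.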
            self.policy_loss = -tf.log(
                self.max_action_probability) * self.policy_reward_target
            self.policy_optimizer = tf.train.AdamOptimizer()
            self.policy_train_op = self.policy_optimizer.minimize(
                self.policy_loss)
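# Replace missing values with empty strings so the TF-IDF vectorizers below
# receive valid string input.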
train.fillna('', inplace=True)
test.fillna('', inplace=True)

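# Build character-level (2-5 grams) and word-level (1-3 grams) TF-IDF features
# in parallel and concatenate them into a single sparse feature matrix.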
counters_pipe = pipeline.FeatureUnion(
    n_jobs=-1,
    transformer_list=[
        ('chars_features',
         TfidfVectorizer(analyzer=u'char',
                         ngram_range=(2, 5),
                         tokenizer=None,
                         max_features=config.max_features,
                         strip_accents=None,
                         max_df=0.9,
                         min_df=2,
                         lowercase=False)),
        ('words_features',
         TfidfVectorizer(analyzer=u'word',
                         ngram_range=(1, 3),
                         tokenizer=None,
                         use_idf=True,
                         max_features=config.max_features,
                         strip_accents=None,
                         max_df=0.9,
                         min_df=2,
                         lowercase=False)),
    ])

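# Pre-compute the fold indices once so every model is evaluated on identical splits.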
cv = KFold(n_splits=config.nfolds, shuffle=True, random_state=42)
splits = list(cv.split(train))