def weka_arff_ttest(model='nc'):
    """Prepare a dataset and hand every configured classifier to the t-test.

    The configuration is loaded through ``utility.load_config`` and its
    ``'db'`` section is used to open a ``DBConnector``.

    Args:
        model: ``'nc'`` selects the training set returned by
            ``db.find_trainingset()`` (9 classes, labels taken from the
            ``'tag'`` column). Any other value selects
            ``db.find_likabilityset()`` (2 classes, labels computed with
            ``classification.labelize_likability``).

    Returns:
        None. The work is delegated to ``classification.weka_ttest``.
    """
    # importing config
    print("\nimporting config file...")
    config = utility.load_config()

    print("\npreparing the components...\n")
    db = DBConnector(**config['db'])

    # Compare by value: ``is`` checks object identity, which is only true for
    # equal strings by interning accident (e.g. it fails for a value read
    # from the command line) and raises a SyntaxWarning on Python >= 3.8.
    if model == 'nc':
        n_class = 9
        dataset = shuffle(pd.DataFrame(db.find_trainingset()), random_state=42)
        print("\ntokenizing...\n")
        ds = preprocessing.tokenize_list(dataset)
        labels = dataset['tag'].to_numpy()
        txt_labels = classification.news_categories
    else:
        n_class = 2
        dataset = shuffle(pd.DataFrame(db.find_likabilityset()), random_state=42)
        print("\ntokenizing...\n")
        ds = pd.DataFrame()
        ds['content'] = preprocessing.tokenize_list(dataset)
        ds['tag'] = dataset['tag']
        # labelize_likability is called once per row; only its first item is kept
        labels = np.asarray([
            classification.labelize_likability(a)[0]
            for _, a in dataset.iterrows()
        ])
        txt_labels = ['LIKE', 'DISLIKE']

    # preparing classifiers
    print("\n\nt-testing %s ...\n\n" % (model))
    classifiers = classification.init_simple_classifiers(model)
    # putting together simple and ens/meta
    classifiers = classifiers + classification.init_ensmeta_classifiers(
        classifiers, model)
    classification.weka_ttest(classifiers, ds, labels, n_class=n_class,
                              txt_labels=txt_labels, model=model)
def build_feed(self, parsed_feed):
    """Rebuild ``self.feed`` keeping only the entries predicted as 'LIKE'.

    The parsed feed is turned into a DataFrame, its tokenized content is
    tagged by ``self.nws_clf`` and scored by ``self.lik_prd``. Rows whose
    likability prediction is not ``'LIKE'`` are discarded, the helper
    ``'likability'`` column is dropped and the predicted tag is kept in
    ``'predicted_tag'``.

    Args:
        parsed_feed: data accepted both by ``pd.DataFrame`` and by
            ``pp.tokenize_list``.
    """
    self.feed = pd.DataFrame(parsed_feed)

    # Input frame for the two predictors: tokenized text + predicted tag.
    model_input = pd.DataFrame()
    model_input['content'] = pp.tokenize_list(parsed_feed)
    model_input['tag'] = self.nws_clf.predict(model_input['content'])

    self.feed['likability'] = self.lik_prd.predict(model_input)
    self.feed['predicted_tag'] = model_input['tag']

    # Keep the liked rows only; the likability column is no longer needed.
    liked_rows = self.feed['likability'] == 'LIKE'
    self.feed = self.feed[liked_rows].drop('likability', axis=1)
def deploy_news_classifier(dataset, dir_path='home/miner/model'):
    """Cross-validate the news-category pipeline and dump it to disk.

    A bagging ensemble of 100 ``classifier['svc']`` estimators (C=0.51) is
    passed to ``build_nc_model``; the resulting pipeline is evaluated with
    ``cross_validate_fullscores`` on the shuffled, tokenized dataset and
    then written to ``<dir_path>/nws_clf.pkl`` with joblib.

    Args:
        dataset: data supporting ``dataset['tag'].to_numpy()`` and accepted
            by ``shuffle`` / ``pp.tokenize_list``.
        dir_path: directory (as a string) receiving ``nws_clf.pkl``.

    Returns:
        The pipeline object that was dumped.
    """
    # NOTE(review): the pipeline is meant to be trained when dumped; whether
    # cross_validate_fullscores leaves it fitted is not visible here - confirm.
    svc_estimator = classifier['svc'](C=0.51, random_state=42)
    bagging = BaggingClassifier(
        base_estimator=svc_estimator,
        n_estimators=100,
        random_state=42,
    )
    model = build_nc_model(('clf', bagging))

    dataset = shuffle(dataset, random_state=42)
    tokens = pp.tokenize_list(dataset)
    targets = dataset['tag'].to_numpy()

    cross_validate_fullscores(
        model,
        tokens,
        targets,
        n_class=9,
        txt_labels=news_categories,
    )

    target_file = dir_path + '/nws_clf.pkl'
    joblib.dump(model, target_file)
    return model
def meta_classify_lc(dataset, show_mat=False, tuning=False, plot=False, load_pretokenized=False):
    """Cross-validate every likability classifier, simple ones first.

    The dataset is shuffled and tokenized, targets are computed with
    ``labelize_likability`` and each classifier returned by
    ``init_simple_classifiers('lc')`` and ``init_ensmeta_classifiers`` is
    wrapped with ``build_lc_model`` and passed to ``cross_validate``.

    Args:
        dataset: DataFrame-like data with a ``'tag'`` column.
        show_mat: forwarded to ``cross_validate``.
        tuning: accepted but not used by this function.
        plot: when true, ``plot_learning_curve`` is called for every model.
        load_pretokenized: accepted but not used by this function.
    """

    def _cross_validate_and_plot(pipeline, name):
        # Shared evaluation step of both classifier families.
        cross_validate(pipeline, ds, labels, 2, show_mat=show_mat,
                       txt_labels=['LIKE', 'DISLIKE'], random_state=42)
        if plot:
            plot_learning_curve(pipeline, name, ds, labels, random_state=42)

    # inputs: tokenized content plus the original tag
    dataset = shuffle(dataset, random_state=42)
    ds = pd.DataFrame()
    ds['content'] = pp.tokenize_list(dataset)
    ds['tag'] = dataset['tag']

    # targets: first item returned by labelize_likability for each row
    labels = np.asarray(
        [labelize_likability(row)[0] for _, row in dataset.iterrows()]
    )

    simple_classifiers = init_simple_classifiers('lc')

    print("\n\nSimple Classifiers:\n")
    for entry in simple_classifiers:
        print('\n---------------------------\n')
        pipeline = build_lc_model(entry)
        print('\nCrossValidation with 10 folds for ' + entry[0] + '\n')
        _cross_validate_and_plot(pipeline, entry[0])
    print('\n---------------------------\n')

    print("\n\nEnsembles and Meta-Classifiers:\n")
    # seed numpy's global RNG before the ensembles are created
    np.random.seed(42)
    for entry in init_ensmeta_classifiers(simple_classifiers, 'lc'):
        print("CV 10 folds - " + entry[0])
        # pipeline vectorizer-classifier
        pipeline = build_lc_model(entry)
        _cross_validate_and_plot(pipeline, entry[0])
def deploy_likability_predictor(dataset, dir_path='home/miner/model'):
    """Cross-validate the likability pipeline and dump it to disk.

    The dataset is shuffled and tokenized, targets are computed with
    ``labelize_likability``, and a ``classifier['log_reg']`` estimator is
    passed to ``build_lc_model``. The pipeline is evaluated with
    ``cross_validate_fullscores`` and written to ``<dir_path>/lik_prd.pkl``.

    Args:
        dataset: DataFrame-like data with a ``'tag'`` column.
        dir_path: directory (as a string) receiving ``lik_prd.pkl``.

    Returns:
        The pipeline object that was dumped.
    """
    # preparing the inputs
    ds = pd.DataFrame()
    dataset = shuffle(dataset, random_state=42)
    ds['content'] = pp.tokenize_list(dataset)
    ds['tag'] = dataset['tag']

    # preparing the targets
    labels = np.asarray(
        [labelize_likability(row)[0] for _, row in dataset.iterrows()]
    )

    # building the model...
    clf = classifier['log_reg'](solver='lbfgs', multi_class='auto', random_state=42)
    model = build_lc_model(('clf', clf))

    # The second label used to be misspelled 'DISIKE'; use the same
    # ['LIKE', 'DISLIKE'] pair as the other likability routines.
    cross_validate_fullscores(model, ds, labels, n_class=2,
                              txt_labels=['LIKE', 'DISLIKE'])

    joblib.dump(model, dir_path + '/lik_prd.pkl')
    return model
def _evaluate_simple_nc_classifiers(classifiers, build_model, param_grids, ds, labels,
                                    n_class, show_mat=False, tuning=False, plot=False):
    """Tune or cross-validate each simple classifier built by *build_model*.

    Args:
        classifiers: iterable of entries whose first item is the classifier
            name (used for messages and as key of *param_grids*).
        build_model: callable turning one entry into the model to evaluate.
        param_grids: mapping classifier name -> GridSearchCV parameter grid;
            only read when *tuning* is true.
        ds: tokenized inputs.
        labels: targets.
        n_class: number of classes, forwarded to ``cross_validate``.
        show_mat: forwarded to ``cross_validate``.
        tuning: run a grid search instead of the plain cross-validation.
        plot: also call ``plot_learning_curve`` for every model.
    """
    for c in classifiers:
        print('\n---------------------------\n')
        # building the model
        model = build_model(c)
        if tuning:
            print("\nTuning {} Hyper-Parameters with GridSearchCV: \n".format(c[0]))
            # NOTE(review): `iid` is deprecated since scikit-learn 0.22 and
            # removed in 0.24; it is kept for the version this project runs
            # on - confirm before upgrading.
            tuned_model = GridSearchCV(
                model,
                param_grids[c[0]],
                iid=True,
                scoring='accuracy',
                cv=StratifiedKFold(random_state=42, shuffle=True, n_splits=10),
                verbose=1,
                n_jobs=-1,
            )
            tuned_model.fit(ds, labels)
            print(tuned_model.best_score_, tuned_model.best_params_)
        else:
            print('\n CrossValidation with 10 folds for ' + c[0] + '\n')
            cross_validate(model, ds, labels, n_class, show_mat=show_mat,
                           txt_labels=news_categories, random_state=42)
        # NOTE(review): the learning curve is plotted after both the tuning
        # and the cross-validation branch - confirm this is intended.
        if plot:
            plot_learning_curve(model, c[0], ds, labels, random_state=42)
    print('\n---------------------------\n')


def meta_classify_nc(dataset, show_mat=False, tuning=False, plot=False, load_pretokenized=False):
    """Evaluate every news-category classifier on the given dataset.

    Three passes are run on the shuffled, tokenized dataset:

    1. the simple classifiers from ``init_simple_classifiers('nc')``;
    2. the same classifiers with a chi2 ``SelectPercentile(percentile=40)``
       feature-selection step;
    3. the ensembles / meta-classifiers from ``init_ensmeta_classifiers``
       (cross-validation only, never tuned).

    Args:
        dataset: data supporting ``dataset['tag'].to_numpy()`` and accepted
            by ``shuffle`` / ``pp.tokenize_list``.
        show_mat: forwarded to ``cross_validate``.
        tuning: for passes 1 and 2, run ``GridSearchCV`` with the grids
            defined below instead of the plain cross-validation.
        plot: when true, ``plot_learning_curve`` is called for every model.
        load_pretokenized: accepted but not used by this function.
    """
    # preparing the trainingset
    dataset = shuffle(dataset, random_state=42)
    ds = pp.tokenize_list(dataset)

    # preparing the targets
    labels = dataset['tag'].to_numpy()
    n_class = len(news_categories)

    # classifiers initialization
    classifiers = init_simple_classifiers('nc')

    # params tuning (one grid per simple classifier name)
    params = {
        'rf': {
            'clf__n_estimators': [2000],
            'clf__bootstrap': [True],
            'clf__max_depth': [None],
            'clf__max_features': ['sqrt'],
            'clf__min_samples_leaf': [2, 4],
            'clf__min_samples_split': [5, 10],
        },
        'mnb': {},
        'svc': {
            # vectorizer grids explored in the past, kept for reference:
            # 'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2)],
            # 'vect__min_df': (1, 0.01, 0.025),
            # 'vect__norm': [None, 'l1', 'l2']
            # 'vect__max_df': (0.5, 0.55, 0.65, 0.70, 0.75, 1.0),
            # 'vect__max_features' : [12000, 14500, 15000], # best MF: 14500
            'clf__C': np.arange(0.001, 1, 0.001),  # best C: 0.51
        },
        'ada': {
            'clf__n_estimators': [2000],
            'clf__algorithm': ['SAMME'],
        },
        'lr': {},
        'dt': {},
    }

    print("\n\nSimple Classifiers:\n")
    _evaluate_simple_nc_classifiers(
        classifiers, build_nc_model, params, ds, labels, n_class,
        show_mat=show_mat, tuning=tuning, plot=plot)

    print("\n\nSimple Classifiers WITH Independent Features Selection (chi2-40Percentile):\n")
    # A fresh selector is created for every classifier, as before.
    _evaluate_simple_nc_classifiers(
        classifiers,
        lambda c: build_nc_model(
            c, feature_selection=SelectPercentile(chi2, percentile=40)),
        params, ds, labels, n_class,
        show_mat=show_mat, tuning=tuning, plot=plot)

    print("\n\nEnsembles and Meta-Classifiers:\n")
    # set the seed for some classifiers...
    np.random.seed(42)
    # init the list...
    ens_meta_classifiers = init_ensmeta_classifiers(classifiers, 'nc')
    for c in ens_meta_classifiers:
        print("\nCV 10 folds - " + c[0])
        # building the pipeline vectorizer-classifier
        model = build_nc_model(c)
        # cross-validating the model
        cross_validate(model, ds, labels, n_class, show_mat=show_mat,
                       txt_labels=news_categories, random_state=42)
        if plot:
            plot_learning_curve(model, c[0], ds, labels, random_state=42)