def optimise_model(self, argv):
    """Build features from the dataset and run ``CrfSuite.optimise_model``.

    The whole dataset returned by ``Dataset.read`` is used (no train/test
    split). A word-embedding model is trained on it and saved before the
    features are generated. The elapsed wall-clock time is logged at the end.

    :param argv: forwarded to ``Dataset.read`` as ``nr_of_files``.
    """
    self.logger.println("optimise model called")
    started_at = timeit.default_timer()

    crf_suite = CrfSuite()
    corpus = Dataset()
    docs = corpus.read(nr_of_files=argv)

    # The embedding model is trained here; a pretrained model could
    # optionally be loaded instead.
    embedder = WeModel()
    embeddings = embedder.train(docs)
    embedder.save(embeddings)

    word2count, word2idx = corpus.encode_dataset(docs)
    feature_builder = FeatureGenerator(embeddings, word2count, word2idx)
    features = feature_builder.generate_features_docs(docs)
    labels = feature_builder.generate_true_outcome(docs)

    crf_suite.optimise_model(features, labels)

    self.logger.print_time_taken("optimise model operation took",
                                 timeit.default_timer() - started_at)
def predict_urls(urls: list, model=None):
    """Print the predicted probabilities for the given urls.

    When ``model`` is None, a "big_conv_nn" ``UrlDetector`` is first trained
    on ``datasets/url_data_mega_deep_learning.csv`` (80/20 train/validation
    split, 5 epochs, batch size 512). The result of ``predict_proba`` is
    printed, not returned.
    """
    encoder = FeatureGenerator()

    if model is None:
        dataset_path = os.path.join("datasets",
                                    'url_data_mega_deep_learning.csv')
        tr_urls, tr_labels = load_data(dataset_path,
                                       url_column_name='url',
                                       label_column_name='isMalicious',
                                       to_binarize=False)

        # Features
        encoded_tr_urls = encoder.one_hot_encoding(tr_urls)
        X_train, X_val, y_train, y_val = train_test_split(encoded_tr_urls,
                                                          tr_labels,
                                                          test_size=0.2)

        # Model
        model = UrlDetector("big_conv_nn")
        model.fit(X_train,
                  y_train,
                  epochs=5,
                  batch_size=512,
                  validation_data=(X_val, y_val))

    # Predict
    probabilities = model.predict_proba(encoder.one_hot_encoding(urls))
    print(probabilities)
def predict_on_non_dated_dataset(
        non_dated_dataset='url_data_mega_deep_learning.csv'):
    """Train and evaluate the URL detector on a non-dated dataset.

    The dataset is loaded from the ``datasets`` directory, one-hot encoded,
    split 80/20 into train/test, used to fit a "big_conv_nn" ``UrlDetector``
    (5 epochs, batch size 128), then evaluated and plotted as a ROC curve.

    Parameters
    ----------
    non_dated_dataset
        File name of the dataset. Supported values are
        'url_data_mega_deep_learning.csv' and 'simple.csv'.

    Raises
    ------
    ValueError
        If ``non_dated_dataset`` is not one of the supported file names.
    """
    # Loading parameters per dataset:
    # (url column name, label column name, to_binarize)
    dataset_params = {
        'url_data_mega_deep_learning.csv': ('url', 'isMalicious', False),
        'simple.csv': ('url', 'label', True),
    }
    # Fail early with a clear message instead of hitting an unbound local
    # variable further down.
    if non_dated_dataset not in dataset_params:
        raise ValueError(
            "Unsupported non-dated dataset %r; expected one of %s"
            % (non_dated_dataset, sorted(dataset_params)))
    url_column_name, label_column_name, to_binarize = \
        dataset_params[non_dated_dataset]

    # Data
    urls, labels = load_data(os.path.join("datasets", non_dated_dataset),
                             url_column_name=url_column_name,
                             label_column_name=label_column_name,
                             to_binarize=to_binarize)

    # Features
    feature_generator = FeatureGenerator()
    one_hot_urls = feature_generator.one_hot_encoding(urls)
    X_train, X_test, y_train, y_test = train_test_split(one_hot_urls,
                                                        labels,
                                                        test_size=0.2)

    # Model
    url_detector = UrlDetector("big_conv_nn")
    url_detector.fit(X_train, y_train, epochs=5, batch_size=128)

    # Evaluate
    url_detector.evaluate(X_test, y_test)
    url_detector.plot_roc_curve(X_test, y_test)
    plt.show()
def train_model_learning_curve(self, arg):
    """Plot the CRF learning curve on the training split of the dataset.

    The data is reduced to the documents returned by
    ``filter_for_filled_tags`` plus those from ``obtain_default_tags``
    (called with three times the filled-line count), then shuffled and
    split. Word embeddings are trained on the shuffled data; features are
    built from the training split only. The elapsed time is logged after
    the plot window is closed.

    :param arg: forwarded to ``Dataset.read`` as ``nr_of_files``.
    """
    self.logger.println("train model called")
    timer_start = timeit.default_timer()

    crf_suite = CrfSuite()
    corpus = Dataset()
    all_docs = corpus.read(nr_of_files=arg)

    filled_count, filled_docs = corpus.filter_for_filled_tags(all_docs)
    default_docs = corpus.obtain_default_tags(filled_count * 3, all_docs)
    shuffled_docs = corpus.shuffle_data(filled_docs + default_docs)
    train_docs, _test_docs = corpus.split_dataset(shuffled_docs)

    # The embedding model is trained here; a pretrained model could
    # optionally be loaded instead (WeModel.load_pretrained_model).
    embedder = WeModel()
    embeddings = embedder.train(shuffled_docs)

    word2count, word2idx = corpus.encode_dataset(train_docs)
    feature_builder = FeatureGenerator(embeddings, word2count, word2idx)
    features = feature_builder.generate_features_docs(train_docs)
    labels = feature_builder.generate_true_outcome(train_docs)

    crf_suite.plot_learning_curve(features, labels)
    plt.show()

    self.logger.print_time_taken("train model operation took",
                                 timeit.default_timer() - timer_start)
def load_tagger(self):
    """Load the trained tagger and rebuild the matching feature generator.

    Opens 'test_NER.crfsuite' with a ``pycrfsuite.Tagger``, reads the stored
    word-embedding model, and re-encodes the dataset (``nr_of_files=-1``) to
    obtain the vocabulary tables the ``FeatureGenerator`` is built with.
    """
    self.__trained_tagger = pycrfsuite.Tagger()
    self.__trained_tagger.open('test_NER.crfsuite')

    self.w2v_model = WeModel().read()

    corpus = Dataset()
    docs = corpus.read(nr_of_files=-1)
    word2count, word2idx = corpus.encode_dataset(docs)
    self.f_generator = FeatureGenerator(self.w2v_model, word2count,
                                        word2idx)
def __init__(self):
    """Create the Spotify client and build the nearest-neighbour index.

    Training records are read from './training.data', one JSON document per
    line. The ``BallTree`` is built from element 4 of every record.
    """
    print('Initializing...')
    self.client_credentials_manager = SpotifyClientCredentials(
        client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
    self.spotify_api = spotipy.Spotify(
        client_credentials_manager=self.client_credentials_manager)

    print('Training...')
    self.gen = FeatureGenerator(spotify=self.spotify_api)
    # Context manager so the training file is closed deterministically
    # instead of being left to the garbage collector.
    with open('./training.data', 'r') as training_file:
        self.X = [json.loads(row.strip()) for row in training_file]
    self.classifier = BallTree([x[4] for x in self.X])
    print('Training complete.')
def get_feature_generator_results(self):
    """Run the feature generator on a tiny fixed document.

    The document consists of two identical sentences ("this is a test");
    every token is a 4-tuple whose last three fields are empty strings.

    :return: ``(document, X, y)`` where ``X`` and ``y`` are the generated
        features and true outcomes for the one-document corpus.
    """
    words = ("this", "is", "a", "test")
    sample_doc = [[(word, "", "", "") for word in words] for _ in range(2)]

    encoder = Dataset()
    word2count, word2idx = encoder.encode_dataset([sample_doc])
    feature_builder = FeatureGenerator(self.w2v_model, word2count, word2idx)
    X = feature_builder.generate_features_docs([sample_doc])
    y = feature_builder.generate_true_outcome([sample_doc])
    # run tests on X and y
    return sample_doc, X, y
def menu_cluster(fn='merged_shop.json'):
    '''
    Cluster the dishes of every merged shop.

    input:
        fn  string  name of the merged-shop file; one merged shop per line:
            {"baidu":'', "eleme":'', "meituan":'', "id":''}
    output:
        prints the result as JSON, one line per shop, holding that shop's
        merged dish information grouped by cluster label
    '''
    # NOTE(review): the shop id is interpolated directly into the SQL text.
    # If the merged file can ever contain untrusted ids, switch to the DB
    # driver's parameter binding for the id (table names cannot be bound).
    sql = "select * from `{}` where id='{}'"
    mysql_obj = get_mysql_obj(os.sep.join([conf_dir, 'db.conf']),
                              'mysql_online')
    cursor = mysql_obj['cursor']
    feat_gen = FeatureGenerator()
    cluster_obj = SimpleCluster()
    # platform tag -> table holding that platform's shops
    tb_dic = {
        'eleme': 'eleme_shop',
        'baidu': 'baidu_waimai_shop',
        'meituan': 'meituan_waimai_shop'
    }
    # Context manager so the input file is closed even if a row fails.
    with open(fn, 'r') as merged_file:
        for line in merged_file:
            dic = json.loads(line.strip())
            feat_ls = []
            for tag, _id in dic.items():
                # Keys that are not platform tags (e.g. "id") have no table.
                if tag not in tb_dic:
                    continue
                cursor.execute(sql.format(tb_dic[tag], _id))
                res = cursor.fetchone()
                feat_ls.extend(
                    feat_gen.generate_feature_with_food_dic(res, tag))
            label_ls = cluster_obj.cluster(feat_ls)

            # Group the dishes by cluster label, keeping first-seen order.
            res_dic = OrderedDict()
            for feat, label in zip(feat_ls, label_ls):
                if label not in res_dic:
                    res_dic[label] = []
                src, _id, food_dic, _food_name = feat[:4]
                food_dic['__source'] = src
                food_dic['__id'] = _id
                res_dic[label].append(food_dic)
            res_dic = {'id': dic['id'], 'foods': res_dic}
            print(json.dumps(res_dic, ensure_ascii=False).encode('utf8'))
def reset(self):
    """Reset the wrapped environment and return the processed observation.

    Clears the step counter, installs a fresh ``FeatureGenerator``, stores
    element 1 of the raw observation in ``self.lastx`` and returns the raw
    observation passed through ``self.obg``.
    """
    self.stepcount = 0
    self.fg = FeatureGenerator()
    raw_observation = self.e.reset()
    self.lastx = raw_observation[1]
    return self.obg(raw_observation)
def menu_cluster(fn = '../menu_fusion/res_dic.json'):
    '''
    Cluster the dishes of every merged shop.

    input:
        fn  string  name of the merged-shop file; one merged shop per line:
            {"baidu":'', "eleme":'', "meituan":'', "id":''}
    output:
        prints the result, one tab-separated line per dish with its cluster
        label appended, sorted by label; a blank line follows each shop.
        Only the first 10 shops of the file are processed.
    '''
    # NOTE(review): the shop id is interpolated directly into the SQL text.
    # If the merged file can ever contain untrusted ids, switch to the DB
    # driver's parameter binding for the id (table names cannot be bound).
    sql = "select * from `{}` where id='{}'"
    mysql_obj = get_mysql_obj(os.sep.join([conf_dir, 'db.conf']),
                              'mysql_online')
    cursor = mysql_obj['cursor']
    feat_gen = FeatureGenerator()
    cluster_obj = SimpleCluster()
    # platform tag -> table holding that platform's shops
    tb_dic = {'eleme': 'eleme_shop',
              'baidu': 'baidu_waimai_shop',
              'meituan': 'meituan_waimai_shop'}
    num = 0
    # The original call was `open(, 'r')`, a syntax error; `fn` is the
    # intended path. The context manager also guarantees the file is closed.
    with open(fn, 'r') as merged_file:
        for line in merged_file:
            dic = json.loads(line.strip())
            feat_ls = []
            for tag, _id in dic.items():
                # The documented record also carries an "id" key, which has
                # no table; skip anything that is not a platform tag.
                if tag not in tb_dic:
                    continue
                tb_name = tb_dic[tag]
                cursor.execute(sql.format(tb_name, _id))
                res = cursor.fetchone()
                feat_ls.extend(
                    feat_gen.generate_feature_with_food_dic(res, tb_name))
            label_ls = cluster_obj.cluster(feat_ls)

            out_ls = []
            for feat, label in zip(feat_ls, label_ls):
                # The cluster label becomes the last output column.
                feat.append(str(label))
                row = []
                for ss in feat:
                    if not isinstance(ss, str):
                        ss = ss.encode('utf8')
                    row.append(ss)
                out_ls.append(row)
            out_ls = sorted(out_ls, key=lambda x: x[-1])
            for row in out_ls:
                print('\t'.join(row))
            print('')

            num += 1
            if num == 10:
                break
def test(args):
    """Run one evaluation episode of a saved DDPG agent.

    Loads the model from ``args.model``, creates a ``RunEnv`` configured
    from ``args.visualize`` / ``args.max_obs``, seeds NumPy with
    ``args.seed`` and steps the environment (difficulty 2) until it reports
    ``done``. The action and the running reward are printed at each step.
    """
    print('start testing')

    agent = DDPG()
    agent.load_model(args.model, load_memory=False)
    env = RunEnv(visualize=args.visualize, max_obstacles=args.max_obs)

    np.random.seed(args.seed)
    for episode in range(1):
        step = 0
        state = env.reset(difficulty=2)
        fg = FeatureGenerator()
        state = fg.gen(state)
        ep_reward = 0

        done = False
        while not done:
            action = agent.select_action(list(state))
            raw_next_state, reward, done, info = env.step(action.tolist())
            next_state = fg.gen(raw_next_state)

            # One line per step: the step number followed by every action
            # component, each terminated by ", ".
            formatted_actions = ''.join(
                '{0:.3f}, '.format(act) for act in action)
            print('step: {0:03d}'.format(step) + ', action: '
                  + formatted_actions)

            state = next_state
            ep_reward += reward
            step += 1
            print('reward:', ep_reward)

        print('\nEpisode: {} Reward: {}, n_steps: {}'.format(
            episode, ep_reward, step))
def draw_roc_curve_saved_model(self):
    """Draw the ROC curve of the saved CRF model on a fresh test split.

    Loads "current_crf_model.pkl", reads 1000 files, reduces the data to
    the documents returned by ``filter_for_filled_tags`` plus those from
    ``obtain_default_tags`` (called with three times the filled-line
    count), shuffles and splits it, then builds features with the stored
    word-embedding model and passes the test features to
    ``Evaluator.draw_roc_proba``. The elapsed time is logged at the end,
    as the other operations of this class do.
    """
    self.logger.println("drawing roc curve from saved model")
    start_time = timeit.default_timer()

    cs = CrfSuite()
    crf = cs.load_model("current_crf_model.pkl")

    dataset = Dataset()
    data = dataset.read(nr_of_files=1000)
    nr_of_filled_lines, data1 = dataset.filter_for_filled_tags(data)
    data2 = dataset.obtain_default_tags(nr_of_filled_lines * 3, data)
    data = data1 + data2
    data = dataset.shuffle_data(data)
    train_set, test_set = dataset.split_dataset(data)

    we_model = WeModel()
    w2v_model = we_model.read()
    we_model = None  # drop the reference as soon as it is no longer needed

    word2count, word2idx = dataset.encode_dataset(train_set)
    f_generator = FeatureGenerator(w2v_model, word2count, word2idx)
    w2v_model = None

    # NOTE(review): the training features/labels are computed but not used
    # below; kept in case generate_* has side effects on the generator.
    # Confirm and remove if it does not.
    train_features = f_generator.generate_features_docs(train_set)
    y_train = f_generator.generate_true_outcome(train_set)

    test_features = f_generator.generate_features_docs(test_set)
    y_test = f_generator.generate_true_outcome(test_set)
    f_generator = None

    evaluator = Evaluator()
    evaluator.draw_roc_proba(crf, test_features, y_test)

    # The timer was started above but never reported in the original.
    elapsed_seconds = timeit.default_timer() - start_time
    self.logger.print_time_taken("draw roc curve operation took",
                                 elapsed_seconds)
def main():
    """Generate train/test features, train the network and run prediction.

    Features are produced by two ``FeatureGenerator`` instances (train and
    test), written to CSV under ``../input``, then a ``NeuralNetv1`` model
    is evaluated and trained and a ``Predictor`` is run.
    """
    print("Start data preprocessing and feature extraction..")
    training_fg = FeatureGenerator(dtype='train', n_jobs=4, chunk_size=150000)
    training_data = training_fg.generate()
    test_fg = FeatureGenerator(dtype='test', n_jobs=4, chunk_size=None)
    test_data = test_fg.generate()
    # Persist the generated features so they can be reused without
    # regenerating them.
    training_data.to_csv("../input/train_features.csv", index=False)
    test_data.to_csv("../input/test_features.csv", index=False)
    # All columns except the last two are taken as inputs; "target" is the
    # label column.
    # NOTE(review): X is never used below -- the model receives the whole
    # training_data frame instead. Confirm which one is intended.
    X = training_data.iloc[:, :-2].values
    y = training_data.loc[:, "target"].values
    print("Train model")
    model = NeuralNetv1(training_data, y, save=False)
    model.evaluate_model()
    model.train()
    print("Model trained")
    print("Build result and predict for Kaggle")
    # NOTE(review): x_test and y_test are not defined anywhere in this
    # function; unless they are module-level globals this line raises
    # NameError. Presumably they should be derived from test_data -- confirm.
    predictor = Predictor(x_test, y_test, model=model.model)
    predictor.predict()
    print("END")
def main(_):
    """Build the DIN estimator and run the task named by FLAGS.task_type.

    Supported task types are 'train', 'eval', 'infer' and 'export'; any
    other value builds the estimator and then does nothing.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    set_dist_env()

    def int_list(csv_text):
        # "64,32" -> [64, 32]
        return list(map(int, csv_text.split(',')))

    # ------build Tasks------
    model_params = {
        "learning_rate": FLAGS.learning_rate,
        "l2_reg": FLAGS.l2_reg,
        "deep_layers": int_list(FLAGS.deep_layers),
        "atten_layers": int_list(FLAGS.atten_layers),
        "dropout": list(map(float, FLAGS.dropout.split(','))),
        "optimizer": FLAGS.optimizer,
    }

    if FLAGS.clear_existing_model:
        try:
            shutil.rmtree('./model')
        except Exception as e:
            print(e, "at clear_existing_model")
        else:
            print("existing model cleaned at %s" % FLAGS.model_dir)

    train_records = "./data/train.tfrecords"
    eval_records = "./data/test.tfrecords"

    feature_generator = FeatureGenerator(
        feature_json('./feature_generator.json'))
    network = DIN(feature_generator)
    model = Model(feature_generator, network)

    session_config = tf.ConfigProto(
        device_count={'GPU': 0, 'CPU': FLAGS.num_threads})
    run_config = tf.estimator.RunConfig().replace(
        session_config=session_config,
        log_step_count_steps=FLAGS.log_steps,
        save_summary_steps=FLAGS.log_steps)
    estimator = tf.estimator.Estimator(model_fn=model.model_fn,
                                       model_dir='./model/',
                                       params=model_params,
                                       config=run_config)

    def input_fn_for(files, epochs):
        # Zero-argument input function, as the Estimator API expects.
        return lambda: model.input_fn(files,
                                      num_epochs=epochs,
                                      batch_size=FLAGS.batch_size)

    task = FLAGS.task_type
    if task == 'train':
        train_spec = tf.estimator.TrainSpec(
            input_fn=input_fn_for(train_records, FLAGS.num_epochs))
        eval_spec = tf.estimator.EvalSpec(
            input_fn=input_fn_for(eval_records, 1),
            steps=None,
            start_delay_secs=1000,
            throttle_secs=1200)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    elif task == 'eval':
        estimator.evaluate(input_fn=input_fn_for(train_records, 1))
        estimator.evaluate(input_fn=input_fn_for(eval_records, 1))
    elif task == 'infer':
        # NOTE(review): the result is never consumed here. If predict()
        # is lazy, as the tf.estimator docs suggest, no inference runs --
        # confirm.
        preds = estimator.predict(input_fn=input_fn_for(eval_records, 1),
                                  predict_keys="prob")
    elif task == 'export':
        # Export for single-machine serving. The parsing receiver
        # (build_parsing_serving_input_receiver_fn(fg.feature_spec)) is the
        # alternative to the raw placeholders used here.
        serving_input_receiver_fn = (
            tf.estimator.export.build_raw_serving_input_receiver_fn(
                feature_generator.feature_placeholders))
        estimator.export_saved_model(FLAGS.servable_model_dir,
                                     serving_input_receiver_fn)
def train_model(self, nr_of_files=-1):
    """Train, save and evaluate the CRF model.

    The data is reduced to the documents returned by
    ``filter_for_filled_tags`` plus those from ``obtain_default_tags``
    (called with three times the filled-line count), shuffled and split.
    A word-embedding model is trained on the shuffled data and saved,
    features are generated for both splits, and the CRF model is trained
    and saved. Classification reports and F1 scores are printed for both
    splits, the elapsed time is logged, and a ROC analysis is run last.

    :param nr_of_files: forwarded to ``Dataset.read``.
    """
    self.logger.println("train model called")
    timer_start = timeit.default_timer()

    crf_suite = CrfSuite()
    corpus = Dataset()
    all_docs = corpus.read(nr_of_files=nr_of_files)

    filled_count, filled_docs = corpus.filter_for_filled_tags(all_docs)
    default_docs = corpus.obtain_default_tags(filled_count * 3, all_docs)
    shuffled_docs = corpus.shuffle_data(filled_docs + default_docs)
    train_docs, test_docs = corpus.split_dataset(shuffled_docs)

    # The embedding model is trained here; a pretrained model could
    # optionally be loaded instead.
    embedder = WeModel()
    embeddings = embedder.train(shuffled_docs)
    embedder.save(embeddings)
    del embedder  # release large objects as soon as they are done with

    word2count, word2idx = corpus.encode_dataset(train_docs)
    feature_builder = FeatureGenerator(embeddings, word2count, word2idx)
    del embeddings

    train_features = feature_builder.generate_features_docs(train_docs)
    y_train = feature_builder.generate_true_outcome(train_docs)
    test_features = feature_builder.generate_features_docs(test_docs)
    y_test = feature_builder.generate_true_outcome(test_docs)
    del feature_builder

    model = crf_suite.train_model(train_features, y_train)
    crf_suite.save_model(model)

    y_train_pred = crf_suite.test_model(model, train_features)
    y_test_pred = crf_suite.test_model(model, test_features)

    print("printing training results")
    crf_suite.print_classification_report(corpus.docs2lines(y_train),
                                          y_train_pred)
    score_train = crf_suite.score_model(corpus.docs2lines(y_train),
                                        y_train_pred)
    print("training f1 score: %s" % score_train)

    print("printing test results")
    crf_suite.print_classification_report(corpus.docs2lines(y_test),
                                          y_test_pred)
    score_test = crf_suite.score_model(corpus.docs2lines(y_test),
                                       y_test_pred)
    print("test f1 score: %s" % score_test)

    self.logger.print_time_taken("train model operation took",
                                 timeit.default_timer() - timer_start)

    # The ROC analysis runs after the timing report, so it is not counted.
    roc_evaluator = Evaluator()
    for y_true_docs, y_pred in ((y_train, y_train_pred),
                                (y_test, y_test_pred)):
        roc_evaluator.perform_roc_analysis(corpus.docs2lines(y_true_docs),
                                           y_pred)
def submit(args):
    """Play a saved DDPG agent against the remote grader and submit.

    Episodes are run until ``client.env_reset()`` returns a falsy state.
    A fresh ``FeatureGenerator`` is created for every episode.
    """
    print('start submitting')

    client = Client('http://grader.crowdai.org:1733')

    agent = DDPG()
    agent.load_model(args.model, load_memory=False)

    state = client.env_create(TOKEN)
    fg = FeatureGenerator()
    state = fg.gen(state)

    step = 0
    ep_reward = 0
    while True:
        print('selecting action ...', end=' ')
        action = agent.select_action(list(state))

        print('client.env_step ...')
        raw_next_state, reward, done, info = client.env_step(action.tolist())
        next_state = fg.gen(raw_next_state)

        # Printed before this step's reward is added to the total.
        print('step: {0:03d}, ep_reward: {1:02.08f}'.format(step, ep_reward))

        state = next_state
        ep_reward += reward
        step += 1

        if not done:
            continue

        print('done')
        state = client.env_reset()
        if not state:
            break

        # Start the next episode from a clean slate.
        step = 0
        ep_reward = 0
        fg = FeatureGenerator()
        state = fg.gen(state)

    client.submit()
#from pandas.tseries.offsets import CustomBusinessDay #us_cal = CustomBusinessDay(calendar=USFederalHolidayCalendar()) from sklearn.model_selection import StratifiedShuffleSplit from sklearn.ensemble import RandomForestClassifier if __name__ == "__main__": plotIt = PlotUtility() timeUtil = TimeUtility() ct = ComputeTarget() candle_ind = CandleIndicators() dSet = DataRetrieve() taLibMomSt = TALibMomentumStudies() transf = Transformers() modelUtil = ModelUtility() featureGen = FeatureGenerator() issue = "TLT" # Set IS-OOS parameters pivotDate = datetime.date(2018, 4, 2) is_oos_ratio = 2 oos_months = 3 segments = 4 dataSet = dSet.read_issue_data(issue) # get first data from loaded data instead of hard coding start date dataSet = dSet.set_date_range(dataSet, "2014-09-26", pivotDate) #set beLong level beLongThreshold = 0.000
control_flow_p, time_p, resource_p, data_p, transition) testset_dir = "../testsets" log_config = { "control_flow_p": control_flow_p, "time_p": time_p, "resource_p": resource_p, "data_p": data_p, "transition": transition } # load data print("flag: loading data") fg = FeatureGenerator() df = fg.create_initial_log(filename, log_config) print("done") num_events = len(df) num_cases = len(set(df["id"])) # feature generation print("flag: generating features") if task == 'next_activity': loss = 'categorical_crossentropy' regression = False feature_type_list = ["activity_history"] df = fg.add_activity_history(df) df = fg.add_next_activity(df)
# Resolve input paths and run options from the parsed arguments.
filename = args.data_dir + args.data_set
model_name = args.data_set + args.task
contextual_info = args.contextual_info
# Loss and problem type depend on the task.
# NOTE(review): for any other task value `loss` and `regression` stay
# undefined -- confirm the argument parser restricts the choices.
if args.task == 'next_activity':
    loss = 'categorical_crossentropy'
    regression = False
elif args.task == 'next_timestamp':
    loss = 'mae'
    regression = True
batch_size = args.batch_size_train
num_folds = args.num_folds
# load data
FG = FeatureGenerator()
df = FG.create_initial_log(filename)
# split train and test
#train_df, test_df = FG.train_test_split(df, 0.7, 0.3)
# With the split disabled above, the test frame is the training frame.
train_df = df
test_df = train_df
# create train
train_df = FG.order_csv_time(train_df)
train_df = FG.queue_level(train_df)
train_df.to_csv('./training_data.csv')
state_list = FG.get_states(train_df)
train_X, train_Y_Event, train_Y_Time = FG.one_hot_encode_history(
    train_df, args.checkpoint_dir + args.data_set)
# Context features are only generated when requested on the command line.
if contextual_info:
    train_context_X = FG.generate_context_feature(train_df, state_list)
def cross_datasets(direction='dated-->non_dated',
                   day=15,
                   month=7,
                   year=2018,
                   randomise=False,
                   non_dated_dataset='url_data_mega_deep_learning.csv'):
    """
    Trains the model on one dataset and evaluates it on the other.

    Parameters
    ----------
    direction
        'non_dated-->dated' trains on the non-dated dataset and evaluates
        on the second split of the dated data; 'dated-->non_dated' trains
        on the dated data and evaluates on the first 21% of the non-dated
        dataset.
    day, month, year
        Separation date passed to ``load_dated_data``. Only used if
        'randomise' = False.
    randomise
        If True the dated data is loaded with
        ``load_randomized_dated_data`` (testing ratio 0.2) instead of being
        split by date.
    non_dated_dataset
        File name of the non-dated dataset. Supported values are
        'url_data_mega_deep_learning.csv' and 'simple.csv'.

    Raises
    ------
    ValueError
        If ``non_dated_dataset`` or ``direction`` is not supported.
    """
    # Loading parameters per non-dated dataset:
    # (url column name, label column name, to_binarize)
    dataset_params = {
        'url_data_mega_deep_learning.csv': ('url', 'isMalicious', False),
        'simple.csv': ('url', 'label', True),
    }
    supported_directions = ('non_dated-->dated', 'dated-->non_dated')

    # Validate up front: an unsupported dataset used to surface as an
    # unbound local variable (after training, in one direction) and an
    # unsupported direction silently did nothing.
    if non_dated_dataset not in dataset_params:
        raise ValueError(
            "Unsupported non-dated dataset %r; expected one of %s"
            % (non_dated_dataset, sorted(dataset_params)))
    if direction not in supported_directions:
        raise ValueError(
            "Unsupported direction %r; expected one of %s"
            % (direction, list(supported_directions)))
    url_column_name, label_column_name, to_binarize = \
        dataset_params[non_dated_dataset]

    def _load_dated_split():
        # Returns (urls_1, labels_1, urls_2, labels_2) of the dated data.
        bad_urls_path = os.path.join("datasets", "bad_urls1.csv")
        good_urls_path = os.path.join("datasets", "good_urls.csv")
        if randomise:
            return load_randomized_dated_data(bad_urls_path,
                                              good_urls_path,
                                              ratio_good_bad=1,
                                              ratio_testing_set=0.2)
        # For 80%/20% : take 15/07/2018
        # For 50%/50% : take 01/03/2018
        return load_dated_data(bad_urls_path,
                               good_urls_path,
                               ratio_good_bad=1,
                               separation_date=date(year, month, day))

    if direction == 'non_dated-->dated':
        # TRAINING
        # Data
        urls, labels = load_data(os.path.join("datasets", non_dated_dataset),
                                 url_column_name=url_column_name,
                                 label_column_name=label_column_name,
                                 to_binarize=to_binarize)

        # Features
        feature_generator = FeatureGenerator()
        one_hot_urls = feature_generator.one_hot_encoding(urls)
        X_train, X_val, y_train, y_val = train_test_split(one_hot_urls,
                                                          labels,
                                                          test_size=0.2)

        # Model
        url_detector = UrlDetector("big_conv_nn")
        url_detector.fit(X_train,
                         y_train,
                         epochs=5,
                         batch_size=128,
                         validation_data=(X_val, y_val))

        # TESTING
        # Data: only the second split of the dated data is evaluated.
        _urls_1, _labels_1, urls_2, labels_2 = _load_dated_split()

        # Features
        feature_generator = FeatureGenerator()
        one_hot_urls_2 = feature_generator.one_hot_encoding(urls_2)

        # Evaluate
        url_detector.evaluate(one_hot_urls_2, labels_2)
        url_detector.plot_roc_curve(one_hot_urls_2, labels_2)
        plt.show()

    else:  # direction == 'dated-->non_dated'
        # TRAINING
        # Data
        training_urls, training_labels, val_urls, val_labels = \
            _load_dated_split()

        # Features
        feature_generator = FeatureGenerator()
        one_hot_training_urls = feature_generator.one_hot_encoding(
            training_urls)
        one_hot_val_urls = feature_generator.one_hot_encoding(val_urls)

        # Model
        url_detector = UrlDetector("big_conv_nn")
        url_detector.fit(one_hot_training_urls,
                         training_labels,
                         epochs=5,
                         batch_size=128,
                         validation_data=(one_hot_val_urls, val_labels))

        # TESTING
        # Data
        urls, labels = load_data(os.path.join("datasets", non_dated_dataset),
                                 url_column_name=url_column_name,
                                 label_column_name=label_column_name,
                                 to_binarize=to_binarize)

        # Features
        size_testing = int(0.21 * len(urls))
        feature_generator = FeatureGenerator()
        one_hot_urls = feature_generator.one_hot_encoding(urls)

        # Evaluate on the first `size_testing` samples only.
        url_detector.evaluate(one_hot_urls[:size_testing],
                              labels[:size_testing])
        url_detector.plot_roc_curve(one_hot_urls[:size_testing],
                                    labels[:size_testing])
        plt.show()
import os
import constants as CONSTANTS
from pdb_converter import PDBConverter
from data_spliter import DataSpliter
from feature_generator import FeatureGenerator

# Pipeline script: the three stages run in this order at import time.

# Stage 1: convert the PDB entries listed in CONSTANTS.ALL_PDB_IDS.
pdb_converter = PDBConverter(CONSTANTS.ALL_PDB_IDS)
pdb_converter.apply()

# Stage 2: generate one-hot features.
# NOTE(review): this stage and the next use CONSTANTS.N_PDB_IDS while the
# converter uses ALL_PDB_IDS -- presumably intentional; confirm.
feature_generator = FeatureGenerator(CONSTANTS.N_PDB_IDS)
feature_generator.one_hot()

# Stage 3: split the data.
data_spliter = DataSpliter(CONSTANTS.N_PDB_IDS)
data_spliter.split()
Computes the precision, a metric for multi-label classification of how many selected items are relevant. """ true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) precision = true_positives / (predicted_positives + K.epsilon()) return precision precision = precision(y_true, y_pred) recall = recall(y_true, y_pred) return 2 * ((precision * recall) / (precision + recall + K.epsilon())) if __name__ == '__main__': # Data feature_generator = FeatureGenerator() urls, labels = feature_generator.load_data(os.path.join( "datasets", "url_data_mega_deep_learning.csv"), url_column_name="url", label_column_name="isMalicious", to_binarize=False) urls, labels = shuffle(urls, labels) one_hot_urls = feature_generator.one_hot_encoding(urls) # Model url_detector = UrlDetector("big_conv_nn") url_detector.fit(one_hot_urls, labels) url_detector.evaluate(one_hot_urls[-100:], labels[-100:]) plt.show() # TODO: test ROC curve, new datasets
def __init__(self):
    """
    Create every model and the two classification pipelines.

    Each pipeline is an L1-penalised LinearSVC feature selector followed by
    a grid-searched AdaBoost classifier. The grids contain a single point
    each: the best parameters found by an earlier, wider search
    (n_estimators in range(30, 61, 10), learning_rate in
    np.arange(0.8, 1.01, 0.05), base estimator a default
    RandomForestClassifier).
    """
    # Removes left, right and middle outliers of the wave
    self.preprocessor = Preprocessor()

    # Generates the features of a wave
    self.feature_generator = FeatureGenerator()

    # Output arrays
    self.features, self.labels, self.file_names = [], [], []

    # Custom scoring module
    self.scorer = Scorer()

    def l1_selector():
        # Feature selection driven by an L1-penalised linear SVM.
        return SelectFromModel(LinearSVC(penalty="l1", dual=False))

    self.feature_selector_1 = l1_selector()
    self.feature_selector_2 = l1_selector()

    def boosted_forest_search(n_estimators, learning_rate, folds):
        # AdaBoost over a default random forest, wrapped in a grid search
        # scored with the custom scorer.
        booster = AdaBoostClassifier()
        forest = RandomForestClassifier()
        single_point_grid = {
            "base_estimator": [forest],
            'n_estimators': [n_estimators],
            'learning_rate': [learning_rate]
        }
        return GridSearchCV(booster,
                            param_grid=single_point_grid,
                            cv=folds,
                            scoring=make_scorer(self.scorer.score),
                            verbose=10)

    # Best parameters for classifier 1: 40 estimators, learning rate ~0.85
    # (10-fold CV). A DecisionTreeClassifier grid over criterion / depth /
    # leaf settings was an earlier alternative.
    self.classifier_1 = boosted_forest_search(40, 0.85000000000000009, 10)

    # Best parameters for classifier 2: 30 estimators, learning rate ~0.80
    # (2-fold CV).
    self.classifier_2 = boosted_forest_search(30, 0.80000000000000004, 2)

    # Pipeline initializations
    self.pipeline_1 = Pipeline([('feature_selector',
                                 self.feature_selector_1),
                                ('clf', self.classifier_1)])
    self.pipeline_2 = Pipeline([('feature_selector',
                                 self.feature_selector_2),
                                ('clf', self.classifier_2)])
class TrainingSampleGenerator(object): def __init__(self, svm_feature=False): super(TrainingSampleGenerator, self).__init__() self.training_samples = [] self.processed_training_data = [] # format: tuple([feature vector] , label) self.feature_generator = FeatureGenerator() self.svm_feature = svm_feature def parse_from_text(self, path): fp = open(path, 'r') counter = 0 lname = None root_url = None target_url = None for line in fp: counter += 1 if counter == 4: data = Data(lname, root_url, target_url) self.training_samples.append(data) counter = 0 continue elif counter == 1: lname = line.lower().strip() elif counter == 2: root_url = line.strip() elif counter == 3: target_url = line.strip() def generate(self, path): # process and generate training data self.parse_from_text(path) for raw_training_data in self.training_samples: self.processed_training_data += self.convert_raw_training_data_to_features(raw_training_data) print self.processed_training_data def prepare_libsvm(self, data_lst, path): fw = open(path, 'w') for tp in data_lst: feature_lst = tp[0] label = tp[1] line = str(label) + ' ' for i in xrange(len(feature_lst)): line += str(i+1) + ':' + str(feature_lst[i]) + ' ' fw.write(line + '\n') fw.close() def serialize(self): fp = open('processed_training_data.pkl', 'wb') pickle.dump(self.processed_training_data, fp) fp.close() def deserialize(self): # self.processed_training_data = pickle.load(open('processed_training_data.pkl', 'rb')) self.processed_training_data = pickle.load(open('processed_training_data.pkl.bak', 'rb')) # self.processed_training_data = pickle.load(open('processed_training_data.pkl.svm.full', 'rb')) # print self.processed_training_data print len(self.processed_training_data) def convert_raw_training_data_to_features(self, train_data): q = Queue() url_deduplicator = {} DELIMITER = ('###DELIMITER###', '', -1) lname = train_data.lname root_url = train_data.root_url target_url = train_data.target_url self.feature_generator.setup(lname, root_url) 
training_data_features = [] q.push( (root_url, '', 0) ) q.push(DELIMITER) level_count = 0 stop_pushing_flag = False while not q.is_empty(): cur_url_info = q.pop() print cur_url_info print target_url, len(target_url) if cur_url_info == DELIMITER: #if delimiter, increment level counter level_count += 1 if level_count == 1: #dig no mare than 1 levels stop_pushing_flag = True else: continue if url_deduplicator.has_key(url_cleanup(cur_url_info[0])): # if appeared, ignore continue else: url_deduplicator[url_cleanup(cur_url_info[0])] = True self.feature_generator.generate_features(cur_url_info, self.svm_feature) if self.feature_generator.features: #if feautures is [], then means previous is an invalid link if url_cleanup(cur_url_info[0]) == target_url: training_data_features.append( (self.feature_generator.features, 1) ) print '\t\t', (self.feature_generator.features, 1) else: training_data_features.append( (self.feature_generator.features, 0) ) print '\t\t', (self.feature_generator.features, 0) # Push sublinks if not stop_pushing_flag: try: raw_html = urllib2.urlopen(cur_url_info[0], timeout=3).read() soup = BeautifulSoup(raw_html) a_lst = soup.find_all('a', href=True) for anchor in a_lst: # new_url = urllib2.urlparse.urljoin(cur_url_info[0], anchor.attrs['href']) new_url = get_absolute_url(cur_url_info[0], anchor.attrs['href']) if under_same_maindomain(cur_url_info[0], new_url): q.push( (new_url, anchor.text, level_count+1) ) if cur_url_info[2] == 0: q.push(DELIMITER) except: pass self.feature_generator.close_up() return training_data_features def five_folds(self): """ Format: [ ( [([feature vector], label), (), ...], #training set 80% fold[0] [([feature vector], label), (), ...] #testing set 20% fold[1] ) #1st fold, ().... 
] """ total_len = len(self.processed_training_data) five_folds = [] test_set_len = total_len - 4 * (total_len/5) print 'total len: ', total_len for i in xrange(5): test_set = [] for j in xrange(test_set_len): idx = i * (total_len/5) + j test_set.append(self.processed_training_data[idx]) training_set = self.processed_training_data[: i*(total_len/5)] + self.processed_training_data[i*(total_len/5) + test_set_len :] self.prepare_libsvm(training_set, config.LIBSVM_TRAINING_FOLDS[i]) self.prepare_libsvm(test_set, config.LIBSVM_TESTING_FOLDS[i]) five_folds.append( (training_set, test_set) ) return five_folds
def predict_on_dated_dataset(day=15,
                             month=7,
                             year=2018,
                             randomise=False,
                             ratio_good_bad=1,
                             ratio_testing_set=0.2,
                             validation_split=0.2,
                             reverse=False):
    """
    Prediction on the dated dataset.

    Parameters
    ----------
    day, month, year
        Limit date: the network is trained on data dated before this day
        and tested on data from this day and newer. The proportion
        training/testing is at 80%/20% for the date 15/07/2018.
        Only used if 'randomise' = False.
    randomise
        Whether to randomise the set so as to use or not the date of the
        data.
    ratio_good_bad
        Ratio of (Good Data)/(Bad Data).
    ratio_testing_set
        Represents the proportion of the dataset to include in the test
        split. Only used if 'randomise' = True
    validation_split
        % of data to put in the validation set.
    reverse
        True to train on newer data and test on older.
        Only used if 'randomise' = False.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level imports.
    import numpy as np

    bad_urls_path = os.path.join("datasets", "bad_urls1.csv")
    good_urls_path = os.path.join("datasets", "good_urls.csv")

    # Data
    if randomise:
        training_urls, training_labels, testing_urls, testing_labels = \
            load_randomized_dated_data(bad_urls_path,
                                       good_urls_path,
                                       ratio_good_bad=ratio_good_bad,
                                       ratio_testing_set=ratio_testing_set)
    else:
        # 20% is at 15/07/2018
        training_urls, training_labels, testing_urls, testing_labels = \
            load_dated_data(bad_urls_path,
                            good_urls_path,
                            ratio_good_bad=ratio_good_bad,
                            separation_date=date(year, month, day))
        if reverse:
            training_urls, testing_urls = testing_urls, training_urls
            training_labels, testing_labels = testing_labels, training_labels

    # Features
    feature_generator = FeatureGenerator()
    one_hot_training_urls = feature_generator.one_hot_encoding(training_urls)
    one_hot_testing_urls = feature_generator.one_hot_encoding(testing_urls)

    # Model
    url_detector = UrlDetector("big_conv_nn")
    url_detector.fit(one_hot_training_urls,
                     training_labels,
                     epochs=5,
                     batch_size=128,
                     validation_split=validation_split)

    # Evaluate on the test split
    url_detector.evaluate(one_hot_testing_urls, testing_labels)
    url_detector.plot_roc_curve(one_hot_testing_urls, testing_labels)
    plt.show()

    # Evaluate on the whole dataset (train + test). The two parts are
    # joined sample-wise with np.concatenate: `+` only concatenates plain
    # lists, while on NumPy arrays it adds element-wise (or fails when the
    # two splits differ in size).
    all_one_hot_urls = np.concatenate(
        (one_hot_training_urls, one_hot_testing_urls))
    all_labels = np.concatenate((training_labels, testing_labels))
    url_detector.evaluate(all_one_hot_urls, all_labels)
def main(_):
    """Build the DIEN estimator and run the task named by ``FLAGS.task_type``.

    Supported task types are 'train' (train_and_evaluate), 'eval' (evaluate
    on the training and then the test records), 'infer' (predict on the test
    records) and 'export' (write a SavedModel to
    ``FLAGS.servable_model_dir``). The single positional argument is ignored.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    set_dist_env()

    # ------ build tasks ------
    model_params = {
        "learning_rate": FLAGS.learning_rate,
        "l2_reg": FLAGS.l2_reg,
        # Layer sizes and dropout rates arrive as comma-separated flag strings.
        "fcn_layers": list(map(int, FLAGS.fcn_layers.split(','))),
        "atten_layers": list(map(int, FLAGS.atten_layers.split(','))),
        "auxili_layers": list(map(int, FLAGS.auxili_layers.split(','))),
        "dropout": list(map(float, FLAGS.dropout.split(','))),
        "optimizer": FLAGS.optimizer,
        "neg_count": FLAGS.neg_count
    }

    # Negative sampling source. Opened with a context manager so the file
    # handle is closed as soon as the pickle has been read.
    with open("data/mid_cat.pkl", "rb") as handle:
        model_params["mid_cat"] = cPickle.load(handle)

    tr_files = "./data/train.tfrecords"
    va_files = "./data/test.tfrecords"

    fea_json = feature_json('./feature_generator.json')
    fg = FeatureGenerator(fea_json)
    md = DIEN(fg)
    model = Model(fg, md)

    config = tf.estimator.RunConfig().replace(
        session_config=tf.ConfigProto(
            device_count={'GPU': 0, 'CPU': FLAGS.num_threads}),
        log_step_count_steps=FLAGS.log_steps,
        save_summary_steps=FLAGS.log_steps,
        save_checkpoints_secs=FLAGS.save_checkpoints_secs)

    estimator = tf.estimator.Estimator(
        model_fn=model.model_fn,
        model_dir='./model/',
        params=model_params,
        config=config)

    if FLAGS.task_type == 'train':
        train_spec = tf.estimator.TrainSpec(
            input_fn=lambda: model.input_fn(
                tr_files,
                num_epochs=FLAGS.num_epochs,
                batch_size=FLAGS.batch_size))
        eval_spec = tf.estimator.EvalSpec(
            input_fn=lambda: model.input_fn(
                va_files, num_epochs=1, batch_size=FLAGS.batch_size),
            steps=None,
            start_delay_secs=10,
            throttle_secs=FLAGS.save_checkpoints_secs)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    elif FLAGS.task_type == 'eval':
        # Evaluate on the training records first, then on the test records.
        estimator.evaluate(
            input_fn=lambda: model.input_fn(
                tr_files, num_epochs=1, batch_size=FLAGS.batch_size))
        estimator.evaluate(
            input_fn=lambda: model.input_fn(
                va_files, num_epochs=1, batch_size=FLAGS.batch_size))
    elif FLAGS.task_type == 'infer':
        # NOTE(review): `preds` is never iterated or returned here; if
        # Estimator.predict is lazy (it is documented as yielding
        # predictions), no inference actually runs -- confirm intent.
        preds = estimator.predict(
            input_fn=lambda: model.input_fn(
                va_files, num_epochs=1, batch_size=FLAGS.batch_size),
            predict_keys="prob")
    elif FLAGS.task_type == 'export':
        # Save for single-machine use.
        # Alternative (parsing receiver built from fg.feature_spec):
        # tf.estimator.export.build_parsing_serving_input_receiver_fn(fg.feature_spec)
        serving_input_receiver_fn = (
            tf.estimator.export.build_raw_serving_input_receiver_fn(
                fg.feature_placeholders))
        estimator.export_saved_model(FLAGS.servable_model_dir,
                                     serving_input_receiver_fn)
class CrfSuite(Tags):
    """CRF tagger utilities built on sklearn-crfsuite.

    Covers training, persistence (joblib), scoring, tagging of documents,
    randomised hyperparameter search and learning-curve plotting.
    """

    __seperator = "/"
    __crf_model_name = "current_crf_model.pkl"

    def __init__(self):
        self.logger = Logger()
        self.logger.println("CrfSuite created")

    @staticmethod
    def _binarize_tags(y_true, y_pred):
        """Binarize flattened true/predicted tags for the sklearn metrics.

        Returns ``(y_true_bin, y_pred_bin, tagset, label_indices)`` where
        ``tagset`` is every class except 'O', sorted by entity name and then
        by prefix, and ``label_indices`` are the matching column indices.
        """
        lb = LabelBinarizer()
        y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
        y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

        tagset = sorted(set(lb.classes_) - {'O'},
                        key=lambda tag: tag.split('-', 1)[::-1])
        class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
        label_indices = [class_indices[cls] for cls in tagset]
        return y_true_combined, y_pred_combined, tagset, label_indices

    @staticmethod
    def _group_by_line(X, y):
        """Flatten per-document features/labels into parallel per-line lists."""
        x_lines = []
        y_lines = []
        for doc_x, doc_y in zip(X, y):
            for line_idx, line in enumerate(doc_x):
                x_lines.append(line)
                y_lines.append(doc_y[line_idx])
        return x_lines, y_lines

    def train_model(self, X, y):
        """Fit and return an L-BFGS CRF on the flattened documents X / y."""
        self.logger.println("transforming data to train model")
        X_combined = list(chain.from_iterable(X))
        y_combined = list(chain.from_iterable(y))

        self.logger.println("crf trainer init")
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=0.35,
                                   c2=0.35,
                                   max_iterations=125,
                                   all_possible_transitions=True,
                                   verbose=False)
        crf.fit(X_combined, y_combined)
        return crf

    def save_model(self, model, name=__crf_model_name):
        """Persist `model` to the file `name` with joblib."""
        joblib.dump(model, name)

    def load_model(self, name=__crf_model_name):
        """Load and return the joblib-persisted model stored in `name`."""
        return joblib.load(name)

    def score_model(self, y_true, y_pred):
        """Return the weighted F1 score over every tag except 'O'."""
        y_true_combined, y_pred_combined, _, label_indices = (
            self._binarize_tags(y_true, y_pred))
        return f1_score(y_true_combined,
                        y_pred_combined,
                        average="weighted",
                        labels=label_indices)

    def print_classification_report(self, y_true, y_pred):
        """Print sklearn's classification report for every tag except 'O'."""
        y_true_combined, y_pred_combined, tagset, label_indices = (
            self._binarize_tags(y_true, y_pred))
        # TODO return f1 score or other wise here: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
        print(
            classification_report(y_true_combined,
                                  y_pred_combined,
                                  labels=label_indices,
                                  target_names=tagset))

    def load_tagger(self):
        """Open the pycrfsuite tagger and rebuild the feature generator."""
        self.__trained_tagger = pycrfsuite.Tagger()
        self.__trained_tagger.open('test_NER.crfsuite')

        we_model = WeModel()
        self.w2v_model = we_model.read()

        dataset = Dataset()
        data = dataset.read(nr_of_files=-1)
        word2count, word2idx = dataset.encode_dataset(data)
        self.f_generator = FeatureGenerator(self.w2v_model, word2count,
                                            word2idx)

    # doc: in format of tagged tuples
    def tag_doc(self, doc):
        """Tag one document and return its (entity text, tag) pairs.

        Requires `load_tagger` to have been called first, because it reads
        `self.f_generator`. The model is loaded from disk on every call.
        """
        feature_input = self.f_generator.generate_features_docs([doc])
        model = self.load_model()
        predicted_tags = model.predict(feature_input[0])
        # TODO change this to take in doc and not xseq (convert predicted tags
        # to the structure of doc)
        return self.interpret_predicted_tags(doc, predicted_tags)

    def interpret_predicted_tags(self, doc, tags):
        """Turn predicted tags into a list of (entity text, start tag) tuples.

        For each tag found in `Tags.start_tagset`, the first element of every
        following token is appended (each preceded by a space) until the
        outside tag or the end of the sequence is reached.
        """
        dataset = Dataset()
        identified_entities = []
        doc = dataset.docs2lines(doc)
        tags = dataset.docs2lines(tags)
        # Name-mangled attribute defined on the Tags base class.
        outside_tag = self._Tags__outside_tag

        for tag_idx, tag in enumerate(tags):
            if tag not in Tags.start_tagset:
                continue
            entity_found = ""
            tag_idx_forward = tag_idx
            # NOTE(review): only the outside tag ends a span, so a directly
            # following start tag is absorbed into this entity -- confirm
            # that this is intended.
            while (tag_idx_forward < len(tags)
                   and tags[tag_idx_forward] != outside_tag):
                entity_found = entity_found + " " + doc[tag_idx_forward][0]
                tag_idx_forward += 1
            identified_entities.append((entity_found, tags[tag_idx]))
        return identified_entities

    # use an existing model to tag data
    def test_model(self, model, features):
        """Return `model`'s predictions for the flattened `features`."""
        X_features = list(chain.from_iterable(features))
        return model.predict(X_features)

    # hyperparameter optimisation
    def optimise_model(self, X, y):
        """Run a randomised search over c1/c2 and plot the CV results."""
        # transform data structure to group tokens by lines
        xseq, yseq = self._group_by_line(X, y)

        # define fixed parameters and parameters to search
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   max_iterations=200,
                                   all_possible_transitions=True,
                                   verbose=True)
        params_space = {
            'c1': scipy.stats.expon(scale=0.03),
            'c2': scipy.stats.expon(scale=0.03),
        }

        # Build a filtered copy: removing 'O' from Tags.tag_list in place
        # would alter the shared class attribute and make a second call
        # raise ValueError.
        labels = [tag for tag in Tags.tag_list if tag != 'O']

        # use the same metric for evaluation
        f1_scorer = make_scorer(metrics.flat_f1_score,
                                average='weighted',
                                labels=labels)

        # search
        rs = RandomizedSearchCV(crf,
                                params_space,
                                cv=5,
                                verbose=1,
                                n_jobs=2,
                                n_iter=50,
                                scoring=f1_scorer)
        rs.fit(xseq, yseq)

        print('best params:', rs.best_params_)
        print('best CV score:', rs.best_score_)
        print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ /
                                            1000000))

        # cv_results_ supersedes the removed grid_scores_ attribute.
        cv_results = rs.cv_results_
        _x = list(cv_results['param_c1'])
        _y = list(cv_results['param_c2'])
        _c = list(cv_results['mean_test_score'])

        fig = plt.figure()
        fig.set_size_inches(12, 12)
        ax = plt.gca()
        ax.set_yscale('log')
        ax.set_xscale('log')
        ax.set_xlabel('C1')
        ax.set_ylabel('C2')
        ax.set_title(
            "Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})"
            .format(min(_c), max(_c)))
        ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0, 0, 0])

        print("Dark blue => {:0.4}, dark red => {:0.4}".format(
            min(_c), max(_c)))
        plt.show()

    def plot_learning_curve(self, X, y):
        """Plot training and cross-validation score against training size.

        Returns the `plt` module so the caller can show or save the figure.
        """
        train_sizes = np.linspace(.1, 1.0, 5)
        n_jobs = 8
        cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

        plt.figure()
        plt.title("Learning Curves")
        plt.ylim(0.01, 1.01)
        plt.xlabel("Training examples")
        plt.ylabel("Score")

        X_lines, y_lines = self._group_by_line(X, y)

        estimator = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                         c1=0.001,
                                         c2=0.001,
                                         max_iterations=110,
                                         all_possible_transitions=True,
                                         verbose=True)
        custom_scorer = make_scorer(self.score_model, greater_is_better=True)

        train_sizes, train_scores, test_scores = learning_curve(
            estimator,
            X_lines,
            y_lines,
            cv=cv,
            scoring=custom_scorer,
            n_jobs=n_jobs,
            train_sizes=train_sizes)

        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)

        plt.grid()
        # Shaded bands show one standard deviation around each mean curve.
        plt.fill_between(train_sizes,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1,
                         color="r")
        plt.fill_between(train_sizes,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1,
                         color="g")
        plt.plot(train_sizes,
                 train_scores_mean,
                 'o-',
                 color="r",
                 label="Training score")
        plt.plot(train_sizes,
                 test_scores_mean,
                 'o-',
                 color="g",
                 label="Cross-validation score")
        plt.legend(loc="best")
        return plt
class magicalObject:
    """Nearest-neighbour playlist recommender backed by the Spotify API.

    Training rows are read from './training.data' (one JSON array per line);
    element 4 of each row is the feature vector indexed by the BallTree.
    """

    def __init__(self):
        print('Initializing...')
        self.client_credentials_manager = SpotifyClientCredentials(
            client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
        self.spotify_api = spotipy.Spotify(
            client_credentials_manager=self.client_credentials_manager)

        print('Training...')
        self.gen = FeatureGenerator(spotify=self.spotify_api)
        # Context manager so the training file is closed after loading.
        with open('./training.data', 'r') as training_file:
            self.X = [json.loads(line.strip()) for line in training_file]
        self.classifier = BallTree([x[4] for x in self.X])
        print('Training complete.')

    def _nearest_rows(self, feature, k=5):
        """Return the k training rows whose feature vectors are closest."""
        _, ind = self.classifier.query([feature], k=k)
        return [self.X[i] for i in ind[0]]

    def add_track_info_to_results(self, results):
        """Replace each row of `results`, in place, by an enriched JSON string.

        Every row is looked up on Spotify (row[3] is the username, row[0]
        the playlist id); the playlist URL and a list of
        [track name, first artist name, track URL] entries are appended
        before serialising. The row objects themselves are not modified, so
        rows shared with `self.X` do not grow on repeated queries.
        Returns None.
        """
        for i, arr in enumerate(results):
            username = arr[3]
            playlist_id = arr[0]
            playlist_search = self.spotify_api.user_playlist(
                username, playlist_id)
            playlist_url = playlist_search['external_urls']['spotify']

            track_info = [[
                item['track']['name'],
                item['track']['artists'][0]['name'],
                item['track']['external_urls']['spotify'],
            ] for item in playlist_search['tracks']['items']]

            # Work on a copy: `arr` may be one of the rows held in self.X.
            enriched = list(arr)
            enriched.append(playlist_url)
            enriched.append(track_info)
            results[i] = json.dumps(enriched)

    def classify_playlist(self, playlist_uri):
        """Return the 5 nearest playlists, as JSON strings, for a playlist URI.

        The URI is split on ':'; part 2 is used as the user and part 4 as
        the playlist id.
        """
        parts = playlist_uri.split(':')
        user = parts[2]
        playlist = parts[4]
        p = self.spotify_api.user_playlist(user, playlist)

        results = self._nearest_rows(self.gen.process(p))
        self.add_track_info_to_results(results)
        return results

    def classify_song(self, song_uri):
        """Return the 5 nearest playlists, as JSON strings, for a track URI."""
        parts = song_uri.split(':')
        track = self.spotify_api.track(parts[-1])
        # Wrap the single track in the playlist shape the generator consumes.
        p = {'tracks': {'items': [{'track': track}]}}

        results = self._nearest_rows(self.gen.process(p))
        self.add_track_info_to_results(results)
        return results

    def search_for_tracks(self, string_query):
        """Search Spotify for tracks and return a list of summary dicts."""
        results = self.spotify_api.search(string_query)['tracks']['items']
        return [{
            'song_id': result['id'],
            'song_name': result['name'],
            'artist_name': result['album']['artists'][0]['name'],
            'artist_id': result['album']['artists'][0]['id'],
            'song_uri': result['uri']
        } for result in results]

    def search_for_playlists(self, string_query):
        """Return the raw Spotify search response for playlists."""
        return self.spotify_api.search(string_query, type='playlist')
def __init__(self, svm_feature=False):
    """Set up empty sample stores and a fresh FeatureGenerator.

    :param svm_feature: flag stored as-is on the instance.
    """
    super(TrainingSampleGenerator, self).__init__()
    self.svm_feature = svm_feature
    self.feature_generator = FeatureGenerator()
    # format: tuple([feature vector] , label)
    # NOTE(review): the format comment appears to describe
    # processed_training_data -- confirm.
    self.processed_training_data = []
    self.training_samples = []
def perform_bootstrapping(self, dataset, sample_size, iterations):
    """Bootstrap the CRF `iterations` times and collect per-entity scores.

    Each iteration resamples `dataset` (keeping the out-of-bag documents as
    the test set), trains word embeddings and a CRF, predicts the
    out-of-bag documents, and appends the row returned by
    `self.entity_scorer` for each entity to that entity's score array.

    Nothing is printed besides the iteration counter and nothing is
    plotted; averaging the returned rows is left to the caller.

    :param dataset: documents to resample from
    :param sample_size: passed through to `self.resample_data`
    :param iterations: number of bootstrap rounds
    :return: tuple of arrays with one row per iteration, in the order
        (EMP-POS, EMP-COMP, EDU-INST, EDU-MAJOR)
    """
    entity_names = ("EMP-POS", "EMP-COMP", "EDU-MAJOR", "EDU-INST")
    entity_scores = {
        name: np.empty(shape=(0, 3), dtype='float64')
        for name in entity_names
    }

    for x in range(0, iterations):
        print("iteration nr %s" % x)
        sampled_train_set, oob_test_set = self.resample_data(
            dataset, sample_size, return_leftovers=True)

        cs = CrfSuite()
        ds = Dataset()
        we_model = WeModel()
        # The embeddings are trained on the full dataset, not only on the
        # resampled training documents.
        w2v_model = we_model.train(
            dataset)  # optionally load a pretrained model here
        word2count, word2idx = ds.encode_dataset(sampled_train_set)

        f_generator = FeatureGenerator(w2v_model, word2count, word2idx)
        train_features = f_generator.generate_features_docs(
            sampled_train_set)
        y_train = f_generator.generate_true_outcome(sampled_train_set)
        test_features = f_generator.generate_features_docs(oob_test_set)
        y_test = f_generator.generate_true_outcome(oob_test_set)

        trainer = cs.train_model(train_features, y_train)
        y_test_pred = cs.test_model(trainer, test_features)

        # Convert the true labels once and reuse them for every entity.
        y_test_lines = ds.docs2lines(y_test)
        for name in entity_names:
            entity_scores[name] = np.vstack([
                entity_scores[name],
                self.entity_scorer(y_test_lines, y_test_pred, name)
            ])

        # Drop the large objects before the next iteration allocates new ones.
        w2v_model = None
        train_features = test_features = None

    # Order kept for existing callers: INST comes before MAJOR.
    return (entity_scores["EMP-POS"], entity_scores["EMP-COMP"],
            entity_scores["EDU-INST"], entity_scores["EDU-MAJOR"])
# SPLIT THE DATA (we could use sklearn.model_selection.train_test_split) training_set_size = int(0.8 * len(data)) training_data = data[:training_set_size] test_data = data[training_set_size:] # K-Fold split kf = KFold(n_splits=5) kf.get_n_splits(training_data) scores = [] for train_index, dev_index in kf.split(training_data): ttp_train_data = get_sub(training_data, train_index) ttp_dev_data = get_sub(training_data, dev_index) feature_generator = FeatureGenerator(ttp_train_data) train_data_vectors, train_classes = feature_generator.transform( ttp_train_data) dev_data_vectors, dev_classes = feature_generator.transform( ttp_dev_data) test_data_vectors, test_classes = feature_generator.transform( test_data) # TRAIN THE MODEL cls_models = [] #cls_models.append(GaussianNB()) cls_models.append(LinearSVC(random_state=0)) #cls_models.append(RandomForestClassifier(bootstrap=True, max_depth=10, max_features='auto', # min_samples_leaf=1, min_samples_split=2, n_estimators=10, # random_state=0, n_jobs=-1)) #cls_models.append(MLPClassifier(activation='tanh', hidden_layer_sizes=(16,)))
class ECGClassifier:
    """Two-pipeline ECG wave classifier.

    Each wave is preprocessed and turned into a feature vector whose type is
    either "peak" or something else (called "point" here). Each type has its
    own feature-selection + AdaBoost pipeline, and predictions are merged
    back into the original wave order.
    """

    # Cache location and file suffixes, listed in the order in which
    # __transform__ returns its four results.
    _CACHE_DIR = "pickle_files/"
    _CACHE_SUFFIXES = ("_peak_data.pickle", "_peak_indices.pickle",
                       "_point_data.pickle", "_point_indices.pickle")

    def __init__(self):
        """
        Initialize all the models and classification pipeline
        """
        # Preprocessor which removes left, right and middle outliers of the wave
        self.preprocessor = Preprocessor()

        # Feature Generator for generating features of a wave
        self.feature_generator = FeatureGenerator()

        # Output arrays
        self.features = []
        self.labels = []
        self.file_names = []

        # Custom scoring module
        self.scorer = Scorer()

        # Feature selection models
        self.feature_selector_1 = SelectFromModel(
            LinearSVC(penalty="l1", dual=False))
        self.feature_selector_2 = SelectFromModel(
            LinearSVC(penalty="l1", dual=False))

        # Classification models. Each grid holds a single point: the best
        # values recorded from an earlier, wider search over
        # n_estimators in range(30, 61, 10) and
        # learning_rate in np.arange(0.8, 1.01, 0.05).
        clf_1 = AdaBoostClassifier()
        base_classifier_1 = RandomForestClassifier()
        optimal_params = {
            "base_estimator": [base_classifier_1],
            'n_estimators': [40],
            'learning_rate': [0.85000000000000009]
        }
        self.classifier_1 = GridSearchCV(clf_1,
                                         param_grid=optimal_params,
                                         cv=10,
                                         scoring=make_scorer(
                                             self.scorer.score),
                                         verbose=10)

        clf_2 = AdaBoostClassifier()
        base_classifier_2 = RandomForestClassifier()
        optimal_params = {
            "base_estimator": [base_classifier_2],
            'n_estimators': [30],
            'learning_rate': [0.80000000000000004]
        }
        self.classifier_2 = GridSearchCV(clf_2,
                                         param_grid=optimal_params,
                                         cv=2,
                                         scoring=make_scorer(
                                             self.scorer.score),
                                         verbose=10)

        # Pipeline initializations
        self.pipeline_1 = Pipeline([('feature_selector',
                                     self.feature_selector_1),
                                    ('clf', self.classifier_1)])
        self.pipeline_2 = Pipeline([('feature_selector',
                                     self.feature_selector_2),
                                    ('clf', self.classifier_2)])

    def fit(self, X, Y, filenames):
        """
        Fits the training data and labels in the classifier
        :param X: Training data
        :param Y: Training labels
        :param filenames: File name of each wave (printed while transforming)
        """
        X1, I1, X2, I2 = self.__transform__(X, filenames, 'training')
        Y1, Y2 = self.__get_feature_labels__(Y, I1, I2)
        self.pipeline_1.fit(X1, Y1)
        self.pipeline_2.fit(X2, Y2)

    def predict(self, X, file_names):
        """
        Predict test labels
        :param X: Test Data
        :param file_names: File name of each wave (printed while transforming)
        :return: Return predicted output labels
        """
        X1, I1, X2, I2 = self.__transform__(X, file_names, 'test')
        Y1 = self.pipeline_1.predict(X1)
        Y2 = self.pipeline_2.predict(X2)
        return self.__merge__(Y1, Y2, I1, I2)

    def score(self, X, Y, file_names):
        """
        Predict and compute the accuracy score
        :param X: Test data
        :param Y: Actual labels of test data
        :param file_names: File name of each wave
        :return: Value returned by the custom scorer
        """
        predicted_Y = self.predict(X, file_names)
        # NOTE(review): predictions are passed first here, whereas
        # make_scorer invokes the same function as (y_true, y_pred) --
        # confirm that Scorer.score is symmetric.
        return self.scorer.score(predicted_Y, Y)

    @staticmethod
    def _load_pickle(path):
        """Return the object stored in the pickle file at `path`."""
        with open(path, "rb") as handle:
            return cPickle.load(handle)

    @staticmethod
    def _dump_pickle(path, payload):
        """Pickle `payload` to `path` with the garbage collector paused."""
        gc.disable()
        try:
            with open(path, 'wb') as handle:
                cPickle.dump(payload,
                             handle,
                             protocol=cPickle.HIGHEST_PROTOCOL)
        finally:
            # Always re-enable collection, even when the dump fails.
            gc.enable()

    def __transform__(self, X, filenames, prefix='training'):
        """
        Transforms the provided waves data into arrays containing the
        features of each wave, split by feature type. Results are cached in
        pickle files named after `prefix` and reused on later calls.
        :param X: Sequence of waves, each a sequence of data points
        :param filenames: File name of each wave, printed as progress output
        :param prefix: Cache file prefix ('training' or 'test')
        :return: [peak_features, peak_indices, point_features, point_indices]
        """
        paths = [
            self._CACHE_DIR + prefix + suffix
            for suffix in self._CACHE_SUFFIXES
        ]

        # Reuse the cache only when all four files exist, so a partially
        # written cache is recomputed instead of failing on load.
        if all(os.path.isfile(path) for path in paths):
            return [self._load_pickle(path) for path in paths]

        peak_features = []
        peak_indices = []
        point_features = []
        point_indices = []

        for i, data in enumerate(X):
            print(filenames[i])

            # Remove outlier sections from the wave
            data, outliers = self.preprocessor.process(data)

            # Append the features of the transformed wave to the matching
            # output array, remembering its original position
            features, wave_type = self.feature_generator.get_features(
                data, outliers)
            if wave_type == "peak":
                peak_features.append(features)
                peak_indices.append(i)
            else:
                point_features.append(features)
                point_indices.append(i)

        # Store the data in pickle files
        transformed = [
            peak_features, peak_indices, point_features, point_indices
        ]
        for path, payload in zip(paths, transformed):
            self._dump_pickle(path, payload)

        return transformed

    def __get_feature_labels__(self, Y, I1, I2):
        """
        Get feature labels for corresponding index arrays
        :param Y: Output labels array
        :param I1: Index Array
        :param I2: Index Array
        :return: Corresponding label arrays
        """
        Y1 = [Y[i] for i in I1]
        Y2 = [Y[i] for i in I2]
        return [Y1, Y2]

    def __merge__(self, Y1, Y2, I1, I2):
        """
        Merge two output labels arrays using index arrays
        :param Y1: Labels array
        :param Y2: Labels array
        :param I1: Index array
        :param I2: Index array
        :return: Merged output labels array (float NumPy array)
        """
        output = np.zeros(len(Y1) + len(Y2))
        for position, label in zip(I1, Y1):
            output[position] = label
        for position, label in zip(I2, Y2):
            output[position] = label
        return output