Example #1
    def optimise_model(self, argv):
        self.logger.println("optimise model called")
        start_time = timeit.default_timer()

        cs = CrfSuite()

        dataset = Dataset()
        data = dataset.read(nr_of_files=argv)

        we_model = WeModel()
        w2v_model = we_model.train(
            data)  # optionally load a pretrained model here
        we_model.save(w2v_model)

        word2count, word2idx = dataset.encode_dataset(data)

        f_generator = FeatureGenerator(w2v_model, word2count, word2idx)
        train_features = f_generator.generate_features_docs(data)
        y_train = f_generator.generate_true_outcome(data)

        cs.optimise_model(train_features, y_train)

        elapsed_seconds = timeit.default_timer() - start_time
        self.logger.print_time_taken("optimise model operation took",
                                     elapsed_seconds)
Example #2
def predict_urls(urls: list, model=None):
    """Predicts the probabilities of given urls."""
    feature_generator = FeatureGenerator()
    if model is None:
        tr_urls, tr_labels = load_data(os.path.join(
            "datasets", 'url_data_mega_deep_learning.csv'),
                                       url_column_name='url',
                                       label_column_name='isMalicious',
                                       to_binarize=False)

        # Features
        one_hot_tr_urls = feature_generator.one_hot_encoding(tr_urls)
        X_train, X_val, y_train, y_val = train_test_split(one_hot_tr_urls,
                                                          tr_labels,
                                                          test_size=0.2)

        # Model
        model = UrlDetector("big_conv_nn")
        model.fit(X_train,
                  y_train,
                  epochs=5,
                  batch_size=512,
                  validation_data=(X_val, y_val))

    # Predict
    one_hot_urls = feature_generator.one_hot_encoding(urls)
    print(model.predict_proba(one_hot_urls))
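A minimal usage sketch for predict_urls (the URLs below are made-up placeholders; with model=None the function first trains a fresh UrlDetector on datasets/url_data_mega_deep_learning.csv, then prints the predicted probabilities):

predict_urls([
    "http://example.com/index.html",                # hypothetical benign URL
    "http://secure-login.example-bank.xyz/verify",  # hypothetical suspicious URL
])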
Example #3
def predict_on_non_dated_dataset(
        non_dated_dataset='url_data_mega_deep_learning.csv'):
    """Prediction on the non-dated dataset."""
    if non_dated_dataset == 'url_data_mega_deep_learning.csv':
        to_binarize = False
        label_column_name = 'isMalicious'
        url_column_name = 'url'
    elif non_dated_dataset == 'simple.csv':
        to_binarize = True
        label_column_name = 'label'
        url_column_name = 'url'

    # Data
    urls, labels = load_data(os.path.join("datasets", non_dated_dataset),
                             url_column_name=url_column_name,
                             label_column_name=label_column_name,
                             to_binarize=to_binarize)

    # Features
    feature_generator = FeatureGenerator()
    one_hot_urls = feature_generator.one_hot_encoding(urls)
    X_train, X_test, y_train, y_test = train_test_split(one_hot_urls,
                                                        labels,
                                                        test_size=0.2)

    # Model
    url_detector = UrlDetector("big_conv_nn")
    url_detector.fit(X_train, y_train, epochs=5, batch_size=128)

    # Evaluate
    url_detector.evaluate(X_test, y_test)
    url_detector.plot_roc_curve(X_test, y_test)

    plt.show()
Example #4
    def train_model_learning_curve(self, arg):
        self.logger.println("train model called")
        start_time = timeit.default_timer()

        cs = CrfSuite()

        dataset = Dataset()
        data = dataset.read(nr_of_files=arg)
        nr_of_filled_lines, data1 = dataset.filter_for_filled_tags(data)
        data2 = dataset.obtain_default_tags(nr_of_filled_lines * 3, data)
        data = data1 + data2
        data = dataset.shuffle_data(data)
        train_set, test_set = dataset.split_dataset(data)

        we_model = WeModel()
        w2v_model = we_model.train(
            data)  # optionally load a pretrained model here
        #w2v_model = we_model.load_pretrained_model() # optionally load a pretrained model here
        word2count, word2idx = dataset.encode_dataset(train_set)

        f_generator = FeatureGenerator(w2v_model, word2count, word2idx)
        train_features = f_generator.generate_features_docs(train_set)
        y_train = f_generator.generate_true_outcome(train_set)

        cs.plot_learning_curve(train_features, y_train)
        plt.show()

        elapsed_seconds = timeit.default_timer() - start_time
        self.logger.print_time_taken("train model operation took",
                                     elapsed_seconds)
Example #5
    def load_tagger(self):
        self.__trained_tagger = pycrfsuite.Tagger()
        self.__trained_tagger.open('test_NER.crfsuite')

        we_model = WeModel()
        self.w2v_model = we_model.read()
        dataset = Dataset()
        data = dataset.read(nr_of_files=-1)
        word2count, word2idx = dataset.encode_dataset(data)
        self.f_generator = FeatureGenerator(self.w2v_model, word2count,
                                            word2idx)
Example #6
 def __init__(self):
     print('Initializing...')
     self.client_credentials_manager = SpotifyClientCredentials(
         client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
     self.spotify_api = spotipy.Spotify(
         client_credentials_manager=self.client_credentials_manager)
     print('Training...')
     self.gen = FeatureGenerator(spotify=self.spotify_api)
     self.X = [json.loads(d.strip()) for d in open('./training.data', 'r')]
     self.classifier = BallTree([x[4] for x in self.X])
     print('Training complete.')
Example #7
    def get_feature_generator_results(self):
        input = [[("this", "", "", ""), ("is", "", "", ""), ("a", "", "", ""),
                  ("test", "", "", "")],
                 [("this", "", "", ""), ("is", "", "", ""), ("a", "", "", ""),
                  ("test", "", "", "")]]

        dataset = Dataset()
        word2count, word2idx = dataset.encode_dataset([input])

        f_generator = FeatureGenerator(self.w2v_model, word2count, word2idx)

        X = f_generator.generate_features_docs([input])
        y = f_generator.generate_true_outcome([input])

        # run tests on X and y
        return input, X, y
Example #8
def menu_cluster(fn='merged_shop.json'):
    '''
    Cluster the dishes of each shop.
    input: fn string -- name of the merged file; one merged shop per line: {"baidu":'', "eleme":'', "meituan":'', "id":''}
    output: prints the result as JSON, one line per shop with its merged dish information
    '''
    sql = "select * from `{}` where id='{}'"
    mysql_obj = get_mysql_obj(os.sep.join([conf_dir, 'db.conf']),
                              'mysql_online')
    conn = mysql_obj['conn']
    cursor = mysql_obj['cursor']

    feat_gen = FeatureGenerator()
    cluster_obj = SimpleCluster()

    tb_dic = {
        'eleme': 'eleme_shop',
        'baidu': 'baidu_waimai_shop',
        'meituan': 'meituan_waimai_shop'
    }
    num = 0
    for line in open(fn, 'r'):
        dic = json.loads(line.strip())
        feat_ls = []
        for tag, _id in dic.items():
            if tag not in tb_dic:
                continue
            tb_name = tb_dic[tag]
            cursor.execute(sql.format(tb_name, _id))
            res = cursor.fetchone()
            __feat_ls = feat_gen.generate_feature_with_food_dic(res, tag)
            # print('__feat_ls', len(__feat_ls))
            feat_ls.extend(__feat_ls)

        # print(len(feat_ls), feat_ls)
        label_ls = cluster_obj.cluster(feat_ls)
        res_dic = OrderedDict()
        for __feat_ls, label in zip(feat_ls, label_ls):
            if label not in res_dic:
                res_dic[label] = []
            src, _id, food_dic, food_name = __feat_ls[:4]
            food_dic['__source'] = src
            food_dic['__id'] = _id
            res_dic[label].append(food_dic)
        res_dic = {'id': dic['id'], 'foods': res_dic}
        print(json.dumps(res_dic, ensure_ascii=False).encode('utf8'))
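To make the I/O described in the docstring concrete, a hedged illustration of one input line and the corresponding printed record (all ids and food fields below are invented for illustration):

# One line of merged_shop.json (hypothetical ids):
#   {"id": "shop-001", "baidu": "b-123", "eleme": "e-456", "meituan": "m-789"}
#
# Matching printed output: one JSON object per shop, with the shop's dishes
# grouped by cluster label and each food dict annotated with its source platform:
#   {"id": "shop-001",
#    "foods": {"0": [{..., "__source": "eleme", "__id": "e-456"}, ...],
#              "1": [...]}}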
Example #9
    def reset(self):
        self.stepcount = 0
        self.fg = FeatureGenerator()

        oo = self.e.reset()
        self.lastx = oo[1]
        o = self.obg(oo)
        return o
Example #10
def menu_cluster(fn='../menu_fusion/res_dic.json'):
    '''
    Cluster the dishes of each shop.
    input: fn string -- name of the merged file; one merged shop per line: {"baidu":'', "eleme":'', "meituan":'', "id":''}
    output: prints the result as JSON, one line per shop with its merged dish information
    '''
    sql = "select * from `{}` where id='{}'"
    mysql_obj = get_mysql_obj(os.sep.join([conf_dir, 'db.conf']), 'mysql_online')
    conn = mysql_obj['conn']
    cursor = mysql_obj['cursor']

    feat_gen = FeatureGenerator()
    cluster_obj = SimpleCluster()
 
    tb_dic = {'eleme':'eleme_shop', 'baidu':'baidu_waimai_shop', 'meituan':'meituan_waimai_shop'}
    num = 0
    for line in open(fn, 'r'):
	dic = json.loads(line.strip())
	feat_ls = []
	for tag, _id in dic.items():
	    tb_name = tb_dic[tag]
	    cursor.execute(sql.format(tb_name, _id))
	    res = cursor.fetchone()
	    __feat_ls = feat_gen.generate_feature_with_food_dic(res, tb_name)
#	    print ('__feat_ls', len(__feat_ls))
	    feat_ls.extend(__feat_ls)
#	print (len(feat_ls), feat_ls)
	label_ls = cluster_obj.cluster(feat_ls)
	out_ls = []
	for __feat_ls, label in zip(feat_ls, label_ls):
	    __feat_ls.append(str(label))
	    _feat_ls = []
	    for ss in __feat_ls:
		if not isinstance(ss, str):
		    ss = ss.encode('utf8')
		_feat_ls.append(ss)
	    out_ls.append(_feat_ls)
	out_ls = sorted(out_ls, key=lambda x:x[-1])
	for _feat_ls in out_ls:
	    print ('\t'.join(_feat_ls))
	print ''
	num += 1
	if num == 10:
	    break
Example #11
def test(args):
    print('start testing')

    ddpg = DDPG()
    ddpg.load_model(args.model, load_memory=False)
    env = RunEnv(visualize=args.visualize, max_obstacles=args.max_obs)

    np.random.seed(args.seed)
    for i in range(1):
        step = 0
        state = env.reset(difficulty=2)
        fg = FeatureGenerator()

        state = fg.gen(state)
        #obs = fg.traj[0]
        #print(obs.left_knee_r, obs.right_knee_r)

        ep_reward = 0
        ep_memories = []
        while True:
            action = ddpg.select_action(list(state))
            next_state, reward, done, info = env.step(action.tolist())
            next_state = fg.gen(next_state)

            #obs = fg.traj[0]
            #print(obs.left_knee_r, obs.right_knee_r)

            print('step: {0:03d}'.format(step), end=', action: ')
            for act in action:
                print('{0:.3f}'.format(act), end=', ')
            print()

            state = next_state
            ep_reward += reward
            step += 1

            print('reward:', ep_reward)

            if done:
                break

        print('\nEpisode: {} Reward: {}, n_steps: {}'.format(
            i, ep_reward, step))
Example #12
    def draw_roc_curve_saved_model(self):
        self.logger.println("drawing roc curve from saved model")
        start_time = timeit.default_timer()
        cs = CrfSuite()
        crf = cs.load_model("current_crf_model.pkl")

        dataset = Dataset()
        data = dataset.read(nr_of_files=1000)
        nr_of_filled_lines, data1 = dataset.filter_for_filled_tags(data)
        data2 = dataset.obtain_default_tags(nr_of_filled_lines * 3, data)
        data = data1 + data2
        data = dataset.shuffle_data(data)
        train_set, test_set = dataset.split_dataset(data)

        we_model = WeModel()
        w2v_model = we_model.read()
        we_model = None

        word2count, word2idx = dataset.encode_dataset(train_set)

        f_generator = FeatureGenerator(w2v_model, word2count, word2idx)
        w2v_model = None
        train_features = f_generator.generate_features_docs(train_set)
        y_train = f_generator.generate_true_outcome(train_set)

        test_features = f_generator.generate_features_docs(test_set)
        y_test = f_generator.generate_true_outcome(test_set)
        f_generator = None

        evaluator = Evaluator()
        evaluator.draw_roc_proba(crf, test_features, y_test)
Example #13
def main():
    print("Start data preprocessing and feature extraction..")

    training_fg = FeatureGenerator(dtype='train', n_jobs=4, chunk_size=150000)
    training_data = training_fg.generate()

    test_fg = FeatureGenerator(dtype='test', n_jobs=4, chunk_size=None)
    test_data = test_fg.generate()

    training_data.to_csv("../input/train_features.csv", index=False)
    test_data.to_csv("../input/test_features.csv", index=False)

    X = training_data.iloc[:, :-2].values
    y = training_data.loc[:, "target"].values
    print("Train model")
    model = NeuralNetv1(training_data, y, save=False)
    model.evaluate_model()
    model.train()
    print("Model trained")

    print("Build result and predict for Kaggle")
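    # x_test and y_test are assumed to be prepared elsewhere (e.g. from the
    # generated test features); they are not defined in this snippet.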
    predictor = Predictor(x_test, y_test, model=model.model)
    predictor.predict()

    print("END")
Example #14
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    set_dist_env()
    #------build Tasks------
    model_params = {
        "learning_rate": FLAGS.learning_rate,
        "l2_reg": FLAGS.l2_reg,
        "deep_layers": list(map(int,FLAGS.deep_layers.split(','))),
        "atten_layers":list(map(int,FLAGS.atten_layers.split(','))),
        "dropout": list(map(float,FLAGS.dropout.split(','))),
        "optimizer":FLAGS.optimizer
    }

    if FLAGS.clear_existing_model:
        try:
            shutil.rmtree('./model')
        except Exception as e:
            print(e, "at clear_existing_model")
        else:
            print("existing model cleaned at %s" % FLAGS.model_dir)    

    tr_files = "./data/train.tfrecords"
    va_files ="./data/test.tfrecords"


    fea_json = feature_json('./feature_generator.json')
    fg = FeatureGenerator(fea_json)
    md = DIN(fg)
    model = Model(fg,md)

    config = tf.estimator.RunConfig().replace(session_config = tf.ConfigProto(device_count={'GPU':0, 'CPU':FLAGS.num_threads}),
            log_step_count_steps=FLAGS.log_steps, save_summary_steps=FLAGS.log_steps)
    Estimator = tf.estimator.Estimator(model_fn=model.model_fn, model_dir='./model/', params=model_params, config=config)

    if FLAGS.task_type == 'train':
        train_spec = tf.estimator.TrainSpec(input_fn=lambda: model.input_fn(tr_files, num_epochs=FLAGS.num_epochs, batch_size=FLAGS.batch_size))
        eval_spec = tf.estimator.EvalSpec(input_fn=lambda: model.input_fn(va_files, num_epochs=1, batch_size=FLAGS.batch_size), steps=None, start_delay_secs=1000, throttle_secs=1200)
        tf.estimator.train_and_evaluate(Estimator, train_spec, eval_spec)
    elif FLAGS.task_type == 'eval':
        Estimator.evaluate(input_fn=lambda: model.input_fn(tr_files, num_epochs=1, batch_size=FLAGS.batch_size))
        Estimator.evaluate(input_fn=lambda: model.input_fn(va_files, num_epochs=1, batch_size=FLAGS.batch_size))
    elif FLAGS.task_type == 'infer':
        preds = Estimator.predict(input_fn=lambda: model.input_fn(va_files, num_epochs=1, batch_size=FLAGS.batch_size), predict_keys="prob")
    elif FLAGS.task_type == 'export':
        ## save for stand-alone (single-machine) use
        # print(fg.feature_spec)
        # serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(fg.feature_spec)
        serving_input_receiver_fn = (
            tf.estimator.export.build_raw_serving_input_receiver_fn(fg.feature_placeholders)
        )

        Estimator.export_saved_model(FLAGS.servable_model_dir, serving_input_receiver_fn)
Example #15
    def train_model(self, nr_of_files=-1):
        self.logger.println("train model called")
        start_time = timeit.default_timer()
        cs = CrfSuite()

        dataset = Dataset()
        data = dataset.read(nr_of_files=nr_of_files)
        nr_of_filled_lines, data1 = dataset.filter_for_filled_tags(data)
        data2 = dataset.obtain_default_tags(nr_of_filled_lines * 3, data)
        data = data1 + data2
        data = dataset.shuffle_data(data)
        train_set, test_set = dataset.split_dataset(data)

        we_model = WeModel()
        w2v_model = we_model.train(
            data)  # optionally load a pretrained model here
        we_model.save(w2v_model)
        we_model = None

        word2count, word2idx = dataset.encode_dataset(train_set)

        f_generator = FeatureGenerator(w2v_model, word2count, word2idx)
        w2v_model = None
        train_features = f_generator.generate_features_docs(train_set)
        y_train = f_generator.generate_true_outcome(train_set)

        test_features = f_generator.generate_features_docs(test_set)
        y_test = f_generator.generate_true_outcome(test_set)
        f_generator = None

        model = cs.train_model(train_features, y_train)
        cs.save_model(model)
        y_train_pred = cs.test_model(model, train_features)
        y_test_pred = cs.test_model(model, test_features)

        print("printing training results")
        cs.print_classification_report(dataset.docs2lines(y_train),
                                       y_train_pred)
        score_train = cs.score_model(dataset.docs2lines(y_train), y_train_pred)
        print("training f1 score: %s" % score_train)

        print("printing test results")
        cs.print_classification_report(dataset.docs2lines(y_test), y_test_pred)
        score_test = cs.score_model(dataset.docs2lines(y_test), y_test_pred)
        print("test f1 score: %s" % score_test)

        elapsed_seconds = timeit.default_timer() - start_time
        self.logger.print_time_taken("train model operation took",
                                     elapsed_seconds)

        evaluator = Evaluator()
        evaluator.perform_roc_analysis(dataset.docs2lines(y_train),
                                       y_train_pred)
        evaluator.perform_roc_analysis(dataset.docs2lines(y_test), y_test_pred)
Example #16
def submit(args):
    print('start submitting')

    remote_base = 'http://grader.crowdai.org:1733'
    client = Client(remote_base)

    ddpg = DDPG()
    ddpg.load_model(args.model, load_memory=False)

    state = client.env_create(TOKEN)
    fg = FeatureGenerator()
    state = fg.gen(state)

    step = 0
    ep_reward = 0

    while True:
        print('selecting action ...', end=' ')
        action = ddpg.select_action(list(state))

        print('client.env_step ...')
        next_state, reward, done, info = client.env_step(action.tolist())
        next_state = fg.gen(next_state)

        print('step: {0:03d}, ep_reward: {1:02.08f}'.format(step, ep_reward))
        state = next_state
        ep_reward += reward
        step += 1

        if done:
            print('done')
            state = client.env_reset()
            if not state:
                break

            step = 0
            ep_reward = 0

            fg = FeatureGenerator()
            state = fg.gen(state)

    client.submit()
Example #17
#from pandas.tseries.offsets import CustomBusinessDay
#us_cal = CustomBusinessDay(calendar=USFederalHolidayCalendar())

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier

if __name__ == "__main__":
    plotIt = PlotUtility()
    timeUtil = TimeUtility()
    ct = ComputeTarget()
    candle_ind = CandleIndicators()
    dSet = DataRetrieve()
    taLibMomSt = TALibMomentumStudies()
    transf = Transformers()
    modelUtil = ModelUtility()
    featureGen = FeatureGenerator()

    issue = "TLT"
    # Set IS-OOS parameters
    pivotDate = datetime.date(2018, 4, 2)
    is_oos_ratio = 2
    oos_months = 3
    segments = 4

    dataSet = dSet.read_issue_data(issue)

    # get first data from loaded data instead of hard coding start date
    dataSet = dSet.set_date_range(dataSet, "2014-09-26", pivotDate)

    #set beLong level
    beLongThreshold = 0.000
Example #18
                                             control_flow_p, time_p,
                                             resource_p, data_p, transition)

    testset_dir = "../testsets"

    log_config = {
        "control_flow_p": control_flow_p,
        "time_p": time_p,
        "resource_p": resource_p,
        "data_p": data_p,
        "transition": transition
    }

    # load data
    print("flag: loading data")
    fg = FeatureGenerator()
    df = fg.create_initial_log(filename, log_config)
    print("done")

    num_events = len(df)
    num_cases = len(set(df["id"]))

    # feature generation
    print("flag: generating features")
    if task == 'next_activity':
        loss = 'categorical_crossentropy'
        regression = False
        feature_type_list = ["activity_history"]
        df = fg.add_activity_history(df)
        df = fg.add_next_activity(df)
Example #19
    filename = args.data_dir + args.data_set
    model_name = args.data_set + args.task

    contextual_info = args.contextual_info
    if args.task == 'next_activity':
        loss = 'categorical_crossentropy'
        regression = False
    elif args.task == 'next_timestamp':
        loss = 'mae'
        regression = True

    batch_size = args.batch_size_train
    num_folds = args.num_folds

    # load data
    FG = FeatureGenerator()
    df = FG.create_initial_log(filename)

    #split train and test
    #train_df, test_df = FG.train_test_split(df, 0.7, 0.3)
    train_df = df
    test_df = train_df
    #create train
    train_df = FG.order_csv_time(train_df)
    train_df = FG.queue_level(train_df)
    train_df.to_csv('./training_data.csv')
    state_list = FG.get_states(train_df)
    train_X, train_Y_Event, train_Y_Time = FG.one_hot_encode_history(
        train_df, args.checkpoint_dir + args.data_set)
    if contextual_info:
        train_context_X = FG.generate_context_feature(train_df, state_list)
Example #20
def cross_datasets(direction='dated-->non_dated',
                   day=15,
                   month=7,
                   year=2018,
                   randomise=False,
                   non_dated_dataset='url_data_mega_deep_learning.csv'):
    """
    Trains the model on a dataset and predicts on the other. Check above functions for the details of the parameters.
    """
    # Parameters definition for non-dated dataset
    if non_dated_dataset == 'url_data_mega_deep_learning.csv':
        to_binarize = False
        label_column_name = 'isMalicious'
        url_column_name = 'url'
    elif non_dated_dataset == 'simple.csv':
        to_binarize = True
        label_column_name = 'label'
        url_column_name = 'url'

    if direction == 'non_dated-->dated':
        # TRAINING
        # Data
        urls, labels = load_data(os.path.join("datasets", non_dated_dataset),
                                 url_column_name=url_column_name,
                                 label_column_name=label_column_name,
                                 to_binarize=to_binarize)

        # Features
        feature_generator = FeatureGenerator()
        one_hot_urls = feature_generator.one_hot_encoding(urls)
        X_train, X_val, y_train, y_val = train_test_split(one_hot_urls,
                                                          labels,
                                                          test_size=0.2)

        # Model
        url_detector = UrlDetector("big_conv_nn")
        url_detector.fit(X_train,
                         y_train,
                         epochs=5,
                         batch_size=128,
                         validation_data=(X_val, y_val))

        # TESTING
        # Data
        if randomise:
            urls_1, labels_1, urls_2, labels_2 = load_randomized_dated_data(
                os.path.join("datasets", "bad_urls1.csv"),
                os.path.join("datasets", "good_urls.csv"),
                ratio_good_bad=1,
                ratio_testing_set=0.2)
        else:
            # For 80%/20% : take 15/07/2018
            # For 50%/50% : take 01/03/2018
            urls_1, labels_1, urls_2, labels_2 = load_dated_data(
                os.path.join("datasets", "bad_urls1.csv"),
                os.path.join("datasets", "good_urls.csv"),
                ratio_good_bad=1,
                separation_date=date(year, month, day))
        # Features
        feature_generator = FeatureGenerator()
        one_hot_urls_2 = feature_generator.one_hot_encoding(urls_2)

        # Evaluate
        url_detector.evaluate(one_hot_urls_2, labels_2)
        url_detector.plot_roc_curve(one_hot_urls_2, labels_2)

        plt.show()

    elif direction == 'dated-->non_dated':
        # TRAINING
        # Data
        if randomise:
            training_urls, training_labels, val_urls, val_labels = load_randomized_dated_data(
                os.path.join("datasets", "bad_urls1.csv"),
                os.path.join("datasets", "good_urls.csv"),
                ratio_good_bad=1,
                ratio_testing_set=0.2)
        else:
            training_urls, training_labels, val_urls, val_labels = load_dated_data(
                os.path.join("datasets", "bad_urls1.csv"),
                os.path.join("datasets", "good_urls.csv"),
                ratio_good_bad=1,
                separation_date=date(year, month, day))  # 20% is at 15/07/2018

        # Features
        feature_generator = FeatureGenerator()
        one_hot_training_urls = feature_generator.one_hot_encoding(
            training_urls)
        one_hot_val_urls = feature_generator.one_hot_encoding(val_urls)

        # Model
        url_detector = UrlDetector("big_conv_nn")
        url_detector.fit(one_hot_training_urls,
                         training_labels,
                         epochs=5,
                         batch_size=128,
                         validation_data=(one_hot_val_urls, val_labels))

        # TESTING
        # Data
        urls, labels = load_data(os.path.join("datasets", non_dated_dataset),
                                 url_column_name=url_column_name,
                                 label_column_name=label_column_name,
                                 to_binarize=to_binarize)
        # Features
        size_testing = int(0.21 * len(urls))
        feature_generator = FeatureGenerator()
        one_hot_urls = feature_generator.one_hot_encoding(urls)

        # Evaluate
        url_detector.evaluate(one_hot_urls[:size_testing],
                              labels[:size_testing])
        url_detector.plot_roc_curve(one_hot_urls[:size_testing],
                                    labels[:size_testing])

        plt.show()
Example #21
import os
import constants as CONSTANTS

from pdb_converter import PDBConverter
from data_spliter import DataSpliter
from feature_generator import FeatureGenerator

pdb_converter = PDBConverter(CONSTANTS.ALL_PDB_IDS)
pdb_converter.apply()

feature_generator = FeatureGenerator(CONSTANTS.N_PDB_IDS)
feature_generator.one_hot()

data_spliter = DataSpliter(CONSTANTS.N_PDB_IDS)
data_spliter.split()
Example #22
            Computes the precision, a metric for multi-label classification of
            how many selected items are relevant.
            """
            true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
            predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
            precision = true_positives / (predicted_positives + K.epsilon())
            return precision

        precision = precision(y_true, y_pred)
        recall = recall(y_true, y_pred)
        return 2 * ((precision * recall) / (precision + recall + K.epsilon()))


if __name__ == '__main__':
    # Data
    feature_generator = FeatureGenerator()
    urls, labels = feature_generator.load_data(os.path.join(
        "datasets", "url_data_mega_deep_learning.csv"),
                                               url_column_name="url",
                                               label_column_name="isMalicious",
                                               to_binarize=False)
    urls, labels = shuffle(urls, labels)
    one_hot_urls = feature_generator.one_hot_encoding(urls)

    # Model
    url_detector = UrlDetector("big_conv_nn")
    url_detector.fit(one_hot_urls, labels)
    url_detector.evaluate(one_hot_urls[-100:], labels[-100:])

    plt.show()
    # TODO: test ROC curve, new datasets
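The truncated metric above computes precision as TP / (predicted positives), pairs it with recall, and combines the two as F1 = 2PR / (P + R + eps). A self-contained NumPy sketch of the same arithmetic (the function name, the 0.5 rounding threshold and the example values are illustrative assumptions, not part of the original snippet):

import numpy as np

def f1_score_binary(y_true, y_pred, eps=1e-7):
    # Round predicted probabilities to hard 0/1 labels, mirroring K.round/K.clip above.
    y_pred = np.round(np.clip(y_pred, 0, 1))
    y_true = np.clip(y_true, 0, 1)

    true_positives = np.sum(y_true * y_pred)
    predicted_positives = np.sum(y_pred)
    possible_positives = np.sum(y_true)

    precision = true_positives / (predicted_positives + eps)
    recall = true_positives / (possible_positives + eps)
    return 2 * precision * recall / (precision + recall + eps)

# Example: f1_score_binary(np.array([1, 0, 1, 1]), np.array([0.9, 0.2, 0.4, 0.8])) ~= 0.8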
Example #23
    def __init__(self):
        """
        Initialize all the models and classification pipeline
        """

        # Preprocessor which removes left, right and middle outliers of the wave
        self.preprocessor = Preprocessor()

        # Feature Generator for generating features of a wave
        self.feature_generator = FeatureGenerator()

        # Output arrays
        self.features = []
        self.labels = []
        self.file_names = []

        # Custom scoring module
        self.scorer = Scorer()

        # Feature selection models
        self.feature_selector_1 = SelectFromModel(
            LinearSVC(penalty="l1", dual=False))
        self.feature_selector_2 = SelectFromModel(
            LinearSVC(penalty="l1", dual=False))

        # Classification models
        # clf_1 = DecisionTreeClassifier()
        #
        # params = {"criterion": ["gini", "entropy"],
        #           "min_samples_split": [2, 10, 20],
        #           "max_depth": [None, 2, 5, 10],
        #           "min_samples_leaf": [1, 5, 10],
        #           "max_leaf_nodes": [None, 5, 10, 20],
        #           }

        clf_1 = AdaBoostClassifier()
        base_classifier_1 = RandomForestClassifier()

        # Best Classifier 1
        # {'n_estimators': 40,
        #  'base_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
        #                                           max_depth=None, max_features='auto', max_leaf_nodes=None,
        #                                           min_impurity_split=1e-07, min_samples_leaf=1,
        #                                           min_samples_split=2, min_weight_fraction_leaf=0.0,
        #                                           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
        #                                           verbose=0, warm_start=False), 'learning_rate': 0.85000000000000009}
        # Best
        # Classifier
        # 2
        # {'n_estimators': 30,
        #  'base_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
        #                                           max_depth=None, max_features='auto', max_leaf_nodes=None,
        #                                           min_impurity_split=1e-07, min_samples_leaf=1,
        #                                           min_samples_split=2, min_weight_fraction_leaf=0.0,
        #                                           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
        #                                           verbose=0, warm_start=False), 'learning_rate': 0.80000000000000004}

        # params = {
        #     "base_estimator": [base_classifier_1],
        #     "n_estimators": range(30, 61, 10),
        #     "learning_rate": np.arange(0.8, 1.01, 0.05),
        # }
        optimal_params = {
            "base_estimator": [base_classifier_1],
            'n_estimators': [40],
            'learning_rate': [0.85000000000000009]
        }

        self.classifier_1 = GridSearchCV(clf_1,
                                         param_grid=optimal_params,
                                         cv=10,
                                         scoring=make_scorer(
                                             self.scorer.score),
                                         verbose=10)

        # clf_2 = DecisionTreeClassifier()
        clf_2 = AdaBoostClassifier()
        base_classifier_2 = RandomForestClassifier()

        # params = {
        #     "base_estimator": [base_classifier_2],
        #     "n_estimators": range(30, 61, 10),
        #     "learning_rate": np.arange(0.8, 1.01, 0.05),
        # }

        optimal_params = {
            "base_estimator": [base_classifier_2],
            'n_estimators': [30],
            'learning_rate': [0.80000000000000004]
        }

        # params = {"criterion": ["gini", "entropy"],
        #           "min_samples_split": [2, 10, 20],
        #           "max_depth": [None, 2, 5, 10],
        #           "min_samples_leaf": [1, 5, 10],
        #           "max_leaf_nodes": [None, 5, 10, 20],
        #           }
        self.classifier_2 = GridSearchCV(clf_2,
                                         param_grid=optimal_params,
                                         cv=2,
                                         scoring=make_scorer(
                                             self.scorer.score),
                                         verbose=10)

        # Pipeline initializations
        self.pipeline_1 = Pipeline([('feature_selector',
                                     self.feature_selector_1),
                                    ('clf', self.classifier_1)])
        self.pipeline_2 = Pipeline([('feature_selector',
                                     self.feature_selector_2),
                                    ('clf', self.classifier_2)])
Example #24
class TrainingSampleGenerator(object):
    def __init__(self, svm_feature=False):
        super(TrainingSampleGenerator, self).__init__()
        self.training_samples = []
        self.processed_training_data = []   # format: tuple([feature vector] , label)
        self.feature_generator = FeatureGenerator()
        self.svm_feature = svm_feature

    def parse_from_text(self, path):
        fp = open(path, 'r')
        counter = 0
        lname = None
        root_url = None
        target_url = None

        for line in fp:
            counter += 1
            if counter == 4:
                data = Data(lname, root_url, target_url)
                self.training_samples.append(data)
                counter = 0
                continue

            elif counter == 1:
                lname = line.lower().strip()
            elif counter == 2:
                root_url = line.strip()
            elif counter == 3:
                target_url = line.strip()


    def generate(self, path):   # process and generate training data
        self.parse_from_text(path)

        for raw_training_data in self.training_samples:
            self.processed_training_data +=  self.convert_raw_training_data_to_features(raw_training_data)

        print self.processed_training_data


    def prepare_libsvm(self, data_lst, path):
        fw = open(path, 'w')
        for tp in data_lst:
            feature_lst = tp[0]
            label = tp[1]

            line = str(label) + ' '
            for i in xrange(len(feature_lst)):
                line += str(i+1) + ':' + str(feature_lst[i]) + ' '
            fw.write(line + '\n')
        fw.close()


    def serialize(self):
        fp = open('processed_training_data.pkl', 'wb')
        pickle.dump(self.processed_training_data, fp)
        fp.close()

    def deserialize(self):
        # self.processed_training_data = pickle.load(open('processed_training_data.pkl', 'rb'))
        self.processed_training_data = pickle.load(open('processed_training_data.pkl.bak', 'rb'))
        # self.processed_training_data = pickle.load(open('processed_training_data.pkl.svm.full', 'rb'))
        # print self.processed_training_data
        print len(self.processed_training_data)


    def convert_raw_training_data_to_features(self, train_data):
        q = Queue()
        url_deduplicator = {}
        DELIMITER = ('###DELIMITER###', '', -1)

        lname = train_data.lname
        root_url = train_data.root_url
        target_url = train_data.target_url
        self.feature_generator.setup(lname, root_url)

        training_data_features = []

        q.push( (root_url, '', 0) )
        q.push(DELIMITER)
        level_count = 0
        stop_pushing_flag = False

        while not q.is_empty():
            cur_url_info = q.pop()
            print cur_url_info
            print target_url, len(target_url)

            if cur_url_info == DELIMITER:   #if delimiter, increment level counter
                level_count += 1
                if level_count == 1:     # dig no more than 1 level
                    stop_pushing_flag = True
                else:
                    continue

            if url_deduplicator.has_key(url_cleanup(cur_url_info[0])):  # if appeared, ignore
                continue
            else:
                url_deduplicator[url_cleanup(cur_url_info[0])] = True

            self.feature_generator.generate_features(cur_url_info, self.svm_feature)

            if self.feature_generator.features:     # if features is [], the previous link was invalid
                if url_cleanup(cur_url_info[0]) == target_url:
                    training_data_features.append( (self.feature_generator.features, 1) )
                    print '\t\t', (self.feature_generator.features, 1)
                else:
                    training_data_features.append( (self.feature_generator.features, 0) )
                    print '\t\t', (self.feature_generator.features, 0)

                # Push sublinks
                if not stop_pushing_flag:
                    try:
                        raw_html = urllib2.urlopen(cur_url_info[0], timeout=3).read()
                        soup = BeautifulSoup(raw_html)
                        a_lst = soup.find_all('a', href=True)
                        for anchor in a_lst:
                            # new_url = urllib2.urlparse.urljoin(cur_url_info[0], anchor.attrs['href'])
                            new_url = get_absolute_url(cur_url_info[0], anchor.attrs['href'])
                            if under_same_maindomain(cur_url_info[0], new_url):
                                q.push( (new_url, anchor.text, level_count+1) )
                        if cur_url_info[2] == 0:
                            q.push(DELIMITER)
                    except:
                        pass

        self.feature_generator.close_up()
        return training_data_features



    def five_folds(self):
        """
            Format:
                [
                    (
                        [([feature vector], label), (), ...],     #training set 80%     fold[0]
                        [([feature vector], label), (), ...]      #testing set 20%      fold[1]
                    )  #1st fold,
                    
                    ()....
                ]
        """
        total_len = len(self.processed_training_data)
        five_folds = []
        test_set_len = total_len - 4 * (total_len/5)

        print 'total len: ', total_len

        for i in xrange(5):
            test_set = []
            for j in xrange(test_set_len):
                idx = i * (total_len/5) + j
                test_set.append(self.processed_training_data[idx])
            training_set = self.processed_training_data[: i*(total_len/5)] + self.processed_training_data[i*(total_len/5) + test_set_len :]
            self.prepare_libsvm(training_set, config.LIBSVM_TRAINING_FOLDS[i])
            self.prepare_libsvm(test_set, config.LIBSVM_TESTING_FOLDS[i])
            five_folds.append( (training_set, test_set) )

        return five_folds
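A brief sketch of how the folds returned in the format documented above might be consumed (the classifier step is left as a comment; nothing here is part of the original class):

gen = TrainingSampleGenerator(svm_feature=True)
gen.deserialize()  # load processed_training_data from disk
for training_set, test_set in gen.five_folds():
    train_X = [features for features, label in training_set]
    train_y = [label for features, label in training_set]
    test_X = [features for features, label in test_set]
    test_y = [label for features, label in test_set]
    # ...train any classifier on (train_X, train_y) and evaluate it on (test_X, test_y)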
Example #25
def predict_on_dated_dataset(day=15,
                             month=7,
                             year=2018,
                             randomise=False,
                             ratio_good_bad=1,
                             ratio_testing_set=0.2,
                             validation_split=0.2,
                             reverse=False):
    """
    Prediction on the dated dataset.

    Parameters
    ----------
    day, month, year
        Limit date: the network is trained on data dated before this day and tested on data from this day and newer.
        The proportion training/testing is at 80%/20% for the date 15/07/2018.
        Only used if 'randomise' = False.
    randomise
        Whether to randomise the set so as to use or not the date of the data.
    ratio_good_bad
        Ratio of (Good Data)/(Bad Data).
    ratio_testing_set
        Represents the proportion of the dataset to include in the test split. Only used if 'randomise' = True
    validation_split
        % of data to put in the validation set.
    reverse
        True to train on newer data and test on older. Only used if 'randomise' = False.
    """
    # Data
    if randomise:
        training_urls, training_labels, testing_urls, testing_labels = load_randomized_dated_data(
            os.path.join("datasets", "bad_urls1.csv"),
            os.path.join("datasets", "good_urls.csv"),
            ratio_good_bad=ratio_good_bad,
            ratio_testing_set=ratio_testing_set)
    else:
        training_urls, training_labels, testing_urls, testing_labels = load_dated_data(
            os.path.join("datasets", "bad_urls1.csv"),
            os.path.join("datasets", "good_urls.csv"),
            ratio_good_bad=ratio_good_bad,
            separation_date=date(year, month, day))  # 20% is at 15/07/2018

        if reverse:
            training_urls, testing_urls = testing_urls, training_urls
            training_labels, testing_labels = testing_labels, training_labels

    # Features
    feature_generator = FeatureGenerator()
    one_hot_training_urls = feature_generator.one_hot_encoding(training_urls)
    one_hot_testing_urls = feature_generator.one_hot_encoding(testing_urls)

    # Model
    url_detector = UrlDetector("big_conv_nn")
    url_detector.fit(one_hot_training_urls,
                     training_labels,
                     epochs=5,
                     batch_size=128,
                     validation_split=validation_split)

    # Evaluate
    url_detector.evaluate(one_hot_testing_urls, testing_labels)
    url_detector.plot_roc_curve(one_hot_testing_urls, testing_labels)

    plt.show()

    url_detector.evaluate(one_hot_training_urls + one_hot_testing_urls,
                          training_labels + testing_labels)
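A hedged usage sketch exercising the two modes described in the docstring above (the argument values are just the documented defaults):

# Date-based split: train on URLs dated before 15/07/2018, test on newer ones (~80%/20%).
predict_on_dated_dataset(day=15, month=7, year=2018, randomise=False)

# Randomised split instead of a date split, holding out 20% of the data for testing.
predict_on_dated_dataset(randomise=True, ratio_testing_set=0.2)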
Example #26
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    set_dist_env()
    #------build Tasks------
    model_params = {
        "learning_rate": FLAGS.learning_rate,
        "l2_reg": FLAGS.l2_reg,
        "fcn_layers": list(map(int,FLAGS.fcn_layers.split(','))),
        "atten_layers":list(map(int,FLAGS.atten_layers.split(','))),
        "auxili_layers":list(map(int,FLAGS.auxili_layers.split(','))),
        "dropout": list(map(float,FLAGS.dropout.split(','))),
        "optimizer":FLAGS.optimizer,
        "neg_count":FLAGS.neg_count
    }
    #### negative sampling source 
    model_params["mid_cat"] = cPickle.load(open("data/mid_cat.pkl", "rb"))

    tr_files = "./data/train.tfrecords"
    va_files ="./data/test.tfrecords"

    fea_json = feature_json('./feature_generator.json')
    fg = FeatureGenerator(fea_json)
    md = DIEN(fg)
    model = Model(fg,md)

    config = tf.estimator.RunConfig().replace(
        session_config = tf.ConfigProto(device_count={'GPU':0, 'CPU':FLAGS.num_threads}),
        log_step_count_steps=FLAGS.log_steps, 
        save_summary_steps=FLAGS.log_steps,
        save_checkpoints_secs=FLAGS.save_checkpoints_secs)
    Estimator = tf.estimator.Estimator(
        model_fn=model.model_fn, model_dir='./model/', 
        params=model_params, 
        config=config)

    if FLAGS.task_type == 'train':
        train_spec = tf.estimator.TrainSpec(
            input_fn=lambda: model.input_fn(
                tr_files, 
                num_epochs=FLAGS.num_epochs, 
                batch_size=FLAGS.batch_size))
        eval_spec = tf.estimator.EvalSpec(
            input_fn=lambda: model.input_fn(
                va_files, 
                num_epochs=1, 
                batch_size=FLAGS.batch_size), 
            steps=None, 
            start_delay_secs=10, 
            throttle_secs=FLAGS.save_checkpoints_secs)
        tf.estimator.train_and_evaluate(
            Estimator, 
            train_spec, 
            eval_spec)
    elif FLAGS.task_type == 'eval':
        Estimator.evaluate(
            input_fn=lambda: model.input_fn(
                tr_files, 
                num_epochs=1, 
                batch_size=FLAGS.batch_size))
        Estimator.evaluate(
            input_fn=lambda: model.input_fn(
                va_files, 
                num_epochs=1, 
                batch_size=FLAGS.batch_size))
    elif FLAGS.task_type == 'infer':
        preds = Estimator.predict(
            input_fn=lambda: model.input_fn(
                va_files, 
                num_epochs=1, 
                batch_size=FLAGS.batch_size), 
            predict_keys="prob")
    elif FLAGS.task_type == 'export':
        ## save for stand-alone (single-machine) use
        # print(fg.feature_spec)
        # serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(fg.feature_spec)
        serving_input_receiver_fn = (
            tf.estimator.export.build_raw_serving_input_receiver_fn(fg.feature_placeholders)
        )

        Estimator.export_saved_model(
            FLAGS.servable_model_dir, 
            serving_input_receiver_fn)
Example #27
class CrfSuite(Tags):
    __seperator = "/"
    __crf_model_name = "current_crf_model.pkl"

    def __init__(self):
        self.logger = Logger()
        self.logger.println("CrfSuite created")

    def train_model(self, X, y):
        self.logger.println("transforming data to train model")
        X_combined = list(chain.from_iterable(X))
        y_combined = list(chain.from_iterable(y))

        self.logger.println("crf trainer init")
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=0.35,
                                   c2=0.35,
                                   max_iterations=125,
                                   all_possible_transitions=True,
                                   verbose=False)
        crf.fit(X_combined, y_combined)
        return crf

    def save_model(self, model, name=__crf_model_name):
        joblib.dump(model, name)

    def load_model(self, name=__crf_model_name):
        return joblib.load(name)

    def score_model(self, y_true, y_pred):
        lb = LabelBinarizer()

        y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
        y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

        tagset = set(lb.classes_) - {'O'}
        tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])

        class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

        return f1_score(y_true_combined,
                        y_pred_combined,
                        average="weighted",
                        labels=[class_indices[cls] for cls in tagset])

    def print_classification_report(self, y_true, y_pred):
        lb = LabelBinarizer()

        y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
        y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

        tagset = set(lb.classes_) - {'O'}
        tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])

        class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

        # TODO return f1 score or other wise here: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
        print(
            classification_report(
                y_true_combined,
                y_pred_combined,
                labels=[class_indices[cls] for cls in tagset],
                target_names=tagset))

    def load_tagger(self):
        self.__trained_tagger = pycrfsuite.Tagger()
        self.__trained_tagger.open('test_NER.crfsuite')

        we_model = WeModel()
        self.w2v_model = we_model.read()
        dataset = Dataset()
        data = dataset.read(nr_of_files=-1)
        word2count, word2idx = dataset.encode_dataset(data)
        self.f_generator = FeatureGenerator(self.w2v_model, word2count,
                                            word2idx)

    # doc: in format of tagged tuples
    def tag_doc(self, doc):
        feature_input = self.f_generator.generate_features_docs([doc])
        model = self.load_model()
        """
        xseq = []
        for line_idx, line in enumerate(feature_input[0]):
            for token_idx, token in enumerate(line):
                xseq.append(token)
        """

        predicted_tags = model.predict(feature_input[0])

        # TODO change this to take in doc and not xseq (convert predicted tags
        # to the structure of doc)
        return self.interpret_predicted_tags(doc, predicted_tags)

    def interpret_predicted_tags(self, doc, tags):
        dataset = Dataset()
        identified_entities = []
        doc = dataset.docs2lines(doc)
        tags = dataset.docs2lines(tags)
        for tag_idx, tag in enumerate(tags):
            if tag in Tags.start_tagset:
                entity_found = ""
                tag_idx_forward = tag_idx
                while True:
                    if tag_idx_forward >= len(tags) or tags[
                            tag_idx_forward] == self._Tags__outside_tag:
                        break
                    entity_found = entity_found + " " + doc[tag_idx_forward][0]
                    #entity_found = entity_found + " " + doc[line_idx]['word']
                    tag_idx_forward += 1

                identified_entities.append((entity_found, tags[tag_idx]))

        return identified_entities

    # use an existing model to tag data
    def test_model(self, model, features):
        X_features = list(chain.from_iterable(features))
        y_pred = model.predict(X_features)
        return y_pred

    # hyperparameter optimisation
    def optimise_model(self, X, y):
        # prepare data structure
        xseq = []
        yseq = []
        # transform data structure to group tokens by lines
        for doc_x, doc_y in zip(X, y):
            for line_idx, line in enumerate(doc_x):
                xseq.append(line)
                yseq.append(doc_y[line_idx])

        # define fixed parameters and parameters to search
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   max_iterations=200,
                                   all_possible_transitions=True,
                                   verbose=True)
        params_space = {
            'c1': scipy.stats.expon(scale=0.03),
            'c2': scipy.stats.expon(scale=0.03),
        }

        labels = Tags.tag_list
        labels.remove('O')
        # use the same metric for evaluation
        f1_scorer = make_scorer(metrics.flat_f1_score,
                                average='weighted',
                                labels=labels)

        # search
        rs = RandomizedSearchCV(crf,
                                params_space,
                                cv=5,
                                verbose=1,
                                n_jobs=2,
                                n_iter=50,
                                scoring=f1_scorer)
        rs.fit(xseq, yseq)

        print('best params:', rs.best_params_)
        print('best CV score:', rs.best_score_)
        print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ /
                                            1000000))

        _x = [s.parameters['c1'] for s in rs.grid_scores_]
        _y = [s.parameters['c2'] for s in rs.grid_scores_]
        _c = [s.mean_validation_score for s in rs.grid_scores_]

        fig = plt.figure()
        fig.set_size_inches(12, 12)
        ax = plt.gca()
        ax.set_yscale('log')
        ax.set_xscale('log')
        ax.set_xlabel('C1')
        ax.set_ylabel('C2')
        ax.set_title(
            "Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})"
            .format(min(_c), max(_c)))

        ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0, 0, 0])

        print("Dark blue => {:0.4}, dark red => {:0.4}".format(
            min(_c), max(_c)))
        plt.show()

    def plot_learning_curve(self, X, y):
        train_sizes = np.linspace(.1, 1.0, 5)
        n_jobs = 8
        title = "Learning Curves"
        cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
        plt.figure()
        plt.title(title)
        ylim = (0.01, 1.01)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel("Training examples")
        plt.ylabel("Score")

        X_lines = []
        y_lines = []
        for doc_x, doc_y in zip(X, y):
            for line_idx, line in enumerate(doc_x):
                X_lines.append(line)
                y_lines.append(doc_y[line_idx])

        estimator = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                         c1=0.001,
                                         c2=0.001,
                                         max_iterations=110,
                                         all_possible_transitions=True,
                                         verbose=True)
        custom_scorer = make_scorer(self.score_model, greater_is_better=True)

        #train_sizes, train_scores, test_scores = learning_curve(estimator, X_lines, y_lines, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
        train_sizes, train_scores, test_scores = learning_curve(
            estimator,
            X_lines,
            y_lines,
            cv=cv,
            scoring=custom_scorer,
            n_jobs=n_jobs,
            train_sizes=train_sizes)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.grid()

        plt.fill_between(train_sizes,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1,
                         color="r")
        plt.fill_between(train_sizes,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1,
                         color="g")
        plt.plot(train_sizes,
                 train_scores_mean,
                 'o-',
                 color="r",
                 label="Training score")
        plt.plot(train_sizes,
                 test_scores_mean,
                 'o-',
                 color="g",
                 label="Cross-validation score")

        plt.legend(loc="best")
        return plt
Example #28
class magicalObject:
    def __init__(self):
        print('Initializing...')
        self.client_credentials_manager = SpotifyClientCredentials(
            client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
        self.spotify_api = spotipy.Spotify(
            client_credentials_manager=self.client_credentials_manager)
        print('Training...')
        self.gen = FeatureGenerator(spotify=self.spotify_api)
        self.X = [json.loads(d.strip()) for d in open('./training.data', 'r')]
        self.classifier = BallTree([x[4] for x in self.X])
        print('Training complete.')

    def add_track_info_to_results(self, results):
        for i, arr in enumerate(results):
            username = arr[3]
            playlist_id = arr[0]
            playlist_search = self.spotify_api.user_playlist(
                username, playlist_id)
            playlist_url = playlist_search['external_urls']['spotify']
            ps = playlist_search['tracks']['items']
            arr.append(playlist_url)
            sub_list = []
            for item in ps:
                track_name = item['track']['name']
                artist_name = item['track']['artists'][0]['name']
                track_url = item['track']['external_urls']['spotify']
                sub_list.append([track_name, artist_name, track_url])
            arr.append(sub_list)
            results[i] = json.dumps(arr)
        # return results

    def classify_playlist(self, playlist_uri):
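        # Assumes a URI of the form 'spotify:user:<username>:playlist:<playlist_id>'.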
        parts = playlist_uri.split(':')
        user = parts[2]
        playlist = parts[4]
        p = self.spotify_api.user_playlist(user, playlist)
        feature_vec = [p['id'], p['name'], p['uri'], user, self.gen.process(p)]
        dist, ind = self.classifier.query([feature_vec[4]], k=5)
        results = []
        for i in range(len(ind[0])):
            results.append(self.X[ind[0][i]])
        self.add_track_info_to_results(results)
        return results

    def classify_song(self, song_uri):
        parts = song_uri.split(':')
        track = self.spotify_api.track(parts[-1])
        p = {'tracks': {'items': [{'track': track}]}}
        feature_vec = [
            track['id'], track['name'], track['uri'], None,
            self.gen.process(p)
        ]
        dist, ind = self.classifier.query([feature_vec[4]], k=5)
        results = []
        for i in range(len(ind[0])):
            results.append(self.X[ind[0][i]])
        self.add_track_info_to_results(results)
        return results

    def search_for_tracks(self, string_query):
        results = self.spotify_api.search(string_query)['tracks']['items']
        return_list = []
        for result in results:
            return_list.append({
                'song_id':
                result['id'],
                'song_name':
                result['name'],
                'artist_name':
                result['album']['artists'][0]['name'],
                'artist_id':
                result['album']['artists'][0]['id'],
                'song_uri':
                result['uri']
            })
        return return_list

    def search_for_playlists(self, string_query):
        results = self.spotify_api.search(string_query, type='playlist')
        return results
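
The "training" step in this class is just building a BallTree over precomputed playlist feature vectors, and classification is a k-nearest-neighbour query against it. A self-contained sketch of that lookup pattern (the random vectors below stand in for the output of FeatureGenerator.process and are not real Spotify features):

import numpy as np
from sklearn.neighbors import BallTree

# Each row stands in for one playlist's feature vector.
library_features = np.random.rand(100, 8)
tree = BallTree(library_features)

# Query the 5 nearest playlists to a new feature vector, as classify_playlist
# and classify_song do; ind[0] indexes back into the training library.
query_vector = np.random.rand(1, 8)
dist, ind = tree.query(query_vector, k=5)
nearest = [library_features[i] for i in ind[0]]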
    def __init__(self, svm_feature=False):
        super(TrainingSampleGenerator, self).__init__()
        self.training_samples = []
        self.processed_training_data = []  # format: tuple([feature vector], label)
        self.feature_generator = FeatureGenerator()
        self.svm_feature = svm_feature
    def perform_bootstrapping(self, dataset, sample_size, iterations):
        """
        Bootstraps the dataset n times, averaging precision, recall, F1, TPR and
        FPR for each entity. Prints the precision, recall and F1 results and
        plots ROC curves from each entity's TPR and FPR. A sketch of the
        bootstrap/out-of-bag sampling step appears after this method.
        """
        training_scores = []
        test_scores = []

        emp_pos_scores = np.empty(shape=(0, 3), dtype='float64')
        emp_comp_scores = np.empty(shape=(0, 3), dtype='float64')
        edu_major_scores = np.empty(shape=(0, 3), dtype='float64')
        edu_inst_scores = np.empty(shape=(0, 3), dtype='float64')

        mean_fpr = np.linspace(0, 1, 100)
        lb = LabelBinarizer()

        emp_pos_tpr = np.empty(shape=(0, 3), dtype='float64')
        emp_pos_fpr = np.empty(shape=(0, 3), dtype='float64')
        emp_comp_tpr = np.empty(shape=(0, 3), dtype='float64')
        emp_comp_fpr = np.empty(shape=(0, 3), dtype='float64')
        edu_major_tpr = np.empty(shape=(0, 3), dtype='float64')
        edu_major_fpr = np.empty(shape=(0, 3), dtype='float64')
        edu_inst_tpr = np.empty(shape=(0, 3), dtype='float64')
        edu_inst_fpr = np.empty(shape=(0, 3), dtype='float64')

        for x in range(0, iterations):
            print("iteration nr %s" % x)
            sampled_train_set, oob_test_set = self.resample_data(
                dataset, sample_size, return_leftovers=True)
            cs = CrfSuite()
            ds = Dataset()
            we_model = WeModel()
            w2v_model = we_model.train(
                dataset)  # optionally load a pretrained model here
            word2count, word2idx = ds.encode_dataset(sampled_train_set)

            f_generator = FeatureGenerator(w2v_model, word2count, word2idx)

            train_features = f_generator.generate_features_docs(
                sampled_train_set)
            y_train = f_generator.generate_true_outcome(sampled_train_set)

            test_features = f_generator.generate_features_docs(oob_test_set)
            y_test = f_generator.generate_true_outcome(oob_test_set)

            trainer = cs.train_model(train_features, y_train)
            y_train_pred = cs.test_model(trainer, train_features)
            y_test_pred = cs.test_model(trainer, test_features)

            score_train = cs.score_model(ds.docs2lines(y_train), y_train_pred)
            score_test = cs.score_model(ds.docs2lines(y_test), y_test_pred)

            y_true_combined = lb.fit_transform(
                list(chain.from_iterable(ds.docs2lines(y_test))))
            y_pred_combined = lb.transform(
                list(chain.from_iterable(y_test_pred)))

            class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
            """
            # fpr and tpr for one class
            temp_fpr, temp_tpr, _ = roc_curve(y_true_combined[:, class_indices["B-EMP-POS"]], y_pred_combined[:, class_indices["B-EMP-POS"]], pos_label=1)
            temp_fpr1, temp_tpr1, _ = roc_curve(y_true_combined[:, class_indices["I-EMP-POS"]], y_pred_combined[:, class_indices["I-EMP-POS"]], pos_label=1)
            temp_fpr = np.vstack([temp_fpr, temp_fpr1])
            temp_tpr = np.vstack([temp_tpr, temp_tpr1])
            emp_pos_tpr = np.vstack([emp_pos_tpr, temp_tpr.mean(axis=0)])
            emp_pos_fpr = np.vstack([emp_pos_fpr, temp_fpr.mean(axis=0)])
            temp_fpr = temp_tpr = temp_fpr1 = temp_tpr1 = np.empty(shape=(0,3),dtype='float64')

            temp_fpr, temp_tpr, _ = roc_curve(y_true_combined[:, class_indices["B-EMP-COMP"]], y_pred_combined[:, class_indices["B-EMP-COMP"]], pos_label=1)
            temp_fpr1, temp_tpr1, _ = roc_curve(y_true_combined[:, class_indices["I-EMP-COMP"]], y_pred_combined[:, class_indices["I-EMP-COMP"]], pos_label=1)
            temp_fpr = np.vstack([temp_fpr, temp_fpr1])
            temp_tpr = np.vstack([temp_tpr, temp_tpr1])
            emp_comp_tpr = np.vstack([emp_comp_tpr, temp_tpr.mean(axis=0)])
            emp_comp_fpr = np.vstack([emp_comp_fpr, temp_fpr.mean(axis=0)])
            temp_fpr = temp_tpr = temp_fpr1 = temp_tpr1 = np.empty(shape=(0,3),dtype='float64')

            temp_fpr, temp_tpr, _ = roc_curve(y_true_combined[:, class_indices["B-EDU-MAJOR"]], y_pred_combined[:, class_indices["B-EDU-MAJOR"]], pos_label=1)
            temp_fpr1, temp_tpr1, _ = roc_curve(y_true_combined[:, class_indices["I-EDU-MAJOR"]], y_pred_combined[:, class_indices["I-EDU-MAJOR"]], pos_label=1)
            temp_fpr = np.vstack([temp_fpr, temp_fpr1])
            temp_tpr = np.vstack([temp_tpr, temp_tpr1])
            edu_major_tpr = np.vstack([edu_major_tpr, temp_tpr.mean(axis=0)])
            edu_major_fpr = np.vstack([edu_major_fpr, temp_fpr.mean(axis=0)])
            temp_fpr = temp_tpr = temp_fpr1 = temp_tpr1 = np.empty(shape=(0,3),dtype='float64')

            temp_fpr, temp_tpr, _ = roc_curve(y_true_combined[:, class_indices["B-EDU-INST"]], y_pred_combined[:, class_indices["B-EDU-INST"]], pos_label=1)
            temp_fpr1, temp_tpr1, _ = roc_curve(y_true_combined[:, class_indices["I-EDU-INST"]], y_pred_combined[:, class_indices["I-EDU-INST"]], pos_label=1)
            temp_fpr = np.vstack([temp_fpr, temp_fpr1])
            temp_tpr = np.vstack([temp_tpr, temp_tpr1])
            edu_inst_tpr = np.vstack([edu_inst_tpr, temp_tpr.mean(axis=0)])
            edu_inst_fpr = np.vstack([edu_inst_fpr, temp_fpr.mean(axis=0)])
            """

            emp_pos_scores = np.vstack([
                emp_pos_scores,
                self.entity_scorer(ds.docs2lines(y_test), y_test_pred,
                                   "EMP-POS")
            ])
            emp_comp_scores = np.vstack([
                emp_comp_scores,
                self.entity_scorer(ds.docs2lines(y_test), y_test_pred,
                                   "EMP-COMP")
            ])
            edu_major_scores = np.vstack([
                edu_major_scores,
                self.entity_scorer(ds.docs2lines(y_test), y_test_pred,
                                   "EDU-MAJOR")
            ])
            edu_inst_scores = np.vstack([
                edu_inst_scores,
                self.entity_scorer(ds.docs2lines(y_test), y_test_pred,
                                   "EDU-INST")
            ])

            w2v_model = None
            train_features = test_features = None
        """
        print("EMP-POS")
        print("precision %s" % np.mean(emp_pos_scores[:,0]))
        print("recall %s" % np.mean(emp_pos_scores[:,1]))
        print("f1 %s" % np.mean(emp_pos_scores[:,2]))

        print("EMP-COMP")
        print("precision %s" % np.mean(emp_comp_scores[:,0]))
        print("recall %s" % np.mean(emp_comp_scores[:,1]))
        print("f1 %s" % np.mean(emp_comp_scores[:,2]))

        print("EDU-MAJOR")
        print("precision %s" % np.mean(edu_major_scores[:,0]))
        print("recall %s" % np.mean(edu_major_scores[:,1]))
        print("f1 %s" % np.mean(edu_major_scores[:,2]))

        print("EDU-INST")
        print("precision %s" % np.mean(edu_inst_scores[:,0]))
        print("recall %s" % np.mean(edu_inst_scores[:,1]))
        print("f1 %s" % np.mean(edu_inst_scores[:,2]))

        emp_pos_tpr = emp_pos_tpr.mean(axis=0)
        emp_pos_fpr = emp_pos_fpr.mean(axis=0)

        emp_comp_tpr = emp_comp_tpr.mean(axis=0)
        emp_comp_fpr = emp_comp_fpr.mean(axis=0)

        edu_major_tpr = edu_major_tpr.mean(axis=0)
        edu_major_fpr = edu_major_fpr.mean(axis=0)

        edu_inst_tpr = edu_inst_tpr.mean(axis=0)
        edu_inst_fpr = edu_inst_fpr.mean(axis=0)

        lw=2
        plt.subplot(221)
        plt.plot(emp_pos_fpr, emp_pos_tpr, color='g', linestyle='--', label='EMP-POS', lw=lw)
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC')
        plt.legend(loc="lower right")

        plt.subplot(222)
        plt.plot(emp_comp_fpr, emp_comp_tpr, color='g', linestyle='--', label='EMP-COMP', lw=lw)
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC')
        plt.legend(loc="lower right")

        plt.subplot(223)
        plt.plot(edu_major_fpr, edu_major_tpr, color='g', linestyle='--', label='EDU-MAJOR', lw=lw)
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC')
        plt.legend(loc="lower right")

        plt.subplot(224)
        plt.plot(edu_inst_fpr, edu_inst_tpr, color='g', linestyle='--', label='EDU-INST', lw=lw)
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC')
        plt.legend(loc="lower right")

        plt.show()
        """

        return emp_pos_scores, emp_comp_scores, edu_inst_scores, edu_major_scores
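
resample_data is not shown in this example; as referenced in the docstring above, here is a minimal sketch of one way a bootstrap sample with an out-of-bag test set could be drawn, assuming the dataset is a plain list of documents (the helper name is hypothetical):

import random

def bootstrap_split(documents, sample_size):
    # Draw sample_size documents with replacement; anything never drawn forms
    # the out-of-bag test set (cf. resample_data(..., return_leftovers=True)).
    drawn = [random.randrange(len(documents)) for _ in range(sample_size)]
    drawn_set = set(drawn)
    sampled_train_set = [documents[i] for i in drawn]
    oob_test_set = [doc for i, doc in enumerate(documents) if i not in drawn_set]
    return sampled_train_set, oob_test_set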
Example #31
0
    # SPLIT THE DATA (we could use sklearn.model_selection.train_test_split)
    training_set_size = int(0.8 * len(data))
    training_data = data[:training_set_size]
    test_data = data[training_set_size:]

    # K-Fold split
    kf = KFold(n_splits=5)
    kf.get_n_splits(training_data)
    scores = []
    for train_index, dev_index in kf.split(training_data):

        ttp_train_data = get_sub(training_data, train_index)
        ttp_dev_data = get_sub(training_data, dev_index)

        feature_generator = FeatureGenerator(ttp_train_data)
        train_data_vectors, train_classes = feature_generator.transform(
            ttp_train_data)
        dev_data_vectors, dev_classes = feature_generator.transform(
            ttp_dev_data)
        test_data_vectors, test_classes = feature_generator.transform(
            test_data)

        # TRAIN THE MODEL
        cls_models = []
        #cls_models.append(GaussianNB())
        cls_models.append(LinearSVC(random_state=0))
        #cls_models.append(RandomForestClassifier(bootstrap=True, max_depth=10, max_features='auto',
        #                                        min_samples_leaf=1, min_samples_split=2, n_estimators=10,
        #                                        random_state=0, n_jobs=-1))
        #cls_models.append(MLPClassifier(activation='tanh', hidden_layer_sizes=(16,)))
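
        # The listing above stops before the models are fitted; the lines below
        # are a hedged sketch (not from the original) of how the K-Fold loop
        # body could continue, reusing the names defined above.
        for cls_model in cls_models:
            cls_model.fit(train_data_vectors, train_classes)
            scores.append(cls_model.score(dev_data_vectors, dev_classes))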
Example #32
0
class ECGClassifier:
    def __init__(self):
        """
        Initialize all the models and classification pipeline
        """

        # Preprocessor which removes left, right and middle outliers of the wave
        self.preprocessor = Preprocessor()

        # Feature Generator for generating features of a wave
        self.feature_generator = FeatureGenerator()

        # Output arrays
        self.features = []
        self.labels = []
        self.file_names = []

        # Custom scoring module
        self.scorer = Scorer()

        # Feature selection models
        self.feature_selector_1 = SelectFromModel(
            LinearSVC(penalty="l1", dual=False))
        self.feature_selector_2 = SelectFromModel(
            LinearSVC(penalty="l1", dual=False))

        # Classification models
        # clf_1 = DecisionTreeClassifier()
        #
        # params = {"criterion": ["gini", "entropy"],
        #           "min_samples_split": [2, 10, 20],
        #           "max_depth": [None, 2, 5, 10],
        #           "min_samples_leaf": [1, 5, 10],
        #           "max_leaf_nodes": [None, 5, 10, 20],
        #           }

        clf_1 = AdaBoostClassifier()
        base_classifier_1 = RandomForestClassifier()

        # Best Classifier 1
        # {'n_estimators': 40,
        #  'base_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
        #                                           max_depth=None, max_features='auto', max_leaf_nodes=None,
        #                                           min_impurity_split=1e-07, min_samples_leaf=1,
        #                                           min_samples_split=2, min_weight_fraction_leaf=0.0,
        #                                           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
        #                                           verbose=0, warm_start=False), 'learning_rate': 0.85000000000000009}
        # Best
        # Classifier
        # 2
        # {'n_estimators': 30,
        #  'base_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
        #                                           max_depth=None, max_features='auto', max_leaf_nodes=None,
        #                                           min_impurity_split=1e-07, min_samples_leaf=1,
        #                                           min_samples_split=2, min_weight_fraction_leaf=0.0,
        #                                           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
        #                                           verbose=0, warm_start=False), 'learning_rate': 0.80000000000000004}

        # params = {
        #     "base_estimator": [base_classifier_1],
        #     "n_estimators": range(30, 61, 10),
        #     "learning_rate": np.arange(0.8, 1.01, 0.05),
        # }
        optimal_params = {
            "base_estimator": [base_classifier_1],
            'n_estimators': [40],
            'learning_rate': [0.85]
        }

        self.classifier_1 = GridSearchCV(clf_1,
                                         param_grid=optimal_params,
                                         cv=10,
                                         scoring=make_scorer(
                                             self.scorer.score),
                                         verbose=10)

        # clf_2 = DecisionTreeClassifier()
        clf_2 = AdaBoostClassifier()
        base_classifier_2 = RandomForestClassifier()

        # params = {
        #     "base_estimator": [base_classifier_2],
        #     "n_estimators": range(30, 61, 10),
        #     "learning_rate": np.arange(0.8, 1.01, 0.05),
        # }

        optimal_params = {
            "base_estimator": [base_classifier_2],
            'n_estimators': [30],
            'learning_rate': [0.8]
        }

        # params = {"criterion": ["gini", "entropy"],
        #           "min_samples_split": [2, 10, 20],
        #           "max_depth": [None, 2, 5, 10],
        #           "min_samples_leaf": [1, 5, 10],
        #           "max_leaf_nodes": [None, 5, 10, 20],
        #           }
        self.classifier_2 = GridSearchCV(clf_2,
                                         param_grid=optimal_params,
                                         cv=2,
                                         scoring=make_scorer(
                                             self.scorer.score),
                                         verbose=10)

        # Pipeline initializations
        self.pipeline_1 = Pipeline([('feature_selector',
                                     self.feature_selector_1),
                                    ('clf', self.classifier_1)])
        self.pipeline_2 = Pipeline([('feature_selector',
                                     self.feature_selector_2),
                                    ('clf', self.classifier_2)])

    def fit(self, X, Y, filenames):
        """
        Fits the training data and labels in the classifier
        
        :param X: Training data
         
        :param Y: Training labels 
        """

        X1, I1, X2, I2 = self.__transform__(X, filenames, 'training')
        Y1, Y2 = self.__get_feature_labels__(Y, I1, I2)
        self.pipeline_1.fit(X1, Y1)
        self.pipeline_2.fit(X2, Y2)

    def predict(self, X, file_names):
        """
        Predict test labels
        
        :param X: Test Data
        
        :return: Return predicted output labels
        """

        X1, I1, X2, I2 = self.__transform__(X, file_names, 'test')
        Y1 = self.pipeline_1.predict(X1)
        Y2 = self.pipeline_2.predict(X2)
        return self.__merge__(Y1, Y2, I1, I2)

    def score(self, X, Y, file_names):
        """
        Predict and compute the accuracy score
        
        :param X: Test data
        
        :param Y: Actual labels of test data 
        """
        predicted_Y = self.predict(X, file_names)
        return self.scorer.score(predicted_Y, Y)

    def __transform__(self, X, filenames, prefix='training'):
        """
        Transforms the provided waves data into array containing features of each wave
        
        :param X: 2D array containing data points of all the waves
        
        :return: Transformed X
        """

        # Return data from pickle files if it was transformed once
        if os.path.isfile("pickle_files/" + prefix + "_peak_data.pickle"):
            # Fetch data points
            with open("pickle_files/" + prefix + "_peak_data.pickle",
                      "rb") as handle:
                peak_features = cPickle.load(handle)

            with open("pickle_files/" + prefix + "_point_data.pickle",
                      "rb") as handle:
                point_features = cPickle.load(handle)

            with open("pickle_files/" + prefix + "_peak_indices.pickle",
                      "rb") as handle:
                peak_indices = cPickle.load(handle)

            with open("pickle_files/" + prefix + "_point_indices.pickle",
                      "rb") as handle:
                point_indices = cPickle.load(handle)

            return [peak_features, peak_indices, point_features, point_indices]

        # Initializing output labels
        peak_features = []
        peak_indices = []
        point_features = []
        point_indices = []

        # for data in X:
        for i in range(0, len(X)):
            data = X[i]

            print(filenames[i])
            # pyplot.close("all")
            # peakfinder = ([], [])
            # pre.plot("original", data)

            # Remove outlier sections from the wave
            data, outliers = self.preprocessor.process(data)

            # Append the features of the transformed wave in the final output array
            features, feature_type = self.feature_generator.get_features(
                data, outliers)

            if feature_type == "peak":
                peak_features.append(features)
                peak_indices.append(i)
            else:
                point_features.append(features)
                point_indices.append(i)

        # Store the data in pickle files
        gc.disable()
        with open("pickle_files/" + prefix + '_peak_data.pickle',
                  'wb') as handle:
            cPickle.dump(peak_features,
                         handle,
                         protocol=cPickle.HIGHEST_PROTOCOL)
        gc.enable()

        gc.disable()
        with open("pickle_files/" + prefix + '_point_data.pickle',
                  'wb') as handle:
            cPickle.dump(point_features,
                         handle,
                         protocol=cPickle.HIGHEST_PROTOCOL)
        gc.enable()

        gc.disable()
        with open("pickle_files/" + prefix + '_peak_indices.pickle',
                  'wb') as handle:
            cPickle.dump(peak_indices,
                         handle,
                         protocol=cPickle.HIGHEST_PROTOCOL)
        gc.enable()

        gc.disable()
        with open("pickle_files/" + prefix + '_point_indices.pickle',
                  'wb') as handle:
            cPickle.dump(point_indices,
                         handle,
                         protocol=cPickle.HIGHEST_PROTOCOL)
        gc.enable()

        return [peak_features, peak_indices, point_features, point_indices]

    def __get_feature_labels__(self, Y, I1, I2):
        """
        Get feature labels for corresponding index arrays

        :param Y: Output labels array

        :param I1: Index Array

        :param I2: Index Array

        :return: Corresponding label arrays
        """
        Y1 = []
        Y2 = []

        for i in I1:
            Y1.append(Y[i])

        for i in I2:
            Y2.append(Y[i])

        return [Y1, Y2]

    def __merge__(self, Y1, Y2, I1, I2):
        """
        Merge two output labels arrays using index arrays

        :param Y1: Labels array

        :param Y2: Labels array

        :param I1: Index array

        :param I2: Index array

        :return: Merged output labels array
        """
        output = np.zeros(len(Y1) + len(Y2))

        for i in range(0, len(I1)):
            output[I1[i]] = Y1[i]

        for i in range(0, len(I2)):
            output[I2[i]] = Y2[i]

        return output
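
The part of this class that is easy to miss is that waves are routed to one of two pipelines (peak-based vs point-based features) and the two prediction arrays are stitched back into the original order by __merge__. A self-contained toy sketch of that split/merge pattern (the item types and predictions below are made up):

import numpy as np

# Items are routed to one of two models by type; the saved index lists are
# then used to merge the two prediction arrays back into the original order.
items = ["peak", "point", "peak", "point", "point"]
I1 = [i for i, t in enumerate(items) if t == "peak"]   # handled by pipeline 1
I2 = [i for i, t in enumerate(items) if t == "point"]  # handled by pipeline 2
Y1 = [1, 1]      # toy predictions from pipeline 1
Y2 = [0, 1, 0]   # toy predictions from pipeline 2

output = np.zeros(len(Y1) + len(Y2))
for i, idx in enumerate(I1):
    output[idx] = Y1[i]
for i, idx in enumerate(I2):
    output[idx] = Y2[i]
print(output)  # predictions back in the original wave order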