Example #1
def rf_model(train, target, test, text_train_tfidf, text_test_tfidf):

    text_train = train["Title"].values + ". " + train["BodyMarkdown"].values
    text_test = test["Title"].values + ". " + test["BodyMarkdown"].values
    print("Creating word2vec model...")
    w2v.make_word2vec_model(text_train, text_test)
    wv_train, wv_test = w2v.word2vec_features(text_train, text_test, load=False)

    X_train, X_test = fe.extract_features(train), fe.extract_features(test)


    X_train, X_test = fe.categories_to_counters(X_train, X_test, target)
    X_train, X_test = fe.transform_features(X_train, X_test)
    print("Creating linear model metafeature...")
    X_train["LinearModelText"], X_test["LinearModelText"] = mf.linear_model_as_feature(text_train_tfidf, target, text_test_tfidf, load=False)
    print("Creating word2vec model metafeature...")
    X_train["w2vModelRFText"], X_test["w2vModelRFText"] = mf.w2v_model_as_feature(wv_train, target, wv_test, load=False, model_to_train="rf")

    scaler = sklearn.preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = sklearn.ensemble.RandomForestClassifier(criterion="entropy", max_depth=14, n_estimators=2000,
                                                    min_samples_leaf=4, min_samples_split=16, n_jobs=4, random_state=1234)

    result = make_predictions(model, X_train, target, X_test)
    io.save_result(test["PostId"], result)

    return result
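
Note: the fe/mf/w2v/io modules and the make_predictions helper called above (and again in the next example) are project-specific and not shown on this page. A minimal sketch of what such a make_predictions helper presumably does, assuming a scikit-learn classifier and a binary target (the cross-validation step is an assumption, not the project's actual code):

# Hypothetical sketch of the make_predictions helper used above.
import sklearn.model_selection


def make_predictions(model, X_train, target, X_test):
    # optional sanity check: cross-validated log-loss before the final fit
    scores = sklearn.model_selection.cross_val_score(
        model, X_train, target, scoring="neg_log_loss", cv=3)
    print("CV log-loss: %.4f" % -scores.mean())

    # fit on the full training set and return class-1 probabilities
    model.fit(X_train, target)
    return model.predict_proba(X_test)[:, 1]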
Example #2
def linear_model(train, target, test, text_train_tfidf, text_test_tfidf):

    X_train, X_test = fe.extract_features(train), fe.extract_features(test)

    X_train, X_test = fe.categories_to_counters(X_train, X_test, target)
    X_train, X_test = fe.transform_features(X_train, X_test)

    feature_train = np.load("w2v/word2vec_feature_train")
    feature_test = np.load("w2v/word2vec_feature_test")

    X_train = np.column_stack((X_train.values, feature_train))
    X_test = np.column_stack((X_test.values, feature_test))


    scaler = sklearn.preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)


    X_train = scipy.sparse.hstack((text_train_tfidf, X_train), format="csr")
    X_test = scipy.sparse.hstack((text_test_tfidf, X_test), format="csr")

    model = sklearn.linear_model.LogisticRegression(C=0.7, penalty="l2")

    result = make_predictions(model, X_train, target, X_test)
    io.save_result(test["PostId"], result)

    return result
def main():
    pkl_file = open(pickle_file_name, 'rb')
    _, patient_data, matched_data = pickle.load(pkl_file)
    pkl_file.close()
    # get training data
    # get OD data
    all_encodings, target_encoding = fe.extract_features(matched_data)
    bin_target_encoding = [[1] for i in range(len(target_encoding))]
    print('target_encoding num : {}'.format(len(target_encoding)))

    general_samples = ds.draw_general_sample(GEN_SAMPLE_NUM, patient_data, 'race', [0.805, 0.134, 0.037])
    print(len(matched_data))
    print(len(general_samples))
    general_sample_encodings, general_sample_target_encoding = fe.extract_features(general_samples, True)
    bin_general_sample_target_encoding = [[0] for i in range(len(general_sample_encodings))]
    print('general_encoding num : {}'.format(len(general_sample_target_encoding)))

    # create X and Y
    X = all_encodings
    X.extend(general_sample_encodings)
    # for categorical prediction
    # Y = target_encoding
    # Y.extend(general_sample_target_encoding)
    # for binary prediction: Overdose vs non-Overdose
    Y = bin_target_encoding
    Y.extend(bin_general_sample_target_encoding)
    print(Y)
    # build NN model
    # for categorical prediction
    # nn_model = nn.NN(X, Y)
    # nn_model.train()
    # for binary prediction
    nn_model = nn.Bin_NN(X, Y)
    nn_model.train()
def prepare_data(save_scaler=True, location='./db.p'):
    """
    Prepares the data for training. It does so by extracting the features from both cars and non cars datasets.
    :param save_scaler: Flag that indicates if we should persist our scaler in database (i.e. the pickle file).
    :param location: Path of the pickle file that contains the data.
    :return: Features and labels ready to be passed to a classifier.
    """

    print("Loading training data")
    cars = __db['cars_features']
    cars_labels = __db['cars_labels']
    non_cars = __db['non_cars_features']
    non_cars_labels = __db['non_cars_labels']

    parameters = __db['parameters']

    print("Extracting features...")
    cars_extracted_features = extract_features(
        images=cars,
        color_space=parameters['color_space'],
        hog_channels=parameters['hog_channels'],
        orient=parameters['orientations'],
        pix_per_cell=parameters['pix_per_cell'],
        cell_per_block=parameters['cell_per_block'],
        histogram_bins=parameters['number_of_bins'],
        spatial_size=parameters['spatial_size'],
        spatial_feat=parameters['spatial_features'],
        hist_feat=parameters['histogram_features'],
        hog_feat=parameters['hog_features'])
    non_cars_extracted_features = extract_features(
        images=non_cars,
        color_space=parameters['color_space'],
        hog_channels=parameters['hog_channels'],
        orient=parameters['orientations'],
        pix_per_cell=parameters['pix_per_cell'],
        cell_per_block=parameters['cell_per_block'],
        histogram_bins=parameters['number_of_bins'],
        spatial_size=parameters['spatial_size'],
        spatial_feat=parameters['spatial_features'],
        hist_feat=parameters['histogram_features'],
        hog_feat=parameters['hog_features'])

    features = np.vstack((cars_extracted_features,
                          non_cars_extracted_features)).astype(np.float64)
    labels = np.hstack((cars_labels, non_cars_labels))

    print("Normalizing...")
    scaler = get_normalizer(features)
    if save_scaler:
        with open(location, 'wb') as pickle_file:
            __db['scaler'] = scaler
            pickle.dump(__db, pickle_file)

    features = scaler.transform(features)

    print("Done!")
    return features, labels
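
Note: get_normalizer is not defined in this snippet; it is presumably just a scaler fitted on the stacked feature matrix. A hedged sketch of that assumption:

# Hypothetical get_normalizer: fit a per-column standardizer; callers then
# use scaler.transform(...) as in prepare_data above.
from sklearn.preprocessing import StandardScaler


def get_normalizer(features):
    return StandardScaler().fit(features)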
Example #5
    def test_feature_extraction(self):
        audio, sr = get_audio('lIYCHbOTab4')

        result = extract_features(audio, sr)
        self.assertEqual((22, 18424), result.shape,
                         "frame features extraction")

        result2 = extract_features(audio, sr, mfcc=False)
        self.assertEqual((2, 18424), result2.shape,
                         "frame features extraction")
Example #6
def build_training_data(symbol_files,
                        junk_files=[],
                        segment_data_func=None,
                        print_progress=True,
                        ground_truth_file=None):
    """
    Given the symbol files as input, create a dataframe from the given data

    Parameters:
    1. symbol_files (list) - list of symbol file names 

    Returns:
    1. df (Dataframe) - A pandas dataframe representation of the data
    """
    df = pd.DataFrame([])  # contains both junk and symbol files
    if ground_truth_file:
        ui_to_symbols = map_ids_to_symbols(ground_truth_file)
    all_files = symbol_files[:]
    all_files.extend(junk_files)
    num_files = len(all_files)
    row_num = 0
    for data_file in all_files:
        # segmentation to be done here
        trace_dict, unique_id = create_trace_dict(data_file)
        if segment_data_func:
            segemented_trace_dicts = segment_data_func(trace_dict)
            unique_id = data_file.split('.')[0].split('/')[-1]
            for segmented_trace_dict in segemented_trace_dicts:
                row = extract_features(segmented_trace_dict, unique_id)
                row['TRACES'] = list(segmented_trace_dict.keys())
                if ground_truth_file:
                    row['SYMBOL_REPRESENTATION'] = ui_to_symbols[row['UI']]
                if len(df.columns) == 0:
                    df = pd.DataFrame(columns=[n for n in row.keys()])
                df.loc[row_num] = list(row.values())
                row_num += 1
        else:
            row = extract_features(trace_dict, unique_id)
            if ground_truth_file:
                row['SYMBOL_REPRESENTATION'] = ui_to_symbols[
                    row['UI']] if row['UI'] in ui_to_symbols else 'junk'
            if len(df.columns) == 0:
                df = pd.DataFrame(columns=[n for n in row.keys()])
            df.loc[row_num] = list(row.values())
        percentage = num_files // 100
        if print_progress and percentage != 0 and row_num % percentage == 0:
            # print('{0} ({1}%) of {2} files loaded...'.format(row_num, round((row_num/num_files)*100), num_files))
            print('File \'{}\' processed.'.format(data_file))
        row_num += 1
    print('All files loaded.')
    return df  # use this to operate on the data
Example #7
def main():
    """
    Main function
  """
    # Extract features
    if not feature_file_exists():
        extract_features()

    # Select features
    if not select_feature_file_exists():
        select_features()

    # Train model
    train()
Example #8
def test_model(file_name):

    gmm_files = [
        os.path.join(modelpath, fname) for fname in os.listdir(modelpath)
        if fname.endswith('.gmm')
    ]

    models = [pickle.load(open(fname, 'rb')) for fname in gmm_files]

    speakers = [fname.split('/')[-1].split('.gmm')[0] for fname in gmm_files]

    # print("do you want to test a single audio : press 1 or complete press 0")
    # take=int(input().strip())

    # if take==1:
    # print("enter the file name")
    # path=input().strip()
    # print("testing audio",path)
    path = file_name
    sr, audio = read(source + path)
    vector = extract_features(audio, sr)

    log_likelihood = np.zeros(len(models))

    for i in range(len(models)):
        gmm = models[i]
        scores = np.array(gmm.score(vector))
        log_likelihood[i] = scores.sum()
    max1 = max(log_likelihood)
    if max1 < -30:
        return "You are not Register..."
    # winner=np.argmax(log_likelihood)
    else:
        winner = np.where(log_likelihood == max1)
        return speakers[winner[0][0]]
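
Note: the extract_features typically paired with this GMM speaker-recognition pattern computes MFCCs plus delta coefficients. A sketch under that assumption, using the python_speech_features package (the parameters are illustrative, not the original project's):

# Hedged sketch of an MFCC + delta feature extractor for the GMM models above.
import numpy as np
from sklearn import preprocessing
from python_speech_features import mfcc, delta


def extract_features(audio, sr):
    # 20 cepstral coefficients per 25 ms frame, standardized per column
    mfcc_feat = mfcc(audio, sr, winlen=0.025, winstep=0.01,
                     numcep=20, nfft=1200, appendEnergy=True)
    mfcc_feat = preprocessing.scale(mfcc_feat)
    # append delta coefficients -> 40-dimensional frame vectors
    delta_feat = delta(mfcc_feat, 2)
    return np.hstack((mfcc_feat, delta_feat))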
Example #9
def load():
    """
    Load all training points into a matrix
    """

    Xs = []
    Ys = []

    for path, label in ((NEGATIVE_SAMPLES_DIR, 0), (POSITIVE_SAMPLES_DIR, 1)):

        audio_files = glob.glob(os.path.join(path, '*.wav'))
        logging.info('Detected {} files for label {}'.format(
            len(audio_files), label))
        Xs.append(np.zeros((len(audio_files), NUM_FEATURES)))
        Ys.extend([label] * len(audio_files))

        for i, audio_file in enumerate(audio_files):
            rate, data = scipy.io.wavfile.read(audio_file)
            assert rate == SAMPLE_RATE

            Xs[-1][i, :] = extract_features(data)

            if i % 1000 == 0:
                logging.info('Loaded {} files'.format(i))

    X = np.vstack(Xs)
    y = np.array(Ys)
    return X, y
Example #10
def process_test_samples(test_samples):
    processed_samples = pd.DataFrame()

    samples_in_one_srch = pd.DataFrame()
    for r_idx, sample in test_samples.iterrows():
        if (r_idx + 1) % 1000 == 0:
            print "Processed %i sample of %i" % (r_idx + 1, test_samples.shape[0])
        is_next_in_same_search = True
        samples_in_one_srch = pd.concat((sample.to_frame().transpose(), samples_in_one_srch), axis=0)
        current_srch_id = sample['srch_id']

        if (r_idx + 1) == test_samples.shape[0]:
            is_next_in_same_search = False
        else:
            next_srch_id = test_samples['srch_id'][r_idx + 1]
            if current_srch_id != next_srch_id:
                is_next_in_same_search = False

        if not is_next_in_same_search:
            ## if next one is not in the same search process the samples in the same search

            # feature extraction for samples
            ext_samples_in_one_srch = extract_features(samples_in_one_srch)
            processed_samples = pd.concat((processed_samples, ext_samples_in_one_srch), axis=0)

            # create new samples for the next search
            samples_in_one_srch = pd.DataFrame()

    return processed_samples
Example #11
def svm(train_docs, train_keys, test_docs, test_keys, model_file, N):
    X_train, y_train, phrase_list_train, idf_vec = extract_features(train_docs, train_keys)
    #X_test, y_test, fl_test, junk = extract_features(test_docs, test_keys)
    #print y_train
    print "--Feature matrices calculated, SVM now training..."
    clf = train_svm(X_train, y_train)
    print "--Saving model..."
    with open(model_file, 'w') as f:
        pickle.dump(clf, f)
    print "--SVM trained, SVM now testing..."
    accuracy = 0

    precisions = []
    recalls = []
    for doc, true_keys in zip(test_docs, test_keys):
        candidates, features = extract_candidates_doc(doc, phrase_list_train, idf_vec, len(train_docs))
        precision, recall = evaluate_one_doc('svm', clf, candidates, features, true_keys, N)
        precisions.append(precision)
        recalls.append(recall)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)


    '''
    accuracy = test_svm(svm, X_test, y_test)
    features_doc, labels_doc, phrase_idx_doc, phrase_list = extract_features_test(test_docs, test_keys)
    avg_precision, avg_recall = evaluate_on_each_doc('svm', svm, features_doc, labels_doc, phrase_idx_doc, phrase_list, test_keys)
    '''
    return {'accuracy': accuracy,
            'recall': avg_recall,
            'precision': avg_precision}
Example #12
def svm(train_docs, train_keys, test_docs, test_keys, model_file, N):
    X_train, y_train, phrase_list_train, idf_vec = extract_features(
        train_docs, train_keys)
    #X_test, y_test, fl_test, junk = extract_features(test_docs, test_keys)
    #print y_train
    print "--Feature matrices calculated, SVM now training..."
    clf = train_svm(X_train, y_train)
    print "--Saving model..."
    with open(model_file, 'w') as f:
        pickle.dump(clf, f)
    print "--SVM trained, SVM now testing..."
    accuracy = 0

    precisions = []
    recalls = []
    for doc, true_keys in zip(test_docs, test_keys):
        candidates, features = extract_candidates_doc(doc, phrase_list_train,
                                                      idf_vec, len(train_docs))
        precision, recall = evaluate_one_doc('svm', clf, candidates, features,
                                             true_keys, N)
        precisions.append(precision)
        recalls.append(recall)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    '''
    accuracy = test_svm(svm, X_test, y_test)
    features_doc, labels_doc, phrase_idx_doc, phrase_list = extract_features_test(test_docs, test_keys)
    avg_precision, avg_recall = evaluate_on_each_doc('svm', svm, features_doc, labels_doc, phrase_idx_doc, phrase_list, test_keys)
    '''
    return {
        'accuracy': accuracy,
        'recall': avg_recall,
        'precision': avg_precision
    }
Example #13
def grid_search():
    print(FILE_NAME)
    svm_features, svm_labels = extract_features(config.TRAIN_DIR,config.sample_count)
    X = svm_features.reshape(config.sample_count, 7*7*2048)
    y = svm_labels

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # defining parameter range 
    param_grid = {'C': [0.1, 1, 10, 100, 1000],  
                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
                  'kernel': ['rbf']}  
      
    grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 
      
    # fitting the model for grid search 
    grid.fit(X_train, y_train)

    # print best parameter after tuning 
    print(grid.best_params_) 
      
    # print how our model looks after hyper-parameter tuning 
    print(grid.best_estimator_) 

    grid_predictions = grid.predict(X_test) 
  
    # print classification report 
    print(classification_report(y_test, grid_predictions))
Example #14
def naive_bayes(train_docs, train_keys, test_docs, test_keys,model_file, N):
    X_train, y_train, phrase_list_train, idf_vec= extract_features(train_docs, train_keys)
    #X_test, y_test, fl_test, junk = extract_features(test_docs, test_keys)
    #print y_train
    print "--Feature matrices calculated, NB now training..."
    clf = NB.train(X_train, y_train)
    print "--Saving model..."
    with open(model_file, 'w') as f:
        pickle.dump(clf, f)
    with open(model_file+'.phrase_list', 'w') as f:
        pickle.dump(phrase_list_train, f)
    with open(model_file+'.idf_vec', 'w') as f:
        pickle.dump(idf_vec, f)
    with open(model_file+'.training_size', 'w') as f:
        pickle.dump(len(train_docs), f)
    print "--NB trained, NB now testing..."
    #accuracy = NB.score(clf, X_test, y_test)
    accuracy = 0

    precisions = []
    recalls = []
    for doc, true_keys in zip(test_docs, test_keys):
        candidates, features = extract_candidates_doc(doc, phrase_list_train, idf_vec, len(train_docs))
        precision, recall = evaluate_one_doc('NB', clf, candidates, features, true_keys, N)
        precisions.append(precision)
        recalls.append(recall)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    #features_doc, labels_doc, phrase_idx_doc, phrase_list = extract_features_test(test_docs, test_keys)
    #avg_precision, avg_recall = evaluate_on_each_doc('NB', clf, features_doc, labels_doc, phrase_idx_doc, phrase_list, test_keys, 10)
    return {'accuracy': accuracy,
            'recall': avg_recall,
            'precision': avg_precision}
def build_features_labels_dataset(events,
                                  unique_from=46,
                                  unique_to=1006,
                                  unique_granularity=1,
                                  unique_deltas=[0, 46],
                                  to_withdraw=[]):
    data = []
    labels = []
    feature_names = None
    for device in events:
        for app in events[device]:
            for action in events[device][app]:
                label = app + "_" + action
                for event in events[device][app][action]:
                    features_dict = extract_features(
                        event,
                        unique_from,
                        unique_to,
                        unique_granularity,
                        unique_deltas=unique_deltas,
                        to_withdraw=to_withdraw)
                    features = list(features_dict.values())
                    data.append(features)
                    labels.append(label)
                    if feature_names is None:
                        feature_names = list(features_dict.keys())

    return data, labels, feature_names
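
Note: the data, labels and feature_names returned above plug directly into a scikit-learn classifier. A hypothetical usage sketch (the events dict and the classifier choice are assumptions, not part of the original code):

# Train a classifier on the extracted features and list the most
# informative feature names.
from sklearn.ensemble import RandomForestClassifier

data, labels, feature_names = build_features_labels_dataset(events)
clf = RandomForestClassifier(n_estimators=200, random_state=0)
clf.fit(data, labels)
for name, score in sorted(zip(feature_names, clf.feature_importances_),
                          key=lambda t: -t[1])[:10]:
    print(name, round(score, 4))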
Example #16
def run_grid_search(cnn_filepath='script_CNN.h5',
                    grid_ready_data=None,
                    grid_ready_feature_groups=None):

    # SET DIRECTORY
    os.chdir(CURRENT_DIRECTORY)
    print(f'{datetime.datetime.now()} RUNNING GRID SEARCH')

    # if not grid_ready_data:
    # LOAD CLEANED DATA, DICT OF FEATURE GROUPS, AND DTL
    print(f'{datetime.datetime.now()} 1. Loading/Cleaning Data.')
    df, feature_dict, DTL = load_and_clean_data()

    # LOAD CNN, EXTRACT FEATURES, ADD TO FEATURE GROUPS
    print(f'{datetime.datetime.now()} 2. Extracting Features.')
    layer_name = 'dense1'
    model = load_model(cnn_filepath)
    df_features = extract_features(model, DTL, layer_name)

    # USE PCA TO SELECT EXTRACTED FEATURES
    print(f'{datetime.datetime.now()} 3. Performing PCA.')
    df_features_pca = perform_pca(df_features, 10)
    feature_dict['EXTRACTED_FEATURES'] = df_features_pca.columns.to_list()
    feature_dict['EXTRACT_OSM_FB_FEATURES'] = feature_dict['EXTRACTED_FEATURES'] \
                                        + feature_dict['OSM_FB_FEATURES']

    # DEFINE FINAL DATA
    df_final = df.join(df_features_pca)
    df_final.to_pickle('fully_prepped_data.pkl')

    # OVERSAMPLE
    print(f'{datetime.datetime.now()} 4. Defining Sample.')
    x_df = pd.DataFrame(df_final.drop(labels=['uid', TARGET_NAME], axis=1))
    y_df = df_final[TARGET_NAME]
    feature_dict['ALL_FEATURES'] = x_df.columns.tolist()
    oversample = RandomOverSampler(sampling_strategy=0.65)
    x, y = oversample.fit_resample(x_df, y_df)

    # SPLIT INTO TRAIN/TEST AND NORMALIZE
    print(f'{datetime.datetime.now()} 5. Defining Training and Testing Sets.')
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=TEST_SIZE)
    normalize(x_train, x_test)

    # TRAIN MODELS AND EXPORT ERRORS
    print(f'{datetime.datetime.now()} 6. Training Models.')
    parameters = cf.GRID_TEST_CLASS
    training_errors = train_models(parameters, x_train, y_train, feature_dict)
    training_errors.to_csv(os.path.join('output', 'errors.csv'))

    # PREDICT LABELS AND EVALUATE RESULTS
    print(f'{datetime.datetime.now()} 7. Evaluating Models.')
    trained_obj_list = [
        f for f in os.listdir('output') if f.endswith('_trained.pkl')
    ]
    results_df = evaluate_models(trained_obj_list, x_test, y_test,
                                 feature_dict)
    results_df.to_csv(os.path.join('output', 'results.csv'))
def predict(time_serie,
            CLF,
            WINDOW_SIZE=20,
            INTER_SPACING_CRITICAL_POINTS=5,
            WINDOW_MINIMAL_PAYLOAD=200,
            MINIMUM_SIZE_CRITICAL_STARTING_POINT=-1,
            FILTER_LENGTH_LIMIT=46,
            INTER_TIMER_EVENT_CUTOFF=5,
            TO_WITHDRAW=[]):
    """
    Make a prediction on a long-run capture.
    Args:
        time_serie : dict['xs', 'ys'] -> [x,],[y,]. Dictionary having 2 entries: xs for the times and ys for the packet lengths.
        WINDOW_SIZE : int. Size in seconds of the sliding window.
        INTER_SPACING_CRITICAL_POINTS : int. Minimum spacing between two critical points.

    """

    critical_points = find_critical_point(
        time_serie, WINDOW_SIZE, INTER_SPACING_CRITICAL_POINTS,
        WINDOW_MINIMAL_PAYLOAD, MINIMUM_SIZE_CRITICAL_STARTING_POINT)

    cap_predict = []  # tuple list of
    critical_points_i = 0
    xs_end = -1
    xs_capt = time_serie["xs"]
    ys_capt = time_serie["ys"]

    for i, _ in enumerate(xs_capt):

        current_xs = xs_capt[i]
        critical_point = critical_points[critical_points_i]
        if current_xs > critical_point and current_xs > xs_end:

            j = i - 1  # take previous one since we are one step further

            xs_start = xs_capt[i]
            xs_end = find_action_end(xs_capt[i:], ys_capt[i:],
                                     FILTER_LENGTH_LIMIT,
                                     INTER_TIMER_EVENT_CUTOFF)
            end_indice = find_x_indices(xs_capt, j, xs_end)
            xy = dict()
            xy["xs"] = xs_capt[j:end_indice + 1]
            xy["ys"] = ys_capt[j:end_indice + 1]
            features_dict = extract_features(xy, to_withdraw=TO_WITHDRAW)
            features = list(features_dict.values())
            y = CLF.predict_proba(np.array(features).reshape(1, -1))

            cap_predict.append((xs_start, xs_end, y[0]))

            while critical_points[critical_points_i] < xs_end:
                critical_points_i += 1
                if critical_points_i == len(critical_points):
                    break

        if critical_points_i == len(critical_points) or xs_end == xs_capt[-1]:
            break
    return cap_predict
Example #18
def load_or_train2(X, y, force_train=False, enable_plot=False):
    clf_path = './clf.pkl'

    if not force_train and os.path.exists(clf_path):
        print "> loading from file..."
        classifier = pickle.load(open(clf_path, 'rb'))
        print "> loaded"
    else:
        print "> training..."

        X = extract_features(X)

#[CV]  logistic__C=1.0, rbm__n_iter=80, rbm__learning_rate=0.1, rbm__n_components=200, score=0.931818 - 5.3min

        ## initialize the RBM + Logistic Regression pipeline
        rbm = BernoulliRBM(learning_rate=0.1, n_iter=80, n_components=200)
        logistic = LogisticRegression(C=1.0)
        classifier = Pipeline([("rbm", rbm), ("logistic", logistic)])

        # Perform a grid search on the learning rate, number of iterations, and number of components on the RBM and
        # C for Logistic Regression
        print "SEARCHING RBM + LOGISTIC REGRESSION"
        params = {
            "rbm__learning_rate": [0.1, 0.01, 0.001],
            "rbm__n_iter": [20, 40, 80],
            "rbm__n_components": [50, 100, 200],
            "logistic__C": [1.0, 10.0, 100.0]}

        ## Cross-validation
    #    cv = ShuffleSplit(len(X), n_iter=10, test_size=0.2, random_state=0)

        ## Perform a grid search over the parameter
        start = time.time()
    #    gs = GridSearchCV(classifier, params, verbose=10, cv=cv)
    #    gs.fit(X, y)
        classifier.fit(X, y)

        ## Print diagnostic information to the user and grab the best model
        print "\ndone in %0.3fs" % (time.time() - start)
    #    print "best score: %0.3f" % (gs.best_score_)
    #    print "RBM + LOGISTIC REGRESSION PARAMETERS"
    #    bestParams = gs.best_estimator_.get_params()
        # loop over the parameters and print each of them out
        # so they can be manually set
    #    for p in sorted(params.keys()):
    #        print "\t %s: %f" % (p, bestParams[p])

        ## show information about the training
        if enable_plot or True:
            outputs = classifier.predict(X)
            plot_confusion_matrix(y, outputs, range(0, 10))
            print_classification_report(y, outputs)

        ## Save the model
        pickle.dump(classifier, open(clf_path, 'wb'))

    return classifier
Example #19
def svm_ranking(train_docs, train_keys, test_docs, test_keys):
    X_train_vec, y_train_vec = extract_features(train_docs, train_keys)
    X_train, y_train = get_vec_differences_train(X_train_vec, y_train_vec)

    X_test_vec, y_test_vec = extract_features(test_docs, test_keys)
    X_test, y_test = get_vec_differences_train(X_test_vec, y_test_vec)
    print "--Training SVM"
    svm = train_svm(X_train, y_train)
    # The test_svm function needs to be replaced for this method
    # so it finds the diff. of test vectors, classifies those
    # differences, and ranks using those classifications
    print "--Testing SVM"
    accuracy = test_svm(svm, X_test, y_test)
    avg_recall = 0
    avg_precision = 0
    return {'accuracy': accuracy,
            'recall': avg_recall,
            'precision': avg_precision}
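
Note: get_vec_differences_train is not shown on this page; the comments suggest a pairwise ranking ("RankSVM"-style) transform, where training points become differences of feature vectors labelled by which element of the pair ranks higher. A rough sketch under that assumption:

# Hypothetical pairwise difference transform hinted at by svm_ranking above.
import itertools
import numpy as np


def get_vec_differences_train(X, y):
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    X_diff, y_diff = [], []
    for i, j in itertools.combinations(range(len(y)), 2):
        if y[i] == y[j]:
            continue  # equal-relevance pairs carry no ranking signal
        X_diff.append(X[i] - X[j])
        y_diff.append(1 if y[i] > y[j] else -1)
    return np.array(X_diff), np.array(y_diff)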
Example #20
def main():
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        args = parser.parse_args()

        clasf = Classifier()

        if args.parse:
            parse_all_files()
            from feature_extraction import main as extract_features
            extract_features()

        if args.train:
            folder_index = args.folder_index
            ratio = args.ratio
            if ratio <= 0 and ratio > 1:
                ratio = 1
            if folder_index == 1 or folder_index == 2:
                if args.task == 1:
                    clasf.train_NER_model(train_folder = folder_index, ratio = ratio,
                            classifier = args.classifier)
                elif args.task == 2:
                    clasf.train_DDI_model(train_folder = folder_index, ratio = ratio,
                            classifier = args.classifier)
                else:
                    parser.print_help()
            else:
                parser.print_help()
        elif args.test:
            model_index = args.model_index
            folder_index = args.folder_index
            if model_index >= 0 and folder_index >= 1:
                if args.task == 1:
                    clasf.test_NER_model(model_index = model_index, test_folder = folder_index,
                            classifier = args.classifier)
                elif args.task == 2:
                    clasf.test_DDI_model(model_index = model_index, test_folder = folder_index,
                            classifier = args.classifier)
                else:
                    parser.print_help()
            else:
                parser.print_help()
        else:
            parser.print_help()
Example #21
    def _get_features(self):
        import warnings
        warnings.simplefilter("error")

        # Divide up into cars and notcars
        # Read in car and non-car images
        cars = glob.glob('training_data/vehicles/*/*.png')
        notcars = glob.glob('training_data/non-vehicles/*/*.png')

        print("Training data", "positive", len(cars), "negative", len(notcars))

        car_features = extract_features(
            cars,
            cspace=self.hog_config.colorspace,
            orient=self.hog_config.orient,
            pix_per_cell=self.hog_config.pix_per_cell,
            cell_per_block=self.hog_config.cell_per_block,
            hog_channels=self.hog_config.hog_channels,
            spatial_size=self.hog_config.spatial_size,
            hist_bins=self.hog_config.hist_bins,
            hist_range=self.hog_config.hist_range)
        notcar_features = extract_features(
            notcars,
            cspace=self.hog_config.colorspace,
            orient=self.hog_config.orient,
            pix_per_cell=self.hog_config.pix_per_cell,
            cell_per_block=self.hog_config.cell_per_block,
            hog_channels=self.hog_config.hog_channels,
            spatial_size=self.hog_config.spatial_size,
            hist_bins=self.hog_config.hist_bins,
            hist_range=self.hog_config.hist_range)

        print("Training data after feature extraction", "positive",
              len(car_features), "negative", len(notcar_features))

        # Create an array stack of feature vectors
        X = np.vstack((car_features, notcar_features)).astype(np.float64)

        # Define the labels vector
        y = np.hstack(
            (np.ones(len(car_features)), np.zeros(len(notcar_features))))

        return X, y
Example #22
def get_distances(keyword_image_path, normalize, comparison_words_folder):

    # extract the features from the given keyword to spot in the document
    keyword_features = extract_features(keyword_image_path, normalize)

    print("Calculating distances...")
    distances = []

    for compared_word in glob.glob(comparison_words_folder + "/**/*" + ".png"): # for each word in each subfolder (OS dependant?)
        print(compared_word)
        # 1) calculate the feature vector for the second image
        word_features = extract_features(compared_word, normalize)
        # 2) calculate the dtw distance
        dtw_distance = get_dtw_distance(keyword_features, word_features)
        distances.append([os.path.basename(compared_word), dtw_distance])


    sorted_distances = sorted(distances,key=lambda l:l[1])
    return sorted_distances 
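
Note: get_dtw_distance is not defined in this snippet. A plain dynamic-programming DTW over two sequences of feature vectors looks roughly like this; the real project may use a library such as fastdtw or add a band constraint:

# Minimal DTW distance sketch (Euclidean local cost, no windowing).
import numpy as np


def get_dtw_distance(seq_a, seq_b):
    n, m = len(seq_a), len(seq_b)
    cost = np.full((n + 1, m + 1), np.inf)
    cost[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            d = np.linalg.norm(np.asarray(seq_a[i - 1]) - np.asarray(seq_b[j - 1]))
            cost[i, j] = d + min(cost[i - 1, j],      # insertion
                                 cost[i, j - 1],      # deletion
                                 cost[i - 1, j - 1])  # match
    return cost[n, m]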
Example #23
    def load_database(self):
        self.update_status('>>> Loading DB videos', clear=True)
        print('Started')
        print('=' * 80)
        print('Database video list')
        print('-' * 80)
        print('\n'.join(
            ['%d. %s' % (i + 1, f) for (i, f) in enumerate(self.folders)]))
        print('=' * 80)

        self.db_vids = []
        for selected_folder in self.folders:
            self.update_status('>>> DB video selected: ' + selected_folder)
            pkl_path = glob.glob(os.path.join(selected_folder, '*.pkl'))
            if len(pkl_path) and not self.FORCE_CREATE:
                tic = time.time()
                self.update_status('>>> Loading pre-calculated features')
                with open(pkl_path[0], 'rb') as pkl_fp:
                    v = pickle.load(pkl_fp)
                self.update_status('>>> Done. Time taken: %0.4fs' %
                                   (time.time() - tic))
            else:
                tic = time.time()
                self.update_status('>>> Loading video')
                vid_path = selected_folder
                aud_path = glob.glob(os.path.join(selected_folder, '*.wav'))[0]
                v = Video(vid_path, aud_path)
                self.update_status('>>> Done. Time taken: %0.4fs' %
                                   (time.time() - tic))

                # Computing features
                tic = time.time()
                self.update_status('>>> Calculating video features')
                extract_features(v)
                self.update_status('>>> Calculated in %0.4fs' %
                                   (time.time() - tic))

                self.update_status('>>> Saving results to database')
                with open(os.path.join(selected_folder, '%s.pkl' % v.name),
                          'wb') as pkl_fp:
                    pickle.dump(v, pkl_fp)
            self.db_vids.append(v)
            self.update_status('>>> Saved results to database')
Example #24
def applyFeatures(fileTrain, fileTest, DataTestAfterFeature, fileCentroid,
                  DataTrainAfterFeature):

    ##################### Create the centroids, then load them ###################
    # extract centroids from the training data
    extract_centroid(fileTrain, fileCentroid, 500)

    # load the centroids
    Fcentroid = open(fileCentroid, 'rb')
    listCentroid = cPickle.load(Fcentroid)

    print "finished extracting the centroids"

    #################### Apply the centroids to the test and train data ############

    ###### apply to the training data ###############
    # load the training data
    data = unpickle(fileTrain)

    # apply the features to the training data
    dataTrain = extract_features(data['data'], listCentroid)

    # save the training data after applying the features
    pickle.dump(dataTrain, open(DataTrainAfterFeature, "wb"))

    print "finished applying the centroids to the training data"

    ###### apply to the test data ###############

    # load the test data
    dataT = unpickle(fileTest)

    # apply the features to the test data
    dataTest = extract_features(dataT['data'], listCentroid)

    # save the test data after applying the features
    pickle.dump(dataTest, open(DataTestAfterFeature, "wb"))

    print "finished applying the centroids to the test data"

    ################ load the test and train data ######################
    """
Example #25
def svm_ranking(train_docs, train_keys, test_docs, test_keys):
    X_train_vec, y_train_vec = extract_features(train_docs, train_keys)
    X_train, y_train = get_vec_differences_train(X_train_vec, y_train_vec)

    X_test_vec, y_test_vec = extract_features(test_docs, test_keys)
    X_test, y_test = get_vec_differences_train(X_test_vec, y_test_vec)
    print "--Training SVM"
    svm = train_svm(X_train, y_train)
    # The test_svm function needs to be replaced for this method
    # so it finds the diff. of test vectors, classifies those
    # differences, and ranks using those classifications
    print "--Testing SVM"
    accuracy = test_svm(svm, X_test, y_test)
    avg_recall = 0
    avg_precision = 0
    return {
        'accuracy': accuracy,
        'recall': avg_recall,
        'precision': avg_precision
    }
Example #26
def process_train_samples(samples, max_srch_size=10, each_saved_size=1000000):
    sorted_samples = samples.sort_values(by=["srch_id"])
    sorted_samples = sorted_samples.reset_index(drop=True)
    processed_samples = pd.DataFrame()
    samples_in_one_srch = pd.DataFrame()
    for r_idx, sample in sorted_samples.iterrows():
        if (r_idx + 1) % 1000 == 0:
            print "Processed %i sample of %i" % (r_idx + 1,
                                                 sorted_samples.shape[0])
        is_next_in_same_search = True
        samples_in_one_srch = pd.concat(
            (sample.to_frame().transpose(), samples_in_one_srch), axis=0)
        current_srch_id = sample["srch_id"]
        if (r_idx + 1) == sorted_samples.shape[0]:
            is_next_in_same_search = False
        else:
            next_srch_id = sorted_samples["srch_id"][r_idx + 1]
            if current_srch_id != next_srch_id:
                is_next_in_same_search = False
        if not is_next_in_same_search:
            ext_samples_in_one_srch = extract_features(samples_in_one_srch)
            n_samples = ext_samples_in_one_srch.shape[0]
            if n_samples > max_srch_size:
                if np.any(ext_samples_in_one_srch["booking_bool"]):
                    pos_samples = ext_samples_in_one_srch[
                        ext_samples_in_one_srch["booking_bool"] == 1]
                    neg_samples = ext_samples_in_one_srch[
                        ext_samples_in_one_srch["booking_bool"] == 0]
                    selected_neg_samples = neg_samples.sample(
                        n=max_srch_size - pos_samples.shape[0])
                    selected_samples = pd.concat(
                        (pos_samples, selected_neg_samples), axis=0)
                else:
                    selected_samples = ext_samples_in_one_srch.sample(
                        n=max_srch_size)
            else:
                selected_samples = ext_samples_in_one_srch.copy()
            processed_samples = pd.concat(
                (processed_samples, selected_samples), axis=0)
            samples_in_one_srch = pd.DataFrame()
        if (r_idx + 1) % each_saved_size == 0:
            save_file_name = "proc_train_samples_%i.csv" % (r_idx + 1)
            save_path = get_paths()["proc_train_path"]
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            if np.any(np.isnan(processed_samples.values)):
                processed_samples = processed_samples.fillna(value=0)
            processed_samples.to_csv(os.path.join(save_path, save_file_name),
                                     index=None)

    # save whatever remains after the loop
    save_file_name = "proc_train_samples_%i.csv" % (r_idx + 1)
    save_path = get_paths()["proc_train_path"]
    if np.any(np.isnan(processed_samples.values)):
        processed_samples = processed_samples.fillna(value=0)
    processed_samples.to_csv(os.path.join(save_path, save_file_name),
                             index=None)
Example #27
def grid_search():
    for step_size in range(8, 36, 4):
        for pixels_per_cell in range(3, 10, 1):
            for cells_per_block in range(3, 10, 1):
                args = config.Config()
                args.STEP_SIZE = [step_size, step_size]
                args.CELLS_PER_BLOCK = [cells_per_block, cells_per_block]
                args.PIXELS_PER_CELL = [pixels_per_cell, pixels_per_cell]
                args.PROJECT_ID = args.PROJECT_ID + "_SS_" + str(
                    step_size) + "_CPB_" + str(
                        cells_per_block) + "_PPC_" + str(pixels_per_cell)
                args.update_names()
                print(args.PROJECT_ID, args.DIR_PATHS)
                args.mk_new_dirs()
                feature_extraction.extract_features(args=args)
                train_classifier(args=args)
                test_classifier(args=args)
                if not args.KEEP_FEAT:
                    shutil.rmtree(args.DIR_PATHS['NEG_FEAT_PH'])
                    shutil.rmtree(args.DIR_PATHS['POS_FEAT_PH'])
Example #28
def get_performances(file_path):
    model = utils.get_model(HERE / 'outputs/XGBoost.pkl')
    xgb_obj = xgb.XGBoost()
    xgb_obj.model = model
    df = utils.read_to_df(file_path)
    df = pre.preprocess(df)
    X = fe.extract_features(df, FEATURE_LIST)
    X = X.drop(columns=['id'])
    y = (df['cb_level'] == 3).astype(int)
    y_prob_rf = xgb_obj.predict(X)
    pred = np.where(y_prob_rf > 0.5, 1, 0)
    return per.get_performances(y, pred)
def __extract_for_training(imgs):
    """ Function that receives a list of image paths, load each image and
        calls single_img_features for each. Saves the features for all images
        and returns all features together. Is needed for the training of
        the SVM.
    """
    features = []

    for image_name in imgs:
        img = cv2.imread(image_name)
        features.append(fe.extract_features(img))
    return features
Example #30
    def prepare_images(self, images):
        features = extract_features(
            images,
            cspace=self.hog_config.colorspace,
            orient=self.hog_config.orient,
            pix_per_cell=self.hog_config.pix_per_cell,
            cell_per_block=self.hog_config.cell_per_block,
            hog_channels=self.hog_config.hog_channels,
            hog_feature_vec=False)
        features = np.array(features).astype(np.float64)
        features = self.scaler.transform(features)

        return features
Example #31
def get_distances(user_id, ver_sig_id, enrollment_files, verification_file, normalize=True):

    #get data in given verification file
    verification_data = np.loadtxt(verification_file)
    # compute features for given verification signature
    verification_features = extract_features(verification_data, normalize)
    #print("verification_features: ", verification_features)

    #print("Calculating distances...")
    distances = []

    for ind in range(len(enrollment_files)):
        #get data from enrollment file
        en_data = np.loadtxt(enrollment_files[ind])
        # compute features for enrollment signatures of given user
        enrollment_features = extract_features(en_data, normalize)
        dtw_distance = get_dtw_distance(verification_features, enrollment_features)
        distances.append(dtw_distance)

    print("distances: ", distances)
    #sorted_distances = sorted(distances,key=lambda l:l[3])
    return distances
Example #32
def classify(classifier, file_path):
    features = extract_features(file_path)
    fieldnames = pickle.load(open(fieldnames_array_path, 'r'))
    features_arr = []
    for field in fieldnames:
        if field in features:
            features_arr.append(features[field])
        else:
            features_arr.append(0)

    prediction = classifier.predict([features_arr])

    return prediction
Example #33
def train(epoch):

    # set train mode
    clas1.train()
    clas2.train()
    att1.train()
    att2.train()

    # for each batch
    for batch_idx, (data, targets) in enumerate(train_loader):

        # initialization
        _, target = targets
        # print(data.shape)
        data, (target) = Variable(data), Variable(target)
        if cuda:
            data, target = data.cuda(), target.cuda()
        opt1.zero_grad()
        opt2.zero_grad()

        # forward pass
        features = extract_features(fullres, data)
        attention_map1 = att1(features)
        attention_map2 = att2(features)

        (region1, region1_coord) = crop_region(features, attention_map1)
        (region2, region2_coord) = crop_region(features, attention_map2)

        out1 = clas1(region1)
        out2 = (clas2(region2) + out1) / 2

        # loss
        loss1 = loss(out1, target)
        loss2 = loss(out2, target)
        loss_value = (loss1 + loss2) / 2
        reward_loss = ()

        # backward pass
        loss_value.backward()
        reward_grad = ()

        # weight upgrade
        opt1.step()
        opt2.step()
        reward_opt()

        # log
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss_value.data[0]))
Example #34
def test_extract_features():
    from feature_extraction import extract_features
    import numpy as np

    r = 5*np.ones([3, 3])
    r[1, 1] = 0
    g = np.zeros([3, 3])
    b = [[ii for ii in range(0, 3)] for _ in range(0, 3)]

    img = np.zeros([3, 3, 3])
    img[:, :, 0] = r
    img[:, :, 1] = g
    img[:, :, 2] = b

    actual, labels = extract_features(img, 'gb', 'r', 'rg', pct_yellow=True)
    expected = [np.median(g), np.median(b), np.std(r), 5, 0, 0]

    assert np.array_equal(actual, expected)

    actual, labels = extract_features(img, 'r', 'r', 'r', omit=0)
    expected = [5, 0, 5]

    assert np.array_equal(actual, expected)
Example #35
def create_classifier(image_dir, out_path):
    X_true, all_X_false = [], []
    img_path = file_paths(image_dir)
    for fname in img_path:
        seg = cv2.imread(fname)
        masks_true = get_masks_of_number(seg)
        masks_true = get_unique_masks(masks_true)
        X_true.extend(masks_true)
        masks_all = get_masks_of_segments(fill_black_pixels(seg))
        masks_all_false = filter(lambda x: not array_is_contained(x, masks_true), masks_all)
        all_X_false.extend(masks_all_false)

    X_true = extract_features(X_true)
    all_X_false = extract_features(all_X_false)

    X_train = np.concatenate((X_true, all_X_false), axis=0)
    y_train = np.concatenate((np.ones(X_true.shape[0]), np.zeros(all_X_false.shape[0])))
    n_true = sum(y_train)
    n_false = sum(y_train == 0)
    estimator = RandomForestClassifier(100, class_weight={1:n_true / (n_true + n_false), 0: n_false / (n_true + n_false)})

    estimator.fit(X_train, y_train)
    joblib.dump(estimator, out_path)
Example #36
def train_file(file_path):
    path_object = pathlib.Path(HERE / 'outputs')
    if path_object.exists():
        shutil.rmtree(HERE / 'outputs')
        os.makedirs(HERE / 'outputs')
    tagged_df = utils.read_to_df(file_path)
    tagged_df = pre.preprocess(tagged_df)
    X = fe.extract_features(tagged_df, FEATURE_LIST)
    y = (tagged_df['cb_level'] == 3).astype(int)
    X = X.drop(columns=['id'])
    xgb_obj = xgb.XGBoost()
    xgb_obj.train(X, y)
    exp.explain_model(xgb_obj.model, X, False)
    utils.save_model(xgb_obj.model, os.path.join(HERE / 'outputs', 'XGBoost.pkl'))
Example #37
	def predict(self):
		# extract features from the existing skeleton frames, run prediction and print the top 5 predictions
		sample = np.array(self.skeletons)
		sample = np.swapaxes(sample, 0, 1)
		x = extract_features(sample)
		y_pred = self.predictor.predict_proba(x.reshape(1, -1))
		sorted_ind = np.argsort(y_pred[0])
		top_ind = np.flip(sorted_ind[-5:])
		top_probs = np.around(y_pred[0, top_ind], decimals=4)
		self.currentActivity = _class_names[top_ind[0]]
		print('Prediction results: ')
		for i in range(len(top_ind)):
			print(_class_names[top_ind[i]] + ': ' + str(top_probs[i]))
		print('')
Example #38
def vsumm_frames_in_memory(video):
    segmentation = vs.VideoSegmentation(video)
    frames = segmentation.read_and_keep_frames()

    if len(frames) == 0:
        return False

    features = feat.extract_features(frames)
    keyframes = cl.find_clusters(features)

    summary_folder = 'summaryM-'+video[7:-4]
    if not os.path.isdir(summary_folder):
        os.mkdir(summary_folder)

    for k in keyframes:
        frame = frames[k.frame_id-1]
        frame_name = summary_folder+'/frame-'+str(k.frame_id).zfill(6)+'.jpg'
        cv2.imwrite(frame_name,frame)

    return True
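
Note: the feat.extract_features and cl.find_clusters helpers are not shown here. In the VSUMM approach they are typically a per-frame colour histogram plus k-means clustering, keeping the frame closest to each cluster centre as a keyframe. A rough sketch under that assumption (the real find_clusters returns keyframe objects with a frame_id attribute; this sketch only returns frame indices):

# Hedged VSUMM-style helpers: hue histograms + k-means keyframe selection.
import cv2
import numpy as np
from sklearn.cluster import KMeans


def extract_features(frames, bins=16):
    feats = []
    for frame in frames:
        hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
        hist = cv2.calcHist([hsv], [0], None, [bins], [0, 180])  # hue histogram
        feats.append(cv2.normalize(hist, hist).flatten())
    return np.array(feats)


def find_keyframe_indices(features, n_clusters=5):
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(features)
    # index of the frame nearest to each cluster centre
    return [int(np.argmin(np.linalg.norm(features - c, axis=1)))
            for c in km.cluster_centers_]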
Example #39
def load_or_train(X, y, force_train=False, enable_plot=False):
    """
    Load an existing one or train a new SVM classifier, and return it.
    Once the classifier is trained, it is saved through pickle.
    """

    clf_path = './clf.pkl'

    if not force_train and os.path.exists(clf_path):
        print "> loading from file..."
        clf = pickle.load(open(clf_path, 'rb'))
        print "> loaded"
    else:
        print "> training..."

        X = extract_features(X)

        ## Cross-validation
        cv = ShuffleSplit(len(X), n_iter=10, test_size=0.2, random_state=0)
        gammas = np.logspace(-6, -1, 10)

        ## Grid search
        clf = GridSearchCV(estimator=svm.SVR(), cv=cv, param_grid=dict(gamma=gammas), n_jobs=1, verbose=10)
        clf.fit(X, y)

        ## Plot learning curve
        title = 'Learning Curves (SVM, linear kernel, $\gamma=%.6f$)' % clf.best_estimator_.gamma
        estimator = svm.SVC(kernel='linear', gamma=clf.best_estimator_.gamma)
        plot_learning_curve(estimator, title, X, y, cv=cv)
        plt.show()

        ## show information about the training
        if enable_plot:
            outputs = clf.predict(X)
            plot_confusion_matrix(y, outputs, range(0, 10))
            print_classification_report(y, outputs)

        ## Save the model
        pickle.dump(clf, open(clf_path, 'wb'))

    return clf
if __name__ == "__main__":
  # get trained network
  fnn = NetworkReader.readFrom(TRAINED_NN_FILE)

  # get data
  file_location = STEP_DETECTION_DATA_LOCATION + "/"
  accel_list = file.get_sensor_list(file_location + ACCEL_STEP_DETECTION_DATA)
  gyro_list = file.get_sensor_list(file_location + GYRO_STEP_DETECTION_DATA)
  compass_list = file.get_sensor_list(file_location + COMPASS_STEP_DETECTION_DATA)

  imu_list = sync_accel_gyro_compass(accel_list, gyro_list, compass_list)

  accel_y_dict = imu_list.extract_sensor_axis_list(ACCEL, Y_AXIS)
  accel_y = accel_y_dict[READINGS]
  timestamps = accel_y_dict[TIMESTAMPS]

  peaks_index = extract_peaks(accel_y)

  step_count = 0
  index = 0
  # while(index < len(peaks_index)-1):
  start = peaks_index[index]
  for j in range(index+1, len(peaks_index)):
    end = peaks_index[j]
    feature = extract_features(imu_list, start, end)
    result = fnn.activate(feature)
    print index, j, result


Example #41

import matplotlib.pyplot as pyplot
from data_loader import load_numbers
from feature_extraction import extract_features

## Load the numbers
print "Loading numbers..."
X, y = load_numbers()

## Extract the features
print "Extracting features..."
features = extract_features(X)

print "Prepare data..."
separated_by_classes = [[], [], [], [], [], [], [], [], [], []]
for i, c in enumerate(y):
    separated_by_classes[c].append(X[i])

## Box plot
print "Creating boxplot..."
boxplotElements = pyplot.boxplot(separated_by_classes)

    # TODO: load experiment using params
    train_path = ""
    test_path = ""
    # loads images from given paths
    train = dataset.load_dataset(train_path)
    test = dataset.load_dataset(test_path)

    # extracts descriptors for train and test sets
    train_descriptors = {item.path:feature_extraction.extract_descriptors(item.data) for item in train}
    test_descriptors = {item.path:feature_extraction.extract_descriptors(item.data) for item in test}

    # creates codebook (default size=300) based on train samples
    codebook = feature_extraction.create_codebook(np.concatenate(train_descriptors.values()))

    # generate feature vectors for train and test based on previously calculated codebook
    train_features = {key:feature_extraction.extract_features(codebook, train_descriptors[key]) for key in train_descriptors}
    test_features = {key:feature_extraction.extract_features(codebook, test_descriptors[key]) for key in test_descriptors}

    # TODO: create a similarity matrix using all features

    # persists features, codebook and similarity matrix
    pickle.dump(train_features, open("train_features.pk", "wb"))
    pickle.dump(test_features, open("test_features.pk", "wb"))
    pickle.dump(codebook, open("codebook.pk", "wb"))

    # creates index using LSHash and train set
    searcher = Searcher(train_features)

    # persists index for future use
    pickle.dump(searcher, open("searcher.pk", "wb"))
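
Note: feature_extraction.create_codebook and the codebook-based extract_features are not included in this fragment. A bag-of-visual-words sketch, where the codebook is a k-means model over local descriptors and each image becomes a histogram of its nearest codewords (only the 300-word default comes from the comments above; the rest is an assumption):

# Hedged bag-of-visual-words codebook sketch.
import numpy as np
from sklearn.cluster import KMeans


def create_codebook(descriptors, size=300):
    # cluster all local descriptors; the cluster centres are the "visual words"
    return KMeans(n_clusters=size, n_init=10, random_state=0).fit(descriptors)


def extract_features(codebook, descriptors):
    # histogram of nearest codewords, L1-normalised so images are comparable
    words = codebook.predict(descriptors)
    hist = np.bincount(words, minlength=codebook.n_clusters).astype(float)
    return hist / max(hist.sum(), 1.0)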
Example #43
from feature_extraction import extract_features

## Vars
force_train = True
enable_plot = False

## Load and split the data in train and test sets
print "Loading data..."
X, y = load_numbers()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Get trained classifier
print "Training classifier..."
clf = load_or_train(X_train, y_train, force_train, enable_plot)
print clf

## Compute the features of the test set and predict
print "Predicting test set..."
features = extract_features(X_test)
y_pred = clf.predict(features)

print y_test
print y_pred

## Score
f1 = f1_score(y_test, y_pred)
print "f1-score for is {}%".format(f1)
#if enable_plot:
print_classification_report(y_test, y_pred)
plot_confusion_matrix(y_test, y_pred, range(0, 10))
Example #44
def process_train_samples(samples, max_srch_size=10, each_saved_size=1000000):
    '''
    func:
    Process samples, including feature extraction and downsampling.
    NB: samples with the same srch_id that share one positive target
    are treated as one positive sample; otherwise they are negative samples.
    max_srch_size is the maximum number of rows kept per srch_id;
    e.g. if one srch_id has 20 rows in the train set, we randomly keep only 10,
    which is the downsampling step.
    Because the training set is large, a file is written (and trained on)
    for every 1,000,000 rows processed.
    '''

    # the training data is unordered, so sort it so that rows sharing a
    # srch_id sit next to each other
    sorted_samples = samples.sort_values(by=['srch_id'])  # group by srch_id
    sorted_samples = sorted_samples.reset_index(drop=True)  # reset row index
    processed_samples = pd.DataFrame()

    samples_in_one_srch = pd.DataFrame()
    # the loop checks whether the next row belongs to the same srch_id as the current one
    for r_idx, sample in sorted_samples.iterrows():
        if (r_idx + 1) % 1000 == 0:
            print "processed %i sample of %i " % (r_idx + 1, sorted_samples.shape[0])

        is_next_in_same_search = True
        samples_in_one_srch = pd.concat((sample.to_frame().transpose(), samples_in_one_srch), axis=0)

        current_srch_id = sample['srch_id']

        # last row
        if (r_idx + 1) == sorted_samples.shape[0]:
            is_next_in_same_search = False
        else:
            next_srch_id = sorted_samples['srch_id'][r_idx + 1]
            if current_srch_id != next_srch_id:
                is_next_in_same_search = False

        # a complete srch_id group has been collected, so extract its features
        # (this part is slow: roughly 8 hours on a 16 GB machine)
        if not is_next_in_same_search:
            ## if next one is not in the same search process the samples in the same search

            # feature extraction for samples
            ext_samples_in_one_srch = extract_features(samples_in_one_srch)

            # downsample: count how many samples fall under this srch_id
            n_samples = ext_samples_in_one_srch.shape[0]

            # e.g. with max_srch_size = 10, there are more than 10 rows here
            if n_samples > max_srch_size:
                # if too many samples in one search,do downsampling
                if np.any(ext_samples_in_one_srch['booking_bool']):
                    # if this is a positive sample(1 exists in booking_bool)
                    # this search contains a booking, so keep the positive samples
                    pos_samples = ext_samples_in_one_srch[ext_samples_in_one_srch['booking_bool'] == 1]
                    neg_samples = ext_samples_in_one_srch[ext_samples_in_one_srch['booking_bool'] == 0]
                    # then randomly choose negatives, e.g. with 28 rows, max_srch_size = 10 and 1 positive sample, randomly pick 9 negatives
                    selected_neg_samples = neg_samples.sample(n=max_srch_size - pos_samples.shape[0])
                    selected_samples = pd.concat((pos_samples, selected_neg_samples), axis=0)
                else:
                    # no positive samples in this search, so randomly select
                    # max_srch_size rows
                    selected_samples = ext_samples_in_one_srch.sample(n=max_srch_size)
            else:
                #
                selected_samples = ext_samples_in_one_srch.copy()

            processed_samples = pd.concat((processed_samples, selected_samples), axis=0)

            # create new samples for the next search
            samples_in_one_srch = pd.DataFrame()

        # save every 1,000,000 processed rows
        if (r_idx + 1) % each_saved_size == 0:
            # save samples for every each_saved_size
            save_file_name = 'proc_train_samples_%i.csv' % (r_idx + 1)
            save_path = get_paths()['proc_train_path']
            if not os.path.exists(save_path):
                os.mkdir(save_path)

            if np.any(np.isnan(processed_samples.values)):
                # remove nan
                processed_samples = processed_samples.fillna(value=0)
                print "remove nan."
            processed_samples.to_csv(os.path.join(save_path, save_file_name), index=None)

    # out of loop save all processed samples
    save_file_name = 'proc_train_samples_%i.csv' % (r_idx + 1)
    save_path = get_paths()['proc_train_path']
    if np.any(np.isnan(processed_samples.values)):
        # remove nan
        processed_samples = processed_samples.fillna(value=0)
        print "remove nan."
    processed_samples.to_csv(os.path.join(save_path, save_file_name), index=None)