Example #1
def main():
    # Set logging level
    logging.basicConfig(level=logging.INFO)

    # Init League data and process info 
    silver1_1 = LeagueData(api_key, 'kr', 'RANKED_SOLO_5x5', 'SILVER', 'I', 1)
    silver1_1.process_info()
    silver1_1.save_info()

    # Feature Extraction
    feature_extraction(silver1_1.match_playerid_df)
def extract_features(features_opts, dataset_opts, params):
    print "# Extracting image features"
    files1, files2 = dataset(dataset_opts)
    features = []
    for img_file, depth_file in print_progress(files1 + files2):
        features.append(feature_extraction(img_file, depth_file, features_opts, params))
    return files1, features[:len(features) // 2], files2, features[len(features) // 2:]
def build_vocabulary(image_paths, vocab_size, feature):
    """
    This function samples feature descriptors from the training images,
    clusters them with k-means, and returns the cluster centers.

    :param image_paths: an array of N strings, where each string is an image path
    :param vocab_size: the size of the vocabulary.
    :param feature: name of the image feature representation.

    :return: a vocab_size x feature_size matrix holding the k-means cluster centers.
    """
    all_features = []

    for path in image_paths:
        img = cv2.imread(path)[:, :, ::-1]  # read the image (BGR -> RGB)

        features = feature_extraction(img, feature)  # extract features from the image
        all_features.append(features)  # collect the features

    all_features = np.concatenate(all_features, 0)  # stack all features into a single matrix

    # k-means clustering
    centers = kmeans_clustering(all_features, vocab_size, 1e-4, 100)

    return centers  # return the cluster centers found by k-means
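A minimal usage sketch for build_vocabulary; the glob pattern and vocabulary size are assumptions, and the saved file name matches what the bag-of-words examples below load:

import glob
import numpy as np

# Hypothetical dataset layout; adjust the glob pattern to your data.
train_image_paths = sorted(glob.glob('data/train/*.jpg'))
vocab = build_vocabulary(train_image_paths, vocab_size=200, feature='HoG')
np.save('vocab_hog.npy', vocab)  # cached so later runs can skip clustering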
Example #4
def learning():
    reviews = get_data('Electronics_5.json', 'reviewText')
    ratings = get_data('Electronics_5.json', 'overall')

    labels = []
    i1 = 0
    i2 = 0
    i3 = 0
    i4 = 0
    i5 = 0
    for rating in ratings:
        '''
        if(rating <= 1) : i1 += 1
        if(rating <= 2) : i2 += 1
        if(rating <= 3) : i3 += 1
        if(rating <= 4) : i4 += 1
        if(rating <= 5) : i5 += 1
        if rating >= 4 : labels.append('positive')
        else : labels.append('negative')
        '''
        labels.append(str(rating))
    data_train = [(review, label) for review, label in zip(reviews, labels)]
    feature_sets = [(feature_extraction(data), label)
                    for (data, label) in data_train]
    classifier = NaiveBayesClassifier.train(feature_sets)
    return classifier
def load_test(path=TEST_PATH):

    files = []

    for f in os.listdir(path):
        if not os.path.isfile(os.path.join(path, f)):
            continue

        files.append((path + f))


    n_examples = 0
    X_test = np.array([])
    for f in files:
        filepath = f

        try:
            matdata = load_matdata(filepath)
        except Exception as e:
            logger.error("Ignoring corruped file: {}".format(filepath))
            continue

        logger.debug("Extracting features from {}: {}".format(n_examples, filepath))

        x = feature_extraction(matdata)

        flat = x.flatten()
        X_test = np.hstack((X_test, flat))

        n_feat = len(flat)
        n_examples = n_examples + 1

    logger.debug('{} examples loaded.'.format(n_examples))
    X_test = X_test.reshape(n_examples, n_feat)
    return X_test
def evaluate_classifier(featxs, datasets):
    posfeats, negfeats = feature_extraction(featxs, datasets)

    print('\ncross validation decision tree')
    print(cross_validation(posfeats,
                           negfeats,
                           folds=5,
                           classifier='decision_tree'))
def main(classifier=None):
    filename = "model_file.sav"
    print("Select function:")
    print(
        "1 - Dataset Generation : 2 - Training : 3 - Validation : 4 - Prediction : 5 - Exit"
    )
    function = input()
    function = str(function)
    print("\n")

    if function == "1":

        url_list_legitimate, url_list_phishing = read_url_list_training()
        for i in url_list_legitimate:
            print(i)
            feature_extraction(i, -1)
        for i in url_list_phishing:
            print(i)
            feature_extraction(i, 1)

        main()

    elif function == "2":
        classifier = training_func()
        main(classifier)
    elif function == "3":
        if classifier is not None:
            validation_func(classifier)
            main(classifier)
        else:
            load_model = pickle.load(open(filename, "rb"))
            validation_func(load_model)
            main(load_model)
    elif function == "4":
        if classifier is not None:
            prediction_func(classifier)
            main(classifier)
        else:
            load_model = pickle.load(open(filename, "rb"))
            prediction_func(load_model)
            main(load_model)
    elif function == "5":
        pass
    else:
        print("Not valid option")
Example #8
def training_with_random_forest(X_train, y_train, X_valid, y_valid, X_test,
                                y_test):
    '''
    Train with a random forest.
    :param X_train: training set
    :param y_train: training-set labels
    :param X_valid: validation set
    :param y_valid: validation-set labels
    :param X_test: test set
    :param y_test: test-set labels
    :return:
            clf_rfc: the trained model
            score: the model's score on the test set
            X_train_feature_extraction: the training set after feature extraction
            X_test_feature_extraction: the test set after feature extraction
    '''
    # merge the training and validation sets and use all of it for training
    X_train = np.vstack((X_train, X_valid))
    y_train = np.vstack((y_train, y_valid))

    # decode the one-hot encoded labels (one-hot encoding is not needed here)
    y_train = [np.argmax(item) for item in y_train]
    y_train = np.array(y_train)
    y_test = [np.argmax(item) for item in y_test]
    y_test = np.array(y_test)

    loader = np.empty(shape=[X_train.shape[0], 16])
    for i in range(X_train.shape[0]):
        loader[i] = feature_extraction(X_train[i])
    X_train_feature_extraction = loader

    loader = np.empty(shape=[X_test.shape[0], 16])
    for i in range(X_test.shape[0]):
        loader[i] = feature_extraction(X_test[i])
    X_test_feature_extraction = loader

    clf_rfc = RandomForestClassifier(n_estimators=17,
                                     max_depth=21,
                                     criterion='gini',
                                     min_samples_split=2,
                                     max_features=9,
                                     random_state=60)
    clf_rfc.fit(X_train_feature_extraction, y_train)
    score = clf_rfc.score(X_test_feature_extraction, y_test)
    return clf_rfc, score, X_train_feature_extraction, X_test_feature_extraction
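A hedged usage sketch for the random-forest training function above; the .npy file names are hypothetical and the labels are assumed to be one-hot encoded, matching the decoding step inside the function:

import numpy as np

# Hypothetical cached splits; y_* are assumed one-hot encoded.
X_train, y_train = np.load('X_train.npy'), np.load('y_train.npy')
X_valid, y_valid = np.load('X_valid.npy'), np.load('y_valid.npy')
X_test, y_test = np.load('X_test.npy'), np.load('y_test.npy')

clf, score, X_train_feat, X_test_feat = training_with_random_forest(
    X_train, y_train, X_valid, y_valid, X_test, y_test)
print('test accuracy:', score)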
def evaluate_classifier(featxs, datasets):

    posfeats, negfeats = feature_extraction(featxs, datasets)

    print('\ncross validation KNN')
    print(cross_validation(posfeats,
                           negfeats,
                           folds=5,
                           classifier='k_neighbors'))
def _get_histogram_for_feature(img, vocab, feature, bins):
    features = feature_extraction(img, feature)
    try:
        dist = pdist(vocab, features)
        min_dist_index = dist.argmin(axis=0)
        hist, _ = np.histogram(min_dist_index, bins=bins)
        return hist
    except Exception:
        hist, _ = np.histogram([], bins=bins)
        return hist
Example #11
def get_spatial_pyramid_feats(image_paths, max_level, feature):
    """
    This function assumes that 'vocab_hog.npy' (for HoG) or 'vocab_sift.npy' (for SIFT)
    exists and contains an N x feature vector length matrix 'vocab' where each row
    is a kmeans centroid or visual word. This matrix is saved to disk rather than passed
    in a parameter to avoid recomputing the vocabulary every run.

    :param image_paths: an array of N strings, where each string is an image path,
    :param max_level: level of pyramid,
    :param feature: name of image feature representation.

    :return: an N x d matrix, where d is the dimensionality of the
        feature representation. In this case, d will equal the number
        of clusters or equivalently the number of entries in each
        image's histogram ('vocab_size'), multiplied by
        (1 / 3) * (4 ^ (max_level + 1) - 1).
    """
    if feature == 'HoG':
        vocab = np.load('vocab_hog.npy')
    elif feature == 'SIFT':
        vocab = np.load('vocab_sift.npy')

    vocab_size = vocab.shape[0]

    # Your code here. You should also change the return value.
    imgs_n = int((1 / 3) * (4**(max_level + 1) - 1))
    sp = np.zeros((len(image_paths), vocab_size * imgs_n), dtype=float)
    n = 0

    for path in image_paths:
        img_origin = cv2.imread(path)
        height = img_origin.shape[0]
        width = img_origin.shape[1]
        idx = 0  # index of the current sub-image within this image
        # cut the image into a spatial pyramid of sub-images
        for l in range(0, max_level + 1):
            item_width = int(width / (2**l))
            item_height = int(height / (2**l))
            for i in range(0, 2**l):
                for j in range(0, 2**l):
                    subimg = img_origin[j * item_height:(j + 1) * item_height,
                                        i * item_width:(i + 1) * item_width, :]
                    features = feature_extraction(subimg, feature)
                    distances = []
                    for k in range(features.shape[0]):
                        distances = pdist(np.mat(features[k]), vocab)
                        indice = np.argsort(distances)[0, 0]
                        sp[n, idx * vocab_size +
                           indice] = sp[n, idx * vocab_size + indice] + 1
                    idx = idx + 1
        n = n + 1
    #sp = sp / sp.max(axis=0)

    return sp
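A quick arithmetic check of the descriptor size described above, assuming max_level = 2 and a 200-word vocabulary:

max_level = 2
vocab_size = 200
# pyramid cells per image: 1 + 4 + 16 = 21, i.e. (1/3) * (4 ** (max_level + 1) - 1)
cells = sum(4 ** l for l in range(max_level + 1))
assert cells == (4 ** (max_level + 1) - 1) // 3
d = vocab_size * cells  # each image yields a 4200-dimensional pyramid histogram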
def get_features_and_classification(filename):
    df = pd.read_csv("csv_files/" + filename,
                     header=None,
                     names=['reviewText', 'rating'])
    df = preprocess(df, nlp)
    df = construct_spacy_obj(df, nlp)

    features = feature_extraction(df, ft_model, nlp)
    result, _, __ = classify(df, features, model)

    return features, result
def evaluate_classifier(featxs, datasets):

    posfeats, negfeats = feature_extraction(featxs,
                                            datasets,
                                            punctuation=False)

    print('\ncross validation NB')
    print(cross_validation(posfeats,
                           negfeats,
                           folds=5,
                           classifier='naive_bayes'))
Example #14
def get_model(nlp, ft_model):

	if os.path.isfile('models/model.joblib'):
		print("Trained model found. Using them.")
		model = load('models/model.joblib')
		# tfidf = load('models/tfidf.joblib')

	else:
		print("Trained models not found. Training now!")

		train_data = pd.read_csv('csv_files/training.csv', header=None, names=['reviewText', 'rating'])
		train_data.dropna(inplace=True)
		train_data['reviewText'] = train_data['reviewText'].apply(lambda x: preprocess(x, nlp))
		train_data.dropna(inplace=True)
		train_data = construct_spacy_obj(train_data, nlp)

		features = feature_extraction(train_data, ft_model, nlp)

		single_aspect_reviews = get_sigle_aspect_reviews(train_data, features=features)
		single_aspect_reviews['reviewText'] = single_aspect_reviews['reviewText'].apply(lambda x: postprocess(x, nlp))

		X_train = single_aspect_reviews['reviewText']
		y_train = single_aspect_reviews['rating'].apply(lambda x: giveRating(x))

		final_lr = Pipeline([
			('tfidf', TfidfVectorizer(lowercase=False, min_df=0.00006, ngram_range=(1,3))),
			('lr', LogisticRegression(solver='lbfgs', max_iter=175))
		])

		# final_rf = Pipeline([
		# 	('tfidf', TfidfVectorizer(lowercase=False, min_df=0.00006, ngram_range=(1,3))),
		# 	('rf', RandomForestClassifier(n_estimators=100))
		# ])

		scores_final_lr = cross_validate(final_lr, X_train, y_train, scoring=scoring, cv=5)

		for scoring_measure, scores_arr in scores_final_lr.items():
			print(scoring_measure, ":\t%f (+/- %f)" % (scores_arr.mean(), scores_arr.std()*2))

		# scores_final_rf = cross_validate(final_rf, X_train, y_train, scoring=scoring, cv=5)

		# for scoring_measure, scores_arr in scores_final_rf.items():
		# 	print(scoring_measure, ":\t%f (+/- %f)" % (scores_arr.mean(), scores_arr.std()*2))

		final_lr.fit(X_train, y_train)
		# final_rf.fit(X_train, y_train)

		dump(final_lr, 'models/model.joblib')
		# dump(final_rf, 'models/model_rf.joblib')
		# dump(tfidf, 'tfidf.joblib')

		model = final_lr

	return model
def images_to_feature(df):
    'takes a dataframe and extracts a feature vector for each of its images'
    frame = []
    for i in df.Image:
        image = cv2.imread(data_path + i)
        frame.append(feature.feature_extraction(img=image))

    df['Feature'] = frame

    return df
Example #16
def main(paths):
    df_train = pd.read_csv(paths[0])
    df_test = pd.read_csv(paths[1])
    print('Read {}'.format(paths))
    df_train = df_train.set_index('PassengerId')
    df_test = df_test.set_index('PassengerId')
    df_train = fea_eng.feature_engineering(df_train)
    df_test = fea_eng.feature_engineering(df_test)
    df_train = fea_ext.feature_extraction(df_train)
    df_test = fea_ext.feature_extraction(df_test)
    df_test.Fare = df_test.Fare.fillna(df_test.Fare.median())

    df_train, df_test = fea_ext.process_age(df_train, df_test)
    drop_cols = [
        'Name', 'Ticket', 'Cabin', 'Age', 'Sex', 'Embarked', 'Title', 'Surname'
    ]
    df_train = df_train.drop(drop_cols, axis=1)
    df_test = df_test.drop(drop_cols, axis=1)

    # Save file, added "-processed" as suffix
    df_train.to_csv('./data/train-processed.csv')
    df_test.to_csv('./data/test-processed.csv')
Example #17
def make_random(queen_directory, noqueen_directory, n_chunks, mode):
    features_q = []
    queen_q = []
    for filename in os.listdir(queen_directory):
        if filename.endswith(".wav"):
            filepath = os.path.join(queen_directory, filename)
            out = ft.feature_extraction(filepath, n_chunks, mode)
            features_q.append(out)
            q = ft.queen_info(filepath)
            queen_q.append(q)

    features_q = np.asarray(features_q)
    queen_q = np.asarray(queen_q)

    queen_nq = []
    features_nq = []
    for filename in os.listdir(noqueen_directory):
        if filename.endswith(".wav"):
            filepath = os.path.join(noqueen_directory, filename)
            out = ft.feature_extraction(filepath, n_chunks, mode)
            features_nq.append(out)
            q = ft.queen_info(filepath)
            queen_nq.append(q)

    features_nq = np.asarray(features_nq)
    queen_nq = np.asarray(queen_nq)

    X_q_train, X_q_test, Y_q_train, Y_q_test = train_test_split(features_q,
                                                                queen_q,
                                                                test_size=0.3)
    X_nq_train, X_nq_test, Y_nq_train, Y_nq_test = train_test_split(
        features_nq, queen_nq, test_size=0.3)

    X_train = np.concatenate((X_q_train, X_nq_train))
    X_test = np.concatenate((X_q_test, X_nq_test))
    Y_train = np.concatenate((Y_q_train, Y_nq_train))
    Y_test = np.concatenate((Y_q_test, Y_nq_test))
    return X_train, X_test, Y_train, Y_test
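A usage sketch for make_random; the directory names, chunk count, and mode string below are assumptions about ft.feature_extraction's interface:

# Hedged usage sketch: 'data/queen', 'data/noqueen', n_chunks=10 and mode='mfcc' are hypothetical.
X_train, X_test, Y_train, Y_test = make_random('data/queen', 'data/noqueen',
                                               n_chunks=10, mode='mfcc')
print(X_train.shape, Y_train.shape)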
Example #18
def main():
    '''
    classifier_file = open('classifier.pickle','rb')
    classifier = pickle.load(classifier_file)
    classifier_file.close()
    '''
    classifier = learning()
    classifier_file = open('classifier.pickle', 'wb')
    pickle.dump(classifier, classifier_file)
    classifier_file.close()

    prob = classifier.prob_classify(
        feature_extraction('i hate this product, it is really bad'))
    label = prob.samples()
    print(prob.prob('5.0'))
Example #19
def extract_feature(m, names_list, speaker_list, pathfile):
    data = pd.DataFrame()
    meta = pd.DataFrame()
    for n in range(10):
        file_num = speaker_list+n
        filename_sig = pathfile.format(file_num)
        audiofile = pd.read_json(filename_sig, orient='split')
        audiofile = audiofile[audiofile.columns[3]].values

        test_feature, num_rows = feature_extraction(audiofile=audiofile)
        data = pd.concat([data, test_feature])
        meta_one = pd.DataFrame({'name': names_list.iloc[file_num]['name'],
                                 'gender': names_list.iloc[file_num]['gender'],
                                 'sample': names_list.iloc[file_num]['sample'],
                                 'labeled': m,
                                 'utterance': m * 10 + n},
                                index=[file_num])
        meta_one = meta_one.loc[meta_one.index.repeat(num_rows)]
        meta = pd.concat([meta, meta_one])

    return pd.concat([data.reset_index(), meta.reset_index()], axis=1, sort=False)
def load_train(path=TRAIN_PATH):

    files = []

    for f in os.listdir(path):
        if not os.path.isfile(os.path.join(path, f)):
            continue

        f_desc = f.split('_')

        if f_desc[2] == '0.mat':
            files.append(((path + f), 0))
        elif f_desc[2] == '1.mat':
            files.append(((path + f), 1))
        else:
            raise Exception('Invalid filename')

    n_examples = 0
    X = np.array([])
    y = []
    for f in files:
        filepath = f[0]
        cls = int(f[1])

        try:
            matdata = load_matdata(filepath)
        except Exception as e:
            logger.error("Ignoring corrupted file: {}".format(filepath))
            continue

        logger.debug("Extracting features from {}: {}".format(n_examples, filepath))

        x = feature_extraction(matdata)

        flat = x.flatten()
        X = np.hstack((X, flat))
        y.append(cls)

        n_feat = len(flat)
        n_examples = n_examples + 1

    logger.debug('{} examples loaded.'.format(n_examples))
    X = X.reshape(n_examples, n_feat)
    y = np.array(y)
    return X, y
Example #21
def scan_image_features(image_file, num_f=9, normalize_feature_matrix=False):
    """Scans image column-wise and returns vector of dimensions no_of_features x image width
    with features for each column (extracted by feature_extraction-function).
    Returns feature matrix for each image column, with option for matrix being normalized."""
    no_of_features = num_f
    img = Image.open(image_file)
    img = img.convert("1")
    img_array = np.array(img)
    img_height = img_array.shape[0]
    img_width = img_array.shape[1]
    #    no_of_features = len( fe.feature_extraction(img_array[:, 1].reshape(img_height, 1), num_f) )  # get number of features assessed by feature_extraction-function by extracting features of one colum
    feature_matrix = np.zeros(shape=(no_of_features, img_width))
    #print("shape of feature matrix : ", feature_matrix.shape)
    for column in range(img_width):
        col = img_array[:, column].reshape(img_height, 1)
        col_features = fe.feature_extraction(col, no_of_features)
        feature_matrix[:, column] = col_features
    if normalize_feature_matrix:
        feature_matrix = fe.normalization(feature_matrix, no_of_features)
    return feature_matrix
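A hedged usage sketch for scan_image_features; the image file name is hypothetical, and the fe module used above is assumed to be importable:

# 'word.png' is a placeholder for a binarized word image.
feature_matrix = scan_image_features('word.png', num_f=9, normalize_feature_matrix=True)
print(feature_matrix.shape)  # (9, image_width): one 9-dimensional feature vector per column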
Example #22
def get_bags_of_words(image_paths, feature):
    """
    This function assumes that 'vocab_hog.npy' (for HoG) or 'vocab_sift.npy' (for SIFT)
    exists and contains an N x feature-vector-length matrix 'vocab' where each row
    is a k-means centroid or visual word. This matrix is saved to disk rather than
    passed in as a parameter to avoid recomputing the vocabulary every run.

    :param image_paths: an array of N strings, where each string is an image path
    :param feature: name of image feature representation.

    :return: an N x d matrix, where d is the dimensionality of the
        feature representation. In this case, d will equal the number
        of clusters or equivalently the number of entries in each
        image's histogram ('vocab_size') below.
    """
    if feature == 'HoG':
        vocab = np.load('vocab_hog.npy')
    elif feature == 'SIFT':
        vocab = np.load('vocab_sift.npy')

    vocab_size = vocab.shape[0]
    N = len(image_paths)
    hist = np.zeros((N, vocab_size))
    # Your code here. You should also change the return value.
    k = 0
    for path in image_paths:
        img = cv2.imread(path)[:, :, ::-1]

        # get features of image
        features = feature_extraction(img, feature)

        # get distance of features and codevectors, then get histogram
        d = pdist(features, vocab)
        lab = np.argmin(d, axis=1)
        for l in lab:
            hist[k, l] += 1
        hist[k, :] = hist[k, :] / len(lab)
        k += 1

    return hist
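A minimal usage sketch for get_bags_of_words, assuming 'vocab_hog.npy' already exists (e.g. written by build_vocabulary earlier) and using a hypothetical test-image glob:

import glob

test_image_paths = sorted(glob.glob('data/test/*.jpg'))
test_hists = get_bags_of_words(test_image_paths, feature='HoG')
print(test_hists.shape)  # (len(test_image_paths), vocab_size)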
def get_bags_of_words(image_paths, feature):
    """
    This function assumes that 'vocab_hog.npy' (for HoG) or 'vocab_sift.npy' (for SIFT)
    exists and contains an N x feature-vector-length matrix 'vocab' where each row
    is a k-means centroid or visual word. This matrix is saved to disk rather than
    passed in as a parameter to avoid recomputing the vocabulary every run.

    :param image_paths: an array of N strings, where each string is an image path
    :param feature: name of image feature representation.

    :return: an N x d matrix, where d is the dimensionality of the
        feature representation. In this case, d will equal the number
        of clusters or equivalently the number of entries in each
        image's histogram ('vocab_size') below.
    """
    if feature == 'HoG':
        vocab = np.load('vocab_hog.npy')
    elif feature == 'SIFT':
        vocab = np.load('vocab_sift.npy')

    vocab_size = vocab.shape[0]
    ft_size = vocab.shape[1]

    output_mat = np.zeros((image_paths.shape[0], vocab_size))

    # # Your code here. You should also change the return value.
    i = 0
    for path in image_paths:
        # process one image (convert BGR -> RGB)
        img = cv2.imread(path)[:, :, ::-1]
        # features has shape (total_features, feature_vector_length)
        features = feature_extraction(img, feature)
        distance_mat = pdist(features, vocab)
        for vec in distance_mat:
            index = np.argmin(vec)
            output_mat[i][index] += 1
        output_mat[i] = output_mat[i] / linalg.norm(output_mat[i])
        i = i + 1

    return output_mat
Example #24
    def __init__(self,
                 hyper_parameters,  # dictionary with all hyper-parameters
                 E_w,  # pre-processed word embedding matrix
                 ):

        # reset graph
        tf.reset_default_graph()

        ############################
        # setup for hyper-parameters
        ############################
        self.hyper_parameters = hyper_parameters

        #########################
        # initialize placeholders
        #########################
        self.placeholders, self.thetas_E = initialize_placeholders(E_w, hyper_parameters)

        ####################
        # feature extraction
        ####################
        self.thetas_feature, self.features = feature_extraction(self.placeholders, self.hyper_parameters)

        ###################################################
        # metric learning (and adversarial domain adaption)
        ###################################################
        self.thetas_metric, self.out = metric_learning(self.placeholders, self.hyper_parameters, self.features)

        ##########
        # training
        ##########
        thetas = (self.thetas_feature, self.thetas_E, self.thetas_metric)
        self.training, self.distance = loss_optimizer(self.placeholders, self.hyper_parameters, self.out, thetas)

        ################
        # launch session
        ################
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
def get_bags_of_words(image_paths, feature):
    """
    This function assumes that 'vocab_hog.npy' (for HoG) or 'vocab_sift.npy' (for SIFT)
    exists and contains an N x feature-vector-length matrix 'vocab' where each row
    is a k-means centroid or visual word. This matrix is saved to disk rather than
    passed in as a parameter to avoid recomputing the vocabulary every run.

    :param image_paths: an array of N strings, where each string is an image path
    :param feature: name of image feature representation.

    :return: an N x d matrix, where d is the dimensionality of the
        feature representation. In this case, d will equal the number
        of clusters or equivalently the number of entries in each
        image's histogram ('vocab_size') below.
    """
    if feature == 'HoG':
        vocab = np.load('vocab_hog.npy')
    elif feature == 'SIFT':
        vocab = np.load('vocab_sift.npy')

    vocab_size = vocab.shape[0]

    # Your code here. You should also change the return value.
    bag_of_words = np.zeros((len(image_paths), vocab_size))
    n = 0  # index of the current image
    for path in image_paths:
        img = cv2.imread(path)[:, :, ::-1]
        features = feature_extraction(img, feature)
        if features is not None:
            distances = []
            for i in range(features.shape[0]):
                distances = pdist(np.mat(features[i]),
                                  vocab)  # size of distance = (1,vocabsize)
                indice = np.argsort(distances)[0, 0]
                bag_of_words[n, indice] = bag_of_words[n, indice] + 1
        print(n)
        n = n + 1
    #bag_of_words = bag_of_words / bag_of_words.max(axis=0)
    return bag_of_words
Example #26
def get_bags_of_words(image_paths, feature):
    """
    This function assumes that 'vocab_hog.npy' (for HoG) or 'vocab_sift.npy' (for SIFT)
    exists and contains an N x feature-vector-length matrix 'vocab' where each row
    is a k-means centroid or visual word. This matrix is saved to disk rather than
    passed in as a parameter to avoid recomputing the vocabulary every run.

    :param image_paths: an array of N strings, where each string is an image path
    :param feature: name of image feature representation.

    :return: an N x d matrix, where d is the dimensionality of the
        feature representation. In this case, d will equal the number
        of clusters or equivalently the number of entries in each
        image's histogram ('vocab_size') below.
    """
    if feature == 'HoG':
        vocab = np.load('vocab_hog.npy')
    elif feature == 'SIFT':
        vocab = np.load('vocab_sift.npy')

    vocab_size = vocab.shape[0]
    bins = range(-1,vocab_size)

    # Your code here. You should also change the return value.
    all_histograms = np.empty((0, vocab_size))

    for path in image_paths:
        img = cv2.imread(path)[:, :, ::-1]  # read the image (BGR -> RGB)

        features = feature_extraction(img, feature)  # extract features from the image
        dist = pdist(vocab, features)
        min_dist_index = dist.argmin(axis=0)
        hist, _ = np.histogram(min_dist_index, bins=bins, density=True)
        #bins to check
        #plt.plot(bins[1:], hist)
        #plt.show()
        all_histograms = np.vstack((all_histograms, hist))

    return all_histograms
Example #27
def diagnosis(diagnosis_samples, model_file_path):
    '''
    Fault diagnosis.
    :param diagnosis_samples: data samples
    :param model_file_path: path to the model file
    :return: pred_result: the diagnosis result
    '''
    suffix = model_file_path.split('/')[-1].split('.')[-1]  # get the file extension of the selected model
    if 'm' == suffix:  # an '.m' suffix indicates a random forest model
        # extract features
        loader = np.empty(shape=[diagnosis_samples.shape[0], 16])
        for i in range(diagnosis_samples.shape[0]):
            loader[i] = feature_extraction(diagnosis_samples[i])
        diagnosis_samples_feature_extraction = loader

        # load the model
        model = joblib.load(model_file_path)
        # run the diagnosis with the model
        y_preds = model.predict(diagnosis_samples_feature_extraction)
    else:
        diagnosis_samples_new = diagnosis_samples[:, :, np.newaxis]  # add a new axis
        # load the model --- it has to be loaded this way, otherwise some models fail to load (reason unknown)
        with CustomObjectScope({'GlorotUniform': glorot_uniform()}):
            model = load_model(model_file_path)
        # CNN models and LSTM/GRU models expect different input shapes, so catch the exception and retry with the other axis order if the shape above is wrong
        try:
            y_preds = model.predict_classes(diagnosis_samples_new)
        except ValueError:
            diagnosis_samples_new = diagnosis_samples[:, np.newaxis, :]  # add a new axis
            y_preds = model.predict_classes(diagnosis_samples_new)

    y_preds = list(y_preds)
    # use the most frequent prediction among these samples as the final result
    y_pred = max(y_preds, key=y_preds.count)
    pred_result = result_decode(y_pred)

    return pred_result
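A hedged usage sketch for diagnosis; both paths are hypothetical, and the '.m' suffix routes to the random-forest branch while any other suffix goes through the Keras branch:

import numpy as np

# Hypothetical input: a batch of raw signal segments, shape (n_samples, signal_len).
samples = np.load('data/diagnosis_segments.npy')
print(diagnosis(samples, 'models/random_forest.m'))  # majority-vote result over the batch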
X_train = []
Y_train = []

X_test = []
Y_test = []

sr = 22050
frame_len = 1000
hop_len = 80

for _dir in train_male_dirs:
    print('extracting features from => {}'.format(_dir))

    features = feature_extraction(_dir,
                                  sr=sr,
                                  mono=True,
                                  frame_len=frame_len,
                                  hop_len=hop_len)

    # print("features shape => ", features.shape)
    # print("features => ", features)

    # exit()

    X_train.append(features)
    Y_train.append(1)

    # break

for _dir in train_female_dirs:
    print('extracting features from => {}'.format(_dir))
Example #29
from predict import predict
from utils import prepare_table
from get_aggrs import get_aggrs
from feature_extraction import feature_extraction
from get_real_performance import get_real_performance
from update_fault_rate_month import update_fault_rate_month
from update_fault_number_month import update_fault_number_month

# decide which model to use for prediction
method_type = 'random forest'
# method_type = 'decision tree'
# method_type = 'knn'

# the database is updated every diff days
diff = 2

# the performance-evaluation window is window days
window = 1

# prepare_table()
update_fault_number_month()
update_fault_rate_month()
feature_extraction()
predict(method_type)
get_real_performance(method_type, diff, window)
get_aggrs()
Example #30
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
from scipy.optimize import minimize
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import roc_curve, auc
import seaborn as sns

torch.manual_seed(1)
np.random.seed(0)

# dataset: 0 = SF data only, 1 = SF + LA data, 2 = SF + SJ data, 3 = All of CA

X, X_encode, X_train, y_train, X_val, y_val, X_test, y_test, n = \
    feature_extraction(dataset=0, onehot_option=False, smote_option=True)
    
    
#%% Categorical embedding for categorical columns having more than two values

# Choosing columns for embedding
embedded_cols = {n: len(col.cat.categories) for n,col in X.loc[:,X_encode==1].items() if len(col.cat.categories) > 2}
embedded_col_names = embedded_cols.keys()

# Determining size of embedding
# (borrowed from https://www.usfca.edu/data-institute/certificates/fundamentals-deep-learning lesson 2)
embedding_sizes = [(n_categories, min(50, (n_categories+1)//2)) for _,n_categories in embedded_cols.items()]

#%% Implement neural net
# Code copied from: https://jovian.ai/aakanksha-ns/shelter-outcome
Example #31
N = 10
num_train = 100000
num_test = 10000
epochs = 1
batch_size = 256

load = sio.loadmat('data/Train_data_%d_%d.mat' % (N, num_train))
loadTest = sio.loadmat('data/Test_data_%d_%d.mat' % (N, num_test))
Htrain = load['Xtrain']
Ptrain = load['Ytrain']
H_test = loadTest['X']
P_test = loadTest['Y']
timeW = loadTest['swmmsetime']
swmmsetime = timeW[0, 0]

H_ii, H_ij, H_ij_T, D = feature_extraction(Htrain,num_train,N)

weights = {
    'w_c_1': tf.Variable(tf.random_normal([3, 3, 3, 32], stddev=0.1)),
    'w_c_2': tf.Variable(tf.random_normal([3, 3, 32, 32], stddev=0.1)),
    'w_c_3': tf.Variable(tf.random_normal([3, 3, 32, 16], stddev=0.1)),
    'w_c_4': tf.Variable(tf.random_normal([3, 3, 16, 6], stddev=0.1)),

    'w_fc_1': tf.Variable(tf.random_normal([12, 40], stddev=0.1)),
    'w_fc_2': tf.Variable(tf.random_normal([40, 20], stddev=0.1)),
    'w_fc_3': tf.Variable(tf.random_normal([20, 1])),
}

biases = {
    'b_c_1': tf.Variable(tf.random_normal([32], stddev=0.1)),
    'b_c_2': tf.Variable(tf.random_normal([32], stddev=0.1)),
Example #32
def get_feature_(r):
    import feature_extraction
    f = feature_extraction.feature_extraction(r)
    return f.combine_all()