def testNetwork(self, data):
    converted_json = self.convertJsonToList(data)
    featuresObject = Features(converted_json)
    featuresObject.processFeatures()
    input_arr = featuresObject.getFeatures()
    print("in testNetwork")
    print(input_arr)
    # predict() returns an array; take the first prediction and map the
    # class code point back to a character.
    return chr(int(self.clf.predict(input_arr)[0]))
def addTrainingSetEntry(self, data, target, convert):
    if convert:
        data = self.convertJsonToList(data)  # was a bare convertJsonToList(); it is a method
    featuresObject = Features(data)
    featuresObject.processFeatures()
    feature_vec = featuresObject.getFeatures()  # renamed from `input`, which shadows the builtin
    # Append the feature vector plus the target character's code point as one CSV row.
    with open('training_data.csv', 'a') as outfile:
        outfile.write(','.join(map(str, feature_vec)))
        outfile.write(',')
        outfile.write(str(ord(target)))
        outfile.write('\n')
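# Hypothetical usage sketch (not from the original source): how the two
# methods above might be driven together. `NetworkClassifier`, the JSON file
# name, and the target character are assumptions for illustration only.
net = NetworkClassifier()
with open('sample_gesture.json') as f:
    payload = f.read()
net.addTrainingSetEntry(payload, target='a', convert=True)  # append one labelled row
print(net.testNetwork(payload))                             # predict a character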
def test_bugs_reported_prior(self):
    objs = [{'creation_ts': 86400, 'something': 1},            # t+1
            {'creation_ts': (2 * 86400) - 1, 'something': 1},  # t+1
            {'creation_ts': 2 * 86400, 'something': 1},        # t+2
            {'creation_ts': 3 * 86400, 'something': 1}]        # t+3
    f = Features()
    f.vec = DictVectorizer()
    f.matrix = f.vec.fit_transform(objs).toarray()
    res = f.bugs_within(f.bugs_between, 1)
    expected = np.array([0, 1, 1, 0])
    self.assertTrue((res == expected).all())
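# Minimal sketch (an assumption, not the project's implementation) of the
# counting semantics that makes the test above pass: for each bug, count the
# bugs whose creation_ts falls strictly inside the preceding `days`-day window.
import numpy as np

def bugs_between(timestamps, days):
    ts = np.asarray(timestamps)
    window = days * 86400
    return np.array([((ts > t - window) & (ts < t)).sum() for t in ts])

print(bugs_between([86400, 2 * 86400 - 1, 2 * 86400, 3 * 86400], 1))  # [0 1 1 0]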
def prepare_test_data():
    test_data = []
    log.write("Open test set")
    with open(TESTING_FILE, "r") as csv_file:
        csv_data = csv.DictReader(csv_file)
        for row in csv_data:
            test_data.append(row)
    log.write("Preprocess test set")
    for data in test_data:
        data["preprocessed_kalimat"] = Preprocess(data).preprocess()
    log.write("Extract features from test set")
    feature = Features(test_data)
    feature.extract_feature()
    # feature.get_trainable_dataset()
    return test_data
def initialModelTraining(user_email, data_sources, stress_model):
    # First model initialisation, based on the first 14 days of data.
    from_time = 0  # from the very beginning of data collection
    data = grpc_handler.grpc_load_user_data(
        from_ts=from_time,
        uid=user_email,
        data_sources=data_sources,
        data_src_for_sleep_detection=Features.SCREEN_ON_OFF)
    features = Features(uid=user_email, dataset=data)
    df = pd.DataFrame(features.extract_for_after_survey())

    # Preprocess and persist the result.
    df_preprocessed = stress_model.preprocessing(df)
    with open('data_result/' + str(user_email) + "_features.p", 'wb') as file:
        pickle.dump(df_preprocessed, file)

    # Normalize.
    norm_df = stress_model.normalizing("default", df_preprocessed, None)

    # Initialize the model.
    stress_model.initModel(norm_df)
def main():
    log.write("Open dataset")
    dataset = open_dataset([TRAINING_DIR + file_name for file_name in TRAINING_FILES])
    merged_dataset = []

    log.write("Resolve disagreement data")
    for k, v in dataset.items():
        if k in (TRAINING_DIR + TRAINING_FILES[2], TRAINING_DIR + TRAINING_FILES[4]):
            dataset[k] = disagreement_handling(v)
        merged_dataset += dataset[k]
    analyze_data(merged_dataset)

    log.write("Analyzing sense")
    sense_id = set()
    for datum in merged_dataset:
        sense_id.add(datum["sense"])
    xml_root = ET.parse(SENSE_FILES).getroot()
    for word in xml_root:
        for sense in word.findall("senses/sense"):
            if word.attrib["wid"].zfill(2) + sense.attrib["sid"].zfill(2) not in sense_id:
                log.write("Word `{}` with sense `{}` was not found in the training data"
                          .format(word[0].text, sense.attrib))

    log.write("Preprocessing")
    for data in merged_dataset:
        data["preprocessed_kalimat"] = Preprocess(data).preprocess()
    print(merged_dataset[0])

    log.write("Feature extraction")
    feature = Features(merged_dataset)
    feature.extract_feature()
    # feature.get_trainable_dataset()

    with open("feature.csv", "w") as csv_file:
        csv_writer = csv.writer(csv_file)
        # csv_writer.writerow(["kalimat_id", "sense", "features"])
        for data in merged_dataset:
            if "data_embedding" in data:
                # The "\ufeff" prefix keeps the BOM that csv.DictReader leaves
                # in the first header of the training CSV.
                csv_writer.writerow(
                    [data["\ufeffkalimat_id"], data["kata"], data["sense"]]
                    + list(data["data_embedding"]))

    log.write("Build Dataset")
    word_feature_mat, dummy_train, dummy_test = build_dataset(merged_dataset)
    classifier = {
        "Random Forest": RandomForestClassifier(n_estimators=1000),
        "SVM": SVC(C=10000, gamma=0.1, tol=1e-6, decision_function_shape='ovo'),
        "Neural Net": MLPClassifier(hidden_layer_sizes=2000,
                                    activation='tanh',
                                    solver='adam',
                                    tol=1e-6,
                                    learning_rate_init=0.001,
                                    max_iter=1000,
                                    early_stopping=True)
    }
    best_model = None
    best_acc = 0.0000001
    test_data = prepare_test_data()
    for model_name, model_class in classifier.items():
        log.write("Try {} :".format(model_name))
        true_count = 0
        n_data = 0
        model = model_class
        ansfile = "answers/{}_{}.csv".format(model_name, int(time.time()))
        for word in sorted(list(word_feature_mat.keys())):
            print("predicting {}".format(word))
            # Score the model on the held-out split for this word.
            model.fit(dummy_train[word][0], dummy_train[word][1])
            prediction = model.predict(dummy_test[word][0])
            n_data += len(prediction)
            for pred, true in zip(prediction, dummy_test[word][1]):
                if pred == true:
                    true_count += 1
            # Refit on all data for this word before answering the test set.
            model = model.fit(word_feature_mat[word][0], word_feature_mat[word][1])
            actual_test(test_data, model, word, ansfile)
        accuracy = 100 * true_count / n_data
        # if accuracy > best_acc:
        #     best_model = model_class
        log.write("Accuracy of {}: {} %".format(model_name, accuracy))
]
test_dataframe = pd.read_csv('MealNoMealData/mealData3.csv', names=columns)
# print(test_dataframe)
# Keep only rows with at least 4 non-NA values. (The original looped over the
# rows calling dropna() without assigning the result, which was a no-op.)
test_dataframe = test_dataframe.dropna(thresh=4, axis=0)
print("test_data")
# print(test_dataframe)
test_dataframe = test_dataframe.interpolate(method='linear',
                                            limit_direction='backward')
print(test_dataframe)
# test_dataframe = test_dataframe.dropna()
# print(test_dataframe)

s = DataSetFormation()
f = Features(4)
data = f.completefeatures(test_dataframe)
data = s.normalizeData(data)
# data = s.applyPCA(data, 3)
data["Label"] = 1
print(data)

column = [
    'fft1', 'fft2', 'fft3', 'fft4', 'velocity1', 'velocity2', 'velocity3',
    'velocity4', 'rolling1', 'rolling2', 'rolling3', 'rolling4', 'dwt1',
    'dwt2', 'dwt3', 'dwt4'
]
column_p = ['pc1', 'pc2', 'pc3']
column_v = ['velocity1', 'velocity2', 'rolling2', 'rolling1']
value = loaded_model.predict(data[column_v])
print(value)
result = loaded_model.score(data[column_v], data['Label'])
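# `loaded_model` is not defined in this snippet. A plausible sketch (an
# assumption, based on the pickling used elsewhere in this codebase) is that
# it was deserialised from a trained-model file beforehand:
import pickle

with open('trained_model.pkl', 'rb') as model_file:  # hypothetical filename
    loaded_model = pickle.load(model_file)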
    'fft1', 'fft2', 'fft3', 'fft4', 'velocity1', 'velocity2', 'velocity3',
    'velocity4', 'rolling1', 'rolling2', 'dwt1', 'dwt2', 'dwt3', 'dwt4'
]
data = pd.DataFrame(extracted_features, columns=columns)
data = data.dropna()
print(data.head())
# Standardise the feature matrix, then restore the column labels.
data = StandardScaler().fit_transform(data.values)
data = pd.DataFrame(data, columns=columns)
return data


s = DataSetFormation()
s.read_csv()
s.createFeatureMatrixCGM()
mealFeatures = Features(4)
s.mealDataFrame.to_csv("myMealData.csv")
noMealFeatures = Features(4)
s.noMealDataFrame.to_csv("myNoMealData.csv")
finalMealDataFrame = pd.read_csv("myMealData.csv")
finalNoMealDataFrame = pd.read_csv("myNoMealData.csv")

meal = mealFeatures.completefeatures(finalMealDataFrame)
print(meal)
print("Final Meal DataSet")
mealPrincipalComponentDataFrame = s.normalizeData(meal)

nomeal = noMealFeatures.completefeatures(finalNoMealDataFrame)
print(nomeal)
print("Here", mealPrincipalComponentDataFrame)
mealPrincipalComponentDataFrame['Label'] = 1
print("Final NoMeal DataSet")
noMealPrincipalComponentDataFrame = s.normalizeData(nomeal)
    'rolling4', 'expwindow1', 'expwindow2', 'expwindow3', 'expwindow4',
    'dwt1', 'dwt2', 'dwt3', 'dwt4'
]
data = pd.DataFrame(extracted_features, columns=columns)
data = data.dropna()
print(data.head())
data = MinMaxScaler().fit_transform(data.values)
data = pd.DataFrame(data, columns=columns)
self.applyPCA(data, 5, person, 'PCA')


print("""----------------------------------------|
|          Enter a Person Number          |
|                                         |
|-----------------------------------------|""")
n = input()
directoryPath = os.getcwd()
access_right = 0o777
try:
    if not os.path.isdir(directoryPath + '/Person' + str(n)):
        os.mkdir(directoryPath + '/Person' + str(n), access_right)
except OSError:
    print('Directory not created')

s = DataSetFormation(int(n))
s.plotCGMData(int(n))
b = Features(4, s.CGMData)
final_extracted_feature_matrix = b.completefeatures(int(n))
df = pd.DataFrame(final_extracted_feature_matrix)
df.to_csv('FeaturesExtracted.csv')
s.normalizeData(final_extracted_feature_matrix, n)
def prediction_task(i):
    global grpc_handler
    print("Prediction task for {} is running... ".format(prediction_times[i]))
    grpc_handler = GrpcHandler('165.246.21.202:50051', manager_id,
                               manager_email, campaign_id)
    now_time = int(datetime.datetime.now().timestamp()) * 1000
    from_time = now_time - (4 * 3600 * 1000)  # from 4 hours before now
    users_info = grpc_handler.grpc_load_user_emails()
    ema_order = i + 1
    data_sources = grpc_handler.grpc_get_data_sources_info()

    for user_email, id_day in users_info.items():
        user_id = id_day['uid']
        day_num = id_day['dayNum']
        sm = StressModel(uid=user_email, dayNo=day_num, emaNo=ema_order)

        # 0. Check the user's day number: only extract features and use the
        #    model once more than `survey_duration` (14) days of data exist.
        if day_num > survey_duration:
            # On the first day and first EMA order after the 14 days,
            # train the initial model.
            if day_num == survey_duration + 1 and ema_order == 1:
                initialModelTraining(user_email, data_sources, sm)
            else:
                # 1. Retrieve all user data between from_time and now_time
                #    from the gRPC server.
                data = grpc_handler.grpc_load_user_data(
                    from_ts=from_time,
                    uid=user_email,
                    data_sources=data_sources,
                    data_src_for_sleep_detection=Features.SCREEN_ON_OFF)

                # 2. Extract features from the retrieved data.
                with open('data_result/' + str(user_email) + "_features.p",
                          'rb') as file:
                    step1_preprocessed = pickle.load(file)
                features = Features(uid=user_email, dataset=data)
                df = pd.DataFrame(
                    features.extract_regular(start_ts=from_time,
                                             end_ts=now_time,
                                             ema_order=ema_order))

                # 3. Pre-process and normalize the extracted features.
                new_row_preprocessed = sm.preprocessing(df)
                norm_df = sm.normalizing("new", step1_preprocessed,
                                         new_row_preprocessed)

                # 4. Get the test row for this day/EMA order and load the
                #    trained model.
                new_row_for_test = norm_df[(norm_df['Day'] == day_num)
                                           & (norm_df['EMA order'] == ema_order)]
                with open('model_result/' + str(user_email) + "_model.p",
                          'rb') as file:
                    initModel = pickle.load(file)

                # 5. Make a prediction from the current features with that model.
                features = StressModel.feature_df_with_state['features'].values
                y_pred = initModel.predict(new_row_for_test[features])
                new_row_preprocessed['Stress_label'] = y_pred  # was misspelled 'Sterss_label'

                # 6. Save the current features with the predicted label:
                #    append the new pre-processed row to the stored data.
                update_df = pd.concat([
                    step1_preprocessed.reset_index(drop=True),
                    new_row_preprocessed.reset_index(drop=True)
                ])
                with open('data_result/' + str(user_email) + "_features.p",
                          'wb') as file:
                    pickle.dump(update_df, file)

                # 7. Save the prediction in the DB and return it to the gRPC
                #    server with the "STRESS_PREDICTION" data source and a
                #    "day_num ema_order prediction_value" value.
                user_all_labels = list(set(step1_preprocessed['Stress_label']))
                # getSHAP saves its results in the ModelResult table in the DB.
                model_results = list(
                    sm.getSHAP(user_all_labels, y_pred, new_row_for_test,
                               initModel))

                # Construct a message from the model results and return it to
                # the gRPC server so the user can see the prediction.
                result_data = {}
                for model_result in model_results:
                    result_data[model_result.prediction_result] = {
                        "day_num": model_result.day_num,
                        "ema_order": model_result.ema_order,
                        "accuracy": model_result.accuracy,
                        "feature_ids": model_result.feature_ids
                    }
                grpc_handler.grpc_send_user_data(
                    user_id, user_email, data_sources['STRESS_PREDICTION'],
                    now_time, result_data)

                # 8. Check the 'SELF_STRESS_REPORT' data source: if the user
                #    submitted a self report, update the stored pre-processed
                #    features with the reported stress label and retrain the
                #    model if needed.
                check_and_handle_self_report(user_email, data, sm)
    grpc_handler.grpc_close()
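# Hypothetical scheduling sketch (not in the original source): one way
# prediction_task could be run at each entry of `prediction_times`, assuming
# they are "HH:MM" strings and using the third-party `schedule` library.
import schedule
import time

for idx, at_time in enumerate(prediction_times):
    schedule.every().day.at(at_time).do(prediction_task, idx)

while True:
    schedule.run_pending()
    time.sleep(60)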
# print(final_label_df)
self.finalDataFrame = pd.concat([final_df_features, final_label_df], axis=1)
df.to_csv("New.csv")
self.finalDataFrame.to_csv("FinalDataFrame.csv")
# db_default = DBSCAN(eps=0.375, min_samples=5).fit(finalPCADataFrame)
# create a matrix to check if the vl


s = DataSetFormation()
s.read_csv()
s.createFeatureMatrixCGM()
# Create a ground-truth table of 6 clusters:
# 0, >0 to 20, 21 to 40, 41 to 60, 61 to 80, 81 to 100.
s.createGroundTruth()
mealFeatures = Features(4)
features = s.getFeatures()
mealPrincipalComponentDataFrame = s.normalizeData(features)
print(len(mealPrincipalComponentDataFrame))
s.createDBSCANClusterFromFeatures(mealPrincipalComponentDataFrame)
# s.createDBSCANClusterFromFeaturesMax()
# s.SSEMetrics()
# X_train, X_test, y_train, y_test = train_test_split(
#     self.mealPrincipalComponentDataFrame, self.carbIntakeDataFrame,
#     test_size=0.33, random_state=42)
# print(X_train)
# s.createKMeansCluster(mealPrincipalComponentDataFrame)
print(len(mealPrincipalComponentDataFrame))
# s.plotPointCluster()
s.calculateAccuracy()
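# Minimal sketch (an assumption, not DataSetFormation's actual code) of what
# createDBSCANClusterFromFeatures might do, based on the eps/min_samples
# values in the commented-out DBSCAN line above.
from sklearn.cluster import DBSCAN

def create_dbscan_cluster(feature_df):
    # Fit DBSCAN on the normalized feature matrix; labels_ holds one cluster
    # id per row, with -1 marking noise points.
    db = DBSCAN(eps=0.375, min_samples=5).fit(feature_df)
    return db.labels_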
def __call__(self, parser, namespace, values, option_string=None):
    from feature_extraction import Features
    Features.header(sys.stdout)
    parser.exit()
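# Hypothetical registration sketch (the class and flag names are assumptions):
# the __call__ above belongs to a custom argparse.Action that prints the
# feature header and exits, wired up along these lines:
parser.add_argument('--header', nargs=0, action=HeaderAction,
                    help='print the feature CSV header and exit')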