def processing(self):
    if self.abafile and self.syncfile and self.segfile and self.poifile:
        self.ppdata = pre_processing(self.abafile, self.syncfile, self.segfile,
                                     self.poifile, None)
        self.savefileButton.setVisible(True)
        self.pprocessButton.setVisible(False)
        self.abaEdit.setText("ABA file")
        self.syncEdit.setText("SYNC file")
        self.segEdit.setText("SEG file")
        self.poiEdit.setText("POI file")
    else:
        msg = QMessageBox()
        msg.setIcon(QMessageBox.Critical)
        msg.setText("File Error")
        msg.setInformativeText('Please load all the required files...')
        msg.setWindowTitle("File missing!")
        msg.exec_()
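# --- Sketch, not from the original file: processing() is written as a Qt slot,
# so the widget's __init__ presumably connects it to the pre-process button
# along the lines below (the widget names come from the method above; the
# actual wiring is an assumption):
#
# self.pprocessButton.clicked.connect(self.processing)
# self.savefileButton.setVisible(False)  # revealed once pre-processing succeeds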
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import csv

import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.slim.nets as nets

import data_processing

CHECK_POINT_PATH = 'check_point/train_model.ckpt-2'
NUM_KAGGLE_TEST = 12500
BATCH_SIZE = 50

batch_test_set = data_processing.pre_processing(data_set='test', batch_size=BATCH_SIZE)

# Kaggle submission file: one (id, label) row per test image.
prediction_file = open('kaggle_result_file.csv', 'wb')
prediction_file_object = csv.writer(prediction_file)
prediction_file_object.writerow(['id', 'label'])

with tf.Graph().as_default():
    images = tf.placeholder(tf.float32, [BATCH_SIZE, 224, 224, 3])
    keep_prob = tf.placeholder(tf.float32)
    logits, _ = nets.vgg.vgg_19(inputs=images, num_classes=2,
                                dropout_keep_prob=keep_prob, is_training=False)
    variables_to_restore = slim.get_variables_to_restore()
    restorer = tf.train.Saver(variables_to_restore)
    # Class probabilities for each image in the batch.
    pros = tf.nn.softmax(logits)
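    # --- Sketch, not from the original file: the snippet stops right after the
    # softmax op, so the restore-and-predict loop below is an assumed
    # continuation. It presumes batch_test_set yields image batches in
    # submission order (an assumed batching interface) and that softmax
    # column 1 is the positive class.
    with tf.Session() as sess:
        restorer.restore(sess, CHECK_POINT_PATH)
        for i in range(NUM_KAGGLE_TEST // BATCH_SIZE):
            batch_images = batch_test_set[i]  # assumed batching interface
            probs = sess.run(pros, feed_dict={images: batch_images,
                                              keep_prob: 1.0})
            for j, prob in enumerate(probs):
                # Kaggle ids for this data set are 1-based.
                prediction_file_object.writerow([i * BATCH_SIZE + j + 1, prob[1]])

prediction_file.close()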
from data_processing import pre_processing, _window_logs, get_session_attributes
from parsing import parse_log_file
from detection_rules import (
    has_robots_txt_request,
    has_bot_name_in_user_agent,
    has_high_number_requests,
    has_low_request_interrarival_time,
)
import numpy as np
import datetime as dt

# Processing the input
print("Starting app...")
parsed_logs = parse_log_file("./access.log", from_date=dt.date(2014, 2, 20))
requests = pre_processing(parsed_logs)

# Generating features
hourly_windowed_logs = _window_logs(requests, window_time_frame="hour")
daily_windowed_logs = _window_logs(requests, window_time_frame="day")
hour_sessions = get_session_attributes(requests, aggregation_level="hour")
day_sessions = get_session_attributes(requests, aggregation_level="day")

# Examine generated features to flag potential bots
robots_txt = has_robots_txt_request(requests)
bot_names = has_bot_name_in_user_agent(requests)
low_inter_request_time_hour = has_low_request_interrarival_time(
    hour_sessions, threshold_number_requests_no_referrer=100)
low_inter_request_time_day = has_low_request_interrarival_time(
    day_sessions, threshold_number_requests_no_referrer=1000)
# The source is truncated inside the next call; the keyword name and threshold
# below are assumed for illustration, not taken from the original.
high_number_requests_hour = has_high_number_requests(
    hour_sessions, threshold_number_requests=100)
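# --- Sketch, not from the original script: summarize how many entries each
# rule flagged. This assumes every rule returns a boolean array or Series (an
# assumption about detection_rules' return types); the outputs aggregate at
# different levels, so they are reported per rule rather than OR-ed together.
for rule_name, flags in [
        ("robots.txt requested", robots_txt),
        ("bot name in user agent", bot_names),
        ("low inter-arrival time (hour)", low_inter_request_time_hour),
        ("low inter-arrival time (day)", low_inter_request_time_day),
        ("high request count (hour)", high_number_requests_hour),
]:
    print(rule_name, "->", np.count_nonzero(flags), "flagged")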
import os
import os.path
import time

import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.slim.nets as nets

import data_processing

TRAIN_LOG_DIR = os.path.join('Log/train/',
                             time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
TRAIN_CHECK_POINT = 'check_point/train_model.ckpt'
VALIDATION_LOG_DIR = 'Log/validation/'
VGG_19_MODEL_DIR = 'check_point/vgg_19.ckpt'
BATCH_SIZE = 32
EPOCH = 3

if not tf.gfile.Exists(TRAIN_LOG_DIR):
    tf.gfile.MakeDirs(TRAIN_LOG_DIR)
if not tf.gfile.Exists(VALIDATION_LOG_DIR):
    tf.gfile.MakeDirs(VALIDATION_LOG_DIR)

batch_train_set, batch_validation_set, images_num = data_processing.pre_processing(
    data_set='train', batch_size=BATCH_SIZE)


def get_accuracy(logits, labels):
    # Fraction of samples whose arg-max prediction matches the one-hot label.
    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return accuracy


with tf.Graph().as_default():
    images = tf.placeholder(tf.float32, [BATCH_SIZE, 224, 224, 3])
    labels = tf.placeholder(tf.float32, [BATCH_SIZE, len(data_processing.IMG_CLASSES)])
    keep_prob = tf.placeholder(tf.float32)
    with slim.arg_scope(nets.vgg.vgg_arg_scope()):
        logits, _ = nets.vgg.vgg_19(inputs=images, num_classes=2,
                                    dropout_keep_prob=keep_prob, is_training=True)
    # Restore all VGG-19 weights except the final classifier (fc8), which is
    # re-sized for the 2-class task.
    variables_to_restore = slim.get_variables_to_restore(exclude=['vgg_19/fc8'])
    restorer = tf.train.Saver(variables_to_restore)
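    # --- Sketch, not from the original file: the snippet stops at the Saver,
    # so the training setup below is an assumed continuation in the usual
    # TF1/slim style; the loss choice, optimizer and learning rate are guesses.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits))
    train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)
    accuracy = get_accuracy(logits, labels)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Warm-start from the ImageNet VGG-19 weights (fc8 excluded above).
        restorer.restore(sess, VGG_19_MODEL_DIR)
        # ... a per-epoch loop over batch_train_set / batch_validation_set with
        # periodic saving to TRAIN_CHECK_POINT would follow; those details are
        # not recoverable from the snippet.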
def anomaly_detection(self, pprocessed_file, seg_file, features='RMS',
                      sliding_window=1000, sub_sampling=128, impurity=0.05,
                      num_trees=100):
    self.processed_file = pprocessed_file
    # Generate the pre-processed file first in case it is not available yet.
    if not os.path.isfile(self.processed_file):
        pre_processing(self.data_file, self.sync_file, self.seg_file,
                       self.poi_file, self.processed_file)

    # Read the geo-coordinates (POI) file.
    geo_list = []
    with open(self.poi_file) as csv_file:
        csv_reader = csv.reader(csv_file)
        line_count = 0
        for row in csv_reader:
            tempStr = ''.join(row)
            if tempStr.startswith('#') or len(tempStr) == 0:
                continue
            elif tempStr.startswith('CNT'):
                print(f'Column names are {", ".join(row)}')
                line_count += 1
            else:
                line_count += 1
                tlist = tempStr.split(";")
                ttlist = [float(x) for x in tlist if len(x) > 0]
                geo_list.append(ttlist)
    print(f'Processed {line_count} lines in POI file.')

    geo_list = np.array(geo_list)
    lat = geo_list[:, 1]
    lon = geo_list[:, 2]
    # Interpolate between external counters and geo-coordinates.
    get_lat = interp1d(geo_list[:, 0], lat, fill_value='extrapolate')
    get_long = interp1d(geo_list[:, 0], lon, fill_value='extrapolate')

    # Read the pre-processed data into a dataframe.
    processed_data = pd.read_hdf(self.processed_file, 'processed', mode='r')
    # CHC1 = np.array(processed_data.CHC1)
    # CHC3 = np.array(processed_data.CHC3)
    # CHD1 = np.array(processed_data.CHD1)
    # CHD3 = np.array(processed_data.CHD3)
    EDIR = np.array(processed_data.ERS_DIR)
    CHA1 = np.array(processed_data.CHA1)
    CHA3 = np.array(processed_data.CHA3)
    CHB1 = np.array(processed_data.CHB1)
    CHB3 = np.array(processed_data.CHB3)
    # Read internal and external counters.
    int_count = np.array(processed_data.INTCNT)
    ext_count = np.array(processed_data.EXTCNT)
    # date_time = syncdat.DateTime

    # Split the ABA data of one rail side into pushing and pulling mode.
    pull_data_cha1 = CHA1[EDIR == 1]
    push_data_cha1 = CHA1[EDIR == -1]
    pull_data_cha3 = CHA3[EDIR == 1]
    push_data_cha3 = CHA3[EDIR == -1]
    pull_int_count = int_count[EDIR == 1]
    push_int_count = int_count[EDIR == -1]
    pull_ext_count = ext_count[EDIR == 1]
    push_ext_count = ext_count[EDIR == -1]
    if len(push_ext_count) == 0:
        push_ext_count = pull_ext_count

    # Combined magnitude of the X and Z axes.
    pull_data = np.sqrt(np.power(pull_data_cha1, 2) + np.power(pull_data_cha3, 2))
    push_data = np.sqrt(np.power(push_data_cha1, 2) + np.power(push_data_cha3, 2))
    cha_data = np.sqrt(np.power(CHA1, 2) + np.power(CHA3, 2))

    # Pulling and pushing data for the other side of the rail.
    pull_data_chb1 = CHB1[EDIR == 1]
    push_data_chb1 = CHB1[EDIR == -1]
    pull_data_chb3 = CHB3[EDIR == 1]
    push_data_chb3 = CHB3[EDIR == -1]
    # Combined magnitude of the X and Z axes.
    pull_data2 = np.sqrt(np.power(pull_data_chb1, 2) + np.power(pull_data_chb3, 2))
    push_data2 = np.sqrt(np.power(push_data_chb1, 2) + np.power(push_data_chb3, 2))

    ########################################
    rail_data = []
    rail_counters = []
    rail_xcounters = []
    data_list = []
    counters_list = []
    xcounters_list = []
    data_list.append(pull_data)
    # Check if data exist for push mode.
    if len(push_data) != 0:
        data_list.append(push_data)
    counters_list.append(pull_int_count)
    counters_list.append(push_int_count)
    xcounters_list.append(pull_ext_count)
    if len(push_ext_count) != 0:
        xcounters_list.append(push_ext_count)

    ########################################
    data_list2 = []
    data_list2.append(pull_data2)
    # Check if data exist for push mode.
    if len(push_data2) != 0:
        data_list2.append(push_data2)

    rail_data.append(data_list)
    rail_data.append(data_list2)
    rail_counters.append(counters_list)
    # The same counter lists apply to both channels (CHA and CHB).
    rail_counters.append(counters_list)
    rail_xcounters.append(xcounters_list)
    rail_xcounters.append(xcounters_list)

    # Interpolate between internal and external counters.
    get_xcount = interp1d(int_count, ext_count, fill_value='extrapolate')
    get_icount = interp1d(ext_count, int_count, fill_value='extrapolate')

    # ///////////// Feature Extraction //////////////
    aba_data_side = []
    all_xcount_mode = []
    anom_xcount_mode = []
    anom_score_mode = []
    for i in range(2):
        aba_data_mode = []
        int_count_mode = []
        anom_xcount_list = []
        anom_score_list = []
        input_data = rail_data[i]
        for j in range(len(data_list)):
            in_data = input_data[j]
            if len(in_data) == 0:
                continue
            counters = counters_list[j]
            # Inputs are ABA data, counters and the sliding window size.
            list_of_features = extract_features(in_data, counters, sliding_window)
            rms = np.array(list_of_features[:, 0])
            kurtosis = np.array(list_of_features[:, 2])
            skewness = np.array(list_of_features[:, 3])
            peak_to_peak = np.array(list_of_features[:, 4])
            crest_factor = np.array(list_of_features[:, 5])
            impulse_factor = np.array(list_of_features[:, 6])
            rmsf = np.array(list_of_features[:, 12])
            int_count = np.array(list_of_features[:, 13])

            # Features comparison plots (kept for reference):
            # plt.figure(2)
            # plt.subplot(211)
            # plt.ylabel('ABA')
            # plt.plot(list(range(0, 401000)), in_data[:401000])
            # plt.subplot(212)
            # plt.ylabel('RMS')
            # plt.plot(list(range(1000, 401000, 2000)), rms[:200], '*')
            # plt.subplot(413)
            # plt.ylabel('Kurtosis')
            # plt.plot(list(range(1000, 401000, 2000)), kurtosis[:200], '*')
            # plt.subplot(212)
            # plt.ylabel('Peak to peak')
            # plt.xlabel('Data Samples')
            # plt.plot(list(range(1000, 401000, 2000)), peak_to_peak[:200], '*')
            # plt.show()

            # plt.figure(2)
            # plt.subplot(211)
            # plt.ylabel('ABA')
            # plt.plot(list(range(390000, 400000)), in_data[390000:400000])
            # plt.subplot(212)
            # plt.ylabel('RMS')
            # plt.xlabel('Samples')
            # plt.plot(list(range(391000, 401000, 2000)), rms[195:200], 'r*')
            # plt.xlim(390000, 400000)
            # plt.show()

            # Select the feature columns for the isolation forest ('All' stacks
            # every feature; unknown names fall back to kurtosis + peak-to-peak).
            if features == 'RMS':
                mylist = np.stack((rms, rms), axis=-1)
            elif features == 'Kurtosis':
                mylist = np.stack((kurtosis, kurtosis), axis=-1)
            elif features == 'Crest factor':
                mylist = np.stack((crest_factor, crest_factor), axis=-1)
            elif features == 'Impulse factor':
                mylist = np.stack((impulse_factor, impulse_factor), axis=-1)
            elif features == 'Skewness':
                mylist = np.stack((skewness, skewness), axis=-1)
            elif features == 'Peak-to-peak':
                mylist = np.stack((peak_to_peak, peak_to_peak), axis=-1)
            elif features == 'All':
                mylist = np.stack((rms, kurtosis, peak_to_peak, crest_factor,
                                   impulse_factor, skewness), axis=-1)
            else:
                mylist = np.stack((kurtosis, peak_to_peak), axis=-1)

            (norm_train, anom_train, norm_test, anom_test, anom_icount,
             anom_icount_train, anom_score) = isolation_forest(
                mylist, int_count, sub_sampling, impurity, num_trees)
            print("Return from Anomaly detection")
            all_xcount_mode.append(get_xcount(int_count))
            anom_xcount_test = get_xcount(anom_icount)
            anom_xcount_train = get_xcount(anom_icount_train)
            anom_xcount = np.concatenate((anom_xcount_train, anom_xcount_test), axis=0)
            anom_xcount_list.append(anom_xcount)
            anom_score_list.append(anom_score)

            # Validation of anomalies: map counters to geo-coordinates.
            latitude = get_lat(anom_xcount)
            longitude = get_long(anom_xcount)
            dist = [0]
            for z in range(len(latitude) - 1):
                point_one = (latitude[z], longitude[z])
                point_two = (latitude[z + 1], longitude[z + 1])
                # distance = geodesic(point_one, point_two).km
                # dist.append(1000 * distance)
                dist.append(longitude[z])
            # csv.writer stringifies values itself, so the counters stay numeric
            # and can be passed back to the interpolators below.
            write_data = zip(anom_xcount, latitude, longitude, dist)
            track_side = 'chb' if i else 'cha'
            train_mode = 'pushing' if j else 'pulling'
            out_file = os.path.join(self.counters_path,
                                    'Prorail17112805si12_' + track_side + '_' +
                                    train_mode + '.csv')
            with open(out_file, 'w', newline='') as file:
                writer = csv.writer(file, delimiter=',', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                writer.writerow(['counters', 'latitude', 'longitude', 'distance'])
                for cnt, lat_val, lon_val, dist_val in write_data:
                    writer.writerow([cnt, lat_val, lon_val, dist_val])

            #######################################################
            lat_list = get_lat(anom_xcount).tolist()
            long_list = get_long(anom_xcount).tolist()
            lat_list_train = get_lat(anom_xcount_train).tolist()
            long_list_train = get_long(anom_xcount_train).tolist()
            # gmap_plot(lat_list_train + lat_list, long_list_train + long_list)

        aba_data_side.append(aba_data_mode)
        anom_xcount_mode.append(anom_xcount_list)
        anom_score_mode.append(anom_score_list)

    # Compare anomalies in ABA on both channels (CHA and CHB) and return the
    # matched anomaly positions.
    anomaly_positions = match_anomaly(rail_data, rail_xcounters, anom_xcount_mode, seg_file)
    # ///////////////////////////////////////////
    if len(anomaly_positions) > 2:
        anom_pos_cha = np.round(anomaly_positions[0] + anomaly_positions[2], 2)
        anom_xcount_cha = np.round(np.concatenate((anom_xcount_mode[0][0],
                                                   anom_xcount_mode[0][1]), axis=0), 2)
        anom_score_cha = np.round(np.concatenate((anom_score_mode[0][0],
                                                  anom_score_mode[0][1]), axis=0), 3)
    else:
        anom_pos_cha = np.round(anomaly_positions[0], 2)
        anom_xcount_cha = np.round(anom_xcount_mode[0][0], 2)
        anom_score_cha = np.round(anom_score_mode[0][0], 3)

    # Sort anomalies by track position.
    anom_pos_xcount = np.stack((anom_pos_cha, anom_xcount_cha, anom_score_cha), axis=-1)
    anom_pos_xcount_sorted = anom_pos_xcount[anom_pos_xcount[:, 0].argsort()]
    anom_pos_cha = list(anom_pos_xcount_sorted[:, 0])
    anom_xcount_cha = list(anom_pos_xcount_sorted[:, 1])
    anom_score_cha = list(anom_pos_xcount_sorted[:, 2])

    # Severity analysis (head-checks vs ABA anomaly severity), kept for reference:
    # dict = {'position': anom_pos_cha, 'counters': anom_xcount_cha, 'score': anom_score_cha}
    # df_anom_pos_score = pd.DataFrame(data=dict)
    # ectpath = r'D:\strukton_project\WP_180306\ECT\EC_data_2018_FC_FO_LR.csv'
    # headchecks = DefectSeverity(df_anom_pos_score, ectpath).get_trend()
    #
    # plotlist = []
    # depth = normalize(headchecks['depth'].tolist())
    # score = headchecks['score'].tolist()
    # plotlist.append(depth)
    # plotlist.append(score)
    # pltlist = [[plotlist[j][i] for j in range(len(plotlist))] for i in range(len(plotlist[0]))]
    # pltarr = np.array(pltlist)
    # sorted = pltarr[pltarr[:, 0].argsort()]
    # cracksize = sorted[:, 0]
    # anomscore = sorted[:, 1]
    #
    # write_data = zip(cracksize, anomscore)
    # track_side = 'cha_crack_anom'
    # with open(os.path.join(self.counters_path, 'Prorail18030614si12_' + track_side + '.csv'),
    #           'w', newline='') as file:
    #     writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    #     writer.writerow(['crack_depth', 'anom_severity'])
    #     for crack, sev in write_data:
    #         writer.writerow([crack, sev])
    ##################################

    write_data = zip(anom_pos_cha, anom_xcount_cha, anom_score_cha)
    return anom_pos_xcount_sorted
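# --- Sketch, not from the original file: a hypothetical call of the method
# above. The class name and constructor signature are invented for
# illustration; only the anomaly_detection() parameters come from its
# definition.
#
# detector = AbaAnomalyDetector(aba_file, sync_file, seg_file, poi_file)
# anomalies = detector.anomaly_detection('processed.h5', seg_file,
#                                        features='All', sliding_window=1000,
#                                        sub_sampling=128, impurity=0.05,
#                                        num_trees=100)
# # Each returned row: [track position, external counter, anomaly score],
# # sorted by position.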