Example #1
    def test_plot(self):
        input_file = "../../EAGE2018/Well-A_finished/HQLD_B_2C1_75-1_Well-A_ISF-BHC-MSFL-GR__COMPOSIT__1.LAS"
        result = read_data.read(input_file)

        result_keys = result.keys()

        plotting.plot_two_columns(result.df(), result_keys[1], result_keys[2])
Example #2
    def continous_train(self, smoothing=1e-2):

        self.gaussians = {}
        self.priors = {}  # P(c)

        eprint("continous mode!")
        """ read data"""
        d, data_num = read_data.read("training")
        #		data_num=100
        eprint("train data_num: ", data_num)

        labels_list = np.array([d(i)[0] for i in range(data_num)])
        labels = set(labels_list)
        """ calculate mean and var for each pixel and label!!"""
        for c in labels:
            img_list = np.array(
                [d(i)[1] for i in range(data_num) if d(i)[0] == c])
            #			print(img_list.mean(axis=0).shape )
            self.gaussians[c] = {
                "mean": img_list.mean(axis=0),
                "var": img_list.var(axis=0) + smoothing
            }
            self.priors[c] = len([ele for ele in labels_list if ele == c
                                  ]) / len(labels_list)
#			print( img_list.shape)
#			break
        pass
Example #3
def interactive_plots():

    current_feature_name = request.args.get("feature_name")
    if current_feature_name is None:
        current_feature_name = "GR"

    input_file = DATA_DIR + "EAGE2018/Well-A_finished/HQLD_B_2C1_75-1_Well-A_ISF-BHC-MSFL-GR__COMPOSIT__1.LAS"
    result = read_data.read(input_file).df()
    result_small = result[current_feature_name]
    print(result_small.name)



    plot = result_small.plot()
    print(current_feature_name)
    fig = plot.get_figure()
    fname="output{}.png".format(current_feature_name)
    fig.savefig('static/{}'.format(fname))


    mytext = "Hello with my text"

    #myimage = plotting.plot_two_columns(result.df(), result_keys[1], result_keys[2])
    #print(type(myimage))
    #print(myimage)
    print('rendering')
    return render_template('interactive_plots.html', mytext=mytext, myimage=fname, feature_names=result.columns, current_feature_name=current_feature_name)
Example #4
    def discrete_train(self):
        self.bit_shift = 3

        d, data_num = read_data.read("training")
        """ init dictionary"""
        count_d = {}

        def return_0_9_d(d):
            for i in range(10):
                #				d[i]=[1 for j in range( 256/(2**self.bit_shift)  )]
                d[i] = [0.00000001 for j in range(32)]
            return d

        self.two_d_list_of_d = [[return_0_9_d({}) for ele in range(28)]
                                for ele in range(28)]

        self.label_count = [0 for i in range(10)]
        #		data_num=1000
        eprint("loading data " + str(data_num) + ", this may take a while!")
        for i in range(data_num):
            """ [0] in label, [1] is image """
            label = d(i)[0]
            self.label_count[label] += 1
            img = d(i)[1]
            #			print("data:", i)
            #			print("label:", label)
            for row, p in enumerate(img):
                #				print([ ele>>self.bit_shift for ele in p])
                for col, v in enumerate([ele >> self.bit_shift for ele in p]):
                    self.two_d_list_of_d[row][col][label][v] += 1

        print(self.label_count)
Example #5
def get_data(input_file_name=constants.FILE_DATA, column_offset=0,
             source_type=constants.SOURCE_TYPE, to_index=True, to_append=False):

	selected_columns = get_features()	#Only select these features

	log.info("Reading data")

	try:
		df = read_data.read(input_file_name,source_type,selected_columns)
	except IOError as e:
		log.error("Could not open file - %s" % input_file_name)
		raise
	
	if(DEBUG):
		df.to_csv('raw.csv')

	df = process_data(df)

	if(DEBUG):
		df.to_csv('preprocessed.csv')

	if(to_index):
		log.info("Connecting to db: %s" % constants.FILE_INDEX_DB)
		conn = sqlite3.connect(constants.FILE_INDEX_DB)
		outf = df[selected_columns[0]]	#select only ids
		if(to_append):
			log.info("Appending to index")
			outf.to_sql(constants.TABLE_NAME,conn,if_exists='append',index_label=constants.COL_INDEX)
		else:
			log.info("Saving to index")
			outf.to_sql(constants.TABLE_NAME,conn,if_exists='replace',index_label=constants.COL_INDEX)
		conn.close()
		
	return df[selected_columns[column_offset:]]
Example #6
def analyze_file(p, n, K):
    freq_dict = {}
    frac_dim_size_lim = 50
    frac_p_lim=0.465
    f_arr = np.zeros(shape=(10,))
    max_bin = 0

    for i in range(1,K+1):
        burst_sizes, bonds = read('../data/p{p}n{n}/run{i}p{p}n{n}.data'.format(i=i, p=p, n=n), force_small = False)
       
        if bonds and n>frac_dim_size_lim and p>frac_p_lim:  # bonds not empty and n not too small
            freq, n_bins = fracdim_anal(bonds, frac_spacing)  # if we don't need to check bonds, we should definitely use force_small
            f_arr = add_frequencies(f_arr, freq)
            if n_bins > max_bin:
                max_bin = n_bins
                    
        for size in burst_sizes:
            if size in freq_dict:
                freq_dict[size]+=1
            else:
                freq_dict[size]=1
                
        sys.stdout.write('\r ... completed file {i}'.format(i=i))


    burst_size = list(freq_dict.keys())
    multiplicity = list(freq_dict.values())
    
    pdf, surv = tau_b_anal(burst_size, multiplicity)
    
    print_data('./data/p{p}n{n}{type}'.format(p=p, n=n, type='tau'), *pdf)
    print_data('./data/p{p}n{n}{type}'.format(p=p, n=n, type='survivor'), *surv)
    if bonds and n>frac_dim_size_lim and p>frac_p_lim:
        bins = plot_bins(exp_bins(1, base=frac_spacing, nums=max_bin))
        print_data('./data/p{p}n{n}{type}'.format(p=p, n=n, type='fracdim'), bins, f_arr)
Example #7
def bokeh():
    input_file = DATA_DIR + "EAGE2018/Well-A_finished/HQLD_B_2C1_75-1_Well-A_ISF-BHC-MSFL-GR__COMPOSIT__1.LAS"
    las = read_data.read(input_file)
    df = las.df()
    keys = las.keys()
    script, div, js_resources, css_resources = plotting.plot_bokeh(df, keys[1], keys[2])
    return render_template('bokeh_index.html',
                           plot_script=script,
                           plot_div=div,
                           js_resources=js_resources,
                           css_resources=css_resources )
Example #8
def classify(image):
    train_images_gray, train_labels = read_data.read(range(10), 'training')
    if USEBW:
        train_images_bw = convert_bw(train_images_gray)
        test = otsu.otsu(numpy.array(jtov.jtov(image)))
        clf = svm.SVC(kernel="poly", degree=1)
        clf.fit(train_images_bw[:10000], train_labels[:10000])
        print clf.predict(test)
    else:
        test = numpy.array(jtov.jtov(image))
        clf = svm.SVC(kernel="poly", degree=2)
        clf.fit(train_images_gray[:10000], train_labels[:10000])
        print clf.predict(test)
Example #9
def load_data(digits):
  images = []
  for i in xrange(len(digits)):
    images.append(read_data.read([digits[i]], 'training')[0])
  for i in xrange(len(digits)):
    target_images = images[i]
    rest_images = images[:i] + images[i+1:]
    with open('svm_train' + str(digits[i]) + '.txt', 'w') as f:
      for image in target_images:
        f.write(to_svm_format(image, 1) + "\n")
      for num in rest_images:
        for image in num:
          f.write(to_svm_format(image, -1) + "\n")
    print "Done " + str(digits[i])
Example #10
def init(file):
    global packets, pkt, label, attack_cat

    packets = rd.read(file)

    prep.proto_to_value(packets)
    prep.state_to_value(packets)
    prep.service_to_value(packets)

    pkt = packets.copy()

    prep.ip_to_value(packets)

    label = packets['Label'].to_numpy()
    attack_cat = packets['attack_cat'].to_numpy()
    del packets['Label']
    del packets['attack_cat']
Example #11
def analyze_file(p, n, K, gamma_dict):
    freq_dict = {}

    for i in range(1, K + 1):
        burst_sizes, bonds = read(
            '../data/p{p}n{n}/run{i}p{p}n{n}.data'.format(i=i, p=p, n=n),
            force_small=True)
        for size in burst_sizes:
            if size in freq_dict:
                freq_dict[size] += 1
            else:
                freq_dict[size] = 1
        sys.stdout.write('\r ... completed file {i}'.format(i=i))

    burst_size = list(freq_dict.keys())
    multiplicity = list(freq_dict.values())
    mean_cluster = gamma_anal(burst_size, multiplicity)
    gamma_dict[n][p] = mean_cluster  #this works..
Example #12
    def continous_test(self):
        """ read  test data"""
        d, data_num = read_data.read("testing")
        eprint("test data_num: ", data_num)
        #		data_num=100
        X = np.array([d(i)[1] for i in range(data_num)])
        Y = np.array([d(i)[0] for i in range(data_num)])

        correct = 0
        for i, ele in enumerate(X):
            P = (self.continous_predict(ele))
            if Y[i] == P:
                #				print("OK!")
                correct += 1
        print("accuracy(%): ", 100 * correct / len(X))
        """ accuracy(%):  63.2 """

        pass
Example #13
    def discrete_test(self):
        d, data_num = read_data.read("testing")
        correct = 0
        eprint("start testing!")
        eprint("test data_num: ", data_num)
        """ test """
        #data_num=1000
        for i in range(data_num):
            t_label = d(i)[0]
            t_img = d(i)[1]
            #			print("i:",i)
            #			print(self.predict(t_img)[0] )
            #			print("t_label:",t_label)
            if t_label == self.predict(t_img)[0]:
                correct += 1


#			input()
        print("accuracy(%):", 100 * correct / data_num)
        #70.1%
        pass
Example #14
def init(file):
    """ with open(file,  newline='') as csvfile:

        global packets, pkt
        packets = pd.read_csv(csvfile)
        #print(type(packets)) """
    
    packets = rd.read(file)
    global label, attack_cat 


    prep.proto_to_value(packets)
    prep.state_to_value(packets)
    prep.service_to_value(packets)

    pkt = packets.copy()

    prep.ip_to_value(packets)

    label = packets['Label'].to_numpy()
    attack_cat = packets['attack_cat'].to_numpy()
    del packets['Label']
    del packets['attack_cat']
Example #15
    print('\033[92m' + text.replace('\n', '') + '\033[0m')


# Save a review sentence to the dataset
def save_sentence(destination, entry_id, text, polarity='', aspects=''):
    formatted_aspects = '%-20s' % ('[' + str(' '.join(word
                                                      for word in aspects)) +
                                   ']')

    write_data(
        destination, entry_id + '  ' + polarity + formatted_aspects +
        SENTENCE_BEGIN_INDICATOR + '  ' + text)
    write_data(destination, '\n\n')


products_data = read_data.read(DIR_read)

current_time = datetime.now()

import os

if not os.path.exists(DIR_save):
    os.makedirs(DIR_save)

for product_data in products_data:

    meta = product_data['meta']
    data = product_data['data']

    product_id = meta['ID']
Example #16
        noduplicate_points.append(presention[0])
    noduplicate_points.append(compressed_points[-1][1])
    return noduplicate_points


if __name__ == '__main__':
    # Read the trajectory data
    tradata = []
    # File directory paths
    raw_path = r"C:\Users\TJ\Desktop\Dataset\Geolife Trajectories 1.3\Data_4.0\9"
    new_path = r"C:\Users\TJ\Desktop\Dataset\Geolife Trajectories 1.3\OPERB\operb_data\9"
    file_list = os.listdir(raw_path)
    file_list.sort(key=lambda x: x[10:-5])
    # Read the trajectory data from each file
    for i in range(len(file_list)):
        tradata.append(read(raw_path, file_list[i]))
    time_records = []
    compression_ratios = []
    error_bound = 10
    total_time = 0
    for j in range(len(tradata)):
        id = 0
        points = []
        for point in tradata[j]:
            p = Point(id, point[0], point[1], point[2])
            points.append(p)
            id = id + 1
        compressed_points = []
        start_time = time.clock()
        operb(error_bound)
        end_time = time.clock()
Example #17
    croo = roo - mroo
    crco = rco - mrco
    crcc_ = rcc_ - mrcc_
    croc_ = roc_ - mroc_
    crvp_ = rvp_ / mrvp_
    ctvl_ = tvl_ / mtvl_
    ind = ind[:, 3:]

    d['rcc_'] = rcc_
    d['rcc'] = rcc
    d['rco'] = rco
    d['roc'] = roc
    d['roc_'] = roc_
    d['roo'] = roo
    d['ind'] = ind
    d['croo'] = croo
    d['crco'] = crco
    d['crcc_'] = crcc_
    d['croc_'] = croc_
    d['crvp_'] = crvp_
    d['ctvl_'] = ctvl_

    # return d
    pass


if __name__ == '__main__':
    d = read_data.read()
    print calc(d)
Example #18
    def test_reader(self):
        input_file = "../../EAGE2018/Well-A_finished/HQLD_B_2C1_75-1_Well-A_ISF-BHC-MSFL-GR__COMPOSIT__1.LAS"
        result = read_data.read(input_file)
        assert isinstance(result, lasio.las.LASFile)
Example #19
    entropy_cost = tf.train.GradientDescentOptimizer(0.0003).minimize(cost)
    batches = input_x.shape[0]//batch_size
    if input_x.shape[0] % batch_size != 0:
        batches += 1
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        for _ in range(ephocs):
            for batch in range(batches):
                start = batch * batch_size % input_x.shape[0]
                end = min(start + batch_size, input_x.shape[0])
                sess.run([entropy_cost], feed_dict={x: input_x[start:end], y: input_y[start:end]})
            c = sess.run([cost], feed_dict={x: input_x, y: input_y})
            print(c)


data, label = read_data.read()
train(data, label)
Example #20
def predict(df=None):
    """

    :param df: one or more HTTP requests, as a DataFrame
    :return: result, where the 'pre' column holds the prediction
    """

    # Load the trained models
    net = torch.load('model/LSTM_model.pkl')
    gs = joblib.load('model/gs.m')

    result = read_data.read('data/data.csv')
    if df is None:
        result['tmp'] = 0
    else:
        df = read_data.get_data(df)
        result['tmp'] = 1
        df['tmp'] = 0
        result = pd.concat([result, df], axis=0)
    result = cal_smilar.group_feature(result, 'user_agent')
    result = cal_smilar.group_feature(result, 'host')
    result = cal_smilar.group_feature(result, 'accept_language')
    result = cal_smilar.group_feature(result, 'accept_encoding')
    result = cal_smilar.group_feature(result, 'ip_dst')
    result = result[result['tmp'] == 0]
    del result['tmp']

    CPT = pd.read_csv('model/CPT.csv', sep='\a')
    train = pd.read_csv('data/test.csv')
    safe_host = pd.read_csv('data/host.csv', names=['host'])

    result['original_host'] = result['original_host'].apply(lambda x: get_host(x))

    # Rule-based filtering
    ans_host = result['original_host'].unique()
    safe_host = list(safe_host['host'].values)
    same_host = list(set(safe_host) & set(ans_host))
    result_safe = result[result['original_host'].isin(same_host)]
    result = result[~result['original_host'].isin(same_host)]
    print(len(result[result['y'] == 'malicious']))

    test_ip_1 = train['ip_24'].unique()
    test_ip_2 = [ip for ip in result['ip_24'].unique() if ip not in test_ip_1]
    # result = result[(result['y'] == 'safe') | (result['y'] == 'malicious')]

    dis = []
    lens = len(result)
    with tqdm.tqdm(range(lens), 'calculate dis') as t:
        for i in t:
            #     print(test_df.iloc[i,:])
            cpt, tmp = match.get_dis(pd.DataFrame(result.iloc[i, :]).T, CPT, gs)
            dis.append(tmp)
    result['dis'] = dis

    result_in_ip = result[result['ip_24'].isin(test_ip_1)]
    result = result[result['ip_24'].isin(test_ip_2)]
    # result = result[(result['y'] == 'safe') | (result['y'] == 'malicious')]

    # Use the LSTM to predict hosts not covered by the templates
    now_time = time.time()
    seq_length = 200
    pre = []
    for s in result['original_host']:
        val = tldextract.extract(s)
        strs = val.domain
        pre.append(net.predict(strs, seq_length))
    result['label'] = pre
    print("LSTM spent time {} s".format(time.time() - now_time))

    result_in_ip['pre'] = result_in_ip['dis'].apply(lambda x: in_ip(x))
    result_safe['pre'] = 0
    if len(result) != 0:
        result['pre'] = result.apply(lambda x: not_in_ip(x['dis'], x['label']), axis=1)
    result = pd.concat([result, result_in_ip, result_safe])

    # x = 0.12
    # print(len(result[(result['y'] == 'safe') | (result['y'] == 'malicious')]))
    # print("safe dis > " + str(x) + " :" + str(len(result[(result['y'] == 'safe') & (result['dis'] > x)])))
    # print("safe dis > " + str(x) + " label=1:" + str(
    #     len(result[(result['y'] == 'safe') & (result['dis'] > x) & (result['label'] == 1)])))
    # print("safe dis <= " + str(x) + " :" + str(len(result[(result['y'] == 'safe') & (result['dis'] <= x)])))
    # print("safe dis <= " + str(x) + " label=1:" + str(
    #     len(result[(result['y'] == 'safe') & (result['dis'] <= x) & (result['label'] == 1)])))
    # print("malicious dis > " + str(x) + " :" + str(len(result[(result['y'] == 'malicious') & (result['dis'] > x)])))
    # print("malicious dis > " + str(x) + " label=1:" + str(
    #     len(result[(result['y'] == 'malicious') & (result['dis'] > x) & (result['label'] == 1)])))
    # print("malicious dis <= " + str(x) + " :" + str(len(result[(result['y'] == 'malicious') & (result['dis'] <= x)])))
    # print("malicious dis <= " + str(x) + " label=1:" + str(
    #     len(result[(result['y'] == 'malicious') & (result['dis'] <= x) & (result['label'] == 1)])))
    #
    # # Compute the scores
    # TP = len(result[(result['pre'] == 1) & (result['y'] == 'malicious')])
    # FN = len(result[(result['pre'] == 0) & (result['y'] == 'malicious')])
    # FP = len(result[(result['pre'] == 1) & (result['y'] == 'safe')])
    # TN = len(result[(result['pre'] == 0) & (result['y'] == 'safe')])
    # P = TP / (TP + FP)
    # R = TP / (TP + FN)
    # F1 = 2 * TP / (2 * TP + FP + FN)
    # auc = (TP + TN) / (TP + FN + FP + TN)
    # print("Precision = {} Recall = {} F1_Score = {} Accuracy = {}".format(P, R, F1, auc))
    # print(TP, FN, FP, TN)

    return result
Example #21

# hello world 30 Jun 2020
import pandas as pd
import read_data

data = read_data.read()
# print(data.df)
# print(data.sname)



# if __name__ == '__main__':
#     a = 1
Example #22

def dice_coef_loss(y_true, y_pred):
    return -dice_coef(y_true, y_pred)


smooth = 1.

MASSACHUSETTS_PATH = "Massachusetts/"
TRAINING_SET = 1
MODEL_NAME = 'UNETV2'  # or 'UNET' or 'INCEPTION'

window = 28 * 8

path = MASSACHUSETTS_PATH + 'train/'
x_train, y_train = read_data.read(path, 110)

if 2 == TRAINING_SET:
    index = 75 * 49
    x_train = x_train[0:index, :, :, :]
    y_train = y_train[0:index, :, :, :]

print("len train ", len(x_train))

path = MASSACHUSETTS_PATH + 'validation/'
x_valid, y_valid = read_data.read(path, 4)
print("len valid ", len(x_valid))

if 'UNET' == MODEL_NAME:
    model = unet.get_unet()
if 'INCEPTION' == MODEL_NAME:
Example #23
import read_data
'''
Things to experiment with 
-Use pure black/white based on thresholding http://en.wikipedia.org/wiki/Otsu%27s_Method

for SVMs:
-Kernels
-C

KNN:
-k

Decision Trees:

'''
images, labels = read_data.read(range(10))
print "Read Raw Data"
sparse_images = preprocessing.scale(images)
#sparse_images = sparse.csr_matrix(sparse_images)

print "Made images sparse"

clf = svm.SVC(kernel='linear', cache_size=1000.)
print "initialized SVC"
clf.fit(sparse_images[:1000], labels[:1000])
print "Fit SVM"
guesses = clf.predict(sparse_images[10001:11001])
print metrics.classification_report(labels[10001:11001], guesses)
print guesses[:10]
print labels[10001:10011]
Example #24
from read_data import read, show

# Reading training and testing features from dataset
features_labels, features_train = read(dataset="training")
labels_test, features_test = read(dataset="testing")

# Reshaping features_train and features_test as sklearn
# requires 2D array in fit()
nsamples, nx, ny = features_train.shape
features_train = features_train.reshape((nsamples,nx*ny))

nsamples, nx, ny = features_test.shape
features_test = features_test.reshape((nsamples,nx*ny))

# NaiveBayes Model
from sklearn.naive_bayes import GaussianNB

clf_nb = GaussianNB()

clf_nb.fit(features_train, features_labels)

labels_pred_nb = clf_nb.predict(features_test)

# SVM
from sklearn.svm import SVC

clf_svm = SVC(kernel='rbf')

clf_svm.fit(features_train, features_labels)

labels_pred_svm = clf_svm.predict(features_test)
Example #25
    def _process_data(self):

        # Read config file
        config = self._config

        if "input" not in config.keys():
            self._no_input_dialog()
            return

        if "output" not in config.keys():
            self._no_output_dialog()
            return

        fields = [
            'type',
            'modules',
            'lmax summary remove',
        ]

        for f in fields:
            if f not in config.keys():
                self._incomplete_config_dialog(f)
                return

        # Infer file type if config set to auto
        if config["type"] == "auto":
            file_type = infer_filetype.infer(config["input"][0])
            print("\nInferred file type: " + file_type)
        else:
            file_type = config["type"]

        # Read input data
        print("Reading data...")
        user_metadata = config["percentiles"].copy()
        user_metadata.insert(0, config["frequency weighting"])

        if 'columns' in config.keys():
            data, metadata = read(file_type,
                                  config["input"],
                                  user_metadata,
                                  columns=config["columns"])
        else:
            data, metadata = read(file_type, config["input"], user_metadata)

        print("Data read successfully")

        # Run pre-processing modules
        data, _ = process_batch(data, config["modules"], metadata)

        # Generate output tables
        tables = outputs_ui.daily_table(data, metadata["Frequency Weighting"],
                                        config["lmax summary remove"],
                                        config["lmax summary override"])

        # Export to Excel (also export config duplicate)
        writer, config_out = outputs_ui.export_excel(data, metadata, tables,
                                                     config)

        while True:
            try:

                # Export workbook
                writer.save()

                # Export config file
                with open(config_out, 'w') as file:
                    file.write(dumps(config, indent=4))

                print("Export complete")

                startfile(config["output"][0] + ".xlsm")
                QCoreApplication.quit()
                break

            except PermissionError:
                action = self._file_open_dialog()
                if action == 4194304:
                    QCoreApplication.quit()
                    break
Example #26
# -*- coding: utf-8 -*-
import read_data
import GA
import plots
# Modified this script to incorporate the curve; the plots added after the average-specificity print are the change.

[X, Y, features_no] = read_data.read()

obj = GA.genetic_algo(population_no=1000,
                      features=features_no,
                      X=X,
                      Y=Y,
                      generations=1000,
                      crossover_prob=.05,
                      mutation_prob=.001,
                      partition=0.70,
                      cross_validation=False,
                      fold_cv=1)
sol = obj.build_solution2()
#score=obj.build_solution()
#obj.final_solution(score)
precision = 0
recall = 0
f1score = 0
accuracy = 0
specificity = 0
for i in range(len(sol)):
    precision = precision + sol[i][0]
    recall = recall + sol[i][1]
    f1score = f1score + sol[i][2]
    accuracy = accuracy + sol[i][3]
Example #27
#this file will serve to generate the initial model, and could be retooled
#to serve as a model "updater", i.e. generating a new model any time
#we gain access to a new labeled data set
#It also saves the model to a file so that we don't have to regenerate
#the model each time we want to use it to classify new data

from read_data import read
from split_data import split
from sklearn import svm
from joblib import dump
from test_model import run_metrics, F1, read_model_from_file

data = read("data_151.csv")
data += read("data_130.csv") + read("data_53.csv")
features_training, labels_training, features_testing, labels_testing = split(
    data)

#create SVM model
svm_model = svm.SVC(max_iter=10000, kernel='rbf')
svm_model.fit(features_training, labels_training.ravel())

dump(svm_model, 'model.joblib')

svm_model = read_model_from_file()

TP, FP, TN, FN = run_metrics(svm_model, features_testing, labels_testing)
f1 = F1(TP, FP, TN, FN)
print(
    'Coord_x|Coord_y|ISOS_z|ISOS_Size_x|ISOS_Size_y|COST_z|COST_Size_x|COST_Size_y '
)
#print(svm_model.coef_)
Example #28
import read_data
import math
import cProfile
import pylab as pl
import otsu
import numpy
import Image
#See http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.cross_val_score.html
from sklearn import cross_validation
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier

#Read dataset
train_images_gray, train_labels = read_data.read(range(10), 'training')
test_images_gray, test_labels = read_data.read(range(10), 'testing')
print "Done reading data"

def generate_pngs():
  for x in xrange(10):
    test = train_images_gray[x]
    test2 = []
    for i in xrange(len(test)):
      test2.append(numpy.uint8(255 - test[i]))
    test2 = numpy.array(test2)
    test2.shape = (28,28)
    print test2
    img = Image.fromarray(test2, 'L')
    img.save('report/' +str(x) + '.png')

def convert_bw(images):
Example #29
# Author Taher Ahmadi

import read_data
import matplotlib.pyplot as plt
from matplotlib import gridspec
from sklearn import datasets, linear_model
import numpy as np
import random
import methods

data_set, labels = read_data.read('./data_set/Dataset1.csv', 8)

# Split train and test data
train_data = data_set[:400]
train_labels = labels[:400]
test_data = data_set[400:]
test_labels = labels[400:]

print('Train data size:', len(train_data))
print('Test data size:', len(test_data))

# a) Scatter plot each feature vs label
fig1 = plt.figure('a')
gs = gridspec.GridSpec(3, 3)
counter = 0
for i in range(0, 3):
    for j in range(0, 3):
        counter += 1
        if counter == 9:
            break
        ax_temp = fig1.add_subplot(gs[i, j])
Example #30
def run_training():
    """Train for a number of steps."""
    # Get the sets of images and labels for training, validation, and test.
    filename = "adult.data.txt"
    vectors_data, labels_data = read_data.read(filename)
    print(vectors_data.shape)

    filename = "adult.test.txt"
    test_data, tlabels_data = read_data.read(filename)
    print(test_data.shape)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default() as data:
        # Generate placeholders for the images and labels.
        vectors_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist_tb.inference(vectors_placeholder, FLAGS.hidden1,
                                    FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist_tb.loss(logits, labels_placeholder)
        #test_loss = mnist_tb.loss(test_logits, test_labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist_tb.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist_tb.evaluation(logits, labels_placeholder)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Create a session for running Ops on the Graph.
        sess = tf.Session()

        # Run the Op to initialize the variables.
        init = tf.initialize_all_variables()
        sess.run(init)

        # Instantiate a SummaryWriter to output summaries and the Graph.

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        # And then after everything is built, start the training loop.
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()

            # Fill a feed dictionary with the actual set of images and labels
            # for this particular training step.
            feed_dict = fill_feed_dict(step, vectors_data, labels_data,
                                       vectors_placeholder, labels_placeholder,
                                       FLAGS.batch_size)
            # Run one step of the model.  The return values are the activations
            # from the `train_op` (which is discarded) and the `loss` Op.  To
            # inspect the values of your Ops or variables, you may include them
            # in the list passed to sess.run() and the value tensors will be
            # returned in the tuple from the call.
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
            duration = time.time() - start_time

            # Write the summaries and print an overview fairly often.
            if step % 100 == 0:
                # Print status to stdout.
                print('Step %d: loss = %.2f (%.3f sec)' %
                      (step, loss_value, duration))
                # Update the events file.
                summary_str_train = sess.run(summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str_train, step)

                saver.save(sess, FLAGS.train_dir, global_step=step)

            # Save a checkpoint and evaluate the model periodically.
            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                saver.save(sess, FLAGS.train_dir, global_step=step)
                # Evaluate against the training set.
                print('Training Data Eval:')
                do_eval(sess, eval_correct, vectors_placeholder,
                        labels_placeholder, vectors_data, labels_data,
                        FLAGS.batch_size)
                # Evaluate against the validation set.
                # Evaluate against the test set.
                print('Test Data Eval:')
                do_eval(sess, eval_correct, vectors_placeholder,
                        labels_placeholder, test_data, tlabels_data,
                        FLAGS.batch_size)
Example #31
def test():
    # Load the trained models
    net = torch.load('model/LSTM_model.pkl')
    gs = joblib.load('model/gs.m')

    result = read_data.read('data/data.csv')
    result = cal_smilar.group_feature(result, 'user_agent')
    result = cal_smilar.group_feature(result, 'host')
    result = cal_smilar.group_feature(result, 'accept_language')
    result = cal_smilar.group_feature(result, 'accept_encoding')
    result = cal_smilar.group_feature(result, 'ip_dst')
    CPT = pd.read_csv('model/CPT.csv', sep='\a')
    valid = pd.read_csv('data/valid.csv')
    train = pd.read_csv('data/test.csv')
    safe_host = pd.read_csv('data/host.csv', names=['host'])

    result['original_host'] = result['original_host'].apply(lambda x: get_host(x))

    test_ip_1 = valid['ip_24'].unique()
    test_ip_2 = [ip for ip in result['ip_24'].unique() if ip not in test_ip_1]

    result = result[result['ip_24'].isin(test_ip_2)]
    result = result[(result['y'] == 'safe') | (result['y'] == 'malicious')]

    ans_host = result['original_host'].unique()
    safe_host = list(safe_host['host'].values)
    same_host = list(set(safe_host) & set(ans_host))
    result = result[~result['original_host'].isin(same_host)]
    print(len(result[result['y'] == 'malicious']))

    dis = []
    lens = len(result)
    with tqdm.tqdm(range(lens), 'calculate dis') as t:
        for i in t:
            #     print(test_df.iloc[i,:])
            cpt, tmp = match.get_dis(pd.DataFrame(result.iloc[i, :]).T, CPT, gs)
            dis.append(tmp)
    result['dis'] = dis

    # Use the LSTM to predict hosts not covered by the templates
    now_time = time.time()
    seq_length = 200
    pre = []
    for s in result['original_host']:
        val = tldextract.extract(s)
        strs = val.domain
        pre.append(net.predict(strs, seq_length))
    result['label'] = pre
    print("LSTM spent time {} s".format(time.time() - now_time))

    result['pre'] = result.apply(lambda x: not_in_ip(x['dis'], x['label']), axis=1)

    x = 0.12
    print(len(result[(result['y'] == 'safe') | (result['y'] == 'malicious')]))
    print("safe dis > " + str(x) + " :" + str(len(result[(result['y'] == 'safe') & (result['dis'] > x)])))
    print("safe dis > " + str(x) + " label=1:" + str(
        len(result[(result['y'] == 'safe') & (result['dis'] > x) & (result['label'] == 1)])))
    print("safe dis <= " + str(x) + " :" + str(len(result[(result['y'] == 'safe') & (result['dis'] <= x)])))
    print("safe dis <= " + str(x) + " label=1:" + str(
        len(result[(result['y'] == 'safe') & (result['dis'] <= x) & (result['label'] == 1)])))
    print("malicious dis > " + str(x) + " :" + str(len(result[(result['y'] == 'malicious') & (result['dis'] > x)])))
    print("malicious dis > " + str(x) + " label=1:" + str(
        len(result[(result['y'] == 'malicious') & (result['dis'] > x) & (result['label'] == 1)])))
    print("malicious dis <= " + str(x) + " :" + str(len(result[(result['y'] == 'malicious') & (result['dis'] <= x)])))
    print("malicious dis <= " + str(x) + " label=1:" + str(
        len(result[(result['y'] == 'malicious') & (result['dis'] <= x) & (result['label'] == 1)])))

    # Compute the scores
    TP = len(result[(result['pre'] == 1) & (result['y'] == 'malicious')])
    FN = len(result[(result['pre'] == 0) & (result['y'] == 'malicious')])
    FP = len(result[(result['pre'] == 1) & (result['y'] == 'safe')])
    TN = len(result[(result['pre'] == 0) & (result['y'] == 'safe')])
    P = TP / (TP + FP)
    R = TP / (TP + FN)
    F1 = 2 * TP / (2 * TP + FP + FN)
    auc = (TP + TN) / (TP + FN + FP + TN)
    print("Precision = {} Recall = {} F1_Score = {} Accuracy = {}".format(P, R, F1, auc))
    print(TP, FN, FP, TN)
Example #32
# Author Taher Ahmadi

import read_data
import numpy as np
from matplotlib import gridspec
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
import pandas as pd

data_set_1, label_1 = read_data.read('./data_set/Dataset2.csv', 6)

# Split train and test data
train_data_1 = data_set_1[:200]
train_label_1 = label_1[:200]
test_data_1 = data_set_1[200:]
test_label_1 = label_1[200:]

print('Train data size:', len(train_data_1))
print('Test data size:', len(test_data_1))

# Scatter plot each feature vs label
fig = plt.figure()
gs = gridspec.GridSpec(2, 3)
counter = 0
for i in range(0, 2):
    for j in range(0, 3):
        counter += 1
        ax_temp = fig.add_subplot(gs[i, j])
        ax_temp.scatter(train_data_1.get(counter - 1), train_label_1)
        ax_temp.title.set_text(('Feature ' + str(counter)))
plt.show()
Example #33
def running(learning_rate, keep_prob, BATCH_SIZE, weight_decay):
    x = tf.placeholder(tf.float32, [BATCH_SIZE, 140, 320, 3])
    y = tf.placeholder(tf.float32, [BATCH_SIZE])

    global_step = tf.Variable(0, trainable=False)

    ##### training queue inputs #####

    ## get input
    train_images, train_angles, valid_images, valid_angles = read_data.read(
        input_path, eval_path)

    num_train = len(train_angles)
    num_valid = len(valid_angles)

    train_per_epoch = int((num_train * 1.0) / BATCH_SIZE)
    valid_per_epoch = int((num_valid * 1.0) / BATCH_SIZE)

    learning_rate_decay = tf.train.exponential_decay(learning_rate,
                                                     global_step,
                                                     30000,
                                                     0.80,
                                                     staircase=True)
    tf.scalar_summary('learning_rate', learning_rate_decay)
    ## pointer
    train_pointer = 0
    valid_pointer = 0

    ## inference build model
    prediction = pred_steer.inference(x, train_flag, drop_prob, wd)

    ## calculate loss
    loss = pred_steer.loss(prediction, y)

    ## build model per batch and update parameters
    train_op = pred_steer.train(loss, learning_rate_decay, global_step)

    ## build initialization operation
    init = tf.initialize_all_variables()
    ## merge all summaries and initialize writer
    #summary_op = tf.merge_all_summaries()
    #train_writer = tf.train.SummaryWriter("./tensorboard", graph = tf.get_default_graph())

    tf.scalar_summary('train_RMSE', tf.sqrt(loss))
    #tf.scalar_summary('l2_norm', l2)
    #tf.scalar_summary('train_pred', tf.reduce_mean(prediction))
    #tf.scalar_summary('eval_pred', tf.reduce_mean(eval_pred))
    #tf.scalar_summary('train_angle', tf.reduce_mean(angles))
    #tf.scalar_summary('eval_angle', tf.reduce_mean(tf.string_to_number(eval_angs, out_type = tf.float32)))

    sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    merged = tf.merge_all_summaries()
    writer = tf.train.SummaryWriter("./tensor/", sess.graph)
    saver = tf.train.Saver()
    sess.run(init)

    epoch = 0
    ## start the queue runners
    #coord = tf.train.Coordinator()
    #enqueue_threads = tf.train.start_queue_runners(sess = sess, coord = coord)
    saver.restore(sess, './save/my-model-121000')

    for step in range(1, 400000):
        start_time = time.time()
        images_array, angles_array = read_data.Train_Batch(
            train_images, train_angles, BATCH_SIZE)  #, train_pointer)
        _, summary = sess.run(
            [train_op, merged],
            feed_dict={
                x: images_array,
                y: angles_array,
                train_flag: True,
                drop_prob: keep_prob,
                wd: weight_decay
            })
        if step % 20 == 0:
            #train_images_sub, train_angles_sub = read_data.Train_Batch(train_images, train_angles, BATCH_SIZE)
            eval_images_array, eval_angles_array = read_data.Valid_Batch(
                valid_images, valid_angles, BATCH_SIZE)  #, valid_pointer)
            #print("step: %d, eval_loss: %g"%(step, sess.run(loss, feed_dict = {
            #	x: eval_images_array, y:eval_angles_array, train_flag:False, drop_prob:1.0})))
            #train_out = sess.run(loss, feed_dict = {x: train_images_sub, y: train_angles_sub, train_flag:False, drop_prob:1.0, wd:0.0})
            out = sess.run(loss,
                           feed_dict={
                               x: eval_images_array,
                               y: eval_angles_array,
                               train_flag: False,
                               drop_prob: 1.0,
                               wd: 0.0
                           })
            print("step: " + str(step) + " loss: " + str(np.sqrt(out)))
            if step % 2000 == 0:
                #checkpath = "./save/model.ckpt"
                filename = saver.save(sess,
                                      './save/my-model',
                                      global_step=global_step)
                #filename = saver.save(sess, checkpath)
                print("Model saved in file: %s" % filename)
            # _, summary = sess.run([train_op, summary_op])
            #train_writer.add_summary(summary, step)
        #duration = time.time() - start_time
        writer.add_summary(summary, step)
Example #34
from datetime import datetime
from alexnet import AlexNet
from generator import Generator
import read_data as readData
import evaluation as eval
from sklearn import utils
import numpy as np  # needed for np.floor below
from tqdm import *

learning_rates = [0.0000001]
num_epochs = 10
batch_size = 450
path = 'data/dataset/'

print "####################### ConTagNet ############################"
print "loading the dataset: please wait for a while!"
X1_train, X2_train, Y_train = readData.read(path + 'train/')
X1_val, X2_val, Y_val = readData.read(path + 'validate/')
X1_test, X2_test, Y_test = readData.read(path + 'test/')

X2_train = readData.readCoTagNet(path + 'train/')
X2_val = readData.readCoTagNet(path + 'validate/')
X2_test = readData.readCoTagNet(path + 'test/')

X_train_generator = Generator(X1_train, X2_train, Y_train)
X_validate_generator = Generator(X1_val, X2_val, Y_val)
X_test_generator = Generator(X1_test, X2_test, Y_test)

train_batches_per_epoch = np.floor(X_train_generator.num_samples /
                                   batch_size).astype(np.int16)
val_batches_per_epoch = np.floor(X_validate_generator.num_samples /
                                 batch_size).astype(np.int16)
Example #35
    first_bin_edge = 50
    min_exp = int(np.log(first_bin_edge)/np.log(base))
    max_exp = int(np.ceil(np.log(max_num)/np.log(base)))
    bins = []
    for i in range(min_exp, max_exp+1):
        bins.append(base**i)
    return bins

fracdim = []

n_bins = 100

f_arr = np.zeros((n_bins,))
for i in range(1, 11):
    print(i)
    freq, bonds = read('../data/p0.49n10000/run'+str(i)+'p0.49n10000.data')
    flatten = [bond for burst in bonds for bond in burst]
    radius = [r(*bond[0:2]) for bond in flatten]
    f, bins = np.histogram(radius, bins=n_bins)
    f_arr += f    
bins = bin_middles(bins)

M = 1/10000*np.cumsum(f_arr) #get mass as the integral of f_arr. 
logm = np.log(M)
logr = np.log(bins) 

s=slice(0,-1)

logfit, logcov = opt.curve_fit(lin, logr[s], logm[s]) #filter end and start
plt.plot(logr, logm,'o')
plt.plot(logr[s], [lin(x, *logfit) for x in logr[s]], '--')