def test_variable_size_image(self):
    # Two images with different shapes; SFrameIter should reject an image
    # column whose elements are not uniformly sized.
    shape1 = (2, 3, 1)
    shape2 = (2, 2, 2)
    tmp1 = gl.SArray([array.array('d', [0] * 6)])
    tmp2 = gl.SArray([array.array('d', [0] * 8)])
    data = gl.SFrame({
        'x': [tmp1.pixel_array_to_image(*shape1)[0],
              tmp2.pixel_array_to_image(*shape2)[0]]
    })
    it = mxnet.io.SFrameIter(data, data_field='x')
    # assertRaises takes the expected exception first; iterating the
    # iterator is what should raise.
    self.assertRaises(Exception, lambda: [batch for batch in it])
def generate_features_from_lists(list1, list2):
    set1 = set(list1)
    set2 = set(list2)
    feature0 = len(list1)           # length of list1
    feature1 = len(list2)           # length of list2
    feature2 = len(set1)            # distinct elements in list1
    feature3 = len(set2)            # distinct elements in list2
    feature4 = len(set1 & set2)     # size of the intersection
    feature5 = len(set1 | set2)     # size of the union
    # Jaccard similarity; multiply by 1.0 to force float division under
    # Python 2.
    feature6 = feature4 * 1.0 / max(feature5, 1)
    # Average element length in each list.
    feature7 = sum(len(e) for e in list1) * 1.0 / max(feature0, 1)
    feature8 = sum(len(e) for e in list2) * 1.0 / max(feature1, 1)
    # Exact equality only makes sense when the lengths match.
    if feature0 == feature1:
        feature9 = (list1 == list2)
    else:
        feature9 = False
    return sframe.SArray([
        feature0, feature1, feature2, feature3, feature4,
        feature5, feature6, feature7, feature8, feature9
    ])
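# Minimal usage sketch for generate_features_from_lists; the inputs below
# are illustrative and not taken from the original source.
feats = generate_features_from_lists(['the', 'cat', 'sat'],
                                     ['the', 'cat', 'ran'])
# The intersection {'the', 'cat'} has 2 elements and the union has 4, so
# the Jaccard feature feats[6] is 0.5; feats[9] is False (equal lengths
# but different contents).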
def generate_flows(input_url='data/scored_packets.csv',
                   output_url='data/raw_flows.csv'):
    '''Generate raw network flows from a list of captured packets.'''

    def __flow_id(x):
        # Canonical, direction-independent 5-tuple ID: the lexicographically
        # larger address always comes first, so both directions of a
        # conversation map to the same flow ID.
        if x['Source'] > x['Destination']:
            return (x['Source'] + '-' + x['Destination'] + '-' +
                    str(x['Source Port']) + '-' +
                    str(x['Destination Port']) + '-' + str(x['Protocol']))
        else:
            return (x['Destination'] + '-' + x['Source'] + '-' +
                    str(x['Destination Port']) + '-' +
                    str(x['Source Port']) + '-' + str(x['Protocol']))

    sorted_flow = sf.SFrame.read_csv(input_url, verbose=False)
    # Drop packets without source or destination ports.
    sorted_flow = sorted_flow[(sorted_flow['Source Port'] != '') &
                              (sorted_flow['Destination Port'] != '')]
    # Convert hex TCP flags to integers; empty values become 0.
    sorted_flow['tcp_Flags'] = sorted_flow['tcp_Flags'].apply(
        lambda x: int(x, 16) if x != '' else 0)
    sorted_flow['UFid'] = sorted_flow.apply(lambda x: __flow_id(x))
    sorted_flow = sorted_flow.sort(['UFid', 'Time'])

    packet_flow_memberships = []
    current_flow = 0
    current_ufid = None
    start_time = None
    for row in sorted_flow:
        if current_ufid is None:
            # First packet of a new flow.
            if start_time is None:
                start_time = row['Time']
            packet_flow_memberships.append(current_flow)
            current_ufid = row['UFid']
        elif row['UFid'] == current_ufid:
            if row['tcp_Flags'] & 1:
                # FIN flag set: terminate the connection, with this as the
                # last packet of the flow.
                packet_flow_memberships.append(current_flow)
                current_ufid = None
                start_time = None
                current_flow += 1
            # Time-outs (disabled):
            # elif row['Time'] - start_time >= 360000:
            #     current_flow += 1
            #     packet_flow_memberships.append(current_flow)
            #     current_ufid = None
            #     start_time = row['Time']
            else:
                packet_flow_memberships.append(current_flow)
                current_ufid = row['UFid']
        else:
            # A different 5-tuple: the previous flow is done; start a new one.
            current_flow += 1
            packet_flow_memberships.append(current_flow)
            current_ufid = row['UFid']
            start_time = row['Time']

    sorted_flow['FlowNo.'] = sf.SArray(packet_flow_memberships)
    sorted_flow.save(output_url)
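# Illustrative note (not from the original source): because __flow_id puts
# the lexicographically larger address first, both directions of a
# conversation map to the same UFid. For example, the packets
#   {'Source': '10.0.0.2', 'Destination': '10.0.0.1',
#    'Source Port': 443, 'Destination Port': 51000, 'Protocol': 'TCP'}
#   {'Source': '10.0.0.1', 'Destination': '10.0.0.2',
#    'Source Port': 51000, 'Destination Port': 443, 'Protocol': 'TCP'}
# both canonicalize to '10.0.0.2-10.0.0.1-443-51000-TCP', so request and
# reply packets land in the same flow.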
def text_to_word_lists(raw_text):
    # Convert to lower case.
    words = raw_text.lower()
    # Replace punctuation with spaces, then split into words.
    # (string.maketrans is the Python 2 API; Python 3 uses str.maketrans.)
    words = words.translate(
        string.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    words = words.split()
    # Set of English stop words from NLTK.
    stops = set(stopwords.words("english"))
    # Return both lists: with and without stop words.
    return sframe.SArray([words, [w for w in words if w not in stops]])
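# Minimal usage sketch (the input sentence is illustrative, not from the
# original source):
word_lists = text_to_word_lists("The cat sat on the mat!")
# word_lists[0] -> ['the', 'cat', 'sat', 'on', 'the', 'mat']
# word_lists[1] -> ['cat', 'sat', 'mat']   (NLTK stop words removed)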
def unitTests():
    # Test case 1
    example_labels = sframe.SArray([-1, -1, 1, 1, 1])
    if intermediate_node_num_mistakes(example_labels) == 2:
        print 'Test 1 passed!'
    else:
        print 'Test 1 failed... try again!'
    # Test case 2
    example_labels = sframe.SArray([-1, -1, 1, 1, 1, 1, 1])
    if intermediate_node_num_mistakes(example_labels) == 2:
        print 'Test 2 passed!'
    else:
        print 'Test 2 failed... try again!'
    # Test case 3
    example_labels = sframe.SArray([-1, -1, -1, -1, -1, 1, 1])
    if intermediate_node_num_mistakes(example_labels) == 2:
        print 'Test 3 passed!'
    else:
        print 'Test 3 failed... try again!'

#unitTests()
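# The tests above assume intermediate_node_num_mistakes returns the number
# of mistakes a majority-class prediction would make on the labels in a
# node. A minimal sketch of that function under this assumption (not the
# original implementation):
def intermediate_node_num_mistakes(labels_in_node):
    if len(labels_in_node) == 0:
        return 0
    num_positive = (labels_in_node == +1).sum()
    num_negative = (labels_in_node == -1).sum()
    # Predicting the majority class gets the minority count wrong.
    return min(num_positive, num_negative)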
def bipartition(cluster, maxiter=400, num_runs=4, seed=None):
    '''cluster: a dictionary containing the following keys
       * dataframe: original dataframe
       * matrix: same data, in matrix format
       * centroid: centroid for this particular cluster'''
    data_matrix = cluster['matrix']
    dataframe = cluster['dataframe']

    # Run k-means on the data matrix with k=2. We use scikit-learn here to
    # simplify the workflow.
    kmeans_model = KMeans(n_clusters=2, max_iter=maxiter, n_init=num_runs,
                          random_state=seed, n_jobs=1)
    kmeans_model.fit(data_matrix)
    centroids = kmeans_model.cluster_centers_
    cluster_assignment = kmeans_model.labels_

    # Divide the data matrix into two parts using the cluster assignments.
    data_matrix_left_child = data_matrix[cluster_assignment == 0]
    data_matrix_right_child = data_matrix[cluster_assignment == 1]

    # Divide the dataframe into two parts, again using the cluster
    # assignments (converted to an SArray so SFrame filtering works).
    cluster_assignment_sa = sframe.SArray(cluster_assignment)
    dataframe_left_child = dataframe[cluster_assignment_sa == 0]
    dataframe_right_child = dataframe[cluster_assignment_sa == 1]

    # Package the relevant variables for the two child clusters.
    cluster_left_child = {
        'matrix': data_matrix_left_child,
        'dataframe': dataframe_left_child,
        'centroid': centroids[0]
    }
    cluster_right_child = {
        'matrix': data_matrix_right_child,
        'dataframe': dataframe_right_child,
        'centroid': centroids[1]
    }
    return (cluster_left_child, cluster_right_child)
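# Illustrative usage sketch (names like tf_idf_matrix and wiki are
# assumptions, not from the original source): wrap the full dataset as the
# root cluster, then split recursively to grow a cluster hierarchy.
# root = {'matrix': tf_idf_matrix,
#         'dataframe': wiki,
#         'centroid': tf_idf_matrix.mean(axis=0)}
# left, right = bipartition(root, maxiter=100, num_runs=6, seed=1)
# left_left, left_right = bipartition(left, maxiter=100, num_runs=6, seed=1)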
def FlowIdentifier(filename):
    SF2 = sf.SFrame.read_csv(filename, verbose=False)
    print "Done reading"

    # Remove records missing a Source or Destination Port.
    SF2 = SF2[(SF2['Source Port'] != '') & (SF2['Destination Port'] != '')]

    # Convert hex TCP flags to integers, marking missing values as 0.
    SF2['tcp_Flags'] = SF2['tcp_Flags'].apply(
        lambda x: int(x, 16) if x != '' else 0)

    # Direction marker, used later for the IOPR feature.
    SF2['Forward'] = SF2.apply(
        lambda x: 1 if x['Source'] > x['Destination'] else 0)

    # Assign a flow ID based on the 5-tuple.
    SF2['UFid'] = SF2.apply(lambda x: flow_id(x))

    # Bidirectional flow identification.
    FlowNo = 0          # unique flow number, assigned to every packet
    prev = None
    Flow = []           # flow number for each packet, in order
    count = 0
    startTime = None    # start time of each flow, used for the timeout

    # Sort by the 5-tuple flow ID and time so that all packets of the same
    # 5-tuple are grouped together, which makes identifying flows easier.
    SF2 = SF2.sort(['UFid', 'Time'])

    # Label every packet with the unique flow number it belongs to.
    for x in SF2:
        # if count % 500000 == 0:
        #     print 'Running ' + str(count) + ' Done !'
        count = count + 1
        if prev is None:
            # New flow: record the start time and the flow number.
            if startTime is None:
                startTime = x['Time']
            Flow.append(FlowNo)
            prev = x['UFid']
        elif compareUF(x['UFid'], prev):
            # The packet belongs to an existing flow.
            if x['tcp_Flags'] & 1:
                # FIN flag: terminate the flow, with this as its last packet.
                Flow.append(FlowNo)
                prev = None
                startTime = None
                FlowNo = FlowNo + 1
            elif x['Time'] - startTime >= 3600:
                # The flow crossed the timeout value; start a new flow with
                # this as its first packet.
                FlowNo = FlowNo + 1
                Flow.append(FlowNo)
                prev = None
                startTime = x['Time']
            else:
                # Another packet in a pre-existing flow.
                Flow.append(FlowNo)
                prev = x['UFid']
        else:
            # The previous 5-tuple received no more packets; start a new flow.
            FlowNo = FlowNo + 1
            Flow.append(FlowNo)
            prev = x['UFid']
            startTime = x['Time']

    print len(sf.SArray(Flow).unique())
    SF2['Flow'] = sf.SArray(Flow)
    SF2['FlowNo.'] = sf.SArray(Flow)

    # Output: packet-wise flow numbers, stored as a CSV file in the same
    # folder. This file is used later to generate the features.
    SF2.save('Ports_Only_Sorted_Flow_BD.csv')
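# FlowIdentifier relies on two module-level helpers defined elsewhere in
# the project. Minimal sketches consistent with how they are used above
# (assumptions, not the original definitions):
def flow_id(x):
    # Canonical, direction-independent 5-tuple ID: the lexicographically
    # larger address always comes first.
    if x['Source'] > x['Destination']:
        return (x['Source'] + '-' + x['Destination'] + '-' +
                str(x['Source Port']) + '-' + str(x['Destination Port']) +
                '-' + str(x['Protocol']))
    return (x['Destination'] + '-' + x['Source'] + '-' +
            str(x['Destination Port']) + '-' + str(x['Source Port']) +
            '-' + str(x['Protocol']))

def compareUF(current, previous):
    # Two packets belong to the same flow when their canonical IDs match.
    return current == previous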
def Flow_Feature_Generator(packetcapturecsv):
    # Generate packet-wise flow numbers.
    FlowIdentifier(packetcapturecsv)
    SF2 = sf.SFrame.read_csv('Ports_Only_Sorted_Flow_BD.csv', verbose=False)

    ## FLOW-BASED FEATURE GENERATION

    ## Ratio of incoming to outgoing packets (IOPR)
    temp = SF2.groupby('FlowNo.', {
        'NumForward': sf.aggregate.SUM('Forward'),
        'Total': sf.aggregate.COUNT()
    })
    temp['IOPR'] = temp.apply(
        lambda x: ((x['Total'] - x['NumForward']) * 1.0) / x['NumForward']
        if x['NumForward'] != 0 else (-1))
    temp = temp[['FlowNo.', 'IOPR']]
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    ## First packet length (FPL)
    FPL = SF2.groupby(['FlowNo.'], {'Time': sf.aggregate.MIN('Time')})
    FPL = FPL.join(SF2, on=['FlowNo.', 'Time'])[['FlowNo.', 'Length']].unique()
    FPL = FPL.groupby(['FlowNo.'], {'FPL': sf.aggregate.AVG('Length')})
    SF2 = SF2.join(FPL, on='FlowNo.')
    del FPL

    ## Number of packets per flow
    temp = SF2.groupby(['FlowNo.'], {'NumPackets': sf.aggregate.COUNT()})
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    ## Number of bytes exchanged
    temp = SF2.groupby(['FlowNo.'], {'BytesEx': sf.aggregate.SUM('Length')})
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    ## Standard deviation of packet length
    temp = SF2.groupby(['FlowNo.'], {'StdDevLen': sf.aggregate.STDV('Length')})
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    ## Same-length packet ratio
    temp2 = SF2.groupby(
        ['FlowNo.'],
        {'SameLenPktRatio': sf.aggregate.COUNT_DISTINCT('Length')})
    temp = SF2.groupby(['FlowNo.'], {'NumPackets': sf.aggregate.COUNT()})
    temp = temp.join(temp2, on='FlowNo.')
    temp['SameLenPktRatio'] = (temp['SameLenPktRatio'] * 1.0 /
                               temp['NumPackets'])
    temp2 = None
    temp = temp[['FlowNo.', 'SameLenPktRatio']]
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    ## Duration of flow
    timeF = SF2.groupby(['FlowNo.'], {
        'startTime': sf.aggregate.MIN('Time'),
        'endTime': sf.aggregate.MAX('Time')
    })
    timeF['Duration'] = timeF['endTime'] - timeF['startTime']
    timeF = timeF[['FlowNo.', 'Duration']]
    SF2 = SF2.join(timeF, on='FlowNo.')

    # Relevant features extracted so far.
    features = [
        'Answer RRs', 'BytesEx', 'Destination', 'Destination Port',
        'Duration', 'FPL', 'IP_Flags', 'Length', 'Next sequence number',
        'No.', 'NumPackets', 'Protocol', 'Protocols in frame',
        'SameLenPktRatio', 'Sequence number', 'Source', 'Source Port',
        'StdDevLen', 'TCP Segment Len', 'Time', 'tcp_Flags', 'FlowNo.',
        'udp_Length', 'IOPR'
    ]
    SF2 = SF2[features]

    ## Average packets per second
    temp = SF2.groupby(['FlowNo.'], {'NumPackets': sf.aggregate.COUNT()})
    temp = temp.join(timeF, on=['FlowNo.'])
    temp['AvgPktPerSec'] = temp.apply(
        lambda x: 0.0 if x['Duration'] == 0.0
        else x['NumPackets'] * 1.0 / x['Duration'])
    temp = temp[['FlowNo.', 'AvgPktPerSec']]
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    ## Average bits per second
    temp = SF2.groupby(['FlowNo.'], {'BytesEx': sf.aggregate.SUM('Length')})
    temp = temp.join(timeF, on=['FlowNo.'])
    temp['BitsPerSec'] = temp.apply(
        lambda x: 0.0 if x['Duration'] == 0.0
        else x['BytesEx'] * 8.0 / x['Duration'])
    temp = temp[['FlowNo.', 'BitsPerSec']]
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    ## Average packet length (APL)
    temp = SF2.groupby(['FlowNo.'], {'APL': sf.aggregate.AVG('Length')})
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    ## Inter-arrival time (IAT) of the packets
    SF2['IAT'] = 0
    SF2 = SF2.sort(['FlowNo.', 'Time'])
    prev = None
    prevT = None
    li = []
    for x in SF2:
        if prev is None or x['FlowNo.'] != prev:
            li.append(0)
        else:
            li.append(x['Time'] - prevT)
        prev = x['FlowNo.']
        prevT = x['Time']
    SF2['IAT'] = sf.SArray(li)

    ## Number of TCP null packets
    def checkNull(x):
        if x['TCP Segment Len'] == '0' or x['udp_Length'] == 8:
            return 1
        elif 'ipx' in x['Protocols in frame'].split(':'):
            # Subtract the IPX header, plus any lower-layer headers present.
            leftover = x['Length'] - 30
            if 'eth' in x['Protocols in frame'].split(':'):
                leftover = leftover - 14
            if 'ethtype' in x['Protocols in frame'].split(':'):
                leftover = leftover - 2
            if 'llc' in x['Protocols in frame'].split(':'):
                leftover = leftover - 8
            if leftover == 0 or leftover == -1:
                return 1
        return 0

    SF2['isNull'] = SF2.apply(lambda x: checkNull(x))
    NPEx = SF2.groupby(['FlowNo.'], {'NPEx': sf.aggregate.SUM('isNull')})
    SF2 = SF2.join(NPEx, on='FlowNo.')
    del NPEx

    ## Number of reconnects -- TCP only, based on repeated sequence numbers
    recon = SF2[SF2['Sequence number'] != ''].groupby(
        ['FlowNo.'], {
            'total_seq_no.': sf.aggregate.COUNT('Sequence number'),
            'distinct_seq_no.':
                sf.aggregate.COUNT_DISTINCT('Sequence number')
        })
    recon['reconnects'] = recon['total_seq_no.'] - recon['distinct_seq_no.']
    recon = recon[['FlowNo.', 'reconnects']]
    SF2 = SF2.join(recon, on='FlowNo.', how='left')
    del recon

    # Mark records where the reconnect check did not apply (e.g. UDP).
    # Note that many of these "reconnects" can be simple retransmissions
    # caused by out-of-order delivery or timeouts. fillna returns a new
    # SFrame, so the result must be assigned back.
    SF2 = SF2.fillna('reconnects', -1)

    ## Recompute the direction marker, then collapse the packet-level
    ## features into flow-based information.
    SF2['Forward'] = SF2.apply(
        lambda x: 1 if x['Source'] > x['Destination'] else 0)
    temp = SF2.groupby('FlowNo.', {
        'NumForward': sf.aggregate.SUM('Forward'),
    })
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    # Keep only the FLOW BASED FEATURES: one representative value per flow.
    flow_columns = [
        'Answer RRs', 'BytesEx', 'Destination', 'Destination Port',
        'Duration', 'FPL', 'IP_Flags', 'Length', 'Next sequence number',
        'No.', 'NumPackets', 'Protocol', 'Protocols in frame',
        'SameLenPktRatio', 'Sequence number', 'Source', 'Source Port',
        'StdDevLen', 'IAT', 'isNull', 'NPEx', 'reconnects', 'APL',
        'BitsPerSec', 'AvgPktPerSec', 'udp_Length', 'tcp_Flags', 'Time',
        'TCP Segment Len', 'IOPR', 'NumForward'
    ]
    SF2 = SF2.groupby(
        'FlowNo.',
        {col: sf.aggregate.SELECT_ONE(col) for col in flow_columns})

    # FINAL OUTPUT: a CSV file with all the flows and their extracted
    # flow-based features.
    SF2.save('Bidirectional_Botnet_all_features.csv')
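# Illustrative end-to-end run (the input filename below is an assumption,
# not from the original source). This labels every packet with a flow
# number via FlowIdentifier, then collapses the packets into one feature
# row per flow in 'Bidirectional_Botnet_all_features.csv':
# Flow_Feature_Generator('packet_capture.csv')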
def flow_separator(input_url=PROJ_ROOT + 'data/modified_data.csv',
                   output_url=PROJ_ROOT + 'models/sorted_flow.csv'):
    print("Initiating flow separation")
    sorted_flow = sf.SFrame.read_csv(input_url, verbose=False)

    print("Preprocessing file")
    # Drop packets without source or destination ports.
    sorted_flow = sorted_flow[(sorted_flow['Source Port'] != '') &
                              (sorted_flow['Destination Port'] != '')]
    # Direction marker, used later for the IOPR feature.
    sorted_flow['Forward'] = sorted_flow.apply(
        lambda x: 1 if x['Source'] > x['Destination'] else 0)
    # Convert hex TCP flags to integers; empty values become 0.
    sorted_flow['tcp_Flags'] = sorted_flow['tcp_Flags'].apply(
        lambda x: int(x, 16) if x != '' else 0)
    sorted_flow['UFid'] = sorted_flow.apply(lambda x: flow_id(x))
    sorted_flow = sorted_flow.sort(['UFid', 'Time'])

    # Master flow list: one flow number per packet, in order.
    Flow = []
    current_flow_id = 0   # incrementing flow ID
    prev_flow_id = None
    startTime = None      # start time of each flow, used for the timeout

    for row in sorted_flow:
        if prev_flow_id is None:
            # No previous flow to continue; this packet starts a new flow.
            if startTime is None:
                startTime = row['Time']
            Flow.append(current_flow_id)
            prev_flow_id = row['UFid']
        elif row['UFid'] == prev_flow_id:
            if row['tcp_Flags'] & 1:
                # TCP FIN: terminate the flow with this packet.
                Flow.append(current_flow_id)
                prev_flow_id = None
                startTime = None
                current_flow_id += 1
            elif row['Time'] - startTime >= 3600:
                # Timeout: terminate and restart with this packet.
                current_flow_id += 1
                Flow.append(current_flow_id)
                prev_flow_id = None
                startTime = row['Time']
            else:
                # Another packet in the current flow.
                Flow.append(current_flow_id)
                prev_flow_id = row['UFid']
        else:
            # The previous 5-tuple received no more packets; start a new flow.
            current_flow_id += 1
            Flow.append(current_flow_id)
            prev_flow_id = row['UFid']
            startTime = row['Time']

    print("Flow sorting complete")
    print(len(sf.SArray(Flow).unique()))
    sorted_flow['Flow'] = sf.SArray(Flow)
    sorted_flow['FlowNo.'] = sf.SArray(Flow)
    sorted_flow.save(output_url)
    print("Flow saved\n##############################")
def flow_featurization(input_url=PROJ_ROOT + 'models/sorted_flow.csv',
                       output_url=PROJ_ROOT + 'models/all_features.csv'):
    print("Initializing flow featurization")
    flow_list = sf.SFrame.read_csv(input_url, verbose=False)

    ## Ratio of incoming to outgoing packets (IOPR)
    temp = flow_list.groupby('FlowNo.', {
        'NumForward': sf.aggregate.SUM('Forward'),
        'Total': sf.aggregate.COUNT()
    })
    temp['IOPR'] = temp.apply(
        lambda x: ((x['Total'] - x['NumForward']) * 1.0) / x['NumForward']
        if x['NumForward'] != 0 else (-1))
    temp = temp[['FlowNo.', 'IOPR']]
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp
    print("  Ratio measuring complete")

    ## First packet length (FPL)
    FPL = flow_list.groupby(['FlowNo.'], {'Time': sf.aggregate.MIN('Time')})
    FPL = FPL.join(flow_list,
                   on=['FlowNo.', 'Time'])[['FlowNo.', 'Length']].unique()
    FPL = FPL.groupby(['FlowNo.'], {'FPL': sf.aggregate.AVG('Length')})
    flow_list = flow_list.join(FPL, on='FlowNo.')
    del FPL
    print("  First packet length measured")

    ## Number of packets per flow
    temp = flow_list.groupby(['FlowNo.'],
                             {'NumPackets': sf.aggregate.COUNT()})
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp
    print("  Packet count measured")

    ## Number of bytes exchanged
    temp = flow_list.groupby(['FlowNo.'],
                             {'BytesEx': sf.aggregate.SUM('Length')})
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp
    print("  Byte exchange measured")

    ## Standard deviation of packet length
    temp = flow_list.groupby(['FlowNo.'],
                             {'StdDevLen': sf.aggregate.STDV('Length')})
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp
    print("  Standard deviation of packet length measured")

    ## Same-length packet ratio
    temp2 = flow_list.groupby(
        ['FlowNo.'],
        {'SameLenPktRatio': sf.aggregate.COUNT_DISTINCT('Length')})
    temp = flow_list.groupby(['FlowNo.'],
                             {'NumPackets': sf.aggregate.COUNT()})
    temp = temp.join(temp2, on='FlowNo.')
    temp['SameLenPktRatio'] = (temp['SameLenPktRatio'] * 1.0 /
                               temp['NumPackets'])
    temp2 = None
    temp = temp[['FlowNo.', 'SameLenPktRatio']]
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp
    print("  Same-length packet ratio measured")

    ## Duration of flow
    timeF = flow_list.groupby(['FlowNo.'], {
        'startTime': sf.aggregate.MIN('Time'),
        'endTime': sf.aggregate.MAX('Time')
    })
    timeF['Duration'] = timeF['endTime'] - timeF['startTime']
    timeF = timeF[['FlowNo.', 'Duration']]
    flow_list = flow_list.join(timeF, on='FlowNo.')
    print("  Duration of flow measured")

    # Relevant features extracted so far.
    features = [
        'BytesEx', 'Destination', 'Destination Port', 'Duration', 'FPL',
        'IP_Flags', 'Length', 'NumPackets', 'Protocol',
        'Protocols in frame', 'SameLenPktRatio', 'Source', 'Source Port',
        'StdDevLen', 'TCP Segment Len', 'Time', 'tcp_Flags', 'FlowNo.',
        'udp_Length', 'IOPR'
    ]
    flow_list = flow_list[features]

    ## Average packets per second
    temp = flow_list.groupby(['FlowNo.'],
                             {'NumPackets': sf.aggregate.COUNT()})
    temp = temp.join(timeF, on=['FlowNo.'])
    temp['AvgPktPerSec'] = temp.apply(
        lambda x: 0.0 if x['Duration'] == 0.0
        else x['NumPackets'] * 1.0 / x['Duration'])
    temp = temp[['FlowNo.', 'AvgPktPerSec']]
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp
    print("  Average packets per second calculated")

    ## Average bits per second
    temp = flow_list.groupby(['FlowNo.'],
                             {'BytesEx': sf.aggregate.SUM('Length')})
    temp = temp.join(timeF, on=['FlowNo.'])
    temp['BitsPerSec'] = temp.apply(
        lambda x: 0.0 if x['Duration'] == 0.0
        else x['BytesEx'] * 8.0 / x['Duration'])
    temp = temp[['FlowNo.', 'BitsPerSec']]
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp
    print("  Average bits per second calculated")

    ## Average packet length (APL)
    temp = flow_list.groupby(['FlowNo.'], {'APL': sf.aggregate.AVG('Length')})
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp
    print("  Average packet length calculated")

    ## Inter-arrival time (IAT) of the packets
    flow_list['IAT'] = 0
    flow_list = flow_list.sort(['FlowNo.', 'Time'])
    prev = None
    prevT = None
    li = []
    for x in flow_list:
        if prev is None or x['FlowNo.'] != prev:
            li.append(0)
        else:
            li.append(x['Time'] - prevT)
        prev = x['FlowNo.']
        prevT = x['Time']
    flow_list['IAT'] = sf.SArray(li)

    ## Null packets handling
    def checkNull(x):
        if x['TCP Segment Len'] == '0' or x['udp_Length'] == 8:
            return 1
        elif 'ipx' in x['Protocols in frame'].split(':'):
            # Subtract the IPX header, plus any lower-layer headers present.
            leftover = x['Length'] - 30
            if 'eth' in x['Protocols in frame'].split(':'):
                leftover = leftover - 14
            if 'ethtype' in x['Protocols in frame'].split(':'):
                leftover = leftover - 2
            if 'llc' in x['Protocols in frame'].split(':'):
                leftover = leftover - 8
            if leftover == 0 or leftover == -1:
                return 1
        return 0

    flow_list['isNull'] = flow_list.apply(lambda x: checkNull(x))
    NPEx = flow_list.groupby(['FlowNo.'], {'NPEx': sf.aggregate.SUM('isNull')})
    flow_list = flow_list.join(NPEx, on='FlowNo.')
    del NPEx
    print("  Null packets handled")

    ## Recompute the direction marker, then collapse the packet-level
    ## features into one row per flow.
    flow_list['Forward'] = flow_list.apply(
        lambda x: 1 if x['Source'] > x['Destination'] else 0)
    temp = flow_list.groupby('FlowNo.', {
        'NumForward': sf.aggregate.SUM('Forward'),
    })
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp

    # Keep only the flow-based features: one representative value per flow.
    flow_columns = [
        'BytesEx', 'Destination', 'Destination Port', 'Duration', 'FPL',
        'IP_Flags', 'Length', 'NumPackets', 'Protocol',
        'Protocols in frame', 'SameLenPktRatio', 'Source', 'Source Port',
        'StdDevLen', 'IAT', 'isNull', 'NPEx', 'APL', 'BitsPerSec',
        'AvgPktPerSec', 'udp_Length', 'tcp_Flags', 'Time',
        'TCP Segment Len', 'IOPR', 'NumForward'
    ]
    flow_list = flow_list.groupby(
        'FlowNo.',
        {col: sf.aggregate.SELECT_ONE(col) for col in flow_columns})

    flow_list.save(output_url)
    print("Flow feature generation complete")
    print("Updated flow saved")
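# Illustrative pipeline run using the default paths (PROJ_ROOT is assumed
# to be defined elsewhere in the project):
# flow_separator()        # data/modified_data.csv -> models/sorted_flow.csv
# flow_featurization()    # models/sorted_flow.csv -> models/all_features.csv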
def polynomial_sframe(feature, degree):
    # Assume degree >= 1 and initialize the SFrame.
    poly_sframe = sframe.SFrame()
    # First degree.
    poly_sframe['power_1'] = feature
    if degree > 1:
        # Loop over the remaining degrees. range() usually starts at 0 and
        # stops at endpoint-1; we want it to start at 2 and stop at degree.
        for power in range(2, degree + 1):
            name = 'power_' + str(power)
            poly_sframe[name] = feature.apply(lambda x: x**power)
    return poly_sframe

# Example
tmp = sframe.SArray([1., 2., 3.])
print polynomial_sframe(tmp, 6)

'''
# Visualizing polynomial regression
# For plotting purposes (connecting the dots), we must sort by the values.
sales = sales.sort(['sqft_living', 'price'])
poly_data = polynomial_sframe(sales['sqft_living'], 1)
poly_data['price'] = sales['price']
poly_data = sframe.SFrame.to_dataframe(poly_data)
output = poly_data['price']
input_features = poly_data.drop('price', axis=1)

from sklearn import linear_model
regr = linear_model.LinearRegression()
model = regr.fit(input_features, output)
print model.intercept_
'''