def test_variable_size_image(self):
    # Two images with different shapes; SFrameIter should reject an image
    # column whose elements are not uniformly sized.
    shape1 = (2, 3, 1)
    shape2 = (2, 2, 2)
    tmp1 = gl.SArray([array.array('d', [0] * 6)])
    tmp2 = gl.SArray([array.array('d', [0] * 8)])
    data = gl.SFrame({
        'x': [tmp1.pixel_array_to_image(*shape1)[0],
              tmp2.pixel_array_to_image(*shape2)[0]]
    })
    it = mxnet.io.SFrameIter(data, data_field='x')
    # assertRaises takes the expected exception first; iterating the
    # iterator is what should raise.
    self.assertRaises(Exception, lambda: [batch for batch in it])
def generate_features_from_lists(list1, list2):
    set1 = set(list1)
    set2 = set(list2)
    feature0 = len(list1)           # length of list1
    feature1 = len(list2)           # length of list2
    feature2 = len(set1)            # distinct elements in list1
    feature3 = len(set2)            # distinct elements in list2
    feature4 = len(set1 & set2)     # size of the intersection
    feature5 = len(set1 | set2)     # size of the union
    # Jaccard similarity; multiply by 1.0 to force float division under
    # Python 2.
    feature6 = feature4 * 1.0 / max(feature5, 1)
    # Average element length in each list.
    feature7 = sum(len(e) for e in list1) * 1.0 / max(feature0, 1)
    feature8 = sum(len(e) for e in list2) * 1.0 / max(feature1, 1)
    # Exact equality only makes sense when the lengths match.
    if feature0 == feature1:
        feature9 = (list1 == list2)
    else:
        feature9 = False
    return sframe.SArray([
        feature0, feature1, feature2, feature3, feature4,
        feature5, feature6, feature7, feature8, feature9
    ])
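# Minimal usage sketch for generate_features_from_lists; the inputs below
# are illustrative and not taken from the original source.
feats = generate_features_from_lists(['the', 'cat', 'sat'],
                                     ['the', 'cat', 'ran'])
# The intersection {'the', 'cat'} has 2 elements and the union has 4, so
# the Jaccard feature feats[6] is 0.5; feats[9] is False (equal lengths
# but different contents).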
def generate_flows(input_url='data/scored_packets.csv',
                   output_url='data/raw_flows.csv'):
    '''Generate raw network flows from a list of captured packets.'''

    def __flow_id(x):
        # Canonical, direction-independent 5-tuple ID: the lexicographically
        # larger address always comes first, so both directions of a
        # conversation map to the same flow ID.
        if x['Source'] > x['Destination']:
            return (x['Source'] + '-' + x['Destination'] + '-' +
                    str(x['Source Port']) + '-' +
                    str(x['Destination Port']) + '-' + str(x['Protocol']))
        else:
            return (x['Destination'] + '-' + x['Source'] + '-' +
                    str(x['Destination Port']) + '-' +
                    str(x['Source Port']) + '-' + str(x['Protocol']))

    sorted_flow = sf.SFrame.read_csv(input_url, verbose=False)
    # Drop packets without source or destination ports.
    sorted_flow = sorted_flow[(sorted_flow['Source Port'] != '') &
                              (sorted_flow['Destination Port'] != '')]
    # Convert hex TCP flags to integers; empty values become 0.
    sorted_flow['tcp_Flags'] = sorted_flow['tcp_Flags'].apply(
        lambda x: int(x, 16) if x != '' else 0)
    sorted_flow['UFid'] = sorted_flow.apply(lambda x: __flow_id(x))
    sorted_flow = sorted_flow.sort(['UFid', 'Time'])

    packet_flow_memberships = []
    current_flow = 0
    current_ufid = None
    start_time = None
    for row in sorted_flow:
        if current_ufid is None:
            # First packet of a new flow.
            if start_time is None:
                start_time = row['Time']
            packet_flow_memberships.append(current_flow)
            current_ufid = row['UFid']
        elif row['UFid'] == current_ufid:
            if row['tcp_Flags'] & 1:
                # FIN flag set: terminate the connection, with this as the
                # last packet of the flow.
                packet_flow_memberships.append(current_flow)
                current_ufid = None
                start_time = None
                current_flow += 1
            # Time-outs (disabled):
            # elif row['Time'] - start_time >= 360000:
            #     current_flow += 1
            #     packet_flow_memberships.append(current_flow)
            #     current_ufid = None
            #     start_time = row['Time']
            else:
                packet_flow_memberships.append(current_flow)
                current_ufid = row['UFid']
        else:
            # A different 5-tuple: the previous flow is done; start a new one.
            current_flow += 1
            packet_flow_memberships.append(current_flow)
            current_ufid = row['UFid']
            start_time = row['Time']

    sorted_flow['FlowNo.'] = sf.SArray(packet_flow_memberships)
    sorted_flow.save(output_url)
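# Illustrative note (not from the original source): because __flow_id puts
# the lexicographically larger address first, both directions of a
# conversation map to the same UFid. For example, the packets
#   {'Source': '10.0.0.2', 'Destination': '10.0.0.1',
#    'Source Port': 443, 'Destination Port': 51000, 'Protocol': 'TCP'}
#   {'Source': '10.0.0.1', 'Destination': '10.0.0.2',
#    'Source Port': 51000, 'Destination Port': 443, 'Protocol': 'TCP'}
# both canonicalize to '10.0.0.2-10.0.0.1-443-51000-TCP', so request and
# reply packets land in the same flow.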
def text_to_word_lists(raw_text):
    # Convert to lower case.
    words = raw_text.lower()
    # Replace punctuation with spaces, then split into words.
    # (string.maketrans is the Python 2 API; Python 3 uses str.maketrans.)
    words = words.translate(
        string.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    words = words.split()
    # Set of English stop words from NLTK.
    stops = set(stopwords.words("english"))
    # Return both lists: with and without stop words.
    return sframe.SArray([words, [w for w in words if w not in stops]])
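# Minimal usage sketch (the input sentence is illustrative, not from the
# original source):
word_lists = text_to_word_lists("The cat sat on the mat!")
# word_lists[0] -> ['the', 'cat', 'sat', 'on', 'the', 'mat']
# word_lists[1] -> ['cat', 'sat', 'mat']   (NLTK stop words removed)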
def unitTests():
    # Test case 1
    example_labels = sframe.SArray([-1, -1, 1, 1, 1])
    if intermediate_node_num_mistakes(example_labels) == 2:
        print 'Test 1 passed!'
    else:
        print 'Test 1 failed... try again!'
    # Test case 2
    example_labels = sframe.SArray([-1, -1, 1, 1, 1, 1, 1])
    if intermediate_node_num_mistakes(example_labels) == 2:
        print 'Test 2 passed!'
    else:
        print 'Test 2 failed... try again!'
    # Test case 3
    example_labels = sframe.SArray([-1, -1, -1, -1, -1, 1, 1])
    if intermediate_node_num_mistakes(example_labels) == 2:
        print 'Test 3 passed!'
    else:
        print 'Test 3 failed... try again!'

#unitTests()
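# The tests above assume intermediate_node_num_mistakes returns the number
# of mistakes a majority-class prediction would make on the labels in a
# node. A minimal sketch of that function under this assumption (not the
# original implementation):
def intermediate_node_num_mistakes(labels_in_node):
    if len(labels_in_node) == 0:
        return 0
    num_positive = (labels_in_node == +1).sum()
    num_negative = (labels_in_node == -1).sum()
    # Predicting the majority class gets the minority count wrong.
    return min(num_positive, num_negative)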
def bipartition(cluster, maxiter=400, num_runs=4, seed=None):
    '''cluster: a dictionary containing the following keys
       * dataframe: original dataframe
       * matrix: same data, in matrix format
       * centroid: centroid for this particular cluster'''
    data_matrix = cluster['matrix']
    dataframe = cluster['dataframe']

    # Run k-means on the data matrix with k=2. We use scikit-learn here to
    # simplify the workflow.
    kmeans_model = KMeans(n_clusters=2, max_iter=maxiter, n_init=num_runs,
                          random_state=seed, n_jobs=1)
    kmeans_model.fit(data_matrix)
    centroids = kmeans_model.cluster_centers_
    cluster_assignment = kmeans_model.labels_

    # Divide the data matrix into two parts using the cluster assignments.
    data_matrix_left_child = data_matrix[cluster_assignment == 0]
    data_matrix_right_child = data_matrix[cluster_assignment == 1]

    # Divide the dataframe into two parts, again using the cluster
    # assignments (converted to an SArray so SFrame filtering works).
    cluster_assignment_sa = sframe.SArray(cluster_assignment)
    dataframe_left_child = dataframe[cluster_assignment_sa == 0]
    dataframe_right_child = dataframe[cluster_assignment_sa == 1]

    # Package the relevant variables for the two child clusters.
    cluster_left_child = {
        'matrix': data_matrix_left_child,
        'dataframe': dataframe_left_child,
        'centroid': centroids[0]
    }
    cluster_right_child = {
        'matrix': data_matrix_right_child,
        'dataframe': dataframe_right_child,
        'centroid': centroids[1]
    }
    return (cluster_left_child, cluster_right_child)
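# Illustrative usage sketch (names like tf_idf_matrix and wiki are
# assumptions, not from the original source): wrap the full dataset as the
# root cluster, then split recursively to grow a cluster hierarchy.
# root = {'matrix': tf_idf_matrix,
#         'dataframe': wiki,
#         'centroid': tf_idf_matrix.mean(axis=0)}
# left, right = bipartition(root, maxiter=100, num_runs=6, seed=1)
# left_left, left_right = bipartition(left, maxiter=100, num_runs=6, seed=1)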
def FlowIdentifier(filename):
    SF2 = sf.SFrame.read_csv(filename, verbose=False)
    print "Done reading"

    # Remove records missing a Source or Destination Port.
    SF2 = SF2[(SF2['Source Port'] != '') & (SF2['Destination Port'] != '')]

    # Convert hex TCP flags to integers, marking missing values as 0.
    SF2['tcp_Flags'] = SF2['tcp_Flags'].apply(
        lambda x: int(x, 16) if x != '' else 0)

    # Direction marker, used later for the IOPR feature.
    SF2['Forward'] = SF2.apply(
        lambda x: 1 if x['Source'] > x['Destination'] else 0)

    # Assign a flow ID based on the 5-tuple.
    SF2['UFid'] = SF2.apply(lambda x: flow_id(x))

    # Bidirectional flow identification.
    FlowNo = 0          # unique flow number, assigned to every packet
    prev = None
    Flow = []           # flow number for each packet, in order
    count = 0
    startTime = None    # start time of each flow, used for the timeout

    # Sort by the 5-tuple flow ID and time so that all packets of the same
    # 5-tuple are grouped together, which makes identifying flows easier.
    SF2 = SF2.sort(['UFid', 'Time'])

    # Label every packet with the unique flow number it belongs to.
    for x in SF2:
        # if count % 500000 == 0:
        #     print 'Running ' + str(count) + ' Done !'
        count = count + 1
        if prev is None:
            # New flow: record the start time and the flow number.
            if startTime is None:
                startTime = x['Time']
            Flow.append(FlowNo)
            prev = x['UFid']
        elif compareUF(x['UFid'], prev):
            # The packet belongs to an existing flow.
            if x['tcp_Flags'] & 1:
                # FIN flag: terminate the flow, with this as its last packet.
                Flow.append(FlowNo)
                prev = None
                startTime = None
                FlowNo = FlowNo + 1
            elif x['Time'] - startTime >= 3600:
                # The flow crossed the timeout value; start a new flow with
                # this as its first packet.
                FlowNo = FlowNo + 1
                Flow.append(FlowNo)
                prev = None
                startTime = x['Time']
            else:
                # Another packet in a pre-existing flow.
                Flow.append(FlowNo)
                prev = x['UFid']
        else:
            # The previous 5-tuple received no more packets; start a new flow.
            FlowNo = FlowNo + 1
            Flow.append(FlowNo)
            prev = x['UFid']
            startTime = x['Time']

    print len(sf.SArray(Flow).unique())
    SF2['Flow'] = sf.SArray(Flow)
    SF2['FlowNo.'] = sf.SArray(Flow)

    # Output: packet-wise flow numbers, stored as a CSV file in the same
    # folder. This file is used later to generate the features.
    SF2.save('Ports_Only_Sorted_Flow_BD.csv')
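# FlowIdentifier relies on two module-level helpers defined elsewhere in
# the project. Minimal sketches consistent with how they are used above
# (assumptions, not the original definitions):
def flow_id(x):
    # Canonical, direction-independent 5-tuple ID: the lexicographically
    # larger address always comes first.
    if x['Source'] > x['Destination']:
        return (x['Source'] + '-' + x['Destination'] + '-' +
                str(x['Source Port']) + '-' + str(x['Destination Port']) +
                '-' + str(x['Protocol']))
    return (x['Destination'] + '-' + x['Source'] + '-' +
            str(x['Destination Port']) + '-' + str(x['Source Port']) +
            '-' + str(x['Protocol']))

def compareUF(current, previous):
    # Two packets belong to the same flow when their canonical IDs match.
    return current == previous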
def Flow_Feature_Generator(packetcapturecsv):
    # Generate packet-wise flow numbers.
    FlowIdentifier(packetcapturecsv)
    SF2 = sf.SFrame.read_csv('Ports_Only_Sorted_Flow_BD.csv', verbose=False)

    ## FLOW-BASED FEATURE GENERATION

    ## Ratio of incoming to outgoing packets (IOPR)
    temp = SF2.groupby('FlowNo.', {
        'NumForward': sf.aggregate.SUM('Forward'),
        'Total': sf.aggregate.COUNT()
    })
    temp['IOPR'] = temp.apply(
        lambda x: ((x['Total'] - x['NumForward']) * 1.0) / x['NumForward']
        if x['NumForward'] != 0 else (-1))
    temp = temp[['FlowNo.', 'IOPR']]
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    ## First packet length (FPL)
    FPL = SF2.groupby(['FlowNo.'], {'Time': sf.aggregate.MIN('Time')})
    FPL = FPL.join(SF2, on=['FlowNo.', 'Time'])[['FlowNo.', 'Length']].unique()
    FPL = FPL.groupby(['FlowNo.'], {'FPL': sf.aggregate.AVG('Length')})
    SF2 = SF2.join(FPL, on='FlowNo.')
    del FPL

    ## Number of packets per flow
    temp = SF2.groupby(['FlowNo.'], {'NumPackets': sf.aggregate.COUNT()})
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    ## Number of bytes exchanged
    temp = SF2.groupby(['FlowNo.'], {'BytesEx': sf.aggregate.SUM('Length')})
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    ## Standard deviation of packet length
    temp = SF2.groupby(['FlowNo.'], {'StdDevLen': sf.aggregate.STDV('Length')})
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    ## Same-length packet ratio
    temp2 = SF2.groupby(
        ['FlowNo.'],
        {'SameLenPktRatio': sf.aggregate.COUNT_DISTINCT('Length')})
    temp = SF2.groupby(['FlowNo.'], {'NumPackets': sf.aggregate.COUNT()})
    temp = temp.join(temp2, on='FlowNo.')
    temp['SameLenPktRatio'] = (temp['SameLenPktRatio'] * 1.0 /
                               temp['NumPackets'])
    temp2 = None
    temp = temp[['FlowNo.', 'SameLenPktRatio']]
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    ## Duration of flow
    timeF = SF2.groupby(['FlowNo.'], {
        'startTime': sf.aggregate.MIN('Time'),
        'endTime': sf.aggregate.MAX('Time')
    })
    timeF['Duration'] = timeF['endTime'] - timeF['startTime']
    timeF = timeF[['FlowNo.', 'Duration']]
    SF2 = SF2.join(timeF, on='FlowNo.')

    # Relevant features extracted so far.
    features = [
        'Answer RRs', 'BytesEx', 'Destination', 'Destination Port',
        'Duration', 'FPL', 'IP_Flags', 'Length', 'Next sequence number',
        'No.', 'NumPackets', 'Protocol', 'Protocols in frame',
        'SameLenPktRatio', 'Sequence number', 'Source', 'Source Port',
        'StdDevLen', 'TCP Segment Len', 'Time', 'tcp_Flags', 'FlowNo.',
        'udp_Length', 'IOPR'
    ]
    SF2 = SF2[features]

    ## Average packets per second
    temp = SF2.groupby(['FlowNo.'], {'NumPackets': sf.aggregate.COUNT()})
    temp = temp.join(timeF, on=['FlowNo.'])
    temp['AvgPktPerSec'] = temp.apply(
        lambda x: 0.0 if x['Duration'] == 0.0
        else x['NumPackets'] * 1.0 / x['Duration'])
    temp = temp[['FlowNo.', 'AvgPktPerSec']]
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    ## Average bits per second
    temp = SF2.groupby(['FlowNo.'], {'BytesEx': sf.aggregate.SUM('Length')})
    temp = temp.join(timeF, on=['FlowNo.'])
    temp['BitsPerSec'] = temp.apply(
        lambda x: 0.0 if x['Duration'] == 0.0
        else x['BytesEx'] * 8.0 / x['Duration'])
    temp = temp[['FlowNo.', 'BitsPerSec']]
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    ## Average packet length (APL)
    temp = SF2.groupby(['FlowNo.'], {'APL': sf.aggregate.AVG('Length')})
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    ## Inter-arrival time (IAT) of the packets
    SF2['IAT'] = 0
    SF2 = SF2.sort(['FlowNo.', 'Time'])
    prev = None
    prevT = None
    li = []
    for x in SF2:
        if prev is None or x['FlowNo.'] != prev:
            li.append(0)
        else:
            li.append(x['Time'] - prevT)
        prev = x['FlowNo.']
        prevT = x['Time']
    SF2['IAT'] = sf.SArray(li)

    ## Number of TCP null packets
    def checkNull(x):
        if x['TCP Segment Len'] == '0' or x['udp_Length'] == 8:
            return 1
        elif 'ipx' in x['Protocols in frame'].split(':'):
            # Subtract the IPX header, plus any lower-layer headers present.
            leftover = x['Length'] - 30
            if 'eth' in x['Protocols in frame'].split(':'):
                leftover = leftover - 14
            if 'ethtype' in x['Protocols in frame'].split(':'):
                leftover = leftover - 2
            if 'llc' in x['Protocols in frame'].split(':'):
                leftover = leftover - 8
            if leftover == 0 or leftover == -1:
                return 1
        return 0

    SF2['isNull'] = SF2.apply(lambda x: checkNull(x))
    NPEx = SF2.groupby(['FlowNo.'], {'NPEx': sf.aggregate.SUM('isNull')})
    SF2 = SF2.join(NPEx, on='FlowNo.')
    del NPEx

    ## Number of reconnects -- TCP only, based on repeated sequence numbers
    recon = SF2[SF2['Sequence number'] != ''].groupby(
        ['FlowNo.'], {
            'total_seq_no.': sf.aggregate.COUNT('Sequence number'),
            'distinct_seq_no.':
                sf.aggregate.COUNT_DISTINCT('Sequence number')
        })
    recon['reconnects'] = recon['total_seq_no.'] - recon['distinct_seq_no.']
    recon = recon[['FlowNo.', 'reconnects']]
    SF2 = SF2.join(recon, on='FlowNo.', how='left')
    del recon

    # Mark records where the reconnect check did not apply (e.g. UDP).
    # Note that many of these "reconnects" can be simple retransmissions
    # caused by out-of-order delivery or timeouts. fillna returns a new
    # SFrame, so the result must be assigned back.
    SF2 = SF2.fillna('reconnects', -1)

    ## Recompute the direction marker, then collapse the packet-level
    ## features into flow-based information.
    SF2['Forward'] = SF2.apply(
        lambda x: 1 if x['Source'] > x['Destination'] else 0)
    temp = SF2.groupby('FlowNo.', {
        'NumForward': sf.aggregate.SUM('Forward'),
    })
    SF2 = SF2.join(temp, on='FlowNo.')
    del temp

    # Keep only the FLOW BASED FEATURES: one representative value per flow.
    flow_columns = [
        'Answer RRs', 'BytesEx', 'Destination', 'Destination Port',
        'Duration', 'FPL', 'IP_Flags', 'Length', 'Next sequence number',
        'No.', 'NumPackets', 'Protocol', 'Protocols in frame',
        'SameLenPktRatio', 'Sequence number', 'Source', 'Source Port',
        'StdDevLen', 'IAT', 'isNull', 'NPEx', 'reconnects', 'APL',
        'BitsPerSec', 'AvgPktPerSec', 'udp_Length', 'tcp_Flags', 'Time',
        'TCP Segment Len', 'IOPR', 'NumForward'
    ]
    SF2 = SF2.groupby(
        'FlowNo.',
        {col: sf.aggregate.SELECT_ONE(col) for col in flow_columns})

    # FINAL OUTPUT: a CSV file with all the flows and their extracted
    # flow-based features.
    SF2.save('Bidirectional_Botnet_all_features.csv')
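# Illustrative end-to-end run (the input filename below is an assumption,
# not from the original source). This labels every packet with a flow
# number via FlowIdentifier, then collapses the packets into one feature
# row per flow in 'Bidirectional_Botnet_all_features.csv':
# Flow_Feature_Generator('packet_capture.csv')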
def flow_separator(input_url=PROJ_ROOT + 'data/modified_data.csv',
                   output_url=PROJ_ROOT + 'models/sorted_flow.csv'):
    print("Initiating flow separation")
    sorted_flow = sf.SFrame.read_csv(input_url, verbose=False)

    print("Preprocessing file")
    # Drop packets without source or destination ports.
    sorted_flow = sorted_flow[(sorted_flow['Source Port'] != '') &
                              (sorted_flow['Destination Port'] != '')]
    # Direction marker, used later for the IOPR feature.
    sorted_flow['Forward'] = sorted_flow.apply(
        lambda x: 1 if x['Source'] > x['Destination'] else 0)
    # Convert hex TCP flags to integers; empty values become 0.
    sorted_flow['tcp_Flags'] = sorted_flow['tcp_Flags'].apply(
        lambda x: int(x, 16) if x != '' else 0)
    sorted_flow['UFid'] = sorted_flow.apply(lambda x: flow_id(x))
    sorted_flow = sorted_flow.sort(['UFid', 'Time'])

    # Master flow list: one flow number per packet, in order.
    Flow = []
    current_flow_id = 0   # incrementing flow ID
    prev_flow_id = None
    startTime = None      # start time of each flow, used for the timeout

    for row in sorted_flow:
        if prev_flow_id is None:
            # No previous flow to continue; this packet starts a new flow.
            if startTime is None:
                startTime = row['Time']
            Flow.append(current_flow_id)
            prev_flow_id = row['UFid']
        elif row['UFid'] == prev_flow_id:
            if row['tcp_Flags'] & 1:
                # TCP FIN: terminate the flow with this packet.
                Flow.append(current_flow_id)
                prev_flow_id = None
                startTime = None
                current_flow_id += 1
            elif row['Time'] - startTime >= 3600:
                # Timeout: terminate and restart with this packet.
                current_flow_id += 1
                Flow.append(current_flow_id)
                prev_flow_id = None
                startTime = row['Time']
            else:
                # Another packet in the current flow.
                Flow.append(current_flow_id)
                prev_flow_id = row['UFid']
        else:
            # The previous 5-tuple received no more packets; start a new flow.
            current_flow_id += 1
            Flow.append(current_flow_id)
            prev_flow_id = row['UFid']
            startTime = row['Time']

    print("Flow sorting complete")
    print(len(sf.SArray(Flow).unique()))
    sorted_flow['Flow'] = sf.SArray(Flow)
    sorted_flow['FlowNo.'] = sf.SArray(Flow)
    sorted_flow.save(output_url)
    print("Flow saved\n##############################")
def flow_featurization(input_url=PROJ_ROOT + 'models/sorted_flow.csv',
                       output_url=PROJ_ROOT + 'models/all_features.csv'):
    print("Initializing flow featurization")
    flow_list = sf.SFrame.read_csv(input_url, verbose=False)

    ## Ratio of incoming to outgoing packets (IOPR)
    temp = flow_list.groupby('FlowNo.', {
        'NumForward': sf.aggregate.SUM('Forward'),
        'Total': sf.aggregate.COUNT()
    })
    temp['IOPR'] = temp.apply(
        lambda x: ((x['Total'] - x['NumForward']) * 1.0) / x['NumForward']
        if x['NumForward'] != 0 else (-1))
    temp = temp[['FlowNo.', 'IOPR']]
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp
    print("  Ratio measuring complete")

    ## First packet length (FPL)
    FPL = flow_list.groupby(['FlowNo.'], {'Time': sf.aggregate.MIN('Time')})
    FPL = FPL.join(flow_list,
                   on=['FlowNo.', 'Time'])[['FlowNo.', 'Length']].unique()
    FPL = FPL.groupby(['FlowNo.'], {'FPL': sf.aggregate.AVG('Length')})
    flow_list = flow_list.join(FPL, on='FlowNo.')
    del FPL
    print("  First packet length measured")

    ## Number of packets per flow
    temp = flow_list.groupby(['FlowNo.'],
                             {'NumPackets': sf.aggregate.COUNT()})
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp
    print("  Packet count measured")

    ## Number of bytes exchanged
    temp = flow_list.groupby(['FlowNo.'],
                             {'BytesEx': sf.aggregate.SUM('Length')})
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp
    print("  Byte exchange measured")

    ## Standard deviation of packet length
    temp = flow_list.groupby(['FlowNo.'],
                             {'StdDevLen': sf.aggregate.STDV('Length')})
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp
    print("  Standard deviation of packet length measured")

    ## Same-length packet ratio
    temp2 = flow_list.groupby(
        ['FlowNo.'],
        {'SameLenPktRatio': sf.aggregate.COUNT_DISTINCT('Length')})
    temp = flow_list.groupby(['FlowNo.'],
                             {'NumPackets': sf.aggregate.COUNT()})
    temp = temp.join(temp2, on='FlowNo.')
    temp['SameLenPktRatio'] = (temp['SameLenPktRatio'] * 1.0 /
                               temp['NumPackets'])
    temp2 = None
    temp = temp[['FlowNo.', 'SameLenPktRatio']]
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp
    print("  Same-length packet ratio measured")

    ## Duration of flow
    timeF = flow_list.groupby(['FlowNo.'], {
        'startTime': sf.aggregate.MIN('Time'),
        'endTime': sf.aggregate.MAX('Time')
    })
    timeF['Duration'] = timeF['endTime'] - timeF['startTime']
    timeF = timeF[['FlowNo.', 'Duration']]
    flow_list = flow_list.join(timeF, on='FlowNo.')
    print("  Duration of flow measured")

    # Relevant features extracted so far.
    features = [
        'BytesEx', 'Destination', 'Destination Port', 'Duration', 'FPL',
        'IP_Flags', 'Length', 'NumPackets', 'Protocol',
        'Protocols in frame', 'SameLenPktRatio', 'Source', 'Source Port',
        'StdDevLen', 'TCP Segment Len', 'Time', 'tcp_Flags', 'FlowNo.',
        'udp_Length', 'IOPR'
    ]
    flow_list = flow_list[features]

    ## Average packets per second
    temp = flow_list.groupby(['FlowNo.'],
                             {'NumPackets': sf.aggregate.COUNT()})
    temp = temp.join(timeF, on=['FlowNo.'])
    temp['AvgPktPerSec'] = temp.apply(
        lambda x: 0.0 if x['Duration'] == 0.0
        else x['NumPackets'] * 1.0 / x['Duration'])
    temp = temp[['FlowNo.', 'AvgPktPerSec']]
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp
    print("  Average packets per second calculated")

    ## Average bits per second
    temp = flow_list.groupby(['FlowNo.'],
                             {'BytesEx': sf.aggregate.SUM('Length')})
    temp = temp.join(timeF, on=['FlowNo.'])
    temp['BitsPerSec'] = temp.apply(
        lambda x: 0.0 if x['Duration'] == 0.0
        else x['BytesEx'] * 8.0 / x['Duration'])
    temp = temp[['FlowNo.', 'BitsPerSec']]
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp
    print("  Average bits per second calculated")

    ## Average packet length (APL)
    temp = flow_list.groupby(['FlowNo.'], {'APL': sf.aggregate.AVG('Length')})
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp
    print("  Average packet length calculated")

    ## Inter-arrival time (IAT) of the packets
    flow_list['IAT'] = 0
    flow_list = flow_list.sort(['FlowNo.', 'Time'])
    prev = None
    prevT = None
    li = []
    for x in flow_list:
        if prev is None or x['FlowNo.'] != prev:
            li.append(0)
        else:
            li.append(x['Time'] - prevT)
        prev = x['FlowNo.']
        prevT = x['Time']
    flow_list['IAT'] = sf.SArray(li)

    ## Null packets handling
    def checkNull(x):
        if x['TCP Segment Len'] == '0' or x['udp_Length'] == 8:
            return 1
        elif 'ipx' in x['Protocols in frame'].split(':'):
            # Subtract the IPX header, plus any lower-layer headers present.
            leftover = x['Length'] - 30
            if 'eth' in x['Protocols in frame'].split(':'):
                leftover = leftover - 14
            if 'ethtype' in x['Protocols in frame'].split(':'):
                leftover = leftover - 2
            if 'llc' in x['Protocols in frame'].split(':'):
                leftover = leftover - 8
            if leftover == 0 or leftover == -1:
                return 1
        return 0

    flow_list['isNull'] = flow_list.apply(lambda x: checkNull(x))
    NPEx = flow_list.groupby(['FlowNo.'], {'NPEx': sf.aggregate.SUM('isNull')})
    flow_list = flow_list.join(NPEx, on='FlowNo.')
    del NPEx
    print("  Null packets handled")

    ## Recompute the direction marker, then collapse the packet-level
    ## features into one row per flow.
    flow_list['Forward'] = flow_list.apply(
        lambda x: 1 if x['Source'] > x['Destination'] else 0)
    temp = flow_list.groupby('FlowNo.', {
        'NumForward': sf.aggregate.SUM('Forward'),
    })
    flow_list = flow_list.join(temp, on='FlowNo.')
    del temp

    # Keep only the flow-based features: one representative value per flow.
    flow_columns = [
        'BytesEx', 'Destination', 'Destination Port', 'Duration', 'FPL',
        'IP_Flags', 'Length', 'NumPackets', 'Protocol',
        'Protocols in frame', 'SameLenPktRatio', 'Source', 'Source Port',
        'StdDevLen', 'IAT', 'isNull', 'NPEx', 'APL', 'BitsPerSec',
        'AvgPktPerSec', 'udp_Length', 'tcp_Flags', 'Time',
        'TCP Segment Len', 'IOPR', 'NumForward'
    ]
    flow_list = flow_list.groupby(
        'FlowNo.',
        {col: sf.aggregate.SELECT_ONE(col) for col in flow_columns})

    flow_list.save(output_url)
    print("Flow feature generation complete")
    print("Updated flow saved")
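# Illustrative pipeline run using the default paths (PROJ_ROOT is assumed
# to be defined elsewhere in the project):
# flow_separator()        # data/modified_data.csv -> models/sorted_flow.csv
# flow_featurization()    # models/sorted_flow.csv -> models/all_features.csv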
def polynomial_sframe(feature, degree):
    # Assume degree >= 1 and initialize the SFrame.
    poly_sframe = sframe.SFrame()
    # First degree.
    poly_sframe['power_1'] = feature
    if degree > 1:
        # Loop over the remaining degrees. range() usually starts at 0 and
        # stops at endpoint-1; we want it to start at 2 and stop at degree.
        for power in range(2, degree + 1):
            name = 'power_' + str(power)
            poly_sframe[name] = feature.apply(lambda x: x**power)
    return poly_sframe

# Example
tmp = sframe.SArray([1., 2., 3.])
print polynomial_sframe(tmp, 6)

'''
# Visualizing polynomial regression
# For plotting purposes (connecting the dots), we must sort by the values.
sales = sales.sort(['sqft_living', 'price'])
poly_data = polynomial_sframe(sales['sqft_living'], 1)
poly_data['price'] = sales['price']
poly_data = sframe.SFrame.to_dataframe(poly_data)
output = poly_data['price']
input_features = poly_data.drop('price', axis=1)

from sklearn import linear_model
regr = linear_model.LinearRegression()
model = regr.fit(input_features, output)
print model.intercept_
'''