Beispiel #1
0
    def get_features(self, filepath, source_ip=None):
        '''
        Reads a pcap specified by the file path and returns an array of the
        computed model inputs

        Args:
            filepath: Path to pcap to compute features for
            source_ip: Optional capture source passed to extract_features.
                       When None, the source reported by extract_features
                       for the first non-empty bin is reused for every
                       later bin.

        Returns:
            features: Numpy 2D array containing features for each time bin,
                      normalized with self.means / self.stds and restricted
                      to the columns in self.feature_list
            source_ip: The capture source that was supplied or detected
            timestamps: List with one entry per non-empty bin, read from
                        element [0][0] of the last session in that bin
                        (presumably the packet time -- confirm against
                        sessionizer)
            other_ips: Value returned by extract_features for the last
                       non-empty bin

            All four values are None when the capture yields no bins or
            only empty bins.
        '''

        # Read the capture into binned sessions and keep them on the instance
        binned_sessions = sessionizer(filepath, duration=self.duration)
        self.sessions = binned_sessions

        # Compare by value: "is 0" tests object identity, which only works
        # by accident of integer caching and warns on Python 3.8+
        if len(binned_sessions) == 0:
            return None, None, None, None

        X = []
        timestamps = []
        for session_dict in binned_sessions:
            if len(session_dict) == 0:
                continue

            if source_ip is None:
                # Let extract_features determine the source, then keep it
                feature_list, source_ip, other_ips = extract_features(
                    session_dict
                )
            else:
                feature_list, _, other_ips = extract_features(
                    session_dict,
                    capture_source=source_ip
                )
            X.append(feature_list)

            # The last session stored in this bin supplies the timestamp
            last_session = list(session_dict.values())[-1]
            timestamps.append(last_session[0][0])

        if len(X) == 0:
            return None, None, None, None

        full_features = np.stack(X)

        # Mean normalize the features
        full_features -= np.expand_dims(self.means, 0)
        full_features /= np.expand_dims(self.stds, 0)
        features = full_features[:, self.feature_list]
        return features, source_ip, timestamps, other_ips
Beispiel #2
0
def read_data(data_dir, duration=None, labels=None):
    '''
    Reads all the data in the specified directory and parses it into
    a feature array and a label array.

    The label of a file is looked up in data_dir/label_assignments.json
    using the part of the file name before the first '-'. Names that are
    missing from that mapping, or whose label is not in labels, are
    treated as 'Unknown'.

    Args:
        data_dir: path to the directory that contains the training data
        duration: Time window to compute feature information
        labels: List containing labels to use. Required in practice; the
                None default is kept only for signature compatibility.

    Returns:
        X: numpy 2D array that contains the (high dimensional) features
        y: numpy 1D array that contains the labels for the features in X
        new_labels: Reordered labels used in training

    Raises:
        TypeError: if labels is None
        ValueError: from np.stack when no feature vectors were collected
    '''
    # Fail fast: every path below tests membership in, or iterates, labels
    if labels is None:
        raise TypeError('labels must be a list of label names, not None')

    label_file = os.path.join(data_dir, 'label_assignments.json')
    with open(label_file) as handle:
        label_assignments = json.load(handle)

    # Get all the files in the directory, except the label mapping itself,
    # which lives in data_dir but is not a capture
    files = []
    for dirpath, _, filenames in os.walk(data_dir):
        for basename in filenames:
            path = os.path.join(dirpath, basename)
            if path != label_file:
                files.append(path)

    X = []
    y = []
    assigned_labels = []

    # Go through all the files in the directory
    for filename in files:
        # Extract the label from the filename
        name = os.path.basename(filename).split('-')[0]
        label = label_assignments.get(name, 'Unknown')
        if label not in labels:
            label = 'Unknown'
        if label not in assigned_labels:
            assigned_labels.append(label)
        label_index = assigned_labels.index(label)

        print("Reading", filename, "as", label)
        # Bin the sessions with the specified time window
        binned_sessions = sessionizer(filename, duration=duration)

        # For each of the session bins, compute the full feature vectors
        for session_dict in binned_sessions:
            features, _, _ = extract_features(session_dict)
            # Store the feature vector and the labels
            X.append(features)
            y.append(label_index)

    # Update the labels to reflect the new assignments: labels in the order
    # they were first seen, followed by the requested labels never seen
    new_labels = assigned_labels + [
        label for label in labels if label not in assigned_labels
    ]

    return np.stack(X), np.stack(y), new_labels
Beispiel #3
0
def output():
    """
    Handle the submitted question pair and render the prediction page.

    Reads 'q1', 'q2' and 'probabiliy' from the posted form, builds the
    model input from the two questions and renders 'output.html' with the
    predicted class. The form/result key spellings are kept exactly as
    they are; presumably the HTML form and template use the same keys.

    Returns:
        The rendered 'output.html' template, given a ``result`` dict with
        the two questions, the predicted class and, when the form value
        'probabiliy' equals "yes", the highest class probability rounded
        to 4 places.
    """
    data = request.form.to_dict()
    q1 = data.get('q1')
    q2 = data.get('q2')
    prob = data.get('probabiliy')

    # Build the one-row frame directly; DataFrame.append was removed in
    # pandas 2.0
    new_df = pd.DataFrame(
        [{'question1': q1, 'question2': q2}],
        columns=['question1', 'question2'],
    )
    new_df = extract_features(new_df)  # adds the basic and advanced features

    # Vectorize the text of both questions
    x_q1 = vectorizer.transform(new_df["question1"])
    x_q2 = vectorizer.transform(new_df["question2"])

    # Everything except the raw question text is a hand-crafted feature
    cols = [i for i in new_df.columns if i not in ['question1', 'question2']]
    handcrafted = new_df.loc[:, cols].values

    # Stack the text vectors and hand-crafted features, then scale them
    X = hstack((x_q1, x_q2, handcrafted)).tocsr()
    X = std.transform(X)

    y_q = model.predict(X)
    y_q_proba = model.predict_proba(X)

    result = {
        "Question-1": q1,
        "Question-2": q2,
    }

    # A single row was submitted, so the first prediction is the only one
    if y_q[0] == 1:
        result["Predicted Class"] = 'Similar'
    else:
        result["Predicted Class"] = "Not Similar"

    if prob == "yes":
        result["Probabiliy"] = round(max(y_q_proba[0]), 4)

    return render_template('output.html', result=result)
Beispiel #4
0
def read_data(data_dir, duration=None, labels=None):
    '''
    Reads all the data in the specified directory and parses it into
    a feature array and a label array.

    Only files ending in '.pcap' are read. The label of a file is looked
    up in 'opts/label_assignments.json' (relative to the working
    directory) using the part of the file name before the first '-'.
    Names that are missing from that mapping, or whose label is not in
    labels, are treated as 'Unknown'.

    Args:
        data_dir: path to the directory that contains the training data
        duration: Time window to compute feature information
        labels: List containing labels to use. Required in practice; the
                None default is kept only for signature compatibility.

    Returns:
        X: numpy 2D array that contains the (high dimensional) features
        y: numpy 1D array that contains the labels for the features in X
        new_labels: Reordered labels used in training

    Raises:
        TypeError: if labels is None
        ValueError: from np.stack when no feature vectors were collected
    '''
    # Fail fast: every path below tests membership in, or iterates, labels
    if labels is None:
        raise TypeError('labels must be a list of label names, not None')

    logger = logging.getLogger(__name__)
    log_level = os.environ.get('LOG_LEVEL')
    if log_level:
        try:
            logger.setLevel(log_level)
        except (ValueError, TypeError) as e:
            # An unrecognized level must not abort the read
            logger.error(
                'Unable to set logging level because: {0} defaulting to INFO.'.
                format(str(e)))

    with open('opts/label_assignments.json') as handle:
        label_assignments = json.load(handle)

    # Get all the pcap files in the directory
    files = []
    for dirpath, _, filenames in os.walk(data_dir):
        for basename in filenames:
            _, ext = os.path.splitext(basename)
            if ext == '.pcap':
                files.append(os.path.join(dirpath, basename))
    logger.info('Found {0} pcap files to read.'.format(len(files)))

    X = []
    y = []
    assigned_labels = []

    # Go through all the files in the directory
    for count, filename in enumerate(files, 1):
        # Extract the label from the filename
        name = os.path.basename(filename).split('-')[0]
        label = label_assignments.get(name, 'Unknown')
        if label not in labels:
            label = 'Unknown'
        if label not in assigned_labels:
            assigned_labels.append(label)
        label_index = assigned_labels.index(label)

        logger.info('Reading {0} ({1} bytes) as {2} ({3}/{4})'.format(
            filename, os.path.getsize(filename), label, count, len(files)))
        # Bin the sessions with the specified time window
        binned_sessions = sessionizer(filename, duration=duration)
        # Get the capture source from the binned sessions
        capture_source = get_source(binned_sessions)

        # For each of the session bins, compute the full feature vectors
        for session_dict in binned_sessions:
            features, _, _ = extract_features(session_dict,
                                              capture_source=capture_source)

            # Store the feature vector and the labels
            X.append(features)
            y.append(label_index)

    # Update the labels to reflect the new assignments: labels in the order
    # they were first seen, followed by the requested labels never seen
    new_labels = assigned_labels + [
        label for label in labels if label not in assigned_labels
    ]

    return np.stack(X), np.stack(y), new_labels
# Load the trained classifier from disk (d is presumably a directory prefix
# defined earlier in the script -- confirm).
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# model files from a trusted location.
with open(d+"tfidf_GBDT_model.pkl", "rb") as f:
    model = pickle.load(f)

# Interactive loop: read a pair of questions from the user and predict
# whether they are similar
more_input = True
while more_input:
    # Start from an empty frame with the two text columns
    new_df = pd.DataFrame(columns = ['question1','question2'])
    print('Write first question:', end = " ")
    q1 = input()
    print('Write seconde question:', end = " ")
    q2 = input()

    print("\nvectorizing data...")
    # Convert the pair into a one-row dataframe
    # NOTE(review): DataFrame.append was removed in pandas 2.0; this line
    # needs pd.concat or direct construction on current pandas.
    new_df = new_df.append({'question1': q1, 'question2':q2}, ignore_index = True)
    new_df = extract_features(new_df) # adds the basic and advanced features
    # Vectorize the text of both questions
    x_q1 = vectorizer.transform(new_df["question1"])
    x_q2 = vectorizer.transform(new_df["question2"])
    # Everything except the raw question text is a hand-crafted feature
    cols = [i for i in new_df.columns if i not in ['question1', 'question2']]
    new_df = new_df.loc[:,cols].values
    # Stack the text vectors with the hand-crafted features, then scale them
    X = hstack((x_q1, x_q2, new_df)).tocsr()
    X = std.transform(X)

    # Predict the class and the per-class probabilities for the single row
    y_q = model.predict(X)
    y_q_proba = model.predict_proba(X)
    print('\nPredicted class is: {} i.e. {} and'.format(y_q, "Similar" if y_q == 1 else "Not Similar"))
    # The largest class probability is reported as that of the predicted class
    print("Probability of predicted class is {:.4f}".format(max(y_q_proba[0])))

    # Ask whether to continue with another pair
    print("\nDo you want to check more: Pess 1 if yes, Ohterwise it'll terminate the session.")