def get_features(self, filepath, source_ip=None):
    '''
    Reads a pcap specified by the file path and returns an array of the
    computed model inputs

    Args:
        filepath: Path to pcap to compute features for

    Returns:
        features: Numpy 2D array containing features for each time bin
        source_ip: IP address the features were computed for
        timestamps: List of datetimes of the last observed packet in each bin
        other_ips: Other IP addresses observed in the capture
    '''
    # Read the capture into a feature array
    X = []
    timestamps = []
    binned_sessions = sessionizer(filepath, duration=self.duration)
    self.sessions = binned_sessions

    if len(binned_sessions) == 0:
        return None, None, None, None

    for session_dict in binned_sessions:
        if len(session_dict) > 0:
            if source_ip is None:
                feature_list, source_ip, other_ips = extract_features(
                    session_dict
                )
            else:
                feature_list, _, other_ips = extract_features(
                    session_dict,
                    capture_source=source_ip
                )
            X.append(feature_list)
            # Timestamp of the first packet in the last session of the bin
            last_packet = list(session_dict.items())[-1]
            timestamps.append(last_packet[1][0][0])

    if len(X) == 0:
        return None, None, None, None

    full_features = np.stack(X)

    # Mean normalize the features
    full_features -= np.expand_dims(self.means, 0)
    full_features /= np.expand_dims(self.stds, 0)
    features = full_features[:, self.feature_list]

    return features, source_ip, timestamps, other_ips
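The normalization above is a standard per-feature z-score using the model's stored means and stds; the expand_dims calls just line the statistic vectors up for broadcasting against the (bins, features) array. A minimal self-contained sketch of the same step, with toy shapes and values that are illustrative only:

import numpy as np

# Toy stand-ins for the model's stored statistics: 4 features, 3 time bins
means = np.array([10.0, 0.5, 200.0, 3.0])
stds = np.array([2.0, 0.1, 50.0, 1.0])
full_features = np.array([
    [12.0, 0.6, 250.0, 4.0],
    [8.0, 0.4, 150.0, 2.0],
    [10.0, 0.5, 200.0, 3.0],
])

# Broadcasting subtracts/divides each column by its statistic,
# exactly like the expand_dims calls in get_features
normalized = (full_features - np.expand_dims(means, 0)) / np.expand_dims(stds, 0)
print(normalized)  # last row is all zeros: it matches the means exactly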
def read_data(data_dir, duration=None, labels=None):
    '''
    Reads all the data in the specified directory and parses it into
    a feature array and a label array.

    Args:
        data_dir: path to the directory that contains the training data
        duration: Time window to compute feature information
        labels: List containing labels to use

    Returns:
        X: numpy 2D array that contains the (high dimensional) features
        y: numpy 1D array that contains the labels for the features in X
        new_labels: Reordered labels used in training
    '''
    X = []
    y = []
    assigned_labels = []

    # Get all the files in the directory
    files = []
    with open(os.path.join(data_dir, 'label_assignments.json')) as handle:
        label_assignments = json.load(handle)

    for dirpath, dirnames, filenames in os.walk(data_dir):
        for file in filenames:
            # Skip the label assignment file itself; everything else
            # is treated as a capture
            if file == 'label_assignments.json':
                continue
            files.append(os.path.join(dirpath, file))

    # Go through all the files in the directory
    for filename in files:
        # Extract the label from the filename
        name = os.path.split(filename)[1]
        name = name.split('-')[0]
        if name in label_assignments:
            label = label_assignments[name]
            if label not in labels:
                label = 'Unknown'
        else:
            label = 'Unknown'
        if label not in assigned_labels:
            assigned_labels.append(label)

        print("Reading", filename, "as", label)
        # Bin the sessions with the specified time window
        binned_sessions = sessionizer(
            filename,
            duration=duration
        )
        # For each of the session bins, compute the full feature vectors
        for session_dict in binned_sessions:
            features, _, _ = extract_features(session_dict)

            # Store the feature vector and the labels
            X.append(features)
            y.append(assigned_labels.index(label))

    # Update the labels to reflect the new assignments
    new_labels = assigned_labels + \
        [l for l in labels if l not in assigned_labels]

    return np.stack(X), np.stack(y), new_labels
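The label lookup hinges on a naming convention: everything in the filename before the first '-' is a device name keyed into label_assignments.json, and anything unmapped or outside the allowed label list falls back to 'Unknown'. A small sketch of that convention, with hypothetical device names and labels:

import os

# Hypothetical contents of label_assignments.json
label_assignments = {'printer01': 'Printer', 'cam01': 'Security camera'}
labels = ['Printer', 'Security camera', 'Unknown']

for filename in ['data/printer01-monday.pcap', 'data/laptop7-tuesday.pcap']:
    # Device name is the part of the basename before the first '-'
    name = os.path.split(filename)[1].split('-')[0]
    label = label_assignments.get(name, 'Unknown')
    if label not in labels:
        label = 'Unknown'
    print(filename, '->', label)
# data/printer01-monday.pcap -> Printer
# data/laptop7-tuesday.pcap -> Unknown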
def output():
    data = request.form.to_dict()
    q1 = data.get('q1')
    q2 = data.get('q2')
    prob = data.get('probabiliy')  # field name spelled as in the submitting form

    # Build a one-row dataframe from the input pair
    # (DataFrame.append was removed in pandas 2.0; construct directly)
    new_df = pd.DataFrame([{'question1': q1, 'question2': q2}])
    new_df = extract_features(new_df)  # compute advanced and basic features

    # Get the tfidf vectors of the text
    x_q1 = vectorizer.transform(new_df["question1"])
    x_q2 = vectorizer.transform(new_df["question2"])

    # Everything except the raw questions is a hand-crafted feature
    cols = [i for i in new_df.columns if i not in ['question1', 'question2']]
    new_df = new_df.loc[:, cols].values

    # Stack the sparse tfidf blocks with the dense features and scale
    X = hstack((x_q1, x_q2, new_df)).tocsr()
    X = std.transform(X)
    y_q = model.predict(X)
    y_q_proba = model.predict_proba(X)

    result = dict()
    result["Question-1"] = q1
    result["Question-2"] = q2
    if y_q[0] == 1:
        result["Predicted Class"] = 'Similar'
    else:
        result["Predicted Class"] = "Not Similar"
    if prob == "yes":
        result["Probability"] = round(max(y_q_proba[0]), 4)

    return render_template('output.html', result=result)
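The feature assembly relies on scipy.sparse.hstack accepting a mix of sparse and dense blocks, which is what lets the two TF-IDF matrices be concatenated with the dense hand-crafted columns in one call. A minimal sketch with toy shapes (the vocabulary size and feature values are made up):

import numpy as np
from scipy.sparse import csr_matrix, hstack

# Toy TF-IDF outputs: one row (one question pair), 5-term vocabulary each
x_q1 = csr_matrix(np.array([[0.0, 0.3, 0.0, 0.7, 0.0]]))
x_q2 = csr_matrix(np.array([[0.2, 0.0, 0.0, 0.5, 0.1]]))

# Toy hand-crafted features (e.g. length ratios); values illustrative
handcrafted = np.array([[0.9, 12.0, 0.4]])

# hstack accepts sparse and dense blocks together and returns a sparse
# matrix; .tocsr() gives the row-sliceable format the model expects
X = hstack((x_q1, x_q2, handcrafted)).tocsr()
print(X.shape)  # (1, 13)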
def read_data(data_dir, duration=None, labels=None):
    '''
    Reads all the data in the specified directory and parses it into
    a feature array and a label array.

    Args:
        data_dir: path to the directory that contains the training data
        duration: Time window to compute feature information
        labels: List containing labels to use

    Returns:
        X: numpy 2D array that contains the (high dimensional) features
        y: numpy 1D array that contains the labels for the features in X
        new_labels: Reordered labels used in training
    '''
    logger = logging.getLogger(__name__)
    try:
        if 'LOG_LEVEL' in os.environ and os.environ['LOG_LEVEL'] != '':
            logger.setLevel(os.environ['LOG_LEVEL'])
    except Exception as e:
        logger.error(
            'Unable to set logging level because: {0}, defaulting to INFO.'.format(str(e)))

    X = []
    y = []
    assigned_labels = []

    # Get all the files in the directory
    files = []
    with open('opts/label_assignments.json') as handle:
        label_assignments = json.load(handle)

    for dirpath, _, filenames in os.walk(data_dir):
        for file in filenames:
            _, ext = os.path.splitext(file)
            if ext == '.pcap':
                files.append(os.path.join(dirpath, file))

    # Go through all the files in the directory
    logger.info('Found {0} pcap files to read.'.format(len(files)))
    count = 0
    for filename in files:
        count += 1
        # Extract the label from the filename
        name = os.path.split(filename)[1]
        name = name.split('-')[0]
        if name in label_assignments:
            label = label_assignments[name]
            if label not in labels:
                label = 'Unknown'
        else:
            label = 'Unknown'
        if label not in assigned_labels:
            assigned_labels.append(label)

        logger.info('Reading {0} ({1} bytes) as {2} ({3}/{4})'.format(
            filename, os.path.getsize(filename), label, count, len(files)))

        # Bin the sessions with the specified time window
        binned_sessions = sessionizer(filename, duration=duration)

        # Get the capture source from the binned sessions
        capture_source = get_source(binned_sessions)

        # For each of the session bins, compute the full feature vectors
        for session_dict in binned_sessions:
            features, _, _ = extract_features(
                session_dict,
                capture_source=capture_source
            )

            # Store the feature vector and the labels
            X.append(features)
            y.append(assigned_labels.index(label))

    # Update the labels to reflect the new assignments
    new_labels = assigned_labels + \
        [l for l in labels if l not in assigned_labels]

    return np.stack(X), np.stack(y), new_labels
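The LOG_LEVEL block is the usual pattern of letting an environment variable override the default logging level, with a logged fallback when the value is not a recognized level name. A standalone sketch of the same pattern:

import logging
import os

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    # setLevel accepts level names such as 'DEBUG' or 'WARNING' as strings
    if os.environ.get('LOG_LEVEL'):
        logger.setLevel(os.environ['LOG_LEVEL'])
except ValueError as e:
    # e.g. LOG_LEVEL='VERBOSE' raises "ValueError: Unknown level"
    logger.error('Unable to set logging level because: %s, defaulting to INFO.', e)

logger.debug('only visible when LOG_LEVEL=DEBUG')
logger.info('visible at the default INFO level')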
with open(d + "tfidf_GBDT_model.pkl", "rb") as f:
    model = pickle.load(f)

# Take input from the user for prediction
more_input = True
while more_input:
    print('Write first question:', end=" ")
    q1 = input()
    print('Write second question:', end=" ")
    q2 = input()
    print("\nvectorizing data...")

    # Build a one-row dataframe from the input pair
    # (DataFrame.append was removed in pandas 2.0; construct directly)
    new_df = pd.DataFrame([{'question1': q1, 'question2': q2}])
    new_df = extract_features(new_df)  # compute advanced and basic features

    # Get the tfidf vectors of the text
    x_q1 = vectorizer.transform(new_df["question1"])
    x_q2 = vectorizer.transform(new_df["question2"])

    # Everything except the raw questions is a hand-crafted feature
    cols = [i for i in new_df.columns if i not in ['question1', 'question2']]
    new_df = new_df.loc[:, cols].values

    # Stack the sparse tfidf blocks with the dense features and scale
    X = hstack((x_q1, x_q2, new_df)).tocsr()
    X = std.transform(X)
    y_q = model.predict(X)
    y_q_proba = model.predict_proba(X)

    print('\nPredicted class is: {} i.e. {}'.format(
        y_q[0], "Similar" if y_q[0] == 1 else "Not Similar"))
    print("Probability of predicted class is {:.4f}".format(max(y_q_proba[0])))

    print("\nDo you want to check more? Press 1 if yes; "
          "otherwise the session will terminate.")
    more_input = input().strip() == '1'
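On what max(y_q_proba[0]) is doing: predict_proba returns one probability per class for each sample, and the maximum of that row is the confidence of whichever class predict reports. A toy sketch with made-up probabilities:

import numpy as np

# Toy stand-in for model.predict_proba on one question pair:
# column 0 = P(not similar), column 1 = P(similar)
y_q_proba = np.array([[0.23, 0.77]])

y_q = np.argmax(y_q_proba, axis=1)   # predicted class, here 1 ('Similar')
confidence = max(y_q_proba[0])       # probability of that class
print(y_q[0], round(confidence, 4))  # 1 0.77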