def generateData(self, rate):
    obj = DataProcessor(0)
    obj.loadFileAndRetrieveCh()
    obj.saveToFile(rate)
    obj = DataProcessor(1)
    obj.loadFileAndRetrieveCh()
    obj.saveToFile(rate)
def main():
    data_path = "../data2/training-Obama-Romney-tweets.xlsx"
    test_data_path = ''
    # test_data_path = '../data/testing-Obama-Romney-tweets-3labels.xlsx'
    try:
        opts, args = getopt.getopt(sys.argv[1:], "d:t:")
        for o, a in opts:
            if o == '-d':
                data_path = a
            elif o == '-t':
                test_data_path = a
    except getopt.GetoptError as err:
        # print help information and exit:
        print str(err)
        print 'read the readme file to know how to run this project'
        sys.exit(2)

    dp = DataProcessor(data_path)
    tc = TweetClassifier()
    if test_data_path != '':
        dpt = DataProcessor(test_data_path)
        print '\n****** OBAMA ******\n'
        data = dp.load_excel_data('Obama')
        data_test = dpt.load_excel_data('Obama')
        report = tc.train_test(data, data_test)
        DataProcessor.print_report(report)
        print '\n****** ROMNEY ******\n'
        data = dp.load_excel_data('Romney')
        data_test = dpt.load_excel_data('Romney')
        report = tc.train_test(data, data_test)
        DataProcessor.print_report(report)
    else:
        print '\n****** OBAMA ******\n'
        data = dp.load_excel_data('Obama')
        report = tc.crossvalidate(data, 10)
        DataProcessor.print_report(report)
        print '\n****** ROMNEY ******\n'
        data = dp.load_excel_data('Romney')
        report = tc.crossvalidate(data, 10)
        DataProcessor.print_report(report)
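# Example invocations (a hedged sketch: the script name "main.py" is an
# assumption; the -d/-t flags come from the getopt spec above):
#
#   python main.py -d ../data2/training-Obama-Romney-tweets.xlsx
#   python main.py -d train.xlsx -t test.xlsx   # held-out test set instead of 10-fold CV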
def get_graph(self, area=None, company=None):
    """Queries tweets filtered by the given area or company."""
    # Get all tweets, or only those matching the area/company filter
    if area:
        data = self.client.get_tweets_by_area(area)
        suffix = area.upper()
    elif company:
        data = self.client.get_tweets_by_company(company)
        suffix = company.upper()
    else:
        data = self.client.get_all_tweets()
        suffix = 'GLOBAL'

    processor = DataProcessor(data)
    # Get the time series data
    time_series = processor.prepare_time_series()

    # Save all the graph info in a list we can access from the view template
    graph = [
        dict(data=[dict(x=time_series.index, y=time_series)],
             layout=dict(title='Tweet Frequency - ' + suffix),
             id='timeseries')
    ]
    # Plotly needs the graph/pandas data encoded in a compatible JSON format
    graph = json.dumps(graph, cls=plotly.utils.PlotlyJSONEncoder)
    return graph
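# A minimal sketch of how the Plotly JSON returned by get_graph() could be
# served from a Flask view (hedged: the route, template name, and `dashboard`
# instance are illustrative assumptions, not part of the original code).
from flask import Flask, render_template

app = Flask(__name__)

@app.route('/graph/<area>')
def tweet_graph(area):
    graph_json = dashboard.get_graph(area=area)  # `dashboard` is assumed to own self.client
    # The template would hand the JSON to Plotly.js, roughly:
    #   Plotly.newPlot(g.id, g.data, g.layout) for each g in the parsed list
    return render_template('graph.html', graphJSON=graph_json)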
def compute_scores(self, estimator):
    dp = DataProcessor()
    reports = dp.read_and_process_report_data(self.path_to_reports_data, self.project)
    reports_to_process = reports[self.train_split_index_start:self.train_split_index_end]

    # Score every report in parallel; don't use more workers than reports
    pool = pp.ProcessPool(10)
    self.cur_estimator = estimator
    all_scores = pool.map(self.get_report_score, reports_to_process)

    all_matrixes = [i[0] for i in all_scores]
    total_tried = sum([i[1] for i in all_scores])
    number_achieved = sum([i[2] for i in all_scores])
    print "finished pooling"
    print all_scores

    final_MAP_score = self.MAP(all_matrixes)
    final_MRR_score = self.MRR(all_matrixes)
    print final_MAP_score, " final MAP score"
    print final_MRR_score, " final MRR score"
    print float(number_achieved) / float(total_tried), " final accuracy at k score"
    return final_MAP_score
def roberta_pair_task(config):
    tokenizer = BertTokenizer.from_pretrained(config.tokenizer_file,
                                              do_lower_case=config.do_lower_case)
    processor = DataProcessor(config)
    config.class_list = processor.get_labels()
    config.num_labels = len(config.class_list)

    train_examples = processor.get_train_examples()
    dev_examples = processor.get_dev_examples()
    augment_examples = processor.read_data_augment(config.data_augment_method)

    cur_model = MODEL_CLASSES[config.use_model]
    model = cur_model(config)
    logging.info("self config %s", config_to_json_string(config))

    model_example, dev_evaluate, predict_label = cross_validation(
        config=config,
        model=model,
        tokenizer=tokenizer,
        train_examples=train_examples,
        dev_examples=dev_examples,
        pattern=config.pattern,
        train_enhancement=augment_examples if config.data_augment else None,
        test_examples=None)
    logging.info("dev_evaluate: {}".format(dev_evaluate))

    if config.pattern == 'full_train':
        model_save(config, model_example)
    return dev_evaluate
def test_reading_in():
    dp = DataProcessor()
    dp.get_stackoverflow_data("/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/birt/")
    dp.get_stackoverflow_data("/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/eclipse/")
    dp.get_stackoverflow_data("/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/eclipse-jdt/")
    dp.get_stackoverflow_data("/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/swt/")
def __init__(self, model_path=None):
    self.config = DataConfig()
    self.dp = DataProcessor(self.config)
    self.num_channels = self.config.num_channels
    self.row = self.config.img_height
    self.col = self.config.img_width
    self.ch = self.config.num_channels
    self.model = self.load_model(model_path)
def process_files_eclipse():
    dp = DataProcessor()
    path_to_reports_data = "/home/ndg/users/carmst16/EmbeddingBugs/resources/bugreport/Eclipse_Platform_UI.xlsx"
    path_to_starter_repo = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.ui/"
    path_to_processed_repo = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.ui_processed_split/"
    path_to_temp = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.ui_temp/"
    reports = dp.read_and_process_report_data(path_to_reports_data, "eclipse_platform_ui")
    dp.process_all_files(path_to_starter_repo, reports, path_to_processed_repo, path_to_temp)
def process_files_birt():
    dp = DataProcessor()
    path_to_reports_data = "/home/ndg/users/carmst16/EmbeddingBugs/resources/bugreport/Birt.xlsx"
    path_to_starter_repo = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/birt/"
    path_to_processed_repo = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/birt_processed_split/"
    path_to_temp = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/birt_temp/"
    reports = dp.read_and_process_report_data(path_to_reports_data, "birt")
    dp.process_all_files(path_to_starter_repo, reports, path_to_processed_repo, path_to_temp)
def process_files_swt():
    dp = DataProcessor()
    path_to_reports_data = "/home/ndg/users/carmst16/EmbeddingBugs/resources/bugreport/SWT.xlsx"
    path_to_starter_repo = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.swt/"
    path_to_processed_repo = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.swt_processed_split_text_trial/"
    path_to_temp = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.swt_temp_again/"
    reports = dp.read_and_process_report_data(path_to_reports_data, "swt")
    dp.process_all_files(path_to_starter_repo, reports, path_to_processed_repo, path_to_temp)
def Selector1():
    selector = Selector()
    dataProcessor = DataProcessor()
    # abil = selector.getAbilityWithId(3)
    passed = selector.runSelector()
    print(passed)
    selector.setCharacterStats(15, 15)
    passed = selector.runSelector()
    print(passed)
    print(selector.HP)
    print(selector.getAuraWithId(selector.getAbilityWithId(2)['auraOne']))
def __init__(self):
    self.n_clusters = 2
    self.algorithms = {
        'current': STRPAlgorithm(self.n_clusters),
        'future': STRPAlgorithm(self.n_clusters)
    }
    self.data_processors = {
        'current': DataProcessor(),
        'future': DataProcessor()
    }
    self.max_absolute_treshold = 13
    self.min_absolute_treshold = 5
    self.max_percentual_treshold = .1
    self.min_percentual_treshold = .02
    self.entity_temper_percentual_threshold = .2

    self.is_running = False
    self.container = list()
    self.processed_nodes = list()
    self.raw_data = list()

    self.client = udp_client.UDPClient(OSC_SERVER, 8000)
    self.last_iteration = datetime.now()

    print('Application initialised')
    self.is_running = True

    # Create dummy data
    for i, d in enumerate(start_data):
        transformed_data = self.data_processors['current'].transform_input_data(d)
        self.processed_nodes.append(transformed_data)
def __init__(self, filename, embedding_method='deepwalk', **kwargs):
    self.dp = DataProcessor(filename)
    self.workers = cpu_count()
    self.embedding_model = None
    self.embedding_method = embedding_method
    print("Init over.")
    sys.stdout.flush()

    if embedding_method == 'deepwalk':
        self.deepwalk(**kwargs)
    elif embedding_method == 'grarep':
        self.grarep(**kwargs)
    elif embedding_method == 'node2vec':
        self.node2vec(**kwargs)
    else:
        raise TypeError("Unsupported embedding method %s" % embedding_method)
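# Hedged usage sketch: the enclosing class name is not shown above, so
# `GraphEmbedding` and the edge-list filename are illustrative assumptions.
emb = GraphEmbedding('graph_edgelist.txt', embedding_method='deepwalk')
model = emb.embedding_model  # populated by the deepwalk(**kwargs) call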
def __init__(self):
    self.experimenter = Experimenter()
    self.dataProcessor = DataProcessor()
    self.gui = GUI()
    self.gui.addListener("computeButton", "clicked", self.compute)
    self.gui.addListener("computeFunctionButton", "clicked", self.compute)
    self.gui.addListener("computeSequenceButton", "clicked", self.compute)
    self.gui.addListener("graphButton", "clicked", self.graph)
    self.gui.addListener("viewInputButton", "clicked", self.viewInput)
    self.gui.addListener("roundOffSpinButton", "value-changed", self.roundOffSummary)
    self.gui.addListener("showAdvancedButton", "clicked", self.showAdvancedSettings)
    self.gui.addListener("hideAdvancedButton", "clicked", self.hideAdvancedSettings)
def getDataFromDB():
    """Fetches the data from the local InfluxDB server containing the
    patients' measurements."""
    ipDB = os.getenv('INFLUX_IP_AI', 'localhost')
    portDB = os.getenv('INFLUX_PORT_AI', '8086')
    userDB = os.getenv('INFLUX_USER_AI', 'admin')
    passwordDB = os.getenv('INFLUX_PW_AI', 'G10m1R0m3')
    nameDB = os.getenv('INFLUX_DB_AI', 'giomi')

    dr = DataRetriever(metrics)
    dfs = dr.loadDataFromDB(ipDB, portDB, userDB, passwordDB, nameDB)
    dp = DataProcessor(metrics, defaults)
    df = dp.applyPipeline(dfs)
    return df
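# Usage sketch (hedged): with no INFLUX_* variables exported, the defaults
# above are used, so a local InfluxDB on port 8086 is assumed.
#
#   export INFLUX_IP_AI=10.0.0.5   # override the host if needed
#
if __name__ == '__main__':
    df = getDataFromDB()
    print(df.head())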
def login(self):
    username_input = self.username_entry.get()
    password_input = self.password_entry.get()
    self.login_processor = DataProcessor(
        file_path="C:/Users/JakeT/onedrive/documents/visual studio 2017/Projects/PyStation/PyStation/static/config/users.csv")
    database = self.login_processor.data
    for entry in database:
        print(entry, "entry")
        print(type(entry), "type(entry)")
        print(entry[0], "entry[0]")
        print(entry[1], "entry[1]")
        if username_input == entry[0] and password_input == entry[1]:
            print("CORRECT USERNAME AND PASSWORD")
    # print(entry, "value")
    # print(type(entry), "type")
    print(self.login_processor)
def train(self):
    # This describes everything you want to search over
    parameters = {'size': [100, 500],
                  'window': [5, 10],
                  'sg': [1],
                  'workers': [16],
                  'hs': [0],
                  'negative': [25],
                  'iter': [1]}

    dp = DataProcessor()
    data = dp.get_stackoverflow_data_sentences_all([
        "/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/swt/",
        "/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/birt/",
        "/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/eclipse/",
        "/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/eclipse-jdt/"])

    # A GridSearchCV over a W2VTransformer (scoring with MAP/MRR, no real CV
    # folds) was considered; the current implementation only uses MAP to score.
    # see: https://stackoverflow.com/questions/44636370/scikit-learn-gridsearchcv-without-cross-validation-unsupervised-learning/44682305#44682305
    cur_max = 0
    best_model = None
    parameters["size"] = [100]
    parameters["window"] = [10]
    for s in parameters["size"]:
        for w in parameters["window"]:
            print len(data)
            print "training model"
            model = gensim.models.Word2Vec(sentences=data, sg=1, size=s, window=w,
                                           workers=16, hs=0, negative=25, iter=5)
            print "model trained"
            print parameters
            score = self.compute_scores(model)
            if score > cur_max:
                cur_max = score
                best_model = model
                print cur_max

    word_vectors = best_model.wv
    print "VOCAB_SIZE", len(best_model.wv.vocab)  # fixed: was len(model.wv.vocab)
    word_vectors.save("best_model")
def __init__(self):
    ops.reset_default_graph()
    self.sess = tf.InteractiveSession()
    self.dp = DataProcessor(DataConfig())
    self.config = self.dp.config
    self.row = self.config.img_height
    self.col = self.config.img_width
    self.ch = self.config.num_channels
    self.batch_count = 0
    self.create_nvidia_model()
    self.create_train_method()
    self.epoch_count = 0
    self.step_count = 0
    self.loss_val = 1
    self.saver = tf.train.Saver()
    if self.config.model_continue:
        self.restore_sess()
    else:
        self.sess.run(tf.global_variables_initializer())
def get_model_coverage(self):
    dp = DataProcessor()
    data = dp.get_stackoverflow_data_sentences_all([
        "/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/swt/",
        "/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/birt/",
        "/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/eclipse/",
        "/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/eclipse-jdt/"])
    model = gensim.models.Word2Vec(sentences=data, sg=1, size=100, window=10,
                                   workers=16, hs=0, negative=25, iter=1)
    vocab = model.wv.vocab
    print "VOCAB_SIZE", len(vocab)

    reports = dp.read_and_process_report_data(self.path_to_reports_data, self.project)
    all_report_text = []
    all_source_file_text = []
    for report in reports:
        report_text = report.processed_description
        file_path = self.path_to_processed_repo + str(report.reportID) + "/"
        all_report_text.extend(report_text)
        for dir_, _, files in os.walk(file_path):
            for fileName in files:
                relDir = os.path.relpath(dir_, file_path)
                relFile = os.path.join(relDir, fileName)
                full_path = file_path + relFile
                with open(full_path, 'r') as content_file:
                    content = content_file.readlines()
                    for line in content:
                        l = line.strip().split(",")
                        all_source_file_text.extend(l)

    all_report_vocab = set(all_report_text)
    all_source_file_vocab = set(all_source_file_text)
    print "report coverage", \
        len(set.intersection(all_report_vocab, vocab)) / float(len(all_report_vocab))
    print "source file coverage", \
        len(set.intersection(all_source_file_vocab, vocab)) / float(len(all_source_file_vocab))
def main():
    reader = DataReader('dataSrc')
    data = reader.readCoordinates()
    processor = DataProcessor(data)
    locations = processor.processDataPoints()
    try:
        for location in locations:
            location.state.country.addNew()
            location.state.country_id = location.state.country.id
            # location.state.country = None
            location.state.addNew()
            location.state_id = location.state.id
            # location.state = None
            location.addNew()
    except Exception as e:
        print(e)
    print(Location.listAll())
def main():
    try:
        selectmodel = int(input("1. Deep Neural Network\n"
                                "2. Convolutional Neural Network\n"
                                "Choose Model : "))
    except ValueError:
        selectmodel = 10
    while selectmodel != 1 and selectmodel != 2:
        print("Wrong model number, please enter it again.")
        selectmodel = int(input("1. Deep Neural Network\n"
                                "2. Convolutional Neural Network\n"
                                "Choose Model : "))

    # Process data
    data = pd.read_csv(input_directory + "FinalTest_data.csv")
    dataProcessor = DataProcessor()
    data_fivefold = dataProcessor.fivefold(data, 'index')
    x_data_five, y_data_five = dataProcessor.divide_xy(data_fivefold, 'result')
    train_x, test_x = dataProcessor.train_test(x_data_five, 0)
    train_y, test_y = dataProcessor.train_test(y_data_five, 0)
    train_y = dataProcessor.one_hot_encoder(train_y)
    test_y = dataProcessor.one_hot_encoder(test_y)
    cal_x, test_x = dataProcessor.calibration(test_x)
    cal_y, test_y = dataProcessor.calibration(test_y)

    if selectmodel == 1:
        model = DNN([1000, 1000, 1000])
        model.fit(train_x, train_y, cal_x, cal_y)
        model.test(test_x, test_y)
    else:
        model = CNN([10, 10, 10])
        print(model.get_name())
def test_read_reports():
    bug_file_path = "/home/ndg/users/carmst16/EmbeddingBugs/resources/bugreport/SWT.xlsx"
    project = "swt"
    dp = DataProcessor()
    already_processed = False
    previous_commit = None
    path_to_starter = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.swt/"
    path_to_processed = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.swt_processed/"
    path_to_temp = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.swt_temp/"

    reports = dp.read_and_process_report_data(bug_file_path, project)
    print "finished processing"

    l_content = []  # fixed: was used below without being initialised
    for report in reports[1:2]:
        report_text = report.processed_description
        if not already_processed:
            dp.create_file_repo(path_to_starter, report, path_to_processed)
            already_processed = True
            previous_commit = report.commit
        else:
            dp.update_file_repo(previous_commit, report.commit, path_to_starter,
                                path_to_temp, path_to_processed)
            previous_commit = report.commit
        for dir_, _, files in os.walk(path_to_processed):
            for fileName in files:
                relDir = os.path.relpath(dir_, path_to_processed)
                relFile = os.path.join(relDir, fileName)
                full_path = path_to_processed + relFile
                with open(full_path, 'r') as content_file:
                    content = content_file.readlines()
                    for line in content:
                        l = line.split(",")
                        l_content.append(l)
def vectorize_all(self, n, type='ngram'):
    dp_list = [
        DataProcessor().load_raw_data_single_user_segments(user_num, num_of_segments=150)
        for user_num in range(40)
    ]
    vectorizer = Vectorizer(ngram_count=n, type=type)
    pdfs = []
    for user_num in range(len(dp_list)):
        user_result = vectorizer.vectorize(dp_list[user_num], to_array=True)
        user_pdf = pd.DataFrame(user_result, columns=vectorizer.get_features())
        user_pdf['User'] = user_num
        user_pdf['Segment'] = np.arange(150)
        user_pdf['Label'] = self.get_labels_array(user_num)
        user_pdf.to_csv('outputs/Vectorizer/{}-{}-user{}.csv'.format(type, n, user_num))
        pdfs.append(user_pdf)
        del user_pdf
        print "Successfully vectorized user{} !".format(user_num)
    result_pdf = pd.concat(pdfs, ignore_index=True, axis=0, sort=True)
    result_pdf.to_csv('outputs/Vectorizer/all-{}-{}.csv'.format(n, type))
def augment_task(config):
    processor = DataProcessor(config)
    train_examples = processor.get_train_examples()
    dev_examples = processor.get_dev_examples()
    print("train_examples: {}".format(len(train_examples)))
    print("dev_examples: {}".format(len(dev_examples)))

    if config.transmit_augment:
        print('starting transmit data augment.')
        train_augment = sentence_set_pair(train_examples, random_state=config.seed)
        augment_data_save(
            train_augment,
            os.path.join(config.other_data_dir, config.train_augment_save_file))
        new_train_augment = copy.deepcopy(train_examples)
        new_train_augment.extend(dev_examples)
        print(len(new_train_augment))
        train_dev_augment = sentence_set_pair(new_train_augment, random_state=config.seed)
        augment_data_save(
            train_dev_augment,
            os.path.join(config.other_data_dir, config.train_dev_augment_save_file))
    if config.category_augment:
        print('starting new category data augment.')
        medicine_examples = processor.get_medicine_examples()
        save_path = os.path.join(config.other_data_dir, config.category_augment_save_file)
        new_category_generate(train_examples, dev_examples, medicine_examples, save_path)
    if config.chip2019_augment:
        print('starting extract chip2019 data augment.')
        chip2019_extract(config)
def __init__(self, config, debug=True):
    self.config = config
    self.debug = debug
    self.logger = LogAPI.create_logger(self.__class__.__name__, self.debug)
    self.data_processor = DataProcessor(config)
    self.data_processor.prepare()
    n_vocab = len(self.data_processor.vocab)
    self.model = L.Classifier(Img2Seq(n_vocab, config))
    self.model.compute_accuracy = False  # I want loss, not accuracy
    self.optimizer = chainer.optimizers.Adam()
    self.optimizer.setup(self.model)
    if self.config['use_gpu'] >= 0:
        chainer.cuda.get_device(self.config['use_gpu']).use()
        self.model.to_gpu()
        self.xp = cuda.cupy
    else:
        self.xp = np

    # dict for storing the parameters and experiment results
    self.result_storage = {}
    self.result_storage["result"] = {
        "total_loss": [],
        "average_loss": [],
        "time_taken": [],
        "hyper_params": config}
def select_features(self, write=True):
    # 1st feature: TOP COMMON WORDS
    #   the most common commands overall (the code below takes the top 500
    #   commands and the top 100 bigrams; higher-order n-grams would also work)
    # 2nd feature: NEW USED COMMANDS
    #   number of commands that didn't appear in the first 50 segments but
    #   appear in the given chunk (an indication of recently adopted commands);
    #   this feature is 0 for the first 50 segments
    # 3rd feature: FAKE COMMAND COUNTS
    #   commands that are used by the malicious user:
    #   - unique commands used only as fake commands in the given segment
    #   - unique commands used only as benign commands in the given segment
    #   - number of commands from all the fake commands in the trainset for the segment
    #   - number of commands from all the benign commands in the trainset - NOT INCLUDED
    # 4th feature: REPEATED COMMAND SEQUENCES
    #   number of distinct command sequences repeated at least 4 times (per length);
    #   a legitimate user is unlikely to use the same sequence of commands repeatedly
    df = pd.DataFrame(columns=['User', 'Segment']).set_index(['User', 'Segment'])
    commands = pd.Series(DataProcessor().get_all_commands_series())
    print commands.keys()
    partial_labels = self.get_partial_labels()

    bigram_list = []
    commands_list = []
    for user_cmd in commands:
        for segment_cmd in user_cmd:
            commands_list.extend(segment_cmd)
            bigram_list.extend([(segment_cmd[i], segment_cmd[i + 1])
                                for i in range(len(segment_cmd) - 1)])

    # 1st feature
    top_commands = pd.Series(commands_list).value_counts().nlargest(500).index.tolist()
    top_bigrams = pd.Series(bigram_list).value_counts().nlargest(100).index.tolist()
    print 'top commands:'
    print top_commands
    print 'top bigrams:'
    print top_bigrams

    # preparations for the 2nd feature
    distinct_first_50_commands = set()
    for user_num in commands.keys():
        for segment in commands[user_num][:50]:
            for command in segment:
                distinct_first_50_commands.add(command)
    print 'Finished distinct_first_50_commands!'

    # preparations for the 3rd feature
    malicious_commands = defaultdict(list)
    for i in range(50, 150):
        col_index = str(100 * i) + '-' + str(100 * (i + 1))
        for num_user in range(10):
            if partial_labels[col_index][num_user] == 1:
                malicious_commands[num_user].extend(commands[num_user][i])

    malicious_commands_of_train_users_set = set()
    benign_commands_of_train_users_set = set()
    for num_user in range(10):
        malicious_commands_of_train_users_set = \
            malicious_commands_of_train_users_set.union(set(malicious_commands[num_user]))
        user = pd.Series(commands[num_user])  # fixed: was commands[user_num]
        for segment in user[:50]:
            benign_commands_of_train_users_set = \
                benign_commands_of_train_users_set.union(set(segment))
    commands_used_only_by_malicious_train = \
        malicious_commands_of_train_users_set - benign_commands_of_train_users_set
    commands_used_only_by_benign_train = \
        benign_commands_of_train_users_set - malicious_commands_of_train_users_set
    print 'Finished preparing sets of benign and malicious!'

    dp_list = [
        DataProcessor().load_raw_data_single_user_segments(user_num, num_of_segments=150)
        for user_num in range(40)
    ]
    user_cmd_avg_len = [self.command_avg_length(dp_list[user_num]) for user_num in range(40)]
    user_diff_cmd = [self.diff_commands_in_seg(dp_list[user_num]) for user_num in range(40)]
    user_num_of_seq = [self.num_of_sequences(dp_list[user_num]) for user_num in range(40)]
    print 'Finished preparing features of michal!'
    # adding the additional features
    for user_num in commands.keys():
        for num_segment, segment in enumerate(commands[user_num]):
            # 1st feature
            for top_cmd in top_commands:
                df.loc[(user_num, num_segment), top_cmd] = segment.count(top_cmd)
            string_segment = ' '.join(segment)
            for top_bigram in top_bigrams:
                string_bigram = top_bigram[0] + ' ' + top_bigram[1]
                df.loc[(user_num, num_segment), string_bigram] = \
                    string_segment.count(string_bigram)

            # 2nd feature
            df.loc[(user_num, num_segment), 'NewUsedCommands'] = \
                len(set(segment) - distinct_first_50_commands)

            # 3rd feature
            df.loc[(user_num, num_segment), 'UniqueMaliciousCommands'] = \
                len(set(segment) & commands_used_only_by_malicious_train)
            # fixed: the original assigned to 'UniqueMaliciousCommands' twice
            df.loc[(user_num, num_segment), 'UniqueBenignCommands'] = \
                len(set(segment) & commands_used_only_by_benign_train)
            df.loc[(user_num, num_segment), 'MaliciousCommandsCount'] = \
                len(set(segment) & malicious_commands_of_train_users_set)
            # df.loc[(user_num, num_segment), 'BenignCommandsCount'] = \
            #     len(set(segment) & benign_commands_of_train_users_set)

            # 4th feature
            min_len = 2
            max_len = 10
            minimum_seq_count = 4
            count_dict = {c: 0 for c in range(min_len, max_len)}
            lst = segment
            for sub in self.get_list_of_sublist(lst, min_len, max_len):
                sub_list = list(sub)
                counts = [1 if lst[i:(i + len(sub_list))] == sub_list else 0
                          for i in range(len(segment) - len(sub_list))]
                # slice the match list in steps of the sequence length to
                # avoid counting overlapping occurrences of the same sequence
                count_sum = sum(1 for i in range(0, len(counts), len(sub_list))
                                if sum(counts[i:i + len(sub_list)]) > 0)
                if count_sum > minimum_seq_count:
                    count_dict[len(sub)] += 1
            for count_key, count_val in count_dict.items():
                df.loc[(user_num, num_segment),
                       'Seq_of_commands_repeated_{}'.format(count_key)] = count_val
            df.loc[(user_num, num_segment), 'Length_duplicated_command'] = \
                max(sum(1 for i in g) for k, g in groupby(segment))

            # added michal features
            df.loc[(user_num, num_segment), 'Num_of_sequences'] = \
                user_num_of_seq[user_num][num_segment]
            df.loc[(user_num, num_segment), 'Diff_commands'] = \
                user_diff_cmd[user_num][num_segment]
            df.loc[(user_num, num_segment), 'Avg_commands_length'] = \
                user_cmd_avg_len[user_num][num_segment]
            print 'Done loop: User {}, Segment {} ...'.format(user_num, num_segment)
    print 'Finished loop!'

    df.fillna(0, inplace=True)

    # remove overlapping counts: a repeated sequence of length n+1 is also
    # counted as a repeated sequence of length n, so subtract the longer
    # counts from the shorter ones, then drop all lengths except 3 and 5
    for n in range(2, 9):
        df.loc[:, 'Seq_of_commands_repeated_{}'.format(n)] = \
            df['Seq_of_commands_repeated_{}'.format(n)] - \
            df['Seq_of_commands_repeated_{}'.format(n + 1)]
    for n in (9, 8, 7, 6, 4, 2):
        del df['Seq_of_commands_repeated_{}'.format(n)]

    df.loc[:, 'Label'] = self.get_labels_array_all()
    print 'Before write...'
    if write:
        df.to_csv(self.feature_select_output_file)
    return df
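# Toy illustration of the 4th feature's idea (hedged: this greedy scan is a
# simplified stand-in for the windowed counting above, for demonstration only).
def count_nonoverlapping(lst, sub):
    count, i = 0, 0
    while i <= len(lst) - len(sub):
        if lst[i:i + len(sub)] == sub:
            count += 1
            i += len(sub)  # jump past the match so occurrences never overlap
        else:
            i += 1
    return count

segment = ['ls', 'cd', 'ls', 'cd', 'ls', 'cd', 'ls', 'cd', 'ls', 'cd']
print count_nonoverlapping(segment, ['ls', 'cd'])  # 5 > minimum_seq_count (4)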
        # tail of the n-fold evaluation loop: accumulate per-fold metrics
        f1 += metric.f1()
        precision += metric.precision()
        recall += metric.recall()
        matrices += [metric.confusion_matrix()]

    # average the accumulated metrics over the folds
    f1 /= n_folds
    precision /= n_folds
    recall /= n_folds
    matrices = np.array(matrices)
    return f1, precision, recall, matrices


if __name__ == '__main__':
    print('Random Forest')
    path = DatasetPath.MIT2
    dp = DataProcessor(path=path)
    rf = RandomForest(dp)
    rf.fit(dp.data_processed)
    row = dp.process_sensors().iloc[[0]]
    print(rf.predict(row))
    f1, precision, recall, matrices = rf.evaluate()
    print(f'F1 = {f1}')
    print(f'Precision = {precision}')
    print(f'Recall = {recall}')
def __init__(self):
    self.processor = DataProcessor(file_path=USERPATH)
from tensorflow.python.keras.layers import TextVectorization
from DataProcessor import DataProcessor
from matplotlib import pyplot as plt
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
import tensorflowjs as tfjs

# Params
BATCH_SIZE = 512  # Number of examples used in each iteration
EPOCHS = 100      # Number of passes through the entire dataset
EMBEDDING = 40    # Dimension of the word embedding vector

# importing the data
dir_path = 'annotated/corpus'
dataProcessor = DataProcessor(dir_path, 'tei')
sentences = dataProcessor.getListOfTuples()

# lookup tables mapping words and tags to integer indices (0 = pad, 1 = unk)
word2idx = {w: i + 2 for i, w in enumerate(dataProcessor.getWords())}
word2idx['unk'] = 1
word2idx['pad'] = 0
idx2word = {i: w for w, i in word2idx.items()}

tag2idx = {t: i + 1 for i, t in enumerate(dataProcessor.getTags())}
tag2idx['pad'] = 0
idx2tag = {i: w for w, i in tag2idx.items()}

# Write dictionary
import json
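# Hedged sketch of the dictionary export announced by the comment above
# (the output filenames are assumptions, not from the original code):
with open('word2idx.json', 'w') as f:
    json.dump(word2idx, f)
with open('tag2idx.json', 'w') as f:
    json.dump(tag2idx, f)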
#!/bin/python

from DataProcessor import DataProcessor
from Utils import Utils
from Logger import Logger
from MongoDB import MongoDB

Utils.createFolderIfNotExists(DataProcessor.TMP_FOLDER)
LOGGER = Logger(DataProcessor.TMP_FOLDER, verbose=True, name='processor')
Utils(DataProcessor.TMP_FOLDER, LOGGER)

mongo = MongoDB('127.0.0.1', 27017, LOGGER, user='******', password='******')
mongo.startQueue(id=0)
print(mongo.getQueueConsumerId())

processor = DataProcessor(mongo, LOGGER)
processor.filterAndNormalizeFullDataset()