Example #1
    def generateData(self, rate):
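        # Run the same pipeline for DataProcessor 0 and 1: load the file, retrieve the channel, and save it at the given rate.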
        obj = DataProcessor(0)
        obj.loadFileAndRetrieveCh()
        obj.saveToFile(rate)

        obj = DataProcessor(1)
        obj.loadFileAndRetrieveCh()
        obj.saveToFile(rate)
Example #2
def main():

    data_path = "../data2/training-Obama-Romney-tweets.xlsx"
    test_data_path = ''
    # test_data_path = '../data/testing-Obama-Romney-tweets-3labels.xlsx'

    try:
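        # Parse command-line options: -d overrides the training data path, -t sets an optional test data path.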
        opts, args = getopt.getopt(sys.argv[1:], "d:t:")
        for o, a in opts:
            if o == '-d':
                data_path = a
            elif o == '-t':
                test_data_path = a

    except getopt.GetoptError as err:
        # print help information and exit:
        print str(err)
        print 'read the readme file to know how to run this project'
        sys.exit(2)

    dp = DataProcessor(data_path)
    tc = TweetClassifier()

    if test_data_path != '':

        dpt = DataProcessor(test_data_path)

        print '\n****** OBAMA ******\n'
        data = dp.load_excel_data('Obama')
        data_test = dpt.load_excel_data('Obama')
        report = tc.train_test(data, data_test)
        DataProcessor.print_report(report)

        print '\n****** ROMNEY ******\n'
        data = dp.load_excel_data('Romney')
        data_test = dpt.load_excel_data('Romney')
        report = tc.train_test(data, data_test)
        DataProcessor.print_report(report)

    else:
        print '\n****** OBAMA ******\n'
        data = dp.load_excel_data('Obama')
        report = tc.crossvalidate(data, 10)
        DataProcessor.print_report(report)

        print '\n****** ROMNEY ******\n'
        data = dp.load_excel_data('Romney')
        report = tc.crossvalidate(data, 10)
        DataProcessor.print_report(report)
Example #3
    def get_graph(self, area=None, company=None):
        """
        Queries tweets by given area/company filter.
        """
        # Get all tweets or filtered by area
        if area:
            data = self.client.get_tweets_by_area(area)
            suffix = area.upper()
        elif company:
            data = self.client.get_tweets_by_company(company)
            suffix = company.upper()
        else:
            data = self.client.get_all_tweets()
            suffix = 'GLOBAL'

        processor = DataProcessor(data)

        # Get the time series data
        time_series = processor.prepare_time_series()

        # Save all the graph info in a list we can access from the view template
        graph = [
            dict(data=[dict(x=time_series.index, y=time_series)],
                 layout=dict(title='Tweet Frequency - ' + suffix),
                 id='timeseries')
        ]

        # Plotly needs the graph/pandas data encoded in compatible JSON format
        graph = json.dumps(graph, cls=plotly.utils.PlotlyJSONEncoder)

        return graph
Example #4
    def compute_scores(self, estimator):
        dp = DataProcessor()

        already_processed = False
        previous_commit = None
        all_scores = []

        reports = dp.read_and_process_report_data(self.path_to_reports_data, self.project)
        #print self.train_split_index_start, self.train_split_index_end

        reports_to_process = reports[self.train_split_index_start: self.train_split_index_end]
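        # Score every report in parallel; get_report_score returns (matrix, tried, achieved) for each report.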
        pool = pp.ProcessPool(10)  # don't create more workers than the number of reports?
        self.cur_estimator = estimator

        all_scores = pool.map(self.get_report_score, reports_to_process)
        #pool.close()
        #pool.join()
        all_matrixes = [i[0] for i in all_scores]
        total_tried = sum([i[1] for i in all_scores])
        number_achieved = sum([i[2] for i in all_scores])

        print "finished pooling"
        print all_scores
        final_MAP_score = self.MAP(all_matrixes)
        final_MRR_score = self.MRR(all_matrixes)
        print final_MAP_score, " final MAP score"
        print final_MRR_score, " final MRR score"
        print float(number_achieved)/float(total_tried), " final accuracy at k score"
        return final_MAP_score
Example #5
def roberta_pair_task(config):

    tokenizer = BertTokenizer.from_pretrained(config.tokenizer_file,
                                              do_lower_case=config.do_lower_case)
    processor = DataProcessor(config)
    config.class_list = processor.get_labels()
    config.num_labels = len(config.class_list)

    train_examples = processor.get_train_examples()
    dev_examples = processor.get_dev_examples()
    augment_examples = processor.read_data_augment(config.data_augment_method)

    cur_model = MODEL_CLASSES[config.use_model]
    model = cur_model(config)

    logging.info("self config %s", config_to_json_string(config))

    model_example, dev_evaluate, predict_label = cross_validation(
        config=config,
        model=model,
        tokenizer=tokenizer,
        train_examples=train_examples,
        dev_examples=dev_examples,
        pattern=config.pattern,
        train_enhancement=augment_examples if config.data_augment else None,
        test_examples=None)
    logging.info("dev_evaluate: {}".format(dev_evaluate))

    if config.pattern == 'full_train':
        model_save(config, model_example)

    return dev_evaluate
Example #6
def test_reading_in():
    dp = DataProcessor()
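    # Load Stack Overflow data from the birt, eclipse, eclipse-jdt, and swt directories.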

    dp.get_stackoverflow_data("/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/birt/")
    dp.get_stackoverflow_data("/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/eclipse/")
    dp.get_stackoverflow_data("/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/eclipse-jdt/")
    dp.get_stackoverflow_data("/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/swt/")
Example #7
 def __init__(self, model_path=None):
     self.config = DataConfig()
     self.dp = DataProcessor(self.config)
     self.num_channels = self.config.num_channels
     self.row = self.config.img_height
     self.col = self.config.img_width
     self.ch = self.config.num_channels
     self.model = self.load_model(model_path)
Example #8
def process_files_eclipse():
    dp = DataProcessor()
    path_to_reports_data = "/home/ndg/users/carmst16/EmbeddingBugs/resources/bugreport/Eclipse_Platform_UI.xlsx"
    path_to_starter_repo = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.ui/"
    path_to_processed_repo = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.ui_processed_split/"
    path_to_temp = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.ui_temp/"
    reports = dp.read_and_process_report_data(path_to_reports_data, "eclipse_platform_ui")
    dp.process_all_files(path_to_starter_repo, reports, path_to_processed_repo, path_to_temp)
Example #9
def process_files_birt():
    dp = DataProcessor()
    path_to_reports_data = "/home/ndg/users/carmst16/EmbeddingBugs/resources/bugreport/Birt.xlsx"
    path_to_starter_repo = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/birt/"
    path_to_processed_repo = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/birt_processed_split/"
    path_to_temp = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/birt_temp/"
    reports = dp.read_and_process_report_data(path_to_reports_data, "birt")
    dp.process_all_files(path_to_starter_repo, reports, path_to_processed_repo, path_to_temp)
Example #10
def process_files_swt():
    dp = DataProcessor()
    path_to_reports_data = "/home/ndg/users/carmst16/EmbeddingBugs/resources/bugreport/SWT.xlsx"
    path_to_starter_repo = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.swt/"
    path_to_processed_repo = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.swt_processed_split_text_trial/"
    path_to_temp = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.swt_temp_again/"
    reports = dp.read_and_process_report_data(path_to_reports_data, "swt")
    dp.process_all_files(path_to_starter_repo, reports, path_to_processed_repo, path_to_temp)
Example #11
def Selector1():
    selector = Selector()
    dataProcessor = DataProcessor()
    #abil = selector.getAbilityWithId(3)
    passed = selector.runSelector()
    print(passed)
    selector.setCharacterStats(15, 15)
    passed = selector.runSelector()
    print(passed)
    print(selector.HP)
    print(selector.getAuraWithId(selector.getAbilityWithId(2)['auraOne']))
Example #12
    def __init__(self):
        self.n_clusters = 2

        self.algorithms = {
            'current': STRPAlgorithm(self.n_clusters),
            'future': STRPAlgorithm(self.n_clusters)
        }

        self.data_processors = {
            'current': DataProcessor(),
            'future': DataProcessor()
        }

        self.max_absolute_treshold = 13
        self.min_absolute_treshold = 5
        self.max_percentual_treshold = .1
        self.min_percentual_treshold = .02

        self.entity_temper_percentual_threshold = .2

        self.is_running = False

        self.container = list()
        self.processed_nodes = list()
        self.raw_data = list()

        self.client = udp_client.UDPClient(OSC_SERVER, 8000)

        self.last_iteration = datetime.now()
        print('Application initialised')
        self.is_running = True

        # Create dummy data
        for i, d in enumerate(start_data):
            transformed_data = self.data_processors[
                'current'].transform_input_data(d)
            self.processed_nodes.append(transformed_data)
Example #13
File: model.py Project: ztypl/MLexp
 def __init__(self, filename, embedding_method='deepwalk', **kwargs):
     self.dp = DataProcessor(filename)
     self.workers = cpu_count()
     self.embedding_model = None
     self.embedding_method = embedding_method
     print("Init over.")
     sys.stdout.flush()
     if embedding_method == 'deepwalk':
         self.deepwalk(**kwargs)
     elif embedding_method == 'grarep':
         self.grarep(**kwargs)
     elif embedding_method == "node2vec":
         self.node2vec(**kwargs)
     else:
         raise TypeError("Unsupported type %s" % embedding_method)
Example #14
 def __init__(self):
     self.experimenter = Experimenter()
     self.dataProcessor = DataProcessor()
     self.gui = GUI()
     self.gui.addListener("computeButton", "clicked", self.compute)
     self.gui.addListener("computeFunctionButton", "clicked", self.compute)
     self.gui.addListener("computeSequenceButton", "clicked", self.compute)
     self.gui.addListener("graphButton", "clicked", self.graph)
     self.gui.addListener("viewInputButton", "clicked", self.viewInput)
     self.gui.addListener("roundOffSpinButton", "value-changed",
                          self.roundOffSummary)
     self.gui.addListener("showAdvancedButton", "clicked",
                          self.showAdvancedSettings)
     self.gui.addListener("hideAdvancedButton", "clicked",
                          self.hideAdvancedSettings)
Example #15
def getDataFromDB():
    """ Funzione per ottenere i dati dal server locale influxdb contenente le misurazioni dei pazienti. """

    ipDB = os.getenv('INFLUX_IP_AI', 'localhost')
    portDB = os.getenv('INFLUX_PORT_AI', '8086')
    userDB = os.getenv('INFLUX_USER_AI', 'admin')
    passwordDB = os.getenv('INFLUX_PW_AI', 'G10m1R0m3')
    nameDB = os.getenv('INFLUX_DB_AI', 'giomi')

    dr = DataRetriever(metrics)
    dfs = dr.loadDataFromDB(ipDB, portDB, userDB, passwordDB, nameDB)

    dp = DataProcessor(metrics, defaults)
    df = dp.applyPipeline(dfs)

    return df
Example #16
    def login(self):
        username_input = self.username_entry.get()
        password_input = self.password_entry.get()
        self.login_processor = DataProcessor(file_path="C:/Users/JakeT/onedrive/documents/visual studio 2017/Projects/PyStation/PyStation/static/config/users.csv")
        database = self.login_processor.data

        for entry in database:
            print(entry, "entry")
            print(type(entry), "type(entry)")
            print(entry[0], "entry[0]")
            print(entry[1], "entry[1]")
            if username_input == entry[0] and password_input == entry[1]:
                print("CORRECT USERNAME AND PASSWORD")
#            print(entry,"value")
 #           print(type(entry),"type")

        print(self.login_processor)
Example #17
    def train(self):

        #this describes everything you want to search over
        parameters = {'size': [100,  500],
                      'window': [5, 10],
                      'sg': [1],
                      'workers': [16],
                      'hs': [0],
                      'negative': [25],
                      'iter': [1]
                      }

        dp = DataProcessor()
        data = dp.get_stackoverflow_data_sentences_all(["/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/swt/", "/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/birt/", "/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/eclipse/", "/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/eclipse-jdt/"])
        #if self.document:
         #   data = dp.get_stackoverflow_data_document(self.path_to_stackoverflow_data)
        #else:
          #  data = dp.get_stackoverflow_data_sentences(self.path_to_stackoverflow_data)
        w2v = W2VTransformer()
        # see: https://stackoverflow.com/questions/44636370/scikit-learn-gridsearchcv-without-cross-validation-unsupervised-learning/44682305#44682305
        #clf = GridSearchCV(w2v, parameters, scoring={"MPP": self.call_MRR, "MAP": self.call_MAP}, verbose=2, n_jobs=3, refit="MAP", cv=[(slice(None), slice(None))])

        # the current implementation only uses MAP to score
        #cv=[(slice(None), slice(None))]
        #clf = GridSearchCV(w2v, parameters, scoring= self.compute_scores, verbose=2)
        cur_max = 0
        best_model = None
        parameters["size"] = [100]
        parameters["window"] = [10]
        for s in parameters["size"]:
            for w in parameters["window"]:
                print len(data)
                print "training model"
                model = gensim.models.Word2Vec(sentences=data, sg=1, size=s, window=w, workers=16, hs=0, negative=25, iter=5)
                print "model trained"
                print parameters
                score = self.compute_scores(model)
                if score > cur_max:
                    cur_max = score
                    best_model = model
        print cur_max
        word_vectors = best_model.wv
        print "VOCAB_SIZE", len(model.wv.vocab)
        word_vectors.save("best_model")
Example #18
 def __init__(self):
     ops.reset_default_graph()
     self.sess = tf.InteractiveSession()
     self.dp = DataProcessor(DataConfig())
     self.config = self.dp.config
     self.row = self.config.img_height
     self.col = self.config.img_width
     self.ch = self.config.num_channels
     self.batch_count = 0
     self.create_nvidia_model()
     self.create_train_method()
     self.epoch_count = 0
     self.step_count = 0
     self.loss_val = 1
     self.saver = tf.train.Saver()
     if self.config.model_continue:
         self.restore_sess()
     else:
         self.sess.run(tf.global_variables_initializer())
Example #19
    def get_model_coverage(self):

        parameters = {'size': [100,  500],
                      'window': [5, 10],
                      'sg': [1],
                      'workers': [16],
                      'hs': [0],
                      'negative': [25],
                      'iter': [1]
                      }

        dp = DataProcessor()
        data = dp.get_stackoverflow_data_sentences_all(["/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/swt/", "/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/birt/", "/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/eclipse/", "/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/eclipse-jdt/"])

        model = gensim.models.Word2Vec(sentences=data, sg=1, size=100, window=10, workers=16, hs=0, negative=25, iter=1)
        vocab = model.wv.vocab
        print "VOCAB_SIZE", len(vocab)

        reports = dp.read_and_process_report_data(self.path_to_reports_data, self.project)
        all_report_text = []
        all_source_file_text = []
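        # Collect report descriptions and processed source-file tokens to measure vocabulary coverage.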
        for report in reports:
            report_text = report.processed_description
            file_path = self.path_to_processed_repo + str(report.reportID) + "/"
            all_report_text.extend(report_text)

            for dir_, _, files in os.walk(file_path):
                for fileName in files:
                    relDir = os.path.relpath(dir_, file_path)
                    relFile = os.path.join(relDir, fileName)
                    full_path = file_path + relFile
                    with open(full_path, 'r') as content_file:
                        content = content_file.readlines()
                        for line in content:
                            l = line.strip().split(",")
                            all_source_file_text.extend(l)

        all_report_vocab = set(all_report_text)
        all_source_file_vocab = set(all_source_file_text)

        print "report coverage", len(set.intersection(all_report_vocab, vocab))/ float(len(all_report_vocab))
        print "source file coverage", len(set.intersection(all_source_file_vocab, vocab))/ float(len(all_source_file_vocab))
Example #20
def main():
    reader = DataReader('dataSrc')

    data = reader.readCoordinates()

    processor = DataProcessor(data)
    locations = processor.processDataPoints()
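    # Persist each location in dependency order: country first, then state, then the location itself.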
    try:
        for location in locations:
            location.state.country.addNew()
            location.state.country_id = location.state.country.id
            #location.state.country = None
            location.state.addNew()
            location.state_id = location.state.id
            #location.state = None
            location.addNew()
    except Exception as e:
        print(e)

    print(Location.listAll())
Example #21
def main():
    try:
        selectmodel = int(
            input("1. Deep Neural Network\n"
                  "2. Convolutional Neural Network\n"
                  "Choose Model : "))
    except:
        selectmodel = 10

    while selectmodel != 1 and selectmodel != 2:
        print("Wrong number of model, Please Insert number again.")
        selectmodel = int(
            input("1. Deep Neural Network\n"
                  "2. Convolutional Neural Network\n"
                  "Choose Model : "))

    #Process Data#
    data = pd.read_csv(input_directory + "FinalTest_data.csv")
    dataProcessor = DataProcessor()
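    # Five-fold split on 'index', separate features from the 'result' label, use fold index 0 for the
    # train/test split, one-hot encode the labels, and split the test data into calibration and test parts.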
    data_fivefold = dataProcessor.fivefold(data, 'index')
    x_data_five, y_data_five = dataProcessor.divide_xy(data_fivefold, 'result')
    train_x, test_x = dataProcessor.train_test(x_data_five, 0)
    train_y, test_y = dataProcessor.train_test(y_data_five, 0)
    train_y = dataProcessor.one_hot_encoder(train_y)
    test_y = dataProcessor.one_hot_encoder(test_y)
    cal_x, test_x = dataProcessor.calibration(test_x)
    cal_y, test_y = dataProcessor.calibration(test_y)

    if selectmodel == 1:
        model = DNN([1000, 1000, 1000])
        model.fit(train_x, train_y, cal_x, cal_y)
        model.test(test_x, test_y)

    else:
        model = CNN([10, 10, 10])

    print(model.get_name())
Example #22
def test_read_reports():
    bug_file_path = "/home/ndg/users/carmst16/EmbeddingBugs/resources/bugreport/SWT.xlsx"
    project = "swt"
    #path_to_stackoverflow_data = "/home/ndg/users/carmst16/EmbeddingBugs/resources/stackexchangedata/swt/"
    dp = DataProcessor()

    already_processed = False
    previous_commit = None
    all_scores = []
    path_to_starter = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.swt/"
    path_to_processed = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.swt_processed/"
    path_to_temp = "/home/ndg/users/carmst16/EmbeddingBugs/resources/source_files/test/eclipse.platform.swt_temp/"
    #print dp.get_stackoverflow_data(path_to_stackoverflow_data)
    reports = dp.read_and_process_report_data(bug_file_path, project)
    print "finished processing"
    for report in reports[1:2]:
        report_text = report.processed_description
        if not already_processed:
            dp.create_file_repo(path_to_starter, report, path_to_processed)
            already_processed = True
            previous_commit = report.commit
        else:
            dp.update_file_repo(previous_commit, report.commit, path_to_starter, path_to_temp, path_to_processed)
            previous_commit = report.commit

        report_text = report.processed_description

        l_content = []
        for dir_, _, files in os.walk(path_to_processed):
            for fileName in files:
                relDir = os.path.relpath(dir_, path_to_processed)
                relFile = os.path.join(relDir, fileName)
                full_path = path_to_processed + relFile
                with open(full_path, 'r') as content_file:
                    content = content_file.readlines()
                    for line in content:
                        l = line.split(",")
                        l_content.append(l)
Example #23
    def vectorize_all(self, n, type='ngram'):
        dp_list = [
            DataProcessor().load_raw_data_single_user_segments(
                user_num, num_of_segments=150) for user_num in range(40)
        ]
        vectorizer = Vectorizer(ngram_count=n, type=type)
        pdfs = []
        for user_num in range(len(dp_list)):
            user_result = vectorizer.vectorize(dp_list[user_num],
                                               to_array=True)
            user_pdf = pd.DataFrame(user_result,
                                    columns=vectorizer.get_features())
            user_pdf['User'] = user_num
            user_pdf['Segment'] = np.arange(150)
            user_pdf['Label'] = self.get_labels_array(user_num)

            user_pdf.to_csv('outputs/Vectorizer/{}-{}-user{}.csv'.format(
                type, n, user_num))
            pdfs.append(user_pdf)
            del user_pdf
            print "Successfully vectorized user{} !".format(user_num)

        result_pdf = pd.concat(pdfs, ignore_index=True, axis=0, sort=True)
        result_pdf.to_csv('outputs/Vectorizer/all-{}-{}.csv'.format(n, type))
Example #24
def augment_task(config):

    processor = DataProcessor(config)
    train_examples = processor.get_train_examples()
    dev_examples = processor.get_dev_examples()
    print("train_examples: {}".format(len(train_examples)))
    print("dev_examples: {}".format(len(dev_examples)))

    if config.transmit_augment:
        print('starting transmit data augment.')
        train_augment = sentence_set_pair(train_examples,
                                          random_state=config.seed)
        augment_data_save(
            train_augment,
            os.path.join(config.other_data_dir,
                         config.train_augment_save_file))

        new_train_augment = copy.deepcopy(train_examples)
        new_train_augment.extend(dev_examples)
        print(len(new_train_augment))
        train_dev_augment = sentence_set_pair(new_train_augment,
                                              random_state=config.seed)
        augment_data_save(
            train_dev_augment,
            os.path.join(config.other_data_dir,
                         config.train_dev_augment_save_file))
    if config.category_augment:
        print('starting new category data augment.')
        medicine_examples = processor.get_medicine_examples()
        save_path = os.path.join(config.other_data_dir,
                                 config.category_augment_save_file)
        new_category_generate(train_examples, dev_examples, medicine_examples,
                              save_path)
    if config.chip2019_augment:
        print('starting extract chip2019 data augment.')
        chip2019_extract(config)
Example #25
    def __init__(self, config, debug=True):
        self.config = config
        self.debug = debug
        self.logger = LogAPI.create_logger(self.__class__.__name__, self.debug)

        self.data_processor = DataProcessor(config)
        self.data_processor.prepare()
        n_vocab = len(self.data_processor.vocab)
        self.model = L.Classifier(Img2Seq(n_vocab, config))
        self.model.compute_accuracy = False  # I want loss, not accuracy
        self.optimizer = chainer.optimizers.Adam()
        self.optimizer.setup(self.model)

        if self.config['use_gpu'] >= 0:
            chainer.cuda.get_device(self.config['use_gpu']).use()
            self.model.to_gpu()
            self.xp = cuda.cupy
        else:
            self.xp = np

        # dict for storing parameters and experiment results
        self.result_storage = {}
        self.result_storage["result"] = {
            "total_loss": [], "average_loss": [], "time_taken": [], "hyper_params": config}
Example #26
    def select_features(self, write=True):

        # first feature: TOP 20 common words
        # top-40 most common commands overall (we use 1-grams, but it can also work with 2/3-grams)

        # second feature: NEW USED COMMANDS
        # number of commands that didn't appear in the first 50 segments but appear in the given chunk
        # (an indication of commands that have only recently come into use)
        # the feature is 0 for the first 50 segments

        # 3rd feature: count of fake commands
        # counts of commands that are used by malicious sessions:
        # - unique commands that are used only as fake commands in the given segment.
        # - unique commands that are used only as benign commands in the given segment.
        # - number of commands from all the fake commands in the trainset for the given segment.
        # - number of commands from all the benign commands in the trainset for the given segment - NOT INCLUDED.

        # 4th feature: repeated sequences of commands
        # number of distinct repeated command sequences that appear at least 4 times (for each length)
        # why? because a legitimate user won't repeat sequences of commands.

        df = pd.DataFrame(columns=['User', 'Segment']).set_index(
            ['User', 'Segment'])
        commands = pd.Series(DataProcessor().get_all_commands_series())
        print commands.keys()
        partial_labels = self.get_partial_labels()

        bigram_list = []
        commands_list = []
        for user_cmd in commands:
            for segment_cmd in user_cmd:
                commands_list.extend(segment_cmd)
                bigram_list.extend([(segment_cmd[i], segment_cmd[i + 1])
                                    for i in range(len(segment_cmd) - 1)])

    # 1st feature
        top_commands = pd.Series(commands_list).value_counts().nlargest(
            500).index.tolist()
        top_bigrams = pd.Series(bigram_list).value_counts().nlargest(
            100).index.tolist()

        print 'top commands:'
        print top_commands

        print 'top bigrams:'
        print top_bigrams

        # preparations for 2nd feature
        distinct_first_50_commands = set()
        for user_num in commands.keys():
            for segment in commands[user_num][:50]:
                for command in segment:
                    distinct_first_50_commands.add(command)

        print 'Finished distinct_first_50_commands!'
        # preparation for 3rd feature
        malicious_commands = defaultdict(list)
        for i in range(50, 150):
            col_index = str(100 * i) + '-' + str(100 * (i + 1))
            for num_user in range(10):
                if partial_labels[col_index][num_user] == 1:
                    malicious_commands[num_user].extend(commands[num_user][i])

        malicious_commands_of_train_users_set = set()
        benign_commands_of_train_users_set = set()

        for num_user in range(10):
            malicious_commands_of_train_users_set = \
                malicious_commands_of_train_users_set.union(set(malicious_commands[num_user]))
            user = pd.Series(commands[num_user])
            for segment in user[:50]:
                benign_commands_of_train_users_set = benign_commands_of_train_users_set.union(
                    set(segment))

        commands_used_only_by_malicious_train = \
            malicious_commands_of_train_users_set - benign_commands_of_train_users_set
        commands_used_only_by_benign_train = benign_commands_of_train_users_set - malicious_commands_of_train_users_set

        print 'Finished preparing sets of benign and malicious!'

        dp_list = [
            DataProcessor().load_raw_data_single_user_segments(
                user_num, num_of_segments=150) for user_num in range(40)
        ]

        user_cmd_avg_len = [
            self.command_avg_length(dp_list[user_num])
            for user_num in range(40)
        ]
        user_diff_cmd = [
            self.diff_commands_in_seg(dp_list[user_num])
            for user_num in range(40)
        ]
        user_num_of_seq = [
            self.num_of_sequences(dp_list[user_num]) for user_num in range(40)
        ]

        print 'Finished preparing features of michal!'
        ### adding the additional features
        for user_num in commands.keys():
            for num_segment, segment in enumerate(commands[user_num]):

                #1st feature
                for top_cmd in top_commands:
                    df.loc[(user_num, num_segment),
                           top_cmd] = segment.count(top_cmd)

                string_segment = ' '.join(segment)
                for top_bigram in top_bigrams:
                    string_bigram = top_bigram[0] + ' ' + top_bigram[1]
                    df.loc[(user_num, num_segment),
                           string_bigram] = string_segment.count(string_bigram)
                # 2nd feature
                df.loc[(user_num, num_segment), 'NewUsedCommands'] = \
                    len(set(segment) - distinct_first_50_commands)

                # 3rd feature
                df.loc[(user_num, num_segment), 'UniqueMaliciousCommands'] = \
                    len( set(segment) & commands_used_only_by_malicious_train)
                df.loc[(user_num, num_segment), 'UniqueBenignCommands'] = \
                    len(set(segment) & commands_used_only_by_benign_train)
                df.loc[(user_num, num_segment), 'MaliciousCommandsCount'] = \
                    len(set(segment) & malicious_commands_of_train_users_set)
                # df.loc[(user_num, num_segment), 'BenignCommandsCount'] = \
                #    len(set(segment) & benign_commands_of_train_users_set)

                # 4th feature
                min_len = 2
                max_len = 10
                minimum_seq_count = 4
                count_dict = {c: 0 for c in range(min_len, max_len)}
                lst = segment

                for sub in self.get_list_of_sublist(lst, min_len, max_len):
                    sub_list = list(sub)

                    counts = [
                        1 if lst[i:(i + len(sub_list))] == sub_list else 0
                        for i in range(len(segment) - len(sub_list))
                    ]

                    # we need to slice the slot in the list mapped by the length of the seq to avoid overlapping seqs.
                    count_sum = sum(
                        1 for i in range(0, len(counts), len(sub_list))
                        if (sum(counts[i:i + len(sub_list)]) > 0))

                    if count_sum > minimum_seq_count:
                        count_dict[len(sub)] += 1

                for count_key, count_val in count_dict.items():
                    df.loc[(user_num, num_segment),
                           'Seq_of_commands_repeated_{}'.format(count_key
                                                                )] = count_val

                df.loc[(user_num, num_segment),
                       'Length_duplicated_command'] = max(
                           sum(1 for i in g) for k, g in groupby(segment))
                # added michal features

                df.loc[(
                    user_num, num_segment
                ), 'Num_of_sequences'] = user_num_of_seq[user_num][num_segment]
                df.loc[(user_num, num_segment),
                       'Diff_commands'] = user_diff_cmd[user_num][num_segment]
                df.loc[(user_num, num_segment),
                       'Avg_commands_length'] = user_cmd_avg_len[user_num][
                           num_segment]

                print 'Done loop: User {}, Segment {} ...'.format(
                    user_num, num_segment)

        print 'Finished loop!'
        df.fillna(0, inplace=True)

        # remove overlapping counts
        df.loc[:, 'Seq_of_commands_repeated_2'] =\
            df['Seq_of_commands_repeated_2'] - df['Seq_of_commands_repeated_3']
        df.loc[:, 'Seq_of_commands_repeated_3'] = \
            df.loc[:, 'Seq_of_commands_repeated_3'] - df['Seq_of_commands_repeated_4']
        df.loc[:, 'Seq_of_commands_repeated_4'] = \
            df['Seq_of_commands_repeated_4'] - df['Seq_of_commands_repeated_5']
        df.loc[:, 'Seq_of_commands_repeated_5'] = \
            df['Seq_of_commands_repeated_5'] - df['Seq_of_commands_repeated_6']
        df.loc[:, 'Seq_of_commands_repeated_6'] = \
            df['Seq_of_commands_repeated_6'] - df['Seq_of_commands_repeated_7']
        df.loc[:, 'Seq_of_commands_repeated_7'] = \
            df['Seq_of_commands_repeated_7'] - df['Seq_of_commands_repeated_8']
        df.loc[:, 'Seq_of_commands_repeated_8'] = \
            df['Seq_of_commands_repeated_8'] - df['Seq_of_commands_repeated_9']
        del df['Seq_of_commands_repeated_9']
        del df['Seq_of_commands_repeated_8']
        del df['Seq_of_commands_repeated_7']
        del df['Seq_of_commands_repeated_6']
        del df['Seq_of_commands_repeated_4']
        del df['Seq_of_commands_repeated_2']

        df.loc[:, 'Label'] = self.get_labels_array_all()
        print 'Before write...'
        if write:
            df.to_csv(self.feature_select_output_file)
        return df
Example #27
            f1 += metric.f1()
            precision += metric.precision()
            recall += metric.recall()
            matrices += [metric.confusion_matrix()]

        f1 /= n_folds
        precision /= n_folds
        recall /= n_folds
        matrices = np.array(matrices)

        return f1, precision, recall, matrices


if __name__ == '__main__':
    print('Random Forest')

    path = DatasetPath.MIT2

    dp = DataProcessor(path=path)

    rf = RandomForest(dp)
    rf.fit(dp.data_processed)

    row = dp.process_sensors().iloc[[0]]
    print(rf.predict(row))

    f1, precision, recall, matrices = rf.evaluate()

    print(f'F1        = {f1}')
    print(f'Precision = {precision}')
    print(f'Recall    = {recall}')
Example #28
 def __init__(self):
     self.processor = DataProcessor(file_path=USERPATH)
Example #29
from tensorflow.python.keras.layers import TextVectorization

from DataProcessor import DataProcessor
from matplotlib import pyplot as plt
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
import tensorflowjs as tfjs

# Params
BATCH_SIZE = 512  # Number of examples used in each iteration
EPOCHS = 100  # Number of passes through entire dataset
EMBEDDING = 40  # Dimension of word embedding vector

# importing the data
dir_path = 'annotated/corpus'
dataProcessor = DataProcessor(dir_path, 'tei')
sentences = dataProcessor.getListOfTuples()

word2idx = {w: i + 2 for i, w in enumerate(dataProcessor.getWords())}
word2idx['unk'] = 1
word2idx['pad'] = 0

idx2word = {i: w for w, i in word2idx.items()}

tag2idx = {t: i + 1 for i, t in enumerate(dataProcessor.getTags())}
tag2idx['pad'] = 0

idx2tag = {i: w for w, i in tag2idx.items()}

# Write dictionary
import json
Example #30
#!/bin/python

from DataProcessor import DataProcessor
from Utils import Utils
from Logger import Logger
from MongoDB import MongoDB

Utils.createFolderIfNotExists(DataProcessor.TMP_FOLDER)
LOGGER = Logger(DataProcessor.TMP_FOLDER, verbose=True, name='processor')
Utils(DataProcessor.TMP_FOLDER, LOGGER)

mongo = MongoDB('127.0.0.1', 27017, LOGGER, user='******', password='******')
mongo.startQueue(id=0)
print(mongo.getQueueConsumerId())

processor = DataProcessor(mongo, LOGGER)

processor.filterAndNormalizeFullDataset()