Example #1
    def parse_files_and_insert(self):
        start_calculation = timeit.default_timer()
        ## Parse spam/ham index
        spam_ham_index_file = open(self.spam_ham_index_filename, "r", encoding='iso-8859-1')
        spam_ham_index_content = spam_ham_index_file.readlines()
        spam_ham_index_dict = {}
        for line in spam_ham_index_content:
            spam_ham, filepath = line.split()
            spam_ham_index_dict[filepath[filepath.rfind('/')+1: ]] = spam_ham

        ## Parse all emails
        es = Es(settings.hosts, settings.index_name, settings.index_type)

        files = glob.glob(self.directory_path)
        doc_count = 0
        spam_count = 0
        ham_count = 0
        for i in range(len(files)):
            try:
                file = open(files[i], "r", encoding='iso-8859-1')

                doc_count = doc_count + 1
                filename = files[i][files[i].rfind('/')+1:]

                ## Read the entire file content
                # content = file.read()
                # text = ''.join(BeautifulSoup(content, "html.parser").findAll(text=True))

                final_words = list()
                words = BeautifulSoup(file, 'html.parser').text.split()
                file.close()
                for word in words:
                    if re.match("^[A-Za-z]*[?.!,]*$", word):
                        word = word.translate(str.maketrans(dict.fromkeys(string.punctuation)))
                        if wordnet.synsets(word):
                            final_words.append(word)
                text = ' '.join(final_words).lower()
                text = re.sub(' +', ' ', text)

                if(spam_ham_index_dict[filename] == "spam"):
                    spam_count = spam_count + 1
                else:
                    ham_count = ham_count + 1

                split = self.__get_train_or_test(spam_ham_index_dict[filename], spam_count, ham_count)

                if doc_count % 1000 == 0:
                    print("Inserting document {}".format(doc_count))

                result = es.insert(doc_count, filename, spam_ham_index_dict[filename], text, split)

            except UnicodeDecodeError as ue:
                print("Error: {} in reading file: {}".format(ue, files[i]))
                continue
            except ValueError as ve:
                print("Error: {}, substring not found in file: {}".format(ve, files[i]))
                continue

        stop_calculation = timeit.default_timer()
        print("Time taken to parse and insert to elasticsearch: " + str(stop_calculation - start_calculation))
        return (doc_count)
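
The `Es` helper that this example (and the spam-classifier example further down) instantiates with `settings.hosts`, `settings.index_name`, and `settings.index_type` is not part of the listing. Below is a minimal sketch of what its constructor and `insert()` could look like on top of the official elasticsearch-py client (pre-7.x signature, where `doc_type` is still accepted); the wrapper body and the document field names are assumptions inferred from the call sites.

from elasticsearch import Elasticsearch  # official low-level client


class Es:
    """Hypothetical sketch of the wrapper used above; not shown in the listing."""

    def __init__(self, hosts, index_name, index_type):
        self.client = Elasticsearch(hosts)
        self.index_name = index_name
        self.index_type = index_type

    def insert(self, doc_id, filename, label, text, split):
        # Index one e-mail as a single document; the field names are illustrative assumptions.
        body = {"filename": filename, "label": label, "body": text, "split": split}
        return self.client.index(index=self.index_name, doc_type=self.index_type,
                                 id=doc_id, body=body)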
Example #2
def main():

    config = Config(BASE_DIR + '/configuration/')
    sqlite = Sqlite(BASE_DIR + '/storage/harvester.db', config.XMLconfiguration)
    pandadb = PandaDB(BASE_DIR + '/settings.ini')
    es = Es(BASE_DIR + '/settings.ini')

    metrics = pandadb.get_db_metrics()
    dictHarvesterHosts, dictHarvesterInstances = es.get_last_submittedtime()
    sqlite.instances_availability(dictHarvesterInstances, metrics)

    instances = sqlite.get_instances()

    for instance in instances:
        for harvesterhost in instances[instance]:
            if harvesterhost != 'none':
                availability = instances[instance][harvesterhost]['availability']
                notificated = instances[instance][harvesterhost]['notificated']
                contacts = instances[instance][harvesterhost]['contacts']
                text = instances[instance][harvesterhost]['errorsdesc']
                if (availability == 0 or availability == 10) and notificated == 0:
                    email = Notifications(text=text,
                                          subject='Service issues on {0} {1}'.format(instance, harvesterhost),
                                          to=contacts)
                    email.send_notification_email()
                    sqlite.update_field('notificated', 1, instance, harvesterhost)
                    email = {}
                elif availability == 100 and notificated == 1:
                    sqlite.update_field('notificated', 0, instance, harvesterhost)
                host = harvesterhost.split('.')[0]
                doc = ServiceDocument(
                    'harv_{0}_{1}'.format(instance, host),
                    availability=availability,
                    contact=','.join(contacts),
                    availabilitydesc="PandaHarvester instance:{0}".format(instance),
                    availabilityinfo="{0}".format(text))
                XSLSPublisher.send(doc)
                doc = {}
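
The alerting logic above sends one e-mail when a host's availability drops to 0 or 10 and clears the flag once it returns to 100. A condensed sketch of that state machine follows; the meaning of the availability codes is an assumption inferred from the comparisons in main().

def next_notification_action(availability, notificated):
    """Return what main() would do for one harvester host.

    Sketch only; assumed codes: 0/10 = degraded or down, 100 = healthy.
    """
    if availability in (0, 10) and notificated == 0:
        return "send_email_and_set_flag"
    if availability == 100 and notificated == 1:
        return "clear_flag"
    return "no_action"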
Example #3
import os
import sys

from flask import request, render_template, session, jsonify
from flask import redirect, url_for, make_response
from werkzeug.utils import secure_filename

from vulscan import vulScan
from common import cleanPostData
from lib.Login import logincheck
from es import Es

from Index import app

sys.path.append(os.path.split(os.path.realpath(__file__))[0] + "/../")
from celerynode.api import api_hostScan, api_vulPoc, api_vulScript
from redispool import getStrictRedis

es = Es()
FILE_PATH = os.path.split(os.path.realpath(__file__))[0] + '/vulscan/vuldb/'


# Return the scan configuration stored in es
@app.route('/taskconfig/all', methods=['get', 'post'])
def AllConfig():
    defaultConfig = es.get_scan_config()
    scanHosts = defaultConfig["scanHosts"].split(",")
    if request.json is None:
        return jsonify({
            "scanHosts": scanHosts,
            "scanPeriod": defaultConfig["scanPeriod"],
            "scanThreads": defaultConfig["scanThreads"],
            "scanCMS": defaultConfig["scanCMS"],
            "scanPorts": defaultConfig["scanPorts"]
Example #4
#coding: utf8
from Config import ElasticConfig
from es import Es
from vulscan import vulScan

if __name__ == '__main__':
    es = Es()
    # Initialize the periodic scan configuration
    es.init_scan_config()
    # Initialize the vultask scripts
    vulScan.init()
    print 'es initialization complete'
Example #5
class Logstash2CSV(object):
    def __init__(self):
        self._conf = Logstash2CSVConfig()
        self._query = Logstash2CSVQuery()
        self.set_index()

    def load_connection(self, file):
        self._conf.load_connection_file(file)

    def set_connection(self, conn):
        self._conf.load_connection(conn)

    def load_fields(self, file):
        self._conf.load_fields_file(file)

    def set_fields(self, fields):
        self._conf.load_fields(fields)

    def set_query(self, query):
        self._query.load_query(query)

    def set_output_fields(self, fields):
        self._conf.load_output_fields(fields)

    def set_range(self, term=0):
        if term == 0:
            from_datetime = datetime.today()
            ds = 30
        else:
            from_datetime = datetime.today() - timedelta(0, term)
            self._query.set_range_of_timestamp(from_datetime)
            ds = term // 3600 // 24  # term is given in seconds; keep an integer day count for set_index

        self.set_index(ds=ds)
        return from_datetime

    def set_index(self, ds=7):
        index_format = self._conf.index_format()
        today = datetime.today()
        self._index = ",".join([(today - timedelta(d)).strftime(index_format)
                                for d in range(ds)])

    def index(self):
        return self._index

    def query(self):
        return self._query.query()

    def connection(self):
        return self._conf.connection()

    def fields(self):
        return self._conf.fields()

    def output_fields(self):
        return self._conf.output_fields()

    def search(self):
        self._es = Es(self.connection())
        self._es.search(index=self.index(), body=self.query())

    def render_csv(self, separator=","):
        return "\n".join([separator.join(c) for c in self._generate_csv()])

    def _generate_csv(self):
        fields = self.output_fields()
        hits = self._es.hits()
        csv = [fields[:]]
        for h in hits:
            row = [str(self._get_value(h, f)) for f in fields]
            csv.append(row)

        return csv

    def _get_value(self, hit, field):
        # Prefer the field inside _source, fall back to the top-level hit, else return an empty string.
        try:
            return hit["_source"][field]
        except (KeyError, TypeError):
            try:
                return hit[field]
            except (KeyError, TypeError):
                return ""
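
A hedged usage sketch for Logstash2CSV: the connection file name, query shape, and field names below are illustrative assumptions, not part of the original snippet. Note that set_index expands the configured index-name pattern into a comma-separated list of daily indices (e.g. logstash-2020.01.07,logstash-2020.01.06,...), which is what search() queries.

l2c = Logstash2CSV()
l2c.load_connection("connection.json")            # Elasticsearch connection settings (assumed file)
l2c.set_output_fields(["@timestamp", "host", "message"])
l2c.set_query({"query": {"match_all": {}}})       # query body shape is an assumption
l2c.set_range(term=3600 * 24 * 7)                 # last 7 days, given in seconds
l2c.search()
print(l2c.render_csv())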
Example #6
    def __generate_feature_matrix(self):

        feature_matrix = pd.DataFrame()

        es = Es(settings.hosts, settings.index_name, settings.index_type)
        doc_count = es.count()

        ## Populate the matrix
        print("Total number of features: {}".format(len(self.spamwords)))
        for i in range(len(self.spamwords)):
            spamword = self.spamwords[i]
            result_id_list, result_score_list = es.get_documents_for_query(
                spamword)

            if (len(result_id_list) == 0):
                print("No documents are present for spamword {}".format(
                    spamword))

            result_list = [0] * doc_count
            for j in range(len(result_id_list)):
                result_list[int(result_id_list[j]) - 1] = result_score_list[j]

            feature_matrix[spamword] = result_list

            ## Print purpose
            if (i % 10 == 0):
                print("Feature matrix for {} words done".format(i))

        print(feature_matrix.index.values)
        ## Get the train and test list
        id_list, filename_list, label_list, split_list = es.get_all_documents()
        label_list = [1 if x == "spam" else 0 for x in label_list]

        ## Divide into train-test set
        train_row_ids = []
        test_row_ids = []
        train_index_list = []
        train_labels = []
        test_index_list = []
        test_labels = []
        for i in range(len(split_list)):
            if (split_list[i] == 'train'):
                train_row_ids.append(int(id_list[i]) - 1)
                train_index_list.append(filename_list[i])
                train_labels.append(label_list[i])
            else:
                test_row_ids.append(int(id_list[i]) - 1)
                test_index_list.append(filename_list[i])
                test_labels.append(label_list[i])

        print("Number of documents in train: {}".format(len(train_row_ids)))
        print("Number of documents in test: {}".format(len(test_row_ids)))

        ## Divide the feature_matrix into train and test
        train_feature_matrix = feature_matrix.loc[train_row_ids, :]
        train_feature_matrix.index = train_index_list
        test_feature_matrix = feature_matrix.loc[test_row_ids, :]
        test_feature_matrix.index = test_index_list

        print("Shape of train feature matrix: {}".format(
            train_feature_matrix.shape))
        print("Shape of test feature matrix: {}".format(
            test_feature_matrix.shape))

        return (train_feature_matrix, train_labels, test_feature_matrix,
                test_labels, train_row_ids, test_row_ids)
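
The method above depends on `Es.get_documents_for_query(spamword)` returning parallel lists of matching document ids and relevance scores. A hedged sketch of how such a lookup could be written with elasticsearch-py is shown here; the query shape and the `body` field name are assumptions.

from elasticsearch import Elasticsearch


def get_documents_for_query(client, index_name, word, size=10000):
    """Sketch: match `word` against the indexed e-mail body and return (ids, scores)."""
    res = client.search(index=index_name,
                        body={"query": {"match": {"body": word}}, "size": size})
    hits = res["hits"]["hits"]
    return [h["_id"] for h in hits], [h["_score"] for h in hits]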
class Spam_Classifier_Unigrams():

    es = None

    unigrams_dict = {}

    def __init__(self):
        print(" In the constructor")
        self.es = Es(settings.hosts, settings.index_name, settings.index_type)

    def __load_unigram(self, create_unigrams_dict):
        ## Save the unigrams_dict
        if create_unigrams_dict:
            self.unigrams_dict = self.es.get_all_vocabulary()
            util.dump_pickle_file(settings.unigram_filename,
                                  self.unigrams_dict)
        else:
            self.unigrams_dict = util.load_pickle_file(
                settings.unigram_filename)

    def __generate_feature_matrix(self, label_dict):
        start_calculation = timeit.default_timer()
        feature_sparse_matrix = dok_matrix(
            (len(label_dict), len(self.unigrams_dict)), dtype=np.float32)
        count = 0
        ## Loop over all the documents
        for id in label_dict.keys():
            if (count % 1000 == 0):
                print(
                    "Generating features for {} documents done".format(count))
            doc_term_vector = self.es.term_vectors(id)
            if "body" in doc_term_vector[
                    "term_vectors"] and "terms" in doc_term_vector[
                        "term_vectors"]["body"]:
                tokens = doc_term_vector["term_vectors"]["body"]["terms"].keys(
                )
                for token in tokens:
                    ## Get the token number
                    token_number = self.unigrams_dict[token]
                    tf = doc_term_vector["term_vectors"]["body"]["terms"][
                        token]["term_freq"]
                    feature_sparse_matrix[count, token_number - 1] = tf
            count = count + 1
        stop_calculation = timeit.default_timer()
        print("Time taken to generate feature matrix: " +
              str(stop_calculation - start_calculation))
        return (feature_sparse_matrix)

    def __get_feature_matrix(self):
        start_calculation = timeit.default_timer()

        ## Label dict
        label_train_dict = {}
        label_test_dict = {}

        ## Get all documents and get the train and test list
        id_list, filename_list, label_list, split_list = self.es.get_all_documents()
        label_list = [1 if x == "spam" else 0 for x in label_list]

        for i in range(len(split_list)):
            if (split_list[i] == 'train'):
                label_train_dict[id_list[i]] = label_list[i]
            else:
                label_test_dict[id_list[i]] = label_list[i]

        print("Generating feature matrix for train documents")
        feature_sparse_train_matrix = self.__generate_feature_matrix(
            label_train_dict)
        print("Generating feature matrix for test documents")
        feature_sparse_test_matrix = self.__generate_feature_matrix(
            label_test_dict)

        train_csr = feature_sparse_train_matrix.tocsr()
        train_labels = list(label_train_dict.values())
        test_csr = feature_sparse_test_matrix.tocsr()
        test_labels = list(label_test_dict.values())

        stop_calculation = timeit.default_timer()
        print("Time taken to generate feature matrix: " +
              str(stop_calculation - start_calculation))

        return (train_csr, train_labels, test_csr, test_labels)

    def __classification(self, train_df, train_labels, test_df, test_labels):
        logistic_regression_accuracy = ml_models.logistic_regression(
            train_df, train_labels, test_df, test_labels)
        print("Logistic Regression Accuracy {}".format(
            logistic_regression_accuracy))
        decision_tree_accuracy = ml_models.decision_tree(
            train_df, train_labels, test_df, test_labels)
        print("Decision Tree Accuracy {}".format(decision_tree_accuracy))
        accuracy = max(logistic_regression_accuracy, decision_tree_accuracy)
        return (accuracy)

    def run(self, create_unigrams_dict=False):
        start_calculation = timeit.default_timer()

        ## Load unigram
        self.__load_unigram(create_unigrams_dict)

        ## Generate feature matrix
        train_csr, train_labels, test_csr, test_labels = self.__get_feature_matrix()

        ## Run the classification algorithm
        accuracy = self.__classification(train_csr, train_labels, test_csr,
                                         test_labels)

        print("Accuracy is {}".format(accuracy))

        stop_calculation = timeit.default_timer()
        print("Time taken to run spam classifier userwords: " +
              str(stop_calculation - start_calculation))
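
A hedged usage sketch for the classifier; `settings`, `util`, `ml_models`, and the `Es` wrapper are project modules assumed to be importable alongside the class above.

if __name__ == "__main__":
    classifier = Spam_Classifier_Unigrams()
    # Pass True on the first run to build and cache the unigram vocabulary;
    # later runs can reload it from the pickle written by __load_unigram.
    classifier.run(create_unigrams_dict=True)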