def parse_files_and_insert(self):
    start_calculation = timeit.default_timer()

    ## Parse spam/ham index
    spam_ham_index_file = open(self.spam_ham_index_filename, "r", encoding='iso-8859-1')
    spam_ham_index_content = spam_ham_index_file.readlines()
    spam_ham_index_file.close()
    spam_ham_index_dict = {}
    for line in spam_ham_index_content:
        spam_ham, filepath = line.split()
        spam_ham_index_dict[filepath[filepath.rfind('/') + 1:]] = spam_ham

    ## Parse all emails
    es = Es(settings.hosts, settings.index_name, settings.index_type)
    files = glob.glob(self.directory_path)
    doc_count = 0
    spam_count = 0
    ham_count = 0
    for i in range(len(files)):
        try:
            file = open(files[i], "r", encoding='iso-8859-1')
            doc_count = doc_count + 1
            filename = files[i][files[i].rfind('/') + 1:]

            ## Read the entire file content
            # content = file.read()
            # text = ''.join(BeautifulSoup(content, "html.parser").findAll(text=True))
            final_words = list()
            words = BeautifulSoup(file, 'html.parser').text.split()
            file.close()
            for word in words:
                if re.match("^[A-Za-z]*[?.!,]*$", word):
                    word = word.translate(str.maketrans(dict.fromkeys(string.punctuation)))
                    if wordnet.synsets(word):
                        final_words.append(word)
            text = ' '.join(final_words).lower()
            text = re.sub(' +', ' ', text)

            if (spam_ham_index_dict[filename] == "spam"):
                spam_count = spam_count + 1
            else:
                ham_count = ham_count + 1
            split = self.__get_train_or_test(spam_ham_index_dict[filename], spam_count, ham_count)

            if (doc_count % 1000 == 0):
                print("Inserting document {}".format(doc_count))
            result = es.insert(doc_count, filename, spam_ham_index_dict[filename], text, split)
        except UnicodeDecodeError as ue:
            print("Error: {} in reading file: {}".format(ue, files[i]))
            continue
        except ValueError as ve:
            print("Error: {}, substring not found in file: {}".format(ve, files[i]))
            continue

    stop_calculation = timeit.default_timer()
    print("Time taken to parse and insert to elasticsearch: " + str(stop_calculation - start_calculation))
    return (doc_count)
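## The Es wrapper used above is defined elsewhere and not shown here. Below is a
## minimal sketch of what its constructor, insert() and count() might look like,
## assuming an elasticsearch-py 6.x/7.x-style client. The field names ("body",
## "label", "split") and method signatures are illustrative assumptions only.
from elasticsearch import Elasticsearch

class Es():
    def __init__(self, hosts, index_name, index_type):
        self.client = Elasticsearch(hosts)
        self.index_name = index_name
        self.index_type = index_type

    def insert(self, doc_id, filename, label, text, split):
        ## Index one email document; the cleaned text goes into "body"
        doc = {"filename": filename, "label": label, "body": text, "split": split}
        return self.client.index(index=self.index_name, doc_type=self.index_type,
                                 id=doc_id, body=doc)

    def count(self):
        ## Number of documents currently in the index
        return self.client.count(index=self.index_name)["count"]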
def main():
    config = Config(BASE_DIR + '/configuration/')
    sqlite = Sqlite(BASE_DIR + '/storage/harvester.db', config.XMLconfiguration)
    pandadb = PandaDB(BASE_DIR + '/settings.ini')
    es = Es(BASE_DIR + '/settings.ini')

    metrics = pandadb.get_db_metrics()
    dictHarvesterHosts, dictHarvesterInstances = es.get_last_submittedtime()
    sqlite.instances_availability(dictHarvesterInstances, metrics)
    instances = sqlite.get_instances()

    for instance in instances:
        for harvesterhost in instances[instance]:
            if harvesterhost != 'none':
                availability = instances[instance][harvesterhost]['availability']
                notificated = instances[instance][harvesterhost]['notificated']
                contacts = instances[instance][harvesterhost]['contacts']
                text = instances[instance][harvesterhost]['errorsdesc']
                if (availability == 0 or availability == 10) and notificated == 0:
                    email = Notifications(text=text,
                                          subject='Service issues on {0} {1}'.format(instance, harvesterhost),
                                          to=contacts)
                    email.send_notification_email()
                    sqlite.update_field('notificated', 1, instance, harvesterhost)
                    email = {}
                elif availability == 100 and notificated == 1:
                    sqlite.update_field('notificated', 0, instance, harvesterhost)
                host = harvesterhost.split('.')[0]
                doc = ServiceDocument('harv_{0}_{1}'.format(instance, host),
                                      availability=availability,
                                      contact=','.join(contacts),
                                      availabilitydesc="PandaHarvester instance:{0}".format(instance),
                                      availabilityinfo="{0}".format(text))
                XSLSPublisher.send(doc)
                doc = {}
import os
import sys

from flask import request, render_template, session, jsonify
from flask import redirect, url_for, make_response
from werkzeug.utils import secure_filename
from vulscan import vulScan
from common import cleanPostData
from lib.Login import logincheck
from es import Es
from Index import app

sys.path.append(os.path.split(os.path.realpath(__file__))[0] + "/../")
from celerynode.api import api_hostScan, api_vulPoc, api_vulScript
from redispool import getStrictRedis

es = Es()
FILE_PATH = os.path.split(os.path.realpath(__file__))[0] + '/vulscan/vuldb/'


# Return the scan configuration stored in ES
@app.route('/taskconfig/all', methods=['GET', 'POST'])
def AllConfig():
    defaultConfig = es.get_scan_config()
    scanHosts = defaultConfig["scanHosts"].split(",")
    if request.json is None:
        return jsonify({
            "scanHosts": scanHosts,
            "scanPeriod": defaultConfig["scanPeriod"],
            "scanThreads": defaultConfig["scanThreads"],
            "scanCMS": defaultConfig["scanCMS"],
            "scanPorts": defaultConfig["scanPorts"]
        })
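## A hypothetical client-side call for the route above. The host and port are
## assumptions (adjust to wherever the Flask app runs); posting without a JSON
## body hits the request.json is None branch and returns the default scan config.
import requests

resp = requests.post("http://127.0.0.1:5000/taskconfig/all")
print(resp.json())  # e.g. {"scanHosts": [...], "scanPeriod": ..., ...}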
# coding: utf8
from Config import ElasticConfig
from es import Es
from vulscan import vulScan

if __name__ == '__main__':
    es = Es()
    # Initialize the periodic scan configuration
    es.init_scan_config()
    # Initialize the vultask scripts
    vulScan.init()
    print('es initialization finished')
class Logstash2CSV(object):

    def __init__(self):
        self._conf = Logstash2CSVConfig()
        self._query = Logstash2CSVQuery()
        self.set_index()

    def load_connection(self, file):
        self._conf.load_connection_file(file)

    def set_connection(self, conn):
        self._conf.load_connection(conn)

    def load_fields(self, file):
        self._conf.load_fields_file(file)

    def set_fields(self, fields):
        self._conf.load_fields(fields)

    def set_query(self, query):
        self._query.load_query(query)

    def set_output_fields(self, fields):
        self._conf.load_output_fields(fields)

    def set_range(self, term=0):
        if term == 0:
            from_datetime = datetime.today()
            ds = 30
        else:
            from_datetime = datetime.today() - timedelta(0, term)
            self._query.set_range_of_timestamp(from_datetime)
            ds = term // 3600 // 24  # whole days, so range() in set_index() accepts it
        self.set_index(ds=ds)
        return from_datetime

    def set_index(self, ds=7):
        format = self._conf.index_format()
        today = datetime.today()
        self._index = ",".join([(today - timedelta(d)).strftime(format) for d in range(ds)])

    def index(self):
        return self._index

    def query(self):
        return self._query.query()

    def connection(self):
        return self._conf.connection()

    def fields(self):
        return self._conf.fields()

    def output_fields(self):
        return self._conf.output_fields()

    def search(self):
        self._es = Es(self.connection())
        self._es.search(index=self.index(), body=self.query())

    def render_csv(self, separator=","):
        return "\n".join([separator.join(c) for c in self._generate_csv()])

    def _generate_csv(self):
        fields = self.output_fields()
        hits = self._es.hits()
        csv = [fields[:]]
        for h in hits:
            row = [str(self._get_value(h, f)) for f in fields]
            csv.append(row)
        return csv

    def _get_value(self, hit, field):
        try:
            value = hit["_source"][field]
        except:
            try:
                value = hit[field]
            except:
                value = ""
        finally:
            return value
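## A hypothetical end-to-end use of Logstash2CSV. Logstash2CSVConfig and
## Logstash2CSVQuery are not shown here, so the connection dict, the query shape
## and the field names below are assumptions for illustration only.
l2c = Logstash2CSV()
l2c.set_connection({"host": "localhost", "port": 9200})
l2c.set_output_fields(["@timestamp", "host", "message"])
l2c.set_query({"query_string": {"query": "response:500"}})
l2c.set_range(term=7 * 24 * 3600)  # look back seven days of daily indices
l2c.search()
print(l2c.render_csv())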
def __generate_feature_matrix(self):
    feature_matrix = pd.DataFrame()
    es = Es(settings.hosts, settings.index_name, settings.index_type)
    doc_count = es.count()

    ## Populate the matrix
    print("Total number of features: {}".format(len(self.spamwords)))
    for i in range(len(self.spamwords)):
        spamword = self.spamwords[i]
        result_id_list, result_score_list = es.get_documents_for_query(spamword)
        if (len(result_id_list) == 0):
            print("No documents are present for spamword {}".format(spamword))
        result_list = [0] * doc_count
        for j in range(len(result_id_list)):  # j, not i: do not clobber the outer loop index
            result_list[int(result_id_list[j]) - 1] = result_score_list[j]
        feature_matrix[spamword] = result_list

        ## Print purpose
        if (i % 10 == 0):
            print("Feature matrix for {} words done".format(i))
    print(feature_matrix.index.values)

    ## Get the train and test list
    id_list, filename_list, label_list, split_list = es.get_all_documents()
    label_list = [1 if x == "spam" else 0 for x in label_list]

    ## Divide into train-test set
    train_row_ids = []
    test_row_ids = []
    train_index_list = []
    train_labels = []
    test_index_list = []
    test_labels = []
    for i in range(len(split_list)):
        if (split_list[i] == 'train'):
            train_row_ids.append(int(id_list[i]) - 1)
            train_index_list.append(filename_list[i])
            train_labels.append(label_list[i])
        else:
            test_row_ids.append(int(id_list[i]) - 1)
            test_index_list.append(filename_list[i])
            test_labels.append(label_list[i])
    print("Number of documents in train: {}".format(len(train_row_ids)))
    print("Number of documents in test: {}".format(len(test_row_ids)))

    ## Divide the feature_matrix into train and test
    train_feature_matrix = feature_matrix.loc[train_row_ids, :]
    train_feature_matrix.index = train_index_list
    test_feature_matrix = feature_matrix.loc[test_row_ids, :]
    test_feature_matrix.index = test_index_list
    print("Shape of train feature matrix: {}".format(train_feature_matrix.shape))
    print("Shape of test feature matrix: {}".format(test_feature_matrix.shape))

    return (train_feature_matrix, train_labels, test_feature_matrix, test_labels,
            train_row_ids, test_row_ids)
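## es.get_documents_for_query() belongs to the Es wrapper and is not shown. A
## plausible standalone sketch using elasticsearch-py, assuming the cleaned email
## text is indexed in a "body" field; the field name and result size are assumptions.
def get_documents_for_query(client, index_name, query_word):
    ## Full-text match against the email body; returns parallel lists of
    ## matching document ids and their relevance scores.
    result = client.search(index=index_name,
                           body={"query": {"match": {"body": query_word}}},
                           size=10000)
    hits = result["hits"]["hits"]
    ids = [hit["_id"] for hit in hits]
    scores = [hit["_score"] for hit in hits]
    return (ids, scores)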
class Spam_Classifier_Unigrams():
    es = None
    unigrams_dict = {}

    def __init__(self):
        print("In the constructor")
        self.es = Es(settings.hosts, settings.index_name, settings.index_type)

    def __load_unigram(self, create_unigrams_dict):
        ## Save the unigrams_dict
        if (create_unigrams_dict == True):
            self.unigrams_dict = self.es.get_all_vocabulary()
            util.dump_pickle_file(settings.unigram_filename, self.unigrams_dict)
        else:
            self.unigrams_dict = util.load_pickle_file(settings.unigram_filename)

    def __generate_feature_matrix(self, label_dict):
        start_calculation = timeit.default_timer()
        feature_sparse_matrix = dok_matrix((len(label_dict), len(self.unigrams_dict)), dtype=np.float32)
        count = 0
        ## Loop over all the documents
        for id in label_dict.keys():
            if (count % 1000 == 0):
                print("Generating features for {} documents done".format(count))
            doc_term_vector = self.es.term_vectors(id)
            if "body" in doc_term_vector["term_vectors"] and "terms" in doc_term_vector["term_vectors"]["body"]:
                tokens = doc_term_vector["term_vectors"]["body"]["terms"].keys()
                for token in tokens:
                    ## Get the token number
                    token_number = self.unigrams_dict[token]
                    tf = doc_term_vector["term_vectors"]["body"]["terms"][token]["term_freq"]
                    feature_sparse_matrix[count, token_number - 1] = tf
            count = count + 1
        stop_calculation = timeit.default_timer()
        print("Time taken to generate feature matrix: " + str(stop_calculation - start_calculation))
        return (feature_sparse_matrix)

    def __get_feature_matrix(self):
        start_calculation = timeit.default_timer()

        ## Label dict
        label_train_dict = {}
        label_test_dict = {}

        ## Get all documents and get the train and test list
        id_list, filename_list, label_list, split_list = self.es.get_all_documents()
        label_list = [1 if x == "spam" else 0 for x in label_list]
        for i in range(len(split_list)):
            if (split_list[i] == 'train'):
                label_train_dict[id_list[i]] = label_list[i]
            else:
                label_test_dict[id_list[i]] = label_list[i]

        print("Generating feature matrix for train documents")
        feature_sparse_train_matrix = self.__generate_feature_matrix(label_train_dict)
        print("Generating feature matrix for test documents")
        feature_sparse_test_matrix = self.__generate_feature_matrix(label_test_dict)

        train_csr = feature_sparse_train_matrix.tocsr()
        train_labels = list(label_train_dict.values())
        test_csr = feature_sparse_test_matrix.tocsr()
        test_labels = list(label_test_dict.values())

        stop_calculation = timeit.default_timer()
        print("Time taken to generate feature matrix: " + str(stop_calculation - start_calculation))
        return (train_csr, train_labels, test_csr, test_labels)

    def __classification(self, train_df, train_labels, test_df, test_labels):
        logistic_regression_accuracy = ml_models.logistic_regression(train_df, train_labels, test_df, test_labels)
        print("Logistic Regression Accuracy {}".format(logistic_regression_accuracy))
        decision_tree_accuracy = ml_models.decision_tree(train_df, train_labels, test_df, test_labels)
        print("Decision Tree Accuracy {}".format(decision_tree_accuracy))
        accuracy = max(logistic_regression_accuracy, decision_tree_accuracy)
        return (accuracy)

    def run(self, create_unigrams_dict=False):
        start_calculation = timeit.default_timer()
        ## Load unigram
        self.__load_unigram(create_unigrams_dict)
        ## Generate feature matrix
        train_csr, train_labels, test_csr, test_labels = self.__get_feature_matrix()
        ## Run the classification algorithm
        accuracy = self.__classification(train_csr, train_labels, test_csr, test_labels)
        print("Accuracy is {}".format(accuracy))
        stop_calculation = timeit.default_timer()
        print("Time taken to run spam classifier unigrams: " + str(stop_calculation - start_calculation))
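## ml_models is not shown here. A minimal sketch of the two helpers used by
## __classification(), assuming scikit-learn; the exact estimators and
## hyper-parameters of the original module are assumptions.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

def logistic_regression(train_X, train_labels, test_X, test_labels):
    ## Fit on the training split and report accuracy on the held-out test split
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_X, train_labels)
    return clf.score(test_X, test_labels)

def decision_tree(train_X, train_labels, test_X, test_labels):
    clf = DecisionTreeClassifier()
    clf.fit(train_X, train_labels)
    return clf.score(test_X, test_labels)

## Assumed entry point: build the vocabulary once, then classify.
# Spam_Classifier_Unigrams().run(create_unigrams_dict=True)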