def run(self):
    step = 1
    util = JobConfig(0, 0, None, None)
    best_util = JobConfig(0, 0, None, None)
    state = self.init_state()

    if self.check_constraint:
        while self.check_constraint(state):
            state = self.init_state()

    while step < self.max_step and util.get_util() < self.max_util:
        temperature = self.find_temperature(step)
        new_state = self.generate_neighbor(state)

        if self.check_constraint:
            while self.check_constraint(new_state):
                new_state = self.generate_neighbor(new_state)

        new_util = self.compute_util(new_state)

        # Utilization = 0 -> very undesirable -> reset
        if new_util.get_util() == 0:
            state = self.init_state()
        elif self.transition(util, new_util, temperature) >= random():
            state = new_state
            util = new_util

            if new_util.compare(best_util):
                best_util = new_util

        step += 1

    return best_util
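The `transition` and `find_temperature` helpers are not shown in this listing; `run()` only requires that `transition(util, new_util, temperature)` return an acceptance probability to compare against `random()`. A conventional Metropolis-style pairing that is consistent with that loop is sketched below as an assumption, not the repository's confirmed implementation.

import math

# Hedged sketch only: the real find_temperature()/transition() live elsewhere
# in this class; this shows one common choice compatible with run().
def find_temperature(self, step):
    # Geometric cooling: the temperature decays as the step count grows.
    return self.max_util * (0.95 ** step)

def transition(self, util, new_util, temperature):
    delta = new_util.get_util() - util.get_util()
    if delta >= 0:
        return 1.0      # always accept an improvement
    if temperature <= 0:
        return 0.0      # frozen: reject every regression
    # Worse states are accepted with probability exp(delta / T),
    # which run() compares against random().
    return math.exp(delta / temperature)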
def routing_compute_util(self, state):
    self.add_previous_jobs()
    cloned_links = self.graph.copy_links()
    valid = True
    paths_used = []
    util = 0
    job_config = JobConfig()

    for p in state:
        bw, links = p[0], p[1]

        for l in range(len(links) - 1):
            link_id = Link.get_id(links[l], links[l + 1])
            link = cloned_links[link_id]
            link_bandwidth = link.get_bandwidth()

            if link_bandwidth < bw:
                valid = False
                break

            link.set_bandwidth(link_bandwidth - bw)

        if not valid:
            # logging.debug(str(state) + " cannot be built")
            break
        else:
            paths_used.append(p)

    if valid:
        self.graph.set_links(cloned_links)

        all_paths_used = deepcopy(paths_used)
        for job in self.jobs_config.values():
            all_paths_used.extend(job.get_used_paths())

        self.graph.set_flow(all_paths_used)

        util = 0
        total_util = 0

        for p in paths_used:
            flow = self.graph.get_flow(Flow.get_id(p[1][0], p[1][-1]))
            util += (flow.get_requested_bandwidth() + flow.get_effective_bandwidth())

        for p in all_paths_used:
            flow = self.graph.get_flow(Flow.get_id(p[1][0], p[1][-1]))
            total_util += (flow.get_requested_bandwidth() + flow.get_effective_bandwidth())

        job_config = JobConfig(util, total_util, copy_links(cloned_links), paths_used)

    self.reset()
    return job_config
def test_case_save_resolved_ipdata(self):
    """Test the process of ip resolution and file placement in S3."""
    config = JobConfig().getconfig()
    utility.check_s3path(config)
    ipResolver = IPResolver()
    panoplyreader = PanoplyImport()
    datawriter = S3Writer()
    ipdata_geoservice = IPDataService()
    query_panoply = config["panoplydatabase"]["readQuery"]

    for dataframe_ip_address in panoplyreader.getbatch_pandas_dataframe(query_panoply):
        dataframes = utility.split_dataframe(dataframe_ip_address)
        processNo = 0
        processList = []

        for frame in enumerate(dataframes):
            processNo = processNo + 1
            process_ipresolve = processes.Process(
                target=ipResolver.resolve_ipaddress,
                args=(frame[1], ipdata_geoservice, datawriter, processNo))
            processList.append(process_ipresolve)
            process_ipresolve.start()
            logger.info('processNo-' + str(process_ipresolve.pid))

        for p in processList:
            p.join()

    for i in range(1, config['processConfig']['noOfParallelProcess']):
        s3file_url = datawriter.getfileurl(i)
        self.assertTrue(datawriter.fileWriter.exists(s3file_url))

    print('Ran test_case_save_resolved_ipdata test case')
def __init__(self): """Initialize panoply connection.""" self.config = JobConfig().getconfig() self.logger = utility.getlogger('ip_resolution', 'ip_resolution') self.filePath = self.config['storageDetails']['filePath'] self.fileName = self.config['storageDetails']['fileName'] self.fileExtension = self.config['storageDetails']['fileExtension'] self.directoryName = date.today().strftime("%m/%d/%y").replace("/", "_") self.awsKey = self.config['storageDetails']['awsKey'] self.secretKey = self.config['storageDetails']['secretKey'] self.s3file_url = 's3://' + str(self.filePath) + '/' + str(self.directoryName) + '/' + str(self.fileName) self.tableName = self.config['storageDetails']['tableName'] self.region = self.config['storageDetails']['region'] username = self.config['panoplydatabase']['user'] password = self.config['panoplydatabase']['password'] db = self.config['panoplydatabase']['database'] host = self.config['panoplydatabase']['host'] port = self.config['panoplydatabase']['port'] self.connection = psycopg2.connect(user=username, password=password, host=host, port=port, database=db) self.write_command = """BEGIN; truncate """ + self.tableName + """ ; copy """ + self.tableName + """ from '""" + self.s3file_url + """' access_key_id '""" + self.awsKey + """' secret_access_key '""" + self.secretKey + """' region '""" + self.region + """' ignoreheader 1 null as 'NA' removequotes delimiter ','; COMMIT;""" self.append_command = """BEGIN; copy """ + self.tableName + """ from '""" + self.s3file_url + """' access_key_id '""" + self.awsKey + """' secret_access_key '""" + self.secretKey + """' region '""" + self.region + """'
def __init__(self, processNo=0):
    """Initialize IP Data API object connection."""
    self.config = JobConfig().getconfig()
    self.processNo = processNo
    self.logger = utility.getlogger('ip_resolution', 'ip_resolution')
    url = self.config['geoservice']['url']
    apikey = self.config['geoservice']['apikey']
    self.connection_url = url.replace("userkey", apikey)
def compute_route(self):
    util = JobConfig()
    chosen_paths = []

    if self.build_paths():
        for path in self.valid_paths.values():
            chosen_paths.append(choice(path))

        util = self.routing_compute_util(chosen_paths)

    return util
def test_case_panoplyImport_dataframes(self):
    """Test the import of dataframe from Panoply."""
    connector = PanoplyImport()
    config = JobConfig().getconfig()
    query_panoply = 'select distinct ipaddress from activity_us (nolock) where ipaddress is not null limit 40'

    for panda_df in connector.getbatch_pandas_dataframe(query_panoply):
        self.assertTrue(
            panda_df.shape[0] <= config['panoplydatabase']['chunksize']
            and (panda_df['ipaddress'].iloc[0] is not None))

    print('Ran test_case_panoplyImport_dataframes test case')
def split_dataframe(dataframe):
    """Split a Pandas dataframe into consecutive row blocks."""
    size = dataframe.shape[0]
    config = JobConfig().getconfig()
    # Guard against a zero slice step when the frame has fewer rows
    # than the configured number of parallel processes.
    max_rows = max(size // config['processConfig']['noOfParallelProcess'], 1)
    list_df = [
        dataframe[i:i + max_rows]
        for i in range(0, dataframe.shape[0], max_rows)
    ]
    return list_df
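For illustration, the slicing above partitions a frame into consecutive row blocks. A standalone sketch with a toy frame and a hard-coded process count (hypothetical values, not read from the job config) shows the resulting chunk sizes:

import pandas as pd

# Toy illustration of the same slicing scheme, with the process count
# hard-coded instead of read from JobConfig().getconfig().
df = pd.DataFrame({'ipaddress': ['10.0.0.%d' % i for i in range(10)]})
no_of_parallel_process = 4
max_rows = max(df.shape[0] // no_of_parallel_process, 1)   # 10 // 4 = 2
chunks = [df[i:i + max_rows] for i in range(0, df.shape[0], max_rows)]
print([len(c) for c in chunks])   # [2, 2, 2, 2, 2] -> five chunks of two rows each

Note that integer division can produce more chunks than noOfParallelProcess (five chunks for four processes here), which is why the callers track their own processNo counter and start one process per chunk rather than one per configured slot.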
def __init__(self): """Initialize connection to S3.""" self.config = JobConfig().getconfig() self.logger = utility.getlogger('ip_resolution', 'ip_resolution') self.filePath = self.config['storageDetails']['filePath'] self.fileName = self.config['storageDetails']['fileName'] self.fileExtension = self.config['storageDetails']['fileExtension'] self.directoryName = date.today().strftime("%m/%d/%y").replace("/", "_") self.awsKey = self.config['storageDetails']['awsKey'] self.secretKey = self.config['storageDetails']['secretKey'] self.s3file_url = 's3://' + str(self.filePath) + '/' + str(self.directoryName) \ + '/' + str(self.fileName) + str(self.fileExtension) self.fileWriter = s3fs.S3FileSystem(self.awsKey, self.secretKey)
def splitdataframe1(dataframe):
    """Alternative approach, kept for reference, but not correct."""
    size = dataframe.shape[0]
    config = JobConfig().getconfig()
    max_rows = size // config['processConfig']['noOfParallelProcess']
    print(max_rows)
    dataframes = []

    while size > max_rows:
        top = dataframe.loc[0:max_rows - 1]
        dataframes.append(top)
        dataframe = dataframe.loc[max_rows:size - 1]
        size = dataframe.shape[0]
    else:
        dataframes.append(dataframe)

    return dataframes
def compute_route(self):
    util = JobConfig()
    max_step = self.max_step
    max_util = self.num_mappers * self.num_reducers * self.bandwidth

    if self.build_paths():
        # Executing simulated annealing for map-reduce routing
        simulated_annealing = SimulatedAnnealing(max_util,
                                                 max_step,
                                                 self.routing_init_state,
                                                 self.routing_generate_neighbor,
                                                 self.routing_compute_util)
        util = simulated_annealing.run()
        # print "util: ", util.get_util()

    return util
def _execute_job(self):
    available_hosts = [h for h in self.graph.get_hosts() if h.is_free()]
    hosts = []
    util = JobConfig()

    # There are enough nodes to run the job
    if len(available_hosts) > (self.num_mappers + self.num_reducers):
        for i in range(self.num_mappers + self.num_reducers):
            host_to_add = choice(available_hosts)

            while host_to_add in hosts:
                host_to_add = choice(available_hosts)

            hosts.append(host_to_add)

        util = self.placement_compute_util(hosts)

    return util
def compute_route(self):
    util = JobConfig()

    if self.build_paths():
        max_step = self.max_step
        max_util = self.cur_demand.get_net()

        # Executing simulated annealing for map-reduce routing
        simulated_annealing = SimulatedAnnealing(max_util,
                                                 max_step,
                                                 self.routing_init_state,
                                                 self.routing_generate_neighbor,
                                                 self.routing_compute_util,
                                                 self.check_constraint)
        util = simulated_annealing.run()
        # print "util: ", util.get_util()

    return util
def test_case_ipresolution_pipeline(self):
    """Test the complete flow of ip resolution and result load in panoply."""
    config = JobConfig().getconfig()
    utility.check_s3path(config)
    ip_resolver = IPResolver()
    panoplyreader = PanoplyImport()
    datawriter = S3Writer()
    ipdata_geoservice = IPDataService()

    if ResolveIp.process_chunks(config, ip_resolver, ipdata_geoservice,
                                panoplyreader, datawriter):
        panoplywriter = PanoplyWriter()
        panoplywriter.save_data()

    query_panoply = 'select max(createdAt) as created from test_sp_ipaddress_parsed_us (nolock) limit 1'
    df = panoplyreader.get_pandas_dataframe(query_panoply)
    now = datetime.now()
    dt_string = now.strftime("%d/%m/%Y")
    df_string = df['created'].iloc[0]
    self.assertTrue(df_string[0:10] == dt_string)
    print('Ran test_case_ipresolution_pipeline test case')
def __init__(self): """Initialize Panoply connection.""" self.config = JobConfig().getconfig() self.logger = utility.getlogger('ip_resolution', 'ip_resolution') username = self.config['panoplydatabase']['user'] password = self.config['panoplydatabase']['password'] db = self.config['panoplydatabase']['database'] host = self.config['panoplydatabase']['host'] port = self.config['panoplydatabase']['port'] self.connection_url = 'postgresql://' + str(username) + ':' + str( password) + '@' + str(host) + ':' + str(port) + '/' + str(db) self.readQuery = self.config['panoplydatabase']['readQuery'] self.chunksize = self.config['panoplydatabase']['chunksize'] try: self.connection_panoply = create_engine(self.connection_url, echo=False) self.logger.info('Initialized Panoply connection') except Exception as ex: self.logger.info('Issue with panoply connection:' + str(ex)) self.logger.error(utility.print_exception())
def test_case_s3_writer(self):
    """Test file write to S3 bucket."""
    config = JobConfig().getconfig()
    utility.check_s3path(config)
    panoplyreader = PanoplyImport()
    datawriter = S3Writer()
    # print(dataframe_results)
    query_panoply = 'select distinct ipaddress from activity_us (nolock) where ipaddress is not null limit 10'
    dataframe_results = panoplyreader.get_pandas_dataframe(query_panoply)
    datawriter.append_data(dataframe_results, 0)
    datawriter = S3Writer()
    s3file_url = datawriter.getfileurl(0)

    with datawriter.fileWriter.open(s3file_url, mode='rb') as pointer:
        file_bytes = pointer.read()

    data_bytes = dataframe_results[['ipaddress']].to_csv(
        None, header=False, index=False).encode()
    self.assertEqual(file_bytes[11:], data_bytes)
    print('Ran test_case_s3_writer test case')
def execute_job(self, job):
    available_hosts = [h for h in self.graph.get_hosts() if h.is_free()]
    util = JobConfig()

    # There are enough nodes to run the job
    if len(available_hosts) > (self.num_mappers + self.num_reducers):
        max_util = self.num_mappers * self.num_reducers * self.bandwidth
        max_step = self.max_step

        # Executing simulated annealing for map-reduce placement
        simulated_annealing = SimulatedAnnealing(max_util,
                                                 max_step,
                                                 self.placement_init_state,
                                                 self.placement_generate_neighbor,
                                                 self.placement_compute_util)
        util = simulated_annealing.run()

    return util
def __init__(self): """Initialize instance of IP Resolver.""" self.config = JobConfig().getconfig() self.logger = utility.getlogger('ip_resolution', 'ip_resolution')
                target=ipResolver.resolve_ipaddress,
                args=(frame[1], geoservice, datawriter, processNo))
                processList.append(process_ipresolve)
                process_ipresolve.start()
                logger.info('processNo-' + str(process_ipresolve.pid))

            for p in processList:
                p.join()
                # print(str(p.exitcode))

    except Exception as ex:
        logger.info('Issue in fetching data from Panoply:' + str(ex))
        logger.error(utility.print_exception())
        return False

    logger.info("Finished the batch job in %s seconds" % str(
        (time.time() - seconds) // 1))
    return True


if __name__ == "__main__":
    logger = utility.getlogger('ip_resolution', 'ip_resolution')
    logger.info('Starting ip resolution job')
    config = JobConfig().getconfig()
    utility.check_s3path(config)
    ip_resolver = IPResolver()
    panoplyreader = PanoplyImport()
    datawriter = S3Writer()
    ipdata_geoservice = IPDataService()

    if process_chunks(config, ip_resolver, ipdata_geoservice, panoplyreader,
                      datawriter):
        panoplywriter = PanoplyWriter()
        panoplywriter.save_data()