def process_chunks(config, ipResolver, geoservice: GeoService, datareader: DataImport, datawriter: DataExport) -> bool:
    """Resolve IP addresses batch by batch, one child process per sub-frame.

    The read query is taken from ``config["panoplydatabase"]["readQuery"]``.
    Each dataframe yielded by ``datareader.getbatch_pandas_dataframe`` is
    split with ``utility.split_dataframe``; every resulting frame is handed
    to ``ipResolver.resolve_ipaddress`` in its own process. All processes of
    a batch are joined before the next batch is fetched.

    Args:
        config: Job configuration providing
            ``["panoplydatabase"]["readQuery"]``.
        ipResolver: Object whose ``resolve_ipaddress`` runs in each child.
        geoservice: Passed through unchanged to ``resolve_ipaddress``.
        datareader: Batch source (``getbatch_pandas_dataframe(query)``).
        datawriter: Passed through unchanged to ``resolve_ipaddress``.

    Returns:
        True when every batch was dispatched and joined without an exception
        in the parent process; False when an exception was raised (it is
        logged, not propagated).
    """
    # Initialize logging
    logger = utility.getlogger('ip_resolution', 'ip_resolution')
    seconds = time.time()
    try:
        query_panoply = config["panoplydatabase"]["readQuery"]
        for dataframe_ip_address in datareader.getbatch_pandas_dataframe(
                query_panoply):
            dataframes = utility.split_dataframe(dataframe_ip_address)
            processList = []
            # Worker numbers are 1-based and restart with every batch.
            for processNo, frame in enumerate(dataframes, start=1):
                process_ipresolve = processes.Process(
                    target=ipResolver.resolve_ipaddress,
                    args=(frame, geoservice, datawriter, processNo))
                processList.append(process_ipresolve)
                process_ipresolve.start()
                logger.info('processNo-' + str(process_ipresolve.pid))
            for p in processList:
                p.join()
                # Surface failed workers in the log; the return value is
                # deliberately left unchanged so callers behave as before.
                if p.exitcode != 0:
                    logger.warning(
                        'Worker process pid %s exited with code %s',
                        p.pid, p.exitcode)
    except Exception as ex:
        logger.info('Issue in fetching data from Panoply:' + str(ex))
        logger.error(utility.print_exception())
        return False
    logger.info("Finished the batch job in %s seconds" % str(
        (time.time() - seconds) // 1))
    return True
def __init__(self): """Initialize panoply connection.""" self.config = JobConfig().getconfig() self.logger = utility.getlogger('ip_resolution', 'ip_resolution') self.filePath = self.config['storageDetails']['filePath'] self.fileName = self.config['storageDetails']['fileName'] self.fileExtension = self.config['storageDetails']['fileExtension'] self.directoryName = date.today().strftime("%m/%d/%y").replace("/", "_") self.awsKey = self.config['storageDetails']['awsKey'] self.secretKey = self.config['storageDetails']['secretKey'] self.s3file_url = 's3://' + str(self.filePath) + '/' + str(self.directoryName) + '/' + str(self.fileName) self.tableName = self.config['storageDetails']['tableName'] self.region = self.config['storageDetails']['region'] username = self.config['panoplydatabase']['user'] password = self.config['panoplydatabase']['password'] db = self.config['panoplydatabase']['database'] host = self.config['panoplydatabase']['host'] port = self.config['panoplydatabase']['port'] self.connection = psycopg2.connect(user=username, password=password, host=host, port=port, database=db) self.write_command = """BEGIN; truncate """ + self.tableName + """ ; copy """ + self.tableName + """ from '""" + self.s3file_url + """' access_key_id '""" + self.awsKey + """' secret_access_key '""" + self.secretKey + """' region '""" + self.region + """' ignoreheader 1 null as 'NA' removequotes delimiter ','; COMMIT;""" self.append_command = """BEGIN; copy """ + self.tableName + """ from '""" + self.s3file_url + """' access_key_id '""" + self.awsKey + """' secret_access_key '""" + self.secretKey + """' region '""" + self.region + """'
def __init__(self, processNo=0):
    """Set up the IP data API client state from the job configuration.

    Loads the job config, remembers ``processNo``, obtains the shared
    'ip_resolution' logger and builds ``connection_url`` by substituting
    the configured API key for the literal ``userkey`` in the configured
    geoservice URL.
    """
    self.config = JobConfig().getconfig()
    self.processNo = processNo
    self.logger = utility.getlogger('ip_resolution', 'ip_resolution')

    geo_settings = self.config['geoservice']
    endpoint_template = geo_settings['url']
    api_key = geo_settings['apikey']
    # The configured URL carries the placeholder text "userkey".
    self.connection_url = endpoint_template.replace("userkey", api_key)
def __init__(self):
    """Initialize connection to S3.

    Reads the ``storageDetails`` section of the job config, derives the
    target object URL
    ``s3://<filePath>/<MM_DD_YY>/<fileName><fileExtension>`` (the directory
    is today's date) and creates the ``s3fs`` filesystem used for writing.
    """
    self.config = JobConfig().getconfig()
    self.logger = utility.getlogger('ip_resolution', 'ip_resolution')
    self.filePath = self.config['storageDetails']['filePath']
    self.fileName = self.config['storageDetails']['fileName']
    self.fileExtension = self.config['storageDetails']['fileExtension']
    # Today's date as MM_DD_YY, used as the per-run directory name.
    self.directoryName = date.today().strftime("%m/%d/%y").replace("/", "_")
    self.awsKey = self.config['storageDetails']['awsKey']
    self.secretKey = self.config['storageDetails']['secretKey']
    self.s3file_url = 's3://' + str(self.filePath) + '/' + str(self.directoryName) \
        + '/' + str(self.fileName) + str(self.fileExtension)
    # Credentials must be passed by keyword: the first positional parameter
    # of S3FileSystem is ``anon``, so positional credentials would be
    # misread as the anonymous-access flag.
    self.fileWriter = s3fs.S3FileSystem(key=self.awsKey, secret=self.secretKey)
def __init__(self):
    """Prepare the Panoply reader: load settings and create the engine.

    Builds ``connection_url`` (``postgresql://user:password@host:port/db``)
    from the ``panoplydatabase`` config section, stores ``readQuery`` and
    ``chunksize``, then calls ``create_engine``. A failure while creating
    the engine is logged and swallowed, in which case
    ``connection_panoply`` is not set.
    """
    self.config = JobConfig().getconfig()
    self.logger = utility.getlogger('ip_resolution', 'ip_resolution')

    panoply_settings = self.config['panoplydatabase']
    username = panoply_settings['user']
    password = panoply_settings['password']
    db = panoply_settings['database']
    host = panoply_settings['host']
    port = panoply_settings['port']
    # NOTE(review): credentials are inserted verbatim; a password containing
    # URL-reserved characters may need escaping - confirm against config.
    self.connection_url = 'postgresql://%s:%s@%s:%s/%s' % (
        username, password, host, port, db)

    self.readQuery = self.config['panoplydatabase']['readQuery']
    self.chunksize = self.config['panoplydatabase']['chunksize']

    try:
        engine = create_engine(self.connection_url, echo=False)
        self.connection_panoply = engine
        self.logger.info('Initialized Panoply connection')
    except Exception as ex:
        self.logger.info('Issue with panoply connection:' + str(ex))
        self.logger.error(utility.print_exception())
def setlogger(self, processNo):
    """Swap this instance's logger for the 's3writer' logger of a process.

    ``processNo`` is forwarded to ``utility.getlogger`` as its third
    argument.
    """
    process_logger = utility.getlogger('s3writer', 's3writer', processNo)
    self.logger = process_logger
def setlogger(self, processNo):
    """Swap this instance's logger for the 'ipdataservice' logger of a process.

    ``processNo`` is forwarded to ``utility.getlogger`` as its third
    argument.
    """
    process_logger = utility.getlogger(
        'ipdataservice', 'ipdataservice', processNo)
    self.logger = process_logger
target=ipResolver.resolve_ipaddress, args=(frame[1], geoservice, datawriter, processNo)) processList.append(process_ipresolve) process_ipresolve.start() logger.info('processNo-' + str(process_ipresolve.pid)) for p in processList: p.join() # print(str(p.exitcode)) except Exception as ex: logger.info('Issue in fetching data from Panoply:' + str(ex)) logger.error(utility.print_exception()) return False logger.info("Finished the batch job in %s seconds" % str( (time.time() - seconds) // 1)) return True if __name__ == "__main__": logger = utility.getlogger('ip_resolution', 'ip_resolution') logger.info('Starting ip resolution job') config = JobConfig().getconfig() utility.check_s3path(config) ip_resolver = IPResolver() panoplyreader = PanoplyImport() datawriter = S3Writer() ipdata_geoservice = IPDataService() if process_chunks(config, ip_resolver, ipdata_geoservice, panoplyreader, datawriter): panoplywriter = PanoplyWriter() panoplywriter.save_data()
def setlogger(self, processNo):
    """Swap this instance's logger for the Panoply connector logger of a process.

    ``processNo`` is forwarded to ``utility.getlogger`` as its third
    argument.
    """
    process_logger = utility.getlogger(
        'panoply_connector', 'panoplyconnector', processNo)
    self.logger = process_logger
def setlogger(self, processNo):
    """Swap this instance's logger for the IP resolver logger of a process.

    ``processNo`` is forwarded to ``utility.getlogger`` as its third
    argument.
    """
    process_logger = utility.getlogger(
        'ip_resolution', 'ipresolver', processNo)
    self.logger = process_logger
def __init__(self):
    """Create an IP resolver holding the job config and the shared logger."""
    job_config = JobConfig().getconfig()
    self.config = job_config

    resolver_logger = utility.getlogger('ip_resolution', 'ip_resolution')
    self.logger = resolver_logger