def append_data(self, dataframe_pandas, processNo):
    """Save pandas dataframe to S3 in append mode.

    Writes the frame as CSV bytes to the per-process S3 object; the CSV
    header row is emitted only when the object does not exist yet.
    Exits the process on failure; always runs self.cleanup().
    """
    try:
        seconds = time.time()
        self.logger.info("Started the file append operation %s at %s "
                         % (str(processNo), time.time()))
        s3file_url = self.getfileurl(processNo)
        fileExists = self.fileWriter.exists(s3file_url)
        self.logger.info('s3 files url:' + s3file_url)
        # Single call replaces the duplicated if/else to_csv branches:
        # header only for a brand-new object.
        data_bytes = dataframe_pandas.to_csv(
            None, header=not fileExists, index=False).encode()
        with self.fileWriter.open(s3file_url, mode='ab', block_size=None,
                                  acl='public-read') as pointer:
            pointer.write(data_bytes)
            # redundant pointer.close() removed -- the with-block closes it
        self.logger.info("Ended file append operation %s in %s "
                         % (str(processNo), time.time() - seconds))
    except IOError as e:
        self.logger.info("I/O error:" + str(e))
        self.logger.error(utility.print_exception())
        sys.exit()
    except Exception as ex:
        self.logger.info('Issue with saving to S3:' + str(ex))
        self.logger.error(utility.print_exception())
        sys.exit()
    finally:
        self.cleanup()
def get_pandas_dataframe(self, query):
    """Fetch the complete result of *query* from Panoply as a DataFrame."""
    try:
        # One round trip: read the whole result set in a single frame.
        return pd.read_sql_query(query, self.getconnection())
    except SQLAlchemyError as e:
        # Surface the driver-level error wrapped by SQLAlchemy.
        error = str(e.__dict__['orig'])
        self.logger.info('SQLAlchemyError:' + error)
        self.logger.error(utility.print_exception())
        sys.exit()
    except Exception as ex:
        self.logger.info('Issue in fetching data from Panoply:' + str(ex))
        self.logger.error(utility.print_exception())
        sys.exit()
def process_chunks(config, ipResolver, geoservice: GeoService,
                   datareader: DataImport, datawriter: DataExport) -> bool:
    """Triggers parallel processes.

    Resolves IP in pandas dataframes chunks and stores ip resolved data
    in S3. Returns True on success, False if any batch raised.
    """
    logger = utility.getlogger('ip_resolution', 'ip_resolution')
    seconds = time.time()
    try:
        query_panoply = config["panoplydatabase"]["readQuery"]
        for dataframe_ip_address in datareader.getbatch_pandas_dataframe(
                query_panoply):
            dataframes = utility.split_dataframe(dataframe_ip_address)
            processList = []
            # enumerate(..., start=1) replaces the manual processNo counter;
            # the original iterated enumerate() tuples and indexed frame[1]
            # while also incrementing its own counter.
            for processNo, frame in enumerate(dataframes, start=1):
                process_ipresolve = processes.Process(
                    target=ipResolver.resolve_ipaddress,
                    args=(frame, geoservice, datawriter, processNo))
                processList.append(process_ipresolve)
                process_ipresolve.start()
                logger.info('processNo-' + str(process_ipresolve.pid))
            # Wait for the whole batch before fetching the next chunk.
            for p in processList:
                p.join()
    except Exception as ex:
        logger.info('Issue in fetching data from Panoply:' + str(ex))
        logger.error(utility.print_exception())
        return False
    logger.info("Finished the batch job in %s seconds" % str(
        (time.time() - seconds) // 1))
    return True
def resolve_ipaddress(self, dataframe_ip_address, geocoding_service,
                      datawriter, processNo):
    """Run the ip resolution methods in a child process.

    Pipeline: preprocess -> get_address (geocoding) -> postprocess ->
    append results to S3 via *datawriter*. Errors are logged, not raised.
    """
    try:
        seconds = time.time()
        self.setlogger(processNo)
        geocoding_service.setlogger(processNo)
        self.logger.info("Started the process %s at %s "
                         % (str(processNo), time.time()))
        dataframe_ip_preprocessed = self.preprocess(dataframe_ip_address)
        dataframe_ip_processed = self.get_address(
            dataframe_ip_preprocessed, geocoding_service)
        dataframe_results = self.postprocess(dataframe_ip_processed)
        datawriter.setlogger(processNo)
        datawriter.append_data(dataframe_results, processNo)
        process = psutil.Process(os.getpid())
        self.logger.info("Ended the process %s in %s "
                         % (str(processNo), time.time() - seconds))
        # NOTE(review): this sends SIGTERM to the *current* worker process,
        # skipping normal interpreter shutdown -- confirm this is intended.
        process.terminate()
    except Exception as ex:
        # str(ex): the original concatenated str + Exception, which raised
        # TypeError inside the handler and masked the real error.
        self.logger.info('Issue with resolveIp logic in IpResolver:' + str(ex))
        self.logger.error(utility.print_exception())
def save_data(self, dataframe_pandas):
    """Save pandas dataframe to S3 (overwrite mode).

    Serializes the selected columns as CSV bytes and writes them to
    self.s3file_url. Exits on failure; always runs self.cleanup().
    """
    try:
        # Fixed column order expected by the downstream Redshift COPY.
        data_bytes = dataframe_pandas[
            ['ipaddress', 'ipaddress_stripped', 'country', 'city',
             'region', 'createdAt']].to_csv(None, index=False).encode()
        with self.fileWriter.open(self.s3file_url, mode='wb',
                                  block_size=None,
                                  acl='public-read') as pointer:
            pointer.write(data_bytes)
            # redundant pointer.close() removed -- the with-block closes it
        self.logger.info("Finished writing in S3")
    except IOError as e:
        self.logger.info("I/O error:" + str(e))
        self.logger.error(utility.print_exception())
        sys.exit()
    except Exception as ex:
        self.logger.info('Issue with saving to S3:' + str(ex))
        self.logger.error(utility.print_exception())
        sys.exit()
    finally:
        self.cleanup()
def getbatch_pandas_dataframe(self, readQuery):
    """Yield data in chunks of self.chunksize rows as pandas dataframes."""
    try:
        dataframe_iterator = pd.read_sql_query(readQuery,
                                               self.getconnection(),
                                               chunksize=self.chunksize)
        for dataframe_ip_batch in dataframe_iterator:
            # Log per chunk; the original logged this once, before any
            # chunk had actually been fetched.
            self.logger.info('Fetched new chunk from Panoply table')
            yield dataframe_ip_batch
    except SQLAlchemyError as e:
        error = str(e.__dict__['orig'])
        self.logger.info('SQLAlchemyError:' + error)
        self.logger.error(utility.print_exception())
        sys.exit()
    except Exception as ex:
        self.logger.info('Issue in fetching data from Panoply:' + str(ex))
        self.logger.error(utility.print_exception())
        sys.exit()
def preprocess(self, dataframe_ip_address):
    """Normalize raw IP data before geocoding.

    Removes the IPv4-mapped IPv6 prefix ('::ffff:') from each address and
    adds empty placeholder columns filled later by get_address/postprocess.
    Returns the (mutated) dataframe, or None if an error was logged.
    """
    try:
        df = dataframe_ip_address
        # str.strip('::ffff:') stripped the *character set* {':', 'f'} from
        # both ends -- mangling any address that merely starts or ends with
        # those characters. Remove only the exact leading prefix.
        df['ipaddress_stripped'] = df['ipaddress'].apply(
            lambda x: x[7:] if x.startswith('::ffff:') else x)
        # df.assign(...) returns a *new* frame; the original discarded it,
        # so these placeholder columns were never actually added.
        for column in ('city', 'country', 'region', 'createdAt'):
            df[column] = ''
        return df
    except Exception as ex:
        # str(ex): str + Exception concatenation raised TypeError here.
        self.logger.info('Issue in preprocessing logic:' + str(ex))
        self.logger.error(utility.print_exception())
def save_data(self, mode='w'):
    """Save data from S3 to Panoply table using copy command.

    mode 'a' runs the append COPY command, anything else the overwrite
    command. Exits on database errors; always runs self.cleanup().
    """
    self.logger.info('Panoply file urls:' + self.s3file_url)
    command = self.append_command if mode == 'a' else self.write_command
    # Guard: the original finally-block called cursor.close() even when
    # connection.cursor() itself raised, turning the real error into a
    # NameError.
    cursor = None
    try:
        cursor = self.connection.cursor()
        cursor.execute(command)
        self.connection.commit()
        self.logger.info('finished copying to redshift')
    except psycopg2.DatabaseError as e:
        self.logger.info("Database error:" + str(e))
        self.logger.error(utility.print_exception())
        sys.exit()
    except Exception as ex:
        self.logger.info('Issue with copying from S3 to Panoply redshift:'
                         + str(ex))
        self.logger.error(utility.print_exception())
        sys.exit()
    finally:
        if cursor is not None:
            cursor.close()
        self.cleanup()
def get_address(self, dataframe_ip_address, geocoding_service):
    """Call Geocoding service API for every row of the dataframe.

    Fills the country/region/city columns in place; a failure on one row
    is logged and the loop continues with the next row.
    """
    try:
        df = dataframe_ip_address
        for ind, row_series in df.iterrows():
            try:
                ipaddress = df['ipaddress_stripped'][ind]
                addressObject = geocoding_service.fetch_geolocation(
                    ipaddress)
                country, region, city = self.parse_json_address(
                    addressObject)
                df.at[ind, 'country'] = country
                df.at[ind, 'region'] = region
                df.at[ind, 'city'] = city
            except Exception as ex:
                # str(ex): str + Exception raised TypeError in the handler,
                # which aborted the whole loop instead of skipping the row.
                self.logger.info('Issue with fetching or parsing logic:'
                                 + str(ex))
                self.logger.error(utility.print_exception())
        return df
    except Exception as ex:
        self.logger.info('Issue with getIpInformation logic:' + str(ex))
        self.logger.error(utility.print_exception())
def postprocess(self, dataframe_ip_address):
    """Clean resolved address fields and stamp a creation time.

    Strips single quotes from the text columns (so the values survive the
    later SQL COPY) and sets createdAt to the batch processing time.
    Returns the (mutated) dataframe, or None if an error was logged.
    """
    try:
        df = dataframe_ip_address
        # One loop replaces three copy-pasted apply() lines; falsy values
        # (None/'' ) pass through unchanged.
        for column in ('country', 'city', 'region'):
            df[column] = df[column].apply(
                lambda x: str(x).replace("'", "") if x else x)
        df['createdAt'] = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
        return df
    except Exception as ex:
        # str(ex): str + Exception concatenation raised TypeError here.
        self.logger.info('Issue in postprocessing logic:' + str(ex))
        self.logger.error(utility.print_exception())
def fetch_geolocation(self, ipaddress) -> dict:
    """Fetch Location for *ipaddress* as a json-decoded dict.

    Returns an empty dict when the HTTP request fails (the failure is
    logged). Note: the original annotation was the dict literal '{}'.
    """
    req = None
    try:
        # connection_url contains the literal placeholder text 'ipaddress'.
        url = self.connection_url.replace("ipaddress", ipaddress)
        req = requests.get(url)
    except requests.exceptions.HTTPError as ex:
        self.logger.error("Http Error:" + str(ex))
    except requests.exceptions.ConnectionError as ex:
        self.logger.error("Error Connecting:" + str(ex))
    except requests.exceptions.Timeout as ex:
        self.logger.error("Timeout Error:" + str(ex))
    except requests.exceptions.RequestException as ex:
        self.logger.error('Issue with requesting location:' + str(ex))
    except Exception as ex:
        self.logger.info(
            'Issue with requesting the Geoservice for location:' + str(ex))
        self.logger.error(utility.print_exception())
    # The original unconditionally returned req.json(): after any handled
    # exception that line died with UnboundLocalError.
    return req.json() if req is not None else {}
def __init__(self):
    """Initialize Panoply connection."""
    self.config = JobConfig().getconfig()
    self.logger = utility.getlogger('ip_resolution', 'ip_resolution')
    # All connection settings live under one config section.
    db_conf = self.config['panoplydatabase']
    self.connection_url = 'postgresql://%s:%s@%s:%s/%s' % (
        str(db_conf['user']), str(db_conf['password']),
        str(db_conf['host']), str(db_conf['port']),
        str(db_conf['database']))
    self.readQuery = db_conf['readQuery']
    self.chunksize = db_conf['chunksize']
    try:
        self.connection_panoply = create_engine(self.connection_url,
                                                echo=False)
        self.logger.info('Initialized Panoply connection')
    except Exception as ex:
        self.logger.info('Issue with panoply connection:' + str(ex))
        self.logger.error(utility.print_exception())