Esempio n. 1
0
 def append_data(self, dataframe_pandas, processNo):
     """Append a pandas dataframe to this process's S3 object as CSV.

     Writes the CSV header only when the target object does not exist
     yet, so repeated appends produce a file with a single header row.

     Args:
         dataframe_pandas: dataframe whose rows are appended as CSV.
         processNo: worker process number; selects the file URL and tags
             log messages.
     """
     try:
         seconds = time.time()
         self.logger.info("Started the file append operation %s at %s " % (str(processNo), time.time()))
         s3file_url = self.getfileurl(processNo)
         fileExists = self.fileWriter.exists(s3file_url)
         self.logger.info('s3 files url:' + s3file_url)
         # Single to_csv call: emit the header row only for a new object
         # (previously duplicated across an if/else).
         data_bytes = dataframe_pandas.to_csv(None, header=not fileExists, index=False).encode()
         with self.fileWriter.open(s3file_url, mode='ab', block_size=None, acl='public-read') as pointer:
             pointer.write(data_bytes)
             # The context manager closes the file; explicit close removed.
         self.logger.info("Ended file append operation %s in %s " % (str(processNo), time.time() - seconds))
     except IOError as e:
         self.logger.info("I/O error:" + str(e))
         self.logger.error(utility.print_exception())
         sys.exit()
     except Exception as ex:
         self.logger.info('Issue with saving to S3:' + str(ex))
         self.logger.error(utility.print_exception())
         sys.exit()
     finally:
         # Always release resources, even on the sys.exit() paths.
         self.cleanup()
Esempio n. 2
0
 def get_pandas_dataframe(self, query):
     """Run *query* against Panoply and return the full result set.

     Args:
         query: SQL text to execute on the configured connection.

     Returns:
         A pandas DataFrame holding every row of the result.
     """
     try:
         connection = self.getconnection()
         return pd.read_sql_query(query, connection)
     except SQLAlchemyError as e:
         self.logger.info('SQLAlchemyError:' + str(e.__dict__['orig']))
         self.logger.error(utility.print_exception())
         sys.exit()
     except Exception as ex:
         self.logger.info('Issue in fetching data from Panoply:' + str(ex))
         self.logger.error(utility.print_exception())
         sys.exit()
Esempio n. 3
0
def process_chunks(config, ipResolver, geoservice: GeoService,
                   datareader: DataImport, datawriter: DataExport) -> bool:
    """Triggers parallel processes.

    Resolves IP in pandas dataframes chunks and stores ip resolved data
    in S3.

    Args:
        config: job configuration mapping; must contain
            config["panoplydatabase"]["readQuery"].
        ipResolver: object whose resolve_ipaddress runs in each worker.
        geoservice: geocoding service handed to every worker.
        datareader: source of dataframe batches from Panoply.
        datawriter: S3 writer handed to every worker.

    Returns:
        True when every batch was processed, False on any error.
    """
    # Initialize logging
    logger = utility.getlogger('ip_resolution', 'ip_resolution')
    seconds = time.time()
    try:
        query_panoply = config["panoplydatabase"]["readQuery"]
        for dataframe_ip_address in datareader.getbatch_pandas_dataframe(
                query_panoply):
            dataframes = utility.split_dataframe(dataframe_ip_address)
            processList = []
            # enumerate replaces the previous hand-maintained counter and
            # the opaque frame[1] tuple indexing.
            for processNo, frame in enumerate(dataframes, start=1):
                process_ipresolve = processes.Process(
                    target=ipResolver.resolve_ipaddress,
                    args=(frame, geoservice, datawriter, processNo))
                processList.append(process_ipresolve)
                process_ipresolve.start()
                # NOTE(review): this logs the child's pid under the label
                # 'processNo' — message kept as-is for log compatibility.
                logger.info('processNo-' + str(process_ipresolve.pid))
            # Wait for all workers of this batch before fetching the next.
            for p in processList:
                p.join()
    except Exception as ex:
        logger.info('Issue in fetching data from Panoply:' + str(ex))
        logger.error(utility.print_exception())
        return False
    logger.info("Finished the batch job in %s seconds" % str(
        (time.time() - seconds) // 1))
    return True
Esempio n. 4
0
    def resolve_ipaddress(self, dataframe_ip_address, geocoding_service,
                          datawriter, processNo):
        """Run the ip resolution pipeline in a child process.

        Preprocesses the chunk, resolves each address through the
        geocoding service, postprocesses the result and appends it to S3.

        Args:
            dataframe_ip_address: chunk of rows with an 'ipaddress' column.
            geocoding_service: service used to look up geolocation data.
            datawriter: writer that persists the resolved chunk to S3.
            processNo: worker number, used for per-process loggers/files.
        """
        try:
            seconds = time.time()
            self.setlogger(processNo)
            geocoding_service.setlogger(processNo)
            self.logger.info("Started the process %s at %s " %
                             (str(processNo), time.time()))
            dataframe_ip_preprocessed = self.preprocess(dataframe_ip_address)
            dataframe_ip_processed = self.get_address(
                dataframe_ip_preprocessed, geocoding_service)
            dataframe_results = self.postprocess(dataframe_ip_processed)
            datawriter.setlogger(processNo)
            datawriter.append_data(dataframe_results, processNo)
            process = psutil.Process(os.getpid())
            self.logger.info("Ended the process %s in %s " %
                             (str(processNo), time.time() - seconds))
            # NOTE(review): terminates the *current* (worker) process via
            # psutil — presumably deliberate worker cleanup; confirm.
            process.terminate()
        except Exception as ex:
            # BUG FIX: 'str' + Exception raised TypeError and masked the
            # original error; convert explicitly.
            self.logger.info('Issue with resolveIp logic in IpResolver:' +
                             str(ex))
            self.logger.error(utility.print_exception())
Esempio n. 5
0
 def save_data(self, dataframe_pandas):
     """Write the resolved columns of *dataframe_pandas* to S3 as one CSV.

     Overwrites the object at self.s3file_url ('wb' mode) with the six
     output columns, header row included.

     Args:
         dataframe_pandas: dataframe containing at least the six columns
             selected below.
     """
     try:
         data_bytes = dataframe_pandas[['ipaddress', 'ipaddress_stripped', 'country', 'city', 'region', 'createdAt']] \
             .to_csv(None, index=False).encode()
         with self.fileWriter.open(self.s3file_url, mode='wb', block_size=None, acl='public-read') as pointer:
             pointer.write(data_bytes)
             # The context manager closes the file; explicit close removed.
         self.logger.info("Finished writing in S3")
     except IOError as e:
         self.logger.info("I/O error:" + str(e))
         self.logger.error(utility.print_exception())
         sys.exit()
     except Exception as ex:
         self.logger.info('Issue with saving to S3:' + str(ex))
         self.logger.error(utility.print_exception())
         sys.exit()
     finally:
         # Always release resources, even on the sys.exit() paths.
         self.cleanup()
Esempio n. 6
0
 def getbatch_pandas_dataframe(self, readQuery):
     """Yield dataframes of up to self.chunksize rows for *readQuery*.

     Generator wrapper around pandas' chunked read_sql_query; each
     iteration pulls the next chunk from Panoply.

     Args:
         readQuery: SQL text to execute on the configured connection.

     Yields:
         pandas DataFrame chunks of at most self.chunksize rows.
     """
     try:
         dataframe_iterator = pd.read_sql_query(readQuery,
                                                self.getconnection(),
                                                chunksize=self.chunksize)
         for dataframe_ip_batch in dataframe_iterator:
             # BUG FIX: this was logged once, before any chunk had been
             # fetched; log it per chunk instead.
             self.logger.info('Fetched new chunk from Panoply table')
             yield dataframe_ip_batch
     except SQLAlchemyError as e:
         error = str(e.__dict__['orig'])
         self.logger.info('SQLAlchemyError:' + error)
         self.logger.error(utility.print_exception())
         sys.exit()
     except Exception as ex:
         self.logger.info('Issue in fetching data from Panoply:' + str(ex))
         self.logger.error(utility.print_exception())
         sys.exit()
Esempio n. 7
0
 def preprocess(self, dataframe_ip_address):
     """Strip IPv4-mapped IPv6 prefixes and add empty output columns.

     Returns the dataframe with an 'ipaddress_stripped' column (the
     leading '::ffff:' prefix removed) plus empty 'city', 'country',
     'region' and 'createdAt' columns ready to be filled in later.
     """
     try:
         df = dataframe_ip_address
         # BUG FIX: str.strip('::ffff:') treats the argument as a *set*
         # of characters to remove from both ends, not a prefix; remove
         # only the literal leading '::ffff:'.
         df['ipaddress_stripped'] = df['ipaddress'].apply(
             lambda x: x[len('::ffff:'):] if x.startswith('::ffff:') else x)
         # BUG FIX: assign() returns a new frame; the result was
         # discarded, so the output columns were never actually created.
         df = df.assign(city='', country='', region='', createdAt='')
         return df
     except Exception as ex:
         # BUG FIX: concatenating str + Exception raised TypeError.
         self.logger.info('Issue in preprocessing logic:' + str(ex))
         self.logger.error(utility.print_exception())
Esempio n. 8
0
 def save_data(self, mode='w'):
     """Copy the S3 file into the Panoply table via the copy command.

     Args:
         mode: 'a' runs the append copy command; any other value runs
             the overwrite copy command.
     """
     self.logger.info('Panoply file urls:' + self.s3file_url)
     command = self.append_command if mode == 'a' else self.write_command
     # BUG FIX: `cursor` was referenced in `finally` before assignment
     # when connection.cursor() itself failed, raising a NameError that
     # masked the real error.
     cursor = None
     try:
         cursor = self.connection.cursor()
         cursor.execute(command)
         self.connection.commit()
         self.logger.info('finished copying to redshift')
     except psycopg2.DatabaseError as e:
         self.logger.info("Database error:" + str(e))
         self.logger.error(utility.print_exception())
         sys.exit()
     except Exception as ex:
         self.logger.info('Issue with copying from S3 to Panoply redshift:' + str(ex))
         self.logger.error(utility.print_exception())
         sys.exit()
     finally:
         if cursor is not None:
             cursor.close()
         self.cleanup()
Esempio n. 9
0
 def get_address(self, dataframe_ip_address, geocoding_service):
     """Resolve country/region/city for every row via the geocoding API.

     Per-row failures are logged and skipped so that one bad address
     does not abort the whole chunk.

     Args:
         dataframe_ip_address: dataframe with an 'ipaddress_stripped'
             column.
         geocoding_service: object exposing fetch_geolocation(ipaddress).

     Returns:
         The same dataframe with 'country', 'region' and 'city' filled.
     """
     try:
         df = dataframe_ip_address
         for ind, row_series in df.iterrows():
             try:
                 ipaddress = df['ipaddress_stripped'][ind]
                 addressObject = geocoding_service.fetch_geolocation(
                     ipaddress)
                 country, region, city = self.parse_json_address(
                     addressObject)
                 df.at[ind, 'country'] = country
                 df.at[ind, 'region'] = region
                 df.at[ind, 'city'] = city
             except Exception as ex:
                 # BUG FIX: str + Exception raised TypeError here, which
                 # escaped to the outer handler and aborted the chunk.
                 self.logger.info('Issue with fetching or parsing logic:' +
                                  str(ex))
                 self.logger.error(utility.print_exception())
         return df
     except Exception as ex:
         # BUG FIX: explicit str(ex) conversion (was str + Exception).
         self.logger.info('Issue with getIpInformation logic:' + str(ex))
         self.logger.error(utility.print_exception())
Esempio n. 10
0
 def postprocess(self, dataframe_ip_address):
     """Sanitize resolved fields and stamp the creation time.

     Removes single quotes from country/city/region values and sets
     'createdAt' on every row to the current local time formatted as
     dd/mm/YYYY HH:MM:SS.
     """
     try:
         df = dataframe_ip_address
         # One loop instead of three copies of the same apply().
         for column in ('country', 'city', 'region'):
             df[column] = df[column].apply(
                 lambda x: str(x).replace("'", "") if x else x)
         df['createdAt'] = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
         return df
     except Exception as ex:
         # BUG FIX: concatenating str + Exception raised TypeError.
         self.logger.info('Issue in postprocessing logic:' + str(ex))
         self.logger.error(utility.print_exception())
Esempio n. 11
0
 def fetch_geolocation(self, ipaddress) -> dict:
     """Fetch the geolocation for *ipaddress* as a JSON dict.

     Substitutes the address into the configured service URL and issues
     a GET request.

     Returns:
         The decoded JSON payload, or an empty dict when the request
         fails.
     """
     try:
         url = self.connection_url.replace("ipaddress", ipaddress)
         req = requests.get(url)
         # BUG FIX: the return was outside the try, so after any handled
         # error `req` was unbound and `req.json()` raised
         # UnboundLocalError instead of degrading gracefully.
         return req.json()
     except requests.exceptions.HTTPError as ex:
         self.logger.error("Http Error:" + str(ex))
     except requests.exceptions.ConnectionError as ex:
         self.logger.error("Error Connecting:" + str(ex))
     except requests.exceptions.Timeout as ex:
         self.logger.error("Timeout Error:" + str(ex))
     except requests.exceptions.RequestException as ex:
         self.logger.error('Issue with requesting location:' + str(ex))
     except Exception as ex:
         self.logger.info(
             'Issue with requesting the Geoservice for location:' + str(ex))
         self.logger.error(utility.print_exception())
     # Empty payload on failure; the caller's parser handles missing keys.
     return {}
Esempio n. 12
0
 def __init__(self):
     """Initialize Panoply connection.

     Reads credentials and query settings from the job configuration
     and creates the SQLAlchemy engine for the Panoply database.
     """
     self.config = JobConfig().getconfig()
     self.logger = utility.getlogger('ip_resolution', 'ip_resolution')
     db_config = self.config['panoplydatabase']
     # Build the DSN with a single format string instead of chained '+'.
     self.connection_url = 'postgresql://%s:%s@%s:%s/%s' % (
         str(db_config['user']), str(db_config['password']),
         str(db_config['host']), str(db_config['port']),
         str(db_config['database']))
     self.readQuery = db_config['readQuery']
     self.chunksize = db_config['chunksize']
     try:
         self.connection_panoply = create_engine(self.connection_url,
                                                 echo=False)
         self.logger.info('Initialized Panoply connection')
     except Exception as ex:
         self.logger.info('Issue with panoply connection:' + str(ex))
         self.logger.error(utility.print_exception())