def mkrollups(self):
    """Recompute the clientrollups table.

    Aggregates per-client metrics into a staging table, then swaps the
    staging table into place with deploy_table() and records the update
    timestamp on self.updated.
    """
    stage = 'clientrollups_staging'
    drop_table_if_exists(stage, self.pconn, self.pcur)
    # One aggregate row per client; excludes hits from our own IPs and
    # soft-deleted campaigns (delete_dt set).
    rollup_sql = """ CREATE TABLE {} AS SELECT client_id, {} from events t inner join visits using (visit_id) inner join visitors v using (visitor_id) inner join campaigns using (campaign_id) inner join clients cl using (client_id) WHERE visits.ip not in ({}) AND campaigns.delete_dt is null GROUP BY client_id """.format(stage, self.metric_expressions(), OUR_IP_STRING)
    debug('Calculating client rollups')
    with self.pconn:
        self.pcur.execute(rollup_sql)
    debug('beginning deploy table on clientrollups')
    deploy_table('clientrollups', 'clientrollups_staging', 'clientrollups_old',
                 self.pcur, self.pconn)
    self.updated = strftime('%x %X')
    debug('Done.')
def main(table):
    """Mirror one RDS (MySQL) table into Redshift via a CSV staged on S3.

    Pipeline: describe the table in MySQL, dump its rows to <table>.csv,
    upload the CSV to S3, COPY it into a fresh <table>_staging table in
    Redshift, then atomically swap staging into place with deploy_table().
    Per-stage timings are logged at the end.

    NOTE(review): this module targets Python 2 (StandardError, str
    decode/encode, binary-mode csv) — kept as-is for consistency.

    :param table: name of the RDS table to mirror (interpolated into SQL
        and file names; assumed to come from trusted config, not users).
    :raises StandardError: if the MySQL describe/dump phase fails.
    :raises psycopg2.DatabaseError: if the COPY fails even after the
        encoding-repair retry.
    """
    start = time.time()
    columns = None
    csvtime = None
    runcsv = None
    try:
        # Pull the schema and dump the table contents to a local CSV.
        with mysql.connect(**rds) as dbconn:
            dbconn.execute("describe %s" % table)
            description = dbconn.fetchall()
            columns = create_query(description)
            write2csv(table, dbconn)
            csvtime = time.time()
            runcsv = csvtime - start
    except StandardError as e:
        # Without a schema and CSV the rest of the pipeline cannot run;
        # log and re-raise instead of continuing with columns/csvtime=None
        # (which previously caused a confusing TypeError further down).
        logger.warning("error: {0}".format(e))
        raise

    redconn = psycopg2.connect(**redshift)
    redcursor = redconn.cursor()
    try:
        up2s3(table)
        ups3time = time.time()
        runs3 = ups3time - csvtime

        # Create the staging table using the column definitions derived
        # from the MySQL schema.
        staging_table = "{0}_staging".format(table)
        old_table = "{0}_old".format(table)
        drop_table_if_exists(staging_table, redconn, redcursor)
        logging.info("Creating table {}".format(staging_table))
        with redconn:
            redcursor.execute("CREATE TABLE {0} ({1})".format(staging_table, columns))

        # COPY the file we just uploaded to S3 into the staging table.
        access_key = aws["aws_access_key_id"]
        secret_key = aws["aws_secret_access_key"]
        try:
            copy_from_s3(redconn, redcursor, table, staging_table,
                         access_key, secret_key)
        except psycopg2.DatabaseError:
            # Assume the COPY failed on bad encodings: rewrite the CSV as
            # UTF-8 (decoding from latin-1), re-upload, and retry once.
            logging.info("Error copying, assuming encoding errors and rewriting CSV...")
            with open("%s.csv" % table, "r") as csvfile:
                reader = csv.reader(csvfile, delimiter="|")
                with open("%s2.csv" % table, "wb") as csvfile2:
                    writer = csv.writer(csvfile2, delimiter="|")
                    # Plain iteration replaces the manual
                    # while/next()/StopIteration loop.
                    for row in reader:
                        writer.writerow([i.decode("latin-1").encode("utf-8")
                                         for i in row])
            logging.info("Rewrite complete")
            os.remove("%s.csv" % table)
            # os.rename is portable and shell-free (was os.system("mv ...")).
            os.rename("%s2.csv" % table, "%s.csv" % table)
            up2s3(table)
            # Give S3 a moment to become consistent before retrying the COPY.
            time.sleep(10)
            copy_from_s3(redconn, redcursor, table, staging_table,
                         access_key, secret_key)
        copytime = time.time()
        runcopy = copytime - ups3time

        # Atomically swap staging into place.
        deploy_table(table, staging_table, old_table, redcursor, redconn)
        endtime = time.time()
        runswap = endtime - copytime
        runtotal = endtime - start
    finally:
        # The original leaked the Redshift connection; always release it.
        redcursor.close()
        redconn.close()

    logging.info("Successfully copied %s from RDS to S3 to Redshift" % table)
    events = [
        ("write csv", runcsv),
        ("write to s3", runs3),
        ("copy from s3 to redshift", runcopy),
        ("swap redshift tables", runswap),
        ("complete entire process", runtotal),
    ]
    logging.info("|".join("{0:.2f} seconds to {1}".format(duration, event)
                          for event, duration in events))