from postgres import Postgres


def connect():
    """ Connect to the PostgreSQL database server """
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        # use a postgres connection string below
        db = Postgres(
            "host=localhost dbname=michaelmostachetti user=michaelmostachetti password=password"
        )

        # This is where you will run your four queries.
        # Use either .one() or .run(), depending on whether you care
        # about the return value of the query.

        # Query 1
        result1 = db.one("SELECT version();")
        print(result1)
        # Query 2
        db.run("SELECT version();")
        # Query 3
        db.run("SELECT version();")
        # Query 4
        db.run("SELECT version();")
    except Exception as error:
        # report the actual error rather than swallowing it with a bare except
        print("An error occurred: %s" % error)
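# A minimal usage sketch (not part of the original snippet): postgres.py binds
# parameters with psycopg2-style %(name)s placeholders and a dictionary, which
# is safer than interpolating values into the SQL string. The query below is
# illustrative only.
from postgres import Postgres

db = Postgres("host=localhost dbname=michaelmostachetti user=michaelmostachetti password=password")
# .one() returns a single value (or row); the dict supplies the bind values
n_sessions = db.one(
    "SELECT count(*) FROM pg_stat_activity WHERE datname = %(db)s",
    {"db": "michaelmostachetti"})
print(n_sessions)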
from datetime import datetime
from typing import Optional

import postgres


def get_last_event_id(db: postgres.Postgres, room_id: str,
                      before: datetime) -> Optional[str]:
    # received_ts is stored in milliseconds, so scale the cutoff to match
    return db.one(
        "SELECT event_id FROM events"
        " WHERE received_ts <= %(before)s AND room_id = %(room_id)s"
        " ORDER BY received_ts DESC LIMIT 1",
        {
            "before": int(before.timestamp() * 1000),
            "room_id": room_id,
        },
    )
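# Hypothetical usage sketch: look up the last event in a room before midnight
# UTC on 2020-01-01. The connection URI and room id are placeholders.
from datetime import datetime, timezone
from postgres import Postgres

db = Postgres("postgres://user:password@localhost/synapse")
cutoff = datetime(2020, 1, 1, tzinfo=timezone.utc)
event_id = get_last_event_id(db, "!room:example.org", cutoff)
print(event_id)  # None if the room has no events before the cutoff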
def sql_listing(param_dict):
    '''
    Run a SQL listing. The sql file is loaded, and then any non-reserved keyword in the parameters is treated as a
    string to be replaced in the sql string. The SQL is then printed out, before newlines are removed, and then run
    in one of the allowed modes. The allowed modes are:
        run  : The SQL returns no result
        one  : The SQL should return one row result to be printed
        top  : The SQL returns many results, print the top N (given by global print_num_rows)
        save : like top, but save all of the results to a CSV file
    :param param_dict: dictionary produced by load_and_check_listing_params
    :return:
    '''
    with open('../../listings/chap%d/%s.sql' % (param_dict['chapter'], param_dict['name']), 'r') as myfile:
        db = Postgres("postgres://%s:%s@localhost/%s" % (
            os.environ['CHURN_DB_USER'], os.environ['CHURN_DB_PASS'], os.environ['CHURN_DB']))

        # prefix the search path onto the listing, which does not specify the schema
        sql = "set search_path = '%s'; " % param_dict['schema']

        # load the sql file
        sql = sql + myfile.read()

        # bind any parameters that match strings in the sql
        # (reserved_param_keywords is a module-level tuple of keys to skip)
        param_keys = [p for p in param_dict.keys() if p not in reserved_param_keywords]
        for p in param_keys:
            sql = sql.replace(p, str(param_dict[p]))

        # Print the sql (then remove the newlines)
        print('SQL:\n----------\n' + sql + '\n----------\nRESULT:')
        sql = sql.replace('\n', ' ')

        # Run in the manner indicated by the mode
        if param_dict['mode'] == 'run':
            db.run(sql)
        elif param_dict['mode'] == 'one':
            res = db.one(sql)
            print(res)
        elif param_dict['mode'] in ('top', 'save'):
            res = db.all(sql)
            df = pd.DataFrame(res)
            if param_dict['mode'] == 'save':
                save_path = '../../../fight-churn-output/' + param_dict['schema'] + '/'
                os.makedirs(save_path, exist_ok=True)
                csv_path = save_path + param_dict['schema'] + '_' + \
                    param_dict['name'].replace(param_dict['prefix'], '') + '.csv'
                print('Saving: %s' % csv_path)
                df.to_csv(csv_path, index=False)
            else:
                print(df.head(print_num_rows))
        else:
            print('Unknown run mode for SQL example')
            exit(-4)
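# Hypothetical call sketch for sql_listing above. load_and_check_listing_params
# normally builds this dictionary; the keys and values shown here are
# placeholders, and any key not in reserved_param_keywords is substituted
# verbatim into the loaded SQL.
example_params = {
    'chapter': 2,
    'name': 'listing_2_1_net_retention',
    'schema': 'churn_sim',
    'mode': 'one',
    'prefix': 'listing_2_1_',
    '%from_date': '2020-01-01',  # replaced as a literal string in the SQL
}
sql_listing(example_params)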
def sql_listing(chapter, listing, name, schema, mode, param_dict, insert=False, save_ext=None):
    """
    Run a SQL listing. The sql file is loaded, and then any non-reserved keyword in the parameters is treated as a
    string to be replaced in the sql string. The SQL is then printed out.
    :param chapter: the chapter number (selects the listings sub-directory)
    :param listing: the listing number within the chapter
    :param name: the name of the listing
    :param schema: the database schema to set in the search path
    :param mode: The allowed modes are:
        run  : The SQL returns no result
        one  : The SQL should return one row result to be printed
        top  : The SQL returns many results, print the top N (given by global print_num_rows)
        save : like top, but save all of the results to a CSV file
    :param param_dict: the bind variables as a dictionary
    :param insert: flag to use the insert form of a query; see chapter 7
    :param save_ext: optional extension appended to the saved CSV file name
    :return:
    """
    with open('../../listings/chap%d/%s.sql' % (chapter, _full_listing_name(chapter, listing, name, insert)),
              'r') as myfile:
        db = Postgres("postgres://%s:%s@localhost/%s" % (
            os.environ['CHURN_DB_USER'], os.environ['CHURN_DB_PASS'], os.environ['CHURN_DB']))

        # prefix the search path onto the listing, which does not specify the schema
        sql = "set search_path = '%s'; " % schema

        # load the sql file
        sql = sql + myfile.read()

        # bind any parameters that match strings in the sql
        param_keys = [p for p in param_dict.keys() if p not in reserved_param_keywords]
        for p in param_keys:
            sql = sql.replace(p, str(param_dict[p]))

        # Print the sql (then remove the newlines)
        print('SQL:\n----------\n' + sql + '\n----------\nRESULT:')
        sql = sql.replace('\n', ' ')

        # Run in the manner indicated by the mode
        if mode == 'run':
            db.run(sql)
        elif mode == 'one':
            res = db.one(sql)
            print(res)
        elif mode in ('top', 'save'):
            res = db.all(sql)
            df = pd.DataFrame(res)
            if mode == 'save':
                save_path = '../../../fight-churn-output/' + schema + '/'
                os.makedirs(save_path, exist_ok=True)
                csv_path = save_path + schema + '_' + name.replace('listing_{}_{}_'.format(chapter, listing), '')
                if save_ext:
                    csv_path = csv_path + '_' + save_ext
                csv_path = csv_path + '.csv'
                print('Saving: %s' % csv_path)
                df.to_csv(csv_path, index=False)
            else:
                print(df.head(print_num_rows))
        else:
            print('Unknown run mode for SQL example')
            exit(-4)
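# Hypothetical call sketch for this version of sql_listing: listing coordinates
# are passed positionally instead of in one dictionary. All names and values
# are placeholders.
sql_listing(chapter=2, listing=1, name='listing_2_1_net_retention', schema='churn_sim',
            mode='top', param_dict={'%from_date': '2020-01-01'})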
# stdlib / third-party imports inferred from usage; UtilityModel and
# FatTailledBehaviorModel are project-local model classes
import glob
import os
import random
import tempfile
from datetime import timedelta
from math import ceil

import numpy as np
import pandas as pd
import psycopg2 as post
from dateutil.relativedelta import relativedelta
from postgres import Postgres


class ChurnSimulation:

    def __init__(self, model, start, end, init_customers, seed):
        '''
        Creates the behavior/utility model objects, sets internal variables to prepare for simulation, and creates
        the database connection
        :param model: name of the behavior/utility model parameters
        :param start: start date for simulation
        :param end: end date for simulation
        :param init_customers: how many customers to create at start date
        '''
        self.model_name = model
        self.start_date = start
        self.end_date = end
        self.init_customers = init_customers
        self.monthly_growth_rate = 0.1

        self.util_mod = UtilityModel(self.model_name)
        behavior_versions = glob.glob('../conf/' + self.model_name + '_*.csv')
        self.behavior_models = {}
        self.model_list = []
        for b in behavior_versions:
            version = b[(b.find(self.model_name) + len(self.model_name) + 1):-4]
            if version in ('utility', 'population', 'country'):
                continue
            behave_mod = FatTailledBehaviorModel(self.model_name, seed, version)
            self.behavior_models[behave_mod.version] = behave_mod
            self.model_list.append(behave_mod)

        if len(self.behavior_models) > 1:
            self.population_percents = pd.read_csv('../conf/' + self.model_name + '_population.csv', index_col=0)
            self.util_mod.setChurnScale(self.behavior_models, self.population_percents)
            self.population_picker = np.cumsum(self.population_percents)

        self.country_lookup = pd.read_csv('../conf/' + self.model_name + '_country.csv')

        self.subscription_count = 0
        self.tmp_sub_file_name = os.path.join(tempfile.gettempdir(), '{}_tmp_sub.csv'.format(self.model_name))
        self.tmp_event_file_name = os.path.join(tempfile.gettempdir(), '{}_tmp_event.csv'.format(self.model_name))

        self.db = Postgres("postgres://%s:%s@localhost/%s" % (
            os.environ['CHURN_DB_USER'], os.environ['CHURN_DB_PASS'], os.environ['CHURN_DB']))
        self.con = post.connect(database=os.environ['CHURN_DB'],
                                user=os.environ['CHURN_DB_USER'],
                                password=os.environ['CHURN_DB_PASS'])

    def remove_tmp_files(self):
        '''
        Remove temp files when the simulation is over
        :return:
        '''
        os.remove(self.tmp_event_file_name)
        os.remove(self.tmp_sub_file_name)

    def pick_customer_model(self):
        choice = random.uniform(0, 1)
        for m in range(0, self.population_picker.shape[0]):
            if choice <= self.population_picker['percent'][m]:
                version_name = self.population_picker.index.values[m]
                return self.behavior_models[version_name]

    def simulate_customer(self, start_of_month):
        '''
        Simulate one customer, collecting its events and subscriptions. This function has the core interaction
        between the simulation objects. The customer is created from the behavior model, picking a random start
        date within the month. Then the customer object simulates the events for the month, and the utility model
        determines if there is a churn based on the simulated event counts.
        :param start_of_month:
        :return: the new customer object; it contains the events and subscriptions
        '''
        # customer_model = self.pick_customer_model()
        customer_model = np.random.choice(self.model_list, p=self.population_percents['pcnt'])
        new_customer = customer_model.generate_customer(start_of_month)

        customer_country = np.random.choice(self.country_lookup['country'], p=self.country_lookup['pcnt'])
        new_customer.country = customer_country

        # Pick a random start date for the subscription within the month
        end_range = start_of_month + relativedelta(months=+1)
        this_month = start_of_month + timedelta(days=random.randrange((end_range - start_of_month).days))

        churned = False
        while not churned:
            next_month = this_month + relativedelta(months=1)
            new_customer.subscriptions.append((this_month, next_month))
            month_count = new_customer.generate_events(this_month, next_month)
            churned = self.util_mod.simulate_churn(month_count, new_customer) or next_month > self.end_date
            if not churned:
                this_month = next_month

        return new_customer

    def create_customers_for_month(self, month_date, n_to_create):
        '''
        Creates all the customers for one month, by calling simulate_customer and copy_customer_to_database in a
        loop.
        :param month_date: the month start date
        :param n_to_create: number of customers to create within that month
        :return:
        '''
        total_subscriptions = 0
        total_events = 0
        for i in range(n_to_create):
            customer = self.simulate_customer(month_date)
            self.copy_customer_to_database(customer)
            total_subscriptions += len(customer.subscriptions)
            total_events += len(customer.events)
            if i % 100 == 0:
                print('Simulated customer {}/{}: {:,} subscriptions & {:,} events'.format(
                    i, n_to_create, total_subscriptions, total_events))

    def copy_customer_to_database(self, customer):
        '''
        Copy one customer's data to the database, by first writing it to temp files and then using the sql COPY
        command
        :param customer: a Customer object that has already had its simulation run
        :return:
        '''
        with open(self.tmp_sub_file_name, 'w') as tmp_file:
            for s in customer.subscriptions:
                tmp_file.write("%d,%d,'%s','%s','%s',%f,\\null,\\null,1\n" %
                               (self.subscription_count, customer.id, self.model_name,
                                s[0], s[1], 9.99))  # mrr is 9.99
                self.subscription_count += 1
        with open(self.tmp_event_file_name, 'w') as tmp_file:
            for e in customer.events:
                tmp_file.write("%d,'%s',%d\n" % (customer.id, e[0], e[1]))

        sql = "INSERT INTO {}.account VALUES({},'{}','{}',{})".format(
            self.model_name, customer.id, customer.channel, customer.date_of_birth.isoformat(),
            'NULL' if customer.country == 'None' else "'{}'".format(customer.country))
        self.db.run(sql)

        cur = self.con.cursor()
        sql = "COPY %s.subscription FROM STDIN USING DELIMITERS ',' WITH NULL AS '\\null'" % self.model_name
        with open(self.tmp_sub_file_name, 'r') as f:
            cur.copy_expert(sql, f)
        self.con.commit()
        sql = "COPY %s.event FROM STDIN USING DELIMITERS ',' WITH NULL AS '\\null'" % self.model_name
        with open(self.tmp_event_file_name, 'r') as f:
            cur.copy_expert(sql, f)
        self.con.commit()

    def truncate_old_sim(self):
        '''
        Removes an old simulation from the database, if it already exists for this model
        :return: True if it is safe to proceed (no data, or data removed); False means old data was not removed
        '''
        oldEvent = self.db.one('select count(*) from %s.event' % self.model_name)
        oldSubs = self.db.one('select count(*) from %s.subscription' % self.model_name)
        oldAccount = self.db.one('select count(*) from %s.account' % self.model_name)
        if oldEvent > 0 or oldSubs > 0 or oldAccount > 0:
            print('TRUNCATING *Events/Metrics & Subscriptions/Observations* in schema -> %s <- ...' % self.model_name)
            if input("are you sure? (enter %s to proceed) " % self.model_name) == self.model_name:
                if oldEvent > 0:
                    self.db.run('truncate table %s.event' % self.model_name)
                    self.db.run('truncate table %s.metric' % self.model_name)
                if oldAccount > 0:
                    self.db.run('truncate table %s.account' % self.model_name)
                if oldSubs > 0:
                    self.db.run('truncate table %s.subscription' % self.model_name)
                    self.db.run('truncate table %s.active_period' % self.model_name)
                    self.db.run('truncate table %s.observation' % self.model_name)
                return True
            else:
                return False
        else:
            return True

    def run_simulation(self):
        '''
        Simulation main function. First it prepares the database by truncating any old events and subscriptions,
        and inserting the event types into the database. Next it creates the initial customers by calling
        create_customers_for_month, and then it advances month by month adding new customers (also using
        create_customers_for_month.) The number of new customers for each month is determined from the growth rate.
        Note that churn is not handled at this level, but is modeled at the customer level.
        :return:
        '''
        # database setup
        if not self.truncate_old_sim():
            return
        # Any model can insert the event types
        self.behavior_models[next(iter(self.behavior_models))].insert_event_types(self.model_name, self.db)

        # Initial customer count
        print('\nCreating %d initial customers for month of %s' % (self.init_customers, self.start_date))
        self.create_customers_for_month(self.start_date, self.init_customers)
        print('Created %d initial customers with %d subscriptions for start date %s' %
              (self.init_customers, self.subscription_count, str(self.start_date)))

        # Advance to additional months
        next_month = self.start_date + relativedelta(months=+1)
        n_to_add = int(ceil(self.init_customers * self.monthly_growth_rate))  # number of new customers in first month
        while next_month < self.end_date:
            print('\nCreating %d new customers for month of %s:' % (n_to_add, next_month))
            self.create_customers_for_month(next_month, n_to_add)
            print('Created %d new customers for month %s, now %d subscriptions\n' %
                  (n_to_add, str(next_month), self.subscription_count))
            next_month = next_month + relativedelta(months=+1)
            n_to_add = int(ceil(n_to_add * (1.0 + self.monthly_growth_rate)))  # increase the new customers by growth

        self.remove_tmp_files()
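# Hypothetical driver sketch for the simulation class above; the model name,
# date range, and customer count are placeholders, and the CHURN_DB_*
# environment variables must already be set.
from datetime import date

churn_sim = ChurnSimulation(model='socialnet', start=date(2020, 1, 1),
                            end=date(2020, 6, 1), init_customers=100, seed=42)
churn_sim.run_simulation()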
import json
import os
from math import ceil

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas
from postgres import Postgres


class MetricCalculator:

    def __init__(self, schema):
        '''
        Initialize metric calculator from schema name. Loads parameter json from the adjacent conf directory.
        Loads date range from the configuration. Makes postgres connection with environment variables.
        :param schema:
        '''
        with open('../conf/%s_metrics.json' % schema, 'r') as myfile:
            self.metric_dict = json.loads(myfile.read())
        self.schema = schema
        self.from_date = self.metric_dict['date_range']['from_date']
        self.to_date = self.metric_dict['date_range']['to_date']
        self.non_metrics = ('date_range', 'event_properties')
        self.URI = "postgres://%s:%s@localhost/%s" % (
            os.environ['CHURN_DB_USER'], os.environ['CHURN_DB_PASS'], os.environ['CHURN_DB'])
        self.db = Postgres(self.URI)
        with open('../sql/qa_metric.sql', 'r') as myfile:
            self.qa_sql = myfile.read().replace('\n', ' ')

    def remove_old_metrics_from_db(self, run_mets=None, no_warn=False):
        '''
        Delete values of existing metrics. If no metrics are specified, it truncates the metric table.
        Otherwise it just deletes the specified metrics.
        :param run_mets: list of strings, metric names; or else None, meaning truncate all metrics
        :return:
        '''
        if run_mets is None:
            print('TRUNCATING *Metrics* in schema -> %s <- ...' % self.schema)
            # abort unless the user confirms by typing the schema name
            if not no_warn and input("are you sure? (enter %s to proceed) " % self.schema) != self.schema:
                exit(0)
            self.db.run('truncate table %s.metric' % self.schema)
            self.db.run('truncate table %s.metric_name' % self.schema)
        else:
            if isinstance(run_mets, str):
                run_mets = [run_mets]
            if len(run_mets) > 1 and not no_warn:
                print('DELETING * %d * Metrics in schema -> %s <- ...' % (len(run_mets), self.schema))
                if input("are you sure? (enter %s to proceed) " % self.schema) != self.schema:
                    exit(0)
            for m in run_mets:
                id = self.get_metric_id(m)
                if id is not None:
                    deleteSql = "delete from %s.metric where metric_name_id=%d " \
                                "and metric_time between '%s'::timestamp and '%s'::timestamp" \
                                % (self.schema, id, self.from_date, self.to_date)
                    print('Clearing old values: ' + deleteSql)
                    self.db.run(deleteSql)

    def get_metric_id(self, metric_name):
        '''
        Get the id of one metric from the database by name
        :param metric_name: string name of the metric
        :return: id number of the metric, assuming one was found; or else SQL returns NULL as None in Python
        '''
        sql = "select metric_name_id from %s.metric_name where metric_name='%s'" % (self.schema, metric_name)
        return self.db.one(sql)

    def add_metric_id(self, metric):
        '''
        Add an id for a metric if it doesn't already exist
        :param metric: string name of the metric
        :return:
        '''
        id = self.get_metric_id(metric)
        if id is None:
            id = self.db.one('select max(metric_name_id)+1 from %s.metric_name' % self.schema)
            if id is None:
                id = 0
            insertNameSql = "insert into %s.metric_name (metric_name_id,metric_name) values (%d,'%s')" % (
                self.schema, id, metric)
            self.db.run(insertNameSql)
            print('Inserted metric %s.%s as id %d' % (self.schema, metric, id))
        return id

    def metric_qa_plot(self, metric, args):
        save_path = '../../../fight-churn-output/' + self.schema + '/'
        os.makedirs(save_path, exist_ok=True)
        print('Checking metric %s.%s' % (self.schema, metric))
        id = self.get_metric_id(metric)
        if id is None:
            print("No ID found for metric %s" % metric)
            return
        aSql = self.qa_sql.replace('%metric_name_id', str(id))
        aSql = aSql.replace('%schema', self.schema)
        aSql = aSql.replace('%from_date', self.from_date)
        aSql = aSql.replace('%to_date', self.to_date)
        print(aSql)
        res = pandas.read_sql_query(aSql, self.URI)
        if res.shape[0] == 0 or res['avg_val'].isnull().values.all():
            print('\t*** No result for %s' % metric)
            return
        cleanedName = ''.join(e for e in metric if e.isalnum())
        # res.to_csv(save_path + cleanedName + '_metric_qa.csv', index=False)  # uncomment to save details

        plt.figure(figsize=(8, 10))
        plt.subplot(4, 1, 1)
        plt.plot('calc_date', 'max_val', data=res, marker='', color='red', linewidth=2, label="max")
        if args.hideax:
            plt.gca().get_xaxis().set_visible(False)  # hide the x axis labels on this panel
        plt.ylim(0, ceil(1.1 * res['max_val'].dropna().max()))
        plt.legend()
        plt.title(metric)
        plt.subplot(4, 1, 2)
        plt.plot('calc_date', 'avg_val', data=res, marker='', color='green', linewidth=2, label='avg')
        if args.hideax:
            plt.gca().get_xaxis().set_visible(False)  # hide the x axis labels on this panel
        plt.ylim(0, ceil(1.1 * res['avg_val'].dropna().max()))
        plt.legend()
        plt.subplot(4, 1, 3)
        plt.plot('calc_date', 'min_val', data=res, marker='', color='blue', linewidth=2, label='min')
        if args.hideax:
            plt.gca().get_xaxis().set_visible(False)  # hide the x axis labels on this panel
        # plt.ylim(0, ceil(2 * res['min_val'].dropna().max()))
        plt.legend()
        plt.subplot(4, 1, 4)
        plt.plot('calc_date', 'n_calc', data=res, marker='', color='black', linewidth=2, label="n_calc")
        plt.ylim(0, ceil(1.1 * res['n_calc'].dropna().max()))
        plt.legend()
        plt.gca().figure.autofmt_xdate()
        if args.hideax:
            plt.gca().get_yaxis().set_visible(False)  # hide the y axis labels on the count
            monthFormat = mdates.DateFormatter('%b')
            plt.gca().get_xaxis().set_major_formatter(monthFormat)
        else:
            plt.gcf().autofmt_xdate()
        plt.savefig(save_path + 'metric_valqa_' + cleanedName + '.' + args.format)
        plt.close()

    def qa_metrics(self, args):
        '''
        Loops over the configured metrics and makes the QA plot of each. If a list was provided (in args.metrics),
        it only runs the ones in the list.
        :param args: from argparse
        :return:
        '''
        if args.metrics is None:
            for metric in self.metric_dict.keys():
                if metric in self.non_metrics:
                    continue
                self.metric_qa_plot(metric, args)
        else:
            for metric in args.metrics:
                self.metric_qa_plot(metric, args)

    def run_one_metric_calculation(self, metric):
        '''
        Calculate one metric, by name. First adds the id, then loads the raw sql from the file. To set the bind
        variables it starts out with the second level dictionary for this metric from the main metric dictionary.
        Then it adds all of the metric parameters that are common to all metric calculations, such as from and to
        dates, the metric name id, the schema and the value name. These are put into the SQL template with a simple
        replace (the Postgres bind system was not used because it was not flexible enough.) Finally, it runs the SQL.
        :param metric: string name of the metric
        :return:
        '''
        assert metric in self.metric_dict, "No metric %s in metric dictionary!" % metric
        id = self.add_metric_id(metric)
        with open('../sql/%s.sql' % self.metric_dict[metric]['sql'], 'r') as myfile:
            sql = myfile.read().replace('\n', ' ')
        params = self.metric_dict[metric]
        params['metric_name_val'] = metric
        params['schema'] = self.schema
        params['from_date'] = self.from_date
        params['to_date'] = self.to_date
        params['metric_name_id'] = id
        bind_char = '%'
        for p in params.keys():
            sql = sql.replace(bind_char + p, str(params[p]))
        print(sql)
        self.db.run(sql)

    def calculate_metrics(self, run_mets=None):
        '''
        Loops over the configured metrics and runs them. If a list was provided, it only runs the ones in the list.
        :param run_mets: list of strings, metric names; or else None, meaning calculate all configured metrics
        :return:
        '''
        for metric in self.metric_dict.keys():
            if (run_mets is not None and metric not in run_mets) or metric in self.non_metrics:
                continue
            self.run_one_metric_calculation(metric)
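# Hypothetical usage sketch for MetricCalculator: calculate every configured
# metric for a schema, then clear one by name. The schema and metric names are
# placeholders for entries in the <schema>_metrics.json configuration.
calc = MetricCalculator('churn_sim')
calc.calculate_metrics()
calc.remove_old_metrics_from_db(run_mets=['post_per_month'], no_warn=True)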
    # (tail of bbcode_to_markdown, which returns the converted text)
    return parser.markdown


if __name__ == "__main__":
    from sys import argv, stdin, stdout

    if len(argv) == 1:
        # read/write raw bytes so the explicit UTF-8 handling works under Python 3
        stdout.buffer.write(
            bbcode_to_markdown(stdin.buffer.read().decode('utf-8', 'replace')).encode('utf-8'))
    else:
        from postgres import Postgres

        _, db_uri, db_table, id_column, db_column = argv
        info = {"table": db_table, "column": db_column, "id": id_column}
        db = Postgres(db_uri)
        count = db.one('SELECT COUNT(*) FROM "{table}"'.format(**info))
        query = 'UPDATE "{table}" SET "{column}" = %(markdown)s WHERE "{id}" = %(id)s'.format(**info)
        failures = []
        for i, row in enumerate(db.all('SELECT "{id}", "{column}" FROM "{table}"'.format(**info))):
            print("\x1b[0G{done}/{total}".format(done=i, total=count), end="")
            stdout.flush()
            try:
                db.run(query, {"id": getattr(row, id_column),
                               "markdown": bbcode_to_markdown(getattr(row, db_column))})
            except Exception as e:
                print()
                print(e)
                failures.append(getattr(row, id_column))
        print()
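# Hypothetical invocation sketch for the script above (the file name, URI, and
# table/column names are placeholders):
#
#   cat post.bbcode | python bbcode_to_markdown.py
#   python bbcode_to_markdown.py postgres://user:pass@localhost/forum posts id body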
# imports as in the previous simulation listing (os, random, tempfile,
# psycopg2 as post, timedelta, relativedelta, ceil, Postgres); the
# GaussianBehaviorModel and UtilityModel classes are project-local
class ChurnSimulation:

    def __init__(self, model, start, end, init_customers, growth, churn, mrr, seed):
        '''
        Creates the behavior/utility model objects, sets internal variables to prepare for simulation, and creates
        the database connection
        :param model: name of the behavior/utility model parameters
        :param start: start date for simulation
        :param end: end date for simulation
        :param init_customers: how many customers to create at start date
        :param growth: monthly customer growth rate
        :param churn: monthly customer churn rate
        :param mrr: customer MRR
        '''
        self.model_name = model
        self.start_date = start
        self.end_date = end
        self.init_customers = init_customers
        self.monthly_growth_rate = growth
        self.monthly_churn_rate = churn
        self.mrr = mrr

        self.behave_mod = GaussianBehaviorModel(self.model_name, seed)
        self.util_mod = UtilityModel(self.model_name, self.monthly_churn_rate, self.behave_mod)
        self.subscription_count = 0
        self.tmp_sub_file_name = os.path.join(tempfile.gettempdir(), '{}_tmp_sub.csv'.format(self.model_name))
        self.tmp_event_file_name = os.path.join(tempfile.gettempdir(), '{}_tmp_event.csv'.format(self.model_name))

        self.db = Postgres("postgres://%s:%s@localhost/%s" % (
            os.environ['CHURN_DB_USER'], os.environ['CHURN_DB_PASS'], os.environ['CHURN_DB']))
        self.con = post.connect(database=os.environ['CHURN_DB'],
                                user=os.environ['CHURN_DB_USER'],
                                password=os.environ['CHURN_DB_PASS'])

    def remove_tmp_files(self):
        '''
        Remove temp files when the simulation is over
        :return:
        '''
        os.remove(self.tmp_event_file_name)
        os.remove(self.tmp_sub_file_name)

    def simulate_customer(self, start_of_month):
        '''
        Simulate one customer, collecting its events and subscriptions. This function has the core interaction
        between the simulation objects. The customer is created from the behavior model, picking a random start
        date within the month. Then the customer object simulates the events for the month, and the utility model
        determines if there is a churn based on the simulated event counts.
        :param start_of_month:
        :return: the new customer object; it contains the events and subscriptions
        '''
        new_customer = self.behave_mod.generate_customer()

        # Pick a random start date for the subscription within the month
        end_range = start_of_month + relativedelta(months=+1)
        this_month = start_of_month + timedelta(days=random.randrange((end_range - start_of_month).days))

        churned = False
        while not churned:
            next_month = this_month + relativedelta(months=1)
            new_customer.subscriptions.append((this_month, next_month))
            month_count = new_customer.generate_events(this_month, next_month)
            churned = self.util_mod.simulate_churn(month_count) or next_month > self.end_date
            if not churned:
                this_month = next_month

        return new_customer

    def create_customers_for_month(self, month_date, n_to_create):
        '''
        Creates all the customers for one month, by calling simulate_customer and copy_customer_to_database in a
        loop.
        :param month_date: the month start date
        :param n_to_create: number of customers to create within that month
        :return:
        '''
        for i in range(n_to_create):
            customer = self.simulate_customer(month_date)
            self.copy_customer_to_database(customer)
            print('Simulated customer %d: %d subscriptions, %d events' %
                  (i, len(customer.subscriptions), len(customer.events)))

    def copy_customer_to_database(self, customer):
        '''
        Copy one customer's data to the database, by first writing it to temp files and then using the sql COPY
        command
        :param customer: a Customer object that has already had its simulation run
        :return:
        '''
        with open(self.tmp_sub_file_name, 'w') as tmp_file:
            for s in customer.subscriptions:
                tmp_file.write("%d,%d,'%s','%s','%s',%f,\\null,\\null,1\n" %
                               (self.subscription_count, customer.id, self.model_name, s[0], s[1], self.mrr))
                self.subscription_count += 1
        with open(self.tmp_event_file_name, 'w') as tmp_file:
            for e in customer.events:
                tmp_file.write("%d,'%s',%d\n" % (customer.id, e[0], e[1]))

        sql = "COPY %s.subscription FROM STDIN USING DELIMITERS ',' WITH NULL AS '\\null'" % self.model_name
        cur = self.con.cursor()
        with open(self.tmp_sub_file_name, 'r') as f:
            cur.copy_expert(sql, f)
        self.con.commit()
        sql = "COPY %s.event FROM STDIN USING DELIMITERS ',' WITH NULL AS '\\null'" % self.model_name
        with open(self.tmp_event_file_name, 'r') as f:
            cur.copy_expert(sql, f)
        self.con.commit()

    def truncate_old_sim(self):
        '''
        Removes an old simulation from the database, if it already exists for this model
        :return: True if it is safe to proceed (no data, or data removed); False means old data was not removed
        '''
        oldEvent = self.db.one('select count(*) from %s.event' % self.model_name)
        oldSubs = self.db.one('select count(*) from %s.subscription' % self.model_name)
        if oldEvent > 0 or oldSubs > 0:
            print('TRUNCATING *Events & Subscriptions* in schema -> %s <- ...' % self.model_name)
            if input("are you sure? (enter %s to proceed) " % self.model_name) == self.model_name:
                if oldEvent > 0:
                    self.db.run('truncate table %s.event' % self.model_name)
                if oldSubs > 0:
                    self.db.run('truncate table %s.subscription' % self.model_name)
                return True
            else:
                return False
        else:
            return True

    def run_simulation(self):
        '''
        Simulation main function. First it prepares the database by truncating any old events and subscriptions,
        and inserting the event types into the database. Next it creates the initial customers by calling
        create_customers_for_month, and then it advances month by month adding new customers (also using
        create_customers_for_month.) The number of new customers for each month is determined from the growth rate.
        Note that churn is not handled at this level, but is modeled at the customer level.
        :return:
        '''
        # database setup
        if not self.truncate_old_sim():
            return
        self.behave_mod.insert_event_types(self.model_name, self.db)

        # Initial customer count
        print('\nCreating %d initial customers for %s start date' % (self.init_customers, self.start_date))
        self.create_customers_for_month(self.start_date, self.init_customers)
        print('Created %d initial customers with %d subscriptions for start date %s' %
              (self.init_customers, self.subscription_count, str(self.start_date)))

        # Advance to additional months
        next_month = self.start_date + relativedelta(months=+1)
        n_to_add = int(ceil(self.init_customers * self.monthly_growth_rate))  # number of new customers in first month
        while next_month < self.end_date:
            print('\nCreating %d new customers for month %s:' % (n_to_add, next_month))
            self.create_customers_for_month(next_month, n_to_add)
            print('Created %d new customers for month %s, now %d subscriptions\n' %
                  (n_to_add, str(next_month), self.subscription_count))
            next_month = next_month + relativedelta(months=+1)
            n_to_add = int(ceil(n_to_add * (1.0 + self.monthly_growth_rate)))  # increase the new customers by growth

        self.remove_tmp_files()
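# Hypothetical driver sketch for this simpler simulation; all argument values
# are placeholders.
from datetime import date

sim = ChurnSimulation(model='socialnet', start=date(2020, 1, 1), end=date(2020, 6, 1),
                      init_customers=100, growth=0.1, churn=0.05, mrr=9.99, seed=42)
sim.run_simulation()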
import os

import numpy as np
from postgres import Postgres
# `batch` is a project-local helper that yields fixed-size chunks of a list


class PostgreSQLDB(object):
    FIELD_SONG_ID = 'song_id'
    FIELD_SONGNAME = 'song_name'
    FIELD_OFFSET = 'time'
    FIELD_HASH = 'hash'

    SONG_ID = 'song_id'
    SONG_NAME = 'song_name'
    CONFIDENCE = 'confidence'
    MATCH_TIME = 'match_time'
    OFFSET = 'time'
    OFFSET_SECS = 'offset_seconds'

    db = None

    # tables
    FINGERPRINTS_TABLENAME = 'fingerprints'
    SONGS_TABLENAME = 'songs'

    # creates
    CREATE_FINGERPRINTS_TABLE = """
        CREATE TABLE IF NOT EXISTS "%s"(
            "%s" INT PRIMARY KEY NOT NULL,
            "%s" INT NOT NULL,
            "%s" INT NOT NULL);""" % (
        FINGERPRINTS_TABLENAME, FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET)

    CREATE_SONGS_TABLE = """
        CREATE TABLE IF NOT EXISTS "%s"(
            "%s" SERIAL PRIMARY KEY,
            "%s" varchar(250) NOT NULL);""" % (
        SONGS_TABLENAME, FIELD_SONG_ID, FIELD_SONGNAME)

    SELECT_SONG = """SELECT %s FROM %s WHERE %s = %%s;""" % (
        FIELD_SONGNAME, SONGS_TABLENAME, FIELD_SONG_ID)

    # insert a fingerprint; update the existing row on conflict
    INSERT_FINGERPRINT = """INSERT INTO %s VALUES (%%s, %%s, %%s)
        ON CONFLICT (%s) DO UPDATE SET %s = EXCLUDED.%s, %s = EXCLUDED.%s;""" % (
        FINGERPRINTS_TABLENAME, FIELD_HASH, FIELD_SONG_ID, FIELD_SONG_ID, FIELD_OFFSET, FIELD_OFFSET)

    INSERT_SONG = "INSERT INTO %s (%s) VALUES (%%s);" % (SONGS_TABLENAME, FIELD_SONGNAME)

    SELECT_MULTIPLE = """SELECT %s, %s, %s FROM %s WHERE %s IN (%%s);""" % (
        FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET, FINGERPRINTS_TABLENAME, FIELD_HASH)

    def __init__(self, drop_tables=False):
        super(PostgreSQLDB, self).__init__()
        if os.environ.get('DOCKERCLOUD_SERVICE_HOSTNAME', None) is not None:
            self.db = Postgres(u"postgres://*****:*****@postgres/hashes")
        else:
            # self.db = Postgres(u"postgres://*****:*****@localhost/postgres")
            self.db = Postgres(u"postgres://*****:*****@pervasivesounds.com/hashes")
        if drop_tables:
            self.db.run("DROP TABLE IF EXISTS %s CASCADE" % self.SONGS_TABLENAME)
            self.db.run("DROP TABLE IF EXISTS %s CASCADE" % self.FINGERPRINTS_TABLENAME)
        self.db.run(self.CREATE_SONGS_TABLE)
        self.db.run(self.CREATE_FINGERPRINTS_TABLE)

    def store(self, name, hashes):
        sid = self.insert_song(name)
        self.insert_hashes(sid, hashes)

    def insert_hash(self, song_id, hash, offset):
        self.db.run(self.INSERT_FINGERPRINT, (hash, song_id, offset))

    def insert_hashes(self, sid, hashes):
        values = []
        for time_, hash_ in hashes:
            values.append((int(hash_), sid, time_))
        with self.db.get_cursor() as cur:
            for split_values in batch(values, 1000):
                cur.executemany(self.INSERT_FINGERPRINT, split_values)

    def insert_song(self, songname):
        """
        Inserts a song in the database and returns the ID of the inserted record.
        """
        self.db.run(self.INSERT_SONG, (songname,))
        return self.db.one("SELECT %s FROM %s ORDER BY %s DESC LIMIT 1" % (
            self.FIELD_SONG_ID, self.SONGS_TABLENAME, self.FIELD_SONG_ID))

    def get_song_by_id(self, sid):
        """
        Returns a song by its ID.
        """
        return self.db.one(self.SELECT_SONG, (int(sid),))

    def return_matches(self, hashes):
        mapper = {}
        for offset, hash in hashes:
            mapper[int(hash)] = offset

        # Get an iterable of all the hashes we need
        values = list(mapper.keys())

        res = []
        if hashes is not None:
            for split_values in batch(values, 100):
                # expand the IN (...) clause to one placeholder per hash
                query = self.SELECT_MULTIPLE
                query %= ', '.join(["%s"] * len(split_values))
                res.extend(self.db.all(query, split_values, back_as=tuple))
        return np.asarray([(sid, offset - mapper[hash]) for (hash, sid, offset) in res])

    def get_best_sids(self, matches):
        unique, counts = np.unique(matches[:, 0], return_counts=True)
        return unique[np.argsort(counts)[::-1][:np.minimum(len(counts), 20)]]

    def align_matches(self, matches, sids):
        """
        Finds hash matches that align in time with other matches and finds consensus about which hashes are "true"
        signal from the audio. Returns a list of dictionaries with match information.
        """
        # align by diffs
        diff_counter = {}
        largest = 0
        largest_count = 0
        song_id = -1
        for sid in sids:
            for sid, diff in matches[matches[:, 0] == sid]:
                if sid not in diff_counter:
                    diff_counter[sid] = {}
                if diff not in diff_counter[sid]:
                    diff_counter[sid][diff] = 0
                diff_counter[sid][diff] += 1
                if diff_counter[sid][diff] > largest_count:
                    largest = diff
                    largest_count = diff_counter[sid][diff]
                    song_id = sid

        # total_count = {}
        # for sid in diff_counter.keys():
        #     total_count[sid] = np.sum(diff_counter[sid].values)

        songs = []
        for sid in diff_counter.keys():
            song_name = self.get_song_by_id(sid)
            for diff in diff_counter[sid].keys():
                confidence = diff_counter[sid][diff]
                if confidence > 4:
                    songs.append({
                        'song_id': song_id,
                        'song_name': song_name,
                        'confidence': confidence,
                        'offset': diff
                    })
        return songs
class TestCache(TestCase):

    def setUp(self):
        self.db = Postgres(cache=Cache(max_size=1), cursor_factory=SimpleTupleCursor)
        self.db.run("DROP SCHEMA IF EXISTS public CASCADE")
        self.db.run("CREATE SCHEMA public")
        self.db.run("CREATE TABLE foo (key text, value int)")
        self.db.run("INSERT INTO foo VALUES ('a', 1)")
        self.db.run("INSERT INTO foo VALUES ('b', 2)")

    def test_one_returns_cached_row(self):
        query = "SELECT * FROM foo WHERE key = 'a'"
        r1 = self.db.one(query, max_age=10)
        r2 = self.db.one(query, max_age=10)
        assert r2 is r1

    def test_all_returns_cached_rows(self):
        query = "SELECT * FROM foo ORDER BY key"
        r1 = self.db.all(query, max_age=10)
        r2 = self.db.all(query, max_age=10)
        assert r2 == r1
        assert r2 is not r1
        assert r2[0] is r1[0]

    def test_back_as_is_compatible_with_caching(self):
        query = "SELECT * FROM foo WHERE key = 'a'"
        r1 = self.db.one(query, back_as=dict, max_age=10)
        r2 = self.db.one(query, back_as=namedtuple, max_age=10)
        assert r1 == r2._asdict()
        rows = self.db.all(query, back_as='Row', max_age=10)
        assert rows == [r1]

    def test_all_returns_row_cached_by_one(self):
        query = "SELECT * FROM foo WHERE key = 'a'"
        row = self.db.one(query, max_age=10)
        rows = self.db.all(query, max_age=10)
        assert rows == [row]
        assert rows[0] is row

    def test_one_raises_TooMany_when_the_cache_contains_multiple_rows(self):
        query = "SELECT * FROM foo"
        rows = self.db.all(query, max_age=10)
        assert len(rows) == 2
        with self.assertRaises(TooMany):
            self.db.one(query, max_age=10)

    def test_cache_max_size(self):
        query1 = b"SELECT * FROM foo WHERE key = 'a'"
        query2 = b"SELECT * FROM foo WHERE key = 'b'"
        self.db.all(query1, max_age=10)
        assert set(self.db.cache.entries.keys()) == {query1}
        self.db.all(query2, max_age=10)
        assert set(self.db.cache.entries.keys()) == {query2}

    def test_cache_max_age(self):
        query = b"SELECT * FROM foo WHERE key = 'a'"
        r1 = self.db.one(query, max_age=0)
        r2 = self.db.one(query, max_age=10)
        assert r2 is not r1

    def test_cache_prune(self):
        self.db.cache.max_size = 2
        query1 = b"SELECT * FROM foo WHERE key = 'a'"
        query2 = b"SELECT * FROM foo WHERE key = 'b'"
        self.db.one(query1, max_age=-1)
        self.db.one(query2, max_age=10)
        assert set(self.db.cache.entries.keys()) == {query1, query2}
        self.db.cache.prune()
        assert set(self.db.cache.entries.keys()) == {query2}

    def test_cache_prevents_concurrent_queries(self):
        with self.db.get_cursor() as cursor:
            cursor.run("LOCK TABLE foo IN EXCLUSIVE MODE")

            def insert():
                self.db.one("INSERT INTO foo VALUES ('c', 3) RETURNING *", max_age=1)

            t1 = Thread(target=insert)
            t2 = Thread(target=insert)
            t1.start()
            t2.start()
            cursor.run("COMMIT")  # this releases the table lock
        t1.join()
        t2.join()
        n = self.db.one("SELECT count(*) FROM foo WHERE key = 'c'")
        assert n == 1
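# A minimal caching sketch based on the tests above, assuming the installed
# postgres.py version exposes the Cache class and max_age keyword shown there:
# identical queries issued within max_age seconds are served from the cache
# instead of hitting the server again.
#
#   db = Postgres(cache=Cache(max_size=100))
#   settings = db.one("SELECT * FROM foo WHERE key = 'a'", max_age=60)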
from postgres import Postgres

FIELD_SONG_ID = 'song_id'  # restored from the class definition above; used in the queries below
FIELD_SONGNAME = 'song_name'
FIELD_OFFSET = 'time'
FIELD_HASH = 'hash'

SONG_ID = 'song_id'
SONG_NAME = 'song_name'
CONFIDENCE = 'confidence'
MATCH_TIME = 'match_time'
OFFSET = 'time'
OFFSET_SECS = 'offset_seconds'

FINGERPRINTS_TABLENAME = 'fingerprints'
SONGS_TABLENAME = 'songs'

# reltuples gives a fast approximate row count from the planner statistics
COUNT_ROWS = "SELECT reltuples AS approximate_row_count FROM pg_class WHERE relname = '%s';"

postgres_url = '52.49.153.98:5432'
db = Postgres(u"postgres://*****:*****@%s/hashes" % postgres_url)

print("Number of hashes: %d" % db.one(COUNT_ROWS % FINGERPRINTS_TABLENAME))
# print("Number of songs: %d" % db.one(COUNT_ROWS % SONGS_TABLENAME))

print("\nSong table first and last element:")
print(db.one("SELECT %s, %s FROM %s ORDER BY %s ASC LIMIT 1" % (
    FIELD_SONG_ID, FIELD_SONGNAME, SONGS_TABLENAME, FIELD_SONG_ID)))
print(db.one("SELECT %s, %s FROM %s ORDER BY %s DESC LIMIT 1" % (
    FIELD_SONG_ID, FIELD_SONGNAME, SONGS_TABLENAME, FIELD_SONG_ID)))

print("\nFingerprint table first and last element:")
first_hash = db.one("SELECT %s, %s FROM %s ORDER BY %s ASC LIMIT 1" % (
    FIELD_HASH, FIELD_SONG_ID, FINGERPRINTS_TABLENAME, FIELD_SONG_ID))
print(db.one("SELECT * FROM %s WHERE %s=%d;" % (SONGS_TABLENAME, FIELD_SONG_ID, first_hash.song_id)))
last_hash = db.one("SELECT %s, %s FROM %s ORDER BY %s DESC LIMIT 1" % (
    FIELD_HASH, FIELD_SONG_ID, FINGERPRINTS_TABLENAME, FIELD_SONG_ID))
print(db.one("SELECT * FROM %s WHERE %s=%d;" % (SONGS_TABLENAME, FIELD_SONG_ID, last_hash.song_id)))

print("\nSize of database: %s" % db.one("SELECT pg_size_pretty(pg_database_size('hashes'));"))
print("Size of %s table: %s" % (
    FINGERPRINTS_TABLENAME,
    db.one("SELECT pg_size_pretty(pg_table_size('%s'));" % FINGERPRINTS_TABLENAME)))
print("Size of %s table: %s" % (
    SONGS_TABLENAME,
    db.one("SELECT pg_size_pretty(pg_table_size('%s'));" % SONGS_TABLENAME)))