Example #1
from postgres import Postgres


def connect():
    """ Connect to the PostgreSQL database server """
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        # use a postgres connection string below
        db = Postgres(
            "host=localhost dbname=michaelmostachetti user=michaelmostachetti password=password"
        )

        # This is where you will run your four queries
        # You can use either .one(), or .run() depending if you care
        # about the return values of the queries

        # Query 1
        result1 = db.one("SELECT version();")
        print(result1)
        # Query 2
        db.run("SELECT version();")
        # Query 3
        db.run("SELECT version();")
        # Query 4
        db.run("SELECT version();")
    except Exception as error:
        print("An error occurred:", error)
Example #2
from datetime import datetime
from typing import Optional

import postgres


def get_last_event_id(db: postgres.Postgres, room_id: str,
                      before: datetime) -> Optional[str]:
    return db.one(
        "SELECT event_id FROM events WHERE received_ts <= %(before)s AND room_id=%(room_id)s ORDER BY received_ts DESC LIMIT 1",
        before=int(before.timestamp() * 1000),
        room_id=room_id,
    )
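
A hypothetical call to the function above, showing how postgres.py maps the keyword arguments onto the %(before)s and %(room_id)s placeholders; the room id is made up:

# Hypothetical usage; db is a postgres.Postgres instance
from datetime import datetime, timezone

cutoff = datetime(2020, 1, 1, tzinfo=timezone.utc)
last_id = get_last_event_id(db, "!abc123:example.org", before=cutoff)
print(last_id)  # None when no event was received before the cutoff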
Example #3
import os

import pandas as pd
from postgres import Postgres

# print_num_rows and reserved_param_keywords are module-level globals, not shown


def sql_listing(param_dict):
    '''
    Run a SQL listing.  The sql file is loaded, and then any non-reserved keyword in the parameters is treated as a
    string to be replaced in the sql string. The SQL is then printed out, before newlines are removed, and then run
    in one of the allowed modes.  The allowed modes are:
        run : the SQL returns no result
        one : the SQL should return one row result, which is printed
        top : the SQL returns many results; print the top N (given by global print_num_rows)
        save : the SQL returns many results, which are saved to a csv file
    :param param_dict: dictionary produced by load_and_check_listing_params
    :return:
    '''

    with open('../../listings/chap%d/%s.sql' % (param_dict['chapter'], param_dict['name']), 'r') as myfile:
        db = Postgres("postgres://%s:%s@localhost/%s" % (os.environ['CHURN_DB_USER'],os.environ['CHURN_DB_PASS'],os.environ['CHURN_DB']))

        # prefix the search path onto the listing, which does not specify the schema
        sql = "set search_path = '%s'; " % param_dict['schema'];

        # load the sql file
        sql = sql + myfile.read()

        # bind any parameters that match strings in the sql
        param_keys = [p for p in param_dict.keys() if p not in reserved_param_keywords]
        for p in param_keys:
            sql = sql.replace(p, str(param_dict[p]))

        # Print the sql (then remove the newlines)
        print('SQL:\n----------\n'+sql+'\n----------\nRESULT:')
        sql = sql.replace('\n', ' ')

        # Run in the manner indicated by the mode
        if param_dict['mode'] == 'run':
            db.run(sql)
        elif param_dict['mode'] == 'one':
            res = db.one(sql)
            print(res)
        elif param_dict['mode'] == 'top' or param_dict['mode'] == 'save':
            res = db.all(sql)
            df = pd.DataFrame(res)
            if param_dict['mode'] == 'save':
                save_path = '../../../fight-churn-output/' + param_dict['schema'] + '/'
                os.makedirs(save_path, exist_ok=True)
                csv_path = save_path + param_dict['schema'] + '_' + param_dict['name'].replace(param_dict['prefix'], '') + '.csv'
                print('Saving: %s' % csv_path)
                df.to_csv(csv_path, index=False)
            else:
                print(df.head(print_num_rows))
        else:
            print('Unknown run mode for SQL example')
            exit(-4)
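
A sketch of how sql_listing might be invoked. The keys mirror those used in the code above, but the concrete values and the %from_date placeholder (which would have to appear verbatim in the .sql file) are made up:

# Hypothetical invocation of sql_listing
params = {
    'chapter': 2,
    'name': 'listing_2_1_net_retention',
    'prefix': 'listing_2_1_',
    'schema': 'churn_model',
    'mode': 'one',
    '%from_date': '2020-01-01',
}
sql_listing(params)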
Example #4
import os

import pandas as pd
from postgres import Postgres

# print_num_rows, reserved_param_keywords and _full_listing_name are module-level, not shown


def sql_listing(chapter,
                listing,
                name,
                schema,
                mode,
                param_dict,
                insert=False,
                save_ext=None):
    """

    Run a SQL listing.  The sql file is loaded, and then any non-reserved keyword in the parameters is treated as a
    string to be replaced in the sql string. The SQL is then printed out.

    :param chapter:
    :param listing:
    :param name:
    :param schema:
    :param mode:The allowed modes are:
        run : The SQL returns no result
        one : The SQL should return one row result to be printed
        top : The SQL returns many results, print the top N (given by global print_num_rows)
    :param param_dict: the bind variables as a dictionary
    :param insert: flag to use the insert form of a query; see chapter 7
    :param save_ext:
    :return:
    """

    with open(
            '../../listings/chap%d/%s.sql' %
        (chapter, _full_listing_name(chapter, listing, name, insert)),
            'r') as myfile:

        db = Postgres("postgres://%s:%s@localhost/%s" %
                      (os.environ['CHURN_DB_USER'],
                       os.environ['CHURN_DB_PASS'], os.environ['CHURN_DB']))

        # prefix the search path onto the listing, which does not specify the schema
        sql = "set search_path = '%s'; " % schema

        # load the sql file
        sql = sql + myfile.read()

        # bind any parameters that match strings in the sql
        param_keys = [
            p for p in param_dict.keys() if p not in reserved_param_keywords
        ]
        for p in param_keys:
            sql = sql.replace(p, str(param_dict[p]))

        # Print the sql (then remove the newlines)
        print('SQL:\n----------\n' + sql + '\n----------\nRESULT:')
        sql = sql.replace('\n', ' ')

        # Run in the manner indicated by the mode
        if mode == 'run':
            db.run(sql)
        elif mode == 'one':
            res = db.one(sql)
            print(res)
        elif mode == 'top' or mode == 'save':
            res = db.all(sql)
            df = pd.DataFrame(res)
            if mode == 'save':
                save_path = '../../../fight-churn-output/' + schema + '/'
                os.makedirs(save_path, exist_ok=True)
                csv_path = save_path + schema + '_' + name.replace(
                    'listing_{}_{}_'.format(chapter, listing), '')
                if save_ext:
                    csv_path = csv_path + '_' + save_ext
                csv_path = csv_path + '.csv'
                print('Saving: %s' % csv_path)
                df.to_csv(csv_path, index=False)
            else:
                print(df.head(print_num_rows))
        else:
            print('Unknown run mode for SQL example')
            exit(-4)
Example #5
import glob
import os
import random
import tempfile
from datetime import timedelta
from math import ceil

import numpy as np
import pandas as pd
import psycopg2 as post
from dateutil.relativedelta import relativedelta
from postgres import Postgres

# UtilityModel and FatTailledBehaviorModel are project-local modules, not shown


class ChurnSimulation:
    def __init__(self, model, start, end, init_customers, seed):
        '''
        Creates the behavior/utility model objects, sets internal variables to prepare for simulation, and creates
        the database connection

        :param model: name of the behavior/utility model parameters
        :param start: start date for simulation
        :param end: end date for simulation
        :param init_customers: how many customers to create at start date
        :param seed: random seed for the behavior models
        '''

        self.model_name = model
        self.start_date = start
        self.end_date = end
        self.init_customers = init_customers
        self.monthly_growth_rate = 0.1

        self.util_mod = UtilityModel(self.model_name)
        behavior_versions = glob.glob('../conf/' + self.model_name + '_*.csv')
        self.behavior_models = {}
        self.model_list = []
        for b in behavior_versions:
            version = b[(b.find(self.model_name) + len(self.model_name) +
                         1):-4]
            if version in ('utility', 'population', 'country'):
                continue
            behave_mod = FatTailledBehaviorModel(self.model_name, seed,
                                                 version)
            self.behavior_models[behave_mod.version] = behave_mod
            self.model_list.append(behave_mod)

        if len(self.behavior_models) > 1:
            self.population_percents = pd.read_csv(
                '../conf/' + self.model_name + '_population.csv', index_col=0)
        else:
            # with a single behavior model, all customers are drawn from it
            self.population_percents = pd.DataFrame(
                {'pcnt': [1.0]}, index=list(self.behavior_models.keys()))
        self.util_mod.setChurnScale(self.behavior_models,
                                    self.population_percents)
        self.population_picker = np.cumsum(self.population_percents)

        self.country_lookup = pd.read_csv('../conf/' + self.model_name +
                                          '_country.csv')

        self.subscription_count = 0
        self.tmp_sub_file_name = os.path.join(
            tempfile.gettempdir(), '{}_tmp_sub.csv'.format(self.model_name))
        self.tmp_event_file_name = os.path.join(
            tempfile.gettempdir(), '{}_tmp_event.csv'.format(self.model_name))

        self.db = Postgres(
            "postgres://%s:%s@localhost/%s" %
            (os.environ['CHURN_DB_USER'], os.environ['CHURN_DB_PASS'],
             os.environ['CHURN_DB']))

        self.con = post.connect(database=os.environ['CHURN_DB'],
                                user=os.environ['CHURN_DB_USER'],
                                password=os.environ['CHURN_DB_PASS'])

    def remove_tmp_files(self):
        '''
        Remove temp files when the simulation is over
        :return:
        '''
        os.remove(self.tmp_event_file_name)
        os.remove(self.tmp_sub_file_name)

    def pick_customer_model(self):
        choice = random.uniform(0, 1)
        for m in range(0, self.population_picker.shape[0]):
            if choice <= self.population_picker['percent'][m]:
                version_name = self.population_picker.index.values[m]
                return self.behavior_models[version_name]

    def simulate_customer(self, start_of_month):
        '''
        Simulate one customer collecting its events and subscriptions.

        This function has the core interaction between the simulation objects.  A customer is created from the behavior
        model, with a random start date picked within the month.  Then the customer object simulates the events for
        the month, and the utility model determines if there is a churn based on the simulated event counts.

        :param start_of_month:
        :return: the new customer object; it contains the events and subscriptions
        '''
        # customer_model = self.pick_customer_model()
        customer_model = np.random.choice(self.model_list,
                                          p=self.population_percents['pcnt'])
        new_customer = customer_model.generate_customer(start_of_month)

        customer_country = np.random.choice(self.country_lookup['country'],
                                            p=self.country_lookup['pcnt'])
        new_customer.country = customer_country

        # Pick a random start date for the subscription within the month
        end_range = start_of_month + relativedelta(months=+1)
        this_month = start_of_month + timedelta(
            days=random.randrange((end_range - start_of_month).days))

        churned = False
        while not churned:
            next_month = this_month + relativedelta(months=1)
            new_customer.subscriptions.append((this_month, next_month))
            month_count = new_customer.generate_events(this_month, next_month)
            churned = self.util_mod.simulate_churn(
                month_count, new_customer) or next_month > self.end_date
            if not churned:
                this_month = next_month
        return new_customer

    def create_customers_for_month(self, month_date, n_to_create):
        '''
        Creates all the customers for one month, by calling simulate_customer and copy_customer_to_database in a loop.
        :param month_date: the month start date
        :param n_to_create: number of customers to create within that month
        :return:
        '''

        total_subscriptions = 0
        total_events = 0
        for i in range(n_to_create):
            customer = self.simulate_customer(month_date)
            self.copy_customer_to_database(customer)
            total_subscriptions += len(customer.subscriptions)
            total_events += len(customer.events)
            if i % 100 == 0:
                print(
                    'Simulated customer {}/{}: {:,} subscriptions & {:,} events'
                    .format(i, n_to_create, total_subscriptions, total_events))

    def copy_customer_to_database(self, customer):
        '''
        Copy one customer's data to the database, by first writing it to temp files and then using the sql COPY command
        :param customer: a Customer object that has already had its simulation run
        :return:
        '''
        with open(self.tmp_sub_file_name, 'w') as tmp_file:
            for s in customer.subscriptions:
                tmp_file.write("%d,%d,'%s','%s','%s',%f,\\null,\\null,1\n" % \
                               (self.subscription_count, customer.id, self.model_name, s[0], s[1], 9.99)) # mrr is 9.99
                self.subscription_count += 1
        with open(self.tmp_event_file_name, 'w') as tmp_file:
            for e in customer.events:
                tmp_file.write("%d,'%s',%d\n" % (customer.id, e[0], e[1]))

        sql = "INSERT INTO {}.account VALUES({},'{}','{}',{})".format(
            self.model_name, customer.id, customer.channel,
            customer.date_of_birth.isoformat(), 'NULL'
            if customer.country == 'None' else "'{}'".format(customer.country))
        self.db.run(sql)

        cur = self.con.cursor()

        sql = "COPY %s.subscription FROM STDIN USING DELIMITERS ',' WITH NULL AS '\\null'" % (
            self.model_name)
        with open(self.tmp_sub_file_name, 'r') as f:
            cur.copy_expert(sql, f)
        self.con.commit()

        sql = "COPY %s.event FROM STDIN USING DELIMITERS ',' WITH NULL AS '\\null'" % (
            self.model_name)
        with open(self.tmp_event_file_name, 'r') as f:
            cur.copy_expert(sql, f)
        self.con.commit()

    def truncate_old_sim(self):
        '''
        Removes an old simulation from the database, if it already exists for this model
        :return: True if is safe to proceed (no data or data removed); False means old data not removed
        '''
        oldEvent = self.db.one('select count(*) from %s.event' %
                               self.model_name)
        oldSubs = self.db.one('select count(*) from %s.subscription' %
                              self.model_name)
        oldAccount = self.db.one('select count(*) from %s.account' %
                                 self.model_name)
        if oldEvent > 0 or oldSubs > 0 or oldAccount > 0:
            print(
                'TRUNCATING *Events/Metrics & Subscriptions/Observations* in schema -> %s <-  ...'
                % self.model_name)
            if input("are you sure? (enter %s to proceed) " %
                     self.model_name) == self.model_name:
                if oldEvent > 0:
                    self.db.run('truncate table %s.event' % self.model_name)
                    self.db.run('truncate table %s.metric' % self.model_name)
                if oldAccount > 0:
                    self.db.run('truncate table %s.account' % self.model_name)
                if oldSubs > 0:
                    self.db.run('truncate table %s.subscription' %
                                self.model_name)
                    self.db.run('truncate table %s.active_period' %
                                self.model_name)
                    self.db.run('truncate table %s.observation' %
                                self.model_name)
                return True
            else:
                return False
        else:
            return True

    def run_simulation(self):
        '''
        Simulation main function. First it prepares the database by truncating any old events and subscriptions, and
        inserting the event types into the database.  Next it creates the initial customers by calling
        create_customers_for_month, and then it advances month by month adding new customers (also using
        create_customers_for_month.)  The number of new customers for each month is determined from the growth rate.
        Note that churn is not handled at this level, but is modeled at the customer level.
        :return:
        '''

        # database setup
        if not self.truncate_old_sim():
            return
        # Any model can insert the event types
        self.behavior_models[next(iter(
            self.behavior_models))].insert_event_types(self.model_name,
                                                       self.db)

        # Initial customer count
        print('\nCreating %d initial customers for month of %s' %
              (self.init_customers, self.start_date))
        self.create_customers_for_month(self.start_date, self.init_customers)
        print(
            'Created %d initial customers with %d subscriptions for start date %s'
            % (self.init_customers, self.subscription_count,
               str(self.start_date)))

        # Advance to additional months
        next_month = self.start_date + relativedelta(months=+1)
        n_to_add = int(ceil(self.init_customers * self.monthly_growth_rate)
                       )  # number of new customers in first month
        while next_month < self.end_date:
            print('\nCreating %d new customers for month of %s:' %
                  (n_to_add, next_month))
            self.create_customers_for_month(next_month, n_to_add)
            print(
                'Created %d new customers for month %s, now %d subscriptions\n'
                % (n_to_add, str(next_month), self.subscription_count))
            next_month = next_month + relativedelta(months=+1)
            n_to_add = int(ceil(n_to_add * (1.0 + self.monthly_growth_rate))
                           )  # increase the new customers by growth

        self.remove_tmp_files()
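
The bulk-load pattern in copy_customer_to_database above can be isolated into a short sketch. It assumes psycopg2 (imported as post in the class) and a made-up table and file; the COPY ... USING DELIMITERS form is the legacy syntax the class itself uses:

# Minimal sketch of the COPY FROM STDIN pattern; table, path and credentials are made up
import psycopg2

con = psycopg2.connect(database='mydb', user='me', password='secret')
cur = con.cursor()
with open('/tmp/rows.csv', 'r') as f:
    cur.copy_expert(
        "COPY my_schema.my_table FROM STDIN USING DELIMITERS ',' WITH NULL AS '\\null'", f)
con.commit()
con.close()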
Example #6
import json
import os
from math import ceil

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas
from postgres import Postgres


class MetricCalculator:
    def __init__(self, schema):
        '''
		Initialize metric calculator from schema name.  Loads parameter json from the adjacent conf directory.
		Loads date range from the configuration. Makes postgres connection with environment variables.
		:param schema:
		'''

        with open('../conf/%s_metrics.json' % schema, 'r') as myfile:
            self.metric_dict = json.loads(myfile.read())

        self.schema = schema
        self.from_date = self.metric_dict['date_range']['from_date']
        self.to_date = self.metric_dict['date_range']['to_date']

        self.non_metrics = ('date_range', 'event_properties')

        self.URI = "postgres://%s:%s@localhost/%s" % (
            os.environ['CHURN_DB_USER'], os.environ['CHURN_DB_PASS'],
            os.environ['CHURN_DB'])
        self.db = Postgres(self.URI)

        with open('../sql/qa_metric.sql', 'r') as myfile:
            self.qa_sql = myfile.read().replace('\n', ' ')

    def remove_old_metrics_from_db(self, run_mets=None, no_warn=False):
        '''
		Delete values of existing metrics. If no metrics are specified, it truncates the metric table. Otherwise
		just delete the specified metrics.
		:param run_mets: list of strings, metric names; or else None meaning truncate all metrics
		:param no_warn: if True, skip the confirmation prompt
		:return:
		'''
        if run_mets is None:
            print('TRUNCATING *Metrics* in schema -> %s <-  ...' % self.schema)
            if not no_warn and input(
                    "are you sure? (enter %s to proceed) " % self.schema) != self.schema:
                exit(0)
            self.db.run('truncate table %s.metric' % self.schema)
            self.db.run('truncate table %s.metric_name' % self.schema)
        else:
            if isinstance(run_mets, str):
                run_mets = [run_mets]
            if len(run_mets) > 1 and not no_warn:
                print('DELETING * %d * Metrics in schema -> %s <-  ...' %
                      (len(run_mets), self.schema))
                if input("are you sure? (enter %s to proceed) " %
                         self.schema) != self.schema:
                    exit(0)
            for m in run_mets:
                id = self.get_metric_id(m)
                if id is not None:
                    delete_sql = "delete from %s.metric where metric_name_id=%d and metric_time between '%s'::timestamp and '%s'::timestamp" \
                        % (self.schema, id, self.from_date, self.to_date)
                    print('Clearing old values: ' + delete_sql)
                    self.db.run(delete_sql)

    def get_metric_id(self, metric_name):
        '''
		Get the id of one metric from the database by name
		:param metric_name: string name of the metric
		:return: id number of the metric, assuming one was found; or else SQL returns NULL as None in Python
		'''
        sql = "select metric_name_id from %s.metric_name where metric_name='%s'" % (
            self.schema, metric_name)
        return self.db.one(sql)

    def add_metric_id(self, metric):
        '''
		Add an id for a metric if it doesn't already exist
		:param metric: string name of the metric
		:return:
		'''
        id = self.get_metric_id(metric)
        if id is None:
            id = self.db.one(
                'select max(metric_name_id)+1 from %s.metric_name' % self.schema)
            if id is None:
                id = 0
            insert_name_sql = "insert into %s.metric_name (metric_name_id,metric_name) values (%d,'%s')" % (
                self.schema, id, metric)
            self.db.run(insert_name_sql)
            print('Inserted metric %s.%s as id %d' % (self.schema, metric, id))

        return id

    def metric_qa_plot(self, metric, args):

        save_path = '../../../fight-churn-output/' + self.schema + '/'
        os.makedirs(save_path, exist_ok=True)

        print('Checking metric %s.%s' % (self.schema, metric))
        id = self.get_metric_id(metric)
        if id is None:
            print("No ID found for metric %s" % metric)
            return
        aSql = self.qa_sql.replace('%metric_name_id', str(id))
        aSql = aSql.replace('%schema', self.schema)
        aSql = aSql.replace('%from_date', self.from_date)
        aSql = aSql.replace('%to_date', self.to_date)

        print(aSql)
        res = pandas.read_sql_query(aSql, self.URI)
        if res.shape[0] == 0 or res['avg_val'].isnull().values.all():
            print('\t*** No result for %s' % metric)
            return

        cleanedName = ''.join(e for e in metric if e.isalnum())
        # res.to_csv(save_path+cleanedName+'_metric_qa.csv',index=False) # uncomment to save details

        plt.figure(figsize=(8, 10))
        plt.subplot(4, 1, 1)
        plt.plot('calc_date',
                 'max_val',
                 data=res,
                 marker='',
                 color='red',
                 linewidth=2,
                 label="max")
        if args.hideax:
            plt.gca().get_xaxis().set_visible(
                False)  # Hiding y axis labels on the count
        plt.ylim(0, ceil(1.1 * res['max_val'].dropna().max()))
        plt.legend()
        plt.title(metric)
        plt.subplot(4, 1, 2)
        plt.plot('calc_date',
                 'avg_val',
                 data=res,
                 marker='',
                 color='green',
                 linewidth=2,
                 label='avg')
        if args.hideax:
            plt.gca().get_xaxis().set_visible(
                False)  # Hiding y axis labels on the count
        plt.ylim(0, ceil(1.1 * res['avg_val'].dropna().max()))
        plt.legend()
        plt.subplot(4, 1, 3)
        plt.plot('calc_date',
                 'min_val',
                 data=res,
                 marker='',
                 color='blue',
                 linewidth=2,
                 label='min')
        if args.hideax:
            plt.gca().get_xaxis().set_visible(
                False)  # Hiding y axis labels on the count
        # plt.ylim(0, ceil(2*res['min_val'].dropna().max()))
        plt.legend()
        plt.subplot(4, 1, 4)
        plt.plot('calc_date',
                 'n_calc',
                 data=res,
                 marker='',
                 color='black',
                 linewidth=2,
                 label="n_calc")
        plt.ylim(0, ceil(1.1 * res['n_calc'].dropna().max()))
        plt.legend()
        plt.gca().figure.autofmt_xdate()
        if args.hideax:
            plt.gca().get_yaxis().set_visible(
                False)  # Hiding y axis labels on the count
            monthFormat = mdates.DateFormatter('%b')
            plt.gca().get_xaxis().set_major_formatter(monthFormat)
        else:
            plt.gcf().autofmt_xdate()

        plt.savefig(save_path + 'metric_valqa_' + cleanedName + '.' +
                    args.format)
        plt.close()

    def qa_metrics(self, args):
        '''
		Loops over the configured metrics and makes the QA plot of each.  If a list of metrics was provided in the
		arguments, it only plots the ones in the list.
		:param args: from argparse; args.metrics is a list of metric names, or None meaning plot all configured metrics
		:return:
		'''

        if args.metrics is None:
            for metric in self.metric_dict.keys():
                if metric in self.non_metrics: continue
                self.metric_qa_plot(metric, args)
        else:
            for metric in args.metrics:
                self.metric_qa_plot(metric, args)

    def run_one_metric_calculation(self, metric):
        '''
		Calculate one metric, by name.  First adds the id, then loads the raw sql from the file.  To set the bind
		variables it starts out with the second level dictionary for this metric from the main metric dictionary.
		Then it adds all of the metric parameters that are common to all metric calculations, such as the from and to
		dates, the metric name id, the schema and the value name.  These are put into the SQL template with a simple
		replace (the Postgres bind system was not used because it was not flexible enough).  Finally, it runs the SQL.
		:param metric: string name of the metric
		:return:
		'''

        assert metric in self.metric_dict, "No metric %s in metric dictionary!" % metric

        id = self.add_metric_id(metric)

        with open('../sql/%s.sql' % self.metric_dict[metric]['sql'],
                  'r') as myfile:
            sql = myfile.read().replace('\n', ' ')

        params = self.metric_dict[metric]
        params['metric_name_val'] = metric
        params['schema'] = self.schema
        params['from_date'] = self.from_date
        params['to_date'] = self.to_date
        params['metric_name_id'] = id
        bind_char = '%'
        for p in params.keys():
            sql = sql.replace(bind_char + p, str(params[p]))
        print(sql)

        self.db.run(sql)

    def calculate_metrics(self, run_mets=None):
        '''
		Loops over the configured metrics and runs them.  If a list was provided, it only runs the ones in the list.
		:param run_mets: list of strings, metric names; or else None meaning calculate all configured metrics
		:return:
		'''

        for metric in self.metric_dict.keys():
            if (run_mets is not None
                    and metric not in run_mets) or metric in self.non_metrics:
                continue

            self.run_one_metric_calculation(metric)
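
A hypothetical driver for the class above; the schema and metric names are made up, and the CHURN_DB, CHURN_DB_USER and CHURN_DB_PASS environment variables must be set:

# Hypothetical usage of MetricCalculator
calc = MetricCalculator('churn_model')
calc.remove_old_metrics_from_db(run_mets='post_per_month', no_warn=True)
calc.calculate_metrics(run_mets=['post_per_month'])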
Example #7
    return parser.markdown


if __name__ == "__main__":
    from sys import argv, stdin, stdout

    if len(argv) == 1:
        stdout.write(bbcode_to_markdown(stdin.buffer.read().decode('utf-8', 'replace')))
    else:
        from postgres import Postgres

        _, db_uri, db_table, id_column, db_column = argv
        info = {"table": db_table, "column": db_column, "id": id_column}
        db = Postgres(db_uri)

        count = db.one('SELECT COUNT(*) FROM "{table}"'.format(**info))
        query = 'UPDATE "{table}" SET "{column}" = %(markdown)s WHERE "{id}" = %(id)s'.format(**info)
        failures = []

        for i, row in enumerate(db.all('SELECT "{id}", "{column}" FROM "{table}"'.format(**info))):
            print("\x1b[0G{done}/{total}".format(done=i, total=count), end="")
            stdout.flush()

            try:
                db.run(query, {"id": getattr(row, id_column), "markdown": bbcode_to_markdown(getattr(row, db_column))})
            except Exception as e:
                print()
                print(e)
                failures.append(getattr(row, id_column))

        print()
Example #8
import os
import random
import tempfile
from datetime import timedelta
from math import ceil

import psycopg2 as post
from dateutil.relativedelta import relativedelta
from postgres import Postgres

# GaussianBehaviorModel and UtilityModel are project-local modules, not shown


class ChurnSimulation:

    def __init__(self, model, start, end, init_customers, growth, churn, mrr, seed):
        '''
        Creates the behavior/utility model objects, sets internal variables to prepare for simulation, and creates
        the database connection

        :param model: name of the behavior/utility model parameters
        :param start: start date for simulation
        :param end: end date for simulation
        :param init_customers: how many customers to create at start date
        :param growth: monthly customer growth rate
        :param churn: monthly customer churn rate
        :param mrr: customer MRR
        :param seed: random seed for the behavior model
        '''

        self.model_name = model
        self.start_date = start
        self.end_date = end
        self.init_customers = init_customers
        self.monthly_growth_rate = growth
        self.monthly_churn_rate = churn
        self.mrr = mrr

        self.behave_mod = GaussianBehaviorModel(self.model_name, seed)
        self.util_mod = UtilityModel(self.model_name, self.monthly_churn_rate, self.behave_mod)

        self.subscription_count = 0
        self.tmp_sub_file_name = os.path.join(tempfile.gettempdir(), '{}_tmp_sub.csv'.format(self.model_name))
        self.tmp_event_file_name = os.path.join(tempfile.gettempdir(), '{}_tmp_event.csv'.format(self.model_name))

        self.db = Postgres("postgres://%s:%s@localhost/%s" % (
            os.environ['CHURN_DB_USER'], os.environ['CHURN_DB_PASS'], os.environ['CHURN_DB']))

        self.con = post.connect(database=os.environ['CHURN_DB'],
                                user=os.environ['CHURN_DB_USER'],
                                password=os.environ['CHURN_DB_PASS'])

    def remove_tmp_files(self):
        '''
        Remove temp files when the simulation is over
        :return:
        '''
        os.remove(self.tmp_event_file_name)
        os.remove(self.tmp_sub_file_name)

    def simulate_customer(self, start_of_month):
        '''
        Simulate one customer collecting its events and subscriptions.

        This function has the core interaction between the simulation objects.  A customer is created from the behavior
        model, with a random start date picked within the month.  Then the customer object simulates the events for
        the month, and the utility model determines if there is a churn based on the simulated event counts.

        :param start_of_month:
        :return: the new customer object; it contains the events and subscriptions
        '''
        new_customer = self.behave_mod.generate_customer()

        # Pick a random start date for the subscription within the month
        end_range = start_of_month + relativedelta(months=+1)
        this_month = start_of_month + timedelta(days=random.randrange((end_range - start_of_month).days))

        churned = False
        while not churned:
            next_month = this_month + relativedelta(months=1)
            new_customer.subscriptions.append((this_month, next_month))
            month_count = new_customer.generate_events(this_month, next_month)
            churned = self.util_mod.simulate_churn(month_count) or next_month > self.end_date
            if not churned:
                this_month = next_month
        return new_customer


    def create_customers_for_month(self, month_date, n_to_create):
        '''
        Creates all the customers for one month, by calling simulate_customer and copy_customer_to_database in a loop.
        :param month_date: the month start date
        :param n_to_create: number of customers to create within that month
        :return:
        '''

        for i in range(n_to_create):
            customer = self.simulate_customer(month_date)
            self.copy_customer_to_database(customer)
            print('Simulated customer %d: %d subscriptions, %d events' %
                  (i, len(customer.subscriptions), len(customer.events)))


    def copy_customer_to_database(self, customer):
        '''
        Copy one customer's data to the database, by first writing it to temp files and then using the sql COPY command
        :param customer: a Customer object that has already had its simulation run
        :return:
        '''
        with open(self.tmp_sub_file_name, 'w') as tmp_file:
            for s in customer.subscriptions:
                tmp_file.write("%d,%d,'%s','%s','%s',%f,\\null,\\null,1\n" % \
                               (self.subscription_count, customer.id, self.model_name, s[0], s[1], self.mrr))
                self.subscription_count += 1
        with open(self.tmp_event_file_name, 'w') as tmp_file:
            for e in customer.events:
                tmp_file.write("%d,'%s',%d\n" % (customer.id, e[0], e[1]))

        sql = "COPY %s.subscription FROM STDIN USING DELIMITERS ',' WITH NULL AS '\\null'" % (self.model_name)
        cur = self.con.cursor()
        with open(self.tmp_sub_file_name, 'r') as f:
            cur.copy_expert(sql, f)
        self.con.commit()

        sql = "COPY %s.event FROM STDIN USING DELIMITERS ',' WITH NULL AS '\\null'" % (self.model_name)
        with open(self.tmp_event_file_name, 'r') as f:
            cur.copy_expert(sql, f)
        self.con.commit()


    def truncate_old_sim(self):
        '''
        Removes an old simulation from the database, if it already exists for this model
        :return: True if is safe to proceed (no data or data removed); False means old data not removed
        '''
        oldEvent = self.db.one('select count(*) from %s.event' % self.model_name)
        oldSubs = self.db.one('select count(*) from %s.subscription' % self.model_name)
        if oldEvent > 0 or oldSubs > 0:
            print('TRUNCATING *Events & Subscriptions* in schema -> %s <-  ...' % self.model_name)
            if input("are you sure? (enter %s to proceed) " % self.model_name) == self.model_name:
                if oldEvent > 0:
                    self.db.run('truncate table %s.event' % self.model_name)
                if oldSubs > 0:
                    self.db.run('truncate table %s.subscription' % self.model_name)
                return True
            else:
                return False
        else:
            return True

    def run_simulation(self):
        '''
        Simulation main function. First it prepares the database by truncating any old events and subscriptions, and
        inserting the event types into the database.  Next it creates the initial customers by calling
        create_customers_for_month, and then it advances month by month adding new customers (also using
        create_customers_for_month.)  The number of new customers for each month is determined from the growth rate.
        Note that churn is not handled at this level, but is modeled at the customer level.
        :return:
        '''

        # database setup
        if not self.truncate_old_sim():
            return
        self.behave_mod.insert_event_types(self.model_name, self.db)

        # Initial customer count
        print('\nCreating %d initial customers for %s start date' % (self.init_customers, self.start_date))
        self.create_customers_for_month(self.start_date, self.init_customers)
        print('Created %d initial customers with %d subscriptions for start date %s' % (self.init_customers, self.subscription_count, str(self.start_date)))

        # Advance to additional months
        next_month = self.start_date + relativedelta(months=+1)
        n_to_add = int(ceil(self.init_customers * self.monthly_growth_rate))  # number of new customers in first month
        while next_month < self.end_date:
            print('\nCreating %d new customers for month %s:' % (n_to_add, next_month))
            self.create_customers_for_month(next_month, n_to_add)
            print('Created %d new customers for month %s, now %d subscriptions\n' % (n_to_add, str(next_month), self.subscription_count))
            next_month = next_month + relativedelta(months=+1)
            n_to_add = int(ceil(n_to_add * (1.0 + self.monthly_growth_rate)))  # increase the new customers by growth

        self.remove_tmp_files()
Example #9
import os

import numpy as np
from postgres import Postgres

# batch() is a project-local helper that yields fixed-size chunks of a sequence


class PostgreSQLDB(object):
    FIELD_SONG_ID = 'song_id'
    FIELD_SONGNAME = 'song_name'
    FIELD_OFFSET = 'time'
    FIELD_HASH = 'hash'
    SONG_ID = 'song_id'
    SONG_NAME = 'song_name'
    CONFIDENCE = 'confidence'
    MATCH_TIME = 'match_time'
    OFFSET = 'time'
    OFFSET_SECS = 'offset_seconds'
    db = None

    # tables
    FINGERPRINTS_TABLENAME = 'fingerprints'
    SONGS_TABLENAME = 'songs'

    # creates
    CREATE_FINGERPRINTS_TABLE = """
        CREATE TABLE IF NOT EXISTS "%s"(
             "%s" INT PRIMARY KEY NOT NULL,
             "%s" INT NOT NULL,
             "%s" INT NOT NULL);""" % (FINGERPRINTS_TABLENAME, FIELD_HASH,
                                       FIELD_SONG_ID, FIELD_OFFSET)

    CREATE_SONGS_TABLE = \
        """CREATE TABLE IF NOT EXISTS "%s"(
                "%s" SERIAL PRIMARY KEY ,
                "%s" varchar(250) NOT NULL);""" % \
        (SONGS_TABLENAME, FIELD_SONG_ID, FIELD_SONGNAME)

    SELECT_SONG = """SELECT %s FROM %s WHERE %s = %%s;""" \
                  % (FIELD_SONGNAME, SONGS_TABLENAME, FIELD_SONG_ID)

    # insert a fingerprint, updating it if the hash already exists
    INSERT_FINGERPRINT = \
        """INSERT INTO %s VALUES (%%s, %%s, %%s)
           ON
             CONFLICT (%s)
           DO UPDATE SET
             %s = EXCLUDED.%s,
             %s = EXCLUDED.%s;""" \
        % (FINGERPRINTS_TABLENAME, FIELD_HASH, FIELD_SONG_ID, FIELD_SONG_ID, FIELD_OFFSET, FIELD_OFFSET)

    INSERT_SONG = "INSERT INTO %s (%s) VALUES (%%s);" % (SONGS_TABLENAME,
                                                         FIELD_SONGNAME)

    SELECT_MULTIPLE = """SELECT %s, %s, %s FROM %s WHERE %s IN (%%s);""" \
                      % (FIELD_HASH, FIELD_SONG_ID, FIELD_OFFSET,
                         FINGERPRINTS_TABLENAME, FIELD_HASH)

    def __init__(self, drop_tables=False):
        super(PostgreSQLDB, self).__init__()
        if os.environ.get('DOCKERCLOUD_SERVICE_HOSTNAME', None) is not None:
            self.db = Postgres(
                u"postgres://*****:*****@postgres/hashes")
        else:
            # self.db = Postgres(u"postgres://*****:*****@localhost/postgres")
            self.db = Postgres(
                u"postgres://*****:*****@pervasivesounds.com/hashes"
            )

        if drop_tables:
            self.db.run("DROP TABLE IF EXISTS %s CASCADE" %
                        self.SONGS_TABLENAME)
            self.db.run("DROP TABLE IF EXISTS %s CASCADE" %
                        self.FINGERPRINTS_TABLENAME)

        self.db.run(self.CREATE_SONGS_TABLE)
        self.db.run(self.CREATE_FINGERPRINTS_TABLE)

    def store(self, name, hashes):
        sid = self.insert_song(name)
        self.insert_hashes(sid, hashes)

    def insert_hash(self, song_id, hash, offset):
        self.db.run(self.INSERT_FINGERPRINT, (hash, song_id, offset))

    def insert_hashes(self, sid, hashes):
        values = []
        for time_, hash_ in hashes:
            values.append((int(hash_), sid, time_))

        with self.db.get_cursor() as cur:
            for split_values in batch(values, 1000):
                cur.executemany(self.INSERT_FINGERPRINT, split_values)

    def insert_song(self, songname):
        """
        Inserts song in the database and returns the ID of the inserted record.
        """
        self.db.run(self.INSERT_SONG, (songname, ))
        return self.db.one(
            "SELECT %s FROM %s ORDER BY %s DESC LIMIT 1" %
            (self.FIELD_SONG_ID, self.SONGS_TABLENAME, self.FIELD_SONG_ID))

    def get_song_by_id(self, sid):
        """
        Returns song by its ID.
        """
        return self.db.one(self.SELECT_SONG, (int(sid), ))

    def return_matches(self, hashes):
        mapper = {}
        for offset, hash in hashes:
            mapper[int(hash)] = offset

        # Get an iterable of all the hashes we need
        values = list(mapper.keys())

        res = []
        if hashes is not None:
            for split_values in batch(values, 100):
                query = self.SELECT_MULTIPLE
                query %= ', '.join(["%s"] * len(split_values))

                res.extend(self.db.all(query, split_values, back_as=tuple))
            return np.asarray([(sid, offset - mapper[hash])
                               for (hash, sid, offset) in res])

    def get_best_sids(self, matches):
        unique, counts = np.unique(matches[:, 0], return_counts=True)
        return unique[np.argsort(counts)[::-1][:np.minimum(len(counts), 20)]]

    def align_matches(self, matches, sids):
        """
            Finds hash matches that align in time with other matches and finds
            consensus about which hashes are "true" signal from the audio.

            Returns a dictionary with match information.
        """
        # align by diffs
        diff_counter = {}
        largest = 0
        largest_count = 0
        song_id = -1
        for sid in sids:
            for sid, diff in matches[matches[:, 0] == sid]:
                if sid not in diff_counter:
                    diff_counter[sid] = {}
                if diff not in diff_counter[sid]:
                    diff_counter[sid][diff] = 0
                diff_counter[sid][diff] += 1

                if diff_counter[sid][diff] > largest_count:
                    largest = diff
                    largest_count = diff_counter[sid][diff]
                    song_id = sid

        # total_count = {}
        # for sid in diff_counter.keys():
        #     total_count[sid] = np.sum(diff_counter[sid].values)

        songs = []
        for sid in diff_counter.keys():
            song_name = self.get_song_by_id(sid)
            for diff in diff_counter[sid].keys():
                confidence = diff_counter[sid][diff]
                if confidence > 4:
                    songs.append({
                        'song_id': sid,
                        'song_name': song_name,
                        'confidence': confidence,
                        'offset': diff
                    })
        return songs
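
A hypothetical end-to-end use of the class above; the (time, hash) pairs would normally come from an audio fingerprinting step that is not shown, so the values here are made up:

# Hypothetical usage of PostgreSQLDB
db = PostgreSQLDB(drop_tables=False)
db.store('my_song.mp3', [(0, 123456), (1, 234567)])
matches = db.return_matches([(0, 123456), (1, 234567)])
if matches is not None and len(matches) > 0:
    best_sids = db.get_best_sids(matches)
    print(db.align_matches(matches, best_sids))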
Example #10
from collections import namedtuple
from threading import Thread
from unittest import TestCase

# The import locations below are assumed for the postgres.py library and may
# differ between versions.
from postgres import Postgres, TooMany
from postgres.cache import Cache
from postgres.cursors import SimpleTupleCursor


class TestCache(TestCase):
    def setUp(self):
        self.db = Postgres(cache=Cache(max_size=1),
                           cursor_factory=SimpleTupleCursor)
        self.db.run("DROP SCHEMA IF EXISTS public CASCADE")
        self.db.run("CREATE SCHEMA public")
        self.db.run("CREATE TABLE foo (key text, value int)")
        self.db.run("INSERT INTO foo VALUES ('a', 1)")
        self.db.run("INSERT INTO foo VALUES ('b', 2)")

    def test_one_returns_cached_row(self):
        query = "SELECT * FROM foo WHERE key = 'a'"
        r1 = self.db.one(query, max_age=10)
        r2 = self.db.one(query, max_age=10)
        assert r2 is r1

    def test_all_returns_cached_rows(self):
        query = "SELECT * FROM foo ORDER BY key"
        r1 = self.db.all(query, max_age=10)
        r2 = self.db.all(query, max_age=10)
        assert r2 == r1
        assert r2 is not r1
        assert r2[0] is r1[0]

    def test_back_as_is_compatible_with_caching(self):
        query = "SELECT * FROM foo WHERE key = 'a'"
        r1 = self.db.one(query, back_as=dict, max_age=10)
        r2 = self.db.one(query, back_as=namedtuple, max_age=10)
        assert r1 == r2._asdict()
        rows = self.db.all(query, back_as='Row', max_age=10)
        assert rows == [r1]

    def test_all_returns_row_cached_by_one(self):
        query = "SELECT * FROM foo WHERE key = 'a'"
        row = self.db.one(query, max_age=10)
        rows = self.db.all(query, max_age=10)
        assert rows == [row]
        assert rows[0] is row

    def test_one_raises_TooMany_when_the_cache_contains_multiple_rows(self):
        query = "SELECT * FROM foo"
        rows = self.db.all(query, max_age=10)
        assert len(rows) == 2
        with self.assertRaises(TooMany):
            self.db.one(query, max_age=10)

    def test_cache_max_size(self):
        query1 = b"SELECT * FROM foo WHERE key = 'a'"
        query2 = b"SELECT * FROM foo WHERE key = 'b'"
        self.db.all(query1, max_age=10)
        assert set(self.db.cache.entries.keys()) == {query1}
        self.db.all(query2, max_age=10)
        assert set(self.db.cache.entries.keys()) == {query2}

    def test_cache_max_age(self):
        query = b"SELECT * FROM foo WHERE key = 'a'"
        r1 = self.db.one(query, max_age=0)
        r2 = self.db.one(query, max_age=10)
        assert r2 is not r1

    def test_cache_prune(self):
        self.db.cache.max_size = 2
        query1 = b"SELECT * FROM foo WHERE key = 'a'"
        query2 = b"SELECT * FROM foo WHERE key = 'b'"
        self.db.one(query1, max_age=-1)
        self.db.one(query2, max_age=10)
        assert set(self.db.cache.entries.keys()) == {query1, query2}
        self.db.cache.prune()
        assert set(self.db.cache.entries.keys()) == {query2}

    def test_cache_prevents_concurrent_queries(self):
        with self.db.get_cursor() as cursor:
            cursor.run("LOCK TABLE foo IN EXCLUSIVE MODE")

            def insert():
                self.db.one("INSERT INTO foo VALUES ('c', 3) RETURNING *",
                            max_age=1)

            t1 = Thread(target=insert)
            t2 = Thread(target=insert)
            t1.start()
            t2.start()
            cursor.run("COMMIT")  # this releases the table lock
        t1.join()
        t2.join()
        n = self.db.one("SELECT count(*) FROM foo WHERE key = 'c'")
        assert n == 1
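
A behavior sketch summarizing the cache semantics the tests above rely on; db is constructed as in setUp, and the semantics are inferred from the assertions rather than from library documentation:

# Cache behavior sketch
r1 = db.one("SELECT * FROM foo WHERE key = 'a'", max_age=10)  # result cached, keyed by the query
r2 = db.one("SELECT * FROM foo WHERE key = 'a'", max_age=10)  # repeat within max_age: cache hit
assert r2 is r1
db.one("SELECT * FROM foo WHERE key = 'b'", max_age=-1)  # negative max_age: immediately stale
db.cache.prune()  # drops stale entries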
Example #11
from postgres import Postgres

FIELD_SONG_ID = 'song_id'
FIELD_SONGNAME = 'song_name'
FIELD_OFFSET = 'time'
FIELD_HASH = 'hash'
SONG_ID = 'song_id'
SONG_NAME = 'song_name'
CONFIDENCE = 'confidence'
MATCH_TIME = 'match_time'
OFFSET = 'time'
OFFSET_SECS = 'offset_seconds'
FINGERPRINTS_TABLENAME = 'fingerprints'
SONGS_TABLENAME = 'songs'
COUNT_ROWS = "SELECT reltuples AS approximate_row_count FROM pg_class WHERE relname = '%s';"
postgres_url = '52.49.153.98:5432'

db = Postgres(u"postgres://*****:*****@%s/hashes" % postgres_url)
print("Number of hashes: %d" % db.one(COUNT_ROWS % FINGERPRINTS_TABLENAME))
# print("Number of songs: %d" % db.one(COUNT_ROWS % SONGS_TABLENAME))

print("\nSong table first and last element:")
print(db.one("SELECT %s, %s FROM %s ORDER BY %s ASC LIMIT 1" % (FIELD_SONG_ID, FIELD_SONGNAME, SONGS_TABLENAME, FIELD_SONG_ID)))
print(db.one("SELECT %s, %s FROM %s ORDER BY %s DESC LIMIT 1" % (FIELD_SONG_ID, FIELD_SONGNAME, SONGS_TABLENAME, FIELD_SONG_ID)))

print("\nFingerprint table first and last element:")
first_hash = db.one("SELECT %s, %s FROM %s ORDER BY %s ASC LIMIT 1" % (FIELD_HASH, FIELD_SONG_ID, FINGERPRINTS_TABLENAME, FIELD_SONG_ID))
print(db.one("SELECT * FROM %s WHERE %s=%d;" % (SONGS_TABLENAME, FIELD_SONG_ID, first_hash.song_id)))
last_hash = db.one("SELECT %s, %s FROM %s ORDER BY %s DESC LIMIT 1" % (FIELD_HASH, FIELD_SONG_ID, FINGERPRINTS_TABLENAME, FIELD_SONG_ID))
print(db.one("SELECT * FROM %s WHERE %s=%d;" % (SONGS_TABLENAME, FIELD_SONG_ID, last_hash.song_id)))

print("\nSize of database: %s" % db.one("SELECT pg_size_pretty(pg_database_size('hashes'));"))
print("Size of %s table: %s" % (FINGERPRINTS_TABLENAME, db.one("SELECT pg_size_pretty(pg_table_size('%s'));" % FINGERPRINTS_TABLENAME)))
print("Size of %s table: %s" % (SONGS_TABLENAME, db.one("SELECT pg_size_pretty(pg_table_size('%s'));" % SONGS_TABLENAME)))