def test_save_to_database():
    start_url = 'https://example.com'
    urls = [
        'https://example.com/example1',
        'https://example.com/example2',
        'https://example2.com/example/'
    ]
    save_to_database(start_url, urls)
    conn = connect_to_database()
    cursor = conn.cursor()
    for url in urls:
        cursor.execute(
            """
            MATCH (l1: Link {url: $start_url})
            MATCH (l2: Link {url: $url})
            MATCH (l1)-[e:LINKS_TO]->(l2)
            RETURN l1, e, l2;
            """, {
                'start_url': start_url,
                'url': url
            })
        row = cursor.fetchone()
        assert row is not None
        assert any(el.properties.get('url', None) == url for el in row)
def save_ftrl_data(data_type, fnames, ftablename, test_folds, train_folds,
                   ftrl_type, optional_date_ftrl3, optional_condition_ftrl4):
    conn = utils.connect_to_database()
    cur = conn.cursor()
    path = '../data/output-py/ftrl/'
    if (data_type == 'val') or (data_type == 'train'):
        path_part = 'train'
    else:
        path_part = 'test'
    temp_path = '../data/output-py/ftrl/temp/'
    file_name = data_type + '_ftrl_folds.csv'
    sql_query = open('genentech-sql/pattern_ftrl_' + path_part + ftrl_type + '.sql').read()
    if data_type == 'train':
        sql_query = sql_query.replace(
            'OPTIONAL_CV_EXPRESSION',
            'WHERE ' + ' OR '.join(['cv_index=' + str(x) for x in train_folds]))
    if data_type == 'val':
        sql_query = sql_query.replace(
            'OPTIONAL_CV_EXPRESSION',
            'WHERE ' + ' OR '.join(['cv_index=' + str(x) for x in test_folds]))
    sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
    sql_query = sql_query.replace('FEATURES_LIST_COMMA_SEPARATED', ','.join(fnames))
    sql_query = sql_query.replace('T1_FEATURES_COMMA_SEPARATED',
                                  ','.join(['t1.' + x for x in fnames]))
    sql_query = sql_query.replace('OPTIONAL_DATE_FTRL3', optional_date_ftrl3)
    sql_query = sql_query.replace('OPTIONAL_CONDITION_FTRL4', optional_condition_ftrl4)
    copy_string = "unload ('" + sql_query + "') to 's3://genentech-2016/ftrl/" + file_name + "' " +\
        "credentials " + utils.S3_CONNECTION_STRING +\
        "delimiter ',' gzip allowoverwrite;"
    cur.execute(copy_string)
    conn.commit()
    cur.close()
    conn.close()
    os.system('aws s3 cp s3://genentech-2016/ftrl/ ' + temp_path + ' --recursive')
    os.system('aws s3 rm s3://genentech-2016/ftrl/ --recursive')
    os.system('find ' + temp_path + ' -name \*.gz -exec gunzip {} \;')
    data_parts = ' '.join(sorted(glob.glob(temp_path + '*')))
    if data_type == 'test':
        header = 'patient_id,' + ','.join(fnames) + '\n'
    else:
        header = 'patient_id,' + ','.join(fnames) + ',is_screener\n'
    with open(temp_path + "header.csv", "w") as text_file:
        text_file.write("%s" % header)
    os.system('cat ' + temp_path + 'header.csv ' + data_parts + ' > ' + path + file_name)
    os.system('rm -R ' + temp_path + '/*')
    return path + file_name
def merge_likelihood_tables(fnames_list, ftablename, train_folds):
    folds = [x for x in range(1, nfold + 1)]
    sql_query = open('genentech-sql/pattern_merge_likeli.sql').read()
    sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
    sql_query = sql_query.replace('FEATURE_NAMES_COMMA_SEPARATED',
                                  ','.join([','.join(x) for x in fnames_list]))
    likeli_tables_for_join = ''
    count = 2
    for fnames in fnames_list:
        likeli_tables_for_join = likeli_tables_for_join + ' INNER JOIN ' + '_'.join(fnames) + '_likeli_table t' +\
            str(count) + ' ON ' + ' AND '.join(['t1.' + x + '=t' + str(count) + '.' + x for x in fnames])
        count = count + 1
    sql_query = sql_query.replace('LIKELI_TABLES_FOR_JOIN', likeli_tables_for_join)
    sql_query = sql_query.replace('GENERIC_FEATURE_NAME',
                                  '_'.join(['_'.join(x) for x in fnames_list]))
    likeli_function = 'MAX(' + '+'.join(
        ['t' + str(x) + '.feature_avg' for x in range(2, len(fnames_list) + 2)]) + ')'
    #likeli_function = 'MAX(1.0-' + '*'.join(['(1.0-t'+str(x)+'.feature_avg)' for x in range(2, len(fnames_list)+2)]) + ')'
    sql_query = sql_query.replace('LIKELI_FUNCTION', likeli_function)
    if len(train_folds) == len(folds):
        choosing_patients_expression = 'patients_test2'
    else:
        choosing_patients_expression = 'train_cv_indices ' + 'WHERE ' + ' OR '.join(
            ['cv_index=' + str(x) for x in folds if not x in list(train_folds)])
    sql_query = sql_query.replace('CHOOSING_PATIENTS_EXPRESSION', choosing_patients_expression)
    conn = utils.connect_to_database()
    cur = conn.cursor()
    cur.execute(sql_query)
    likeli = pd.DataFrame(cur.fetchall())
    likeli.columns = [x.name for x in cur.description]
    cur.close()
    conn.close()
    return likeli
def find_shortest_path(start_url, end_url):
    """Find the shortest path between two nodes representing URLs in the database."""
    conn = connect_to_database()
    cursor = conn.cursor()
    cursor.execute(
        """
        MATCH (l1: Link {url: $start_url})-[edge_list: LINKS_TO * bfs..10]-(l2: Link {url: $end_url})
        RETURN edge_list
        LIMIT 1;
        """, {
            'start_url': start_url,
            'end_url': end_url
        })
    row = cursor.fetchone()
    if row is None:
        cursor.execute("MATCH (l1: Link {url: $start_url}) RETURN l1;",
                       {'start_url': start_url})
        row = cursor.fetchone()
        if row is None:
            raise WebSiteNotFoundError(start_url)
        cursor.execute("MATCH (l1: Link {url: $end_url}) RETURN l1;",
                       {'end_url': end_url})
        row = cursor.fetchone()
        if row is None:
            raise WebSiteNotFoundError(end_url)
        raise ShortestPathNotFoundError(
            f'No path between {start_url} and {end_url}.')
    return row[0], cursor
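# Hypothetical usage sketch for find_shortest_path (not part of the original
# snippet): the URLs and the way the returned edge list is reported are
# assumptions made up for illustration only.
def print_shortest_path_example():
    try:
        edge_list, cursor = find_shortest_path('https://example.com',
                                               'https://example.com/example2')
    except WebSiteNotFoundError as err:
        print('URL was never crawled:', err)
    except ShortestPathNotFoundError as err:
        print(err)
    else:
        # edge_list holds the LINKS_TO edges of the BFS shortest path (<= 10 hops).
        print('Path found with', len(edge_list), 'edges')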
def drop_likelihood_table(likeli_table_name):
    conn = utils.connect_to_database()
    cur = conn.cursor()
    cur.execute('DROP TABLE ' + likeli_table_name + ';')
    conn.commit()
    cur.close()
    conn.close()
    return None
def main():
    conn = connect_to_database()
    cur = conn.cursor()
    load_staging_tables(cur, conn)
    insert_tables(cur, conn)
    conn.close()
def scrap_and_populate_db(url):
    scrap(url)
    conn = connect_to_database()
    cursor = conn.cursor()
    cursor.execute(
        """
        MATCH (l: Link {url: $url})
        RETURN l;
        """, {'url': url})
    return cursor
def main():
    conn = connect_to_database()
    cur = conn.cursor()
    print('dropping all tables ...')
    drop_tables(cur, conn)
    print('tables dropped successfully \n creating tables ... ')
    create_tables(cur, conn)
    print('table created successfully')
    conn.close()
    print('connection closed')
def generate_likelihood_table(likeli_table_name, fnames, ftablename, train_folds):
    sql_query = open('genentech-sql/pattern_likeli_table.sql').read()
    sql_query = sql_query.replace('LIKELI_TABLE_NAME', likeli_table_name)
    sql_query = sql_query.replace('T1_COMMA_SEPARATED',
                                  ','.join(['t1.' + x for x in fnames]))
    sql_query = sql_query.replace('FEATURE_NAMES_COMMA_SEPARATED', ','.join(fnames))
    sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
    sql_query = sql_query.replace(
        'OPTIONAL_CV_EXPRESSION',
        'WHERE ' + ' OR '.join(['cv_index=' + str(x) for x in list(train_folds)]))
    conn = utils.connect_to_database()
    cur = conn.cursor()
    cur.execute(sql_query)
    conn.commit()
    cur.close()
    conn.close()
    return None
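# Hypothetical sketch (not from the original code) of how the helpers above --
# generate_likelihood_table, merge_likelihood_tables and drop_likelihood_table --
# could be chained for one combination of training folds.  The feature groups,
# feature table name and fold numbers are placeholders chosen for illustration.
def build_merged_likelihood_example():
    fnames_list = [['activity_type_r_count_all'], ['activity_type_a_count_all']]
    ftablename = 'patient_activity_feats'
    train_folds = (1, 2, 3)
    # merge_likelihood_tables expects one table per feature group, named
    # '<feature>_likeli_table' (see the join string it builds).
    for fnames in fnames_list:
        generate_likelihood_table('_'.join(fnames) + '_likeli_table', fnames,
                                  ftablename, train_folds)
    likeli = merge_likelihood_tables(fnames_list, ftablename, train_folds)
    for fnames in fnames_list:
        drop_likelihood_table('_'.join(fnames) + '_likeli_table')
    return likeli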
def build_train_test():
    conn = utils.connect_to_database()
    target = ['is_screener']
    flist_basic = [
        'patient_age_group', 'patient_state', 'ethinicity',
        'household_income', 'education_level'
    ]
    flist_pah = [
        'activity_type_r_count_all', 'activity_type_a_count_all',
        'activity_type_count_all'
    ]
    sql_query = "SELECT t1.patient_id,t1.is_screener," + ",".join(['t1.' + x for x in flist_basic]) +\
        "," + ",".join(['t2.' + x for x in flist_pah]) +\
        " FROM patients_train t1\
        LEFT JOIN patient_activity_feats t2\
        ON t1.patient_id=t2.patient_id;"
    train = pd.read_sql_query(sql_query, conn)
    train.reset_index(drop=True, inplace=True)
    sql_query = "SELECT t1.patient_id," + ",".join(['t1.' + x for x in flist_basic]) +\
        "," + ",".join(['t2.' + x for x in flist_pah]) +\
        " FROM patients_test2 t1\
        LEFT JOIN patient_activity_feats t2\
        ON t1.patient_id=t2.patient_id;"
    test = pd.read_sql_query(sql_query, conn)
    test.reset_index(drop=True, inplace=True)
    cv_indices = pd.read_sql_query(
        'SELECT patient_id, cv_index FROM train_cv_indices;', conn)
    train = pd.merge(train, cv_indices, on='patient_id', how='left')
    train = calculate_basic(train)
    test = calculate_basic(test)
    train, test = encode_onehot(train, test, ['patient_state', 'ethinicity'])
    print("Writing to HDF5 store...")
    store = pd.HDFStore('../data/output-py/train_test.h5')
    store.append('train', train)
    store.append('test', test)
    store.close()
    conn.close()
    return train, test
def save_to_database(start_url, urls):
    """Save nodes representing URLs and edges representing links to the database."""
    conn = connect_to_database()
    cursor = conn.cursor()
    cursor.execute("MERGE (l:Link {url: $start_url});", {'start_url': start_url})
    for url in urls:
        cursor.execute(
            """
            MERGE (l1: Link {url: $start_url})
            MERGE (l2: Link {url: $url})
            MERGE (l1)-[e:LINKS_TO]->(l2)
            RETURN l1, e, l2;
            """, {
                'start_url': start_url,
                'url': url
            })
    conn.commit()
def __init__(self):
    """
    initialises current_team
    Connects to the database
    Creates a temp table for each position
    """
    self.current_team = {}
    self.connection, self.cursor = utils.connect_to_database()
    self.clone_table_column_creators = "id integer primary key, name varchar(100), player_url varchar(100), position varchar(100), age integer, current_team varchar(20), team_one_year_ago varchar(20), team_two_year_ago varchar(20), team_three_year_ago varchar(20), points_one_year_ago float, points_two_year_ago float, points_three_year_ago float"
    self.clone_table_columns = "id, name, player_url, position, age, current_team, team_one_year_ago, team_two_year_ago, team_three_year_ago, points_one_year_ago, points_two_year_ago, points_three_year_ago"
    self.temp_table_column_creators = "id integer primary key, name varchar(100), position varchar(100), current_team varchar(20), weighted_score float"
    self.temp_table_columns = "id, name, position, current_team, weighted_score"
    utils.create_temp_clone_table("players", "cloneplayers",
                                  self.clone_table_column_creators,
                                  self.clone_table_columns,
                                  self.connection, self.cursor)
    utils.add_column_to_table("cloneplayers", "weighted_score", "float",
                              self.connection, self.cursor)
    utils.update_weighted_scores(self.connection, self.cursor)
    for position in utils.get_positions_list(self.cursor):
        filter_position = position.rstrip('0123456789 ').upper() + '%'
        utils.create_temporary_table(position, self.temp_table_column_creators,
                                     self.connection, self.cursor)
        utils.populate_temp_table_from_other_table(position,
                                                   self.temp_table_columns,
                                                   "cloneplayers", "position",
                                                   filter_position,
                                                   self.connection, self.cursor)
def main():
    '''
    Script entry point:
    - Establishes connection with the sparkifydb database and gets cursor to it
    - Loads staging tables
    - Inserts data into regular tables from staging tables
    - Finally, closes the connection

    Parameters:
        None

    Returns:
        None
    '''
    cur, conn = connect_to_database()
    print('Loading staging tables')
    load_staging_tables(cur, conn)
    print('Inserting data into tables from staging tables')
    insert_tables(cur, conn)
    print('Done')
    conn.close()
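# The sparkifydb scripts above call connect_to_database() and expect a
# (cursor, connection) pair back.  The helper itself is not shown in these
# snippets; this is a minimal sketch of what it might look like, assuming
# psycopg2 and a local sparkifydb instance -- host, user and password are
# placeholders, not values taken from the original code.
import psycopg2


def connect_to_database():
    conn = psycopg2.connect(
        "host=127.0.0.1 dbname=sparkifydb user=student password=student")
    conn.set_session(autocommit=True)
    cur = conn.cursor()
    return cur, conn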
def main():
    '''
    Script entry point:
    - Establishes connection with the sparkifydb database and gets cursor to it
    - Drops all the tables
    - Creates all the tables
    - Finally, closes the connection

    Parameters:
        None

    Returns:
        None
    '''
    cur, conn = connect_to_database()
    print('Dropping tables')
    drop_tables(cur, conn)
    print('Creating tables')
    create_tables(cur, conn)
    print('Done')
    conn.close()
import psycopg2
import pandas as pd
from scipy.stats.stats import pearsonr

import utils

conn = utils.connect_to_database()
cur = conn.cursor()

cur.execute(open('genentech-sql/patient_activity_feats.sql').read())
conn.commit()
cur.execute(open('genentech-sql/prescription_feats.sql').read())
conn.commit()
cur.execute(open('genentech-sql/diagnosis_feats.sql').read())
conn.commit()
cur.execute(open('genentech-sql/diagnosis_procedure_link.sql').read())
conn.commit()
cur.execute(open('genentech-sql/diagnosis_pairs.sql').read())
conn.commit()
cur.execute(open('genentech-sql/diagnosis_feats2.sql').read())
conn.commit()
cur.execute(open('genentech-sql/procedure_head2.sql').read())
conn.commit()
cur.execute(open('genentech-sql/diagnosis_procedure_link2.sql').read())
def test_rsm(training_set_size=500, validation_set_size=200, n_hidden=200):
    from batch_data import BatchData as Batch
    import utils
    training_set_name = "training_set_%d" % training_set_size
    validation_set_name = "validation_set_%d" % validation_set_size
    # We make sure Mongo is running somewhere :
    utils.connect_to_database(database_name='yelp')

    def load_dataset(size=500, lexicon=None, name="training_set"):
        # if training_set.npy doesn't exist:
        rc = utils.ResourceConverter(lexicon=lexicon)
        batch = Batch(
            data=utils.mongo_database_global['restaurants'].find({}, {'signature': 1}),  # from Mongo's cursor enumerator
            batch_size=size,  # mini-batch
            shuffle=True,  # stochastic
            conversion=rc.process  # convert to matrices using lexicon)
        )
        dataset = batch.next()
        # and save it for later.
        numpy.save(name, dataset)
        return dataset

    if file_exists("lexicon.gzp"):
        lexicon = utils.Lexicon.load("lexicon.gzp")
    else:
        # if lexicon.gzp doesnt exist:
        # 'restaurants' is the name of the collection, we stem the words in the triggers,
        # and we lowercase them to minimize the visible dimensions (bag of words dimensions)
        lexicon = utils.gather_lexicon('restaurants', stem=True, lowercase=True,
                                       show_progress=True)
        lexicon.save("lexicon.gzp")

    if file_exists("%s.npy" % training_set_name):
        train_set_x_mem = numpy.load("%s.npy" % training_set_name)
    else:
        train_set_x_mem = load_dataset(size=training_set_size, lexicon=lexicon,
                                       name=training_set_name)

    if file_exists("%s.npy" % validation_set_name):
        validation_set_x_mem = numpy.load("%s.npy" % validation_set_name)
    else:
        validation_set_x_mem = load_dataset(size=validation_set_size, lexicon=lexicon,
                                            name=validation_set_name)

    train_set_x = theano.shared(train_set_x_mem, borrow=True)
    validation_set_x = theano.shared(validation_set_x_mem, borrow=True)

    # construct the RSM class
    mini_batch_size = 100

    # allocate symbolic variables for the data
    n_train_batches = floor(train_set_x.get_value(borrow=True).shape[0] / mini_batch_size)
    rng = numpy.random.RandomState(123)
    theano_rng = T.shared_randomstreams.RandomStreams(rng.randint(2 ** 30))
    rsm = RSM(n_visible=lexicon.max_index, n_hidden=n_hidden, numpy_rng=rng,
              theano_rng=theano_rng)

    def save_rsm(name=''):
        # save computation results:
        numpy.save(("%s_W_trained" % name), rsm.W.get_value(borrow=True))
        numpy.save(("%s_hbias_trained" % name), rsm.hbias.get_value(borrow=True))
        numpy.save(("%s_vbias_trained" % name), rsm.vbias.get_value(borrow=True))

    # get training function
    learning_rate = theano.shared(0.01)
    cost, updates = rsm.get_cost_updates(lr=learning_rate, k=2)
    index = T.lscalar()  # index to a [mini]batch
    train_rsm = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            rsm.input: train_set_x[index * mini_batch_size:(index + 1) * mini_batch_size],
            rsm.scaling: train_set_x[index * mini_batch_size:(index + 1) * mini_batch_size].sum(axis=1).astype(theano.config.floatX),
        },
        name='train_rbm')

    [pre_sigmoid_h1, h1_mean, h1_sample,
     pre_sigmoid_v1, v1_mean, v1_sample] = rsm.gibbs_vhv(rsm.input)
    validate_rsm = theano.function(
        [],
        rsm.reconstruction_cost(rsm.input, v1_sample),
        givens={
            rsm.input: validation_set_x,
            rsm.scaling: validation_set_x.sum(axis=1)
        })

    training_epochs = 300  # will stop early
    batch_indices = [i for i in range(n_train_batches)]
    start_time = time.time()
    min_val = None
    try:
        for epoch in range(training_epochs):
            # go through the training set
            mean_cost = []
            # more stochasticity:
            random.shuffle(batch_indices)
            for batch_index in batch_indices:
                mean_cost.append(train_rsm(batch_index) / mini_batch_size)
            validation_cost = validate_rsm() / validation_set_size
            print('Training epoch %d, cost is %.4f, validation cost is %.4f' %
                  (epoch + 1, numpy.mean(mean_cost), validation_cost))
            if min_val != None and validation_cost < min_val:
                save_rsm("min_validation")
            min_val = min(min_val, validation_cost) if min_val != None else validation_cost
        print('Training took %.05fmn' % ((time.time() - start_time) / 60.0))
    except (KeyboardInterrupt, SystemExit):
        print("Saving final rsm...")
        save_rsm("final")
        exit()
    except:
        raise
    print("Saving final rsm...")
    save_rsm("final")
def __init__(self):
    if config['installed']:
        self.db = utils.connect_to_database()
def test_connect_to_database(self):
    self.assertIs(type(df.connect_to_database()), pyodbc.Connection)
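# The test above expects df.connect_to_database() to return a pyodbc.Connection.
# The helper is not shown in these snippets; this is a minimal sketch of such a
# function, assuming pyodbc with a SQL Server ODBC driver -- the driver name,
# server and database below are placeholders, not values from the original module.
import pyodbc


def connect_to_database():
    connection_string = (
        "DRIVER={ODBC Driver 17 for SQL Server};"
        "SERVER=localhost;DATABASE=master;Trusted_Connection=yes;")
    return pyodbc.connect(connection_string)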
    $ curl -X POST -H "Content-type: application/json" http://127.0.0.1:5000/predict_map -d '{"Time": 57007.0, "V1": -1.2712441917143702, "V2": 2.46267526851135, "V3": -2.85139500331783, "V4": 2.3244800653477995, "V5": -1.37224488981369, "V6": -0.948195686538643, "V7": -3.06523436172054, "V8": 1.1669269478721105, "V9": -2.2687705884481297, "V10": -4.88114292689057, "V11": 2.2551474887046297, "V12": -4.68638689759229, "V13": 0.652374668512965, "V14": -6.17428834800643, "V15": 0.594379608016446, "V16": -4.8496923870965185, "V17": -6.53652073527011, "V18": -3.11909388163881, "V19": 1.71549441975915, "V20": 0.560478075726644, "V21": 0.652941051330455, "V22": 0.0819309763507574, "V23": -0.22134783119833895, "V24": -0.5235821592333061, "V25": 0.224228161862968, "V26": 0.756334522703558, "V27": 0.632800477330469, "V28": 0.25018709275719697, "Amount": 0.01}'

    Returns:
        resp (json): predicted fraud probability
    """
    X = _manage_query(request)
    y_pred = model.predict(X)[0]
    print("Value predicted: {}".format(y_pred))
    if y_pred >= FRAUD_THRESHOLD:
        row = select_random_row(conn, TABLE_LOCATIONS)
        location = {"title": row[0], "latitude": row[1], "longitude": row[2]}
        print("New location: {}".format(location))
        socketio.emit('map_update', location, broadcast=True, namespace='/fraud')
    return make_response(jsonify({'fraud': y_pred}), STATUS_OK)


# Load the model as a global variable
model = lgb.Booster(model_file=BASELINE_MODEL)

# Connect to database
conn = connect_to_database(DATABASE_FILE)

if __name__ == "__main__":
    try:
        print("Server started")
        socketio.run(app, debug=True)
    except:
        raise
    finally:
        print("Stop procedure")
        conn.close()
def calculate_likelihoods(train, test, fnames, ftablename, function_type='max',
                          query_type='', optional_filter_feature_likeli6='',
                          optional_filter_value_likeli6=''):
    global_mean = np.mean(train.is_screener)
    folds = [x for x in range(1, nfold + 1)]
    likeli_all = pd.DataFrame()
    for L in range(1, len(folds) + 1):
        for train_folds in itertools.combinations(folds, L):
            print(train_folds)
            sql_query = open('genentech-sql/pattern_likeli_multiple' + query_type + '.sql').read()
            sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
            sql_query = sql_query.replace('GENERIC_FEATURE_NAME', '_'.join(fnames))
            sql_query = sql_query.replace('FEATURE_NAMES_COMMA_SEPARATED', ','.join(fnames))
            sql_query = sql_query.replace('T1_COMMA_SEPARATED',
                                          ','.join(['t1.' + x for x in fnames]))
            sql_query = sql_query.replace('T3_T4_CONDITION',
                                          ' AND '.join(['t3.' + x + '=t4.' + x for x in fnames]))
            sql_query = sql_query.replace(
                'OPTIONAL_CV_EXPRESSION',
                'WHERE ' + ' OR '.join(['cv_index=' + str(x) for x in list(train_folds)]))
            sql_query = sql_query.replace('GROUP_FUNCTION', function_type)
            sql_query = sql_query.replace(
                'OPTIONAL_CONDITION_LIKELI6',
                'WHERE ' + optional_filter_feature_likeli6 + "='" + optional_filter_value_likeli6 + "'")
            #sql_query = sql_query.replace('OPTIONAL_CONDITION_LIKELI6', 'WHERE ' + optional_filter_feature_likeli6 + ">=" + optional_filter_value_likeli6)
            if len(list(train_folds)) == len(folds):
                choosing_patients_expression = 'patients_test2'
            else:
                choosing_patients_expression = 'train_cv_indices ' + 'WHERE ' + ' OR '.join(
                    ['cv_index=' + str(x) for x in folds if not x in list(train_folds)])
            sql_query = sql_query.replace('CHOOSING_PATIENTS_EXPRESSION',
                                          choosing_patients_expression)
            conn = utils.connect_to_database()
            cur = conn.cursor()
            cur.execute(sql_query)
            if (query_type == '3') or (query_type == '4') or (query_type == '5'):
                conn.commit()
                sql_query = open('genentech-sql/pattern_likeli_multiple' + query_type + '_2.sql').read()
                sql_query = sql_query.replace('GENERIC_FEATURE_NAME', '_'.join(fnames))
                sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
                cur.execute(sql_query)
                likeli = pd.DataFrame(cur.fetchall())
                likeli.columns = [x.name for x in cur.description]
                cur.execute('DROP TABLE patient_likeli_table;')
                conn.commit()
            else:
                likeli = pd.DataFrame(cur.fetchall())
                likeli.columns = [x.name for x in cur.description]
            for x in folds:
                if x in list(train_folds):
                    likeli['fold' + str(x)] = 1
                else:
                    likeli['fold' + str(x)] = 0
            cur.close()
            conn.close()
            likeli_all = likeli_all.append(likeli, ignore_index=True)
            col = likeli.columns[1]
            likeli = pd.merge(likeli, train[['patient_id', 'is_screener']],
                              on='patient_id', how='inner')
            if len(likeli) > 0:
                print("Pearson correlation: " + str(pearsonr(likeli.is_screener, likeli[col])))
                print("AUC: " + str(auc(likeli.is_screener, likeli[col])))
            del likeli
    feats_all = train[['patient_id']].append(test[['patient_id']], ignore_index=True)
    for test_fold in ([0] + folds):
        train_folds = [x for x in folds if (x != test_fold) and (x != 0)]
        if len(train_folds) == len(folds):
            pd_query = ' and '.join(['fold' + str(x) + '==1' for x in train_folds])
        else:
            pd_query = ' and '.join(['fold' + str(x) + '==1' for x in train_folds]) + \
                ' and ' + ' and '.join(['fold' + str(x) + '==0' for x in folds if not x in train_folds])
        print(pd_query)
        likeli = likeli_all.query(pd_query).copy().reset_index(drop=True)
        for x in folds:
            likeli.drop('fold' + str(x), axis=1, inplace=True)
        if test_fold == 0:
            feats_fold = test[['patient_id']].copy()
        else:
            feats_fold = train.query('cv_index==@test_fold')[['patient_id']].copy()
        feats_fold = pd.merge(feats_fold, likeli, on='patient_id', how='left')
        del likeli
        for val_fold in [x for x in folds if (x != test_fold) and (x != 0)]:
            train_folds = [x for x in folds
                           if (x != test_fold) and (x != val_fold) and (x != 0)]
            pd_query = ' and '.join(['fold' + str(x) + '==1' for x in train_folds]) + \
                ' and ' + ' and '.join(['fold' + str(x) + '==0' for x in folds if not x in train_folds])
            likeli = likeli_all.query(pd_query).copy().reset_index(drop=True)
            for x in folds:
                likeli.drop('fold' + str(x), axis=1, inplace=True)
            feats_val_fold = train.query('cv_index==@val_fold')[['patient_id']].copy()
            feats_val_fold = pd.merge(feats_val_fold, likeli, on='patient_id', how='left')
            del likeli
            feats_fold = feats_fold.append(feats_val_fold, ignore_index=True)
        col = feats_fold.columns[1]
        feats_fold = feats_fold.reset_index(drop=True)
        feats_fold[col].fillna(global_mean, inplace=True)
        #feats_fold[fname_w_likeli].fillna(global_mean, inplace=True)
        feats_fold = feats_fold.rename(columns={col: col + '_fold_' + str(test_fold)})
        #feats_fold = feats_fold.rename(columns={fname_w_likeli : fname_w_likeli+'_fold_'+str(test_fold)})
        feats_all = pd.merge(feats_all, feats_fold, on='patient_id', how='left')
    print("Writing to HDF5 store...")
    store = pd.HDFStore('../data/output-py/' + col + '.h5')
    store.append('feats_all', feats_all)
    store.close()
    conn.close()
    print("Feature " + col + " is saved in file.")
    return col
import random

from utils import connect_to_database
from pymongo.errors import ConnectionFailure
from warnings import warn

try:
    DB = connect_to_database(database_name="yelp")
except ConnectionFailure as e:
    warn("Could not connect to MongoDB database `yelp`")
    DB = None


class Personality:

    def __init__(self, good_requirement, collection_name="restaurants"):
        self.collection_name = collection_name
        self.good_requirement = good_requirement
        self.good_examples = []

    def _random_sample(self):
        database = DB[self.collection_name]
        all_elements = database.count()
        random_el = random.randint(0, all_elements)
        els = list(database.find({"review_count": {"$gt": 4}},
                                 {"url": 1, "rating": 1, "price": 1, "categories": 1},
                                 limit=1, skip=random_el))
        if len(els) > 0:
            return els[0]
        else:
            return self._random_sample()

    def generate_bad_sample(self):
        sample = self._random_sample()
        if self.good_requirement(sample):
            return self.generate_bad_sample()
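# Hypothetical usage sketch (not part of the original snippet): the
# good_requirement predicate -- "a good restaurant has rating >= 4" -- is an
# assumption made up for illustration only.
if DB is not None:
    personality = Personality(
        good_requirement=lambda doc: doc.get("rating", 0) >= 4,
        collection_name="restaurants")
    print("Random restaurant:", personality._random_sample())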
              right_on='Departamento')
df = df[['Departamento', 'Descripción de Distrito', 'IDH (2018)']]
df = df.rename(
    columns={
        'Departamento': 'departament',
        'Descripción de Distrito': 'municipality',
        'IDH (2018)': 'idh'
    })
df.head()

# ### Insert to DB

df.head()

con = utils.connect_to_database()

# +
#df.to_sql('demographics',con,'semantic')

# +
# After this point sql/semantic/create_tenders_demographics.sql should be run to match municipality in tenders
# -

# #### Get Data of Gender and age by municipality
# We didn't use this part in the bias analysis; leave it here just in case we need to pick it up later.


def demographics(file):