Example #1
def test_save_to_database():
    start_url = 'https://example.com'
    urls = [
        'https://example.com/example1', 'https://example.com/example2',
        'https://example2.com/example/'
    ]
    save_to_database(start_url, urls)

    conn = connect_to_database()
    cursor = conn.cursor()

    for url in urls:
        cursor.execute(
            """
                           MATCH (l1: Link {url: $start_url})
                           MATCH (l2: Link {url: $url}) 
                           MATCH (l1)-[e:LINKS_TO]->(l2)
                           RETURN l1, e, l2;
                           """, {
                'start_url': start_url,
                'url': url
            })
        row = cursor.fetchone()
        assert row is not None
        assert any(el.properties.get('url', None) == url for el in row)
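
The Cypher-based examples in this listing rely on a connect_to_database() helper that is not shown. A minimal sketch, assuming a Memgraph instance reached through pymgclient; the host and port are placeholders, not taken from the original code:

import mgclient

def connect_to_database(host='127.0.0.1', port=7687):
    # Connect to a Bolt-speaking graph database such as Memgraph; the
    # returned connection exposes cursor() and commit() as used above.
    return mgclient.connect(host=host, port=port)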
def save_ftrl_data(data_type, fnames, ftablename, test_folds, train_folds,
                   ftrl_type, optional_date_ftrl3, optional_condition_ftrl4):
    conn = utils.connect_to_database()
    cur = conn.cursor()

    path = '../data/output-py/ftrl/'
    if (data_type == 'val') or (data_type == 'train'):
        path_part = 'train'
    else:
        path_part = 'test'

    temp_path = '../data/output-py/ftrl/temp/'
    file_name = data_type + '_ftrl_folds.csv'

    sql_query = open('genentech-sql/pattern_ftrl_' + path_part + ftrl_type +
                     '.sql').read()
    if data_type == 'train':
        sql_query = sql_query.replace(
            'OPTIONAL_CV_EXPRESSION',
            'WHERE ' + ' OR '.join(['cv_index=' + str(x)
                                    for x in train_folds]))
    if data_type == 'val':
        sql_query = sql_query.replace(
            'OPTIONAL_CV_EXPRESSION',
            'WHERE ' + ' OR '.join(['cv_index=' + str(x) for x in test_folds]))
    sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
    sql_query = sql_query.replace('FEATURES_LIST_COMMA_SEPARATED',
                                  ','.join(fnames))
    sql_query = sql_query.replace('T1_FEATURES_COMMA_SEPARATED',
                                  ','.join(['t1.' + x for x in fnames]))
    sql_query = sql_query.replace('OPTIONAL_DATE_FTRL3', optional_date_ftrl3)
    sql_query = sql_query.replace('OPTIONAL_CONDITION_FTRL4',
                                  optional_condition_ftrl4)

    copy_string = "unload ('" + sql_query + "') to 's3://genentech-2016/ftrl/" + file_name + "' " +\
                  "credentials " + utils.S3_CONNECTION_STRING +\
                  "delimiter ',' gzip allowoverwrite;"

    cur.execute(copy_string)
    conn.commit()

    cur.close()
    conn.close()

    os.system('aws s3 cp s3://genentech-2016/ftrl/ ' + temp_path +
              ' --recursive')
    os.system('aws s3 rm s3://genentech-2016/ftrl/ --recursive')
    os.system('find ' + temp_path + r' -name \*.gz -exec gunzip {} \;')
    data_parts = ' '.join(sorted(glob.glob(temp_path + '*')))
    if data_type == 'test':
        header = 'patient_id,' + ','.join(fnames) + '\n'
    else:
        header = 'patient_id,' + ','.join(fnames) + ',is_screener\n'
    with open(temp_path + "header.csv", "w") as text_file:
        text_file.write("%s" % header)
    os.system('cat ' + temp_path + 'header.csv ' + data_parts + ' > ' + path +
              file_name)
    os.system('rm -R ' + temp_path + '/*')

    return path + file_name
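
A hypothetical call to save_ftrl_data; the feature names, feature table, fold split and ftrl_type below are illustrative only and not taken from the original pipeline:

train_csv = save_ftrl_data(
    data_type='train',
    fnames=['patient_age_group', 'patient_state'],  # illustrative features
    ftablename='patient_activity_feats',            # illustrative table
    test_folds=[4],
    train_folds=[1, 2, 3],
    ftrl_type='1',
    optional_date_ftrl3='',
    optional_condition_ftrl4='')
print(train_csv)  # e.g. '../data/output-py/ftrl/train_ftrl_folds.csv'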
def merge_likelihood_tables(fnames_list, ftablename, train_folds):
    folds = [x for x in range(1, nfold+1)]

    sql_query = open('genentech-sql/pattern_merge_likeli.sql').read()
    sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
    sql_query = sql_query.replace('FEATURE_NAMES_COMMA_SEPARATED', ','.join([','.join(x) for x in fnames_list]))

    likeli_tables_for_join = ''
    
    count = 2
    for fnames in fnames_list:
         likeli_tables_for_join = likeli_tables_for_join + ' INNER JOIN ' + '_'.join(fnames) + '_likeli_table t' +\
                                  str(count) + ' ON ' + ' AND '.join(['t1.' + x + '=t'+str(count)+'.'+x for x in fnames])
         count = count + 1
    sql_query = sql_query.replace('LIKELI_TABLES_FOR_JOIN', likeli_tables_for_join)
    sql_query = sql_query.replace('GENERIC_FEATURE_NAME', '_'.join(['_'.join(x) for x in fnames_list]))
    likeli_function = 'MAX(' + '+'.join(['t'+str(x)+'.feature_avg' for x in range(2, len(fnames_list)+2)]) + ')'
    #likeli_function = 'MAX(1.0-' + '*'.join(['(1.0-t'+str(x)+'.feature_avg)' for x in range(2, len(fnames_list)+2)]) + ')'
    sql_query = sql_query.replace('LIKELI_FUNCTION', likeli_function)

    if len(train_folds) == len(folds):
        choosing_patients_expression = 'patients_test2'
    else:
        choosing_patients_expression = 'train_cv_indices ' + 'WHERE ' + ' OR '.join(['cv_index='+str(x) for x in folds if not x in list(train_folds)])

    sql_query = sql_query.replace('CHOOSING_PATIENTS_EXPRESSION', choosing_patients_expression)

    conn = utils.connect_to_database()
    cur = conn.cursor()
    cur.execute(sql_query)
    likeli = pd.DataFrame(cur.fetchall())
    likeli.columns = [x.name for x in cur.description]
    cur.close()
    conn.close()
    return likeli   
Example #4
def find_shortest_path(start_url, end_url):
    """Find the shortest path between two nodes representing URLs in the database."""
    conn = connect_to_database()
    cursor = conn.cursor()

    cursor.execute(
        """
        MATCH (l1: Link {url: $start_url})-[edge_list: LINKS_TO * bfs..10]-(l2: Link {url: $end_url})
        RETURN edge_list LIMIT 1;
        """, {
            'start_url': start_url,
            'end_url': end_url
        })

    row = cursor.fetchone()

    if row is None:
        cursor.execute("MATCH (l1: Link {url: $start_url}) RETURN l1;",
                       {'start_url': start_url})
        row = cursor.fetchone()
        if row is None:
            raise WebSiteNotFoundError(start_url)
        cursor.execute("MATCH (l1: Link {url: $end_url}) RETURN l1;",
                       {'end_url': end_url})
        row = cursor.fetchone()
        if row is None:
            raise WebSiteNotFoundError(end_url)
        raise ShortestPathNotFoundError(
            f'No path between {start_url} and {end_url}.')

    return row[0], cursor
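
A hypothetical caller for find_shortest_path; the URLs are illustrative, and the sketch assumes the variable-length match returns the relationship list as a Python sequence:

try:
    edge_list, cursor = find_shortest_path('https://example.com',
                                           'https://example.com/example1')
    print('Found a path with {} link(s).'.format(len(edge_list)))
except (WebSiteNotFoundError, ShortestPathNotFoundError) as error:
    print(error)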
def drop_likelihood_table(likeli_table_name):
    conn = utils.connect_to_database()
    cur = conn.cursor()
    cur.execute('DROP TABLE ' + likeli_table_name + ';')
    conn.commit()
    cur.close()
    conn.close()
    return None
def main():
    conn = connect_to_database()
    cur = conn.cursor()
    
    load_staging_tables(cur, conn)
    insert_tables(cur, conn)

    conn.close()
Example #8
def scrap_and_populate_db(url):
    scrap(url)
    conn = connect_to_database()
    cursor = conn.cursor()

    cursor.execute(
        """
        MATCH (l: Link {url: $url})
        RETURN l;
        """, {'url': url})

    return cursor
Example #9
def main():

    conn = connect_to_database()
    cur = conn.cursor()

    print('dropping all tables ...')
    drop_tables(cur, conn)
    print('tables dropped successfully \n creating tables ... ')
    create_tables(cur, conn)
    print('table created successfully')
    conn.close()
    print('connection closed')
def generate_likelihood_table(likeli_table_name, fnames, ftablename, train_folds):
    sql_query = open('genentech-sql/pattern_likeli_table.sql').read()
    sql_query = sql_query.replace('LIKELI_TABLE_NAME', likeli_table_name)
    sql_query = sql_query.replace('T1_COMMA_SEPARATED', ','.join(['t1.'+x for x in fnames]))
    sql_query = sql_query.replace('FEATURE_NAMES_COMMA_SEPARATED', ','.join(fnames))
    sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
    sql_query = sql_query.replace('OPTIONAL_CV_EXPRESSION', 'WHERE ' + ' OR '.join(['cv_index='+str(x) for x in list(train_folds)]))
    conn = utils.connect_to_database()
    cur = conn.cursor()
    cur.execute(sql_query)
    conn.commit()
    cur.close()
    conn.close()
    return None
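
A hypothetical sequence tying the likelihood helpers together. The table name passed to generate_likelihood_table must follow the '_'.join(fnames) + '_likeli_table' convention that merge_likelihood_tables builds its joins from; the feature names, feature table and fold split are illustrative:

fnames_list = [['diagnosis_code'], ['procedure_code']]  # illustrative features
ftablename = 'diagnosis_feats'                          # illustrative table
train_folds = [1, 2, 3]
for fnames in fnames_list:
    generate_likelihood_table('_'.join(fnames) + '_likeli_table',
                              fnames, ftablename, train_folds)
likeli = merge_likelihood_tables(fnames_list, ftablename, train_folds)
for fnames in fnames_list:
    drop_likelihood_table('_'.join(fnames) + '_likeli_table')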
def save_ftrl_data(data_type, fnames, ftablename, test_folds, train_folds, ftrl_type, optional_date_ftrl3, optional_condition_ftrl4):
    conn = utils.connect_to_database()
    cur = conn.cursor()
    
    path = '../data/output-py/ftrl/'
    if (data_type=='val') or (data_type=='train'):
        path_part = 'train'
    else:
        path_part = 'test'

    temp_path = '../data/output-py/ftrl/temp/'
    file_name = data_type + '_ftrl_folds.csv'

    sql_query = open('genentech-sql/pattern_ftrl_' + path_part + ftrl_type + '.sql').read()
    if data_type == 'train':
        sql_query = sql_query.replace('OPTIONAL_CV_EXPRESSION', 'WHERE ' + ' OR '.join(['cv_index='+str(x) for x in train_folds]))
    if data_type == 'val':
        sql_query = sql_query.replace('OPTIONAL_CV_EXPRESSION', 'WHERE ' + ' OR '.join(['cv_index='+str(x) for x in test_folds]))
    sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
    sql_query = sql_query.replace('FEATURES_LIST_COMMA_SEPARATED', ','.join(fnames))
    sql_query = sql_query.replace('T1_FEATURES_COMMA_SEPARATED', ','.join(['t1.'+x for x in fnames]))
    sql_query = sql_query.replace('OPTIONAL_DATE_FTRL3', optional_date_ftrl3)
    sql_query = sql_query.replace('OPTIONAL_CONDITION_FTRL4', optional_condition_ftrl4)

    copy_string = "unload ('" + sql_query + "') to 's3://genentech-2016/ftrl/" + file_name + "' " +\
                  "credentials " + utils.S3_CONNECTION_STRING +\
                  "delimiter ',' gzip allowoverwrite;"

    cur.execute(copy_string)
    conn.commit()

    cur.close()
    conn.close()

    os.system('aws s3 cp s3://genentech-2016/ftrl/ ' + temp_path + ' --recursive')
    os.system('aws s3 rm s3://genentech-2016/ftrl/ --recursive')
    os.system('find ' + temp_path + r' -name \*.gz -exec gunzip {} \;')
    data_parts = ' '.join(sorted(glob.glob(temp_path + '*')))
    if data_type == 'test':
        header = 'patient_id,' + ','.join(fnames) + '\n'
    else:
        header = 'patient_id,' + ','.join(fnames) + ',is_screener\n'
    with open(temp_path + "header.csv", "w") as text_file:
        text_file.write("%s" % header)
    os.system('cat ' + temp_path + 'header.csv ' + data_parts + ' > ' + path + file_name)
    os.system('rm -R ' + temp_path + '/*')

    return path + file_name
Example #12
def merge_likelihood_tables(fnames_list, ftablename, train_folds):
    folds = [x for x in range(1, nfold + 1)]

    sql_query = open('genentech-sql/pattern_merge_likeli.sql').read()
    sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
    sql_query = sql_query.replace('FEATURE_NAMES_COMMA_SEPARATED',
                                  ','.join([','.join(x) for x in fnames_list]))

    likeli_tables_for_join = ''

    count = 2
    for fnames in fnames_list:
        likeli_tables_for_join = likeli_tables_for_join + ' INNER JOIN ' + '_'.join(fnames) + '_likeli_table t' +\
                                 str(count) + ' ON ' + ' AND '.join(['t1.' + x + '=t'+str(count)+'.'+x for x in fnames])
        count = count + 1
    sql_query = sql_query.replace('LIKELI_TABLES_FOR_JOIN',
                                  likeli_tables_for_join)
    sql_query = sql_query.replace('GENERIC_FEATURE_NAME',
                                  '_'.join(['_'.join(x) for x in fnames_list]))
    likeli_function = 'MAX(' + '+'.join([
        't' + str(x) + '.feature_avg' for x in range(2,
                                                     len(fnames_list) + 2)
    ]) + ')'
    #likeli_function = 'MAX(1.0-' + '*'.join(['(1.0-t'+str(x)+'.feature_avg)' for x in range(2, len(fnames_list)+2)]) + ')'
    sql_query = sql_query.replace('LIKELI_FUNCTION', likeli_function)

    if len(train_folds) == len(folds):
        choosing_patients_expression = 'patients_test2'
    else:
        choosing_patients_expression = 'train_cv_indices ' + 'WHERE ' + ' OR '.join(
            [
                'cv_index=' + str(x)
                for x in folds if not x in list(train_folds)
            ])

    sql_query = sql_query.replace('CHOOSING_PATIENTS_EXPRESSION',
                                  choosing_patients_expression)

    conn = utils.connect_to_database()
    cur = conn.cursor()
    cur.execute(sql_query)
    likeli = pd.DataFrame(cur.fetchall())
    likeli.columns = [x.name for x in cur.description]
    cur.close()
    conn.close()
    return likeli
Example #13
def build_train_test():
    conn = utils.connect_to_database()
    target = ['is_screener']
    flist_basic = [
        'patient_age_group', 'patient_state', 'ethinicity', 'household_income',
        'education_level'
    ]
    flist_pah = [
        'activity_type_r_count_all', 'activity_type_a_count_all',
        'activity_type_count_all'
    ]

    sql_query = "SELECT t1.patient_id,t1.is_screener," + ",".join(['t1.'+x for x in flist_basic]) +\
                "," + ",".join(['t2.'+x for x in flist_pah]) +\
                " FROM patients_train t1\
                 LEFT JOIN patient_activity_feats t2\
                 ON t1.patient_id=t2.patient_id;"

    train = pd.read_sql_query(sql_query, conn)
    train.reset_index(drop=True, inplace=True)

    sql_query = "SELECT t1.patient_id," + ",".join(['t1.'+x for x in flist_basic]) +\
                "," + ",".join(['t2.'+x for x in flist_pah]) +\
                " FROM patients_test2 t1\
                 LEFT JOIN patient_activity_feats t2\
                 ON t1.patient_id=t2.patient_id;"

    test = pd.read_sql_query(sql_query, conn)
    test.reset_index(drop=True, inplace=True)

    cv_indices = pd.read_sql_query(
        'SELECT patient_id, cv_index FROM train_cv_indices;', conn)
    train = pd.merge(train, cv_indices, on='patient_id', how='left')

    train = calculate_basic(train)
    test = calculate_basic(test)
    train, test = encode_onehot(train, test, ['patient_state', 'ethinicity'])

    print "Writing to HDF5 store..."
    store = pd.HDFStore('../data/output-py/train_test.h5')
    store.append('train', train)
    store.append('test', test)
    store.close()
    conn.close()
    return train, test
Example #14
def generate_likelihood_table(likeli_table_name, fnames, ftablename,
                              train_folds):
    sql_query = open('genentech-sql/pattern_likeli_table.sql').read()
    sql_query = sql_query.replace('LIKELI_TABLE_NAME', likeli_table_name)
    sql_query = sql_query.replace('T1_COMMA_SEPARATED',
                                  ','.join(['t1.' + x for x in fnames]))
    sql_query = sql_query.replace('FEATURE_NAMES_COMMA_SEPARATED',
                                  ','.join(fnames))
    sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
    sql_query = sql_query.replace(
        'OPTIONAL_CV_EXPRESSION', 'WHERE ' +
        ' OR '.join(['cv_index=' + str(x) for x in list(train_folds)]))
    conn = utils.connect_to_database()
    cur = conn.cursor()
    cur.execute(sql_query)
    conn.commit()
    cur.close()
    conn.close()
    return None
Example #15
def save_to_database(start_url, urls):
    """Save nodes representing URLs and edges representing links to the database."""
    conn = connect_to_database()
    cursor = conn.cursor()

    cursor.execute("MERGE (l:Link {url: $start_url});",
                   {'start_url': start_url})

    for url in urls:
        cursor.execute(
            """
                        MERGE (l1: Link {url: $start_url})
                        MERGE (l2: Link {url: $url}) 
                        MERGE (l1)-[e:LINKS_TO]->(l2)
                        RETURN l1, e, l2;
                        """, {
                'start_url': start_url,
                'url': url
            })
    conn.commit()
 def __init__(self):
     """
     initialises current_team
     Connects to the database
     Creates a temp table for each position
     """
     
     self.current_team = {}
     self.connection, self.cursor = utils.connect_to_database()
     
     self.clone_table_column_creators = "id integer primary key, name varchar(100), player_url varchar(100), position varchar(100), age integer, current_team varchar(20), team_one_year_ago varchar(20), team_two_year_ago varchar(20), team_three_year_ago varchar(20), points_one_year_ago float, points_two_year_ago float, points_three_year_ago float"
     self.clone_table_columns = "id, name, player_url, position, age, current_team, team_one_year_ago, team_two_year_ago, team_three_year_ago, points_one_year_ago, points_two_year_ago, points_three_year_ago"
     self.temp_table_column_creators = "id integer primary key, name varchar(100), position varchar(100), current_team varchar(20), weighted_score float"
     self.temp_table_columns = "id, name, position, current_team, weighted_score"
     
     utils.create_temp_clone_table("players", "cloneplayers", self.clone_table_column_creators,
                                   self.clone_table_columns, self.connection, self.cursor)
     utils.add_column_to_table("cloneplayers", "weighted_score", "float", self.connection, self.cursor)
     utils.update_weighted_scores(self.connection, self.cursor)
     for position in utils.get_positions_list(self.cursor):
         filter_position = position.rstrip('0123456789 ').upper() + '%'
         utils.create_temporary_table(position, self.temp_table_column_creators, self.connection, self.cursor)
         utils.populate_temp_table_from_other_table(position, self.temp_table_columns, "cloneplayers", "position", filter_position, self.connection, self.cursor)
def build_train_test():
    conn = utils.connect_to_database()
    target = ['is_screener']
    flist_basic = ['patient_age_group', 'patient_state', 'ethinicity',
                   'household_income', 'education_level']
    flist_pah = ['activity_type_r_count_all', 'activity_type_a_count_all', 'activity_type_count_all']

    sql_query = "SELECT t1.patient_id,t1.is_screener," + ",".join(['t1.'+x for x in flist_basic]) +\
                "," + ",".join(['t2.'+x for x in flist_pah]) +\
                " FROM patients_train t1\
                 LEFT JOIN patient_activity_feats t2\
                 ON t1.patient_id=t2.patient_id;"
    train = pd.read_sql_query(sql_query, conn)
    train.reset_index(drop=True, inplace=True)

    sql_query = "SELECT t1.patient_id," + ",".join(['t1.'+x for x in flist_basic]) +\
                "," + ",".join(['t2.'+x for x in flist_pah]) +\
                " FROM patients_test2 t1\
                 LEFT JOIN patient_activity_feats t2\
                 ON t1.patient_id=t2.patient_id;"
    test = pd.read_sql_query(sql_query, conn)
    test.reset_index(drop=True, inplace=True)

    cv_indices = pd.read_sql_query('SELECT patient_id, cv_index FROM train_cv_indices;', conn)
    train = pd.merge(train, cv_indices, on='patient_id', how='left')

    train = calculate_basic(train)
    test = calculate_basic(test)
    train, test = encode_onehot(train, test, ['patient_state', 'ethinicity'])

    print "Writing to HDF5 store..."
    store = pd.HDFStore('../data/output-py/train_test.h5')
    store.append('train', train)
    store.append('test', test)
    store.close()
    conn.close()
    return train, test
Example #18
def main():
    '''
    Script entry point:
    - Establishes connection with the sparkifydb database and gets cursor to it
    - Loads staging tables
    - Inserts data into regular tables from staging tables
    - Finally, closes the connection

    Parameters:
        None

    Returns:
        None
    '''
    cur, conn = connect_to_database()

    print('Loading staging tables')
    load_staging_tables(cur, conn)

    print('Inserting data into tables from staging tables')
    insert_tables(cur, conn)

    print('Done')
    conn.close()
def main():
    '''
    Script entry point:
    - Establishes connection with the sparkifydb database and gets cursor to it
    - Drops all the tables
    - Creates all the tables
    - Finally, closes the connection

    Parameters:
        None

    Returns:
        None
    '''
    cur, conn = connect_to_database()

    print('Dropping tables')
    drop_tables(cur, conn)

    print('Creating tables')
    create_tables(cur, conn)

    print('Done')
    conn.close()
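
Both main() variants above expect connect_to_database() to hand back (cur, conn) for the sparkifydb database. A minimal sketch, assuming psycopg2 and placeholder credentials; the real helper and its configuration are not shown in these examples:

import psycopg2

def connect_to_database():
    # Host, user and password below are placeholders.
    conn = psycopg2.connect(
        'host=127.0.0.1 dbname=sparkifydb user=student password=student')
    cur = conn.cursor()
    return cur, conn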
import psycopg2
import pandas as pd
from scipy.stats.stats import pearsonr

import utils

conn = utils.connect_to_database()
cur = conn.cursor()

cur.execute(open('genentech-sql/patient_activity_feats.sql').read())
conn.commit()

cur.execute(open('genentech-sql/prescription_feats.sql').read())
conn.commit()

cur.execute(open('genentech-sql/diagnosis_feats.sql').read())
conn.commit()

cur.execute(open('genentech-sql/diagnosis_procedure_link.sql').read())
conn.commit()

cur.execute(open('genentech-sql/diagnosis_pairs.sql').read())
conn.commit()

cur.execute(open('genentech-sql/diagnosis_feats2.sql').read())
conn.commit()

cur.execute(open('genentech-sql/procedure_head2.sql').read())
conn.commit()

cur.execute(open('genentech-sql/diagnosis_procedure_link2.sql').read())
Example #21
def test_rsm(training_set_size = 500, validation_set_size = 200, n_hidden = 200):
	from batch_data import BatchData as Batch
	import utils
	
	training_set_name   = "training_set_%d" % training_set_size
	validation_set_name = "validation_set_%d" % validation_set_size

	# We make sure Mongo is running somewhere :
	utils.connect_to_database(database_name = 'yelp')

	def load_dataset(size = 500, lexicon = None, name="training_set"):
		# if training_set.npy doesn't exist:
		rc = utils.ResourceConverter(lexicon = lexicon)
		batch = Batch(
		    data=utils.mongo_database_global['restaurants'].find({}, {'signature':1}), # from Mongo's cursor enumerator
		    batch_size = size,  # mini-batch
		    shuffle = True, # stochastic
		    conversion = rc.process # convert to matrices using lexicon)
		)
		dataset = batch.next()
		# and save it for later.
		numpy.save(name, dataset)
		return dataset

	if file_exists("lexicon.gzp"):
		lexicon = utils.Lexicon.load("lexicon.gzp")
	else:
		# if lexicon.gzp doesnt exist:
		# 'restaurants' is the name of the collection, we stem the words in the triggers,
		# and we lowercase them to minimize the visible dimensions (bag of words dimensions)
		lexicon = utils.gather_lexicon('restaurants',
		                               stem= True, 
		                               lowercase = True,
		                               show_progress= True)
		lexicon.save("lexicon.gzp")

	if file_exists("%s.npy" % training_set_name):
		train_set_x_mem      = numpy.load("%s.npy" % training_set_name)
	else:
		train_set_x_mem      = load_dataset(size = training_set_size, lexicon = lexicon, name=training_set_name)

	if file_exists("%s.npy" % validation_set_name):
		validation_set_x_mem = numpy.load("%s.npy" % validation_set_name)
	else:
		validation_set_x_mem = load_dataset(size = validation_set_size, lexicon = lexicon, name=validation_set_name)

	train_set_x      = theano.shared(train_set_x_mem,      borrow = True)
	validation_set_x = theano.shared(validation_set_x_mem, borrow = True)

	# construct the RSM class
	mini_batch_size = 100
	# allocate symbolic variables for the data
	n_train_batches = floor(train_set_x.get_value(borrow=True).shape[0] / mini_batch_size)
	rng             = numpy.random.RandomState(123)
	theano_rng      = T.shared_randomstreams.RandomStreams(rng.randint(2 ** 30))
	rsm             = RSM(n_visible=lexicon.max_index, n_hidden=n_hidden, numpy_rng=rng, theano_rng=theano_rng)

	def save_rsm(name=''):
		# save computation results:
		numpy.save(("%s_W_trained"     % name),     rsm.W.get_value(borrow=True))
		numpy.save(("%s_hbias_trained" % name), rsm.hbias.get_value(borrow=True))
		numpy.save(("%s_vbias_trained" % name), rsm.vbias.get_value(borrow=True))

	# get training function
	learning_rate  = theano.shared(0.01)
	cost, updates = rsm.get_cost_updates(lr=learning_rate, k=2)

	index = T.lscalar()    # index to a [mini]batch
	train_rsm = theano.function(
	    [index],
	    cost,
	    updates=updates,
	    givens={
	        rsm.input:   train_set_x[index * mini_batch_size:(index + 1) * mini_batch_size],
	        rsm.scaling: train_set_x[index * mini_batch_size:(index + 1) * mini_batch_size].sum(axis=1).astype(theano.config.floatX),
	    },
	    name='train_rbm')


	[pre_sigmoid_h1, h1_mean, h1_sample,
				pre_sigmoid_v1, v1_mean, v1_sample] = rsm.gibbs_vhv(rsm.input)

	validate_rsm = theano.function(
		[],
		rsm.reconstruction_cost(rsm.input, v1_sample),
		givens = {
			rsm.input: validation_set_x,
			rsm.scaling: validation_set_x.sum(axis=1)
		}
	)


	training_epochs     = 300 # will stop early
	batch_indices = [i for i in range(n_train_batches)]
	start_time = time.time()
	min_val = None
	try:
		for epoch in range(training_epochs):

			# go through the training set
			mean_cost = []

			# more stochasticity:
			random.shuffle(batch_indices)

			for batch_index in batch_indices:
			    mean_cost.append(train_rsm(batch_index) / mini_batch_size)

			validation_cost = validate_rsm()           / validation_set_size

			print('Training epoch %d, cost is %.4f, validation cost is %.4f' % (epoch+1, numpy.mean(mean_cost), validation_cost))
		
			if min_val != None and validation_cost < min_val:
				save_rsm("min_validation")
			min_val = min(min_val, validation_cost) if min_val != None else validation_cost

		print('Training took %.05fmn' % ((time.time() - start_time)/60.0))
	except (KeyboardInterrupt, SystemExit):
		print("Saving final rsm...")
		save_rsm("final")
		exit()
	except:
		raise
	print("Saving final rsm...")
	save_rsm("final")
Example #22
 def __init__(self):
     if config['installed']:
         self.db = utils.connect_to_database()
Example #23
def test_rsm(training_set_size=500, validation_set_size=200, n_hidden=200):
    from batch_data import BatchData as Batch
    import utils

    training_set_name = "training_set_%d" % training_set_size
    validation_set_name = "validation_set_%d" % validation_set_size

    # We make sure Mongo is running somewhere :
    utils.connect_to_database(database_name='yelp')

    def load_dataset(size=500, lexicon=None, name="training_set"):
        # if training_set.npy doesn't exist:
        rc = utils.ResourceConverter(lexicon=lexicon)
        batch = Batch(
            data=utils.mongo_database_global['restaurants'].find(
                {}, {'signature': 1}),  # from Mongo's cursor enumerator
            batch_size=size,  # mini-batch
            shuffle=True,  # stochastic
            conversion=rc.process  # convert to matrices using lexicon)
        )
        dataset = batch.next()
        # and save it for later.
        numpy.save(name, dataset)
        return dataset

    if file_exists("lexicon.gzp"):
        lexicon = utils.Lexicon.load("lexicon.gzp")
    else:
        # if lexicon.gzp doesnt exist:
        # 'restaurants' is the name of the collection, we stem the words in the triggers,
        # and we lowercase them to minimize the visible dimensions (bag of words dimensions)
        lexicon = utils.gather_lexicon('restaurants',
                                       stem=True,
                                       lowercase=True,
                                       show_progress=True)
        lexicon.save("lexicon.gzp")

    if file_exists("%s.npy" % training_set_name):
        train_set_x_mem = numpy.load("%s.npy" % training_set_name)
    else:
        train_set_x_mem = load_dataset(size=training_set_size,
                                       lexicon=lexicon,
                                       name=training_set_name)

    if file_exists("%s.npy" % validation_set_name):
        validation_set_x_mem = numpy.load("%s.npy" % validation_set_name)
    else:
        validation_set_x_mem = load_dataset(size=validation_set_size,
                                            lexicon=lexicon,
                                            name=validation_set_name)

    train_set_x = theano.shared(train_set_x_mem, borrow=True)
    validation_set_x = theano.shared(validation_set_x_mem, borrow=True)

    # construct the RSM class
    mini_batch_size = 100
    # allocate symbolic variables for the data
    n_train_batches = floor(
        train_set_x.get_value(borrow=True).shape[0] / mini_batch_size)
    rng = numpy.random.RandomState(123)
    theano_rng = T.shared_randomstreams.RandomStreams(rng.randint(2**30))
    rsm = RSM(n_visible=lexicon.max_index,
              n_hidden=n_hidden,
              numpy_rng=rng,
              theano_rng=theano_rng)

    def save_rsm(name=''):
        # save computation results:
        numpy.save(("%s_W_trained" % name), rsm.W.get_value(borrow=True))
        numpy.save(("%s_hbias_trained" % name),
                   rsm.hbias.get_value(borrow=True))
        numpy.save(("%s_vbias_trained" % name),
                   rsm.vbias.get_value(borrow=True))

    # get training function
    learning_rate = theano.shared(0.01)
    cost, updates = rsm.get_cost_updates(lr=learning_rate, k=2)

    index = T.lscalar()  # index to a [mini]batch
    train_rsm = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            rsm.input:
            train_set_x[index * mini_batch_size:(index + 1) * mini_batch_size],
            rsm.scaling:
            train_set_x[index * mini_batch_size:(index + 1) *
                        mini_batch_size].sum(axis=1).astype(
                            theano.config.floatX),
        },
        name='train_rbm')

    [pre_sigmoid_h1, h1_mean, h1_sample, pre_sigmoid_v1, v1_mean,
     v1_sample] = rsm.gibbs_vhv(rsm.input)

    validate_rsm = theano.function([],
                                   rsm.reconstruction_cost(
                                       rsm.input, v1_sample),
                                   givens={
                                       rsm.input: validation_set_x,
                                       rsm.scaling:
                                       validation_set_x.sum(axis=1)
                                   })

    training_epochs = 300  # will stop early
    batch_indices = [i for i in range(n_train_batches)]
    start_time = time.time()
    min_val = None
    try:
        for epoch in range(training_epochs):

            # go through the training set
            mean_cost = []

            # more stochasticity:
            random.shuffle(batch_indices)

            for batch_index in batch_indices:
                mean_cost.append(train_rsm(batch_index) / mini_batch_size)

            validation_cost = validate_rsm() / validation_set_size

            print('Training epoch %d, cost is %.4f, validation cost is %.4f' %
                  (epoch + 1, numpy.mean(mean_cost), validation_cost))

            if min_val != None and validation_cost < min_val:
                save_rsm("min_validation")
            min_val = min(
                min_val,
                validation_cost) if min_val != None else validation_cost

        print('Training took %.05fmn' % ((time.time() - start_time) / 60.0))
    except (KeyboardInterrupt, SystemExit):
        print("Saving final rsm...")
        save_rsm("final")
        exit()
    except:
        raise
    print("Saving final rsm...")
    save_rsm("final")
Example #24
import random
from utils import connect_to_database
from pymongo.errors import ConnectionFailure
from warnings import warn

try:
    DB = connect_to_database(database_name="yelp")
except ConnectionFailure as e:
    warn("Could not connect to MongoDB database `yelp`")
    DB = None


class Personality:
    def __init__(self, good_requirement, collection_name="restaurants"):
        self.collection_name = collection_name
        self.good_requirement = good_requirement
        self.good_examples = []

    def _random_sample(self):
        database = DB[self.collection_name]

        all_elements = database.count()
        random_el = random.randint(0, all_elements)
        els = list(
            database.find({"review_count": {
                "$gt": 4
            }}, {
                "url": 1,
                "rating": 1,
                "price": 1,
                "categories": 1
            }, limit=1, skip=random_el))
        if len(els) > 0:
            return els[0]
        else:
            return self._random_sample()
Example #25
 def test_connect_to_database(self):
     self.assertIs(type(df.connect_to_database()), pyodbc.Connection)
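
The assertion above only requires that the helper hand back a pyodbc.Connection; a minimal sketch consistent with it, with placeholder driver, server and database values:

import pyodbc

def connect_to_database():
    # The ODBC driver name and connection details are placeholders.
    return pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};'
                          'SERVER=localhost;DATABASE=example;'
                          'Trusted_Connection=yes;')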
Example #26
    $ curl -X POST -H "Content-type: application/json" http://127.0.0.1:5000/predict_map -d '{"Time": 57007.0, "V1": -1.2712441917143702, "V2": 2.46267526851135, "V3": -2.85139500331783, "V4": 2.3244800653477995, "V5": -1.37224488981369, "V6": -0.948195686538643, "V7": -3.06523436172054, "V8": 1.1669269478721105, "V9": -2.2687705884481297, "V10": -4.88114292689057, "V11": 2.2551474887046297, "V12": -4.68638689759229, "V13": 0.652374668512965, "V14": -6.17428834800643, "V15": 0.594379608016446, "V16": -4.8496923870965185, "V17": -6.53652073527011, "V18": -3.11909388163881, "V19": 1.71549441975915, "V20": 0.560478075726644, "V21": 0.652941051330455, "V22": 0.0819309763507574, "V23": -0.22134783119833895, "V24": -0.5235821592333061, "V25": 0.224228161862968, "V26": 0.756334522703558, "V27": 0.632800477330469, "V28": 0.25018709275719697, "Amount": 0.01}'
    Returns:
        resp (json): predicted fraud probability
    """
    X = _manage_query(request)
    y_pred = model.predict(X)[0]
    print("Value predicted: {}".format(y_pred))
    if y_pred >= FRAUD_THRESHOLD:
        row = select_random_row(conn, TABLE_LOCATIONS)
        location = {"title": row[0], "latitude": row[1], "longitude": row[2]}
        print("New location: {}".format(location))
        socketio.emit('map_update', location, broadcast=True, namespace='/fraud')
    return make_response(jsonify({'fraud': y_pred}), STATUS_OK)


# Load the model as a global variable
model = lgb.Booster(model_file=BASELINE_MODEL)

# Connect to database
conn = connect_to_database(DATABASE_FILE)

if __name__ == "__main__":
    try:
        print("Server started")
        socketio.run(app, debug=True)
    except:
        raise
    finally:
        print("Stop procedure")
        conn.close()
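
The handler above depends on connect_to_database(DATABASE_FILE) and select_random_row(conn, TABLE_LOCATIONS), neither of which is shown. A minimal sketch assuming SQLite (inferred from the single-file connection argument) and columns matching how the row is unpacked:

import sqlite3

def connect_to_database(database_file):
    # check_same_thread=False lets the Flask/SocketIO workers reuse the
    # module-level connection created above.
    return sqlite3.connect(database_file, check_same_thread=False)

def select_random_row(conn, table_name):
    # Column names are assumptions based on the (title, latitude, longitude)
    # unpacking in the handler; ORDER BY RANDOM() is SQLite-specific.
    cursor = conn.cursor()
    cursor.execute('SELECT title, latitude, longitude FROM {} '
                   'ORDER BY RANDOM() LIMIT 1'.format(table_name))
    return cursor.fetchone()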
Example #27
def calculate_likelihoods(train,
                          test,
                          fnames,
                          ftablename,
                          function_type='max',
                          query_type='',
                          optional_filter_feature_likeli6='',
                          optional_filter_value_likeli6=''):
    global_mean = np.mean(train.is_screener)
    folds = [x for x in range(1, nfold + 1)]

    likeli_all = pd.DataFrame()
    for L in range(1, len(folds) + 1):
        for train_folds in itertools.combinations(folds, L):
            print(train_folds)
            sql_query = open('genentech-sql/pattern_likeli_multiple' +
                             query_type + '.sql').read()
            sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
            sql_query = sql_query.replace('GENERIC_FEATURE_NAME',
                                          '_'.join(fnames))
            sql_query = sql_query.replace('FEATURE_NAMES_COMMA_SEPARATED',
                                          ','.join(fnames))
            sql_query = sql_query.replace(
                'T1_COMMA_SEPARATED', ','.join(['t1.' + x for x in fnames]))
            sql_query = sql_query.replace(
                'T3_T4_CONDITION',
                ' AND '.join(['t3.' + x + '=t4.' + x for x in fnames]))
            sql_query = sql_query.replace(
                'OPTIONAL_CV_EXPRESSION', 'WHERE ' +
                ' OR '.join(['cv_index=' + str(x) for x in list(train_folds)]))
            sql_query = sql_query.replace('GROUP_FUNCTION', function_type)
            sql_query = sql_query.replace(
                'OPTIONAL_CONDITION_LIKELI6',
                'WHERE ' + optional_filter_feature_likeli6 + "='" +
                optional_filter_value_likeli6 + "'")
            #sql_query = sql_query.replace('OPTIONAL_CONDITION_LIKELI6', 'WHERE ' + optional_filter_feature_likeli6 + ">=" + optional_filter_value_likeli6)
            if len(list(train_folds)) == len(folds):
                choosing_patients_expression = 'patients_test2'
            else:
                choosing_patients_expression = 'train_cv_indices ' + 'WHERE ' + ' OR '.join(
                    [
                        'cv_index=' + str(x)
                        for x in folds if not x in list(train_folds)
                    ])
            sql_query = sql_query.replace('CHOOSING_PATIENTS_EXPRESSION',
                                          choosing_patients_expression)

            conn = utils.connect_to_database()
            cur = conn.cursor()
            cur.execute(sql_query)
            if (query_type == '3') or (query_type == '4') or (query_type
                                                              == '5'):
                conn.commit()
                sql_query = open('genentech-sql/pattern_likeli_multiple' +
                                 query_type + '_2.sql').read()
                sql_query = sql_query.replace('GENERIC_FEATURE_NAME',
                                              '_'.join(fnames))
                sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
                cur.execute(sql_query)
                likeli = pd.DataFrame(cur.fetchall())
                likeli.columns = [x.name for x in cur.description]
                cur.execute('DROP TABLE patient_likeli_table;')
                conn.commit()
            else:
                likeli = pd.DataFrame(cur.fetchall())
                likeli.columns = [x.name for x in cur.description]

            for x in folds:
                if x in list(train_folds):
                    likeli['fold' + str(x)] = 1
                else:
                    likeli['fold' + str(x)] = 0
            cur.close()
            conn.close()

            likeli_all = likeli_all.append(likeli, ignore_index=True)
            col = likeli.columns[1]
            likeli = pd.merge(likeli,
                              train[['patient_id', 'is_screener']],
                              on='patient_id',
                              how='inner')
            if len(likeli) > 0:
                print "Pearson correlation: " + str(
                    pearsonr(likeli.is_screener, likeli[col]))
                print "AUC: " + str(auc(likeli.is_screener, likeli[col]))
            del likeli

    feats_all = train[['patient_id']].append(test[['patient_id']],
                                             ignore_index=True)
    for test_fold in ([0] + folds):
        train_folds = [x for x in folds if (x != test_fold) and (x != 0)]

        if len(train_folds) == len(folds):
            pd_query = ' and '.join(
                ['fold' + str(x) + '==1' for x in train_folds])
        else:
            pd_query = ' and '.join(
                ['fold' + str(x) + '==1'
                 for x in train_folds]) + ' and ' + ' and '.join([
                     'fold' + str(x) + '==0'
                     for x in folds if not x in train_folds
                 ])
        print(pd_query)

        likeli = likeli_all.query(pd_query).copy().reset_index(drop=True)
        for x in folds:
            likeli.drop('fold' + str(x), axis=1, inplace=True)

        if test_fold == 0:
            feats_fold = test[['patient_id']].copy()
        else:
            feats_fold = train.query('cv_index==@test_fold')[['patient_id'
                                                              ]].copy()
        feats_fold = pd.merge(feats_fold, likeli, on='patient_id', how='left')
        del likeli

        for val_fold in [x for x in folds if (x != test_fold) and (x != 0)]:
            train_folds = [
                x for x in folds
                if (x != test_fold) and (x != val_fold) and (x != 0)
            ]
            pd_query = ' and '.join(
                ['fold' + str(x) + '==1'
                 for x in train_folds]) + ' and ' + ' and '.join([
                     'fold' + str(x) + '==0'
                     for x in folds if not x in train_folds
                 ])

            likeli = likeli_all.query(pd_query).copy().reset_index(drop=True)
            for x in folds:
                likeli.drop('fold' + str(x), axis=1, inplace=True)

            feats_val_fold = train.query('cv_index==@val_fold')[['patient_id'
                                                                 ]].copy()
            feats_val_fold = pd.merge(feats_val_fold,
                                      likeli,
                                      on='patient_id',
                                      how='left')
            del likeli
            feats_fold = feats_fold.append(feats_val_fold, ignore_index=True)

        col = feats_fold.columns[1]
        feats_fold = feats_fold.reset_index(drop=True)
        feats_fold[col].fillna(global_mean, inplace=True)
        #feats_fold[fname_w_likeli].fillna(global_mean, inplace=True)
        feats_fold = feats_fold.rename(
            columns={col: col + '_fold_' + str(test_fold)})
        #feats_fold = feats_fold.rename(columns={fname_w_likeli : fname_w_likeli+'_fold_'+str(test_fold)})
        feats_all = pd.merge(feats_all,
                             feats_fold,
                             on='patient_id',
                             how='left')

    print "Writing to HDF5 store..."
    store = pd.HDFStore('../data/output-py/' + col + '.h5')
    store.append('feats_all', feats_all)
    store.close()
    conn.close()
    print "Feature " + col + " is saved in file."
    return col
import random 
from utils import connect_to_database
from pymongo.errors import ConnectionFailure
from warnings import warn

try:
    DB = connect_to_database(database_name = "yelp")
except ConnectionFailure as e:
    warn("Could not connect to MongoDB database `yelp`")
    DB = None

class Personality:
    def __init__(self, good_requirement, collection_name = "restaurants"):
        self.collection_name = collection_name
        self.good_requirement = good_requirement
        self.good_examples = []
        
    def _random_sample(self):
        database = DB[self.collection_name]
        all_elements = database.count()
        random_el = random.randint(0, all_elements)
        els = list(database.find({"review_count": {"$gt": 4}}, {"url": 1,"rating":1, "price": 1, "categories":1}, limit=1, skip=random_el))
        if len(els) > 0:
            return els[0]
        else:
            return self._random_sample()
        
    def generate_bad_sample(self):
        sample = self._random_sample() 
        if self.good_requirement(sample):
            return self.generate_bad_sample()
Example #29
              right_on='Departamento')

df = df[['Departamento', 'Descripción de Distrito', 'IDH (2018)']]
df = df.rename(
    columns={
        'Departamento': 'departament',
        'Descripción de Distrito': 'municipality',
        'IDH (2018)': 'idh'
    })
df.head()

# ### Insert to DB

df.head()

con = utils.connect_to_database()

# +
#df.to_sql('demographics',con,'semantic')

# +
# After this point sql/semantic/create_tenders_demographics.sql  should be run to match municipality in tenders
# -

# #### Get Data of Gender and age by municipality

# We didn't use this part in the bias analysis; leave it here just in case we need to pick it up later.


def demographics(file):
def calculate_likelihoods(train, test, fnames, ftablename, function_type='max', query_type='', optional_filter_feature_likeli6='', optional_filter_value_likeli6=''):
    global_mean = np.mean(train.is_screener)
    folds = [x for x in range(1, nfold+1)]

    likeli_all = pd.DataFrame()
    for L in range(1, len(folds)+1):
        for train_folds in itertools.combinations(folds, L):
            print(train_folds)
            sql_query = open('genentech-sql/pattern_likeli_multiple' + query_type + '.sql').read()
            sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
            sql_query = sql_query.replace('GENERIC_FEATURE_NAME', '_'.join(fnames))
            sql_query = sql_query.replace('FEATURE_NAMES_COMMA_SEPARATED', ','.join(fnames))
            sql_query = sql_query.replace('T1_COMMA_SEPARATED', ','.join(['t1.'+x for x in fnames]))
            sql_query = sql_query.replace('T3_T4_CONDITION', ' AND '.join(['t3.'+x+'=t4.'+x for x in fnames]))
            sql_query = sql_query.replace('OPTIONAL_CV_EXPRESSION', 'WHERE ' + ' OR '.join(['cv_index='+str(x) for x in list(train_folds)]))
            sql_query = sql_query.replace('GROUP_FUNCTION', function_type)
            sql_query = sql_query.replace('OPTIONAL_CONDITION_LIKELI6', 'WHERE ' + optional_filter_feature_likeli6 + "='" + optional_filter_value_likeli6 + "'")
            #sql_query = sql_query.replace('OPTIONAL_CONDITION_LIKELI6', 'WHERE ' + optional_filter_feature_likeli6 + ">=" + optional_filter_value_likeli6)
            if len(list(train_folds)) == len(folds):
                choosing_patients_expression = 'patients_test2'
            else:
                choosing_patients_expression = 'train_cv_indices ' + 'WHERE ' + ' OR '.join(['cv_index='+str(x) for x in folds if not x in list(train_folds)])
            sql_query = sql_query.replace('CHOOSING_PATIENTS_EXPRESSION', choosing_patients_expression)

            conn = utils.connect_to_database()
            cur = conn.cursor()
            cur.execute(sql_query)
            if (query_type == '3') or (query_type == '4') or (query_type == '5'):
                conn.commit()
                sql_query = open('genentech-sql/pattern_likeli_multiple' + query_type + '_2.sql').read()
                sql_query = sql_query.replace('GENERIC_FEATURE_NAME', '_'.join(fnames))
                sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
                cur.execute(sql_query)
                likeli = pd.DataFrame(cur.fetchall())
                likeli.columns = [x.name for x in cur.description]
                cur.execute('DROP TABLE patient_likeli_table;')
                conn.commit()
            else:
                likeli = pd.DataFrame(cur.fetchall())
                likeli.columns = [x.name for x in cur.description]

            for x in folds:
                if x in list(train_folds):
                    likeli['fold'+str(x)] = 1
                else:
                    likeli['fold'+str(x)] = 0
            cur.close()
            conn.close()
            
            likeli_all = likeli_all.append(likeli, ignore_index=True)
            col = likeli.columns[1]
            likeli = pd.merge(likeli, train[['patient_id', 'is_screener']], on='patient_id', how='inner')
            if len(likeli)>0:
                print "Pearson correlation: " + str(pearsonr(likeli.is_screener, likeli[col]))
                print "AUC: " + str(auc(likeli.is_screener, likeli[col]))
            del likeli

    feats_all = train[['patient_id']].append(test[['patient_id']], ignore_index=True)
    for test_fold in ([0] + folds):
        train_folds = [x for x in folds if (x != test_fold) and (x != 0)]

        if len(train_folds) == len(folds):
            pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds])
        else:
            pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds]) + ' and ' + ' and '.join(['fold'+str(x)+'==0' for x in folds if not x in train_folds])
        print(pd_query)

        likeli = likeli_all.query(pd_query).copy().reset_index(drop=True)
        for x in folds:
            likeli.drop('fold'+str(x), axis=1, inplace=True)

        if test_fold == 0:
            feats_fold = test[['patient_id']].copy()
        else:
            feats_fold = train.query('cv_index==@test_fold')[['patient_id']].copy()
        feats_fold = pd.merge(feats_fold, likeli, on='patient_id', how='left')
        del likeli

        for val_fold in [x for x in folds if (x != test_fold) and (x != 0)]:
            train_folds = [x for x in folds if (x != test_fold) and (x != val_fold) and (x != 0)]
            pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds]) + ' and ' + ' and '.join(['fold'+str(x)+'==0' for x in folds if not x in train_folds])
            
            likeli = likeli_all.query(pd_query).copy().reset_index(drop=True)
            for x in folds:
                likeli.drop('fold'+str(x), axis=1, inplace=True)

            feats_val_fold = train.query('cv_index==@val_fold')[['patient_id']].copy()
            feats_val_fold = pd.merge(feats_val_fold, likeli, on='patient_id', how='left')
            del likeli
            feats_fold = feats_fold.append(feats_val_fold, ignore_index=True)

        col = feats_fold.columns[1]
        feats_fold = feats_fold.reset_index(drop=True)
        feats_fold[col].fillna(global_mean, inplace=True)
        #feats_fold[fname_w_likeli].fillna(global_mean, inplace=True)
        feats_fold = feats_fold.rename(columns={col : col+'_fold_'+str(test_fold)})
        #feats_fold = feats_fold.rename(columns={fname_w_likeli : fname_w_likeli+'_fold_'+str(test_fold)})
        feats_all = pd.merge(feats_all, feats_fold, on='patient_id', how='left')

    print "Writing to HDF5 store..."
    store = pd.HDFStore('../data/output-py/' + col + '.h5')
    store.append('feats_all', feats_all)
    store.close()
    conn.close()
    print "Feature " + col + " is saved in file."
    return col