def run_component():
    # increase limit to handle the increase in depth/width of dictionary objects
    sys.setrecursionlimit(50000)
    # recall we divided up the data into multiple pickle files
    file_list = [f for f in os.listdir('data/data_subsets') if not f.startswith('.')]
    # set the number of processes
    num_proc = 5
    # divide the files into num_proc (roughly) even sets
    file_sets = h.chunkify(file_list, num_proc)
    proc_list = []
    for file_set in file_sets:
        proc = mp.Process(target=run_process, args=(file_set,))
        proc_list.append(proc)
    for proc in proc_list:
        proc.start()
    for proc in proc_list:
        proc.join()

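# None of these snippets show the chunkify helper itself (imported as `h`,
# `helper`, or directly). A minimal sketch of the two-argument form, assuming it
# splits a sequence into `num` contiguous, roughly equal-sized sublists; this is
# an illustrative guess, not the original implementation:
def chunkify(lst, num):
    # chunk sizes differ by at most one item
    base, extra = divmod(len(lst), num)
    chunks, start = [], 0
    for i in range(num):
        size = base + (1 if i < extra else 0)
        chunks.append(lst[start:start + size])
        start += size
    return chunks
# e.g. chunkify(list(range(7)), 3) -> [[0, 1, 2], [3, 4], [5, 6]]
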
def showBookshelf():
    books = session.query(Book).all()
    book_chunks = chunkify(books, 4)
    genres = getGenreList(books)
    return render_template('bookshelf.html', book_chunks=book_chunks, genres=genres)

def showGenre(genre):
    genre_format = genre.replace('+', ' ').title()
    books_by_genre = session.query(Book).filter_by(category=genre_format).all()
    book_chunks = chunkify(books_by_genre, 4)
    return render_template('showGenre.html', genre=genre_format, book_chunks=book_chunks)

def showLandingPage():
    if 'username' not in login_session:
        return render_template('landingpage.html')
    else:
        user_id = getUserID(login_session['email'])
        books = session.query(Book).filter_by(user_id=user_id).all()
        book_chunks = chunkify(books, 4)
        genres = getGenreList(books)
        return render_template('mainpage.html', book_chunks=book_chunks,
                               genres=genres, login_session=login_session)

def showGenre(genre):
    if 'username' not in login_session:
        return redirect('/login')
    genre_frmt = genre.replace('+', ' ').title()
    books_by_genre = session.query(Book).filter_by(category=genre_frmt).all()
    book_chunks = chunkify(books_by_genre, 4)
    return render_template('showGenre.html', genre=genre_frmt,
                           book_chunks=book_chunks, login_session=login_session)

def run_component():
    pickle_file = 'data/data.pickle'
    meta_file = 'data/scraping_metadata.pickle'
    # load up the data (NOTE: kept the metadata on scrape time in a separate cPickle file because that was a separate project)
    data = pickle.load(open(pickle_file, 'rb'))
    metadata = pickle.load(open(meta_file, 'rb'))
    # 'search_domains' has been previously set up in the metadata file
    for num_threads in metadata['search_domains']:
        keys = metadata['search_domains'][num_threads]
        # divide up the given set of keys into num_threads subsets
        keys_subsets = h.chunkify(keys, num_threads)
        # each new thread-count group is a new session, and each program start is also a new session
        session = datetime.datetime.now()
        metadata['data'].update({session: {}})
        thread_list = []
        threadID = 0
        # print an update to keep track
        print("=========== \n NEW MULTITHREADING \nNEW NUMBER OF THREADS: %s \n===========" % (num_threads))
        # run multiple threads
        for thread_keys in keys_subsets:
            threadID += 1
            metadata['data'][session].update({threadID: []})
            thread = ST.scrapeThread(threadID, num_threads, thread_keys,
                                     pickle_file, meta_file, data, session, metadata)
            thread_list.append(thread)
        for thread in thread_list:
            thread.start()
        for thread in thread_list:
            thread.join()
        # final update of data within the session
        pickle.dump(data, open(pickle_file, 'wb'))
        pickle.dump(metadata, open(meta_file, 'wb'))
    # final update of data after the entire program
    pickle.dump(data, open(pickle_file, 'wb'))
    pickle.dump(metadata, open(meta_file, 'wb'))

def multi():
    lst = [i for i in range(300)]
    num_proc = 1
    array_sets = h.chunkify(lst, num_proc)
    proc_list = []
    for array_set in array_sets:
        proc = mp.Process(target=my_f, args=(array_set,))
        proc_list.append(proc)
    for proc in proc_list:
        proc.start()
    for proc in proc_list:
        proc.join()

def mixed_effects_analysis(args, embed_matrix):
    # load common brain space
    subjects = [1, 2, 4, 5, 7, 8, 9, 10, 11]
    num_sentences = 240
    common_space = helper.load_common_space(subjects, local=args.local)
    print("COMMON SPACE SHAPE: " + str(common_space.shape))
    voxel_coordinates = np.transpose(np.nonzero(common_space))
    num_voxels = len(voxel_coordinates)
    print("NUM VOXELS IN SHARED COMMON BRAIN SPACE: " + str(num_voxels))

    # initialize variables
    all_activations = []
    subj_number = []
    voxel_index = []

    # prepare model embeddings
    dim_labels = ['dim' + str(i) for i in range(embed_matrix.shape[1])]
    embed_matrix_pd = pd.DataFrame(embed_matrix, columns=dim_labels)
    print("EMBEDDINGS SHAPE: " + str(embed_matrix_pd.shape))
    embed_matrix_pd_repeat = pd.concat([embed_matrix_pd] * len(subjects), ignore_index=True)
    embed_matrix_pd_repeat.insert(0, 'bias', 1)
    print("REPEAT EMBEDDINGS SHAPE: " + str(embed_matrix_pd_repeat.shape))

    # get labels
    labels = ""
    conditional_labels = ""
    for i in range(embed_matrix.shape[1]):
        labels += 'dim' + str(i) + ' + '
        conditional_labels += 'dim' + str(i) + ' | subject_number + '

    # get data
    for subj in tqdm(subjects):
        if args.local:
            modified_activations = pickle.load(
                open(f"../examplesGLM/subj{subj}/modified_activations.p", "rb"))
        else:
            modified_activations = pickle.load(
                open(f"/n/shieber_lab/Lab/users/cjou/fmri/subj{subj}/modified_activations.p", "rb"))

        norm_modified_activations = helper.z_score(np.array(modified_activations))
        activation_vals = np.array([
            modified_elem[np.nonzero(common_space)]
            for modified_elem in norm_modified_activations
        ])
        # print("ACTIVATIONS SHAPE: " + str(activation_vals.shape))
        flatten_activations = get_activations(activation_vals)
        # print("FLATTEN ACTIVATIONS SHAPE: " + str(flatten_activations.shape))

        all_activations.extend(flatten_activations)
        voxel_index.extend(list(range(num_voxels)) * num_sentences)
        subj_number.extend([subj] * num_voxels * num_sentences)

        del modified_activations
        del norm_modified_activations
        del activation_vals
        del flatten_activations

    print("ACTIVATIONS LENGTH: " + str(len(all_activations)))
    print("SUBJECT NUMBER LENGTH: " + str(len(subj_number)))
    print("VOXEL INDEX: " + str(len(voxel_index)))

    # create dataframe
    data = pd.DataFrame({
        'subject_number': subj_number,
        'voxel_index': voxel_index,
        'activations': all_activations
    })
    data_slice = data.loc[data["voxel_index"] == 0]
    print("DATA SLICE SHAPE: " + str(data_slice.shape))

    # per voxel
    rmses_per_voxel = []
    CHUNK = helper.chunkify(list(range(num_voxels)), args.batch_num, args.total_batches)
    for v in tqdm(CHUNK):
        data_slice = data.loc[data["voxel_index"] == v].reset_index()
        # concat_pd = pd.concat([data_slice, embed_matrix_pd_repeat], axis=1)
        rmse = run_per_voxel(data_slice, embed_matrix_pd_repeat, labels)
        rmses_per_voxel.append(rmse)

    return rmses_per_voxel

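# Here and in the spotlight function below, helper.chunkify takes three
# arguments and appears to return only the slice of work assigned to one batch
# job (batch_num out of total_batches). A sketch of that assumed behavior, under
# a hypothetical name so it does not clash with the two-argument sketch above;
# batch_num is taken to be 0-indexed, which the real helper may not be:
def chunkify_batch(items, batch_num, total_batches):
    # contiguous split into total_batches pieces; return piece batch_num
    base, extra = divmod(len(items), total_batches)
    start = batch_num * base + min(batch_num, extra)
    size = base + (1 if batch_num < extra else 0)
    return items[start:start + size]
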
def all_activations_for_all_sentences(modified_activations, volmask, embed_matrix, args,
                                      radius=5, kfold_split=5, alpha=1):
    global temp_file_name

    ACTIVATION_SHAPE = (240, 515)

    print("getting activations for all sentences...")
    res_per_spotlight = []
    predictions = []
    rankings = []
    llhs = []
    # pvalues = []
    alphas = []

    a, b, c = volmask.shape
    nonzero_pts = np.transpose(np.nonzero(volmask))
    true_spotlights = []
    CHUNK = helper.chunkify(nonzero_pts, args.batch_num, args.total_batches)
    CHUNK_SIZE = len(CHUNK)

    # iterate over spotlights
    print("for each spotlight...")
    index = 0
    nn_matrix = calculate_dist_matrix(embed_matrix) if args.rsa else None

    for pt in tqdm(CHUNK):
        # SPHERE MASK BELOW
        sphere_mask = np.zeros((a, b, c))
        x1, y1, z1 = pt
        # points_glm.append(pt)
        for i in range(-radius, radius + 1):
            for j in range(-radius, radius + 1):
                for k in range(-radius, radius + 1):
                    xp = x1 + i
                    yp = y1 + j
                    zp = z1 + k
                    if 0 <= xp < a and 0 <= yp < b and 0 <= zp < c:
                        dist = math.sqrt(i ** 2 + j ** 2 + k ** 2)
                        # only keep voxels inside the brain mask and within the radius
                        if volmask[xp][yp][zp] and dist <= radius:
                            sphere_mask[xp][yp][zp] = 1
        # SPHERE MASK ABOVE

        spotlights = []
        spotlight_mask = []

        # iterate over each sentence
        for sentence_act in modified_activations:
            spot = sentence_act[sphere_mask.astype(bool)]
            remove_nan = np.nan_to_num(spot).astype(np.float32)
            spotlights.append(remove_nan)
            # spotlight_mask.append(sphere_mask.astype(bool))

        print(np.array(spotlights).shape)
        true_spotlights.append(spotlights)
        # boolean_masks.append(spotlight_mask)

        ## DECODING BELOW
        if args.rsa:
            res = rsa(nn_matrix, np.array(spotlights))
        else:
            res, pred, llh, rank, alpha = linear_model(embed_matrix, spotlights, args, kfold_split, alpha)
            predictions.append(pred)
            llhs.append(llh)
            rankings.append(rank)
            # pvalues.append(pval)
            alphas.append(alpha)

        # print("RES for SPOTLIGHT #", index, ": ", res)
        # print("RANK: " + str(rank))
        res_per_spotlight.append(res)
        index += 1
        ## DECODING ABOVE

    return res_per_spotlight, llhs, rankings, alphas

def run_component():
    csv_directory = '/Users/kenta/Dropbox (Econ)/CanadianDoctors/British Columbia/BC Doctors/table/'
    pickle_file = 'data/data.pickle'
    meta_file = 'data/scraping_metadata.pickle'

    data = {}
    metadata = {'search_domains': {}, 'data': {}}

    # CSV files for years 2001 through 2016 (inclusive)
    for x in range(1, 17):
        year = 2000 + x
        csvfile = csv_directory + "doc_" + str(year) + ".csv"
        with open(csvfile, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            reader_list = list(reader)

        # some csv parsing; skip the header row
        for row in reader_list[1:]:
            fname = row[1]
            lname = row[0]
            salary = row[2]
            if (fname, lname) in data:
                data[(fname, lname)]['RECORDS'].update({year: salary})
            else:
                data.update({(fname, lname): {'fname': fname,
                                              'lname': lname,
                                              'RECORDS': {},
                                              'SCRAPING': {},
                                              'GOOGLE': {}}})
                data[(fname, lname)]['RECORDS'].update({year: salary})
                data[(fname, lname)].update({'fname': fname, 'lname': lname})

    # this will contain all (fname, lname) keys
    keys_array = []
    # fill the array with all (fname, lname) keys
    for key in data:
        keys_array.append(key)

    # choose lower & upper bounds on the thread count, as well as the increment size
    low = 1
    high = 17
    step_size = 2
    num_subs = (low + high) // step_size

    # randomization
    np.random.shuffle(keys_array)

    # chunks is an array of arrays containing (fname, lname) keys
    chunks = h.chunkify(keys_array, num_subs)

    # organize the metadata dictionary
    for x in range(0, num_subs):
        num_threads = 2 * x + 1
        search_subset = {num_threads: chunks[x]}
        metadata['search_domains'].update(search_subset)

    pickle.dump(data, open(pickle_file, 'wb'))
    pickle.dump(metadata, open(meta_file, 'wb'))