Code Example #1
def run_component():

    #increase the recursion limit to handle deeply nested/wide dictionary objects
    sys.setrecursionlimit(50000)

    #recall we divided up the data into multiple pickle files
    file_list = [
        f for f in os.listdir('data/data_subsets') if not f.startswith('.')
    ]

    #set the number of processes
    num_proc = 5

    #divide the files into num_proc (roughly) equal subsets
    file_sets = h.chunkify(file_list, num_proc)

    proc_list = []

    for file_set in file_sets:
        proc = mp.Process(target=run_process, args=(file_set, ))
        proc_list.append(proc)
    #start all worker processes, then wait for them all to finish
    for proc in proc_list:
        proc.start()
    for proc in proc_list:
        proc.join()
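All of the examples on this page rely on a chunkify helper (imported as h, helper, or called directly) that splits a sequence into a given number of roughly equal parts; the helper itself is not shown here. A minimal, hypothetical sketch of a two-argument chunkify(lst, n) consistent with how these examples call it (not the projects' actual implementation) might look like this:

def chunkify(lst, n):
    #hypothetical sketch: split lst into n contiguous, roughly equal sublists
    size, remainder = divmod(len(lst), n)
    chunks = []
    start = 0
    for i in range(n):
        end = start + size + (1 if i < remainder else 0)
        chunks.append(lst[start:end])
        start = end
    return chunks

Under this sketch, h.chunkify(file_list, 5) in Example #1 would return five lists of file names, one per worker process.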
Code Example #2
def showBookshelf():
    books = session.query(Book).all()
    book_chunks = chunkify(books, 4)
    genres = getGenreList(books)
    return render_template('bookshelf.html',
                           book_chunks=book_chunks,
                           genres=genres)
Code Example #3
def showGenre(genre):
    genre_format = genre.replace('+', ' ').title()
    books_by_genre = session.query(Book).filter_by(category=genre_format).all()
    book_chunks = chunkify(books_by_genre, 4)

    return render_template('showGenre.html',
                           genre=genre_format,
                           book_chunks=book_chunks)
Code Example #4
def showLandingPage():
    if 'username' not in login_session:
        return render_template('landingpage.html')
    else:
        user_id = getUserID(login_session['email'])
        books = session.query(Book).filter_by(user_id=user_id).all()
        book_chunks = chunkify(books, 4)
        genres = getGenreList(books)
        return render_template('mainpage.html',
                               book_chunks=book_chunks,
                               genres=genres,
                               login_session=login_session)
Code Example #5
def showGenre(genre):
    if 'username' not in login_session:
        return redirect('/login')

    genre_frmt = genre.replace('+', ' ').title()
    books_by_genre = session.query(Book).filter_by(category=genre_frmt).all()
    book_chunks = chunkify(books_by_genre, 4)

    return render_template('showGenre.html',
                           genre=genre_frmt,
                           book_chunks=book_chunks,
                           login_session=login_session)
Code Example #6
File: scrape.py  Project: kentabf/selenium-webscraper
def run_component():

    pickle_file = 'data/data.pickle'
    meta_file = 'data/scraping_metadata.pickle'

    #load up the data (NOTE: the scrape-time metadata is kept in a separate pickle file because it came from a separate project)
    data = pickle.load(open(pickle_file, 'rb'))
    metadata = pickle.load(open(meta_file, 'rb'))

    #search_domains was previously set up in the metadata file
    for num_threads in metadata['search_domains']:

        keys = metadata['search_domains'][num_threads]

        #divide up given set of keys into num_threads subsets
        keys_subsets = h.chunkify(keys, num_threads)

        #each new thread-count group starts a new session; each program start is also a new session
        session = datetime.datetime.now()
        metadata['data'].update({session: {}})

        thread_list = []
        threadID = 0

        #print a progress update
        print(
            "=========== \n NEW MULTITHREADING \nNEW NUMBER OF THREADS: %s \n==========="
            % (num_threads))

        #run multiple threads
        for thread_keys in keys_subsets:
            threadID += 1
            metadata['data'][session].update({threadID: []})
            thread = ST.scrapeThread(threadID, num_threads, thread_keys,
                                     pickle_file, meta_file, data, session,
                                     metadata)
            thread_list.append(thread)
        for thread in thread_list:
            thread.start()
        for thread in thread_list:
            thread.join()

        #final update of data within session
        pickle.dump(data, open(pickle_file, 'wb'))
        pickle.dump(metadata, open(meta_file, 'wb'))

    #final update of data after entire program
    pickle.dump(data, open(pickle_file, 'wb'))
    pickle.dump(metadata, open(meta_file, 'wb'))
Code Example #7
def multi():
    lst = list(range(300))

    #set the number of processes (with 1, the work runs effectively sequentially)
    num_proc = 1
    array_sets = h.chunkify(lst, num_proc)

    proc_list = []

    for array_set in array_sets:
        proc = mp.Process(target=my_f, args=(array_set, ))
        proc_list.append(proc)
    for proc in proc_list:
        proc.start()
    for proc in proc_list:
        proc.join()
Code Example #8
def mixed_effects_analysis(args, embed_matrix):
    # load common brain space
    subjects = [1, 2, 4, 5, 7, 8, 9, 10, 11]
    num_sentences = 240
    common_space = helper.load_common_space(subjects, local=args.local)
    print("COMMON SPACE SHAPE: " + str(common_space.shape))
    voxel_coordinates = np.transpose(np.nonzero(common_space))
    num_voxels = len(voxel_coordinates)
    print("NUM VOXELS IN SHARED COMMON BRAIN SPACE: " + str(num_voxels))

    # initialize variables
    all_activations = []
    subj_number = []
    voxel_index = []

    # prepare model embeddings
    dim_labels = ['dim' + str(i) for i in range(embed_matrix.shape[1])]
    embed_matrix_pd = pd.DataFrame(embed_matrix, columns=dim_labels)
    print("EMBEDDINGS SHAPE: " + str(embed_matrix_pd.shape))
    embed_matrix_pd_repeat = pd.concat([embed_matrix_pd] * len(subjects),
                                       ignore_index=True)
    embed_matrix_pd_repeat.insert(0, 'bias', 1)
    print("REPEAT EMBEDDINGS SHAPE: " + str(embed_matrix_pd_repeat.shape))

    # get labels
    labels = ""
    conditional_labels = ""
    for i in range(embed_matrix.shape[1]):
        labels += 'dim' + str(i) + ' + '
        conditional_labels += 'dim' + str(i) + ' | subject_number + '

    # get data
    for subj in tqdm(subjects):
        if args.local:
            modified_activations = pickle.load(
                open(f"../examplesGLM/subj{subj}/modified_activations.p",
                     "rb"))
        else:
            modified_activations = pickle.load(
                open(
                    f"/n/shieber_lab/Lab/users/cjou/fmri/subj{subj}/modified_activations.p",
                    "rb"))

        norm_modified_activations = helper.z_score(
            np.array(modified_activations))
        activation_vals = np.array([
            modified_elem[np.nonzero(common_space)]
            for modified_elem in norm_modified_activations
        ])
        # print("ACTIVATIONS SHAPE: " + str(activation_vals.shape))
        flatten_activations = get_activations(activation_vals)
        # print("FLATTEN ACTIVATIONS SHAPE: " + str(flatten_activations.shape))
        all_activations.extend(flatten_activations)
        voxel_index.extend(list(range(num_voxels)) * num_sentences)
        subj_number.extend([subj] * num_voxels * num_sentences)
        del modified_activations
        del norm_modified_activations
        del activation_vals
        del flatten_activations

    print("ACTIVATIONS LENGTH: " + str(len(all_activations)))
    print("SUBJECT NUMBER LENGTH: " + str(len(subj_number)))
    print("VOXEL INDEX: " + str(len(voxel_index)))

    # create dataframe
    data = pd.DataFrame({
        'subject_number': subj_number,
        'voxel_index': voxel_index,
        'activations': all_activations
    })

    data_slice = data.loc[data["voxel_index"] == 0]
    print("DATA SLICE SHAPE: " + str(data_slice.shape))

    # per voxel: take this batch's share of the voxel indices
    # (batch args.batch_num out of args.total_batches)
    rmses_per_voxel = []
    CHUNK = helper.chunkify(list(range(num_voxels)), args.batch_num,
                            args.total_batches)
    for v in tqdm(CHUNK):
        data_slice = data.loc[data["voxel_index"] == v].reset_index()
        # concat_pd = pd.concat([data_slice, embed_matrix_pd_repeat], axis=1)
        rmse = run_per_voxel(data_slice, embed_matrix_pd_repeat, labels)
        rmses_per_voxel.append(rmse)

    return rmses_per_voxel
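Note that Examples #8 and #9 call the helper with three arguments, helper.chunkify(items, args.batch_num, args.total_batches), which points to a batch-selection variant: split the items into total_batches contiguous parts and return only the part belonging to this batch, so separate jobs can each process their own slice. A hypothetical sketch of such a variant (again, not the projects' actual helper, and assuming batch_num is 0-based):

def chunkify(items, batch_num, total_batches):
    #hypothetical sketch: return this batch's contiguous slice of items
    size, remainder = divmod(len(items), total_batches)
    start = batch_num * size + min(batch_num, remainder)
    end = start + size + (1 if batch_num < remainder else 0)
    return items[start:end]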
Code Example #9
def all_activations_for_all_sentences(modified_activations, volmask, embed_matrix, args, radius=5, kfold_split=5, alpha=1):
	global temp_file_name

	ACTIVATION_SHAPE = (240, 515)

	print("getting activations for all sentences...")
	res_per_spotlight = []
	predictions = []
	rankings = []
	llhs = []
	# pvalues = []
	alphas = []
	a,b,c = volmask.shape
	nonzero_pts = np.transpose(np.nonzero(volmask))
	true_spotlights = []
	CHUNK = helper.chunkify(nonzero_pts, args.batch_num, args.total_batches)
	CHUNK_SIZE = len(CHUNK)

	# iterate over spotlight
	print("for each spotlight...")

	index=0
	nn_matrix = calculate_dist_matrix(embed_matrix) if args.rsa else None 
	for pt in tqdm(CHUNK):

		# SPHERE MASK BELOW
		sphere_mask = np.zeros((a,b,c))
		x1,y1,z1 = pt
		# points_glm.append(pt)
		for i in range(-radius, radius+1):
			for j in range(-radius, radius+1):
				for k in range(-radius, radius+1):
					xp = x1 + i
					yp = y1 + j
					zp = z1 + k
					pt2 = [xp,yp,zp]
					if 0 <= xp and 0 <= yp and 0 <= zp and xp < a and yp < b and zp < c:
						dist = math.sqrt(i ** 2 + j ** 2 + k ** 2)
						# only include voxels that lie inside the brain mask and within the radius
						if volmask[xp][yp][zp] and dist <= radius:
							sphere_mask[x1+i][y1+j][z1+k] = 1
		# SPHERE MASK ABOVE

		spotlights = []
		spotlight_mask = []

		# iterate over each sentence
		for sentence_act in modified_activations:
			spot = sentence_act[sphere_mask.astype(bool)]
			remove_nan = np.nan_to_num(spot).astype(np.float32)
			spotlights.append(remove_nan)
			# spotlight_mask.append(sphere_mask.astype(bool))

		print(np.array(spotlights).shape)

		true_spotlights.append(spotlights)
		# boolean_masks.append(spotlight_mask)

		## DECODING BELOW
		if args.rsa: 
			res = rsa(nn_matrix, np.array(spotlights))
		else: 
			res, pred, llh, rank, alpha = linear_model(embed_matrix, spotlights, args, kfold_split, alpha)
			predictions.append(pred)
			llhs.append(llh)
			rankings.append(rank)
			# pvalues.append(pval)
			alphas.append(alpha)

		# print("RES for SPOTLIGHT #", index, ": ", res)
		# print("RANK : " + str(rank))
		res_per_spotlight.append(res)

		index+=1
		
		## DECODING ABOVE

	return res_per_spotlight, llhs, rankings, alphas
Code Example #10
File: setup.py  Project: kentabf/selenium-webscraper
def run_component():
	
	csv_directory = '/Users/kenta/Dropbox (Econ)/CanadianDoctors/British Columbia/BC Doctors/table/'

	pickle_file = 'data/data.pickle'
	meta_file = 'data/scraping_metadata.pickle'

	data = {}
	metadata = { 'search_domains':{}, 'data':{}}

	#CSV files for each year from 2001 to 2016 (inclusive)
	for x in range(1, 17):
		
		year = 2000 + x

		csvfile = csv_directory + "doc_" + str(year) + ".csv"

		with open(csvfile, 'r', encoding='utf-8') as f:
			reader = csv.reader(f)
			reader_list = list(reader)


			#some csv parsing
			for row in reader_list[1:]:
				fname = row[1]
				lname = row[0]
				salary = row[2]

				#create an entry for a new doctor, then record this year's salary
				if (fname, lname) not in data:
					data[(fname,lname)] = { 'fname':fname, 'lname':lname, 'RECORDS':{}, 'SCRAPING':{}, 'GOOGLE':{} }
				data[(fname,lname)]['RECORDS'][year] = salary

	#collect all (fname,lname) keys
	keys_array = list(data.keys())

	#choose lower & upper bounds on the thread count, as well as the step size
	low = 1
	high = 17
	step_size = 2
	num_subs = (low+high)//step_size  #9 thread-count groups: 1, 3, 5, ..., 17

	#randomization
	np.random.shuffle(keys_array)

	#chunks is an array of arrays containing (fname,lname) keys
	chunks = h.chunkify(keys_array,num_subs)

	#organize the metadata dictionary
	for x in range(0, num_subs):
		num_threads = 2*x + 1
		search_subset = {num_threads:chunks[x]}
		metadata['search_domains'].update(search_subset)


	pickle.dump(data, open(pickle_file, 'wb'))
	pickle.dump(metadata, open(meta_file, 'wb'))