def test_recfromcsv(self):
    #
    data = StringIO.StringIO('A,B\n0,1\n2,3')
    test = np.recfromcsv(data, missing='N/A', names=True, case_sensitive=True)
    control = np.array([(0, 1), (2, 3)],
                       dtype=[('A', np.int), ('B', np.int)])
    self.failUnless(isinstance(test, np.recarray))
    assert_equal(test, control)
    #
    data = StringIO.StringIO('A,B\n0,1\n2,N/A')
    test = np.recfromcsv(data, dtype=None, missing='N/A',
                         names=True, case_sensitive=True, usemask=True)
    control = ma.array([(0, 1), (2, -1)],
                       mask=[(False, False), (False, True)],
                       dtype=[('A', np.int), ('B', np.int)])
    assert_equal(test, control)
    assert_equal(test.mask, control.mask)
    assert_equal(test.A, [0, 2])
    #
    data = StringIO.StringIO('A,B\n0,1\n2,3')
    test = np.recfromcsv(data, missing='N/A',)
    control = np.array([(0, 1), (2, 3)],
                       dtype=[('a', np.int), ('b', np.int)])
    self.failUnless(isinstance(test, np.recarray))
    assert_equal(test, control)
def from_paths(solver_names, task_paths, domain, suffix=".runs.csv"): """Collect run data from task paths.""" training = RunData(solver_names) for path in task_paths: # load run records run_data = numpy.recfromcsv(path + suffix, usemask=True) rows = run_data.tolist() if run_data.shape == (): rows = [rows] for (run_solver, run_budget, run_cost, run_succeeded, run_answer) in rows: record = RunRecord(run_solver, run_budget, run_cost, run_succeeded) training.add_run(path, record) # load feature data feature_records = numpy.recfromcsv("{0}.features.csv".format(path)) feature_dict = dict(zip(feature_records.dtype.names, feature_records.tolist())) training.add_feature_vector(path, feature_dict) return training
def otherfunc(roifiles, subjects): import numpy as np from matplotlib.mlab import rec2csv import os first = np.recfromcsv(roifiles[0]) numcons = len(first.dtype.names) - 1 roinames = ["subject_id"] + first["roi"].tolist() formats = ["a20"] + ["f4" for f in roinames[1:]] confiles = [] for con in range(0, numcons): recarray = np.zeros(len(roifiles), dtype={"names": roinames, "formats": formats}) for i, file in enumerate(roifiles): recfile = np.recfromcsv(file) recarray["subject_id"][i] = subjects[i] for roi in roinames[1:]: value = recfile["con%02d" % (con + 1)][recfile["roi"] == roi] if value: recarray[roi][i] = value else: recarray[roi][i] = 999 filename = os.path.abspath("grouped_con%02d.csv" % (con + 1)) rec2csv(recarray, filename) confiles.append(filename) return confiles
def get_regressors(csv, ids):
    import numpy as np
    if csv == '':
        return None
    reg = {}
    design = np.recfromcsv(csv)
    design_str = np.recfromcsv(csv, dtype=str)
    names = design_str.dtype.names
    csv_ids = []
    for i in design_str["id"]:
        csv_ids.append(str(i))
    csv_ids = np.asarray(csv_ids)
    for n in names:
        if not n == "id":
            reg[n] = []
    for sub in ids:
        if sub in csv_ids:
            for key in reg.keys():
                reg[key].append(design[key][csv_ids == sub][0])
        else:
            raise Exception("%s is missing from the CSV file!" % sub)
    cov = []
    for key, item in reg.items():
        cov.append({'name': key, 'vector': item, 'centering': 0})
    print(cov)
    return cov
def get_regressors(csv,ids): import numpy as np if csv == '': return None reg = {} design = np.recfromcsv(csv) design_str = np.recfromcsv(csv,dtype=str) names = design_str.dtype.names csv_ids = [] for i in design_str["id"]: csv_ids.append(str(i)) csv_ids = np.asarray(csv_ids) for n in names: if not n=="id": reg[n] = [] for sub in ids: if sub in csv_ids: for key in reg.keys(): reg[key].append(design[key][csv_ids==sub][0]) else: raise Exception("%s is missing from the CSV file!"%sub) if 'group' in names: data = np.asarray(reg['group']) vals = np.unique(data) for i, v in enumerate(vals): data[data==v] = i+1 group = data.astype(int).tolist() reg.pop('group') else: group = [1]*len(reg[names[-1]]) return reg, group
def __init__(self, tasks_roots, domain): """Initialize.""" # scan for CSV files train_paths = [] for tasks_root in tasks_roots: train_paths.extend(cargo.files_under(tasks_root, domain.extensions)) logger.info("using %i tasks for training", len(train_paths)) # fetch training data from each file self._run_lists = {} self._feature_vectors = {} for path in train_paths: # load run records run_data = numpy.recfromcsv("{0}.runs.csv".format(path), usemask = True) run_list = [] for (run_solver, run_budget, run_cost, run_succeeded, run_answer) in run_data.tolist(): record = RunRecord(run_solver, run_budget, run_cost, run_succeeded) run_list.append(record) self._run_lists[path] = run_list # load feature data feature_vector = numpy.recfromcsv("{0}.features.csv".format(path)).tolist() self._feature_vectors[path] = feature_vector
def sort_battles(self, results_filename='csv/mz_results_boulders.csv', images_filename='csv/mz_images_boulders.csv', out_filename='csv/mz_boulders_rank.csv'): p = np.recfromcsv(images_filename, names=True) objid = p.field('id') rank = np.zeros(objid.shape, np.int) - 1 fracrank = np.zeros(objid.shape) - 1 battles = np.recfromcsv(results_filename, names=True) # currently does not do anything with inconclusive battles battles = battles[battles.field('winner') > 0] first = battles['first_asset_id'] second = battles['second_asset_id'] winner = battles['winner'] w = np.where(winner == 1, first, second) l = np.where(winner == 1, second, first) competitors = np.unique(np.concatenate((w, l))) self.competitors = self._asarray(competitors) self.winners = self._asarray(w) self.losers = self._asarray(l) self._consistency_check() self._setup_internal_variables() print('ncomp = %i, nwars = %i'%(self.ncomp, self.nwars)) self.iterate() for r, id in enumerate(self.ranking): idx = (objid == id).nonzero()[0] if len(idx) < 1: print('Could not find objid match for id={}, rank={}'.format(id, r)) idx = idx[0] rank[idx] = r fracrank[idx] = float(r) / self.ncomp np.savetxt(out_filename, np.asarray((objid, rank, fracrank)).T, fmt='%d,%d,%.3f', header=("objid,rank,fracrank"))
def compare(fileA, fileB):
    mooseData = np.recfromcsv(fileA, delimiter=',')
    nrnData = np.recfromcsv(fileB, delimiter=',')
    # transpose the record rows into per-column sequences
    mooseData = list(zip(*mooseData))
    nrnData = list(zip(*nrnData))
    print(mooseData[0])
    pylab.plot([1e3 * x for x in mooseData[0]],
               [1e3 * x for x in mooseData[1]], label='moose')
    pylab.plot(nrnData[0], nrnData[1], label='neuron')
    #pylab.plot(mooseData)
    #pylab.plot(nrnData)
    pylab.show()
def np_combine_csv_files(csvpaths, verbose=False): """Combine a collection of CSV files into a single numpy record array. Can take a while! CSV files with different fields (different headers, different number of fields) are merged together correctly, data type inferral and promotion takes a while. Treats the first line as a header, uses to name the fields. Giving it files without headers will cause weird things to happen. Arguments: csvpaths: List of text files to read into the array Returns: numpy.recarray """ big_csv = numpy.recfromcsv( open(csvpaths[0]), case_sensitive=True, deletechars='', replace_space=' ', autostrip=True ) if 'File ID' not in big_csv.dtype.names and big_csv['Input'].size > 1: big_csv = numpy.lib.recfunctions.append_fields( big_csv, 'File ID', [os.path.splitext(os.path.basename(x))[0] for x in big_csv['Input']], usemask=False, asrecarray=True ) for i, csvpath in enumerate(csvpaths[1:]): csv_arr = numpy.recfromcsv( open(csvpath), case_sensitive=True, deletechars='', replace_space=' ', autostrip=True ) if 'File ID' not in csv_arr.dtype.names and csv_arr['Input'].size > 1: csv_arr = numpy.lib.recfunctions.append_fields( csv_arr, 'File ID', [os.path.splitext(os.path.basename(x))[0] for x in csv_arr['Input']], usemask=False, asrecarray=True ) for field_name in csv_arr.dtype.names: if field_name not in big_csv.dtype.names: big_csv = numpy.lib.recfunctions.append_fields( big_csv, field_name, [], usemask=False, asrecarray=True ) big_csv = numpy.lib.recfunctions.stack_arrays( (big_csv, csv_arr), usemask=False, asrecarray=True, autoconvert=True ) if verbose: print('Loaded %d/%d files' % (i + 1, len(csvpaths)), end='\r') return big_csv
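# A minimal usage sketch for np_combine_csv_files above; the glob pattern and
# directory layout are illustrative assumptions, not part of the original code.
import glob
combined = np_combine_csv_files(sorted(glob.glob('results/run_*.csv')), verbose=True)
print(combined.dtype.names)  # union of the field names found across all input files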
def new_tables(): sns.set_context("paper", font_scale=font_scale, rc={"lines.linewidth": 2.5}) fig, ax = plt.subplots(1) with open('../results/sdss/query_number_num_new_tables.csv') as f: data = np.recfromcsv(f) c = data['num_new_tables'].astype(float) c /= sum(c) q = data['query_number'].astype(float) q /= q[-1] ax.plot(q, np.cumsum(c), label="SDSS", color=colors['sdss'], linewidth=2, drawstyle='steps-post') # ax.scatter(q[0: -1], np.cumsum(c)[0: -1], color=colors['sdss'], marker="o", s=50, alpha=.7) with open('../results/tpch/query_number_num_new_tables.csv') as f: data = np.recfromcsv(f) c = data['num_new_tables'].astype(float) c /= sum(c) q = data['query_number'].astype(float) q /= q[-1] ax.plot(q, np.cumsum(c), label="TPC-H", color=colors['tpch'], linewidth=2, drawstyle='steps-post') # ax.scatter(q[0: -1], np.cumsum(c)[0: -1], color=colors['tpch'], marker="o", s=50, alpha=.7) # sns.rugplot([0.1, 0.2, 10, 100], ax=ax) with open('../results/sqlshare/table_coverage.csv') as f: data = np.recfromcsv(f) c = data['tables'].astype(float) c /= c[-1] q = data['query_id'].astype(float) q /= q[-1] ax.plot(q, c, label="SQLShare", color=colors['sqlshare'], linewidth=2, drawstyle='steps-post') # ax.scatter(q[0: -1], c[0: -1], color=colors['sqlshare'], marker="o", s=20, alpha=.01) ax.yaxis.set_major_formatter(formatter) ax.xaxis.set_major_formatter(formatter) plt.title("CDF of new tables") ax.set_xlabel('\% of queries') ax.set_ylabel('\% of newly used table') ax.set_ylim(0, 1.01) ax.set_xlim(-0.01, 1) ax.title.set_position((ax.title._x, 1.04)) plt.legend(loc=4) plt.tight_layout() plt.savefig(root_path + 'plot_table_coverage.eps', format='eps')
def get_subjects(): """Returns names of all subjects """ pdata = np.recfromcsv(('/mindhive/gablab/sad/PY_STUDY_DIR/Block/volsurf/' 'l2output/social/split_halves/regression/' 'lsasDELTA/6mm/allsubs.csv'), names=True) return pdata.subject.tolist()
def sort_results_csv(input_file='../../results/baseline_classifier_results.csv',output_file=''): """ Sorts the results csv file and writes to the same file. Sort on classifier name first (1th column), then on features (6th column) """ if output_file =='': output_file = input_file #import header first with open(input_file, 'r') as f: header = f.readline() #load csv into table (automatically with correct datatypes) table = np.recfromcsv(input_file,delimiter=',') #only sort if we have more then one element (to prevent bugs) if np.size(table) > 1: #sort on features table = sorted(table, key=lambda tup: tup[5]) #sort on classifier table = sorted(table, key=lambda tup: tup[0]) #store sorted file with open(output_file,'w') as fd: fd.write(header) [fd.write(settings_to_string(tup[0],tup[1],tup[2],tup[3],tup[4],tup[5],tup[6],tup[7]) + "\n") for tup in table]
def selectOnSharpeRatio(self, ls_symbols, top_n_equities=10):
    '''Choose the best portfolio over the stock universe, according to their Sharpe ratio'''
    #TODO: change this to a DataAccess utility --------------
    symbols, files = getAllFromCSV()
    datalength = len(recfromcsv(files[0])['close'])
    print('Datalength: {}'.format(datalength))
    #---------------------------------------------------------
    # Initiating data arrays
    closes = np.recarray((datalength,), dtype=[(symbol, 'float') for symbol in symbols])
    daily_ret = np.recarray((datalength - 1,), dtype=[(symbol, 'float') for symbol in symbols])
    average_returns = np.zeros(len(files))
    return_stdev = np.zeros(len(files))
    sharpe_ratios = np.zeros(len(files))
    cumulative_returns = np.recarray((datalength - 1,), dtype=[(symbol, 'float') for symbol in symbols])
    # Here is the meat
    #TODO: data = dataobj.getData(ls_symbols)
    for i, symbol in enumerate(ls_symbols):
        if len(data) != datalength:
            continue
        print('Processing {}'.format(symbol))
        closes[symbols[i]] = data['close'][::-1]
        daily_ret[symbols[i]] = dailyReturns()
        # We now can compute:
        average_returns[i] = daily_ret[symbols[i]].mean()
        return_stdev[i] = daily_ret[symbols[i]].std()
        sharpe_ratios[i] = (average_returns[i] / return_stdev[i]) * np.sqrt(datalength)  # compare to course
        print('\tavg: {}, stdev: {}, sharpe ratio: {}'.format(average_returns[i], return_stdev[i], sharpe_ratios[i]))
    sorted_sharpe_indices = np.argsort(sharpe_ratios)[::-1][0:top_n_equities]
    #TODO: return a dict as {symbol: sharpe_ratio}, or a df with all 3 components
    return sorted_sharpe_indices
def import_data(self): """Imports data from csv file as a numpy record array (aka structured array)""" self.data = np.recfromcsv(self.folder + self.file + '.csv') self.exh.T_in = self.data['hx_exh_in_t'] + 273.15 # K self.exh.T_out = self.data['hx_exh_out_t'] + 273.15 # K self.exh.mdot = self.data['exh_mdot_kgmin'] / 60. # kg/s self.exh.deltaP = ( self.data['hx_exh_delta_p_2_in_wc'] * 0.249 * 2. # kPa ) self.cool.T_in = ( 0.5 * (self.data['hx_cool_1_in_t'] + self.data['hx_cool_2_in_t']) + 273.15 # K ) self.cool.T_out = ( 0.5 * (self.data['hx_cool_1_out_t'] + self.data['hx_cool_2_out_t']) + 273.15 # K ) self.cool.Vdot = self.data['cool_vdot_gpm'] self.exh.T_mean = 0.5 * (self.exh.T_in + self.exh.T_out) self.exh.deltaT = self.exh.T_in - self.exh.T_out self.exh.eta = self.exh.deltaT / (self.exh.T_in - self.cool.T_in) self.exh.c_p = np.zeros(self.exh.T_in.size) for i in range(self.exh.T_in.size): self.exh.T = self.exh.T_mean[i] self.exh.set_TempPres_dependents() self.exh.c_p[i] = self.exh.c_p_air self.exh.Qdot = self.exh.mdot * self.exh.c_p * self.exh.deltaT
def fetch_abide(data_dir=None, verbose=0, **kwargs): """ """ exclude_ids = ['UM_1_0050289', 'Yale_0050571', 'KKI_0050822', 'SDSU_0050204', 'CMU_a_0050664'] strategy = 'nofilt_noglobal' pipeline = 'cpac' dataset_name = 'ABIDE_pcp' csv = 'Phenotypic_V1_0b_preprocessed1.csv' kwargs['qc_rater_1'] = b'OK' kwargs['qc_anat_rater_2'] = [b'OK', b'maybe'] kwargs['qc_func_rater_2'] = [b'OK', b'maybe'] kwargs['qc_anat_rater_3'] = b'OK' kwargs['qc_func_rater_3'] = b'OK' path_csv = os.path.join(data_dir, dataset_name, csv) with open(path_csv, 'r') as pheno_f: pheno = ['i' + pheno_f.readline()] for line in pheno_f: pheno.append(re.sub(r',(?=[^"]*"(?:[^"]*"[^"]*")*[^"]*$)', ";", line)) # bytes (encode()) needed for python 2/3 compat with numpy pheno = '\n'.join(pheno).encode() pheno = BytesIO(pheno) pheno = np.recfromcsv(pheno, comments='$', case_sensitive=True) # First, filter subjects with no filename pheno = pheno[pheno['FILE_ID'] != b'no_filename'] # Apply user defined filters user_filter = datasets.utils._filter_columns(pheno, kwargs) pheno = pheno[user_filter] for id_ in exclude_ids: pheno = pheno[pheno['FILE_ID'] != id_] data_dir = os.path.join(data_dir, dataset_name, pipeline, strategy) results = {} file_ids = [file_id.decode() for file_id in pheno['FILE_ID']] ext = '.nii.gz' derivative = 'func_preproc' files = [] for file_id in file_ids: file_ = (file_id + '_' + derivative + ext) check_file = os.path.join(data_dir, file_) if os.path.isfile(check_file): files.append(check_file) else: print("File is missing %s" % file_) results['phenotypic'] = pheno results[derivative] = files return Bunch(**results)
def read_census_matrix(state): censusFileLocation = M_PATH + '\\DemographicQueries\\' fname = censusFileLocation + state + 'Query.txt' tra = np.loadtxt(fname, delimiter=",", skiprows = 1, dtype = str, usecols=[3], converters = {3:remove_b}) uni = np.loadtxt(fname, delimiter=",", skiprows = 1, dtype = str, usecols=[2], converters = {2:remove_b}) mydata = np.recfromcsv(fname, delimiter=',', usecols = demo_ranges(), filling_values=np.nan, case_sensitive=True, deletechars='', replace_space=' ') return mydata, tra, uni
def q3a_pm(base_path, csv_fn): filename = "".join([base_path, csv_fn]) names = ['genre', 'gender', 'movies', 'type', 'age'] my_data = np.recfromcsv(filename, names = ['genre', 'gender', 'movies', 'type', 'age']) vhs = get_arr_for_col_del(my_data, 'type', names, "VHS") dvd = get_arr_for_col_del(my_data, 'type', names, "DVD") bluray = get_arr_for_col_del(my_data, 'type', names, "BLURAY") names.pop(3) vhs_f = get_arr_for_col_del(vhs, 'gender', names, "F") vhs_m = get_arr_for_col_del(vhs, 'gender', names, "M") dvd_f = get_arr_for_col_del(dvd, 'gender', names, "F") dvd_m = get_arr_for_col_del(dvd, 'gender', names, "M") bluray_f = get_arr_for_col_del(bluray, 'gender', names, "F") bluray_m = get_arr_for_col_del(bluray, 'gender', names, "M") plot_q3a(vhs_f, names, 'VHS copies', 'Movie distr. F') plot_q3a(vhs_m, names, 'VHS copies', 'Movie distr. M') plot_q3a(dvd_f, names, 'DVD copies', 'Movie distr. F') plot_q3a(dvd_m, names, 'DVD copies', 'Movie distr. M') plot_q3a(bluray_f, names, 'Bluray copies', 'Movie distr. F') plot_q3a(bluray_m, names, 'Bluray copies', 'Movie distr. M') return
def get_subject_data(subjects):
    """Returns contrast_files and behavioral data for a given list of subjects

    Parameters
    ----------
    subjects : list of strings (names of subjects)

    Returns
    -------
    confiles : list of strings (names of contrast files)
    pdata : recarray containing behavioral information for given subject order
    """
    # original input file with test scores etc for every subject
    pdata = np.recfromcsv(('/mindhive/gablab/sad/PY_STUDY_DIR/Block/volsurf/'
                           'l2output/social/split_halves/regression/'
                           'lsasDELTA/6mm/allsubs.csv'), names=True)
    sidx = []
    confiles = []
    for s in subjects:
        try:
            idx = np.nonzero(pdata.subject == s)[0][0]
        except IndexError:
            raise IndexError('subject %s not found' % s)
        sidx.append(idx)
        confile = glob(con_template % s)
        if not confile:
            raise ValueError('no confile found for subject %s' % s)
        confiles.extend(confile)
def test_loopBlocks(self): """An experiment file with made-up params and routines to see whether future versions of experiments will get loaded. """ #load the test experiment (with a stims loop, trials loop and blocks loop) expfile = path.join(self.exp.prefsPaths['tests'], 'data', 'testLoopsBlocks.psyexp') self.exp.loadFromXML(expfile) # reload the edited file #alter the settings so the data goes to our tmp dir datafileBase = os.path.join(self.tmp_dir, 'testLoopsBlocks') datafileBaseRel = os.path.relpath(datafileBase,expfile) self.exp.settings.params['Data filename'].val = repr(datafileBaseRel) #write the script from the experiment script = self.exp.writeScript(expPath=expfile) py_file = os.path.join(self.tmp_dir, 'testLoopBlocks.py') # save it with codecs.open(py_file, 'w', 'utf-8-sig') as f: f.write(script.replace("core.quit()", "pass")) f.write("del thisExp\n") #garbage collect the experiment so files are auto-saved #run the file (and make sure we return to this location afterwards) wd = os.getcwd() execfile(py_file) os.chdir(wd) #load the data print("searching..." +datafileBase) print(glob.glob(datafileBase+'*')) f = open(datafileBase+".csv", 'rb') dat = numpy.recfromcsv(f, case_sensitive=True) f.close() assert len(dat)==8 # because 4 'blocks' with 2 trials each (3 stims per trial)
def read_data(self, fname, infile_type="aci", delimiter=","): """ Reads in the A-Ci data if infile_type="aci" is true, otherwise this reads in the fitted results... For A-Ci data, code expects a format of: -> Curve, Tleaf, Ci, Photo, Species, Season, Leaf Parameters ---------- fname : string input file name, expecting csv file. Returns: -------- data : array numpy array containing the data """ data = np.recfromcsv(fname, delimiter=delimiter, names=True, case_sensitive=True) if infile_type == "norm": # using normalised temp data data["Tav"] = data["Tav"] + self.deg2kelvin data["Jnorm"] = np.exp(data["Jnorm"]) data["Vnorm"] = np.exp(data["Vnorm"]) elif infile_type == "meas": # using measured temp data. data["Tav"] = data["Tav"] + self.deg2kelvin data["Jmax"] = data["Jmax"] data["Vcmax"] = data["Vcmax"] elif infile_type != "aci": raise IOError("Unknown file type in read??\n") return data
def read_datafile(datafile, header_skip, footer_skip, delim, string):
    path = os.path.join(APP_STATIC, datafile)
    data = np.recfromcsv(path, skip_header=header_skip, skip_footer=footer_skip,
                         autostrip=True, delimiter=delim)
    data = np.array([list(row) for row in data])  # convert the records into a plain numpy array
    extra_col = make_column(string, data.shape[0])
    data = np.hstack((data, extra_col))
    return data
def add_tan_pix_coordinates(in_file_name, out_file_name):
    """Compute x, y nominal coordinates from alt, az and add as columns to a CSV file"""
    logging.info('Reading file: {0}'.format(in_file_name))
    data = np.recfromcsv(in_file_name)
    az = data['azevent']
    alt = data['altevent']
    alts = data['altsystem']
    azs = data['azsystem']
    infile = open(in_file_name)
    logging.info('Writing file: {0}'.format(out_file_name))
    outfile = open(out_file_name, 'w')
    names = infile.readline().split()
    names.append(',Nomx,Nomy\n')
    line = ' '.join(names)
    line = line.replace(' ', '')
    outfile.write(line)
    for ii in np.arange(0, len(alts) - 1, 1):
        noms = tan_world_to_pix(az[ii], alt[ii], azs[ii], alts[ii])
        values = infile.readline().split()
        values.append(',%s,%s\n' % (str(noms[0]), str(noms[1])))
        line = ' '.join(values)
        line = line.replace(' ', '')
        outfile.write(line)
    infile.close()
    outfile.close()
def main(): filename = '../../dataset/sea_dataset/normalized_sea.csv' data = np.recfromcsv(filename) data_tuplelist = data.tolist() data_list = [list(i) for i in data_tuplelist] nop = 100 nod = shape(data_list)[1] print nod sigmai = [0.1] * nod chunk_size = 50 old_index = np.random.normal(loc=0, scale=math.pow(sigmai[1], 1), size=(nop, nod)) old_param = np.random.normal(loc=0, scale=sigmai[1], size=(1, nod)) # print old_param #print old_index chunk_accuracy_list = [] for i in range(0, 60000, chunk_size): print i chunk_data = data_list[i:i + chunk_size] chunk_data = [[1] + x for x in chunk_data] [chunk_params, current_parameters] = compute_chunk_n(chunk_data, nop, sigmai, old_param, old_index) #print chunk_params #print 'gg' #print current_parameters old_param = [chunk_params] old_index = current_parameters #print old_param #print current_parameters #print chunk_params #print chunk_params chunk_accuracy_list.append(compute_accuracy(chunk_data, chunk_params)) plot_accuracy(chunk_accuracy_list)
def compute_features(self, task, cpu_seconds = None): """Read or compute features of an instance.""" # grab precomputed feature data csv_path = task + ".features.csv" assert os.path.exists(csv_path) features_array = numpy.recfromcsv(csv_path) features = features_array.tolist() names = features_array.dtype.names # accumulate their cost assert names[0] == "cpu_cost" cpu_cost = features[0] borg.get_accountant().charge_cpu(cpu_cost) # handle timeout logic, and we're done if cpu_seconds is not None: if cpu_cost >= cpu_seconds: return (["cpu_cost"], [cpu_seconds]) else: assert len(names) > 1 return (names, features)
def yield_runs():
    suite = borg.load_solvers(suite_path)

    logger.info("scanning paths under %s", tasks_root)

    paths = list(borg.util.files_under(tasks_root, suite.domain.extensions))

    if not paths:
        raise ValueError("no paths found under specified root")

    if only_solver is None:
        solver_names = list(suite.solvers.keys())
    else:
        solver_names = [only_solver]

    for path in paths:
        run_data = None

        if only_missing and os.path.exists(path + suffix):
            run_data = numpy.recfromcsv(path + suffix, usemask=True)

        for solver_name in solver_names:
            if only_missing and run_data is not None:
                count = max(0, runs - numpy.sum(run_data.solver == solver_name))
            else:
                count = runs

            logger.info("scheduling %i run(s) of %s on %s", count, solver_name,
                        os.path.basename(path))

            for _ in range(count):
                seed = numpy.random.randint(sys.maxsize)

                yield (run_solver_on, [suite_path, solver_name, path, budget,
                                       store_answers, seed])
def fetch_coords_dosenbach_2010(): """Load the Dosenbach et al. 160 ROIs. These ROIs cover much of the cerebral cortex and cerebellum and are assigned to 6 networks. Returns ------- data: sklearn.datasets.base.Bunch dictionary-like object, contains: - "rois": coordinates of 160 ROIs in MNI space - "labels": ROIs labels - "networks": networks names References ---------- Dosenbach N.U., Nardos B., et al. "Prediction of individual brain maturity using fMRI.", 2010, Science 329, 1358-1361. """ dataset_name = 'dosenbach_2010' fdescr = _get_dataset_descr(dataset_name) package_directory = os.path.dirname(os.path.abspath(__file__)) csv = os.path.join(package_directory, "data", "dosenbach_2010.csv") out_csv = np.recfromcsv(csv) # We add the ROI number to its name, since names are not unique names = out_csv['name'] numbers = out_csv['number'] labels = np.array(['{0} {1}'.format(name, number) for (name, number) in zip(names, numbers)]) params = dict(rois=out_csv[['x', 'y', 'z']], labels=labels, networks=out_csv['network'], description=fdescr) return Bunch(**params)
def get_catalog(catalog_file): """ Load the catalog from the provided file :param catalog_file: a comma-separated catalog file :return: an instance of the Catalog class """ # Read them all as strings at first dtypes = map(lambda name:(name, 'S100'), columns_formats.keys()) data_ = numpy.recfromcsv(catalog_file, names=True, case_sensitive=True, dtype=dtypes) # Convert to the proper python format data_dict = collections.OrderedDict() for col in columns_formats: converter = columns_formats[col][0] null_value = columns_formats[col][1] this_data = data_[col] idx = (this_data == "NULL") this_data[idx] = null_value data_dict[col] = numpy.array(this_data, converter) # Convert to numpy.ndarray formats = tuple(map(lambda x:x.dtype.str, data_dict.values())) data = numpy.zeros(data_dict.values()[0].shape[0], dtype={'names': data_dict.keys(), 'formats': formats}) for col in data_dict: data[col][:] = data_dict[col] # data = numpy.recfromtxt(catalog_file, delimiter=',', names=True, # dtype=[('Trigger_name', 'S12'), # ('Trigger_date', 'S23'), # ('Trigger_time','<f8'), # ('GCN_type','S35'), # ('Time_scale', '<f8'), # ('Final_TS', '<f8'), # ('Output_RA', '<f8'), # ('Output_Dec', '<f8'), # ('Localization_error', '<f8'), # ('Closest_point_source', 'S43'), # ('Angular_distance', '<f8'), # ('Photon_index', '<f8'), # ('Photon_index_error', '<f8'), # ('GRB_events', '<i8')]) return Catalog(data)
def stations_json():
    stations = np.recfromcsv('chi-stations.csv', delimiter=',')
    output = {'type': "FeatureCollection", 'features': []}
    for s in stations:
        output['features'].append({
            'type': "Feature",
            "id": np.asscalar(s[0]),
            "geometry": {
                "type": "Point",
                "coordinates": [np.asscalar(s[2]), np.asscalar(s[1])]  # long, lat
            },
            "geometry_name": "origin_geom",
            "properties": {'name': s[3]}})
    f = io.open('chi-stations.json', 'w', encoding='utf-8')
    f.write(json.dumps(output, ensure_ascii=False))
    f.close()
    json_output = open('chi-stations.json')
    output_data = json.load(json_output)
    pprint(output_data)
    json_output.close()
def graph():
    # parse(MY_FILE, ",")
    data = np.recfromcsv('../data/crabs.csv')
    trans = []
    itrans = []
    x = []
    i = 1
    for row in data:
        trans.append(row['trans'])
        itrans.append(row['itrans'])
        x.append(i)
        i += 1
    # create the figure
    fig = plt.figure(figsize=(7, 3))
    # create a grid of 1 row and 1 column for the plot
    # gs = mpl.gridspec.GridSpec(1, 1)
    # put a plot in the first row, first column
    # ax = fig.add_subplots(gs[0])
    plt.title('transVSitrans')
    plt.plot(x, trans, color='red')
    plt.plot(x, itrans, color='blue')
    fig.savefig('transVSitrans.png')
def show_predictions(alpha="alpha", symbol="GE", xtn=".PNG"): if type(alpha) == str: print ("Loading file named " + alpha + ".mat") a = mat.loadmat( alpha + ".mat", mat_dtype=False ) # load a matlab style set of matrices from the file named by the string alpha if a.has_key(alpha): alpha = a.get(alpha).reshape(-1) # get the variable with the name of the string in alpha else: alpha = a.get(a.keys()[2]).reshape(-1) # get the first non-hidden key and reshape into a 1-D array print ("Loading financial data for stock symbol", symbol) r = np.recfromcsv("/home/hobs/Desktop/References/quant/lyle/data/" + symbol + "_yahoo.csv", skiprows=1) r.sort() r.high = r.high * r.adj_close / r.close # adjust the high and low prices for stock splits r.low = r.low * r.adj_close / r.close # adjust the high and low prices for stock splits daily_returns = r.adj_close[1:] / r.adj_close[0:-1] - 1 predictions = lfilt(alpha, daily_returns) print ( "Plotting a scatter plot of", len(daily_returns), "returns vs", len(predictions), "predictions using a filter of length", len(alpha), ) (ax, fig) = plot(predictions, daily_returns[len(alpha) :], s="bo", xtn=".PNG") ax.set_xlabel("Predicted Returns") ax.set_ylabel("Actual Returns") big_mask = np.abs(predictions) > np.std(predictions) * 1.2 bigs = predictions[big_mask] true_bigs = daily_returns[big_mask] (ax, fig) = plot(bigs, true_bigs, s="r.", xtn=".PNG") fig.show() return (predictions, daily_returns, bigs, true_bigs, big_mask)
with the highest values.
"""

##############################################################################
# Retrieve the atlas and the data
from nilearn import datasets
atlas = datasets.fetch_atlas_msdl()
atlas_filename = atlas['maps']

# Load the labels
import numpy as np
csv_filename = atlas['labels']

# The recfromcsv function can load a csv file
labels = np.recfromcsv(csv_filename)
names = labels['name']

data = datasets.fetch_adhd(n_subjects=1)

# print basic information on the dataset
print('First subject functional nifti images (4D) are at: %s' %
      data.func[0])  # 4D data

##############################################################################
# Extract time series
from nilearn.input_data import NiftiMapsMasker
masker = NiftiMapsMasker(maps_img=atlas_filename, standardize=True,
                         memory='nilearn_cache', verbose=5)
def fetch_lemur_mircen_2019_t2(subjects=[0], data_dir=None, url=None, resume=True, verbose=1): """Download and loads the mouse lemur template dataset. Parameters ---------- subjects : sequence of int or None, optional ids of subjects to load, default to loading one subject. data_dir : string, optional Path of the data directory. Used to force data storage in a specified location. Default: None resume : bool, optional (default True) If true, try resuming download if possible. verbose : int, optional (default 0) Defines the level of verbosity of the output. Returns ------- data : sklearn.datasets.base.Bunch Dictionary-like object, the interest attributes are : - 'anat': string list. Paths to T2-weighted images. - 'phenotypic': Participants genders, birth dates and MRI scan dates References ---------- :Download: https://openneuro.org/datasets/ds001945/versions/1.0.0/download :Reference: `A 3D population-based brain atlas of the mouse lemur primate with examples of applications in aging studies and comparative anatomy. <http://doi:10.1016/j.neuroimage.2018.10.010>`_ Neuroimage 185 (2019): 85-95. N. A. Nadkarni, S. Bougacha, C. Garin, M. Dhenain, and J. L. Picq. """ if url is None: url = 'https://openneuro.org/crn/datasets/ds001945/snapshots/1.0.0/files' dataset_name = 'mircen2019_t2' data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose) # Check arguments max_subjects = 34 if max(subjects) > max_subjects: warnings.warn( 'Warning: there are only {0} subjects'.format(max_subjects)) subjects = range(max_subjects) subject_ids = np.array(['sub-{0:02d}'.format(i) for i in range(1, 35)]) subject_ids = subject_ids[subjects] # Generate the list of urls json_urls = [ os.path.join(url, '{0}:anat:{0}_T2w.json'.format(subject_id)) for subject_id in subject_ids ] anat_urls = [ os.path.join(url, '{0}:anat:{0}_T2w.nii.gz'.format(subject_id)) for subject_id in subject_ids ] # Generate the list of target files anat_basenames = [ '{0}_anat_{0}_T2w.nii.gz'.format(subject_id) for subject_id in subject_ids ] anat_files = [ os.path.join(animal_dir, anat_basename) for (animal_dir, anat_basename) in zip(subject_ids, anat_basenames) ] json_basenames = [ '{0}_anat_{0}_T2w.json'.format(subject_id) for subject_id in subject_ids ] json_files = [ os.path.join(animal_dir, json_basename) for (animal_dir, json_basename) in zip(subject_ids, json_basenames) ] # Call fetch_files once per subject. anat = [] json = [] for anat_u, anat_f, json_u, json_f in zip(anat_urls, anat_files, json_urls, json_files): a, j = _fetch_files(data_dir, [(anat_f, anat_u, { 'move': anat_f }), (json_f, json_u, { 'move': json_f })], verbose=verbose) json.append(j) anat.append(a) pheno_url = os.path.join(url, 'lemur_atlas_list_t2_bids.csv') pheno_file = _fetch_file(pheno_url, data_dir, verbose=verbose) phenotypic = np.recfromcsv( pheno_file, delimiter='\t', skip_header=True, names=['animal_id', 'gender', 'birthdate', 'mri_date'], dtype=['U8', 'U3', 'datetime64[D]', 'datetime64[D]'], converters={ 2: _parse_date, 3: _parse_date }, encoding='U8') phenotypic = phenotypic[[ np.where(phenotypic['animal_id'] == '"' + i + '"')[0][0] for i in subject_ids ]] fdescr = _get_dataset_descr(dataset_name) return Bunch(anat=anat, pheno=phenotypic, description=fdescr)
Created on Sat Dec 14 17:23:25 2013 Author: Josef Perktold """ from __future__ import print_function import os import numpy as np from statsmodels.sandbox.nonparametric import kernels from numpy.testing import assert_allclose, assert_array_less DEBUG = 0 curdir = os.path.dirname(os.path.abspath(__file__)) fname = 'results/results_kernel_regression.csv' results = np.recfromcsv(os.path.join(curdir, fname)) y = results['accident'] x = results['service'] positive = x >= 0 x = np.log(x[positive]) y = y[positive] xg = np.linspace(x.min(), x.max(), 40) # grid points default in Stata #kern_name = 'gau' #kern = kernels.Gaussian() #kern_name = 'epan2' #kern = kernels.Epanechnikov() #kern_name = 'rec' #kern = kernels.Uniform() # ours looks awful #kern_name = 'tri'
def load_data_file(name, skip_header=None): fname = os.path.join(os.path.dirname(__file__), 'data', name) return np.recfromcsv(fname, skip_header=skip_header, case_sensitive=True).view(np.recarray)
from matplotlib.backends import backend_pdf from nilearn import plotting from nilearn import image from nilearn.masking import apply_mask from nilearn.image import math_img from soma import aims from copy import deepcopy import numpy as np import os import csv import glob ### initializes paths and set parameters atlas_template = '/neurospin/grip/protocols/MRI/Resting_state_Victor_2014/atlases/atlas_fonctionel_control_AVCnn/AVCnn.nii' #reference atlas on which all individual atlases will be based labels = np.recfromcsv('/neurospin/grip/protocols/MRI/Resting_state_Victor_2014/atlases/atlas_fonctionel_control_AVCnn/AVCnnlabels.csv')#reference roi labels on which all individual atlases will be based labels_names_ref =labels['name'].T root = '/media/vd239549/LaCie/victor/AVCnn/AVCnn_2016_DARTEL/patients' subjects = open('/neurospin/grip/protocols/MRI/Resting_state_Victor_2014/ressources_txt/AVCnn_patients.txt','r').read().split() #AVCnn_cont_all.txt #AVCnn_patients.txt basename = 'func_atlas' func_type= 'RS1' save_report = '/media/vd239549/LaCie/victor/AVCnn/AVCnn_2016_DARTEL/patients/docs/Atlas/atlas_func_report_patients.pdf' at_dir = '/neurospin/grip/protocols/MRI/Resting_state_Victor_2014/AVCnn/resultats/atlas_indiv_func' #dir where to save the atlases roi_thresh = 0.5 #Threshold of voxel ratio for roi to be considered #if number of voxel in individual brain mask and roi of reference atlas / number of voxel in roi of reference atlas < threshold then roi will be empty brain_mask_thresh = 0.1 #Threshold for mask binarization (times average intensity of mask : 0.1 means 10% of average mask intensity) #gathering data from reference atlas template_img = nb.load(atlas_template) #image oject
plt.scatter(data_float[:, 0], data_float[:, 1]) plt.xlabel('time (min.)') plt.ylabel('percentage of larvae') plt.show() ############################################ data = np.genfromtxt('titanic.csv', delimiter=',', names=True, dtype=None) np.shape(data) print(data["Survived"]) ################################################ # Assign the filename: file file = 'titanic.csv' # Import file using np.recfromcsv: d d = np.recfromcsv(file) # Print out first three entries of d print(d[:3]) ############################################### # Import pandas as pd import pandas as pd # Assign the filename: file file = 'titanic.csv' # Read the file into a DataFrame: df df = pd.read_csv(file) # View the head of the DataFrame
from nilearn.image import smooth_img from nilearn.mass_univariate import permuted_ols FWHM = 5 ### Gather data # images path_to_images = "/home/virgile/wip/retreat/pypreprocess_output" images = sorted( glob.glob(os.path.join(path_to_images, "OAS1_*_MR1/mwc2OAS1_*dimbet.nii"))) #images = images[:39] # disc1 only n_samples = len(images) # explanatory variates path_to_csv = "/home/virgile/wip/retreat/oasis/oasis_cross-sectional.csv" ext_vars = np.recfromcsv(path_to_csv)[:n_samples] age = ext_vars['age'].astype(float).reshape((-1, 1)) # filter elderly subjects (= subjects with available Alzheimer info) elderly_subjects_ids = np.where(~np.isnan(ext_vars['cdr']))[0] images = [x for (i, x) in enumerate(images) if i in elderly_subjects_ids] age = age[elderly_subjects_ids] cdr = LabelBinarizer().fit_transform(LabelEncoder().fit_transform( ext_vars['cdr'][elderly_subjects_ids])) cdr = cdr[:, -1].reshape((-1, 1)) # build impairment variate ### Mask data print "Resample images" nifti_masker = NiftiMasker(smoothing_fwhm=FWHM, memory='nilearn_cache', memory_level=1) # cache options
def main(): """Main function that is called when TPOT is run on the command line""" parser = argparse.ArgumentParser( description='A Python tool that ' 'automatically creates and optimizes machine learning pipelines using ' 'genetic programming.', add_help=False) parser.add_argument( 'INPUT_FILE', type=str, help='Data file to use in the TPOT ' 'optimization process. Ensure that the class label column is labeled as "class".' ) parser.add_argument('-h', '--help', action='help', help='Show this help message and exit.') parser.add_argument( '-is', action='store', dest='INPUT_SEPARATOR', default='\t', type=str, help='Character used to separate columns in the input file.') parser.add_argument('-target', action='store', dest='TARGET_NAME', default='class', type=str, help='Name of the target column in the input file.') parser.add_argument( '-mode', action='store', dest='TPOT_MODE', choices=['classification', 'regression'], default='classification', type=str, help= 'Whether TPOT is being used for a supervised classification or regression problem.' ) parser.add_argument( '-o', action='store', dest='OUTPUT_FILE', default='', type=str, help='File to export the code for the final optimized pipeline.') parser.add_argument( '-g', action='store', dest='GENERATIONS', default=100, type=positive_integer, help='Number of iterations to run the pipeline optimization process.\n' 'Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. ' 'TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.' ) parser.add_argument( '-p', action='store', dest='POPULATION_SIZE', default=100, type=positive_integer, help= 'Number of individuals to retain in the GP population every generation.\n' 'Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. ' 'TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.' ) parser.add_argument( '-os', action='store', dest='OFFSPRING_SIZE', default=None, type=positive_integer, help='Number of offspring to produce in each GP generation. ' 'By default, OFFSPRING_SIZE = POPULATION_SIZE.') parser.add_argument( '-mr', action='store', dest='MUTATION_RATE', default=0.9, type=float_range, help='GP mutation rate in the range [0.0, 1.0]. This tells the ' 'GP algorithm how many pipelines to apply random changes to every generation. ' 'We recommend using the default parameter unless you understand how the mutation ' 'rate affects GP algorithms.') parser.add_argument( '-xr', action='store', dest='CROSSOVER_RATE', default=0.1, type=float_range, help='GP crossover rate in the range [0.0, 1.0]. This tells the ' 'GP algorithm how many pipelines to "breed" every generation. ' 'We recommend using the default parameter unless you understand how the crossover ' 'rate affects GP algorithms.') parser.add_argument( '-scoring', action='store', dest='SCORING_FN', default=None, type=str, help='Function used to evaluate the quality of a given pipeline for ' 'the problem. By default, accuracy is used for classification problems and mean ' 'squared error (mse) is used for regression problems. ' 'TPOT assumes that any function with "error" or "loss" in the name is meant to ' 'be minimized, whereas any other functions will be maximized. 
' 'Offers the same options as cross_val_score: ' '"accuracy", "adjusted_rand_score", "average_precision", "f1", "f1_macro", ' '"f1_micro", "f1_samples", "f1_weighted", "log_loss", "mean_absolute_error", ' '"mean_squared_error", "median_absolute_error", "precision", "precision_macro", ' '"precision_micro", "precision_samples", "precision_weighted", "r2", "recall", ' '"recall_macro", "recall_micro", "recall_samples", "recall_weighted", "roc_auc"' ) parser.add_argument( '-cv', action='store', dest='NUM_CV_FOLDS', default=5, type=int, help='Number of folds to evaluate each pipeline over in ' 'k-fold cross-validation during the TPOT optimization process.') parser.add_argument( '-njobs', action='store', dest='NUM_JOBS', default=1, type=int, help='Number of CPUs for evaluating pipelines in parallel ' ' during the TPOT optimization process. Assigning this to -1 will use as many ' 'cores as available on the computer.') parser.add_argument( '-maxtime', action='store', dest='MAX_TIME_MINS', default=None, type=int, help='How many minutes TPOT has to optimize the pipeline. This ' 'setting will override the GENERATIONS parameter ' 'and allow TPOT to run until it runs out of time.') parser.add_argument( '-maxeval', action='store', dest='MAX_EVAL_MINS', default=5, type=float, help='How many minutes TPOT has to evaluate a single pipeline. ' 'Setting this parameter to higher values will allow TPOT to explore more complex ' 'pipelines but will also allow TPOT to run longer.') parser.add_argument( '-s', action='store', dest='RANDOM_STATE', default=None, type=int, help='Random number generator seed for reproducibility. Set ' 'this seed if you want your TPOT run to be reproducible with the same ' 'seed and data set in the future.') parser.add_argument( '-config', action='store', dest='CONFIG_FILE', default='', type=str, help='Configuration file for customizing the operators and parameters ' 'that TPOT uses in the optimization process.') parser.add_argument( '-v', action='store', dest='VERBOSITY', default=1, choices=[0, 1, 2, 3], type=int, help='How much information TPOT communicates ' 'while it is running: 0 = none, 1 = minimal, 2 = high, 3 = all. ' 'A setting of 2 or higher will add a progress bar during the optimization procedure.' ) parser.add_argument( '--no-update-check', action='store_true', dest='DISABLE_UPDATE_CHECK', default=False, help= 'Flag indicating whether the TPOT version checker should be disabled.') parser.add_argument('--version', action='version', version='TPOT {version}'.format(version=__version__), help='Show the TPOT version number and exit.') args = parser.parse_args() if args.VERBOSITY >= 2: print('\nTPOT settings:') for arg in sorted(args.__dict__): arg_val = args.__dict__[arg] if arg == 'DISABLE_UPDATE_CHECK': continue elif arg == 'SCORING_FN' and arg_val is None: if args.TPOT_MODE == 'classification': arg_val = 'accuracy' else: arg_val = 'mean_squared_error' elif arg == 'OFFSPRING_SIZE' and arg_val is None: arg_val = args.__dict__['POPULATION_SIZE'] print('{}\t=\t{}'.format(arg, arg_val)) print('') input_data = np.recfromcsv(args.INPUT_FILE, delimiter=args.INPUT_SEPARATOR, dtype=np.float64, case_sensitive=True) if args.TARGET_NAME not in input_data.dtype.names: raise ValueError( 'The provided data file does not seem to have a target column. ' 'Please make sure to specify the target column using the -target parameter.' 
) features = np.delete(input_data.view(np.float64).reshape( input_data.size, -1), input_data.dtype.names.index(args.TARGET_NAME), axis=1) training_features, testing_features, training_classes, testing_classes = \ train_test_split(features, input_data[args.TARGET_NAME], random_state=args.RANDOM_STATE) if args.TPOT_MODE == 'classification': tpot_type = TPOTClassifier else: tpot_type = TPOTRegressor operator_dict = None if args.CONFIG_FILE: try: with open(args.CONFIG_FILE, 'r') as input_file: file_string = input_file.read() operator_dict = eval( file_string[file_string.find('{'):(file_string.rfind('}') + 1)]) except: raise TypeError( 'The operator configuration file is in a bad format or not available. ' 'Please check the configuration file before running TPOT.') tpot = tpot_type(generations=args.GENERATIONS, population_size=args.POPULATION_SIZE, offspring_size=args.OFFSPRING_SIZE, mutation_rate=args.MUTATION_RATE, crossover_rate=args.CROSSOVER_RATE, cv=args.NUM_CV_FOLDS, n_jobs=args.NUM_JOBS, scoring=args.SCORING_FN, max_time_mins=args.MAX_TIME_MINS, max_eval_time_mins=args.MAX_EVAL_MINS, random_state=args.RANDOM_STATE, config_dict=operator_dict, verbosity=args.VERBOSITY, disable_update_check=args.DISABLE_UPDATE_CHECK) print('') tpot.fit(training_features, training_classes) if args.VERBOSITY in [1, 2] and tpot._optimized_pipeline: training_score = max([ tpot._pareto_front.keys[x].wvalues[1] for x in range(len(tpot._pareto_front.keys)) ]) print('\nTraining score: {}'.format(abs(training_score))) print('Holdout score: {}'.format( tpot.score(testing_features, testing_classes))) elif args.VERBOSITY >= 3 and tpot._pareto_front: print('Final Pareto front testing scores:') for pipeline, pipeline_scores in zip(tpot._pareto_front.items, reversed( tpot._pareto_front.keys)): tpot._fitted_pipeline = tpot._pareto_front_fitted_pipelines[str( pipeline)] print('{}\t{}\t{}'.format( int(abs(pipeline_scores.wvalues[0])), tpot.score(testing_features, testing_classes), pipeline)) if args.OUTPUT_FILE != '': tpot.export(args.OUTPUT_FILE)
def fetch_atlas_msdl(data_dir=None, url=None, resume=True, verbose=1): """Download and load the MSDL brain atlas. Parameters ---------- data_dir: string, optional Path of the data directory. Used to force data storage in a specified location. Default: None url: string, optional Override download URL. Used for test only (or if you setup a mirror of the data). Returns ------- data: sklearn.datasets.base.Bunch Dictionary-like object, the interest attributes are : - 'maps': str, path to nifti file containing regions definition. - 'labels': string list containing the labels of the regions. - 'region_coords': tuple list (x, y, z) containing coordinates of each region in MNI space. - 'networks': string list containing names of the networks. - 'description': description about the atlas. References ---------- :Download: https://team.inria.fr/parietal/files/2015/01/MSDL_rois.zip :Paper to cite: `Multi-subject dictionary learning to segment an atlas of brain spontaneous activity <http://hal.inria.fr/inria-00588898/en>`_ Gael Varoquaux, Alexandre Gramfort, Fabian Pedregosa, Vincent Michel, Bertrand Thirion. Information Processing in Medical Imaging, 2011, pp. 562-573, Lecture Notes in Computer Science. :Other references: `Learning and comparing functional connectomes across subjects <http://hal.inria.fr/hal-00812911/en>`_. Gael Varoquaux, R.C. Craddock NeuroImage, 2013. """ url = 'https://team.inria.fr/parietal/files/2015/01/MSDL_rois.zip' opts = {'uncompress': True} dataset_name = "msdl_atlas" files = [(os.path.join('MSDL_rois', 'msdl_rois_labels.csv'), url, opts), (os.path.join('MSDL_rois', 'msdl_rois.nii'), url, opts)] data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose) files = _fetch_files(data_dir, files, resume=resume, verbose=verbose) csv_data = np.recfromcsv(files[0]) labels = [name.strip() for name in csv_data['name'].tolist()] labels = [label.decode("utf-8") for label in labels] with warnings.catch_warnings(): warnings.filterwarnings('ignore', module='numpy', category=FutureWarning) region_coords = csv_data[['x', 'y', 'z']].tolist() net_names = [net_name.strip() for net_name in csv_data['net_name'].tolist()] fdescr = _get_dataset_descr(dataset_name) return Bunch(maps=files[1], labels=labels, region_coords=region_coords, networks=net_names, description=fdescr)
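# Hedged usage sketch for fetch_atlas_msdl above, following its Returns section;
# it assumes the MSDL archive can be downloaded to the default data directory.
msdl = fetch_atlas_msdl()
print(msdl.maps)               # path to the MSDL probabilistic maps image
print(msdl.labels[:3])         # first few region labels parsed from the CSV
print(msdl.region_coords[0])   # (x, y, z) MNI coordinates of the first region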
data # Example file = 'seaslug.txt' data = np.loadtxt(file, delimiter='\t', dtype=str) print(data[0]) data_float = np.loadtxt(file, delimiter='\t', dtype=float, skiprows=1) print(data_float[9]) plt.scatter(data_float[:, 0], data_float[:, 1]) plt.xlabel('time (min.)') plt.ylabel('percentage of larvae') plt.show() # use 'np.genfromtxt' to deal with complex data data = np.genfromtxt('titanic_sub.csv', delimiter=',', names=True, dtype=None) # np.recfromcsv() behaves similarly to np.genfromtxt(), except that its default dtype is None # it has the defaults delimiter=',' and names=True in addition to dtype=None d = np.recfromcsv('titanic_sub.csv') print(d[:3]) # Importing Flat Files using pandas # df.read_csv() / df.read_table() import pandas as pd filename = 'titanic_sub.csv' data = pd.read_csv(filename, nrows = 10, header = None) data.head() # check the first five rows data_array = data.values # convert into a numpy array # Example import matplotlib.pyplot as plt file = 'titanic_sub.csv' data = pd.read_csv(file, sep=',', comment='#', na_values='Nothing') print(data.head()) pd.DataFrame.hist(data[['Age']])
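# A small sketch of the equivalence noted above, assuming 'titanic_sub.csv' is present:
# np.recfromcsv uses delimiter=',', names=True and dtype=None by default and
# lower-cases the field names, so it should match the explicit genfromtxt call.
import numpy as np
d_rec = np.recfromcsv('titanic_sub.csv')
d_gen = np.genfromtxt('titanic_sub.csv', delimiter=',', names=True, dtype=None)
print(d_rec.dtype.names)  # lower-cased header names
print(d_gen.dtype.names)  # header names as written in the file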
data_float = np.loadtxt(file, delimiter="\t", dtype=float, skiprows=1) # Print the 10th element of data_float print(data_float[9]) # Plot a scatterplot of the data plt.scatter(data_float[:, 0], data_float[:, 1]) plt.xlabel('time (min.)') plt.ylabel('percentage of larvae') plt.show() # Assign the filename: file file = 'titanic.csv' # Import file using np.recfromcsv: d d = np.recfromcsv(file,delimiter=",", names=True,dtype=None) # Print out first three entries of d print(d[:3]) # Import pandas as pd import pandas as pd # Assign the filename: file file = 'titanic.csv' # Read the file into a DataFrame: df df = pd.read_csv(file) # View the head of the DataFrame print(df.head())
Different classifiers in decoding the Haxby dataset ===================================================== Here we compare different classifiers on a visual object recognition decoding task. """ import time ### Fetch data using nilearn dataset fetcher ################################ from nilearn import datasets data_files = datasets.fetch_haxby(n_subjects=1) # load labels import numpy as np labels = np.recfromcsv(data_files.session_target[0], delimiter=" ") stimuli = labels['labels'] # identify resting state labels in order to be able to remove them resting_state = stimuli == "rest" # find names of remaining active labels categories = np.unique(stimuli[resting_state == False]) # extract tags indicating to which acquisition run a tag belongs session_labels = labels["chunks"][resting_state == False] # Load the fMRI data from nilearn.input_data import NiftiMasker # For decoding, standardizing is often very important masker = NiftiMasker(mask_img=data_files['mask_vt'][0], standardize=True)
from datetime import datetime import matplotlib.dates as mdates import matplotlib.pyplot as plt import numpy as np # Import Data data_path = "output/timeseries_export.csv" results = np.recfromcsv(data_path, encoding=None) # Get times as datetime objects times = list(map(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"), results["time"])) # Generate Plot fig, axarr = plt.subplots(2, sharex=True) axarr[0].set_title("Water Level and Discharge") # Upper subplot axarr[0].set_ylabel("Water Level [m]") axarr[0].plot(times, results["storage_level"], label="Storage", linewidth=2, color="b") axarr[0].plot(times, results["sea_level"], label="Sea", linewidth=2, color="m") axarr[0].plot( times, 0.5 * np.ones_like(times), label="Storage Max", linewidth=2, color="r", linestyle="--", )
# importing files using numpy
# import data as arrays
import numpy as np

fn = "files/mnist_kaggle_some_rows.csv"
data = np.loadtxt(fn, delimiter=',')
print(data)
print(type(data))
print(np.shape(data))

titanic = np.recfromcsv("files/titanic_sub.csv")
np.shape(titanic)
print(titanic[:4])
print(type(titanic[1]))
#import pyfits from matplotlib.backends.backend_pdf import PdfPages # from Leinert 1998 table 2 calfac = 21.7 * 0.44 * (7.40 / 21.7) fig = plt.figure(figsize=(6.5, 3.0)) f, (ax, ax1) = plt.subplots(2, sharex=True, gridspec_kw={'height_ratios': [3, 1]}) #ax = fig.add_subplot(1,1,1) ax.plot([0, 20], [0, 0], linestyle=':', color='black') ipd_density = np.recfromcsv('lookup/total_density_vs_r.txt', delimiter=' ') axa = ax.twinx() ax.set_zorder(axa.get_zorder() + 1) ax.patch.set_visible(False) p2, = axa.plot(ipd_density['au'], ipd_density['au']**(-2) * ipd_density['density'], linestyle='-', color='green') axa.fill_between(ipd_density['au'], 0.1 * ipd_density['au']**(-2) * ipd_density['density'], 10 * ipd_density['au']**(-2) * ipd_density['density'], color='green', alpha=0.2)
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = RandomForestRegressor(max_features=0.25, min_samples_leaf=11,
                                          min_samples_split=9, n_estimators=100)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
## Import your algorithms here. from Tesla import Tesla from svr import SVR from ContextEngineBase import Complexity ## For different tests, these values will vary. inputFilePath = "SVRTestInput.csv" outputFilePath = "SVRTestOutput.csv" complexity = Complexity.secondOrder numTrainingSamples = 96 numExecuteSamples = 96 inputFile = open(inputFilePath) outputFile = open(outputFilePath) inputReader = csv.reader(inputFile) outputReader = csv.reader(outputFile) csv = recfromcsv(inputFilePath, delimiter=',') ## Change the name of the algorithm to test it out. algorithmTest = SVR(complexity, 1, 0, [0], {}) teslaTimestamps = {} svrTimestamps = {} #print(algorithmTest.complexity); #print(algorithmTest.functionOrder); totRow = 35040 numRow = 96 day_train_start = 0 day_train_end = 0 day_predict = 4 x_train = []
import numpy as np raw_data = np.recfromcsv("data/red.csv",delimiter=';') X = [] Y = [] for line in raw_data: row = [] for i in range(0,len(line)-1): row.append(line[i]) Y.append(line[len(line)-1]) X.append(row) X = np.mat(X) Y = np.transpose(np.mat(Y)) sample = X[:1500] test = X[1500:] sY = Y[:1500] tY = Y[1500:] p = [] for j in range(0,len(tY)-1): y = test[j] dist = [] for i in range(len(sample)): x = np.linalg.norm(sample[i]-y) dist.append([x,i])
# CSV file to read in csv_file = '/Users/sudregp/tmp/gf1p5t.csv' # Name of the column to be added to CSV var = 'matched' # define the two groups groups = ['NV', 'ADHD'] # for every individual in the smaller group, choose this many in the bigger group match_ratio = 2 # Some other variables to limit usable scans qc_column = 'raw_rating' use_twins = 1 # set to 0 if only using rows with column twin==0 gf = np.recfromcsv(csv_file) group_rows = [] group_subjects = [] for group in groups: group_rows.append([i for i in range(len(gf)) if gf[i]['dxgroup'] == group]) group_subjects.append(list(np.unique(gf[group_rows[-1]]['id']))) # First we clean up the QC column to make sure it only has numbers and NaNs for row in range(len(gf)): try: gf[row][qc_column] = float(gf[row][qc_column]) except ValueError: gf[row][qc_column] = np.nan # let's create a few dictionaries to make life easier later
import numpy as np from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline, make_union from tpot.builtins import StackingEstimator # NOTE: Make sure that the class is labeled 'class' in the data file ###tpot_data = np.recfromcsv('./sources/cars.csv', delimiter=',', dtype=np.float64) tpot_data = np.recfromcsv('./sources/cars.csv') features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['class'], random_state=42) exported_pipeline = make_pipeline( StackingEstimator(estimator=LogisticRegression(C=1.0, dual=True)), RandomForestClassifier(max_features=0.6000000000000001, min_samples_leaf=20, min_samples_split=18)) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) print(results)
def load_neuropixels_data(dir_name='neuropixels'): '''Downloads and returns data for the Neuropixels example. The dataset comes from `UCL's Cortex Lab <http://data.cortexlab.net/dualPhase3/data/>`_. Args: dir_name (str): Specifies the directory to which the data files should be downloaded. This is concatenated with the user-set data directory. Returns: dict: A dictionary where each key corresponds to a needed file. ''' dpath = os.path.join(config.get_data_directory(), dir_name) if not os.path.exists(dpath): os.makedirs(dpath) base_url = 'http://data.cortexlab.net/dualPhase3/data/' file_dict = dict() parent_fnames = [ 'experiment1stimInfo.mat', 'experiment2stimInfo.mat', 'experiment3stimInfo.mat', 'timeCorrection.mat', 'timeCorrection.npy', ] parent_dir = [ 'frontal/', 'posterior/', ] subdir_fnames = [ 'spike_clusters.npy', 'spike_templates.npy', 'spike_times.npy', 'templates.npy', 'whitening_mat_inv.npy', 'cluster_groups.csv', 'channel_positions.npy', ] for name in parent_fnames: fname = os.path.join(dpath, name) url = os.path.join(base_url, name) if not os.path.exists(fname): _urlretrieve(url, fname) file_dict[name] = _load_file(fname) for directory in parent_dir: if not os.path.exists(os.path.join(dpath, directory)): os.makedirs(os.path.join(dpath, directory)) for subdir in subdir_fnames: fname = os.path.join(dpath, directory, subdir) url = os.path.join(base_url, directory, subdir) if not os.path.exists(fname): _urlretrieve(url, fname) key = os.path.join(directory, subdir) if subdir == 'cluster_groups.csv': file_dict[key] = np.recfromcsv(fname, delimiter='\t') else: file_dict[key] = _load_file(fname) return file_dict
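# Hedged usage sketch for load_neuropixels_data above; it assumes the downloads
# complete and that the configured data directory is writable.
neuro = load_neuropixels_data()
clusters = neuro['frontal/cluster_groups.csv']  # record array parsed with recfromcsv
print(clusters.dtype.names)
print(sorted(neuro.keys())[:5])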
import pandas as pd
import numpy as np

# data = np.genfromtxt('/Users/apple/desktop/py4e/ImportData1/titanic_sub.csv',
#                      delimiter=',', names=True, dtype=None)
# print(data)
# print(np.shape(data))
# print(data['Fare'])
# print(data['Survived'])

# recfromcsv is similar to genfromtxt, but dtype=None is the default
d = np.recfromcsv('/Users/apple/desktop/py4e/ImportData1/titanic_sub.csv',
                  delimiter=',', names=True)
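# A quick comparison sketch using the same titanic_sub.csv path as above:
# both calls load the same data, but recfromcsv returns an np.recarray
# (fields accessible as attributes) and lower-cases the column names by
# default, while genfromtxt returns a plain structured ndarray that keeps
# the original header names.
g = np.genfromtxt('/Users/apple/desktop/py4e/ImportData1/titanic_sub.csv',
                  delimiter=',', names=True, dtype=None)
r = np.recfromcsv('/Users/apple/desktop/py4e/ImportData1/titanic_sub.csv',
                  delimiter=',', names=True)
print(g['Fare'][:5])  # structured-array style access
print(r.fare[:5])     # recarray attribute access (note the lower-cased name)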
import pylab as plt
import numpy as np
import random

raw_data = np.recfromcsv("../data/data3.csv")
m = {}

# read dataset and put the points into a dictionary keyed by class
for line in raw_data:
    line[2] = int(line[2][len(line[2]) - 1]) - 1
    if line[2] not in m:
        m[line[2]] = []
    m[line[2]].append([line[0], line[1], line[2]])

fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
colors = "brcmgykw"
for key in m:
    for value in m.get(key):
        color = value[2]
        i = int(color[len(color) - 1])
        ax.plot(value[0], value[1], "o", color=colors[i])
    # re-key this class by its integer index
    m[i] = m.pop(key)

# setup GUI
import textwrap, sys, random, getTerms
import numpy as np

terms = getTerms.get()  # get terms from getTerms.py
punctuations = [' ', ', ', '. ']
wordCt = [100, 300]
words = np.recfromcsv('common_words_freq.csv')  # read CSV of Eng. terms and freq
p = np.true_divide(words['frequency'], np.sum(words['frequency']))
probs1 = np.array([0.8, 0.05, 0.15])  # texts, terms, specific terms
probs2 = np.array([0.8, 0.1, 0.1])  # prob of space vs comma vs period


def genDoc(spec):  # spec argument is a list of specialized terms
    ct = np.random.randint(wordCt[0], wordCt[1])  # word count for doc
    strc = np.random.choice(range(3), ct, True, probs1)  # 0=wrd, 1=trm, 2=spec
    txt = np.empty(ct, dtype=words.dtype[0])
    txt[strc == 0] = np.random.choice(words['word'], np.sum(strc == 0), True, p)
    txt[strc == 1] = np.random.choice(np.hstack(terms), np.sum(strc == 1), True)
    txt[strc == 2] = np.random.choice(spec, np.sum(strc == 2), True)
    puncts = np.random.choice(punctuations, ct, True, probs2)
    out = np.empty(txt.size * 2, dtype=txt.dtype)
    out[0::2] = txt
    out[1::2] = puncts
import os

import numpy as np
import scipy.stats
from numpy import recfromcsv


def stats_test(_output_filename, input_filename):
    _, extension = os.path.splitext(input_filename)
    if extension == '.csv':
        data = recfromcsv(input_filename, delimiter=',')
        first_f = 1
    else:
        data = np.load(input_filename)
        first_f = 0

    dims = 15
    mfss = np.zeros([len(data), dims])
    for i in range(len(data)):
        print(i)
        if extension == '.csv':
            mfs = np.array(tuple(data[i])[first_f:])  # .astype(np.float32)
        else:
            mfs = data[i]

        if len(mfs) < 100:
            max_fa = np.max(mfs)
            min_fa = np.min(mfs)
            std_fa = np.std(mfs)
            mean_fa = np.mean(mfs)
            median_fa = np.median(mfs)
            sum_fa = np.sum(mfs)
            variation = scipy.stats.variation(mfs)
            var = scipy.stats.tvar(mfs)
            skew = scipy.stats.skew(mfs)
            kurtosis = scipy.stats.kurtosis(mfs)
            arg_max = np.argmax(mfs)
            arg_min = np.argmin(mfs)
            diff = np.max(mfs) - np.min(mfs)
            first_f = mfs[0]
            last_f = mfs[-1]
            mfss[i] = np.array([
                max_fa, min_fa, mean_fa, std_fa, median_fa, sum_fa, skew,
                kurtosis, variation, var, arg_max, arg_min, diff, last_f,
                first_f
            ])
        else:
            tmp = np.array([])
            for l in range(5):
                mmfs = mfs[l * 20:(l + 1) * 20 - 1]
                max_fa = np.max(mmfs)
                min_fa = np.min(mmfs)
                std_fa = np.std(mmfs)
                mean_fa = np.mean(mmfs)
                median_fa = np.median(mmfs)
                sum_fa = np.sum(mmfs)
                variation = scipy.stats.variation(mmfs)
                var = scipy.stats.tvar(mmfs)
                skew = scipy.stats.skew(mmfs)
                kurtosis = scipy.stats.kurtosis(mmfs)
                arg_max = np.argmax(mmfs)
                arg_min = np.argmin(mmfs)
                tmp = np.hstack(
                    (tmp,
                     np.array([
                         max_fa, min_fa, mean_fa, std_fa, median_fa, sum_fa,
                         skew, kurtosis, variation, var, arg_max, arg_min
                     ])))

            # for l in range(5):
            #     skews[l] = scipy.stats.skew(mfs[l*20 : (l+1)*20 - 1])
            #     kurtosiss[l] = scipy.stats.kurtosis(mfs[l*20 : (l+1)*20 - 1])
            # skew = np.mean(skews)
            # kurtosis = np.mean(kurtosiss)
            mfss[i] = tmp
            print(mfss[i], i)

    # data_path is assumed to be defined at module level in the original script
    np.save(data_path + _output_filename, mfss)
    print("Saved ", data_path + _output_filename)
def fetch(self, contrasts=None, n_subjects=None, get_tmaps=False,
          get_masks=False, get_anats=False, url=None, resume=True,
          force=False, verbose=1):
    if n_subjects is None:
        n_subjects = 94  # 94 subjects available

    if (n_subjects > 94) or (n_subjects < 1):
        warnings.warn("Wrong value for 'n_subjects' (%d). The maximum "
                      "value will be used instead ('n_subjects=94')"
                      % n_subjects)
        n_subjects = 94  # 94 subjects available

    if contrasts is None:
        contrasts = self.contrast_name_wrapper.values()
    elif isinstance(contrasts, _basestring):
        contrasts = [contrasts]

    allowed_contrasts = list(self.contrast_name_wrapper.values())
    # convert contrast names
    contrasts_wrapped = []
    # get a unique ID for each contrast. It is used to give a unique name to
    # each download file and avoid name collisions.
    contrasts_indices = []
    for contrast in contrasts:
        if contrast in allowed_contrasts:
            contrasts_wrapped.append(contrast)
            contrasts_indices.append(allowed_contrasts.index(contrast))
        elif contrast in self.contrast_name_wrapper:
            name = self.contrast_name_wrapper[contrast]
            contrasts_wrapped.append(name)
            contrasts_indices.append(allowed_contrasts.index(name))
        else:
            raise ValueError("Contrast '%s' is not available" % contrast)

    # It is better to perform several small requests than a big one because:
    # - Brainomics server has no cache (can lead to timeout while the archive
    #   is generated on the remote server)
    # - Local (cached) version of the files can be checked for each contrast
    opts = {'uncompress': True}
    subject_ids = ["S%02d" % s for s in range(1, n_subjects + 1)]
    subject_id_max = subject_ids[-1]
    data_types = ["c map"]
    if get_tmaps:
        data_types.append("t map")
    rql_types = str.join(", ", ["\"%s\"" % x for x in data_types])
    root_url = "http://brainomics.cea.fr/localizer/"

    base_query = ("Any X,XT,XL,XI,XF,XD WHERE X is Scan, X type XT, "
                  "X concerns S, "
                  "X label XL, X identifier XI, "
                  "X format XF, X description XD, "
                  'S identifier <= "%s", ' % (subject_id_max, ) +
                  'X type IN(%(types)s), X label "%(label)s"')

    urls = ["%sbrainomics_data_%d.zip?rql=%s&vid=data-zip"
            % (root_url, i,
               _urllib.parse.quote(base_query % {"types": rql_types,
                                                 "label": c},
                                   safe=',()'))
            for c, i in zip(contrasts_wrapped, contrasts_indices)]
    filenames = []
    for subject_id in subject_ids:
        for data_type in data_types:
            for contrast_id, contrast in enumerate(contrasts_wrapped):
                name_aux = str.replace(
                    str.join('_', [data_type, contrast]), ' ', '_')
                file_path = os.path.join("brainomics_data", subject_id,
                                         "%s.nii.gz" % name_aux)
                file_tarball_url = urls[contrast_id]
                filenames.append((file_path, file_tarball_url, opts))

    # Fetch masks if asked by user
    if get_masks:
        urls.append("%sbrainomics_data_masks.zip?rql=%s&vid=data-zip"
                    % (root_url,
                       _urllib.parse.quote(base_query
                                           % {"types": '"boolean mask"',
                                              "label": "mask"},
                                           safe=',()')))
        for subject_id in subject_ids:
            file_path = os.path.join("brainomics_data", subject_id,
                                     "boolean_mask_mask.nii.gz")
            file_tarball_url = urls[-1]
            filenames.append((file_path, file_tarball_url, opts))

    # Fetch anats if asked by user
    if get_anats:
        urls.append("%sbrainomics_data_anats.zip?rql=%s&vid=data-zip"
                    % (root_url,
                       _urllib.parse.quote(base_query
                                           % {"types": '"normalized T1"',
                                              "label": "anatomy"},
                                           safe=',()')))
        for subject_id in subject_ids:
            file_path = os.path.join("brainomics_data", subject_id,
                                     "normalized_T1_anat_defaced.nii.gz")
            file_tarball_url = urls[-1]
            filenames.append((file_path, file_tarball_url, opts))

    # Fetch subject characteristics (separated in two files)
    if url is None:
        url_csv = ("%sdataset/cubicwebexport.csv?rql=%s&vid=csvexport"
                   % (root_url,
                      _urllib.parse.quote("Any X WHERE X is Subject")))
        url_csv2 = ("%sdataset/cubicwebexport2.csv?rql=%s&vid=csvexport"
                    % (root_url,
                       _urllib.parse.quote(
                           "Any X,XI,XD WHERE X is QuestionnaireRun, "
                           "X identifier XI, X datetime "
                           "XD", safe=',')))
    else:
        url_csv = "%s/cubicwebexport.csv" % url
        url_csv2 = "%s/cubicwebexport2.csv" % url
    filenames += [("cubicwebexport.csv", url_csv, {}),
                  ("cubicwebexport2.csv", url_csv2, {})]

    # Actual data fetching
    files = self.fetcher.fetch(filenames, resume=resume, force=force,
                               verbose=verbose)
    anats = None
    masks = None
    tmaps = None

    # combine data from both covariates files into one single recarray
    from numpy.lib.recfunctions import join_by
    ext_vars_file2 = files[-1]
    csv_data2 = np.recfromcsv(ext_vars_file2, delimiter=';')
    files = files[:-1]
    ext_vars_file = files[-1]
    csv_data = np.recfromcsv(ext_vars_file, delimiter=';')
    files = files[:-1]
    # join_by sorts the output along the key
    csv_data = join_by('subject_id', csv_data, csv_data2,
                       usemask=False, asrecarray=True)[:n_subjects]
    if get_anats:
        anats = files[-n_subjects:]
        files = files[:-n_subjects]
    if get_masks:
        masks = files[-n_subjects:]
        files = files[:-n_subjects]
    if get_tmaps:
        tmaps = files[1::2]
        files = files[::2]
    return Bunch(cmaps=files, tmaps=tmaps, masks=masks, anats=anats,
                 ext_vars=csv_data)
import numpy as np
from sklearn.decomposition import FastICA
from sklearn.ensemble import RandomForestRegressor, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('raw_df.csv', delimiter=',', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    FastICA(tol=11.0),
    RandomForestRegressor(n_estimators=500)
)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)

print(testing_features)
print(results)
    'dmn_rACC_1to4_a0.05t0.99d5p5000.txt',
    'dorsal_lFusiform_1to4_a0.05t0.99d5p5000.txt',
    'dorsal_rFusiform_4to8_a0.05t0.99d5p5000.txt',
    'dorsal_rFusiform_65to100_a0.05t0.99d5p5000.txt',
    'ventral_lCingulate_4to8_a0.05t0.99d5p5000.txt',
    'ventral_rVFC_30to55_a0.05t0.99d5p5000.txt',
    'ventral_rVFC_4to8_a0.05t0.99d5p5000.txt'
]]
results['fmri'] = [[
    'cognitive_lSupra_a0.05t0.99d5p5000.txt',
    'cognitive_rSupra_a0.05t0.99d5p5000.txt',
    'dmn_rACC_a0.05t0.99d5p5000.txt',
    'dmn_rPrecuneus_a0.05t0.99d5p5000.txt',
    'dorsal_lIPS_a0.05t0.99d5p5000.txt'
]]

phen = np.recfromcsv(home + '/data/overlap_resting/subjs.csv')

# load the fMRI result and the data. Transform the data, remove residuals,
# and concatenate to the others in the same group
data_dir = home + '/data/results/inatt_resting/'
subjs_fname = home + '/data/fmri/joel_all.txt'
gf_fname = home + '/data/fmri/gf.csv'

fid = open(subjs_fname, 'r')
subjs = [line.rstrip() for line in fid]
fid.close()
gf = pd.read_csv(gf_fname)

# the order of subjects in data is the same as in subjs, because that's how
# data was created. let's find that order in the gf and resort it
idx = [np.nonzero(gf.maskid == int(s))[0][0] for s in subjs]
gf = gf.iloc[idx]

# find out what are the indices of subjects in overlap group
def create_metadata(self, cond_type):
    """
    The metadata file format is two columns [full scan index, value at index]
    """
    # checks file exists
    file = self.file_check(self.meta_dir, cond_type)

    # separates file type and string name of file.
    fileparts = os.path.basename(file).split('.')
    if len(fileparts) != 2:
        logger.critical(
            "Error processing file: %s, too many . in filebase" % file)

    # take file parts for processing.
    basefile, filetype = fileparts

    # IMPORTANT, transferring the meta file description
    logger.info("Matching '%s' to one of: %s meta types"
                % (basefile, self.meta_types))
    pattern = [t for t in self.meta_types
               if re.search(t, basefile) is not None]
    logger.info("Found %s to bring into HDF5." % basefile)

    # catch any potential errors.
    if len(pattern) > 1:
        logger.critical(
            'Multiple patterns matched for meta files. '
            'Must change meta file names to match a type from list below. '
            '%s' % self.meta_types)
    else:
        logger.debug("Matched %s with %s" % (pattern[0], basefile))
        attr_name = pattern[0]

    # now we can load in the file.
    if filetype == 'csv':
        info = np.recfromcsv(file, names=['index', attr_name], delimiter=',')
    elif filetype == 'txt':
        info = np.recfromcsv(file, names=['index', attr_name], delimiter='\t')
    else:
        logger.exception(
            "%s, not .txt or .csv, chill out, we're not there yet..." % file)

    # check to make sure we don't have more than 2 columns in meta data file
    if len(info[0]) > 2:
        logger.exception(
            'The metadata file format is [full scan index, value at index]')

    # checks that the length of the info file is the same length as the
    # number of TRs.
    if len(info) != (self.n_TRs):
        logger.exception("Length of %s, does not match total # of TRs." % file)

    # cycle through each run
    for i, r in self.hdf['func'].items():
        walker = []
        # make sure run index matches
        run = self.hdf['func'][i].attrs['run']
        # cycles through each TR
        for j in self.hdf['func'][i].attrs['f_ind']:
            # append to vector if run index matches global index
            if run == info[j][0]:
                walker.append(info[j][1])
            else:
                logger.critical("Run index and meta file do not match! AHH")
        # write to hdf5
        logger.info("No obvious errors processing %s file into hdf5" % file)
        self.hdf['func'][i].attrs[attr_name] = np.array(walker)
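# A minimal sketch of the metadata layout described in the docstring above
# (hypothetical condition values, not from any real scan): two columns, the
# full-scan index and the value at that index, read with an explicit `names`
# list so the second column is exposed under the matched meta-type name.
from io import StringIO
import numpy as np

meta_txt = StringIO("0,rest\n1,rest\n2,task\n")
info = np.recfromcsv(meta_txt, names=['index', 'condition'], delimiter=',')
print(info['index'])      # the full scan indices: [0 1 2]
print(info['condition'])  # the per-scan values under the matched meta type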
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target,
                                                    train_size=0.75, test_size=0.25)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_boston_pipeline.py')

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('C:/Users/ecervera/.spyder-py3/tpot_boston_pipeline.py',
                          delimiter=';', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['class'], random_state=None)

exported_pipeline = GradientBoostingRegressor(alpha=0.85, learning_rate=0.1,
                                              loss="ls", max_features=0.9,
                                              min_samples_leaf=5,
                                              min_samples_split=6)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)