Example No. 1
 def test_recfromcsv(self):
     # StringIO here is io.StringIO; basic usage with an explicit missing-value marker
     data = StringIO('A,B\n0,1\n2,3')
     test = np.recfromcsv(data, missing_values='N/A',
                          names=True, case_sensitive=True)
     control = np.array([(0, 1), (2, 3)],
                        dtype=[('A', int), ('B', int)])
     self.assertTrue(isinstance(test, np.recarray))
     assert_equal(test, control)
     # masked output: the 'N/A' entry comes back masked
     data = StringIO('A,B\n0,1\n2,N/A')
     test = np.recfromcsv(data, dtype=None, missing_values='N/A',
                          names=True, case_sensitive=True, usemask=True)
     control = ma.array([(0, 1), (2, -1)],
                        mask=[(False, False), (False, True)],
                        dtype=[('A', int), ('B', int)])
     assert_equal(test, control)
     assert_equal(test.mask, control.mask)
     assert_equal(test.A, [0, 2])
     # default behaviour: field names are lower-cased
     data = StringIO('A,B\n0,1\n2,3')
     test = np.recfromcsv(data, missing_values='N/A')
     control = np.array([(0, 1), (2, 3)],
                        dtype=[('a', int), ('b', int)])
     self.assertTrue(isinstance(test, np.recarray))
     assert_equal(test, control)
Example No. 2
    def from_paths(solver_names, task_paths, domain, suffix=".runs.csv"):
        """Collect run data from task paths."""

        training = RunData(solver_names)

        for path in task_paths:
            # load run records
            run_data = numpy.recfromcsv(path + suffix, usemask=True)
            rows = run_data.tolist()

            if run_data.shape == ():
                rows = [rows]

            for (run_solver, run_budget, run_cost, run_succeeded, run_answer) in rows:
                record = RunRecord(run_solver, run_budget, run_cost, run_succeeded)

                training.add_run(path, record)

            # load feature data
            feature_records = numpy.recfromcsv("{0}.features.csv".format(path))
            feature_dict = dict(zip(feature_records.dtype.names, feature_records.tolist()))

            training.add_feature_vector(path, feature_dict)

        return training
Example No. 3
def otherfunc(roifiles, subjects):
    import numpy as np
    from matplotlib.mlab import rec2csv
    import os

    first = np.recfromcsv(roifiles[0])
    numcons = len(first.dtype.names) - 1
    roinames = ["subject_id"] + first["roi"].tolist()
    formats = ["a20"] + ["f4" for f in roinames[1:]]
    confiles = []
    for con in range(0, numcons):
        recarray = np.zeros(len(roifiles), dtype={"names": roinames, "formats": formats})
        for i, file in enumerate(roifiles):
            recfile = np.recfromcsv(file)
            recarray["subject_id"][i] = subjects[i]
            for roi in roinames[1:]:
                value = recfile["con%02d" % (con + 1)][recfile["roi"] == roi]
                if value:
                    recarray[roi][i] = value
                else:
                    recarray[roi][i] = 999
        filename = os.path.abspath("grouped_con%02d.csv" % (con + 1))
        rec2csv(recarray, filename)
        confiles.append(filename)
    return confiles
def get_regressors(csv,ids):
    import numpy as np
    if csv == '':
        return None
    reg = {}
    design = np.recfromcsv(csv)
    design_str = np.recfromcsv(csv,dtype=str)
    names = design_str.dtype.names
    csv_ids = []
    for i in design_str["id"]:
        csv_ids.append(str(i))
    csv_ids = np.asarray(csv_ids)
    for n in names:
        if not n=="id":
            reg[n] = []
    for sub in ids:
        if sub in csv_ids:
            for key in reg.keys():
                reg[key].append(design[key][csv_ids==sub][0])
        else:
            raise Exception("%s is missing from the CSV file!"%sub)
    cov = []
    for key, item in reg.items():  # iteritems() is gone on Python 3
        cov.append({'name': key, 'vector': item, 'centering': 0})
    print(cov)
    return cov
def get_regressors(csv,ids):
    import numpy as np
    if csv == '':
        return None
    reg = {}
    design = np.recfromcsv(csv)
    design_str = np.recfromcsv(csv,dtype=str)
    names = design_str.dtype.names
    csv_ids = []
    for i in design_str["id"]:
        csv_ids.append(str(i))
    csv_ids = np.asarray(csv_ids)
    for n in names:
        if not n=="id":
            reg[n] = []
    for sub in ids:
        if sub in csv_ids:
            for key in reg.keys():
                reg[key].append(design[key][csv_ids==sub][0])
        else:
            raise Exception("%s is missing from the CSV file!"%sub)
    if 'group' in names:
        data = np.asarray(reg['group'])
        vals = np.unique(data)
        for i, v in enumerate(vals):
            data[data==v] = i+1
        group = data.astype(int).tolist()
        reg.pop('group')
        
    else:
        group = [1]*len(reg[names[-1]])
    return reg, group
Example No. 6
    def __init__(self, tasks_roots, domain):
        """Initialize."""

        # scan for CSV files
        train_paths = []

        for tasks_root in tasks_roots:
            train_paths.extend(cargo.files_under(tasks_root, domain.extensions))

        logger.info("using %i tasks for training", len(train_paths))

        # fetch training data from each file
        self._run_lists = {}
        self._feature_vectors = {}

        for path in train_paths:
            # load run records
            run_data = numpy.recfromcsv("{0}.runs.csv".format(path), usemask = True)
            run_list = []

            for (run_solver, run_budget, run_cost, run_succeeded, run_answer) in run_data.tolist():
                record = RunRecord(run_solver, run_budget, run_cost, run_succeeded)

                run_list.append(record)

            self._run_lists[path] = run_list

            # load feature data
            feature_vector = numpy.recfromcsv("{0}.features.csv".format(path)).tolist()

            self._feature_vectors[path] = feature_vector
 def sort_battles(self, results_filename='csv/mz_results_boulders.csv',
                  images_filename='csv/mz_images_boulders.csv',
                  out_filename='csv/mz_boulders_rank.csv'):
     p = np.recfromcsv(images_filename, names=True)
     objid = p.field('id')
     rank = np.zeros(objid.shape, int) - 1  # np.int was removed from numpy; plain int works
     fracrank = np.zeros(objid.shape) - 1
     battles = np.recfromcsv(results_filename, names=True)
     # currently does not do anything with inconclusive battles
     battles = battles[battles.field('winner') > 0]
     first = battles['first_asset_id']
     second = battles['second_asset_id']
     winner = battles['winner']
     w = np.where(winner == 1, first, second)
     l = np.where(winner == 1, second, first)
     competitors = np.unique(np.concatenate((w, l)))
     self.competitors = self._asarray(competitors)
     self.winners = self._asarray(w)
     self.losers = self._asarray(l)
     self._consistency_check()
     self._setup_internal_variables()
     print('ncomp = %i, nwars = %i'%(self.ncomp, self.nwars))
     self.iterate()
     for r, id in enumerate(self.ranking):
         idx = (objid == id).nonzero()[0]
         if len(idx) < 1:
             print('Could not find objid match for id={}, rank={}'.format(id, r))
         idx = idx[0]
         rank[idx] = r
         fracrank[idx] = float(r) / self.ncomp
     np.savetxt(out_filename, np.asarray((objid, rank, fracrank)).T,
                fmt='%d,%d,%.3f',
                header=("objid,rank,fracrank"))
Example No. 8
def compare(fileA, fileB):
    # load both CSV traces as record arrays
    mooseData = np.recfromcsv(fileA, delimiter=',')
    nrnData = np.recfromcsv(fileB, delimiter=',')
    # transpose rows into per-column sequences (list() is needed on Python 3)
    mooseData = list(zip(*mooseData))
    nrnData = list(zip(*nrnData))
    print(mooseData[0])
    pylab.plot([1e3 * x for x in mooseData[0]], [1e3 * x for x in mooseData[1]],
               label='moose')
    pylab.plot(nrnData[0], nrnData[1],
               label='neuron')
    #pylab.plot(mooseData)
    #pylab.plot(nrnData)
    pylab.show()
Example No. 9
def np_combine_csv_files(csvpaths, verbose=False):
    """Combine a collection of CSV files into a single numpy record
       array. Can take a while! CSV files with different fields
       (different headers, different number of fields) are merged
       together correctly, data type inferral and promotion takes a
       while.

       Treats the first line as a header, uses to name the fields.
       Giving it files without headers will cause weird things to
       happen.

       Arguments:
       csvpaths:    List of text files to read into the array

       Returns: numpy.recarray
    """
    big_csv = numpy.recfromcsv(
        open(csvpaths[0]), case_sensitive=True, deletechars='',
        replace_space=' ', autostrip=True
    )
    if 'File ID' not in big_csv.dtype.names and big_csv['Input'].size > 1:
        big_csv = numpy.lib.recfunctions.append_fields(
            big_csv, 'File ID',
            [os.path.splitext(os.path.basename(x))[0]
             for x in big_csv['Input']],
            usemask=False, asrecarray=True
        )
    for i, csvpath in enumerate(csvpaths[1:]):
        csv_arr = numpy.recfromcsv(
            open(csvpath), case_sensitive=True, deletechars='',
            replace_space=' ', autostrip=True
        )
        if 'File ID' not in csv_arr.dtype.names and csv_arr['Input'].size > 1:
            csv_arr = numpy.lib.recfunctions.append_fields(
                csv_arr, 'File ID',
                [os.path.splitext(os.path.basename(x))[0]
                 for x in csv_arr['Input']],
                usemask=False, asrecarray=True
            )
        for field_name in csv_arr.dtype.names:
            if field_name not in big_csv.dtype.names:
                big_csv = numpy.lib.recfunctions.append_fields(
                    big_csv, field_name, [], usemask=False, asrecarray=True
                )
        big_csv = numpy.lib.recfunctions.stack_arrays(
            (big_csv, csv_arr), usemask=False, asrecarray=True,
            autoconvert=True
        )
        if verbose:
            print('Loaded %d/%d files' % (i + 1, len(csvpaths)), end='\r')
    return big_csv
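A minimal usage sketch (the paths below are hypothetical; os and numpy.lib.recfunctions are assumed to be imported at module level, as the function requires):

big = np_combine_csv_files(['runs/part1.csv', 'runs/part2.csv'], verbose=True)  # hypothetical paths
print(big.dtype.names)  # the union of all headers seen across the files
print(len(big))         # one record per input row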
Example No. 10
def new_tables():
    sns.set_context("paper", font_scale=font_scale, rc={"lines.linewidth": 2.5})

    fig, ax = plt.subplots(1)

    with open('../results/sdss/query_number_num_new_tables.csv') as f:
        data = np.recfromcsv(f)
    c = data['num_new_tables'].astype(float)
    c /= sum(c)
    q = data['query_number'].astype(float)
    q /= q[-1]
    ax.plot(q, np.cumsum(c), label="SDSS", color=colors['sdss'], linewidth=2, drawstyle='steps-post')
    # ax.scatter(q[0: -1], np.cumsum(c)[0: -1], color=colors['sdss'], marker="o", s=50, alpha=.7)

    with open('../results/tpch/query_number_num_new_tables.csv') as f:
        data = np.recfromcsv(f)
    c = data['num_new_tables'].astype(float)
    c /= sum(c)
    q = data['query_number'].astype(float)
    q /= q[-1]
    ax.plot(q, np.cumsum(c), label="TPC-H", color=colors['tpch'], linewidth=2, drawstyle='steps-post')
    # ax.scatter(q[0: -1], np.cumsum(c)[0: -1], color=colors['tpch'], marker="o", s=50, alpha=.7)

    # sns.rugplot([0.1, 0.2, 10, 100], ax=ax)

    with open('../results/sqlshare/table_coverage.csv') as f:
        data = np.recfromcsv(f)
    c = data['tables'].astype(float)
    c /= c[-1]
    q = data['query_id'].astype(float)
    q /= q[-1]
    ax.plot(q, c, label="SQLShare", color=colors['sqlshare'], linewidth=2, drawstyle='steps-post')
    # ax.scatter(q[0: -1], c[0: -1], color=colors['sqlshare'], marker="o", s=20, alpha=.01)

    ax.yaxis.set_major_formatter(formatter)
    ax.xaxis.set_major_formatter(formatter)

    plt.title("CDF of new tables")
    ax.set_xlabel('\% of queries')
    ax.set_ylabel('\% of newly used table')

    ax.set_ylim(0, 1.01)
    ax.set_xlim(-0.01, 1)

    ax.title.set_position((ax.title._x, 1.04))

    plt.legend(loc=4)
    plt.tight_layout()

    plt.savefig(root_path + 'plot_table_coverage.eps', format='eps')
Example No. 11
File: cfutils.py Project: satra/sad
def get_subjects():
    """Returns names of all subjects
    """
    pdata = np.recfromcsv(('/mindhive/gablab/sad/PY_STUDY_DIR/Block/volsurf/'
                           'l2output/social/split_halves/regression/'
                           'lsasDELTA/6mm/allsubs.csv'), names=True)
    return pdata.subject.tolist()
def sort_results_csv(input_file='../../results/baseline_classifier_results.csv',output_file=''):
	"""
	Sorts the results csv file and writes to the same file.
	Sort on classifier name first (1st column), then on features (6th column).
	"""

	if output_file =='': output_file = input_file

	#import header first
	with open(input_file, 'r') as f:
		header = f.readline()

	#load csv into table (automatically with correct datatypes)
	table = np.recfromcsv(input_file,delimiter=',')

	#only sort if we have more than one element (to prevent bugs)
	if np.size(table) > 1:

		#sort on features
		table = sorted(table, key=lambda tup: tup[5])
		#sort on classifier
		table = sorted(table, key=lambda tup: tup[0])

		#store sorted file
		with open(output_file,'w') as fd:
			fd.write(header)
			[fd.write(settings_to_string(tup[0],tup[1],tup[2],tup[3],tup[4],tup[5],tup[6],tup[7]) + "\n") for tup in table]
Example No. 13
    def selectOnSharpeRatio(self, ls_symbols, top_n_equities=10):
        ''' Choose the best portfolio over the stock universe,
        according to their sharpe ratio'''
        #TODO: change this to a DataAccess utility --------------
        symbols, files = getAllFromCSV()
        datalength = len(recfromcsv(files[0])['close'])
        print('Datalength: {}'.format(datalength))
        #---------------------------------------------------------
        #Initializing data arrays
        closes = np.recarray((datalength,), dtype=[(symbol, 'float') for symbol in symbols])
        daily_ret = np.recarray((datalength - 1,), dtype=[(symbol, 'float') for symbol in symbols])
        average_returns = np.zeros(len(files))
        return_stdev = np.zeros(len(files))
        sharpe_ratios = np.zeros(len(files))
        cumulative_returns = np.recarray((datalength-1,), dtype=[(symbol, 'float') for symbol in symbols])

        # Here is the meat
        #TODO: data = dataobj.getData(ls_symbols)
        for i, symbol in enumerate(ls_symbols):
            # NOTE: `data` is expected to come from the dataobj.getData() call sketched in the TODO above
            if len(data) != datalength:
                continue
            print('Processing {}'.format(symbol))
            closes[symbols[i]] = data['close'][::-1]
            daily_ret[symbols[i]] = dailyReturns()
            # We now can compute:
            average_returns[i] = daily_ret[symbols[i]].mean()
            return_stdev[i] = daily_ret[symbols[i]].std()  # ndarray has .std(), not .stdev()
            sharpe_ratios[i] = (average_returns[i] / return_stdev[i]) * np.sqrt(datalength)   # compare to course
            print('\tavg: {}, stdev: {}, sharpe ratio: {}'.format(average_returns[i], return_stdev[i], sharpe_ratios[i]))

        sorted_sharpe_indices = np.argsort(sharpe_ratios)[::-1][0:top_n_equities]
        #TODO: return a dict as {symbol: sharpe_ratio}, or a df with all 3 components
        return sorted_sharpe_indices
Example No. 14
    def import_data(self):
        """Imports data from csv file as a numpy record array (aka
        structured array)"""

        self.data = np.recfromcsv(self.folder + self.file + '.csv')

        self.exh.T_in = self.data['hx_exh_in_t'] + 273.15  # K
        self.exh.T_out = self.data['hx_exh_out_t'] + 273.15  # K
        self.exh.mdot = self.data['exh_mdot_kgmin'] / 60.  # kg/s
        self.exh.deltaP = (
            self.data['hx_exh_delta_p_2_in_wc'] * 0.249 * 2.  # kPa
            )

        self.cool.T_in = (
            0.5 * (self.data['hx_cool_1_in_t'] +
            self.data['hx_cool_2_in_t']) + 273.15  # K
            )
        self.cool.T_out = (
            0.5 * (self.data['hx_cool_1_out_t'] +
            self.data['hx_cool_2_out_t']) + 273.15  # K
            )
        self.cool.Vdot = self.data['cool_vdot_gpm']

        self.exh.T_mean = 0.5 * (self.exh.T_in + self.exh.T_out)
        self.exh.deltaT = self.exh.T_in - self.exh.T_out
        self.exh.eta = self.exh.deltaT / (self.exh.T_in - self.cool.T_in)

        self.exh.c_p = np.zeros(self.exh.T_in.size)

        for i in range(self.exh.T_in.size):
            self.exh.T = self.exh.T_mean[i]
            self.exh.set_TempPres_dependents()
            self.exh.c_p[i] = self.exh.c_p_air

        self.exh.Qdot = self.exh.mdot * self.exh.c_p * self.exh.deltaT
Example No. 15
def fetch_abide(data_dir=None, verbose=0, **kwargs):
    """
    """
    exclude_ids = ['UM_1_0050289', 'Yale_0050571', 'KKI_0050822',
                   'SDSU_0050204', 'CMU_a_0050664']
    strategy = 'nofilt_noglobal'
    pipeline = 'cpac'

    dataset_name = 'ABIDE_pcp'
    csv = 'Phenotypic_V1_0b_preprocessed1.csv'

    kwargs['qc_rater_1'] = b'OK'
    kwargs['qc_anat_rater_2'] = [b'OK', b'maybe']
    kwargs['qc_func_rater_2'] = [b'OK', b'maybe']
    kwargs['qc_anat_rater_3'] = b'OK'
    kwargs['qc_func_rater_3'] = b'OK'

    path_csv = os.path.join(data_dir, dataset_name, csv)

    with open(path_csv, 'r') as pheno_f:
        pheno = ['i' + pheno_f.readline()]

        for line in pheno_f:
            pheno.append(re.sub(r',(?=[^"]*"(?:[^"]*"[^"]*")*[^"]*$)', ";", line))

    # bytes (encode()) needed for python 2/3 compat with numpy
    pheno = '\n'.join(pheno).encode()
    pheno = BytesIO(pheno)
    pheno = np.recfromcsv(pheno, comments='$', case_sensitive=True)

    # First, filter subjects with no filename
    pheno = pheno[pheno['FILE_ID'] != b'no_filename']
    # Apply user defined filters
    user_filter = datasets.utils._filter_columns(pheno, kwargs)
    pheno = pheno[user_filter]

    for id_ in exclude_ids:
        pheno = pheno[pheno['FILE_ID'] != id_]

    data_dir = os.path.join(data_dir, dataset_name, pipeline, strategy)

    results = {}
    file_ids = [file_id.decode() for file_id in pheno['FILE_ID']]

    ext = '.nii.gz'
    derivative = 'func_preproc'
    files = []

    for file_id in file_ids:
        file_ = (file_id + '_' + derivative + ext)
        check_file = os.path.join(data_dir, file_)
        if os.path.isfile(check_file):
            files.append(check_file)
        else:
            print("File is missing %s" % file_)

    results['phenotypic'] = pheno
    results[derivative] = files

    return Bunch(**results)
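A hedged usage sketch for the fetcher above; the data_dir below is a placeholder, and the ABIDE_pcp directory layout is assumed to already exist there:

abide = fetch_abide(data_dir='/data')   # hypothetical location containing ABIDE_pcp/
print(len(abide.phenotypic))            # phenotypic rows left after QC filtering
print(abide.func_preproc[:3])           # first few preprocessed functional images found on disk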
Example No. 16
def read_census_matrix(state):
    censusFileLocation = M_PATH + '\\DemographicQueries\\'
    fname = censusFileLocation + state + 'Query.txt'
    tra = np.loadtxt(fname, delimiter=",", skiprows = 1, dtype = str, usecols=[3], converters = {3:remove_b})
    uni = np.loadtxt(fname, delimiter=",", skiprows = 1, dtype = str, usecols=[2], converters = {2:remove_b})
    mydata = np.recfromcsv(fname, delimiter=',', usecols = demo_ranges(), filling_values=np.nan, case_sensitive=True, deletechars='', replace_space=' ')
    return mydata, tra, uni
Example No. 17
def q3a_pm(base_path, csv_fn):
    filename = "".join([base_path, csv_fn])
    names = ['genre', 'gender', 'movies', 'type', 'age']
    my_data = np.recfromcsv(filename, names=names)
    
    vhs = get_arr_for_col_del(my_data, 'type', names, "VHS")
    dvd = get_arr_for_col_del(my_data, 'type', names, "DVD")
    bluray = get_arr_for_col_del(my_data, 'type', names, "BLURAY")

    names.pop(3)
    
    vhs_f = get_arr_for_col_del(vhs, 'gender', names, "F")
    vhs_m = get_arr_for_col_del(vhs, 'gender', names, "M")

    dvd_f = get_arr_for_col_del(dvd, 'gender', names, "F")
    dvd_m = get_arr_for_col_del(dvd, 'gender', names, "M")

    bluray_f = get_arr_for_col_del(bluray, 'gender', names, "F")
    bluray_m = get_arr_for_col_del(bluray, 'gender', names, "M")

    plot_q3a(vhs_f, names, 'VHS copies', 'Movie distr. F')
    plot_q3a(vhs_m, names, 'VHS copies', 'Movie distr. M')

    plot_q3a(dvd_f, names, 'DVD copies', 'Movie distr. F')
    plot_q3a(dvd_m, names, 'DVD copies', 'Movie distr. M')

    plot_q3a(bluray_f, names, 'Bluray copies', 'Movie distr. F')
    plot_q3a(bluray_m, names, 'Bluray copies', 'Movie distr. M')

    return
Example No. 18
File: cfutils.py Project: satra/sad
def get_subject_data(subjects):
    """Returns contrast_files and behavioral data for a given list of subjects

    Parameters
    ----------

    subjects : list of strings (names of subjects)

    Returns
    -------

    confiles : list of strings (names of contrast files)
    pdata : recarray containing behavioral information for given subject order

    """
    # original input file with test scores etc for every subject
    pdata = np.recfromcsv(('/mindhive/gablab/sad/PY_STUDY_DIR/Block/volsurf/'
                           'l2output/social/split_halves/regression/'
                           'lsasDELTA/6mm/allsubs.csv'), names=True)

    sidx = []
    confiles = []
    for s in subjects:
        try:
            idx = np.nonzero(pdata.subject == s)[0][0]
        except IndexError:
            raise IndexError('subject %s not found' % s)
        sidx.append(idx)
        confile = glob(con_template % s)
        if not confile:
            raise ValueError('no confile found for subject %s' % s)
        confiles.extend(confile)
    return confiles, pdata  # return the values promised in the docstring
Example No. 19
    def test_loopBlocks(self):
        """An experiment file with made-up params and routines to see whether
        future versions of experiments will get loaded.
        """
        #load the test experiment (with a stims loop, trials loop and blocks loop)
        expfile = path.join(self.exp.prefsPaths['tests'], 'data', 'testLoopsBlocks.psyexp')
        self.exp.loadFromXML(expfile) # reload the edited file
        #alter the settings so the data goes to our tmp dir
        datafileBase = os.path.join(self.tmp_dir, 'testLoopsBlocks')
        datafileBaseRel = os.path.relpath(datafileBase,expfile)
        self.exp.settings.params['Data filename'].val = repr(datafileBaseRel)
        #write the script from the experiment
        script = self.exp.writeScript(expPath=expfile)
        py_file = os.path.join(self.tmp_dir, 'testLoopBlocks.py')

        # save it
        with codecs.open(py_file, 'w', 'utf-8-sig') as f:
            f.write(script.replace("core.quit()", "pass"))
            f.write("del thisExp\n") #garbage collect the experiment so files are auto-saved

        #run the file (and make sure we return to this location afterwards)
        wd = os.getcwd()
        exec(compile(open(py_file).read(), py_file, 'exec'))  # Python 3 equivalent of execfile()
        os.chdir(wd)
        #load the data
        print("searching..." +datafileBase)
        print(glob.glob(datafileBase+'*'))
        f = open(datafileBase+".csv", 'rb')
        dat = numpy.recfromcsv(f, case_sensitive=True)
        f.close()
        assert len(dat)==8 # because 4 'blocks' with 2 trials each (3 stims per trial)
Example No. 20
    def read_data(self, fname, infile_type="aci", delimiter=","):
        """ Reads in the A-Ci data if infile_type="aci" is true, otherwise this
        reads in the fitted results...

        For A-Ci data, code expects a format of:
        -> Curve, Tleaf, Ci, Photo, Species, Season, Leaf

        Parameters
        ----------
        fname : string
            input file name, expecting csv file.

        Returns:
        --------
        data : array
            numpy array containing the data
        """
        data = np.recfromcsv(fname, delimiter=delimiter, names=True,
                             case_sensitive=True)
        if infile_type == "norm":
            # using normalised temp data
            data["Tav"] = data["Tav"] + self.deg2kelvin
            data["Jnorm"] = np.exp(data["Jnorm"])
            data["Vnorm"] = np.exp(data["Vnorm"])
        elif infile_type == "meas":
            # using measured temp data.
            data["Tav"] = data["Tav"] + self.deg2kelvin
            data["Jmax"] = data["Jmax"]
            data["Vcmax"] = data["Vcmax"]
        elif infile_type != "aci":
            raise IOError("Unknown file type in read??\n")

        return data
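A short usage sketch, assuming `fitter` is an instance of the surrounding class and the file names are hypothetical but follow the column layout described in the docstring:

data = fitter.read_data("aci_curves.csv", infile_type="aci")         # raw A-Ci data
print(data.dtype.names)                                              # Curve, Tleaf, Ci, Photo, ...
norm = fitter.read_data("fits_normalised.csv", infile_type="norm")   # fitted, normalised results (Tav shifted to K)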
Example No. 21
def read_datafile(datafile, header_skip, footer_skip, delim, string):
	path = os.path.join(APP_STATIC,datafile)
	data = np.recfromcsv(path, skip_header=header_skip, skip_footer=footer_skip, autostrip=True, delimiter=delim)
	data = np.array([list(row) for row in data])  # convert the record array into a plain 2-D array (a bare map() breaks on Python 3)
	extra_col = make_column(string, data.shape[0])
	data = np.hstack((data, extra_col))
	return data
Example No. 22
def add_tan_pix_coordinates(in_file_name, out_file_name):
    """Compute x, y nominal coordinates from alt, az
    and add as columns to a CSV file"""
    logging.info('Reading file: {0}'.format(in_file_name))
    data = np.recfromcsv(in_file_name)
    az = data['azevent']
    alt = data['altevent']
    alts = data['altsystem']
    azs = data['azsystem']
    infile = open(in_file_name)

    logging.info('Writing file: {0}'.format(out_file_name))
    outfile = open(out_file_name, 'w')
    names = infile.readline().split()
    names.append(',Nomx,Nomy\n')
    line = ' '.join(names)
    line = line.replace(' ', '')
    outfile.write(line)

    for ii in np.arange(0, len(alts) - 1, 1):
        noms = tan_world_to_pix(az[ii], alt[ii], azs[ii], alts[ii])
        values = infile.readline().split()
        values.append(',%s,%s\n' % (str(noms[0]), str(noms[1])))
        line = ' '.join(values)
        line = line.replace(' ', '')

        outfile.write(line)

    infile.close()
    outfile.close()
Example No. 23
def main():
    filename = '../../dataset/sea_dataset/normalized_sea.csv'
    data = np.recfromcsv(filename)
    data_tuplelist = data.tolist()
    data_list = [list(i) for i in data_tuplelist]

    nop = 100
    nod = np.shape(data_list)[1]
    print(nod)
    sigmai = [0.1] * nod
    chunk_size = 50

    old_index = np.random.normal(loc=0, scale=math.pow(sigmai[1], 1), size=(nop, nod))
    old_param = np.random.normal(loc=0, scale=sigmai[1], size=(1, nod))
    # print old_param
    #print old_index
    chunk_accuracy_list = []
    for i in range(0, 60000, chunk_size):
        print(i)
        chunk_data = data_list[i:i + chunk_size]
        chunk_data = [[1] + x for x in chunk_data]

        [chunk_params, current_parameters] = compute_chunk_n(chunk_data, nop, sigmai, old_param, old_index)
        #print chunk_params
        #print 'gg'
        #print current_parameters
        old_param = [chunk_params]
        old_index = current_parameters
        #print old_param
        #print current_parameters
        #print chunk_params
        #print chunk_params
        chunk_accuracy_list.append(compute_accuracy(chunk_data, chunk_params))
    plot_accuracy(chunk_accuracy_list)
Example No. 24
    def compute_features(self, task, cpu_seconds = None):
        """Read or compute features of an instance."""

        # grab precomputed feature data
        csv_path = task + ".features.csv"

        assert os.path.exists(csv_path)

        features_array = numpy.recfromcsv(csv_path)
        features = features_array.tolist()
        names = features_array.dtype.names

        # accumulate their cost
        assert names[0] == "cpu_cost"

        cpu_cost = features[0]

        borg.get_accountant().charge_cpu(cpu_cost)

        # handle timeout logic, and we're done
        if cpu_seconds is not None:
            if cpu_cost >= cpu_seconds:
                return (["cpu_cost"], [cpu_seconds])
            else:
                assert len(names) > 1

        return (names, features)
Example No. 25
    def yield_runs():
        suite = borg.load_solvers(suite_path)

        logger.info("scanning paths under %s", tasks_root)

        paths = list(borg.util.files_under(tasks_root, suite.domain.extensions))

        if not paths:
            raise ValueError("no paths found under specified root")

        if only_solver is None:
            solver_names = suite.solvers.keys()
        else:
            solver_names = [only_solver]

        for path in paths:
            run_data = None

            if only_missing and os.path.exists(path + suffix):
                run_data = numpy.recfromcsv(path + suffix, usemask=True)

            for solver_name in solver_names:
                if only_missing and run_data is not None:
                    count = max(0, runs - numpy.sum(run_data.solver == solver_name))
                else:
                    count = runs

                logger.info("scheduling %i run(s) of %s on %s", count, solver_name, os.path.basename(path))

                for _ in xrange(count):
                    seed = numpy.random.randint(2 ** 31 - 1)  # sys.maxint no longer exists on Python 3; any large bound works for a seed

                    yield (run_solver_on, [suite_path, solver_name, path, budget, store_answers, seed])
Example No. 26
def fetch_coords_dosenbach_2010():
    """Load the Dosenbach et al. 160 ROIs. These ROIs cover
    much of the cerebral cortex and cerebellum and are assigned to 6
    networks.

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, contains:
        - "rois": coordinates of 160 ROIs in MNI space
        - "labels": ROIs labels
        - "networks": networks names

    References
    ----------
    Dosenbach N.U., Nardos B., et al. "Prediction of individual brain maturity
    using fMRI.", 2010, Science 329, 1358-1361.
    """
    dataset_name = 'dosenbach_2010'
    fdescr = _get_dataset_descr(dataset_name)
    package_directory = os.path.dirname(os.path.abspath(__file__))
    csv = os.path.join(package_directory, "data", "dosenbach_2010.csv")
    out_csv = np.recfromcsv(csv)

    # We add the ROI number to its name, since names are not unique
    names = out_csv['name']
    numbers = out_csv['number']
    labels = np.array(['{0} {1}'.format(name, number) for (name, number) in
                       zip(names, numbers)])
    params = dict(rois=out_csv[['x', 'y', 'z']],
                  labels=labels,
                  networks=out_csv['network'], description=fdescr)

    return Bunch(**params)
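A short sketch of how the returned Bunch is typically used, based only on the fields listed in the docstring above:

dosenbach = fetch_coords_dosenbach_2010()
print(len(dosenbach.rois))        # 160 (x, y, z) coordinates in MNI space
print(dosenbach.labels[:5])       # ROI names with their number appended to keep them unique
print(set(dosenbach.networks))    # the 6 network names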
Example No. 27
def get_catalog(catalog_file):
    """
    Load the catalog from the provided file
    
    :param catalog_file: a comma-separated catalog file
    :return: an instance of the Catalog class
    
    """

    # Read them all as strings at first

    dtypes = [(name, 'S100') for name in columns_formats.keys()]  # a list, not a lazy map, so numpy can use it as a dtype

    data_ = numpy.recfromcsv(catalog_file, names=True, case_sensitive=True, dtype=dtypes)

    # Convert to the proper python format

    data_dict = collections.OrderedDict()

    for col in columns_formats:

        converter = columns_formats[col][0]
        null_value = columns_formats[col][1]

        this_data = data_[col]

        idx = (this_data == "NULL")

        this_data[idx] = null_value

        data_dict[col] = numpy.array(this_data, converter)

    # Convert to numpy.ndarray

    formats = tuple(map(lambda x:x.dtype.str, data_dict.values()))

    data = numpy.zeros(list(data_dict.values())[0].shape[0],
                       dtype={'names': list(data_dict.keys()), 'formats': formats})

    for col in data_dict:

        data[col][:] = data_dict[col]

    # data = numpy.recfromtxt(catalog_file, delimiter=',', names=True,
    #                         dtype=[('Trigger_name', 'S12'),
    #                                ('Trigger_date', 'S23'),
    #                                ('Trigger_time','<f8'),
    #                                ('GCN_type','S35'),
    #                                ('Time_scale', '<f8'),
    #                                ('Final_TS', '<f8'),
    #                                ('Output_RA', '<f8'),
    #                                ('Output_Dec', '<f8'),
    #                                ('Localization_error', '<f8'),
    #                                ('Closest_point_source', 'S43'),
    #                                ('Angular_distance', '<f8'),
    #                                ('Photon_index', '<f8'),
    #                                ('Photon_index_error', '<f8'),
    #                                ('GRB_events', '<i8')])

    return Catalog(data)
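A usage sketch (the file name is hypothetical; the function relies on the module-level columns_formats mapping and Catalog class used above):

catalog = get_catalog("triggers.csv")   # hypothetical comma-separated catalog
# catalog wraps a structured array with one properly typed column per entry in
# columns_formats; "NULL" cells have been replaced by the configured defaults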
Example No. 28
def stations_json():

    stations = np.recfromcsv('chi-stations.csv', delimiter=',')

    output = {'type': "FeatureCollection", 'features':[]}

    for s in stations:

        output['features'].append({
            'type': "Feature",
            "id": np.asscalar(s[0]),
            "geometry": {
                "type":"Point",
                "coordinates":[np.asscalar(s[2]),np.asscalar(s[1])] #long, lat
            },
            "geometry_name": "origin_geom",
            "properties": {
                'name': s[3]
            }})

    f = io.open('chi-stations.json', 'w', encoding='utf-8') 
    f.write(json.dumps(output, ensure_ascii=False))  # unicode() wrapper is unnecessary on Python 3
    f.close()

    json_output=open('chi-stations.json')
    output_data = json.load(json_output)
    pprint(output_data)
    json_output.close()
def graph():
    # parse(MY_FILE, ",")
    data = np.recfromcsv('../data/crabs.csv')
    trans = []
    itrans = []
    x = []
    i = 1
    for row in data:
        trans.append(row['trans'])
        itrans.append(row['itrans'])
        x.append(i)
        i += 1
    # create the figure
    fig = plt.figure(figsize=(7, 3))
    # create a grid of 1 row and 1 column for the plot
    # gs = mpl.gridspec.GridSpec(1, 1)
    # put a plot in the first row, first column
    # ax = fig.add_subplots(gs[0])


    plt.title('transVSitrans')
    plt.plot(x,trans,color='red')
    plt.plot(x, itrans, color='blue')

    fig.savefig('transVSitrans.png')
Example No. 30
def show_predictions(alpha="alpha", symbol="GE", xtn=".PNG"):
    if type(alpha) == str:
        print ("Loading file named " + alpha + ".mat")
        a = mat.loadmat(
            alpha + ".mat", mat_dtype=False
        )  # load a matlab style set of matrices from the file named by the string alpha
        if alpha in a:  # dict.has_key() is gone on Python 3
            alpha = a.get(alpha).reshape(-1)  # get the variable with the name of the string in alpha
        else:
            alpha = a.get(list(a.keys())[2]).reshape(-1)  # get the first non-hidden key and reshape into a 1-D array
    print ("Loading financial data for stock symbol", symbol)
    r = np.recfromcsv("/home/hobs/Desktop/References/quant/lyle/data/" + symbol + "_yahoo.csv", skiprows=1)
    r.sort()
    r.high = r.high * r.adj_close / r.close  # adjust the high and low prices for stock splits
    r.low = r.low * r.adj_close / r.close  # adjust the high and low prices for stock splits
    daily_returns = r.adj_close[1:] / r.adj_close[0:-1] - 1
    predictions = lfilt(alpha, daily_returns)
    print (
        "Plotting a scatter plot of",
        len(daily_returns),
        "returns vs",
        len(predictions),
        "predictions using a filter of length",
        len(alpha),
    )
    (ax, fig) = plot(predictions, daily_returns[len(alpha) :], s="bo", xtn=".PNG")
    ax.set_xlabel("Predicted Returns")
    ax.set_ylabel("Actual Returns")
    big_mask = np.abs(predictions) > np.std(predictions) * 1.2
    bigs = predictions[big_mask]
    true_bigs = daily_returns[big_mask]
    (ax, fig) = plot(bigs, true_bigs, s="r.", xtn=".PNG")
    fig.show()
    return (predictions, daily_returns, bigs, true_bigs, big_mask)
with the highest values.

"""

##############################################################################
# Retrieve the atlas and the data
from nilearn import datasets
atlas = datasets.fetch_atlas_msdl()
atlas_filename = atlas['maps']

# Load the labels
import numpy as np
csv_filename = atlas['labels']

# The recfromcsv function can load a csv file
labels = np.recfromcsv(csv_filename)
names = labels['name']

data = datasets.fetch_adhd(n_subjects=1)

# print basic information on the dataset
print('First subject functional nifti images (4D) are at: %s' %
      data.func[0])  # 4D data

##############################################################################
# Extract time series
from nilearn.input_data import NiftiMapsMasker
masker = NiftiMapsMasker(maps_img=atlas_filename,
                         standardize=True,
                         memory='nilearn_cache',
                         verbose=5)
Example No. 32
def fetch_lemur_mircen_2019_t2(subjects=[0],
                               data_dir=None,
                               url=None,
                               resume=True,
                               verbose=1):
    """Download and loads the mouse lemur template dataset.

    Parameters
    ----------
    subjects : sequence of int or None, optional
        IDs of subjects to load; defaults to loading one subject.

    data_dir : string, optional
        Path of the data directory. Used to force data storage in a specified
        location. Default: None

    resume : bool, optional (default True)
        If true, try resuming download if possible.

    verbose : int, optional (default 0)
        Defines the level of verbosity of the output.

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        Dictionary-like object, the interest attributes are :

        - 'anat': string list. Paths to T2-weighted images.
        - 'phenotypic': Participants genders, birth dates and MRI scan dates

    References
    ----------
    :Download:
        https://openneuro.org/datasets/ds001945/versions/1.0.0/download

    :Reference:
        `A 3D population-based brain atlas of the mouse lemur primate with
        examples of applications in aging studies and comparative anatomy.
        <https://doi.org/10.1016/j.neuroimage.2018.10.010>`_
        Neuroimage 185 (2019): 85-95.
        N. A. Nadkarni, S. Bougacha, C. Garin, M. Dhenain, and J. L. Picq.

    """
    if url is None:
        url = 'https://openneuro.org/crn/datasets/ds001945/snapshots/1.0.0/files'

    dataset_name = 'mircen2019_t2'
    data_dir = _get_dataset_dir(dataset_name,
                                data_dir=data_dir,
                                verbose=verbose)

    # Check arguments
    max_subjects = 34
    if max(subjects) > max_subjects:
        warnings.warn(
            'Warning: there are only {0} subjects'.format(max_subjects))
        subjects = range(max_subjects)

    subject_ids = np.array(['sub-{0:02d}'.format(i) for i in range(1, 35)])
    subject_ids = subject_ids[subjects]

    # Generate the list of urls
    json_urls = [
        os.path.join(url, '{0}:anat:{0}_T2w.json'.format(subject_id))
        for subject_id in subject_ids
    ]
    anat_urls = [
        os.path.join(url, '{0}:anat:{0}_T2w.nii.gz'.format(subject_id))
        for subject_id in subject_ids
    ]

    # Generate the list of target files
    anat_basenames = [
        '{0}_anat_{0}_T2w.nii.gz'.format(subject_id)
        for subject_id in subject_ids
    ]
    anat_files = [
        os.path.join(animal_dir, anat_basename)
        for (animal_dir, anat_basename) in zip(subject_ids, anat_basenames)
    ]

    json_basenames = [
        '{0}_anat_{0}_T2w.json'.format(subject_id)
        for subject_id in subject_ids
    ]
    json_files = [
        os.path.join(animal_dir, json_basename)
        for (animal_dir, json_basename) in zip(subject_ids, json_basenames)
    ]

    # Call fetch_files once per subject.
    anat = []
    json = []
    for anat_u, anat_f, json_u, json_f in zip(anat_urls, anat_files, json_urls,
                                              json_files):
        a, j = _fetch_files(data_dir, [(anat_f, anat_u, {
            'move': anat_f
        }), (json_f, json_u, {
            'move': json_f
        })],
                            verbose=verbose)
        json.append(j)
        anat.append(a)

    pheno_url = os.path.join(url, 'lemur_atlas_list_t2_bids.csv')
    pheno_file = _fetch_file(pheno_url, data_dir, verbose=verbose)
    phenotypic = np.recfromcsv(
        pheno_file,
        delimiter='\t',
        skip_header=True,
        names=['animal_id', 'gender', 'birthdate', 'mri_date'],
        dtype=['U8', 'U3', 'datetime64[D]', 'datetime64[D]'],
        converters={
            2: _parse_date,
            3: _parse_date
        },
        encoding='U8')
    phenotypic = phenotypic[[
        np.where(phenotypic['animal_id'] == '"' + i + '"')[0][0]
        for i in subject_ids
    ]]
    fdescr = _get_dataset_descr(dataset_name)

    return Bunch(anat=anat, pheno=phenotypic, description=fdescr)
Example No. 33
Created on Sat Dec 14 17:23:25 2013

Author: Josef Perktold
"""
from __future__ import print_function
import os
import numpy as np
from statsmodels.sandbox.nonparametric import kernels

from numpy.testing import assert_allclose, assert_array_less

DEBUG = 0

curdir = os.path.dirname(os.path.abspath(__file__))
fname = 'results/results_kernel_regression.csv'
results = np.recfromcsv(os.path.join(curdir, fname))

y = results['accident']
x = results['service']
positive = x >= 0
x = np.log(x[positive])
y = y[positive]
xg = np.linspace(x.min(), x.max(), 40) # grid points default in Stata

#kern_name = 'gau'
#kern = kernels.Gaussian()
#kern_name = 'epan2'
#kern = kernels.Epanechnikov()
#kern_name = 'rec'
#kern = kernels.Uniform()  # ours looks awful
#kern_name = 'tri'
Example No. 34
def load_data_file(name, skip_header=None):
    fname = os.path.join(os.path.dirname(__file__), 'data', name)
    return np.recfromcsv(fname, skip_header=skip_header,
                         case_sensitive=True).view(np.recarray)
Example No. 35
from matplotlib.backends import backend_pdf
from nilearn import plotting
from nilearn import image
from nilearn.masking import apply_mask
from nilearn.image import math_img
from soma import aims
from copy import deepcopy
import numpy as np
import os
import csv
import glob


### initializes paths and set parameters
atlas_template = '/neurospin/grip/protocols/MRI/Resting_state_Victor_2014/atlases/atlas_fonctionel_control_AVCnn/AVCnn.nii'  #reference atlas on which all individual atlases will be based
labels = np.recfromcsv('/neurospin/grip/protocols/MRI/Resting_state_Victor_2014/atlases/atlas_fonctionel_control_AVCnn/AVCnnlabels.csv')#reference roi labels on which all individual atlases will be based
labels_names_ref = labels['name'].T
root = '/media/vd239549/LaCie/victor/AVCnn/AVCnn_2016_DARTEL/patients'
subjects = open('/neurospin/grip/protocols/MRI/Resting_state_Victor_2014/ressources_txt/AVCnn_patients.txt','r').read().split()
#AVCnn_cont_all.txt
#AVCnn_patients.txt
basename = 'func_atlas'
func_type= 'RS1'
save_report = '/media/vd239549/LaCie/victor/AVCnn/AVCnn_2016_DARTEL/patients/docs/Atlas/atlas_func_report_patients.pdf'
at_dir = '/neurospin/grip/protocols/MRI/Resting_state_Victor_2014/AVCnn/resultats/atlas_indiv_func' #dir where to save the atlases 
roi_thresh = 0.5 #Threshold of voxel ratio for roi to be considered 
#if number of voxel in individual brain mask and roi of reference atlas / number of voxel in roi of reference atlas < threshold then roi will be empty
brain_mask_thresh = 0.1 #Threshold for mask binarization (times average intensity of mask : 0.1 means 10% of average mask intensity)

#gathering data from reference atlas
template_img = nb.load(atlas_template)  # image object (`nb` is the usual nibabel alias, imported elsewhere)
plt.scatter(data_float[:, 0], data_float[:, 1])
plt.xlabel('time (min.)')
plt.ylabel('percentage of larvae')
plt.show()

############################################
data = np.genfromtxt('titanic.csv', delimiter=',', names=True, dtype=None)
np.shape(data)
print(data["Survived"])

################################################
# Assign the filename: file
file = 'titanic.csv'

# Import file using np.recfromcsv: d
d = np.recfromcsv(file)

# Print out first three entries of d
print(d[:3])

###############################################
# Import pandas as pd
import pandas as pd

# Assign the filename: file
file = 'titanic.csv'

# Read the file into a DataFrame: df
df = pd.read_csv(file)

# View the head of the DataFrame
Example No. 37
from nilearn.image import smooth_img
from nilearn.mass_univariate import permuted_ols

FWHM = 5

### Gather data
# images
path_to_images = "/home/virgile/wip/retreat/pypreprocess_output"
images = sorted(
    glob.glob(os.path.join(path_to_images, "OAS1_*_MR1/mwc2OAS1_*dimbet.nii")))
#images = images[:39]  # disc1 only

n_samples = len(images)
# explanatory variates
path_to_csv = "/home/virgile/wip/retreat/oasis/oasis_cross-sectional.csv"
ext_vars = np.recfromcsv(path_to_csv)[:n_samples]
age = ext_vars['age'].astype(float).reshape((-1, 1))

# filter elderly subjects (= subjects with available Alzheimer info)
elderly_subjects_ids = np.where(~np.isnan(ext_vars['cdr']))[0]
images = [x for (i, x) in enumerate(images) if i in elderly_subjects_ids]
age = age[elderly_subjects_ids]
cdr = LabelBinarizer().fit_transform(LabelEncoder().fit_transform(
    ext_vars['cdr'][elderly_subjects_ids]))
cdr = cdr[:, -1].reshape((-1, 1))  # build impairment variate

### Mask data
print "Resample images"
nifti_masker = NiftiMasker(smoothing_fwhm=FWHM,
                           memory='nilearn_cache',
                           memory_level=1)  # cache options
Example No. 38
def main():
    """Main function that is called when TPOT is run on the command line"""
    parser = argparse.ArgumentParser(
        description='A Python tool that '
        'automatically creates and optimizes machine learning pipelines using '
        'genetic programming.',
        add_help=False)

    parser.add_argument(
        'INPUT_FILE',
        type=str,
        help='Data file to use in the TPOT '
        'optimization process. Ensure that the class label column is labeled as "class".'
    )

    parser.add_argument('-h',
                        '--help',
                        action='help',
                        help='Show this help message and exit.')

    parser.add_argument(
        '-is',
        action='store',
        dest='INPUT_SEPARATOR',
        default='\t',
        type=str,
        help='Character used to separate columns in the input file.')

    parser.add_argument('-target',
                        action='store',
                        dest='TARGET_NAME',
                        default='class',
                        type=str,
                        help='Name of the target column in the input file.')

    parser.add_argument(
        '-mode',
        action='store',
        dest='TPOT_MODE',
        choices=['classification', 'regression'],
        default='classification',
        type=str,
        help=
        'Whether TPOT is being used for a supervised classification or regression problem.'
    )

    parser.add_argument(
        '-o',
        action='store',
        dest='OUTPUT_FILE',
        default='',
        type=str,
        help='File to export the code for the final optimized pipeline.')

    parser.add_argument(
        '-g',
        action='store',
        dest='GENERATIONS',
        default=100,
        type=positive_integer,
        help='Number of iterations to run the pipeline optimization process.\n'
        'Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. '
        'TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.'
    )

    parser.add_argument(
        '-p',
        action='store',
        dest='POPULATION_SIZE',
        default=100,
        type=positive_integer,
        help=
        'Number of individuals to retain in the GP population every generation.\n'
        'Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. '
        'TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.'
    )

    parser.add_argument(
        '-os',
        action='store',
        dest='OFFSPRING_SIZE',
        default=None,
        type=positive_integer,
        help='Number of offspring to produce in each GP generation. '
        'By default, OFFSPRING_SIZE = POPULATION_SIZE.')

    parser.add_argument(
        '-mr',
        action='store',
        dest='MUTATION_RATE',
        default=0.9,
        type=float_range,
        help='GP mutation rate in the range [0.0, 1.0]. This tells the '
        'GP algorithm how many pipelines to apply random changes to every generation. '
        'We recommend using the default parameter unless you understand how the mutation '
        'rate affects GP algorithms.')

    parser.add_argument(
        '-xr',
        action='store',
        dest='CROSSOVER_RATE',
        default=0.1,
        type=float_range,
        help='GP crossover rate in the range [0.0, 1.0]. This tells the '
        'GP algorithm how many pipelines to "breed" every generation. '
        'We recommend using the default parameter unless you understand how the crossover '
        'rate affects GP algorithms.')

    parser.add_argument(
        '-scoring',
        action='store',
        dest='SCORING_FN',
        default=None,
        type=str,
        help='Function used to evaluate the quality of a given pipeline for '
        'the problem. By default, accuracy is used for classification problems and mean '
        'squared error (mse) is used for regression problems. '
        'TPOT assumes that any function with "error" or "loss" in the name is meant to '
        'be minimized, whereas any other functions will be maximized. '
        'Offers the same options as cross_val_score: '
        '"accuracy", "adjusted_rand_score", "average_precision", "f1", "f1_macro", '
        '"f1_micro", "f1_samples", "f1_weighted", "log_loss", "mean_absolute_error", '
        '"mean_squared_error", "median_absolute_error", "precision", "precision_macro", '
        '"precision_micro", "precision_samples", "precision_weighted", "r2", "recall", '
        '"recall_macro", "recall_micro", "recall_samples", "recall_weighted", "roc_auc"'
    )

    parser.add_argument(
        '-cv',
        action='store',
        dest='NUM_CV_FOLDS',
        default=5,
        type=int,
        help='Number of folds to evaluate each pipeline over in '
        'k-fold cross-validation during the TPOT optimization process.')

    parser.add_argument(
        '-njobs',
        action='store',
        dest='NUM_JOBS',
        default=1,
        type=int,
        help='Number of CPUs for evaluating pipelines in parallel '
        ' during the TPOT optimization process. Assigning this to -1 will use as many '
        'cores as available on the computer.')

    parser.add_argument(
        '-maxtime',
        action='store',
        dest='MAX_TIME_MINS',
        default=None,
        type=int,
        help='How many minutes TPOT has to optimize the pipeline. This '
        'setting will override the GENERATIONS parameter '
        'and allow TPOT to run until it runs out of time.')

    parser.add_argument(
        '-maxeval',
        action='store',
        dest='MAX_EVAL_MINS',
        default=5,
        type=float,
        help='How many minutes TPOT has to evaluate a single pipeline. '
        'Setting this parameter to higher values will allow TPOT to explore more complex '
        'pipelines but will also allow TPOT to run longer.')

    parser.add_argument(
        '-s',
        action='store',
        dest='RANDOM_STATE',
        default=None,
        type=int,
        help='Random number generator seed for reproducibility. Set '
        'this seed if you want your TPOT run to be reproducible with the same '
        'seed and data set in the future.')

    parser.add_argument(
        '-config',
        action='store',
        dest='CONFIG_FILE',
        default='',
        type=str,
        help='Configuration file for customizing the operators and parameters '
        'that TPOT uses in the optimization process.')

    parser.add_argument(
        '-v',
        action='store',
        dest='VERBOSITY',
        default=1,
        choices=[0, 1, 2, 3],
        type=int,
        help='How much information TPOT communicates '
        'while it is running: 0 = none, 1 = minimal, 2 = high, 3 = all. '
        'A setting of 2 or higher will add a progress bar during the optimization procedure.'
    )

    parser.add_argument(
        '--no-update-check',
        action='store_true',
        dest='DISABLE_UPDATE_CHECK',
        default=False,
        help=
        'Flag indicating whether the TPOT version checker should be disabled.')

    parser.add_argument('--version',
                        action='version',
                        version='TPOT {version}'.format(version=__version__),
                        help='Show the TPOT version number and exit.')

    args = parser.parse_args()

    if args.VERBOSITY >= 2:
        print('\nTPOT settings:')
        for arg in sorted(args.__dict__):
            arg_val = args.__dict__[arg]
            if arg == 'DISABLE_UPDATE_CHECK':
                continue
            elif arg == 'SCORING_FN' and arg_val is None:
                if args.TPOT_MODE == 'classification':
                    arg_val = 'accuracy'
                else:
                    arg_val = 'mean_squared_error'
            elif arg == 'OFFSPRING_SIZE' and arg_val is None:
                arg_val = args.__dict__['POPULATION_SIZE']
            print('{}\t=\t{}'.format(arg, arg_val))
        print('')

    input_data = np.recfromcsv(args.INPUT_FILE,
                               delimiter=args.INPUT_SEPARATOR,
                               dtype=np.float64,
                               case_sensitive=True)
    if args.TARGET_NAME not in input_data.dtype.names:
        raise ValueError(
            'The provided data file does not seem to have a target column. '
            'Please make sure to specify the target column using the -target parameter.'
        )

    features = np.delete(input_data.view(np.float64).reshape(
        input_data.size, -1),
                         input_data.dtype.names.index(args.TARGET_NAME),
                         axis=1)

    training_features, testing_features, training_classes, testing_classes = \
        train_test_split(features, input_data[args.TARGET_NAME], random_state=args.RANDOM_STATE)

    if args.TPOT_MODE == 'classification':
        tpot_type = TPOTClassifier
    else:
        tpot_type = TPOTRegressor

    operator_dict = None
    if args.CONFIG_FILE:
        try:
            with open(args.CONFIG_FILE, 'r') as input_file:
                file_string = input_file.read()
            operator_dict = eval(
                file_string[file_string.find('{'):(file_string.rfind('}') +
                                                   1)])
        except:
            raise TypeError(
                'The operator configuration file is in a bad format or not available. '
                'Please check the configuration file before running TPOT.')

    tpot = tpot_type(generations=args.GENERATIONS,
                     population_size=args.POPULATION_SIZE,
                     offspring_size=args.OFFSPRING_SIZE,
                     mutation_rate=args.MUTATION_RATE,
                     crossover_rate=args.CROSSOVER_RATE,
                     cv=args.NUM_CV_FOLDS,
                     n_jobs=args.NUM_JOBS,
                     scoring=args.SCORING_FN,
                     max_time_mins=args.MAX_TIME_MINS,
                     max_eval_time_mins=args.MAX_EVAL_MINS,
                     random_state=args.RANDOM_STATE,
                     config_dict=operator_dict,
                     verbosity=args.VERBOSITY,
                     disable_update_check=args.DISABLE_UPDATE_CHECK)

    print('')

    tpot.fit(training_features, training_classes)

    if args.VERBOSITY in [1, 2] and tpot._optimized_pipeline:
        training_score = max([
            tpot._pareto_front.keys[x].wvalues[1]
            for x in range(len(tpot._pareto_front.keys))
        ])
        print('\nTraining score: {}'.format(abs(training_score)))
        print('Holdout score: {}'.format(
            tpot.score(testing_features, testing_classes)))

    elif args.VERBOSITY >= 3 and tpot._pareto_front:
        print('Final Pareto front testing scores:')

        for pipeline, pipeline_scores in zip(tpot._pareto_front.items,
                                             reversed(
                                                 tpot._pareto_front.keys)):
            tpot._fitted_pipeline = tpot._pareto_front_fitted_pipelines[str(
                pipeline)]
            print('{}\t{}\t{}'.format(
                int(abs(pipeline_scores.wvalues[0])),
                tpot.score(testing_features, testing_classes), pipeline))

    if args.OUTPUT_FILE != '':
        tpot.export(args.OUTPUT_FILE)
Example No. 39
def fetch_atlas_msdl(data_dir=None, url=None, resume=True, verbose=1):
    """Download and load the MSDL brain atlas.

    Parameters
    ----------
    data_dir: string, optional
        Path of the data directory. Used to force data storage in a specified
        location. Default: None

    url: string, optional
        Override download URL. Used for test only (or if you setup a mirror of
        the data).

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        Dictionary-like object, the interest attributes are :

        - 'maps': str, path to nifti file containing regions definition.
        - 'labels': string list containing the labels of the regions.
        - 'region_coords': tuple list (x, y, z) containing coordinates
          of each region in MNI space.
        - 'networks': string list containing names of the networks.
        - 'description': description about the atlas.


    References
    ----------
    :Download:
        https://team.inria.fr/parietal/files/2015/01/MSDL_rois.zip

    :Paper to cite:
        `Multi-subject dictionary learning to segment an atlas of brain
        spontaneous activity <http://hal.inria.fr/inria-00588898/en>`_
        Gael Varoquaux, Alexandre Gramfort, Fabian Pedregosa, Vincent Michel,
        Bertrand Thirion. Information Processing in Medical Imaging, 2011,
        pp. 562-573, Lecture Notes in Computer Science.

    :Other references:
        `Learning and comparing functional connectomes across subjects
        <http://hal.inria.fr/hal-00812911/en>`_.
        Gael Varoquaux, R.C. Craddock NeuroImage, 2013.

    """
    url = 'https://team.inria.fr/parietal/files/2015/01/MSDL_rois.zip'
    opts = {'uncompress': True}

    dataset_name = "msdl_atlas"
    files = [(os.path.join('MSDL_rois', 'msdl_rois_labels.csv'), url, opts),
             (os.path.join('MSDL_rois', 'msdl_rois.nii'), url, opts)]

    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    files = _fetch_files(data_dir, files, resume=resume, verbose=verbose)
    csv_data = np.recfromcsv(files[0])
    labels = [name.strip() for name in csv_data['name'].tolist()]
    labels = [label.decode("utf-8") for label in labels]
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', module='numpy',
                                category=FutureWarning)
        region_coords = csv_data[['x', 'y', 'z']].tolist()
    net_names = [net_name.strip() for net_name in csv_data['net_name'].tolist()]
    fdescr = _get_dataset_descr(dataset_name)

    return Bunch(maps=files[1], labels=labels, region_coords=region_coords,
                 networks=net_names, description=fdescr)
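
# A minimal usage sketch (an addition, assuming nilearn is installed; the
# public nilearn fetcher exposes the same Bunch fields documented above):
from nilearn import datasets

msdl = datasets.fetch_atlas_msdl()
print(msdl.maps)               # path to the NIfTI file with the region maps
print(msdl.labels[:5])         # first few region labels
print(msdl.region_coords[0])   # (x, y, z) MNI coordinates of the first region
print(msdl.networks[:5])       # network name associated with each region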
# Example
file = 'seaslug.txt'
data = np.loadtxt(file, delimiter='\t', dtype=str)
print(data[0])
data_float = np.loadtxt(file, delimiter='\t', dtype=float, skiprows=1)
print(data_float[9])
plt.scatter(data_float[:, 0], data_float[:, 1])
plt.xlabel('time (min.)')
plt.ylabel('percentage of larvae')
plt.show()
# use 'np.genfromtxt' to deal with complex data
data = np.genfromtxt('titanic_sub.csv', delimiter=',', names=True, dtype=None)
# np.recfromcsv() behaves similarly to np.genfromtxt(), except that its default dtype is None
# it has the defaults delimiter=',' and names=True in addition to dtype=None
d = np.recfromcsv('titanic_sub.csv')
print(d[:3])

# Importing Flat Files using pandas
# df.read_csv() / df.read_table()
import pandas as pd
filename = 'titanic_sub.csv'
data = pd.read_csv(filename, nrows = 10, header = None)
data.head()     # check the first five rows
data_array = data.values    # convert into a numpy array
# Example
import matplotlib.pyplot as plt
file = 'titanic_sub.csv'
data = pd.read_csv(file, sep=',', comment='#', na_values='Nothing')
print(data.head())
pd.DataFrame.hist(data[['Age']])
data_float = np.loadtxt(file, delimiter="\t", dtype=float, skiprows=1)

# Print the 10th element of data_float
print(data_float[9])

# Plot a scatterplot of the data
plt.scatter(data_float[:, 0], data_float[:, 1])
plt.xlabel('time (min.)')
plt.ylabel('percentage of larvae')
plt.show()

# Assign the filename: file
file = 'titanic.csv'

# Import file using np.recfromcsv: d
d = np.recfromcsv(file,delimiter=",", names=True,dtype=None)

# Print out first three entries of d
print(d[:3])

# Import pandas as pd
import pandas as pd

# Assign the filename: file
file = 'titanic.csv'

# Read the file into a DataFrame: df
df = pd.read_csv(file)

# View the head of the DataFrame
print(df.head())
Ejemplo n.º 42
0
"""
Different classifiers in decoding the Haxby dataset
=====================================================

Here we compare different classifiers on a visual object recognition
decoding task.
"""

import time

### Fetch data using nilearn dataset fetcher ################################
from nilearn import datasets
data_files = datasets.fetch_haxby(n_subjects=1)

# load labels
import numpy as np
labels = np.recfromcsv(data_files.session_target[0], delimiter=" ")
stimuli = labels['labels']
# identify resting state labels in order to be able to remove them
resting_state = stimuli == "rest"

# find names of remaining active labels
categories = np.unique(stimuli[~resting_state])

# extract tags indicating to which acquisition run a tag belongs
session_labels = labels["chunks"][~resting_state]

# Load the fMRI data
from nilearn.input_data import NiftiMasker

# For decoding, standardizing is often very important
masker = NiftiMasker(mask_img=data_files['mask_vt'][0], standardize=True)
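
# A hedged continuation sketch (not in the original snippet): mask the 4D
# functional image from fetch_haxby and drop the resting-state volumes before
# handing the samples to a classifier.
fmri_masked = masker.fit_transform(data_files.func[0])
fmri_masked = fmri_masked[~resting_state]
stimuli_active = stimuli[~resting_state]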
Ejemplo n.º 43
0
from datetime import datetime

import matplotlib.dates as mdates
import matplotlib.pyplot as plt

import numpy as np

# Import Data
data_path = "output/timeseries_export.csv"
results = np.recfromcsv(data_path, encoding=None)

# Get times as datetime objects
times = list(map(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"), results["time"]))

# Generate Plot
fig, axarr = plt.subplots(2, sharex=True)
axarr[0].set_title("Water Level and Discharge")

# Upper subplot
axarr[0].set_ylabel("Water Level [m]")
axarr[0].plot(times, results["storage_level"], label="Storage", linewidth=2, color="b")
axarr[0].plot(times, results["sea_level"], label="Sea", linewidth=2, color="m")
axarr[0].plot(
    times,
    0.5 * np.ones_like(times),
    label="Storage Max",
    linewidth=2,
    color="r",
    linestyle="--",
)
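
# A hedged continuation sketch: the lower subplot and the date axis. The
# discharge column name "q_release" is an assumption, since the exported CSV
# columns beyond "time", "storage_level" and "sea_level" are not shown here.
axarr[1].set_ylabel("Discharge [m3/s]")
axarr[1].plot(times, results["q_release"], label="Release", linewidth=2, color="g")
axarr[1].xaxis.set_major_formatter(mdates.DateFormatter("%b %d"))
axarr[0].legend(loc="upper right")
axarr[1].legend(loc="upper right")
plt.show()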
Ejemplo n.º 44
0
# importing files using numpy
# import data as arrays

import numpy as np

fn = "files/mnist_kaggle_some_rows.csv"
data = np.loadtxt(fn, delimiter=',')
print(data)
print(type(data))
print(np.shape(data))

titanic = np.recfromcsv("files/titanic_sub.csv")
np.shape(titanic)
print(titanic[:4])
print(type(titanic[1]))
Ejemplo n.º 45
0
#import pyfits
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# from Leinert 1998 table 2
calfac = 21.7 * 0.44 * (7.40 / 21.7)

fig = plt.figure(figsize=(6.5, 3.0))

f, (ax, ax1) = plt.subplots(2,
                            sharex=True,
                            gridspec_kw={'height_ratios': [3, 1]})
#ax = fig.add_subplot(1,1,1)

ax.plot([0, 20], [0, 0], linestyle=':', color='black')

ipd_density = np.recfromcsv('lookup/total_density_vs_r.txt', delimiter='   ')

axa = ax.twinx()
ax.set_zorder(axa.get_zorder() + 1)
ax.patch.set_visible(False)

p2, = axa.plot(ipd_density['au'],
               ipd_density['au']**(-2) * ipd_density['density'],
               linestyle='-',
               color='green')
axa.fill_between(ipd_density['au'],
                 0.1 * ipd_density['au']**(-2) * ipd_density['density'],
                 10 * ipd_density['au']**(-2) * ipd_density['density'],
                 color='green',
                 alpha=0.2)
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE',
                          delimiter='COLUMN_SEPARATOR',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = RandomForestRegressor(max_features=0.25,
                                          min_samples_leaf=11,
                                          min_samples_split=9,
                                          n_estimators=100)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
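
# A small follow-up sketch (not part of the exported pipeline file): score the
# held-out predictions with scikit-learn's r2_score.
from sklearn.metrics import r2_score
print(r2_score(testing_target, results))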
Ejemplo n.º 47
0
import csv

from numpy import recfromcsv

## Import your algorithms here.
from Tesla import Tesla
from svr import SVR
from ContextEngineBase import Complexity

## For different tests, these values will vary.
inputFilePath = "SVRTestInput.csv"
outputFilePath = "SVRTestOutput.csv"
complexity = Complexity.secondOrder
numTrainingSamples = 96
numExecuteSamples = 96
inputFile = open(inputFilePath)
outputFile = open(outputFilePath)
inputReader = csv.reader(inputFile)
outputReader = csv.reader(outputFile)
csv = recfromcsv(inputFilePath, delimiter=',')  # note: this rebinds 'csv' and shadows the csv module imported above
## Change the name of the algorithm to test it out.
algorithmTest = SVR(complexity, 1, 0, [0], {})
teslaTimestamps = {}
svrTimestamps = {}

#print(algorithmTest.complexity);
#print(algorithmTest.functionOrder);

totRow = 35040
numRow = 96
day_train_start = 0
day_train_end = 0
day_predict = 4

x_train = []
Ejemplo n.º 48
0
import numpy as np


raw_data = np.recfromcsv("data/red.csv",delimiter=';')


X = []
Y = []
for line in raw_data:
    row = []
    for i in range(0,len(line)-1):
        row.append(line[i])
    Y.append(line[len(line)-1])
    X.append(row)
X = np.mat(X)

Y = np.transpose(np.mat(Y))

sample = X[:1500]
test = X[1500:]

sY = Y[:1500]
tY = Y[1500:]

p = []
for j in range(len(tY)):
    y = test[j]
    dist = []
    for i in range(len(sample)):
        x = np.linalg.norm(sample[i]-y)
        dist.append([x,i])
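    # A hedged completion sketch (the original snippet is cut off here): use
    # the closest training row as a plain 1-nearest-neighbour prediction.
    dist.sort()
    nearest = dist[0][1]
    p.append(sY[nearest, 0])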
Ejemplo n.º 49
0
# CSV file to read in
csv_file = '/Users/sudregp/tmp/gf1p5t.csv'
# Name of the column to be added to CSV
var = 'matched'

# define the two groups
groups = ['NV', 'ADHD']
# for every individual in the smaller group, choose this many in the bigger group
match_ratio = 2

# Some other variables to limit usable scans
qc_column = 'raw_rating'
use_twins = 1  # set to 0 if only using rows with column twin==0

gf = np.recfromcsv(csv_file)

group_rows = []
group_subjects = []
for group in groups:
    group_rows.append([i for i in range(len(gf)) if gf[i]['dxgroup'] == group])
    group_subjects.append(list(np.unique(gf[group_rows[-1]]['id'])))

# First we clean up the QC column to make sure it only has numbers and NaNs
for row in range(len(gf)):
    try:
        gf[row][qc_column] = float(gf[row][qc_column])
    except ValueError:
        gf[row][qc_column] = np.nan

# let's create a few dictionaries to make life easier later
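
# A hedged sketch of such dictionaries (the exact structure used downstream is
# not shown here): map each subject ID to the row indices of its scans, per group.
subject_rows = {}
for g, rows in enumerate(group_rows):
    subject_rows[groups[g]] = {}
    for r in rows:
        subject_rows[groups[g]].setdefault(gf[r]['id'], []).append(r)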
Ejemplo n.º 50
0
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'class' in the data file
###tpot_data = np.recfromcsv('./sources/cars.csv', delimiter=',', dtype=np.float64)
tpot_data = np.recfromcsv('./sources/cars.csv')
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LogisticRegression(C=1.0, dual=True)),
    RandomForestClassifier(max_features=0.6000000000000001,
                           min_samples_leaf=20,
                           min_samples_split=18))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

print(results)
Ejemplo n.º 51
0
def load_neuropixels_data(dir_name='neuropixels'):
    '''Downloads and returns data for the Neuropixels example.

    The dataset comes from `UCL's Cortex Lab
    <http://data.cortexlab.net/dualPhase3/data/>`_.

    Args:
        dir_name (str): Specifies the directory to which the data files
            should be downloaded. This is concatenated with the user-set
            data directory.

    Returns:
        dict: A dictionary where each key corresponds to a needed file.
    '''
    dpath = os.path.join(config.get_data_directory(), dir_name)
    if not os.path.exists(dpath):
        os.makedirs(dpath)

    base_url = 'http://data.cortexlab.net/dualPhase3/data/'
    file_dict = dict()

    parent_fnames = [
        'experiment1stimInfo.mat',
        'experiment2stimInfo.mat',
        'experiment3stimInfo.mat',
        'timeCorrection.mat',
        'timeCorrection.npy',
    ]
    parent_dir = [
        'frontal/',
        'posterior/',
    ]
    subdir_fnames = [
        'spike_clusters.npy',
        'spike_templates.npy',
        'spike_times.npy',
        'templates.npy',
        'whitening_mat_inv.npy',
        'cluster_groups.csv',
        'channel_positions.npy',
    ]

    for name in parent_fnames:
        fname = os.path.join(dpath, name)
        url = os.path.join(base_url, name)
        if not os.path.exists(fname):
            _urlretrieve(url, fname)
        file_dict[name] = _load_file(fname)

    for directory in parent_dir:
        if not os.path.exists(os.path.join(dpath, directory)):
            os.makedirs(os.path.join(dpath, directory))
        for subdir in subdir_fnames:
            fname = os.path.join(dpath, directory, subdir)
            url = os.path.join(base_url, directory, subdir)
            if not os.path.exists(fname):
                _urlretrieve(url, fname)
            key = os.path.join(directory, subdir)
            if subdir == 'cluster_groups.csv':
                file_dict[key] = np.recfromcsv(fname, delimiter='\t')
            else:
                file_dict[key] = _load_file(fname)

    return file_dict
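
# A minimal usage sketch (an addition, assuming this module and its helpers
# are importable): keys follow the relative file names collected above.
neuro_files = load_neuropixels_data()
cluster_groups = neuro_files['frontal/cluster_groups.csv']  # parsed with np.recfromcsv
print(cluster_groups.dtype.names)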
import pandas as pd
import numpy as np

# data = np.genfromtxt('/Users/apple/desktop/py4e/ImportData1/titanic_sub.csv', delimiter = ',', names=True,dtype=None)
# print(data)
# print(np.shape(data))
# print(data['Fare'])
# print(data['Survived'])

# recfromcsv is similar to genfromtxt, but dtype=None (plus delimiter=',' and names=True) is the default
d = np.recfromcsv('/Users/apple/desktop/py4e/ImportData1/titanic_sub.csv', delimiter=',', names=True)
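
# A quick check (assuming the usual Titanic columns): recfromcsv lower-cases
# field names by default, so 'Survived' is accessed as 'survived'.
print(d.dtype.names)
print(d['survived'][:5])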
Ejemplo n.º 53
0
import pylab as plt
import numpy as np
import random

raw_data = np.recfromcsv("../data/data3.csv")

m = {}
# read dataset and put them into a dictionary
for line in raw_data:
    line[2] = int(line[2][len(line[2]) - 1]) - 1
    if line[2] not in m:
        m[line[2]] = []
    m[line[2]].append([line[0], line[1], line[2]])

fig = plt.figure()

ax = fig.add_subplot(111)

ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
colors = "brcmgykw"

for key in m:
    for value in (m.get(key)):
        color = value[2]
        i = int(color[len(color) - 1])
        ax.plot(value[0], value[1], "o", color=colors[i])
    m[i] = m.pop(key)


# setup GUI
Ejemplo n.º 54
0
import textwrap, sys, random, getTerms
import numpy as np

terms = getTerms.get()  # get terms from getTerms.py
punctuations = [' ', ', ', '. ']
wordCt = [100, 300]

words = np.recfromcsv(
    'common_words_freq.csv')  # read CSV of Eng. terms and freq
p = np.true_divide(words['frequency'], np.sum(words['frequency']))

probs1 = np.array([0.8, 0.05, 0.15])  # texts, terms, specific terms
probs2 = np.array([0.8, 0.1, 0.1])  # prob of space vs comma vs period


def genDoc(spec):  # spec argument is a list of specialized terms
    ct = np.random.randint(wordCt[0], wordCt[1])  # word count for doc
    strc = np.random.choice(range(3), ct, True, probs1)  # 0=wrd, 1=trm, 2=spec

    txt = np.empty(ct, dtype=words.dtype[0])
    txt[strc == 0] = np.random.choice(words['word'], np.sum(strc == 0), True,
                                      p)
    txt[strc == 1] = np.random.choice(np.hstack(terms), np.sum(strc == 1),
                                      True)
    txt[strc == 2] = np.random.choice(spec, np.sum(strc == 2), True)

    puncts = np.random.choice(punctuations, ct, True, probs2)

    out = np.empty(txt.size * 2, dtype=txt.dtype)
    out[0::2] = txt
    out[1::2] = puncts
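    # A hedged completion sketch (the original snippet is cut off here): join
    # the interleaved words and punctuation into one wrapped string.
    doc = ''.join(str(tok) for tok in out)
    return textwrap.fill(doc, width=80)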
Ejemplo n.º 55
0
def stats_test(_output_filename, input_filename):
    _, extension = os.path.splitext(input_filename)

    if extension == '.csv':  # os.path.splitext keeps the leading dot
        data = recfromcsv(input_filename, delimiter=',')
        first_f = 1
    else:
        data = np.load(input_filename)
        first_f = 0

    dims = 15
    mfss = np.zeros([len(data), dims])

    for i in range(len(data)):

        print(i)

        if extension == '.csv':
            mfs = np.array(tuple(data[i])[first_f:])  #.astype(np.float32)
        else:
            mfs = data[i]

        if len(mfs) < 100:
            max_fa = np.max(mfs)
            min_fa = np.min(mfs)
            std_fa = np.std(mfs)
            mean_fa = np.mean(mfs)
            median_fa = np.median(mfs)
            sum_fa = np.sum(mfs)
            variation = scipy.stats.variation(mfs)
            var = scipy.stats.tvar(mfs)
            skew = scipy.stats.skew(mfs)
            kurtosis = scipy.stats.kurtosis(mfs)
            arg_max = np.argmax(mfs)
            arg_min = np.argmin(mfs)
            diff = np.max(mfs) - np.min(mfs)
            first_f = mfs[0]
            last_f = mfs[-1]

            mfss[i] = np.array([
                max_fa, min_fa, mean_fa, std_fa, median_fa, sum_fa, skew,
                kurtosis, variation, var, arg_max, arg_min, diff, last_f,
                first_f
            ])

        else:
            tmp = np.array([])

            for l in range(5):
                mmfs = mfs[l * 20:(l + 1) * 20 - 1]
                max_fa = np.max(mmfs)
                min_fa = np.min(mmfs)
                std_fa = np.std(mmfs)
                mean_fa = np.mean(mmfs)
                median_fa = np.median(mmfs)
                sum_fa = np.sum(mmfs)
                variation = scipy.stats.variation(mmfs)
                var = scipy.stats.tvar(mmfs)
                skew = scipy.stats.skew(mmfs)
                kurtosis = scipy.stats.kurtosis(mmfs)
                arg_max = np.argmax(mmfs)
                arg_min = np.argmin(mmfs)

                tmp = np.hstack(
                    (tmp,
                     np.array([
                         max_fa, min_fa, mean_fa, std_fa, median_fa, sum_fa,
                         skew, kurtosis, variation, var, arg_max, arg_min
                     ])))

        # for l in range(5):
        #    skews[l] = scipy.stats.skew(mfs[l*20 : (l+1)*20 - 1])
        #    kurtosiss[l] = scipy.stats.kurtosis(mfs[l*20 : (l+1)*20 - 1])

        #skew = np.mean(skews)
        #kurtosis = np.mean(kurtosiss)

            mfss[i] = tmp

        print(mfss[i], i)

        np.save(data_path + _output_filename, mfss)

    print "Saved ", data_path + _output_filename
Ejemplo n.º 56
0
    def fetch(self,
              contrasts=None,
              n_subjects=None,
              get_tmaps=False,
              get_masks=False,
              get_anats=False,
              url=None,
              resume=True,
              force=False,
              verbose=1):
        if n_subjects is None:
            n_subjects = 94  # 94 subjects available
        if (n_subjects > 94) or (n_subjects < 1):
            warnings.warn("Wrong value for \'n_subjects\' (%d). The maximum "
                          "value will be used instead (\'n_subjects=94\')")
            n_subjects = 94  # 94 subjects available

        if contrasts is None:
            contrasts = self.contrast_name_wrapper.values()
        elif isinstance(contrasts, _basestring):
            contrasts = [contrasts]

        allowed_contrasts = list(self.contrast_name_wrapper.values())
        # convert contrast names
        contrasts_wrapped = []
        # get a unique ID for each contrast. It is used to give a unique name to
        # each download file and avoid name collisions.
        contrasts_indices = []
        for contrast in contrasts:
            if contrast in allowed_contrasts:
                contrasts_wrapped.append(contrast)
                contrasts_indices.append(allowed_contrasts.index(contrast))
            elif contrast in self.contrast_name_wrapper:
                name = self.contrast_name_wrapper[contrast]
                contrasts_wrapped.append(name)
                contrasts_indices.append(allowed_contrasts.index(name))
            else:
                raise ValueError("Contrast \'%s\' is not available" % contrast)

        # It is better to perform several small requests than a big one because:
        # - Brainomics server has no cache (can lead to timeout while the archive
        #   is generated on the remote server)
        # - Local (cached) version of the files can be checked for each contrast
        opts = {'uncompress': True}
        subject_ids = ["S%02d" % s for s in range(1, n_subjects + 1)]
        subject_id_max = subject_ids[-1]
        data_types = ["c map"]
        if get_tmaps:
            data_types.append("t map")
        rql_types = str.join(", ", ["\"%s\"" % x for x in data_types])
        root_url = "http://brainomics.cea.fr/localizer/"

        base_query = ("Any X,XT,XL,XI,XF,XD WHERE X is Scan, X type XT, "
                      "X concerns S, "
                      "X label XL, X identifier XI, "
                      "X format XF, X description XD, "
                      'S identifier <= "%s", ' % (subject_id_max, ) +
                      'X type IN(%(types)s), X label "%(label)s"')

        urls = [
            "%sbrainomics_data_%d.zip?rql=%s&vid=data-zip" %
            (root_url, i,
             _urllib.parse.quote(base_query % {
                 "types": rql_types,
                 "label": c
             },
                                 safe=',()'))
            for c, i in zip(contrasts_wrapped, contrasts_indices)
        ]
        filenames = []
        for subject_id in subject_ids:
            for data_type in data_types:
                for contrast_id, contrast in enumerate(contrasts_wrapped):
                    name_aux = str.replace(
                        str.join('_', [data_type, contrast]), ' ', '_')
                    file_path = os.path.join("brainomics_data", subject_id,
                                             "%s.nii.gz" % name_aux)
                    file_tarball_url = urls[contrast_id]
                    filenames.append((file_path, file_tarball_url, opts))
        # Fetch masks if asked by user
        if get_masks:
            urls.append("%sbrainomics_data_masks.zip?rql=%s&vid=data-zip" %
                        (root_url,
                         _urllib.parse.quote(base_query % {
                             "types": '"boolean mask"',
                             "label": "mask"
                         },
                                             safe=',()')))
            for subject_id in subject_ids:
                file_path = os.path.join("brainomics_data", subject_id,
                                         "boolean_mask_mask.nii.gz")
                file_tarball_url = urls[-1]
                filenames.append((file_path, file_tarball_url, opts))
        # Fetch anats if asked by user
        if get_anats:
            urls.append("%sbrainomics_data_anats.zip?rql=%s&vid=data-zip" %
                        (root_url,
                         _urllib.parse.quote(base_query % {
                             "types": '"normalized T1"',
                             "label": "anatomy"
                         },
                                             safe=',()')))
            for subject_id in subject_ids:
                file_path = os.path.join("brainomics_data", subject_id,
                                         "normalized_T1_anat_defaced.nii.gz")
                file_tarball_url = urls[-1]
                filenames.append((file_path, file_tarball_url, opts))
        # Fetch subject characteristics (separated in two files)
        if url is None:
            url_csv = (
                "%sdataset/cubicwebexport.csv?rql=%s&vid=csvexport" %
                (root_url, _urllib.parse.quote("Any X WHERE X is Subject")))
            url_csv2 = ("%sdataset/cubicwebexport2.csv?rql=%s&vid=csvexport" %
                        (root_url,
                         _urllib.parse.quote(
                             "Any X,XI,XD WHERE X is QuestionnaireRun, "
                             "X identifier XI, X datetime "
                             "XD",
                             safe=',')))
        else:
            url_csv = "%s/cubicwebexport.csv" % url
            url_csv2 = "%s/cubicwebexport2.csv" % url
        filenames += [("cubicwebexport.csv", url_csv, {}),
                      ("cubicwebexport2.csv", url_csv2, {})]

        # Actual data fetching
        files = self.fetcher.fetch(filenames,
                                   resume=resume,
                                   force=force,
                                   verbose=verbose)
        anats = None
        masks = None
        tmaps = None
        # combine data from both covariates files into one single recarray
        from numpy.lib.recfunctions import join_by
        ext_vars_file2 = files[-1]
        csv_data2 = np.recfromcsv(ext_vars_file2, delimiter=';')
        files = files[:-1]
        ext_vars_file = files[-1]
        csv_data = np.recfromcsv(ext_vars_file, delimiter=';')
        files = files[:-1]
        # join_by sorts the output along the key
        csv_data = join_by('subject_id',
                           csv_data,
                           csv_data2,
                           usemask=False,
                           asrecarray=True)[:n_subjects]
        if get_anats:
            anats = files[-n_subjects:]
            files = files[:-n_subjects]
        if get_masks:
            masks = files[-n_subjects:]
            files = files[:-n_subjects]
        if get_tmaps:
            tmaps = files[1::2]
            files = files[::2]
        return Bunch(cmaps=files,
                     tmaps=tmaps,
                     masks=masks,
                     anats=anats,
                     ext_vars=csv_data)
Ejemplo n.º 57
0
import numpy as np

from sklearn.decomposition import FastICA
from sklearn.ensemble import RandomForestRegressor, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('raw_df.csv', delimiter=',', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    FastICA(tol=11.0),
    RandomForestRegressor(n_estimators=500)
)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)

print(testing_features)
print(results)
Ejemplo n.º 58
0
    'dmn_rACC_1to4_a0.05t0.99d5p5000.txt',
    'dorsal_lFusiform_1to4_a0.05t0.99d5p5000.txt',
    'dorsal_rFusiform_4to8_a0.05t0.99d5p5000.txt',
    'dorsal_rFusiform_65to100_a0.05t0.99d5p5000.txt',
    'ventral_lCingulate_4to8_a0.05t0.99d5p5000.txt',
    'ventral_rVFC_30to55_a0.05t0.99d5p5000.txt',
    'ventral_rVFC_4to8_a0.05t0.99d5p5000.txt'
]]

results['fmri'] = [[
    'cognitive_lSupra_a0.05t0.99d5p5000.txt',
    'cognitive_rSupra_a0.05t0.99d5p5000.txt', 'dmn_rACC_a0.05t0.99d5p5000.txt',
    'dmn_rPrecuneus_a0.05t0.99d5p5000.txt', 'dorsal_lIPS_a0.05t0.99d5p5000.txt'
]]

phen = np.recfromcsv(home + '/data/overlap_resting/subjs.csv')

# load the fMRI result and the data. Transform the data, remove residuals, and concatenate to the others in the same group
data_dir = home + '/data/results/inatt_resting/'
subjs_fname = home + '/data/fmri/joel_all.txt'
gf_fname = home + '/data/fmri/gf.csv'
fid = open(subjs_fname, 'r')
subjs = [line.rstrip() for line in fid]
fid.close()
gf = pd.read_csv(gf_fname)

# the order of subjects in data is the same as in subjs, because that's how data was created. let's find that order in the gf and resort it
idx = [np.nonzero(gf.maskid == int(s))[0][0] for s in subjs]
gf = gf.iloc[idx]

# find out what are the indices of subjects in overlap group
Ejemplo n.º 59
0
def create_metadata(self, cond_type):
    """ The metadata file format is two columns [full scan index, value at index] """
    # checks file exists
    file = self.file_check(self.meta_dir, cond_type)

    # separates file type and string name of file.
    fileparts = os.path.basename(file).split('.')
    if len(fileparts) != 2:
        logger.critical(
            "Error processing file: %s, too many . in filebase" % file)

    # take file parts for processing.
    basefile, filetype = fileparts

    # IMPORTANT, transferring the meta file description
    logger.info("Matching '%s' to one of: %s meta types" %
                (basefile, self.meta_types))
    pattern = [
        t for t in self.meta_types if re.search(t, basefile) is not None
    ]
    logger.info("Found %s to bring into HDF5." % basefile)

    # catch any potential errors.
    if len(pattern) > 1:
        logger.critical(
            'Multiple patterns matched for meta files.'\
            'Must change meta file names to match a type from list below.'\
            '%s' %self.meta_types)
    else:
        logger.debug("Matched %s with %s" % (pattern[0], basefile))
        attr_name = pattern[0]

    # now we can load in the file.
    if filetype == 'csv':
        info = np.recfromcsv(file, names=['index', attr_name], delimiter=',')

    elif filetype == 'txt':
        info = np.recfromcsv(file, names=['index', attr_name], delimiter='\t')

    else:
        logger.exception(
            "%s, not .txt or .csv, chill out, we're not there yet..." %
            file)

    # check to make sure we don't have more than 2 columns in meta data file
    if len(info[0]) > 2:
        logger.exception(
            'The metadata file format is [full scan index, value at index]')

    # checks that the length of the info file is the same length as the number of TRs.
    if len(info) != (self.n_TRs):
        logger.exception("Length of %s, does not match total # of TRs." %
                         file)

    # cycle through each run
    for i, r in self.hdf['func'].items():
        walker = []

        # make sure run index matches
        run = self.hdf['func'][i].attrs['run']

        # cycles through each TR
        for j in self.hdf['func'][i].attrs['f_ind']:
            # append to vector if run index matches global index
            if run == info[j][0]:
                walker.append(info[j][1])
            else:
                logger.critical("Run index and meta file do not match! AHH")

        # write to hdf5
        logger.info("No obvious errors processing %s file into hdf5" %
                    file)
        self.hdf['func'][i].attrs[attr_name] = np.array(walker)
Ejemplo n.º 60
0
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target,
                                                    train_size=0.75, test_size=0.25)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_boston_pipeline.py')


import numpy as np

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('C:/Users/ecervera/.spyder-py3/tpot_boston_pipeline.py', delimiter=';', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['class'], random_state=None)

exported_pipeline = GradientBoostingRegressor(alpha=0.85, learning_rate=0.1, loss="ls",
                                              max_features=0.9, min_samples_leaf=5,
                                              min_samples_split=6)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)