Example #1
def create_species_bunch(species_name, train, test, coverages, xgrid, ygrid):
    """
    create a bunch with information about a particular organism

    This will use the test/train record arrays to extract the
    data specific to the given species name.
    """
    bunch = Bunch(name=' '.join(species_name.split("_")[:2]))

    points = dict(test=test, train=train)

    for label, pts in points.items():
        # choose points associated with the desired species
        pts = pts[pts['species'] == species_name]
        bunch['pts_%s' % label] = pts

        # determine coverage values for each of the training & testing points
        ix = np.searchsorted(xgrid, pts['dd long'])
        iy = np.searchsorted(ygrid, pts['dd lat'])
        bunch['cov_%s' % label] = coverages[:, -iy, ix].T

    return bunch
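For context, here is a minimal usage sketch. The grid construction and the species name are assumptions taken from scikit-learn's species-distribution example (fetch_species_distributions), and under Python 3 the record arrays store species names as bytes, so the comparison inside the helper may need the name encoded to bytes.

import numpy as np
from sklearn.datasets import fetch_species_distributions

data = fetch_species_distributions()
# rebuild the coordinate grids from the Bunch metadata
xgrid = data.x_left_lower_corner + data.grid_size * np.arange(data.Nx)
ygrid = data.y_left_lower_corner + data.grid_size * np.arange(data.Ny)

bv = create_species_bunch("bradypus_variegatus_0",
                          data.train, data.test, data.coverages,
                          xgrid, ygrid)
print(bv.name, bv.cov_train.shape)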
Example #2
def _fetch_surf_fsaverage(data_dir=None):
    """Helper function to ship fsaverage (highest resolution) surfaces
    and sulcal information with Nilearn.

    The source of the data is downloaded from nitrc.
    """
    dataset_dir = _get_dataset_dir('fsaverage', data_dir=data_dir)
    url = 'https://www.nitrc.org/frs/download.php/10846/fsaverage.tar.gz'
    if not os.path.isdir(os.path.join(dataset_dir, 'fsaverage')):
        _fetch_files(dataset_dir, [('fsaverage.tar.gz', url, {})])
        _uncompress_file(os.path.join(dataset_dir, 'fsaverage.tar.gz'))
    result = {
        name: os.path.join(dataset_dir, 'fsaverage', '{}.gii'.format(name))
        for name in ['pial_right', 'sulc_right', 'sulc_left', 'pial_left']
    }
    result['infl_left'] = os.path.join(dataset_dir, 'fsaverage',
                                       'inflated_left.gii')
    result['infl_right'] = os.path.join(dataset_dir, 'fsaverage',
                                        'inflated_right.gii')

    result['description'] = str(_get_dataset_descr('fsaverage'))
    return Bunch(**result)
Example #3
def fetch_coords_power_2011():
    """Download and load the Power et al. brain atlas composed of 264 ROIs.

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, contains:
        - "rois": coordinates of 264 ROIs in MNI space


    References
    ----------
    Power, Jonathan D., et al. "Functional network organization of the human
    brain." Neuron 72.4 (2011): 665-678.
    """
    dataset_name = 'power_2011'
    fdescr = _get_dataset_descr(dataset_name)
    package_directory = os.path.dirname(os.path.abspath(__file__))
    csv = os.path.join(package_directory, "data", "power_2011.csv")
    params = dict(rois=np.recfromcsv(csv), description=fdescr)

    return Bunch(**params)
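A hypothetical usage sketch (the 'x', 'y', 'z' field names are assumed from the power_2011.csv header loaded by np.recfromcsv):

import numpy as np

power = fetch_coords_power_2011()
# stack the ROI coordinates into an (n_rois, 3) array of MNI coordinates
coords = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T
print(coords.shape)  # expected: (264, 3)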
Example #4
def fetch_TR9856():
    """
    Fetch TR9856 dataset for testing multi-word term relatedness

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,
        'topic': vector of topics providing context for each pair of terms

    References
    ----------
    Levy, Ran et al., "TR9856: A multi-word term relatedness benchmark", 2015.

    Notes
    -----
    """
    data = pd.read_csv(
        os.path.join(
            _fetch_file(
                "https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_TR9856.v2.zip",
                "similarity",
                uncompress=True,
                verbose=0,
            ),
            "IBM_Debater_(R)_TR9856.v0.2",
            "TermRelatednessResults.csv",
        ),
        encoding="iso-8859-1",
    )

    # We basically select all the columns available
    X = data[["term1", "term2"]].values
    y = data["score"].values
    topic = data["topic"].values

    return Bunch(X=X.astype("object"), y=y, topic=topic)
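A minimal usage sketch for the Bunch returned above:

bunch = fetch_TR9856()
print(bunch.X.shape, bunch.y.shape)
# show a few term pairs with their topic and relatedness score
for (t1, t2), topic, score in zip(bunch.X[:3], bunch.topic[:3], bunch.y[:3]):
    print(topic, t1, t2, score)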
Example #5
def vector_space(stopword_path, bunch_path, space_path):

    stpwrdlst = _readfile(stopword_path).splitlines()
    bunch = _readbunchobj(bunch_path)
    # build the tf-idf term vector space object
    tfidfspace = Bunch(target_name=bunch.target_name,
                       label=bunch.label,
                       filenames=bunch.filenames,
                       tdm=[],
                       vocabulary={})

    vectorizer = TfidfVectorizer(stop_words=stpwrdlst,
                                 sublinear_tf=True,
                                 max_df=0.5)

    # tdm now holds the tf-idf weight matrix
    # vectorizer.fit_transform(corpus) takes the text corpus and returns its term-frequency matrix
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_

    _writebunchobj(space_path, tfidfspace)
    print("if-idf词向量空间实例创建成功!")
Example #6
def _ising_case(n_samples=100,
                n_dim_obs=100,
                T=10,
                time_on_axis='first',
                update_theta='l2',
                responses=[-1, 1],
                **kwargs):
    thetas = ising_theta_generator(n_dim_obs=n_dim_obs,
                                   n=n_samples,
                                   T=T,
                                   mode=update_theta,
                                   **kwargs)
    samples = [
        ising_sampler(t, np.zeros(n_dim_obs), n=n_samples, responses=responses)
        for t in thetas
    ]
    data = np.array(samples)
    X = np.vstack(data)
    y = np.repeat(range(len(thetas)), n_samples).astype(int)
    if time_on_axis == "last":
        data = data.transpose(1, 2, 0)
    return Bunch(data=data, thetas=np.array(thetas), X=X, y=y)
Example #7
def _load_camcan_scores(filename_csv, subjects_selected):
    """Load the scores from the Cam-CAN data set.

    Parameters
    ----------
    filename_csv : str
        Path to the csv file containing the participants' information.

    subjects_selected : list of str
        IDs of the patients to be selected. Each ID should follow the BIDS
        standard (e.g., 'sub-******').

    Returns
    -------
    data : Bunch,
        Dictionary-like object. The interesting attributes are:

        - 'age', the age of the patient;
        - 'hand', handedness of the patient;
        - 'gender_text', gender of the patient.

    """

    if not isfile(filename_csv):
        raise ValueError('The file {} does not exist.'.format(filename_csv))

    if not filename_csv.endswith('.csv'):
        raise ValueError('The file {} is not a CSV file.'.format(filename_csv))

    patients_info = pd.read_csv(filename_csv,
                                usecols=COLUMN_SELECT_PATIENTS_INFO)

    # the id in the CSV is missing 'sub-'
    patients_info['Observations'] = 'sub-' + patients_info['Observations']
    # filter the IDs to be kept and sort just in case
    patients_info = (
        patients_info.set_index('Observations').loc[subjects_selected])

    return Bunch(**patients_info.to_dict('list'))
Example #8
def build_training_set(file_list):
    '''This function is an alternative form of the loads in sklearn but allows
    me to load from a list of files output by another file rather than the folder
    structure prescribed by giving loads a folder-path'''
    from sklearn.datasets.base import Bunch

    b = Bunch()
    b['filenames'] = file_list  #filenames

    def target_function(filepath):
        #/home/dhrumil/Desktop/PoliticalFraming/data/immigration/D/123.json
        if filepath[filepath.rfind("/") - 1] == 'D':
            return 0
        elif filepath[filepath.rfind("/") - 1] == 'R':
            return 1
        else:
            print "file must be categorized as D or R : " + str(filepath)

    b['target'] = []  #target
    for filepath in file_list:
        b['target'].append(target_function(filepath))

    b['target_names'] = ['D', 'R']  #target_names

    b['data'] = []  #data
    for filepath in file_list:
        with open(filepath, 'r') as f:
            jdata = json.load(f)

        speech_string = ""
        for sentence in jdata['speaking']:
            speech_string += sentence

        b['data'].append(speech_string)

    b['DESCR'] = ""  #DESCR

    return b
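A hypothetical follow-up showing how the returned Bunch feeds a standard scikit-learn text pipeline, assuming the folder layout hinted at in the comment above and that every file path is categorized as D or R:

import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# hypothetical layout: data/<topic>/<D|R>/<id>.json
file_list = glob.glob("data/immigration/*/*.json")
b = build_training_set(file_list)
X = TfidfVectorizer(stop_words='english').fit_transform(b['data'])
clf = MultinomialNB().fit(X, b['target'])
print(clf.score(X, b['target']))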
Example #9
def _compute_fit(series, fitter):
    if cfg.verbosity > 0:
        print('Computing fit for {}@{} using {}'.format(
            series.gene_name, series.region_name, fitter))
    x = series.ages
    y = series.single_expression
    if np.count_nonzero(abs(y) > cfg.nonzero_threshold
                        ) < cfg.min_nonzero_points_for_fitting:
        print('Not enough non-zero data points to fit for {}@{}. Skipping...'.format(
            series.gene_name, series.region_name))
        theta = None
        sigma = None
        fit_predictions = None
        LOO_predictions = None
        theta_samples = None
    else:
        theta, sigma, LOO_predictions, LOO_fits = fitter.fit(x, y, loo=True)
        if theta is None:
            print('WARNING: Optimization failed during overall fit for {}@{} using {}'.format(
                series.gene_name, series.region_name, fitter))
            fit_predictions = None
            theta_samples = None
        else:
            fit_predictions = fitter.shape.f(theta, x)
            if fitter.shape.parameter_type() == object:
                theta_samples = None
            else:
                theta_samples = fitter.parametric_bootstrap(x, theta, sigma)

    return Bunch(
        fitter=fitter,
        seed=cfg.random_seed,
        theta=theta,
        sigma=sigma,
        fit_predictions=fit_predictions,
        LOO_predictions=LOO_predictions,
        theta_samples=theta_samples,
    )
Example #10
def _add_dataset_correlation_fits_from_results_dictionary(
        dataset, ds_fits, dct_results):
    """This function converts the results of the job_splitting which is a flat dictionary to structures which 
       are easier to use and integrated into the dataset fits
    """
    region_to_ix_original_inds = {}
    for ir, r in enumerate(dataset.region_names):
        series = dataset.get_several_series(dataset.gene_names, r)
        region_to_ix_original_inds[r] = series.original_inds

    for (ir, loo_point), levels in dct_results.items():
        n_iterations = len(levels)
        r = dataset.region_names[ir]
        if loo_point is None:
            # Global fit - collect the parameters (theta, sigma, L) and
            # compute a correlation matrix for the region.
            # The hack of using the key (None, r) to store these results can
            # be removed if/when dataset fits is changed from a dictionary to
            # a class with several fields.
            k = (None, r)
            if k not in ds_fits:
                ds_fits[k] = n_iterations * [None]
            for iLevel, level in enumerate(levels):
                ds_fits[k][iLevel] = level
                level.correlations = covariance_to_correlation(level.sigma)
        else:
            # LOO point - collect the predictions
            ix, iy = loo_point
            g = dataset.gene_names[iy]
            fit = ds_fits[(g, r)]
            if not hasattr(fit, 'with_correlations'):
                fit.with_correlations = [
                    Bunch(
                        LOO_predictions=init_array(np.nan, len(dataset.ages))
                    )  # NOTE: we place the predictions at the original
                       # indexes (before NaNs were removed by get_series)
                    for _ in range(n_iterations)
                ]
            for iLevel, level_prediction in enumerate(levels):
                orig_ix = region_to_ix_original_inds[r][ix]
                fit.with_correlations[iLevel].LOO_predictions[
                    orig_ix] = level_prediction
Example #11
def fetch_localizer_first_level(data_dir=None, verbose=1):
    """ Download a first-level localizer fMRI dataset

    Parameters
    ----------
    data_dir: string
        directory where data should be downloaded and unpacked.

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, with the keys:
        epi_img: the input 4D image
        events: a csv file describing the paradigm
    """
    url = 'ftp://ftp.cea.fr/pub/dsv/madic/download/nipy'

    dataset_name = "localizer_first_level"
    files = dict(epi_img="s12069_swaloc1_corr.nii.gz",
                 events="localizer_paradigm.csv")
    # The options needed for _fetch_files
    options = [(filename, os.path.join(url, filename), {})
               for _, filename in sorted(files.items())]

    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    sub_files = _fetch_files(data_dir, options, resume=True,
                             verbose=verbose)

    params = dict(zip(sorted(files.keys()), sub_files))
    try:
        _check_events_file_uses_tab_separators(params['events'])
    except ValueError:
        _make_events_file_localizer_first_level(events_file=params['events'])

    return Bunch(**params)
Example #12
def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):

    stpwrdlst = readfile(stopword_path).splitlines()

    bunch = readbunchobj(bunch_path)

    tfidfspace = Bunch(target_name=bunch.target_name,
                       label=bunch.label,
                       filenames=bunch.filenames,
                       tdm=[],
                       vocabulary={})

    if train_tfidf_path is not None:

        trainbunch = readbunchobj(train_tfidf_path)

        tfidfspace.vocabulary = trainbunch.vocabulary

        vectorizer = TfidfVectorizer(stop_words=stpwrdlst,
                                     sublinear_tf=True,
                                     max_df=0.5,
                                     vocabulary=trainbunch.vocabulary)

        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)

    else:

        vectorizer = TfidfVectorizer(stop_words=stpwrdlst,
                                     sublinear_tf=True,
                                     max_df=0.5)

        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)

        tfidfspace.vocabulary = vectorizer.vocabulary_

    writebunchobj(space_path, tfidfspace)

    print("if-idf词向量空间实例创建成功!!!")
Example #13
def load_corpus(path):
    """
    Loads and wrangles the passed in text corpus by path.
    """

    # Check if the data exists, otherwise download or raise
    if not os.path.exists(path):
        raise ValueError(
            ("'{}' dataset has not been downloaded, "
             "use the yellowbrick.download module to fetch datasets"
             ).format(path))

    # Read the directories in the directory as the categories.
    categories = [
        cat for cat in os.listdir(path)
        if os.path.isdir(os.path.join(path, cat))
    ]

    files = []  # holds the file names relative to the root
    data = []  # holds the text read from the file
    target = []  # holds the string of the category

    # Load the data from the files in the corpus
    for cat in categories:
        for name in os.listdir(os.path.join(path, cat)):
            files.append(os.path.join(path, cat, name))
            target.append(cat)

            with open(os.path.join(path, cat, name), 'r') as f:
                data.append(f.read())

    # Return the data bunch for use similar to the newsgroups example
    return Bunch(
        categories=categories,
        files=files,
        data=data,
        target=target,
    )
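A usage sketch (the corpus directory is hypothetical):

from sklearn.feature_extraction.text import CountVectorizer

corpus = load_corpus("data/hobbies")  # hypothetical corpus directory
docs = CountVectorizer().fit_transform(corpus.data)
print(docs.shape, len(corpus.target), corpus.categories)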
Example #14
def fetch_TR9856():
    """
    Fetch TR9856 dataset for testing multi-word term relatedness

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,
        'topic': vector of topics providing context for each pair of terms

    References
    ----------
    Levy, Ran et al., "TR9856: A multi-word term relatedness benchmark", 2015.

    Notes
    -----
    """
    #data = pd.read_csv(os.path.join(_fetch_file(
    #    'https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_TR9856.v2.zip',
    #    'similarity', uncompress=True, verbose=0),
    #    'IBM_Debater_(R)_TR9856.v0.2', 'TermRelatednessResults.csv'), encoding="iso-8859-1")
    data = pd.read_csv(
        os.path.join(
            _fetch_file(
                'http://homes.cs.washington.edu/~febrahim/files/IBM_Debater_(R)_TR9856.v2.zip',
                'similarity',
                uncompress=True,
                verbose=0,
            ),
            'IBM_Debater_(R)_TR9856.v0.2',
            'TermRelatednessResults.csv',
        ),
        encoding="iso-8859-1",
    )

    # We basically select all the columns available
    X = data[['term1', 'term2']].values
    y = data['score'].values
    topic = data['topic'].values

    return Bunch(X=X.astype("object"), y=y, topic=topic)
Example #15
def load_datasets(data_home=DATA_HOME_BASIC):
    """
    Load the benchmark datasets.

    :param data_home: Default directory in which the data is stored in .tar.gz format.
    :returns:
        OrderedDict of Bunch objects. Each Bunch object, referred to as a dataset, has the following attributes:

            * dataset.data :
                ndarray, shape (n_samples, n_features)
            * dataset.target :
                ndarray, shape (n_samples, )
            * dataset.DESCR :
                string, description of the dataset.
    """
    extracted_dir = join(data_home, "extracted")
    datasets = OrderedDict()

    filter_data_ = MAP_NAME_ID.keys()

    for it in filter_data_:
        filename = PRE_FILENAME + str(MAP_NAME_ID[it]) + POST_FILENAME
        filename = join(extracted_dir, filename)
        available = isfile(filename)

        if not available:
            makedirs(extracted_dir, exist_ok=True)
            with open(f'{data_home}data.tar.gz', 'rb') as fin:
                f = BytesIO(fin.read())
            tar = tarfile.open(fileobj=f)
            tar.extractall(path=extracted_dir)

        data = np.load(filename)
        X, y = data['data'], data['label']

        datasets[it] = Bunch(data=X, target=y, DESCR=it)

    return datasets
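A brief usage sketch iterating over the returned OrderedDict:

datasets = load_datasets()
for name, dataset in datasets.items():
    print(name, dataset.data.shape, dataset.target.shape)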
Example #16
def load_graphs_LMdata():
    """Load the LMdata graph dataset for graph classification..

    Returns
    -------
    data : Bunch
        Dictionary-like object with the following attributes :
        'graphs', the graphs in the dataset in Networkx format,  'target', the classification labels for each
        sample.
    """
    input_target_url = 'http://www.math.unipd.it/~nnavarin/datasets/LMdata/labels.txt.standardized'
    input_data_url = 'http://www.math.unipd.it/~nnavarin/datasets//LMdata/graphs.gspan.standardized'
    _target = load_target(input_target_url)
    label_dict = {}
    counter = [1]
    g_it = instance_to_graph(input_data_url, label_dict, counter)

    print('Loaded LMdata graph dataset for graph classification.')
    return Bunch(graphs=[i for i in g_it],
                 label_dict=label_dict,
                 target=_target,
                 labels=True,
                 veclabels=False)
Example #17
def confidence_calculate(label, content, stop_word_list, clf, train_path, url,
                         alexa_dict):
    # assemble the input into a standard Bunch
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.append(label)
    bunch.label.append(label)
    bunch.filenames.append(label)
    bunch.contents.append(content)

    # compute TF-IDF features and predict
    bunch = vector_space(bunch, train_path, stop_word_list)
    predicted = clf.predict_proba(bunch.tdm)
    predict_score = predicted[0][0]

    # get the initial score for the url from its Alexa rank
    alexa_score = 0
    for domain in alexa_dict:
        if domain in url:
            alexa_score = alexa_dict[domain]

    # combine both scores into an overall confidence value
    confidence = predict_score * 0.95 + (alexa_score - 1) * 0.05
    return confidence
Example #18
def _fetch_surf_fsaverage5_sphere(data_dir=None):
    """Helper function to ship fsaverage5 spherical meshes.

    These meshes can be used for visualization purposes, but also to run
    cortical surface-based searchlight decoding.

    The source of the data is downloaded from OSF.
    """

    fsaverage_dir = _get_dataset_dir('fsaverage', data_dir=data_dir)
    dataset_dir = _get_dataset_dir('fsaverage5_sphere', data_dir=fsaverage_dir)
    url = 'https://osf.io/b79fy/download'
    opts = {'uncompress': True}
    names = ['sphere_right', 'sphere_left']
    filenames = [('{}.gii'.format(name), url, opts) for name in names]
    _fetch_files(dataset_dir, filenames)
    result = {
        name: os.path.join(dataset_dir, '{}.gii'.format(name))
        for name in names
    }

    result['description'] = str(_get_dataset_descr('fsaverage5_sphere'))
    return Bunch(**result)
Example #19
def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)  # list every subdirectory of seg_path, i.e. the category names
    # create a Bunch instance
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)
    '''
    extend(addlist) is a Python list method that extends the original list
    with the contents of the new list (addlist)
    '''
    # collect every file in each category directory
    for mydir in catelist:
        class_path = seg_path + mydir + "/"  # build the path of the category subdirectory
        file_list = os.listdir(class_path)  # list all files under class_path
        for file_path in file_list:  # iterate over the files in this category
            fullname = class_path + file_path  # build the full file path
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(readfile(fullname))  # read the file contents
            '''append(element) adds a single element to the list; note the difference from extend()'''
    # save the bunch to wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the text Bunch object!")
Example #20
def fetch_thai_simlex999():
    """
    added by Gerhard Wohlgenannt, ([email protected], [email protected]), 2019
    Get the SimLex-999 dataset for the Thai language

    The dataset is in Thai and is intended for evaluating Thai embedding models.

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,

    """
    data = _get_as_pd(
        'https://www.dropbox.com/s/nlct64af7qmhc49/thaiSimLex-999-v2.csv?dl=1',  # SimLex-999 -- thai version
        'similarity',
        header=None,
        sep=",").values

    return Bunch(X=data[:, 0:2].astype("object"),
                 y=2 * data[:, 2].astype(float))
Example #21
def fetch_thai_wordsim353():
    """
    added by Gerhard Wohlgenannt, ([email protected], [email protected]), 2019
    Get the WordSim-353 dataset for the Thai language

    The dataset is in Thai and is intended for evaluating Thai embedding models.

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,

    """
    data = _get_as_pd(
        'https://www.dropbox.com/s/h8c3ll1764d7akf/thai-wordsim353-v2.csv?dl=1',
        'similarity',
        header=None,
        sep=",").values

    return Bunch(X=data[:, 0:2].astype("object"),
                 y=2 * data[:, 2].astype(float))
Example #22
def word_to_bunch(train_save_path, train_bunch_path):
    bunch = Bunch(label=[], filepath=[], contents=[])
    all_labels = os.listdir(train_save_path)

    for label in all_labels:
        detail_path = train_save_path + label + '/'

        all_details = os.listdir(detail_path)

        for all_detail in all_details:
            file_detail_path = detail_path + all_detail  # full path of this file
            bunch.label.append(label)
            bunch.filepath.append(file_detail_path)
            contents = read_file(file_detail_path)
            bunch.contents.append(contents)

    with open(train_bunch_path, "wb+") as fp:
        pickle.dump(bunch, fp)
    print("创建完成")
Example #23
def load_yeast():
    '''
    Yeast
    1484 instances
    1 sequence number + 8 real attributes
    10 classes (localization site of protein)
      CYT (cytosolic or cytoskeletal)                    463
      NUC (nuclear)                                      429
      MIT (mitochondrial)                                244
      ME3 (membrane protein, no N-terminal signal)       163
      ME2 (membrane protein, uncleaved signal)            51
      ME1 (membrane protein, cleaved signal)              44
      EXC (extracellular)                                 37
      VAC (vacuolar)                                      30
      POX (peroxisomal)                                   20
      ERL (endoplasmic reticulum lumen)                    5
    Note: first attribute(sequence number)IGNORED
    '''
    
    data = pd.read_csv(get_data_path('yeast.data'), delim_whitespace=True, header=None)
    flat_data = data.iloc[:, 1:9].values  # columns 1-8: the eight real-valued attributes
    labels = data.iloc[:, 9].values       # column 9: the localization-site class
    return Bunch(data=flat_data, target=labels, name='yeast', dataset_type='classification')
Example #24
    def _glob_fsl_feeds_data(subject_dir):
        """glob data from subject_dir.

        """

        if not os.path.exists(subject_dir):
            return None

        for file_name in FSL_FEEDS_DATA_FILES:
            file_path = os.path.join(subject_dir, file_name)
            if os.path.exists(file_path) or os.path.exists(
                    file_path.rstrip(".gz")):
                file_name = re.sub(r"(?:\.nii\.gz|\.txt)", "", file_name)
            else:
                if not os.path.basename(subject_dir) == 'data':
                    return _glob_fsl_feeds_data(
                        os.path.join(subject_dir, 'feeds/data'))
                else:
                    print("%s missing from filelist!" % file_name)
                    return None
        # note: data_dir comes from the enclosing fetcher's scope in the original source
        return Bunch(data_dir=data_dir,
                     func=os.path.join(subject_dir, "fmri.nii.gz"),
                     anat=os.path.join(subject_dir, "structural_brain.nii.gz"))
Example #25
def load_sample_images():
    """Load sample images for image manipulation.
    Loads ``sloth``, ``sloth_closeup``, ``cat_and_dog``.

    Returns
    -------
    data : Bunch
        Dictionary-like object with the following attributes :
        'images', the sample images, 'filenames', the file
        names for the images, and 'DESCR'
        the full description of the dataset.
    """
    module_path = os.path.join(os.path.dirname(__file__), "images")
    with open(os.path.join(module_path, 'README.txt')) as f:
        descr = f.read()
    filenames = [
        os.path.join(module_path, filename)
        for filename in os.listdir(module_path) if filename.endswith(".jpg")
    ]
    # Load image data for each image in the source folder.
    images = [np.array(Image.open(filename, 'r')) for filename in filenames]

    return Bunch(images=images, filenames=filenames, DESCR=descr)
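A usage sketch that displays the bundled images with matplotlib (assumed to be available):

import os
import matplotlib.pyplot as plt

dataset = load_sample_images()
for img, fname in zip(dataset.images, dataset.filenames):
    plt.figure()
    plt.title(os.path.basename(fname))
    plt.imshow(img)
plt.show()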
Example #26
def load_data(root='data'):
    with open(os.path.join(root, 'meta.json'), 'r') as f:
        meta = json.load(f)
    names = meta['features']
    train = pd.read_excel(os.path.join(root, 'mergewothdkom.xlsx'),
                          sheet_name='Sheet1')
    train = train.values
    test = pd.read_excel(os.path.join(root, 'merge_test1.xlsx'),
                         sheet_name='Sheet1')
    test = test.values
    return Bunch(
        data=train[:, :-2],
        target=train[:, -2],
        data_test=test[:, :-1],
        target_test=test[:, -1],
        target_names=meta['target_names'],
        feature_names=meta['features'],
    )
Example #27
def fetch_RG65():
    """
    Fetch Rubenstein and Goodenough dataset for testing attributional and
    relatedness similarity
    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,
        'sd': vector of std of scores if available (for set1 and set2)
    References
    ----------
    Rubenstein, Goodenough, "Contextual correlates of synonymy", 1965
    Notes
    -----
    Scores were scaled by factor 10/4
    """
    data = _get_as_pd('https://www.dropbox.com/s/chopke5zqly228d/EN-RG-65.txt?dl=1',
                      'similarity', header=None, sep="\t").values

    return Bunch(X=data[:, 0:2].astype("object"),
                 y=data[:, 2].astype(float) * 10.0 / 4.0)
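A hypothetical evaluation sketch (evaluate_similarity and get_vector are illustrative names, not part of the snippet above): score a word-embedding model against RG-65 with Spearman correlation.

import numpy as np
from scipy.stats import spearmanr

def evaluate_similarity(get_vector):
    """Spearman correlation between embedding cosines and RG-65 human scores.

    get_vector(word) -> 1-D numpy array; any word-embedding lookup works here.
    """
    def cosine(u, v):
        return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

    data = fetch_RG65()
    predicted = [cosine(get_vector(w1), get_vector(w2)) for w1, w2 in data.X]
    return spearmanr(predicted, data.y).correlation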
Example #28
def fetch_asirra(image_count=1000):
    """

    Parameters
    ----------
    image_count : positive integer

    Returns
    -------
    data : Bunch
        Dictionary-like object with the following attributes :
        'images', the sample images, 'data', the flattened images,
        'target', the label for the image (0 for cat, 1 for dog),
        and 'DESCR' the full description of the dataset.
    """
    partial_path = check_fetch_asirra()
    m = Memory(cachedir=partial_path, compress=6, verbose=0)
    load_func = m.cache(_fetch_asirra)
    images, target = load_func(partial_path, image_count=image_count)
    return Bunch(data=images.reshape(len(images), -1),
                 images=images,
                 target=target,
                 DESCR="Asirra cats and dogs dataset")
Example #29
def load_fuman_rant(file_path, target_func=fuman_gvb_target):
    data = list()
    target = list()
    parse_errors = 0
    n_samples = 0
    with open(file_path, newline='') as csv_file:
        data_file = csv.reader(csv_file, delimiter=',', quotechar="'")
        next(data_file)
        for row in data_file:
            if not check_row_format(row[0], row):
                parse_errors += 1
                continue
            data.append(unicodedata.normalize('NFKC', row[5]))
            status = int(row[6])
            if len(row) == 16:
                price = int(row[15])
            else:
                price = 0
            target.append(target_func(status, price))
            n_samples += 1
    logging.info('Finished loading data. (read: {} errors: {})'.format(
        n_samples, parse_errors))
    return Bunch(data=data, target=target, DESCR="Fuman DB csv dump dataset")
Example #30
def load_train_file(fname, selected_categorys=None, description=None):
    import json
    data = []
    target = []
    segmentor = Segmentor()
    with open(fname) as ifd:
        for line in ifd:
            obj = json.loads(line)
            if 'topic' not in obj or 'title' not in obj:
                continue
            title = obj['title']
            L = bow(title, skip_unigram=False, segmentor=segmentor)
            title_words = [i.word for i in L]
            #title_words_str = ' '.join(title_words)
            for tname, tidstr in obj['topic'].items():
                if selected_categorys and tname not in selected_categorys:
                    continue
                #data.append(title_words_str)
                data.append(title_words)
                target.append(tname)
    print('%s file name[%s] loaded %d records' % (datetime.now(), fname,
                                                  len(data)))
    return Bunch(fname=fname, data=data, target=target, DESCR=description)