def shuffleData(self, res):
    shuffle(res)
    train = Bunch()
    train.data = map(lambda x: x[1], res)
    train.target = map(lambda x: x[0], res)
    train.target_names = self.names
    return train
def gen_tf_idf_space():
    bunch = read_object(train_data)
    tf_idf_space = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, vocabulary={})

    vectorizer = TfidfVectorizer(stop_words=load_stop_words(), sublinear_tf=True, max_df=0.5)
    transformer = TfidfTransformer()

    tf_idf_space.tdm = vectorizer.fit_transform(bunch.contents)
    tf_idf_space.vocabulary = vectorizer.vocabulary_
    save_object(tf_idf_space_data, tf_idf_space)
def calc_tfidf(trainsetfile,stopwordfile,dstdir):
    data_set = joblib.load(trainsetfile)
    wordbag = Bunch(target_name=[],label=[],filenames=[],tdm=[],vocabulary={})
    wordbag.target_name = data_set.target_name
    wordbag.label = data_set.label
    
    corpus = data_set.contents
    stopwordlist = read_stopword(stopwordfile)
    vectorize = TfidfVectorizer(sublinear_tf=True,max_df = 0.8,min_df=3,max_features=50000,stop_words=stopwordlist)
    feature_train = vectorize.fit_transform(corpus)
    wordbag.tdm = feature_train
    wordbag.vocabulary = vectorize.vocabulary_
    joblib.dump(wordbag,dstdir+"/"+"word_bag.data",compress=3)
def testset_tfidf(testsetfile,stopwordfile,myvocabulary):
    data_set = joblib.load(testsetfile)
    wordbag = Bunch(target_name=[],label=[],filenames=[],tdm=[],vocabulary={})
    wordbag.target_name = data_set.target_name
    wordbag.label = data_set.label
    
    corpus = data_set.contents
    stopwordlist = read_stopword(stopwordfile)
    vectorize = TfidfVectorizer(sublinear_tf=True,stop_words=stopwordlist,vocabulary=myvocabulary)
    feature_train = vectorize.fit_transform(corpus)
    wordbag.tdm = feature_train
    joblib.dump(wordbag,"test_wordbag/test_word_bag.data",compress=3)
    return wordbag
    
def train_bags(token_path,filename,wordbag_path):
    data_set = Bunch(target_name=[],label=[],filenames=[],contents=[])

    dir_list = os.listdir(token_path)
    data_set.target_name = dir_list
    
    for file in dir_list:
        file_name = token_path+"/"+file
        file_read = open(file_name,"r")
        for line in file_read:
            data_set.label.append(data_set.target_name.index(file))
            data_set.contents.append(line.strip())
        file_read.close()
    # Persist the data set to disk
    joblib.dump(data_set, wordbag_path+"/"+filename, compress=3)
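
# A minimal usage sketch of the three functions above. The directory names
# ("train_tokens", "test_tokens", "train_word_bag") and the stop-word file are
# hypothetical placeholders, not part of the original project.
train_bags("train_tokens", "train_set.data", "train_word_bag")
calc_tfidf("train_word_bag/train_set.data", "stop_words.txt", "train_word_bag")

train_bags("test_tokens", "test_set.data", "test_word_bag")
train_bag = joblib.load("train_word_bag/word_bag.data")
# vectorize the test set with the vocabulary learned on the training set
test_bag = testset_tfidf("test_word_bag/test_set.data", "stop_words.txt",
                         train_bag.vocabulary)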
Example #6
def test_bunch_pickle_generated_with_0_16_and_read_with_0_17():
    bunch = Bunch(key='original')
    # This reproduces a problem when Bunch pickles have been created
    # with scikit-learn 0.16 and are read with 0.17. Basically there
    # is a surprising behaviour because reading bunch.key uses
    # bunch.__dict__ (which is non-empty for 0.16 Bunch objects)
    # whereas assigning into bunch.key uses bunch.__setattr__. See
    # https://github.com/scikit-learn/scikit-learn/issues/6196 for
    # more details
    bunch.__dict__['key'] = 'set from __dict__'
    bunch_from_pkl = loads(dumps(bunch))
    # After loading from pickle the __dict__ should have been ignored
    assert_equal(bunch_from_pkl.key, 'original')
    assert_equal(bunch_from_pkl['key'], 'original')
    # Making sure that changing the attr does change the value
    # associated with __getitem__ as well
    bunch_from_pkl.key = 'changed'
    assert_equal(bunch_from_pkl.key, 'changed')
    assert_equal(bunch_from_pkl['key'], 'changed')
Example #7
def execute_NM_predict():
    test_bunch = read_object(test_data)

    test_space = Bunch(target_name=test_bunch.target_name, label=test_bunch.label, filenames=test_bunch.filenames,
                       tdm=[], vocabulary={})

    tf_idf_bunch = read_object(tf_idf_space_data)
    vectorizer = TfidfVectorizer(stop_words=load_stop_words(), sublinear_tf=True, max_df=0.5,
                                 vocabulary=tf_idf_bunch.vocabulary)
    transformer = TfidfTransformer()

    test_space.tdm = vectorizer.fit_transform(test_bunch.contents)
    test_space.vocabulary = tf_idf_bunch.vocabulary

    clf = MultinomialNB(alpha=0.001).fit(tf_idf_bunch.tdm, tf_idf_bunch.label)
    # prediction results
    predicted = clf.predict(test_space.tdm)
    # print the results in a more readable form
    for label, file_name, predicted_cate in zip(test_bunch.label, test_bunch.filenames, predicted):
        print file_name, ' actual category:', label, ' predicted category:', predicted_cate
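
# Usage sketch: the two functions above form a small pipeline. It assumes the
# module-level names train_data, test_data and tf_idf_space_data (the paths used
# by read_object/save_object elsewhere in this project) are already defined.
gen_tf_idf_space()      # build and persist the training tf-idf space
execute_NM_predict()    # vectorize the test set and run the Naive Bayes prediction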
Example #8
File: vis.py Project: ohadfel/Baus
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets.base import Bunch


def scatter3d(X, fig=None, ax=None, color='b', cs=None, colorsMap='jet'):
    # Map the optional scalar values cs onto the requested colormap.
    if (cs is not None):
        cm = plt.get_cmap(colorsMap)
        cNorm = matplotlib.colors.Normalize(vmin=min(cs), vmax=max(cs))
        scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cm)
    if (ax is None):
        fig = plt.figure()
        ax = Axes3D(fig)
    elif fig is None:
        # An axes was passed in without its figure; recover it so that
        # fig.colorbar() below does not fail on None.
        fig = ax.get_figure()
    if (cs is None):
        ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color)
    else:
        ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=scalarMap.to_rgba(cs))
        scalarMap.set_array(cs)
        fig.colorbar(scalarMap)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')
    plt.show()
    b = Bunch()
    b.fig = fig
    b.ax = ax
    return b
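
# Illustrative call (not part of the original vis.py): color 100 random points
# by their z value; the returned Bunch exposes the figure and axes.
if __name__ == '__main__':
    import numpy as np
    pts = np.random.randn(100, 3)
    out = scatter3d(pts, cs=pts[:, 2])
    print(out.keys())   # the Bunch holds 'fig' and 'ax'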
Example #9
def fetch_atlas_yeo_2011(data_dir=None, url=None, resume=True, verbose=1):
    """Download and return file names for the Yeo 2011 parcellation.

    The provided images are in MNI152 space.

    Parameters
    ----------
    data_dir: string
        directory where data should be downloaded and unpacked.

    url: string
        url of file to download.

    resume: bool
        whether to resume the download of a partly-downloaded file.

    verbose: int
        verbosity level (0 means no message).

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, keys are:

        - "thin_7", "thick_7": 7-region parcellations,
          fitted to resp. thin and thick template cortex segmentations.

        - "thin_17", "thick_17": 17-region parcellations.

        - "colors_7", "colors_17": colormaps (text files) for 7- and 17-region
          parcellation respectively.

        - "anat": anatomy image.

    Notes
    -----
    For more information on this dataset's structure, see
    http://surfer.nmr.mgh.harvard.edu/fswiki/CorticalParcellation_Yeo2011

    Yeo BT, Krienen FM, Sepulcre J, Sabuncu MR, Lashkari D, Hollinshead M,
    Roffman JL, Smoller JW, Zollei L., Polimeni JR, Fischl B, Liu H,
    Buckner RL. The organization of the human cerebral cortex estimated by
    intrinsic functional connectivity. J Neurophysiol 106(3):1125-65, 2011.

    Licence: unknown.
    """
    if url is None:
        url = "ftp://surfer.nmr.mgh.harvard.edu/" \
              "pub/data/Yeo_JNeurophysiol11_MNI152.zip"
    opts = {'uncompress': True}

    dataset_name = "yeo_2011"
    keys = ("thin_7", "thick_7",
            "thin_17", "thick_17",
            "colors_7", "colors_17", "anat")
    basenames = (
        "Yeo2011_7Networks_MNI152_FreeSurferConformed1mm.nii.gz",
        "Yeo2011_7Networks_MNI152_FreeSurferConformed1mm_LiberalMask.nii.gz",
        "Yeo2011_17Networks_MNI152_FreeSurferConformed1mm.nii.gz",
        "Yeo2011_17Networks_MNI152_FreeSurferConformed1mm_LiberalMask.nii.gz",
        "Yeo2011_7Networks_ColorLUT.txt",
        "Yeo2011_17Networks_ColorLUT.txt",
        "FSL_MNI152_FreeSurferConformed_1mm.nii.gz")

    filenames = [(os.path.join("Yeo_JNeurophysiol11_MNI152", f), url, opts)
                 for f in basenames]

    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
            verbose=verbose)
    sub_files = _fetch_files(data_dir, filenames, resume=resume,
                             verbose=verbose)

    fdescr = _get_dataset_descr(dataset_name)

    params = dict([('description', fdescr)] + list(zip(keys, sub_files)))
    return Bunch(**params)
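
# Usage sketch: fetch_atlas_yeo_2011 returns a Bunch whose values are file
# paths, accessible both as attributes and as dictionary keys (see the keys
# documented in the docstring above).
yeo = fetch_atlas_yeo_2011()
print(yeo.thick_17)      # path to the 17-network parcellation (thick template)
print(yeo['colors_7'])   # colormap text file for the 7-network parcellation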
Example #10
def fetch_atlas_msdl(data_dir=None, url=None, resume=True, verbose=1):
    """Download and load the MSDL brain atlas.

    Parameters
    ----------
    data_dir: string, optional
        Path of the data directory. Used to force data storage in a specified
        location. Default: None

    url: string, optional
        Override download URL. Used for test only (or if you setup a mirror of
        the data).

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        Dictionary-like object; the attributes of interest are:

        - 'maps': str, path to nifti file containing regions definition.
        - 'labels': string list containing the labels of the regions.
        - 'region_coords': tuple list (x, y, z) containing coordinates
          of each region in MNI space.
        - 'networks': string list containing names of the networks.
        - 'description': description about the atlas.


    References
    ----------
    :Download:
        https://team.inria.fr/parietal/files/2015/01/MSDL_rois.zip

    :Paper to cite:
        `Multi-subject dictionary learning to segment an atlas of brain
        spontaneous activity <http://hal.inria.fr/inria-00588898/en>`_
        Gael Varoquaux, Alexandre Gramfort, Fabian Pedregosa, Vincent Michel,
        Bertrand Thirion. Information Processing in Medical Imaging, 2011,
        pp. 562-573, Lecture Notes in Computer Science.

    :Other references:
        `Learning and comparing functional connectomes across subjects
        <http://hal.inria.fr/hal-00812911/en>`_.
        Gael Varoquaux, R.C. Craddock NeuroImage, 2013.

    """
    if url is None:
        url = 'https://team.inria.fr/parietal/files/2015/01/MSDL_rois.zip'
    opts = {'uncompress': True}

    dataset_name = "msdl_atlas"
    files = [(os.path.join('MSDL_rois', 'msdl_rois_labels.csv'), url, opts),
             (os.path.join('MSDL_rois', 'msdl_rois.nii'), url, opts)]

    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    files = _fetch_files(data_dir, files, resume=resume, verbose=verbose)
    csv_data = np.recfromcsv(files[0])
    labels = [name.strip() for name in csv_data['name'].tolist()]
    labels = [label.decode("utf-8") for label in labels]
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', module='numpy',
                                category=FutureWarning)
        region_coords = csv_data[['x', 'y', 'z']].tolist()
    net_names = [net_name.strip() for net_name in csv_data['net_name'].tolist()]
    fdescr = _get_dataset_descr(dataset_name)

    return Bunch(maps=files[1], labels=labels, region_coords=region_coords,
                 networks=net_names, description=fdescr)
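
# Usage sketch: the MSDL Bunch gives a path to the 4D probabilistic maps plus
# per-region metadata, matching the keys documented in the docstring above.
msdl = fetch_atlas_msdl()
print(msdl.maps)               # path to msdl_rois.nii
print(len(msdl.labels))        # number of regions
print(msdl.region_coords[0])   # (x, y, z) of the first region in MNI space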
Example #11
def fetch_atlas_harvard_oxford(atlas_name, data_dir=None,
                               symmetric_split=False,
                               resume=True, verbose=1):
    """Load Harvard-Oxford parcellations from FSL.

    This function downloads Harvard Oxford atlas packaged from FSL 5.0
    and stores atlases in NILEARN_DATA folder in home directory.

    This function can also load the Harvard-Oxford atlas from a local FSL
    installation if the installation path is given in the `data_dir` argument.
    See the documentation for details.

    Parameters
    ----------
    atlas_name: string
        Name of atlas to load. Can be:
        cort-maxprob-thr0-1mm,  cort-maxprob-thr0-2mm,
        cort-maxprob-thr25-1mm, cort-maxprob-thr25-2mm,
        cort-maxprob-thr50-1mm, cort-maxprob-thr50-2mm,
        sub-maxprob-thr0-1mm,  sub-maxprob-thr0-2mm,
        sub-maxprob-thr25-1mm, sub-maxprob-thr25-2mm,
        sub-maxprob-thr50-1mm, sub-maxprob-thr50-2mm,
        cort-prob-1mm, cort-prob-2mm,
        sub-prob-1mm, sub-prob-2mm

    data_dir: string, optional
        Path of the data directory where data will be stored. Optionally,
        it can also be an FSL installation directory (which depends on
        your installation).
        For example, if FSL is installed in /usr/share/fsl/, passing
        '/usr/share/' loads the Harvard-Oxford atlas from that installation,
        since this function mimics the FSL root directory layout.

    symmetric_split: bool, optional (default False)
        If True, lateralized versions of the cort or sub maxprob atlases
        are returned. For subcortical types (sub-maxprob), every symmetric
        region is split into left and right parts, which effectively doubles
        the number of regions.
        NOTE: not implemented for the full probabilistic atlases
        (*-prob-* atlases).

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, keys are:

        - "maps": nibabel.Nifti1Image, 4D maps if a probabilistic atlas is
          requested and 3D labels if a maximum probabilistic atlas was
          requested.

        - "labels": string list, labels of the regions in the atlas.
    """
    atlas_items = ("cort-maxprob-thr0-1mm", "cort-maxprob-thr0-2mm",
                   "cort-maxprob-thr25-1mm", "cort-maxprob-thr25-2mm",
                   "cort-maxprob-thr50-1mm", "cort-maxprob-thr50-2mm",
                   "sub-maxprob-thr0-1mm", "sub-maxprob-thr0-2mm",
                   "sub-maxprob-thr25-1mm", "sub-maxprob-thr25-2mm",
                   "sub-maxprob-thr50-1mm", "sub-maxprob-thr50-2mm",
                   "cort-prob-1mm", "cort-prob-2mm",
                   "sub-prob-1mm", "sub-prob-2mm")
    if atlas_name not in atlas_items:
        raise ValueError("Invalid atlas name: {0}. Please chose an atlas "
                         "among:\n{1}".format(
                             atlas_name, '\n'.join(atlas_items)))

    url = 'http://www.nitrc.org/frs/download.php/9902/HarvardOxford.tgz'

    # For practical reasons, we mimic the FSL data directory here.
    dataset_name = 'fsl'
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    opts = {'uncompress': True}
    root = os.path.join('data', 'atlases')

    if atlas_name[0] == 'c':
        if 'cort-maxprob' in atlas_name and symmetric_split:
            split_name = atlas_name.split('cort')
            atlas_name = 'cortl' + split_name[1]
            label_file = 'HarvardOxford-Cortical-Lateralized.xml'
            lateralized = True
        else:
            label_file = 'HarvardOxford-Cortical.xml'
            lateralized = False
    else:
        label_file = 'HarvardOxford-Subcortical.xml'
        lateralized = False
    label_file = os.path.join(root, label_file)

    atlas_file = os.path.join(root, 'HarvardOxford',
                              'HarvardOxford-' + atlas_name + '.nii.gz')

    atlas_img, label_file = _fetch_files(
        data_dir,
        [(atlas_file, url, opts), (label_file, url, opts)],
        resume=resume, verbose=verbose)

    names = {}
    from xml.etree import ElementTree
    names[0] = 'Background'
    for label in ElementTree.parse(label_file).findall('.//label'):
        names[int(label.get('index')) + 1] = label.text
    names = list(names.values())

    if not symmetric_split:
        return Bunch(maps=atlas_img, labels=names)

    if atlas_name in ("cort-prob-1mm", "cort-prob-2mm",
                      "sub-prob-1mm", "sub-prob-2mm"):
        raise ValueError("Region splitting not supported for probabilistic "
                         "atlases")

    atlas_img = check_niimg(atlas_img)
    if lateralized:
        return Bunch(maps=atlas_img, labels=names)

    atlas = atlas_img.get_data()

    labels = np.unique(atlas)
    # Build a mask of both halves of the brain
    middle_ind = (atlas.shape[0] - 1) // 2
    # Put zeros on the median plane
    atlas[middle_ind, ...] = 0
    # Split every zone crossing the median plane into two parts.
    left_atlas = atlas.copy()
    left_atlas[middle_ind:, ...] = 0
    right_atlas = atlas.copy()
    right_atlas[:middle_ind, ...] = 0

    new_label = 0
    new_atlas = atlas.copy()
    # Assumes that the background label is zero.
    new_names = [names[0]]
    for label, name in zip(labels[1:], names[1:]):
        new_label += 1
        left_elements = (left_atlas == label).sum()
        right_elements = (right_atlas == label).sum()
        n_elements = float(left_elements + right_elements)
        if (left_elements / n_elements < 0.05 or
                right_elements / n_elements < 0.05):
            new_atlas[atlas == label] = new_label
            new_names.append(name)
            continue
        new_atlas[right_atlas == label] = new_label
        new_names.append(name + ', left part')
        new_label += 1
        new_atlas[left_atlas == label] = new_label
        new_names.append(name + ', right part')

    atlas_img = new_img_like(atlas_img, new_atlas, atlas_img.affine)
    return Bunch(maps=atlas_img, labels=new_names)
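
# Usage sketch: with a maxprob atlas the Bunch holds a label image and the
# matching region names; symmetric_split=True yields the lateralized labels
# built above ('..., left part' / '..., right part').
ho = fetch_atlas_harvard_oxford('cort-maxprob-thr25-2mm', symmetric_split=True)
print(ho.labels[:3])   # starts with 'Background'
print(len(ho.labels))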
Example #12
def fetch_WS353(which="all"):
    """
    Fetch WS353 dataset for testing attributional and
    relatedness similarity

    Parameters
    ----------
    which : 'all': for both relatedness and attributional similarity,
            'relatedness': for relatedness similarity
            'similarity': for attributional similarity
            'set1': as divided by authors
            'set2': as divided by authors

    References
    ----------
    Finkelstein, Gabrilovich, "Placing Search in Context: The Concept Revisited", 2002
    Agirre, Eneko et al., "A Study on Similarity and Relatedness Using Distributional and WordNet-based Approaches",
    2009

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,
        'sd': vector of std of scores if available (for set1 and set2)
    """
    if which == "all":
        data = _get_as_pd(
            'https://www.dropbox.com/s/eqal5qj97ajaycz/EN-WS353.txt?dl=1',
            'similarity',
            header=0,
            sep="\t")
    elif which == "relatedness":
        data = _get_as_pd(
            'https://www.dropbox.com/s/x94ob9zg0kj67xg/EN-WSR353.txt?dl=1',
            'similarity',
            header=None,
            sep="\t")
    elif which == "similarity":
        data = _get_as_pd(
            'https://www.dropbox.com/s/ohbamierd2kt1kp/EN-WSS353.txt?dl=1',
            'similarity',
            header=None,
            sep="\t")
    elif which == "set1":
        data = _get_as_pd(
            'https://www.dropbox.com/s/opj6uxzh5ov8gha/EN-WS353-SET1.txt?dl=1',
            'similarity',
            header=0,
            sep="\t")
    elif which == "set2":
        data = _get_as_pd(
            'https://www.dropbox.com/s/w03734er70wyt5o/EN-WS353-SET2.txt?dl=1',
            'similarity',
            header=0,
            sep="\t")
    else:
        raise RuntimeError("Not recognized which parameter")

    # First two columns are the word pair, the third is the mean score
    X = data.values[:, 0:2]
    y = data.values[:, 2].astype(np.float)

    # Individual annotator scores are available for set1/set2, so report their std
    if data.values.shape[1] > 3:
        sd = np.std(data.values[:, 2:15].astype(np.float), axis=1).flatten()
        return Bunch(X=X.astype("object"), y=y, sd=sd)
    else:
        return Bunch(X=X.astype("object"), y=y)
Example #13
#######################
#                     #
# Text classification #
#                     #
#######################
import pickle
from sklearn.svm import LinearSVC    # import the linear SVM
'''1. Load the data'''
with open("D:\\mywork\\test\\ML_Chinese\\tfidfspace.dat","rb") as f1:
    train = pickle.load(f1)
with open("D:\\mywork\\test\\ML_Chinese\\test_set.dat","rb") as f2:
    test = pickle.load(f2)
'''2. Build the tdm vectors for the test set'''
from sklearn.datasets.base import Bunch
tfidftest = Bunch(target_name=test.target_name, label=test.label, filenames=test.filenames,
                   tdm=[], vocabulary={})
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer    # TF-IDF vector transformer class
from sklearn.feature_extraction.text import TfidfVectorizer     # TF-IDF vector generator class
stoplist = readfile(stop_path).splitlines()         # readfile() is the helper from chapter 2
'''2-1 The test-set vectors must be built with the training-set vocabulary'''
vectorizer = TfidfVectorizer(stop_words=stoplist, sublinear_tf=True, max_df=0.5,
                             vocabulary=train.vocabulary)
transformer = TfidfTransformer()    # computes the TF-IDF weight of each term
text = [i.decode("GBK", "ignore") for i in test.contents]     # decode the GBK bytes to unicode
tfidftest.tdm = vectorizer.fit_transform(text)
tfidftest.vocabulary = train.vocabulary
'''3. Build the model'''
svm = LinearSVC(penalty='l2',dual=False,tol=0.0001)
svm.fit(train.tdm,train.label)
pre=svm.predict(tfidftest.tdm)
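'''4. Evaluate (a possible follow-up, not in the original script): if the test
bunch carries true labels, compare them with the predictions.'''
from sklearn import metrics
print(metrics.accuracy_score(test.label, pre))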
Example #14
from sklearn.datasets.base import Bunch

# path of the segmented, categorized corpus
seg_path = "text_corpus_segment/"

# path where the bag-of-words corpus will be stored
wordbag_path = "text_corpus_wordbag/"
if not os.path.exists(wordbag_path):
    os.makedirs(wordbag_path)

# The Bunch class provides a key/value object:
# target_name: list of all category names
# label: list with the category label of each file
# filenames: file names
# contents: file contents
data_set = Bunch(target_name=[], label=[], filenames=[], contents=[])

# get all subcategories under seg_path
class_list = os.listdir(seg_path)
data_set.target_name = class_list

# get all files in each subdirectory
for mydir in class_list:
    class_path = seg_path + mydir + "/"
    file_list = os.listdir(class_path)  # all files under class_path
    for file_name in file_list:
        file_path = class_path + file_name
        data_set.filenames.append(file_path)  # append the file path to the data set
        data_set.label.append(data_set.target_name.index(mydir))  # append the file's category label
        with open(file_path, 'r', encoding='gb18030') as file:
            seg_corpus = file.read()  # read the corpus text
            data_set.contents.append(seg_corpus)  # append the file contents to the data set
Example #15
import gzip
import pickle

import numpy as np
from skimage import io
from sklearn.datasets.base import Bunch

from dip.load_data import load_image_files, load_mask_images
from dip.mask import bounding_rect_of_mask


datasets = load_mask_images()

data = []
for f, mask in zip(
        datasets.filenames,
        load_image_files(datasets.filenames),
        ):
    # rect: (min_x, min_y, max_x, max_y)
    rect = bounding_rect_of_mask(mask, negative=True)
    data.append(list(rect))
    print('{0}: {1}'.format(f, rect))

bunch = Bunch(name='mask rects')
bunch.data = np.array(data)
bunch.filenames = datasets.filenames
bunch.target = datasets.target
bunch.target_names = datasets.target_names
bunch.description = 'mask rects: (min_x, min_y, max_x, max_y)'

with gzip.open('rects.pkl.gz', 'wb') as f:
    pickle.dump(bunch, f)
Example #16
import sys
import pickle

from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer

reload(sys)

# load the training corpus
data_set = {}
# path of the training corpus
train_path = 'text_corpus1_wordbag/train_set.data'
file_obj = open(train_path, 'rb')

# read the persisted object
data_set = pickle.load(file_obj)
file_obj.close()

# define the bag-of-words data structure
wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={})
wordbag.target_name = data_set.target_name
wordbag.label = data_set.label
wordbag.filenames = data_set.filenames

# build the corpus
corpus = data_set.contents

# load the stop-word list from file
stpwrdpath = 'extra_dict/hlt_stop_words.txt'
stpwrd_dic = open(stpwrdpath, 'rb')
stpwrd_content = stpwrd_dic.read()

# convert the stop words to a list
stpwrdlst = stpwrd_content.splitlines()
stpwrd_dic.close()
Example #17
def get_data(clf, train, cats, fixk, min_size, vct, raw, limit=2):
    import copy
    min_size = 10

    args.fixk = None

    data, vct2 = load_from_file(train, cats, fixk, min_size, vct, raw=raw)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))


    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## replace <br> with "." so it is recognized as an end of sentence
    data.train.data = clean_html(data.train.data)
    data.test.data = clean_html(data.test.data)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
    ## Get the features of the sentence dataset

    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    train_test_data = Bunch()

    expert_data.sentence, train_test_data.pool = split_data(data.train)
    expert_data.oracle, train_test_data.test = split_data(data.test)

    data.train.data = train_test_data.pool.train.data
    data.train.target = train_test_data.pool.train.target

    data.test.data = train_test_data.test.train.data
    data.test.target = train_test_data.test.train.target

    ## convert document to matrix
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")

    labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector, vct, limit=limit)
    print len(sent_train)
    expert_data.oracle.train.data = sent_train
    expert_data.oracle.train.target = np.array(labels)
    expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)
    print expert_data.oracle.train.bow.shape
    # exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf = copy.copy(clf)
    exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)

    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector, vct, limit=limit)

    expert_data.sentence.train.data = sent_train
    expert_data.sentence.train.target = np.array(labels)
    expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)

    sent_clf = None
    # if args.cheating:
    sent_clf = copy.copy(clf)
    # sent_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    return exp_clf, data, vct, sent_clf, expert_data
Example #18
def fetch_atlas_basc_multiscale_2015(version='sym', data_dir=None,
                                     resume=True, verbose=1):
    """Downloads and loads multiscale functional brain parcellations

    This atlas includes group brain parcellations generated from
    resting-state functional magnetic resonance images from about
    200 young healthy subjects.

    Multiple scales (number of networks) are available, among
    7, 12, 20, 36, 64, 122, 197, 325, 444. The brain parcellations
    have been generated using a method called bootstrap analysis of
    stable clusters, or BASC (Bellec et al., 2010), and the
    scales have been selected using a data-driven method called MSTEPS
    (Bellec, 2013).

    Note that two versions of the template are available, 'sym' or 'asym'.
    The 'asym' type contains brain images that have been registered in the
    asymmetric version of the MNI brain template (reflecting that the brain
    is asymmetric), while the 'sym' type contains images registered in the
    symmetric version of the MNI template. The symmetric template has been
    forced to be symmetric anatomically, and is therefore ideally suited to
    study homotopic functional connections in fMRI: finding homotopic regions
    simply consists of flipping the x-axis of the template.

    .. versionadded:: 0.2.3

    Parameters
    ----------
    version: str, optional
        Available versions are 'sym' or 'asym'. By default all scales of
        brain parcellations of version 'sym' will be returned.

    data_dir: str, optional
        directory where data should be downloaded and unpacked.

    resume: bool
        whether to resume the download of a partly-downloaded file.

    verbose: int
        verbosity level (0 means no message).

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, Keys are:

        - "scale007", "scale012", "scale020", "scale036", "scale064",
          "scale122", "scale197", "scale325", "scale444": str, path
          to Nifti file of various scales of brain parcellations.

        - "description": details about the data release.

    References
    ----------
    Bellec P, Rosa-Neto P, Lyttelton OC, Benali H, Evans AC, Jul. 2010.
    Multi-level bootstrap analysis of stable clusters in resting-state fMRI.
    NeuroImage 51 (3), 1126-1139.
    URL http://dx.doi.org/10.1016/j.neuroimage.2010.02.082

    Bellec P, Jun. 2013. Mining the Hierarchy of Resting-State Brain Networks:
    Selection of Representative Clusters in a Multiscale Structure.
    Pattern Recognition in Neuroimaging (PRNI), 2013 pp. 54-57.

    Notes
    -----
    For more information on this dataset's structure, see
    https://figshare.com/articles/basc/1285615
    """
    versions = ['sym', 'asym']
    if version not in versions:
        raise ValueError('The requested version of the brain parcellations, '
                         '"%s", does not exist. Please choose one among %s.' %
                         (version, str(versions)))

    keys = ['scale007', 'scale012', 'scale020', 'scale036', 'scale064',
            'scale122', 'scale197', 'scale325', 'scale444']

    if version == 'sym':
        url = "https://ndownloader.figshare.com/files/1861819"
    elif version == 'asym':
        url = "https://ndownloader.figshare.com/files/1861820"
    opts = {'uncompress': True}

    dataset_name = "basc_multiscale_2015"
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)

    folder_name = 'template_cambridge_basc_multiscale_nii_' + version
    basenames = ['template_cambridge_basc_multiscale_' + version +
                 '_' + key + '.nii.gz' for key in keys]

    filenames = [(os.path.join(folder_name, basename), url, opts)
                 for basename in basenames]
    data = _fetch_files(data_dir, filenames, resume=resume, verbose=verbose)

    descr = _get_dataset_descr(dataset_name)

    params = dict(zip(keys, data))
    params['description'] = descr

    return Bunch(**params)
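
# Usage sketch: each 'scaleNNN' key documented above maps to the path of a
# Nifti parcellation file for that number of networks.
basc = fetch_atlas_basc_multiscale_2015(version='sym')
print(basc.scale064)
print(basc.description[:60])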
Example #19
def main():
    accuracies = defaultdict(lambda: [])

    aucs = defaultdict(lambda: [])

    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()
    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(50, args.fixk)

    if "imdb" in args.train:
        ########## IMDB MOVIE REVIEWS ###########
        data = load_imdb(args.train, shuffle=True, rnd=2356, vct=vct, min_size=min_size,
                               fix_k=args.fixk)  # should bring data as is
    elif "aviation" in args.train:
        raise Exception("We are not ready for that data yet")
    elif "20news" in args.train:
        ########## 20 news groups ######
        data = load_20newsgroups(categories=categories[0], vectorizer=vct, min_size=min_size,
                                       fix_k=args.fixk)  # for testing purposes
    elif "dummy" in args.train:
        ########## DUMMY DATA###########
        data = load_dummy("C:/Users/mramire8/Documents/code/python/data/dummy", shuffle=True,
                                rnd=2356, vct=vct, min_size=0, fix_k=args.fixk)
    else:
        raise Exception("We do not know that dataset")

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))
    #print(data.train.data[0])
    #### COST MODEL
    parameters = parse_parameters(args.cost_model)

    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(parameters)

    print "\nCost Model: %s" % cost_model.__class__.__name__

    #### ACCURACY MODEL
    # try:
    # #     accu_parameters = parse_parameters(args.accu_model)
    # except ValueError:
    accu_parameters = parse_parameters_mat(args.accu_model)
    # else
    #     print("Error: Accuracy parameters didn't work")

    print "Accuracy Parameters %s" % accu_parameters
    #if "fixed" in args.accu_function:
    #    accuracy_model = base_models.FixedAccuracyModel(accuracy_value=.7)
    #elif "log" in args.accu_function:
    #    accuracy_model = base_models.LogAccuracyModel(model=parameters)
    #elif "linear" in args.accu_function:
    #    accuracy_model = base_models.LRAccuracyModel(model=parameters)
    #else:
    #    raise Exception("We need a defined cost function options [fixed|log|linear]")
    #
    #print "\nAccuracy Model: %s " % accuracy_model

    #### CLASSIFIER
    #### Informed priors
    #feature_counts = np.ones(x_train.shape[0]) * x_train
    #feature_frequencies = feature_counts / np.sum(feature_counts)
    #alpha = feature_frequencies
    alpha = 1
    clf = MultinomialNB(alpha=alpha)
    print "\nClassifier: %s" % clf

    #### EXPERT MODEL
    #expert = baseexpert.BaseExpert()
    if "fixed" in args.expert:
        expert = baseexpert.FixedAccuracyExpert(accuracy_value=accu_parameters[0],
                                                cost_function=cost_model.cost_function)  #average value of accuracy of the experts
    elif "true" in args.expert:
        expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    elif "linear" in args.expert:
        #expert = baseexpert.LRFunctionExpert(model=[0.0019, 0.6363],cost_function=cost_model.cost_function)
        raise Exception("We do not know linear yet!!")
    elif "log" in args.expert:
        expert = baseexpert.LogFunctionExpert(model=accu_parameters, cost_function=cost_model.cost_function)
    elif "direct" in args.expert:
        expert = baseexpert.LookUpExpert(accuracy_value=accu_parameters, cost_function=cost_model.cost_function)
    else:
        raise Exception("We need a defined cost function options [fixed|log|linear]")
        #expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    print "\nExpert: %s " % expert

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200
    eval_range = 1 if (args.budget / evaluation_points) <= 0 else args.budget / evaluation_points
    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          50))

    t0 = time.time()
    ### experiment starts
    for t in range(args.trials):
        print "*" * 60
        print "Trial: %s" % t
        # TODO shuffle the data??
        #student = baselearner.BaseLearner(model=clf, cost_model=cost_model, accuracy_model=accuracy_model, budget=args.budget,
        #                                  seed=t)
        student = randomsampling.RandomSamplingLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t)
        print "\nStudent: %s " % student
        train_indices = []
        train_x = []
        train_y = []
        pool = Bunch()
        pool.data = data.train.bow.tocsr()   # full words, for training
        pool.fixk = data.train.bowk.tocsr()  # k words BOW for querying
        pool.target = data.train.target
        pool.predicted = []
        pool.kwords = np.array(data.train.kwords)  # k words
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

        #for x in pool.fixk:
        #    print x.todense().sum()

        bootstrapped = False

        current_cost = 0
        iteration = 0
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:

            if not bootstrapped:
                ## random bootstrap
                #bt = randomsampling.BootstrapRandom(random_state=t * 10)

                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                print "Bootstrap: %s " % bt.__class__.__name__
                print
            else:
                query_index = student.pick_next(pool=pool, k=step_size)

            query = pool.fixk[query_index]  # query with k words

            query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]]

            #if query_size[0] >50:
            #    print "*** %s" % pool.kwords[query_index]

            ground_truth = pool.target[query_index]
            #labels, spent = expert.label(unlabeled=query, target=ground_truth)
            if iteration == 0: ## bootstrap uses ground truth
                labels = ground_truth
            else:
                #labels = expert.label_instances(query, ground_truth)
                labels = expert.label_instances(query_size, ground_truth)
                #spent = expert.estimate_instances(pool.kwords[query_index])
            spent = expert.estimate_instances(query_size)

            query_cost = np.array(spent).sum()
            current_cost += query_cost

            train_indices.extend(query_index)

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # add labels to training
            train_x = pool.data[train_indices]  ## train with all the words

            # update labels with the expert labels
            #train_y = pool.target[train_indices]
            train_y.extend(labels)
            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # retrain the model
            current_model = student.train(train_x, train_y)
            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            #auc = metrics.roc_auc_score(data.test.target, y_probas[:,1])
            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            accu = metrics.accuracy_score(data.test.target, pred_y)

            print (
            "TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(len(train_indices), accu,
                                                                                              auc, query_cost,
                                                                                              current_cost, spent))

            ## the results should be based on the cost of the labeling
            if iteration > 0: # bootstrap iteration

                student.budget -= query_cost ## Bootstrap doesn't count

                #x_axis_range = int(current_cost / eval_range)
                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                #accuracies[len(train_indices)].append(accu)
                #aucs[len(train_indices)].append(auc)
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
            iteration += 1
    print("Elapsed time %.3f" % (time() - t0))
    print_results(x_axis, accuracies, aucs)
Example #20
# read a bunch object
def readbunchobj(path):
	file_obj = open(path, "rb")
	bunch = pickle.load(file_obj)
	file_obj.close()
	return bunch
# write a bunch object
def writebunchobj(path, bunchobj):
	file_obj = open(path, "wb")
	pickle.dump(bunchobj, file_obj)
	file_obj.close()

# 1. Read the stop-word list
stopword_path = "train_word_bag/hlt_stop_words.txt"
stpwrdlst = readfile(stopword_path).splitlines()

# 2. Load the segmented word-vector bunch object
path = "test_word_bag/test_set.dat"        # path of the saved word-vector space
bunch = readbunchobj(path)

# 3. Build the test-set tf-idf vector space
testspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[], vocabulary={})
# 4. Load the training-set word bag
trainbunch = readbunchobj("train_word_bag/tfdifspace.dat")
# 5. Initialize the vector space model with TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, vocabulary=trainbunch.vocabulary)
transformer = TfidfTransformer()  # this class computes the tf-idf weight of each term
# Convert the text to a tf-idf matrix; the vocabulary is saved separately
testspace.tdm = vectorizer.fit_transform(bunch.contents)
testspace.vocabulary = trainbunch.vocabulary

# Persist the word bag
space_path = "test_word_bag/testspace.dat"        # path for saving the word-vector space
writebunchobj(space_path,testspace)

print "test词向量空间创建成功!!!"
Example #21
def fetch_atlas_surf_destrieux(data_dir=None, url=None,
                               resume=True, verbose=1):
    """Download and load Destrieux et al, 2010 cortical atlas.

    This atlas returns 76 labels per hemisphere based on sulco-gyral patterns
    as distributed with Freesurfer in fsaverage5 surface space.

    .. versionadded:: 0.3

    Parameters
    ----------
    data_dir: str, optional
        Path of the data directory. Used to force data storage in a non-
        standard location. Default: None

    url: str, optional
        Download URL of the dataset. Overwrite the default URL.

    resume: bool, optional (default True)
        If True, try resuming download if possible.

    verbose: int, optional (default 1)
        Defines the level of verbosity of the output.

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, contains:

        - "labels": list
                     Contains region labels

        - "map_left": numpy.ndarray
                      Index into 'labels' for each vertex on the
                      left hemisphere of the fsaverage5 surface

        - "map_right": numpy.ndarray
                       Index into 'labels' for each vertex on the
                       right hemisphere of the fsaverage5 surface

        - "description": str
                         Details about the dataset


    References
    ----------
    Destrieux et al. (2010), Automatic parcellation of human cortical gyri and
    sulci using standard anatomical nomenclature. NeuroImage 53, 1-15.
    """

    if url is None:
        url = "https://www.nitrc.org/frs/download.php/"

    dataset_name = 'destrieux_surface'
    fdescr = _get_dataset_descr(dataset_name)
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)

    # Download annot files, fsaverage surfaces and sulcal information
    annot_file = '%s.aparc.a2009s.annot'
    annot_url = url + '%i/%s.aparc.a2009s.annot'
    annot_nids = {'lh annot': 9343, 'rh annot': 9342}

    annots = []
    for hemi in [('lh', 'left'), ('rh', 'right')]:

        annot = _fetch_files(data_dir,
                             [(annot_file % (hemi[1]),
                               annot_url % (annot_nids['%s annot' % hemi[0]],
                                            hemi[0]),
                              {'move': annot_file % (hemi[1])})],
                             resume=resume, verbose=verbose)[0]
        annots.append(annot)

    annot_left = nb.freesurfer.read_annot(annots[0])
    annot_right = nb.freesurfer.read_annot(annots[1])

    return Bunch(labels=annot_left[2],  map_left=annot_left[0],
                 map_right=annot_right[0], description=fdescr)
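
# Usage sketch: map_left / map_right hold one label index per fsaverage5
# vertex, pointing into the 'labels' list documented above.
destrieux = fetch_atlas_surf_destrieux()
print(len(destrieux.labels))
print(destrieux.map_left.shape)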
Example #22
def create_dataset(subject_id):
    import numpy as np
    import os 
    from nilearn import datasets
    from nilearn.datasets import _get_dataset_dir
    from nilearn.datasets import _get_dataset
    from sklearn.datasets.base import Bunch
    import pylab as pl
    import nibabel as nb
    
    from remove import remove_range, remove

    dataset_name = 'machine_learning'
    runs = 4
    img_data = np.zeros((64,64,33,1))
    lab_data = []
    session_data = []
    for r in range(runs):
        print 'RUN', r
        rv = None
        path = '/gablab/p/eegfmri/analysis/eeg/elists'
        path_all_codes = '/gablab/p/eegfmri/analysis/iaps/all_labels.txt'
        path_names2 = os.path.join(path, 'elist_IAPS_%s_%s_raw.txt' %(subject_id, r+1))
        if subject_id == '009':
            path_names2 = os.path.join(path, 'elist_IAPS_%s_%s.txt' %(subject_id, r+1)) 
        eegcodes = np.genfromtxt(path_all_codes, dtype=int) [:, 0]
        attributes = np.genfromtxt(path_all_codes, dtype=float) [:, 1:4]
        binary = attributes[:, 2]
        run_code = np.genfromtxt(path_names2, dtype=str) [:,3]
        clock = np.genfromtxt(path_names2, dtype=str) [:,4] 
        cl = []
        tp = []
        for i in range(len(clock)):
            if run_code[i] == 'R128':
                timepoint = clock[i].lstrip('0123456789')  
                tp.append(timepoint)            
            if len(tp) > 0:
                clock[i] = clock[i].lstrip('0123456789')
                if clock[i] == tp[0]:
                    cl.append([i])
                    if run_code[i] != 'R128':
                        print i, run_code[i] 
                if clock[i] != tp[0] and run_code[i] == 'R128':
                    print 'TR at index', i, 'removed.'
                    run_code[i] = 'remove'
        print 'Numbers of TR identical timepoints', len(cl)
        tr = []
        for idx,i in enumerate(run_code):
            if i == 'R128':
                tr.append([idx])
        print 'Number of TR counted from elist code', len(tr)
        rv = remove(run_code, 'R')
        rv = remove(rv, 'remove')
        rv = remove(rv, 'boundary')
        rv = remove(rv, 'SyncOn')
        rv = remove(rv, 'Start')
        rv = remove(rv, 'Userdefined')
        rv = remove(rv, 'LowCorrelation')
        rv = remove(rv, 'TSTART')
        rv = remove(rv, 'TPEAK')
        rv = remove(rv, 'TEND')
        for i in range(len(rv)):
            if rv[i] == 'R128':
                rv[i] = '-99'
            rv[i] = rv[i].lstrip('S')
            rv[i] = int(rv[i])
        # remove stimulus codes for responses
        rv = remove_range(rv, 240)
        for idx, i in enumerate(rv):
            for idx2, i2 in enumerate(eegcodes):
                if i == i2:
                    rv[idx] = binary[idx2]            
        for idx, i in enumerate(rv):
            if i != -99:
                rv[idx-1] = i
                rv[idx] = 0
        # remove last TR as it was apparently not recorded
        rv[-1] = 0
        rv = remove(rv, 0)
        for idx, i in enumerate(rv):
            if i == -99:
                rv[idx] = 0
        
        # Up to this point the list of negative / neutral labels still contains zeros, which we want to get rid of.
        # To do this, we replace each zero with the code shown just before it.
        # The first two values are deleted, as are the first two TRs (after fmri_data_i gets assigned).
        
        for idx, z in enumerate(rv):
            if idx <= 2 and z == 0:
                rv[idx] = -77
            if idx > 2 and z == 0:
                rv[idx] = rv[idx-1]
                
        for idx, z in enumerate(rv):
            if idx <= 1 and z != -77:
                print 'Warning, non-empty first two TRs were deleted.'
        
        rv = remove(rv, -77)
        unique = sorted(list(set(rv)))
        print 'Unique values in RV', unique  
        
        t = open('/gablab/p/eegfmri/analysis/iaps/pilot%s/machine_learning/neg-neutr_attributes_run%s.txt' %(subject_id, r), 'w')
        for i in range(len(rv)):
            t.write("%s %s" %(rv[i], r))
            t.write('\n')  
        t.close()
        
        print 'Labels Length:', len(rv)
        file_name = ['neg-neutr_attributes_run%s.txt' %(r), 'pilot%s_r0%s_bandpassed.nii.gz' %(subject_id, r)]
        fil = _get_dataset(dataset_name, file_name, data_dir='/gablab/p/eegfmri/analysis/iaps/pilot%s' %(subject_id), folder=None)
        ds_i = Bunch(func=fil[1], conditions_target=fil[0])
        labels_i = np.loadtxt(ds_i.conditions_target, dtype=np.str)
        bold_i = nb.load(ds_i.func)
        fmri_data_i = np.copy(bold_i.get_data())
        print 'Original fMRI data', fmri_data_i.shape
        
        fmri_data_i = fmri_data_i[...,2:]
        print fmri_data_i.shape
        
        affine = bold_i.get_affine()
        mean_img_i = np.mean(fmri_data_i, axis=3)
        session_data = np.append(session_data, labels_i[:,1])
        lab_data = np.append(lab_data, labels_i[:,0])
        img_data = np.concatenate((img_data, fmri_data_i), axis=3)        
        print '__________________________________________________________________________________________________________'
        
        
        if r == 3:
            img_data = img_data[...,1:]
            print 'fMRI image', img_data.shape
            print 'Label Vector Length:', len(lab_data), 'Session Vector Length:', len(session_data)
            ni_img = nb.Nifti1Image(img_data, affine=None, header=None)
            nb.save(ni_img, '/gablab/p/eegfmri/analysis/iaps/pilot%s/machine_learning/all_runs.nii' %(subject_id))
            f = open('/gablab/p/eegfmri/analysis/iaps/pilot%s/machine_learning/neg-neutr_attributes_all_runs.txt' %(subject_id), 'w')
            for i in range(len(lab_data)):
                f.write("%s %s" %(lab_data[i], session_data[i]))
                f.write('\n')  
            f.close()
            # set up concatenated dataset in nilearn format
            file_names = ['neg-neutr_attributes_all_runs.txt', 'all_runs.nii']
            files = _get_dataset(dataset_name, file_names, data_dir='/gablab/p/eegfmri/analysis/iaps/pilot%s' %(subject_id), folder=None)
            ds = Bunch(func=files[1], conditions_target=files[0])
            print ds.keys(), ds
            labels = np.loadtxt(ds.conditions_target, dtype=np.str)
            bold = nb.load(ds.func)
            fmri_data = np.copy(bold.get_data())
            print fmri_data.shape
            affine = bold_i.get_affine() # just choose one
            # Compute the mean EPI: we do the mean along the axis 3, which is time
            mean_img = np.mean(fmri_data, axis=3)
            
    return (ds, labels, bold, fmri_data, affine, mean_img) # later 'ds' will be sufficient
Example #23
def main():
    print args
    print

    accuracies = defaultdict(lambda: [])

    ora_accu = defaultdict(lambda: [])

    oracle_accuracies =[]
    ora_cm = defaultdict(lambda: [])
    lbl_dit = defaultdict(lambda: [])

    aucs = defaultdict(lambda: [])

    x_axis = defaultdict(lambda: [])

    vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = 10

    args.fixk = None

    data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = experiment_utils.parse_parameters_mat(args.cost_model)

    print "Cost Parameters %s" % parameters

    cost_model = experiment_utils.set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    ### SENTENCE TRANSFORMATION
    if args.train == "twitter":
        sent_detector = TwitterSentenceTokenizer()
    else:
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## replace <br> with "." so it is recognized as an end of sentence
    data.train.data = experiment_utils.clean_html(data.train.data)
    data.test.data = experiment_utils.clean_html(data.test.data)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
    ## Get the features of the sentence dataset

    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    if not args.fulloracle:
        train_test_data = Bunch()

        expert_data.sentence, train_test_data.pool = split_data(data.train)
        expert_data.oracle, train_test_data.test = split_data(data.test)

        data.train.data = train_test_data.pool.train.data
        data.train.target = train_test_data.pool.train.target

        data.test.data = train_test_data.test.train.data
        data.test.target = train_test_data.test.train.target

    ## convert document to matrix
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")
    exp_clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)

    if not args.fulloracle:
        print "Training expert documents:%s" % len(expert_data.oracle.train.data)
        labels, sent_train = experiment_utils.split_data_sentences(expert_data.oracle.train, sent_detector, vct, limit=args.limit)

        expert_data.oracle.train.data = sent_train
        expert_data.oracle.train.target = np.array(labels)
        expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)

        exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)
    else:
        # expert_data.data = np.concatenate((data.train.data, data.test.data))
        # expert_data.target = np.concatenate((data.train.target, data.test.target))
        expert_data.data =data.train.data
        expert_data.target = data.train.target
        expert_data.target_names = data.train.target_names
        labels, sent_train = experiment_utils.split_data_sentences(expert_data, sent_detector, vct, limit=args.limit)
        expert_data.bow = vct.transform(sent_train)
        expert_data.target = labels
        expert_data.data = sent_train
        exp_clf.fit(expert_data.bow, expert_data.target)

    if "neutral" in args.expert:
        expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                             cost_function=cost_model.cost_function)
    elif "true" in args.expert:
        expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    elif "pred" in args.expert:
        expert = baseexpert.PredictingExpert(exp_clf,  #threshold=args.neutral_threshold,
                                             cost_function=cost_model.cost_function)
    elif "human" in args.expert:
        expert = baseexpert.HumanExpert(", ".join(["{}={}".format(a,b) for a,b in enumerate(data.train.target_names)])+"? > ")
    else:
        raise Exception("We need an expert!")

    print "\nExpert: %s " % expert

    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    sent_clf = None
    if args.cheating:
        labels, sent_train = experiment_utils.split_data_sentences(expert_data.sentence.train, sent_detector, vct, limit=args.limit)

        expert_data.sentence.train.data = sent_train
        expert_data.sentence.train.target = np.array(labels)
        expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)
        sent_clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)
        sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    #### STUDENT CLASSIFIER
    clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)

    print "\nStudent Classifier: %s" % clf
    print "\nSentence Classifier: %s" % sent_clf
    print "\nExpert Oracle Classifier: %s" % exp_clf
    print "\nPenalty Oracle:", exp_clf.C
    print "\nVectorizer: %s" % vct
    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          min_size))
    print ("Anytime active learning experiment - use objective function to pick data")
    t0 = time.time()
    tac = []
    tau = []
    ### experiment starts
    for t in range(args.trials):
        trial_accu = []

        trial_aucs = []

        print "*" * 60
        print "Trial: %s" % t

        student = get_student(clf, cost_model, sent_clf, sent_detector, vct)
        student.human_mode = args.expert == 'human'

        print "\nStudent: %s " % student

        train_indices = []
        neutral_data = []  # save the xik vectors
        train_x = []
        train_y = []
        neu_x = []  # data to train the classifier
        neu_y = np.array([])

        pool = Bunch()
        pool.data = data.train.bow.tocsr()  # full words, for training
        pool.text = data.train.data
        pool.target = data.train.target
        pool.predicted = []
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

        bootstrapped = False
        current_cost = 0
        iteration = 0
        query_index = None
        query_size = None
        oracle_answers = 0
        calibrated=args.calibrate
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
            util = []

            if not bootstrapped:
                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                query = pool.data[query_index]
                print "Bootstrap: %s " % bt.__class__.__name__
                print
            else:

                chosen = student.pick_next(pool=pool, step_size=step_size)

                query_index = [x for x, y in chosen]  # document id of chosen instances
                query = [y[0] for x, y in chosen]  # sentence of the document

                query_size = [1] * len(query_index)

            ground_truth = pool.target[query_index]

            if iteration == 0:  ## bootstrap uses ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth)  ## bootstrap cost is ignored
            else:
                # print "ask labels"
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            ### accumulate the cost of the query
            query_cost = np.array(spent).sum()
            current_cost += query_cost

            useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None])

            neutral_answers = np.array([[x, z] for x, y, z in zip(query_index, labels, query_size) if y is None]) \
                if iteration != 0 else np.array([])

            ## add data recent acquired to train
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])

                # add labels to training
                train_x = pool.data[train_indices]  # # train with all the words

                # update labels with the expert labels
                train_y.extend(useful_answers[:, 1])

            neu_x, neu_y, neutral_data = update_sentence(neutral_data, neu_x, neu_y, labels, query_index, pool, vct)
            # neu_x, neu_y, neutral_data = update_sentence_query(neutral_data, neu_x, neu_y, query, labels)

            if neu_y.shape[0] != neu_x.shape[0]:
                raise Exception("Training data corrupted!")
            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # retrain the model
            current_model = student.train_all(train_x, train_y, neu_x, neu_y)

            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            correct_labels = (np.array(ground_truth) == np.array(labels).reshape(len(labels))).sum()

            accu = metrics.accuracy_score(data.test.target, pred_y)

            print ("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tGT:{5}\tneu:{6}\t{7}\tND:{8}\tTD:{9}\t ora_accu:{10}".format(
                len(train_indices),
                accu,
                auc, query_cost,
                current_cost,
                ground_truth,
                len(neutral_answers), neu_y.shape[0], neu_y.sum(), np.array(train_y).sum(), correct_labels))

            ## the results should be based on the cost of the labeling
            if iteration > 0:  # bootstrap iteration

                student.budget -= query_cost  ## Bootstrap doesn't count
                # oracle accuracy (from queries)
                oracle_answers += correct_labels
                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
                ora_accu[x_axis_range].append(1. * correct_labels)
                ora_cm[x_axis_range].append(metrics.confusion_matrix(ground_truth, labels, labels=np.unique(train_y)))
                lbl_dit[x_axis_range].append(np.sum(train_y))
                # partial trial results
                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])
                # oracle_accuracies[x_axis_range].append(oracle_answers)
            iteration += 1
            # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
        oracle_accuracies.append(1.*oracle_answers / (len(train_indices)-bootstrap_size))
        print "Trial: {}, Oracle right answers: {}, Iteration: {}, Labels:{}, ACCU-OR:{}".format(t, oracle_answers,
                 iteration, len(train_indices)-bootstrap_size,1.*oracle_answers / (len(train_indices)-bootstrap_size))
        #end trial loop
    if args.cost_function not in "uniform":
        accuracies = experiment_utils.extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size)
        aucs = experiment_utils.extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size)
    print "\nAverage oracle accuracy: ", np.array(oracle_accuracies).mean()
    print("Elapsed time %.3f" % (time.time() - t0))
    cheating = "CHEATING" if args.cheating else "NOCHEAT"
    experiment_utils.print_extrapolated_results(accuracies, aucs, file_name=args.train+"-"+cheating+"-"+args.prefix+"-"+args.classifier+"-"+args.student)
    experiment_utils.oracle_accuracy(ora_accu, file_name=args.train+"-"+cheating+"-"+args.prefix+"-"+args.classifier+"-"+args.student, cm=ora_cm, num_trials=args.trials)
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets.base import Bunch  # import the Bunch class
import pickle  # pickle is used for persistence
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
##################################################################
## Load the data
categories = ["alt.atheism", "soc.religion.christian", "comp.graphics", "sci.med"]  # news categories to download
data_set = fetch_20newsgroups(subset="train", categories=categories, shuffle=True, random_state=42)  # download and load the training data (the full archive is downloaded first, then the selected categories are extracted)
print(data_set.target_names)  # ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
##################################################################
## Define the word-bag data structure
# tdm: the word bag after tf-idf computation
stpwrdlst = []  # the stop-word list is empty
wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={}, stpwrdlst=[])
wordbag.target_name = data_set.target_names
wordbag.label = data_set.target
wordbag.filenames = data_set.filenames
wordbag.stpwrdlst = stpwrdlst

vectorizer = CountVectorizer(stop_words=stpwrdlst)  # initialize the vector space model (bag of words) with CountVectorizer
transformer = TfidfTransformer()  # computes the tf-idf weight of every term
fea_train = vectorizer.fit_transform(data_set.data)  # turn the text into a term-frequency matrix
print(fea_train.shape)  # (2257, 35788); 2257 documents, 35788 terms

wordbag.tdm = fea_train  # assign the term matrix to tdm
wordbag.vocabulary = vectorizer.vocabulary_
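The comments above refer to tf-idf weighting, but `fea_train` only contains raw term counts from CountVectorizer. A minimal sketch (not part of the original example) of how the already-created `transformer` could be applied to obtain tf-idf weights:

# Hypothetical follow-up step: convert the count matrix into tf-idf weights.
tfidf_matrix = transformer.fit_transform(fea_train)  # same shape as fea_train, tf-idf weighted
# wordbag.tdm = tfidf_matrix  # uncomment to store tf-idf weights instead of raw counts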
##################################################################
## Persist the word bag
file_obj = open("tmp.data", "wb")
pickle.dump(wordbag, file_obj)  # the original snippet ends at open(); dumping and closing are the obvious next steps
file_obj.close()
Example #25
def fetch_mixed_gambles(n_subjects=1, data_dir=None, url=None, resume=True, return_raw_data=False, verbose=0):
    """Fetch Jimura "mixed gambles" dataset.

    Parameters
    ----------
    n_subjects: int, optional (default 1)
        The number of subjects to load. If None is given, all the
        subjects are used.

    data_dir: string, optional (default None)
        Path of the data directory. Used to force data storage in a specified
        location. Default: None.

    url: string, optional (default None)
        Override download URL. Used for test only (or if you setup a mirror of
        the data).

    resume: bool, optional (default True)
        If true, try resuming download if possible.

    verbose: int, optional (default 0)
        Defines the level of verbosity of the output.

    return_raw_data: bool, optional (default False)
        If False, the data will be transformed into an (X, y) pair suitable
        for machine learning routines. X is a list of n_subjects * 48
        Nifti1Image objects (where 48 is the number of trials),
        and y is an array of shape (n_subjects * 48,).

    smooth: float, or list of 3 floats, optional (default 0.)
        Size of smoothing kernel to apply to the loaded zmaps.

    Returns
    -------
    data: Bunch
        Dictionary-like object. The attributes of interest are:
        'zmaps': string list
            Paths to realigned gain betamaps (one nifti per subject).
        'gain': list of Nifti1Image or None
            If return_raw_data is False, a list of n_subjects * 48
            Nifti1Image objects, else None.
        'y': array of shape (n_subjects * 48,) or None
            If return_raw_data is False, an array of shape
            (n_subjects * 48,), else None.

    References
    ----------
    [1] K. Jimura and R. Poldrack, "Analyses of regional-average activation
        and multivoxel pattern information tell complementary stories",
        Neuropsychologia, vol. 50, page 544, 2012
    """
    if n_subjects > 16:
        warnings.warn("Warning: there are only 16 subjects!")
        n_subjects = 16
    if url is None:
        url = "https://www.nitrc.org/frs/download.php/7229/" "jimura_poldrack_2012_zmaps.zip"
    opts = dict(uncompress=True)
    files = [("zmaps%ssub%03i_zmaps.nii.gz" % (os.sep, (j + 1)), url, opts) for j in range(n_subjects)]
    data_dir = _get_dataset_dir("jimura_poldrack_2012_zmaps", data_dir=data_dir)
    zmap_fnames = _fetch_files(data_dir, files, resume=resume, verbose=verbose)
    data = Bunch(zmaps=zmap_fnames)
    if not return_raw_data:
        X, y, mask_img = _load_mixed_gambles(map(nibabel.load, data.zmaps))
        data.zmaps, data.gain, data.mask_img = X, y, mask_img
    return data
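A minimal usage sketch for the fetcher above, assuming the private download helpers it relies on (_get_dataset_dir, _fetch_files) and nibabel are importable; the keyword values are illustrative only:

gambles = fetch_mixed_gambles(n_subjects=2, return_raw_data=True)
print(len(gambles.zmaps))  # one z-map path per requested subject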
def get_WS353_set2():
    data = pd.read_csv(WS353_set2_path, sep="\t", header=0).values
    return Bunch(X=data[:, 0:2].astype("object"),
                 y=data[:, 2].astype(np.float))
Example #27
def main():
    accuracies = defaultdict(lambda: [])

    aucs = defaultdict(lambda: [])

    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()
    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(100, args.fixk)

    fixk_saved = "{0}{1}.p".format(args.train, args.fixk)

    try:
        fixk_file = open(fixk_saved, "rb")
        data = pickle.load(fixk_file)
    except IOError:
        data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5)
        fixk_file = open(fixk_saved, "wb")
        pickle.dump(data, fixk_file)

    # data = load_dataset(args.train, args.fixk, categories[0], vct, min_size)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = parse_parameters_mat(args.cost_model)

    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__


    #### STUDENT CLASSIFIER
    clf = linear_model.LogisticRegression(penalty="l1", C=1)
    print "\nStudent Classifier: %s" % clf

    #### EXPERT CLASSIFIER

    exp_clf = linear_model.LogisticRegression(penalty='l1', C=.3)
    exp_clf.fit(data.test.bow, data.test.target)
    expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                         cost_function=cost_model.cost_function)
    print "\nExpert: %s " % expert

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          min_size))
    print ("Cheating experiment - use full uncertainty query k words")
    t0 = time.time()
    ### experiment starts
    tx =[]
    tac = []
    tau = []
    for t in range(args.trials):
        trial_accu =[]

        trial_aucs = []

        trial_x_axis = []
        print "*" * 60
        print "Trial: %s" % t

        student = randomsampling.UncertaintyLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t)
        print "\nStudent: %s " % student
        train_indices = []
        train_x = []
        train_y = []
        pool = Bunch()
        pool.data = data.train.bow.tocsr()   # full words, for training
        pool.fixk = data.train.bowk.tocsr()  # k words BOW for querying
        pool.target = data.train.target
        pool.predicted = []
        pool.kwords = np.array(data.train.kwords)  # k words
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

        bootstrapped = False

        current_cost = 0
        iteration = 0
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:

            if not bootstrapped:
                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                print "Bootstrap: %s " % bt.__class__.__name__
                print
            else:
                query_index = student.pick_next(pool=pool, k=step_size)

            query = pool.fixk[query_index]  # query with k words

            query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]]

            ground_truth = pool.target[query_index]
            #labels, spent = expert.label(unlabeled=query, target=ground_truth)
            if iteration == 0: ## bootstrap uses ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth) ## bootstrap cost is ignored
            else:
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)


            ## add data recent acquired to train
            ## CHANGE: if label is not useful, ignore and do not charge money for it
            useful_answers = np.array([[x, y, z] for x, y, z in zip(query_index, labels, spent) if y is not None])

            # train_indices.extend(query_index)
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])

                # add labels to training
                train_x = pool.data[train_indices]  ## train with all the words

                # update labels with the expert labels
                train_y.extend(useful_answers[:, 1])

                #count for cost
                ### accumulate the cost of the query
                # query_cost = np.array(spent).sum()
                # current_cost += query_cost
                query_cost = useful_answers[:, 2]
                query_cost = np.sum(query_cost)
                current_cost += query_cost

            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # retrain the model
            current_model = student.train(train_x, train_y)

            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            accu = metrics.accuracy_score(data.test.target, pred_y)

            print ("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(len(train_indices),
                                                                                            accu,
                                                                                            auc, query_cost,
                                                                                            current_cost, spent))

            ## the results should be based on the cost of the labeling
            if iteration > 0:   # bootstrap iteration

                student.budget -= query_cost ## Bootstrap doesn't count

                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)

                ## partial trial results

                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])
            iteration += 1

        # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
    #end trial loop

    accuracies = extrapolate_trials(tac)
    aucs = extrapolate_trials(tau)

    print("Elapsed time %.3f" % (time.time() - t0))
    print_extrapolated_results(accuracies, aucs)
Example #28

# Read a file
def readfile(path):
    fp = open(path, "rb")
    content = fp.read()
    fp.close()
    return content


# The Bunch class provides a key/value, attribute-style object
# target_name: list of all category names
# label: list with the category label of each file
# filenames: file paths
# contents: the segmented word-vector form of each file
bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])

wordbag_path = "test_word_bag/test_set.dat"  # path where the word-bag Bunch of the test set is saved
seg_path = "test_corpus_seg/"  # path of the segmented, categorized corpus

catelist = os.listdir(seg_path)  # list all sub-directories under seg_path
bunch.target_name.extend(catelist)
# walk every file in every category directory
for mydir in catelist:
    class_path = seg_path + mydir + "/"  # build the path of the category sub-directory
    file_list = os.listdir(class_path)  # list all files under class_path
    for file_path in file_list:  # iterate over the files of this category
        fullname = class_path + file_path  # build the full path of the file
        bunch.label.append(mydir)
        bunch.filenames.append(fullname)
        bunch.contents.append(readfile(fullname).strip())  # read the file content
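A minimal sketch (not shown in the original snippet) of how the filled `bunch` could be persisted to `wordbag_path`, assuming the same pickle-style persistence used in the other examples:

import pickle

with open(wordbag_path, "wb") as file_obj:
    pickle.dump(bunch, file_obj)  # write the test-set Bunch to test_word_bag/test_set.dat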
Example #29
    allBunch.tfidf = vector2tfidf.fit_transform(allBunch.vector,
                                                allBunch.tfidf)
    allBunch.multi_labels = MultiLabelBinarizer(
        classes=ACCU_LIST).fit_transform(allBunch.labels)
    joblib.dump(allBunch,
                ALL_BUNCH_FILE)  # save the train/test bunches so later programs can load them directly and save time

    # allBunch = joblib.load(ALL_BUNCH_FILE)  # load the saved allBunch
    print(allBunch.vector.shape)
    print(np.max(allBunch.vector))
    allBunch.vector = allBunch.vector / np.max(allBunch.vector)
    # First cross-validate allBunch with multi-label training, without any feature selection
    trainBunch = Bunch(labels=[],
                       multi_labels=[],
                       contents=[],
                       vector=[],
                       selectVector=[],
                       tfidf=[],
                       selectTfidf=[])
    testBunch = Bunch(labels=[],
                      multi_labels=[],
                      contents=[],
                      vector=[],
                      selectVector=[],
                      tfidf=[],
                      selectTfidf=[])
    trainBunch.multi_labels, testBunch.multi_labels,trainBunch.vector, testBunch.vector , trainBunch.tfidf, testBunch.tfidf\
        = train_test_split(allBunch.multi_labels, allBunch.vector,  allBunch.tfidf,test_size=0.3)
    clf = DecisionTreeClassifier()
    print("正在进行训练》》》》》》")
    st = time.time()
Example #30
def fetch_imagesets(
        data_folder_path="../coral_labeling/Labels",
        funneled=True,
        resize=None,
        min_images_per_category=0,
        color=False,
        hue=0,
        restrict=None,
        #slice_=(slice(0, 255), slice(0, 318)),
        slice_=None,
        download_if_missing=True):
    """Loader for images.

    This dataset is a collection of JPEG pictures 

    Each pixel of each channel
    (color in RGB) is encoded by a float in range 0.0 - 1.0.

    Parameters
    ----------
    data_folder_path: optional, default: "../coral_labeling/Labels"
        Path of the folder that contains the labelled image sub-folders.
    funneled: boolean, optional, default: True
        Download and use the funneled variant of the dataset.

    resize: float, optional, default None
        Ratio used to resize each image.

    min_images_per_category: int, optional, default 0
        The extracted dataset will only retain categories that have at
        least `min_images_per_category` different pictures.

    color: boolean, optional, default False
        Keep the 3 RGB channels instead of averaging them to a single
        gray level channel. If color is True the shape of the data has
        one more dimension than the shape with color = False.

    slice_: optional
        Provide a custom 2D slice (height, width) to extract the
        'interesting' part of the jpeg files and avoid using statistical
        correlations from the background.

    download_if_missing: optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.
        THIS IS NOT SUPPORTED SINCE IMAGES MUST ALWAYS COME FROM LOCAL FILES.

    hue: optional, 0 by default
        Return HSV images rather than intensity or RGB images.

    Returns
    -------
    dataset : dict-like object with the following attributes:

    dataset.data : numpy array of shape (13233, 2914)
        Each row corresponds to a ravelled image of original size 62 x 47
        pixels.

    dataset.images : numpy array of shape (13233, 62, 47)
        Each row is an image corresponding to one of the 5749 categories in
        the dataset.

    dataset.target : numpy array of shape (13233,)
        Labels associated to each image. Those labels range from 0-5748
        and correspond to the category IDs.

    restrict : restrict category selection.  If None, all category directories
        within the data folder are used, if a string, only the specified category 
        is used, if 0 (zero) then files in the main data folder (only) are used, 
        if 1 (unimplemented) all subcategories are merged into one.

    dataset.target_names : names of the categories (folders with images in them)

    dataset.paths : pathnames to the individual images

    dataset.huespaces : array of vectors summarizing hue information

    dataset.DESCR : string
    """
    #images_home = "/Volumes/Macintosh_HD/Users/dudek/Code/coral_labeling/Labelsx"

    # wrap the loader in a memoizing function that will return memmaped data
    # arrays for optimal memory usage
    #GD m = Memory(cachedir=images_home, compress=6, verbose=0)
    #GD load_func = m.cache(_fetch_imagesets)

    # load and memoize the pairs as np arrays
    #GD images, target, target_names = load_func(
    images, target, target_names, paths, huespaces = _fetch_imagesets(
        data_folder_path,
        resize=resize,
        restrict=restrict,
        min_images_per_category=min_images_per_category,
        color=color,
        hue=hue,
        slice_=slice_)

    # pack the results as a Bunch instance
    # return Bunch(data=images.reshape(len(images), -1), images=images,
    return Bunch(data=images,
                 images=images,
                 target=target,
                 target_names=target_names,
                 paths=paths,
                 huespaces=huespaces,
                 DESCR="coral dataset")
Example #31
    # bunch_data
    bounch_path = '/Users/slade/Documents/YMM/Code/UCGPCG/src/jobs/terror_recognition/train_model/baseline/bounch_data'
    build_bounch(train_comment_data, bounch_path)
    wordbag_path = "/Users/slade/Documents/YMM/Code/UCGPCG/src/jobs/terror_recognition/train_model/baseline/model_data/bunch_set.dat"
    corpus2Bunch(wordbag_path, bounch_path)

    # tfidf/cut sequence
    stopword_path = "/Users/slade/Documents/YMM/Code/UCGPCG/src/jobs/terror_recognition/train_model/stop_words.txt"
    bunch_path = wordbag_path
    tri_space_path = '/Users/slade/Documents/YMM/Code/UCGPCG/src/jobs/terror_recognition/train_model/baseline/model_data/tri_space.dat'
    stpwrdlst = readfile(stopword_path).splitlines()

    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(target_name=bunch.target_name,
                       label=bunch.label,
                       filenames=bunch.filenames,
                       tdm=[],
                       vocabulary={})
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst,
                                 sublinear_tf=True,
                                 max_df=0.5,
                                 token_pattern=r"(?u)\b\w+\b",
                                 ngram_range=(1, 3),
                                 max_features=30000)
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
    _writebunchobj(tri_space_path, tfidfspace)

    tri_train_set = _readbunchobj(tri_space_path)
    mnb_tri = MultinomialNB(alpha=0.001)
    mnb_tri.fit(tri_train_set.tdm, tri_train_set.label)
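A minimal follow-up sketch (not in the original snippet) showing how the fitted vocabulary could be reused to vectorize and classify new, already-segmented text; `new_docs` is a placeholder:

    new_docs = ["some segmented text"]  # placeholder: one space-separated document
    test_vectorizer = TfidfVectorizer(stop_words=stpwrdlst,
                                      sublinear_tf=True,
                                      token_pattern=r"(?u)\b\w+\b",
                                      ngram_range=(1, 3),
                                      vocabulary=tfidfspace.vocabulary)
    new_tdm = test_vectorizer.fit_transform(new_docs)
    print(mnb_tri.predict(new_tdm))  # predicted label(s) for the new text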
Example #32
def test_loads_dumps_bunch():
    bunch = Bunch(x="x")
    bunch_from_pkl = loads(dumps(bunch))
    bunch_from_pkl.x = "y"
    assert_equal(bunch_from_pkl['x'], bunch_from_pkl.x)
Example #33
def fetch_multilingual_SimLex999(which="EN"):
    """
    Fetch Multilingual SimLex999 dataset for testing attributional similarity

    Parameters
    -------
    which : "EN", "RU", "IT" or "DE" for language

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,
        'sd': vector of sd of scores,

    References
    ----------
    Published at http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html.

    Notes
    -----
    Scores for EN are different than the original SimLex999 dataset.

    Authors description:
    Multilingual SimLex999 resource consists of translations of the SimLex999 word similarity data set to
    three languages: German, Italian and Russian. Each of the translated datasets is scored by
    13 human judges (crowdworkers) - all fluent speakers of its language. For consistency, we
    also collected human judgments for the original English corpus according to the same protocol
    applied to the other languages. This dataset allows to explore the impact of the "judgement language"
    (the language in which word pairs are presented to the human judges) on the resulted similarity scores
    and to evaluate vector space models on a truly multilingual setup (i.e. when both the training and the
    test data are multilingual).
    """
    if which == "EN":
        data = _get_as_pd(
            'https://www.dropbox.com/s/nczc4ao6koqq7qm/EN-MSIM999.txt?dl=1',
            'similarity',
            header=None,
            encoding='utf-8',
            sep=" ")
    elif which == "DE":
        data = _get_as_pd(
            'https://www.dropbox.com/s/ucpwrp0ahawsdtf/DE-MSIM999.txt?dl=1',
            'similarity',
            header=None,
            encoding='utf-8',
            sep=" ")
    elif which == "IT":
        data = _get_as_pd(
            'https://www.dropbox.com/s/siqjagyz8dkjb9q/IT-MSIM999.txt?dl=1',
            'similarity',
            header=None,
            encoding='utf-8',
            sep=" ")
    elif which == "RU":
        data = _get_as_pd(
            'https://www.dropbox.com/s/3v26edm9a31klko/RU-MSIM999.txt?dl=1',
            'similarity',
            header=None,
            encoding='utf-8',
            sep=" ")
    else:
        raise RuntimeError("Unrecognized 'which' parameter; expected one of EN, DE, IT, RU")

    # We basically select all the columns available
    X = data.values[:, 0:2]
    scores = data.values[:, 2:].astype(np.float)
    y = np.mean(scores, axis=1)
    sd = np.std(scores, axis=1)

    return Bunch(X=X.astype("object"), y=y, sd=sd)
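A minimal usage sketch for the fetcher above (network access to the Dropbox files listed in the code is assumed):

msimlex_de = fetch_multilingual_SimLex999(which="DE")
print(msimlex_de.X[:3])   # word pairs
print(msimlex_de.y[:3])   # mean similarity judgements
print(msimlex_de.sd[:3])  # standard deviation of the judgements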
Example #34
def load_student_grades(return_X_y=False, y_type='G3'):
    # NOTE: this dict mixes the encodings of several categorical columns, so the
    # repeated 'other' keys collapse to the last value when the dict is built.
    strings = {
        'at_home': 1,
        'health': 2,
        'other': 5,
        'services': 3,
        'teacher': 4,
        'GP': 1,
        'MS': 2,
        'course': 1,
        'home': 2,
        'other': 3,
        'reputation': 4,
        'father': 2,
        'mother': 1,
        'other': 3,
        'F': 1,
        'M': 0,
        'yes': 1,
        'no': 0
    }
    file_path = os.path.join('datasets', 'student', 'student-por.csv')
    with open(file_path) as f:
        data_file = csv.reader(f, delimiter=';')

        # temp = next(data_file)
        temp = next(data_file)

        n_features = 30
        # n_samples = sum(1 for row in data_file)
        # data = np.empty((n_samples, n_features))
        # target = np.empty((n_samples,))
        feature_names = np.array(temp)

    data = []
    target = []
    with open(file_path) as f:
        data_file = csv.reader(f, delimiter=';')
        i = 0
        firstline = True
        for d in data_file:
            if firstline:
                firstline = False
            else:
                d = np.array(d)
                d = np.delete(d, 3, 0)
                d = np.delete(d, 3, 0)
                d = np.delete(d, 3, 0)
                for str, val in strings.iteritems():
                    d[d == str] = val

                #data.append(d[1:-3])
                data.append(np.array(d[1:-3], dtype='float'))

                d.astype(float)
                if y_type == 'G3':
                    target.append(d[-1])
                elif y_type == 'G2':
                    target.append(d[-2])
                elif y_type == 'G1':
                    target.append(d[-3])
                i += 1
    data = np.array(data[:1000])
    target = np.array(target[:1000]).astype(float)

    if return_X_y:
        return data, target

    return Bunch(
        data=data,
        target=target,
        # last column is target value
        feature_names=feature_names)
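A minimal usage sketch, assuming the datasets/student/student-por.csv file referenced above is present:

X, y = load_student_grades(return_X_y=True, y_type='G1')
print(X.shape)  # feature matrix
print(y.shape)  # first-period grades (G1)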
Example #35
def fetch_atlas_craddock_2012(data_dir=None, url=None, resume=True, verbose=1):
    """Download and return file names for the Craddock 2012 parcellation

    The provided images are in MNI152 space.

    Parameters
    ----------
    data_dir: string
        directory where data should be downloaded and unpacked.

    url: string
        url of file to download.

    resume: bool
        whether to resume download of a partly-downloaded file.

    verbose: int
        verbosity level (0 means no message).

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, keys are:
        scorr_mean, tcorr_mean,
        scorr_2level, tcorr_2level,
        random

    References
    ----------
    Licence: Creative Commons Attribution Non-commercial Share Alike
    http://creativecommons.org/licenses/by-nc-sa/2.5/

    Craddock, R. Cameron, G.Andrew James, Paul E. Holtzheimer, Xiaoping P. Hu,
    and Helen S. Mayberg. "A Whole Brain fMRI Atlas Generated via Spatially
    Constrained Spectral Clustering". Human Brain Mapping 33, no 8 (2012):
    1914-1928. doi:10.1002/hbm.21333.

    See http://www.nitrc.org/projects/cluster_roi/ for more information
    on this parcellation.
    """

    if url is None:
        url = "ftp://www.nitrc.org/home/groups/cluster_roi/htdocs" \
              "/Parcellations/craddock_2011_parcellations.tar.gz"
    opts = {'uncompress': True}

    dataset_name = "craddock_2012"
    keys = ("scorr_mean", "tcorr_mean",
            "scorr_2level", "tcorr_2level",
            "random")
    filenames = [
            ("scorr05_mean_all.nii.gz", url, opts),
            ("tcorr05_mean_all.nii.gz", url, opts),
            ("scorr05_2level_all.nii.gz", url, opts),
            ("tcorr05_2level_all.nii.gz", url, opts),
            ("random_all.nii.gz", url, opts)
    ]

    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    sub_files = _fetch_files(data_dir, filenames, resume=resume,
                             verbose=verbose)

    fdescr = _get_dataset_descr(dataset_name)

    params = dict([('description', fdescr)] + list(zip(keys, sub_files)))

    return Bunch(**params)
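A minimal usage sketch for the fetcher above (the parcellation archive is downloaded on first use):

craddock = fetch_atlas_craddock_2012()
print(craddock.keys())      # scorr_mean, tcorr_mean, ..., description
print(craddock.scorr_mean)  # path to scorr05_mean_all.nii.gz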
Example #36
rootPath = fileutils.getDataPath() + os.sep
trainSetFolderPath = rootPath + 'train_word_bag'
trainSetFilePath = trainSetFolderPath + os.sep + 'train_set.dat'
trainSet = fileutils.readBatchObj(trainSetFilePath)

# 2.get stop words
stopWordFolderPath = rootPath + 'train_corpus_stop_word'
stopWordFileName = 'corpus_stop_word_china.txt'
stopWordFilePath = stopWordFolderPath + os.sep + stopWordFileName
stopWordFile = open(stopWordFilePath, 'r')
stopWordList = stopWordFile.read().splitlines()
# print('stopWordList,', stopWordList)
stopWordFile.close()

# 3.build TF-IDF vector space
tfidfSpace = Bunch(target_name=trainSet.target_name, lable=trainSet.lable,
                   filenames=trainSet.filenames, tdm=[], vocabulary=[])
vectorizer = TfidfVectorizer(
    stop_words=stopWordList, sublinear_tf=True, max_df=0.5)
transformer = TfidfTransformer()

tfidfSpace.tdm = vectorizer.fit_transform(trainSet.contents)
# print('tdm:', tfidfSpace.tdm)
tfidfSpace.vocabulary = vectorizer.vocabulary_
# print('tfidfSpace:', tfidfSpace)

# 4.save tfidfSpace
vocabularyFolderPath = rootPath + 'train_word_bag'
vocabularyFileName = 'tfidfSpace.dat'
vocabularyPath = vocabularyFolderPath + os.sep + vocabularyFileName
if os.path.exists(vocabularyPath):
    os.remove(vocabularyPath)
Example #37
def fetch_atlas_smith_2009(data_dir=None, mirror='origin', url=None,
                           resume=True, verbose=1):
    """Download and load the Smith ICA and BrainMap atlas (dated 2009)

    Parameters
    ----------
    data_dir: string, optional
        Path of the data directory. Used to force data storage in a non-
        standard location. Default: None (meaning: default)
    mirror: string, optional
        By default, the dataset is downloaded from the original website of the
        atlas. Specifying "nitrc" will force download from a mirror, with
        potentially higher bandwidth.
    url: string, optional
        Download URL of the dataset. Overwrite the default URL.

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, contains:

        - 20-dimensional ICA, Resting-FMRI components:

          - all 20 components (rsn20)
          - 10 well-matched maps from these, as shown in PNAS paper (rsn10)

        - 20-dimensional ICA, BrainMap components:

          - all 20 components (bm20)
          - 10 well-matched maps from these, as shown in PNAS paper (bm10)

        - 70-dimensional ICA, Resting-FMRI components (rsn70)

        - 70-dimensional ICA, BrainMap components (bm70)


    References
    ----------

    S.M. Smith, P.T. Fox, K.L. Miller, D.C. Glahn, P.M. Fox, C.E. Mackay, N.
    Filippini, K.E. Watkins, R. Toro, A.R. Laird, and C.F. Beckmann.
    Correspondence of the brain's functional architecture during activation and
    rest. Proc Natl Acad Sci USA (PNAS), 106(31):13040-13045, 2009.

    A.R. Laird, P.M. Fox, S.B. Eickhoff, J.A. Turner, K.L. Ray, D.R. McKay, D.C
    Glahn, C.F. Beckmann, S.M. Smith, and P.T. Fox. Behavioral interpretations
    of intrinsic connectivity networks. Journal of Cognitive Neuroscience, 2011

    Notes
    -----
    For more information about this dataset's structure:
    http://www.fmrib.ox.ac.uk/datasets/brainmap+rsns/
    """
    if url is None:
        if mirror == 'origin':
            url = "http://www.fmrib.ox.ac.uk/datasets/brainmap+rsns/"
        elif mirror == 'nitrc':
            url = [
                    'https://www.nitrc.org/frs/download.php/7730/',
                    'https://www.nitrc.org/frs/download.php/7729/',
                    'https://www.nitrc.org/frs/download.php/7731/',
                    'https://www.nitrc.org/frs/download.php/7726/',
                    'https://www.nitrc.org/frs/download.php/7728/',
                    'https://www.nitrc.org/frs/download.php/7727/',
            ]
        else:
            raise ValueError('Unknown mirror "%s". Mirror must be "origin" '
                'or "nitrc"' % str(mirror))

    files = [
            'rsn20.nii.gz',
            'PNAS_Smith09_rsn10.nii.gz',
            'rsn70.nii.gz',
            'bm20.nii.gz',
            'PNAS_Smith09_bm10.nii.gz',
            'bm70.nii.gz'
    ]

    if isinstance(url, _basestring):
        url = [url] * len(files)

    files = [(f, u + f, {}) for f, u in zip(files, url)]

    dataset_name = 'smith_2009'
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    files_ = _fetch_files(data_dir, files, resume=resume,
                          verbose=verbose)

    fdescr = _get_dataset_descr(dataset_name)

    keys = ['rsn20', 'rsn10', 'rsn70', 'bm20', 'bm10', 'bm70']
    params = dict(zip(keys, files_))
    params['description'] = fdescr

    return Bunch(**params)
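A minimal usage sketch; 'nitrc' is one of the two mirrors the function above accepts:

smith = fetch_atlas_smith_2009(mirror='nitrc')
print(smith.rsn20)  # path to the 20-component resting-state ICA maps
print(smith.bm70)   # path to the 70-component BrainMap ICA maps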
from sklearn.datasets.base import Bunch
import os
import re
import jieba  # fileutils below is a project-specific helper module (not imported here)
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

# Save the training-set and test-set data into Bunch objects

root = fileutils.getDataPath() + os.sep
trainDataPath = root + 'news' + os.sep + "train"
testDataPath = root + 'news' + os.sep + "test"
segPath = root + 'news' + os.sep + "seg"
trainRawPath = segPath +os.sep+"trainRaw.dat"
testRawPath = segPath +os.sep+"testRaw.dat"
bunch = Bunch(target_name=[], lable=[], filenames=[], contents=[])
bunch.target_name = segPath

contenttitle =''
# parser all train data and save it to bunch
for file in os.listdir(trainDataPath):
    filePath = trainDataPath + os.sep + file
    if os.path.isdir(filePath):
        print(file, ' is dir. continue')
        continue
    with open(filePath, 'r') as file:
        text = file.read()
        text = re.sub(u"[\x00-\x08\x0b-\x0c\x0e-\x1f|&]+", u"", text)
        root = ET.fromstring(text)
        for child in root:
            # tag name and attributes of the second-level nodes; walk the third level of the xml document
Example #39
def fetch_atlas_aal(version='SPM12', data_dir=None, url=None, resume=True,
                    verbose=1):
    """Downloads and returns the AAL template for SPM 12.

    This atlas is the result of an automated anatomical parcellation of the
    spatially normalized single-subject high-resolution T1 volume provided by
    the Montreal Neurological Institute (MNI) (D. L. Collins et al., 1998,
    Trans. Med. Imag. 17, 463-468, PubMed).

    Parameters
    ----------
    version: string, optional
        The version of the AAL atlas. Must be SPM5, SPM8 or SPM12. Default is
        SPM12.

    data_dir: string
        directory where data should be downloaded and unpacked.

    url: string
        url of file to download.

    resume: bool
        whether to resume download of a partly-downloaded file.

    verbose: int
        verbosity level (0 means no message).

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, keys are:

        - "maps": str. path to nifti file containing regions.

        - "labels": list of the names of the regions

    Notes
    -----
    For more information on this dataset's structure, see
    http://www.gin.cnrs.fr/AAL-217?lang=en

    Automated Anatomical Labeling of Activations in SPM Using a Macroscopic
    Anatomical Parcellation of the MNI MRI Single-Subject Brain.
    N. Tzourio-Mazoyer, B. Landeau, D. Papathanassiou, F. Crivello,
    O. Etard, N. Delcroix, B. Mazoyer, and M. Joliot.
    NeuroImage 2002. 15 :273-28

    Licence: unknown.
    """
    versions = ['SPM5', 'SPM8', 'SPM12']
    if version not in versions:
        raise ValueError('The version of AAL requested "%s" does not exist. '
                         'Please choose one among %s.' %
                         (version, str(versions)))

    if url is None:
        baseurl = "http://www.gin.cnrs.fr/AAL_files/aal_for_%s.tar.gz"
        url = baseurl % version
    opts = {'uncompress': True}

    dataset_name = "aal_" + version
    # keys and basenames would need to be handled for each spm_version
    # for now spm_version 12 is hardcoded.
    basenames = ("AAL.nii", "AAL.xml")
    filenames = [(os.path.join('aal', 'atlas', f), url, opts)
                 for f in basenames]

    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    atlas_img, labels_file = _fetch_files(data_dir, filenames, resume=resume,
                                          verbose=verbose)

    fdescr = _get_dataset_descr(dataset_name)

    # We return the labels contained in the xml file as a dictionary
    xml_tree = xml.etree.ElementTree.parse(labels_file)
    root = xml_tree.getroot()
    labels = []
    indices = []
    for label in root.getiterator('label'):
        indices.append(label.find('index').text)
        labels.append(label.find('name').text)

    params = {'description': fdescr, 'maps': atlas_img,
              'labels': labels, 'indices': indices}

    return Bunch(**params)
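A minimal usage sketch for the fetcher above:

aal = fetch_atlas_aal(version='SPM12')
print(aal.maps)        # path to AAL.nii
print(aal.labels[:5])  # first few region names parsed from AAL.xml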
Example #40
def fetch_atlas_harvard_oxford(atlas_name,
                               data_dir=None,
                               symmetric_split=False,
                               resume=True,
                               verbose=1):
    """Load Harvard-Oxford parcellation from FSL if installed or download it.

    This function looks for the Harvard-Oxford atlas on the system and loads
    it if present. If not, it downloads it and stores it in the NILEARN_DATA
    directory.

    Parameters
    ----------
    atlas_name: string
        Name of atlas to load. Can be:
        cort-maxprob-thr0-1mm,  cort-maxprob-thr0-2mm,
        cort-maxprob-thr25-1mm, cort-maxprob-thr25-2mm,
        cort-maxprob-thr50-1mm, cort-maxprob-thr50-2mm,
        sub-maxprob-thr0-1mm,  sub-maxprob-thr0-2mm,
        sub-maxprob-thr25-1mm, sub-maxprob-thr25-2mm,
        sub-maxprob-thr50-1mm, sub-maxprob-thr50-2mm,
        cort-prob-1mm, cort-prob-2mm,
        sub-prob-1mm, sub-prob-2mm

    data_dir: string, optional
        Path of data directory. It can be FSL installation directory
        (which is dependent on your installation).

    symmetric_split: bool, optional
        If True, split every symmetric region in left and right parts.
        Effectively doubles the number of regions. Default: False.
        Not implemented for probabilistic atlas (*-prob-* atlases)

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, keys are:

        - "maps": nibabel.Nifti1Image, 4D maps if a probabilistic atlas is
          requested and 3D labels if a maximum probabilistic atlas was
          requested.

        - "labels": string list, labels of the regions in the atlas.
    """
    atlas_items = ("cort-maxprob-thr0-1mm", "cort-maxprob-thr0-2mm",
                   "cort-maxprob-thr25-1mm", "cort-maxprob-thr25-2mm",
                   "cort-maxprob-thr50-1mm", "cort-maxprob-thr50-2mm",
                   "sub-maxprob-thr0-1mm", "sub-maxprob-thr0-2mm",
                   "sub-maxprob-thr25-1mm", "sub-maxprob-thr25-2mm",
                   "sub-maxprob-thr50-1mm", "sub-maxprob-thr50-2mm",
                   "cort-prob-1mm", "cort-prob-2mm", "sub-prob-1mm",
                   "sub-prob-2mm")
    if atlas_name not in atlas_items:
        raise ValueError("Invalid atlas name: {0}. Please chose an atlas "
                         "among:\n{1}".format(atlas_name,
                                              '\n'.join(atlas_items)))

    url = 'http://www.nitrc.org/frs/download.php/7700/HarvardOxford.tgz'

    # For practical reasons, we mimic the FSL data directory here.
    dataset_name = 'fsl'
    # Environment variables
    default_paths = []
    for env_var in ['FSL_DIR', 'FSLDIR']:
        path = os.getenv(env_var)
        if path is not None:
            default_paths.extend(path.split(':'))
    data_dir = _get_dataset_dir(dataset_name,
                                data_dir=data_dir,
                                default_paths=default_paths,
                                verbose=verbose)
    opts = {'uncompress': True}
    root = os.path.join('data', 'atlases')
    atlas_file = os.path.join(root, 'HarvardOxford',
                              'HarvardOxford-' + atlas_name + '.nii.gz')
    if atlas_name[0] == 'c':
        label_file = 'HarvardOxford-Cortical.xml'
    else:
        label_file = 'HarvardOxford-Subcortical.xml'
    label_file = os.path.join(root, label_file)

    atlas_img, label_file = _fetch_files(data_dir, [(atlas_file, url, opts),
                                                    (label_file, url, opts)],
                                         resume=resume,
                                         verbose=verbose)

    names = {}
    from xml.etree import ElementTree
    names[0] = 'Background'
    for label in ElementTree.parse(label_file).findall('.//label'):
        names[int(label.get('index')) + 1] = label.text
    names = list(names.values())

    if not symmetric_split:
        return Bunch(maps=atlas_img, labels=names)

    if atlas_name in ("cort-prob-1mm", "cort-prob-2mm", "sub-prob-1mm",
                      "sub-prob-2mm"):
        raise ValueError("Region splitting not supported for probabilistic "
                         "atlases")

    atlas_img = check_niimg(atlas_img)
    atlas = atlas_img.get_data()

    labels = np.unique(atlas)
    # Build a mask of both halves of the brain
    middle_ind = (atlas.shape[0] - 1) // 2
    # Put zeros on the median plane
    atlas[middle_ind, ...] = 0
    # Split every zone crossing the median plane into two parts.
    left_atlas = atlas.copy()
    left_atlas[middle_ind:, ...] = 0
    right_atlas = atlas.copy()
    right_atlas[:middle_ind, ...] = 0

    new_label = 0
    new_atlas = atlas.copy()
    # Assumes that the background label is zero.
    new_names = [names[0]]
    for label, name in zip(labels[1:], names[1:]):
        new_label += 1
        left_elements = (left_atlas == label).sum()
        right_elements = (right_atlas == label).sum()
        n_elements = float(left_elements + right_elements)
        if (left_elements / n_elements < 0.05
                or right_elements / n_elements < 0.05):
            new_atlas[atlas == label] = new_label
            new_names.append(name)
            continue
        new_atlas[right_atlas == label] = new_label
        new_names.append(name + ', left part')
        new_label += 1
        new_atlas[left_atlas == label] = new_label
        new_names.append(name + ', right part')

    atlas_img = new_img_like(atlas_img, new_atlas, get_affine(atlas_img))
    return Bunch(maps=atlas_img, labels=new_names)
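A minimal usage sketch for the loader above; the atlas names are taken from the list the function accepts:

ho = fetch_atlas_harvard_oxford('cort-maxprob-thr25-2mm', symmetric_split=True)
print(len(ho.labels))  # roughly twice the number of unsplit cortical labels
ho_prob = fetch_atlas_harvard_oxford('cort-prob-2mm')  # probabilistic atlas; splitting not supported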
Example #41
def fetch_atlas_allen_2011(data_dir=None, url=None, resume=True, verbose=1):
    """Download and return file names for the Allen and MIALAB ICA atlas
    (dated 2011).

    The provided images are in MNI152 space.

    Parameters
    ----------
    data_dir: str, optional
        directory where data should be downloaded and unpacked.
    url: str, optional
        url of file to download.
    resume: bool
        whether to resume download of a partly-downloaded file.
    verbose: int
        verbosity level (0 means no message).

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, keys are:

        - "maps": T-maps of all 75 unthresholded components.
        - "rsn28": T-maps of 28 RSNs included in E. Allen et al.
        - "networks": string list containing the names for the 28 RSNs.
        - "rsn_indices": dict[rsn_name] -> list of int, indices in the "maps"
          file of the 28 RSNs.
        - "comps": The aggregate ICA Components.
        - "description": details about the data release.

    References
    ----------
    E. Allen, et al, "A baseline for the multivariate comparison of resting
    state networks," Frontiers in Systems Neuroscience, vol. 5, p. 12, 2011.

    Notes
    -----
    Licence: unknown

    See http://mialab.mrn.org/data/index.html for more information
    on this dataset.
    """
    if url is None:
        url = "http://mialab.mrn.org/data/hcp/"

    dataset_name = "allen_rsn_2011"
    keys = ("maps",
            "rsn28",
            "comps")

    opts = {}
    files = ["ALL_HC_unthresholded_tmaps.nii",
             "RSN_HC_unthresholded_tmaps.nii",
             "rest_hcp_agg__component_ica_.nii"]

    labels = [('Basal Ganglia', [21]),
              ('Auditory', [17]),
              ('Sensorimotor', [7, 23, 24, 38, 56, 29]),
              ('Visual', [46, 64, 67, 48, 39, 59]),
              ('Default-Mode', [50, 53, 25, 68]),
              ('Attentional', [34, 60, 52, 72, 71, 55]),
              ('Frontal', [42, 20, 47, 49])]

    networks = [[name] * len(idxs) for name, idxs in labels]

    filenames = [(f, url + f, opts) for f in files]

    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    sub_files = _fetch_files(data_dir, filenames, resume=resume,
                             verbose=verbose)

    fdescr = _get_dataset_descr(dataset_name)

    params = [('description', fdescr),
              ('rsn_indices', labels),
              ('networks', networks)]
    params.extend(list(zip(keys, sub_files)))

    return Bunch(**dict(params))
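A minimal usage sketch for the fetcher above:

allen = fetch_atlas_allen_2011()
print(allen.maps)            # T-maps of all 75 unthresholded components
print(allen.rsn_indices[0])  # ('Basal Ganglia', [21])
print(allen.networks[0])     # ['Basal Ganglia']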
Example #42
Input: high dimensional data
Output: k-dimensional data (save to CSV files as train.csv and test.csv)
"""

# ### Load Haxby dataset ######################################################
import numpy as np
import nibabel
from os.path import expanduser
from sklearn.datasets.base import Bunch

# data_dir = expanduser('~') + '/workshops/aiml/data/pymvpa-exampledata/'
data_dir = expanduser('~') + '/downloads/pymvpa-exampledata/'

# create sklearn's Bunch of data
dataset_files = Bunch(func=data_dir + 'bold.nii.gz',
                      session_target=data_dir + 'attributes.txt',
                      mask=data_dir + 'mask.nii.gz',
                      conditions_target=data_dir + 'attributes_literal.txt')

# fmri_data and mask are copied to break any reference to the original object
bold_img = nibabel.load(dataset_files.func)
fmri_data = bold_img.get_data().astype(float)
affine = bold_img.get_affine()
y, session = np.loadtxt(dataset_files.session_target).astype("int").T
conditions = np.recfromtxt(dataset_files.conditions_target)['f0']
mask = dataset_files.mask

# fmri_data.shape is (40, 64, 64, 1452)
# and mask.shape is (40, 64, 64)

# ### Preprocess data
# Build the mean image because we have no anatomic data
Example #43
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

reload(sys)
sys.setdefaultencoding('utf-8')

# Load the training set
train_path = "wordbag" + "/" + "train_set1124.data"
data_set = joblib.load(train_path)
# print data_set.target_name
# print data_set.contents[0]
####exit
# sys.exit(0)
# Define the word-bag data structure
# word bag after tf-idf computation
wordbag = Bunch(target_name=[],label=[],filenames=[],tdm=[],vocabulary={})
wordbag.target_name = data_set.target_name
wordbag.label = data_set.label
# corpus
corpus = data_set.contents

# Load the stop words
stopwordpath = "extra_dict/stop_words.txt"
stopword_dic = open(stopwordpath,'r')
stopword_content = stopword_dic.read()
# convert the stop words to a list
stopwordlist = stopword_content.splitlines()
stopword_dic.close()

# time the word-bag creation
start = datetime.datetime.now()
Example #44
def load_dynacomp(preprocessing_folder='pipeline_1', prefix='swr'):
    """ Returns paths of Dynacomp preprocessed resting-state fMRI
    """
    BASE_DIR = set_data_base_dir('Dynacomp')
    SUBJ_DIR = os.path.join(BASE_DIR, 'preprocessed', preprocessing_folder)
    subject_paths = sorted(glob.glob(os.path.join(SUBJ_DIR, '[A-Z][A-Z]*')))
    mask_path = os.path.join(BASE_DIR, 'masks', 'all_subjects.nii.gz')
    description = pd.read_csv(os.path.join(BASE_DIR, 'subject_infos.csv'))
    session1_files = []
    session2_files = []
    session1_motion = []
    session2_motion = []
    anat_files = []
    group = []
    subjects = []
    behavior = []
    date = []
    for f in subject_paths:
        # subject id
        _, subject_id = os.path.split(f)
        # set prefix
        # functional data
        session1_files.append(
            glob.glob(
                os.path.join(f, 'fMRI', 'acquisition1',
                             prefix + 'rest1*.nii'))[0])
        session2_files.append(
            glob.glob(
                os.path.join(f, 'fMRI', 'acquisition1',
                             prefix + 'rest2*.nii'))[0])
        # anatomical data
        anat_files.append(
            glob.glob(os.path.join(f, 't1mri', 'acquisition1',
                                   'wanat*.nii'))[0])

        # motion parameters
        session1_motion.append(
            glob.glob(os.path.join(f, 'fMRI', 'acquisition1',
                                   'rp_rest1*.txt'))[0])
        session2_motion.append(
            glob.glob(os.path.join(f, 'fMRI', 'acquisition1',
                                   'rp_rest2*.txt'))[0])

        # subject group
        gr = description[description.NIP == subject_id].GROUP.values
        if len(gr) > 0:
            group.append(gr[0])
        # date acquisition
        dt = description[description.NIP == subject_id].DATE.values
        if len(dt) > 0:
            date.append(dt[0])
        # subject id
        subjects.append(subject_id)
        behavior.append(get_behavior_scores(description, subject_id))

    indices = set_group_indices(group)
    rois = load_dynacomp_rois()
    return Bunch(func1=session1_files,
                 func2=session2_files,
                 anat=anat_files,
                 group_indices=indices,
                 motion1=session1_motion,
                 motion2=session2_motion,
                 rois=rois,
                 group=group,
                 subjects=subjects,
                 date=date,
                 behavior=behavior,
                 mask=mask_path)
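A minimal usage sketch, assuming the Dynacomp directory layout expected by the loader above exists locally:

dynacomp = load_dynacomp(preprocessing_folder='pipeline_1', prefix='swr')
print(len(dynacomp.subjects))  # one entry per subject directory
print(dynacomp.mask)           # path to masks/all_subjects.nii.gz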
Example #45
def main():
    accuracies = defaultdict(lambda: [])

    aucs = defaultdict(lambda: [])

    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(100, args.fixk)

    if args.fixk < 0:
        args.fixk = None

    fixk_saved = "{0}{1}.p".format(args.train, args.fixk)

    try:
        print "Loading existing file... %s " % args.train
        fixk_file = open(fixk_saved, "rb")
        data = pickle.load(fixk_file)
        fixk_file.close()
        vectorizer = open("{0}vectorizer.p".format(args.train), "rb")
        vct = pickle.load(vectorizer)
        vectorizer.close()
    except (IOError, ValueError):
        print "Loading from scratch..."
        data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5)
        fixk_file = open(fixk_saved, "wb")
        pickle.dump(data, fixk_file)
        fixk_file.close()
        vectorizer = open("{0}vectorizer.p".format(args.train), "wb")
        pickle.dump(vct, vectorizer)
        vectorizer.close()

    # data = load_dataset(args.train, args.fixk, categories[0], vct, min_size)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = parse_parameters_mat(args.cost_model)

    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    #### STUDENT CLASSIFIER
    clf = linear_model.LogisticRegression(penalty="l1", C=1)
    # clf = set_classifier(args.classifier)
    print "\nStudent Classifier: %s" % clf

    #### EXPERT CLASSIFIER

    exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(data.test.bow, data.test.target)
    expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                         cost_function=cost_model.cost_function)
    print "\nExpert: %s " % expert

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          min_size))
    print ("Anytime active learning experiment - use objective function to pick data")
    t0 = time.time()
    tac = []
    tau = []
    ### experiment starts
    for t in range(args.trials):
        trial_accu = []

        trial_aucs = []

        print "*" * 60
        print "Trial: %s" % t
        if args.student in "anyunc":
            student = randomsampling.AnytimeLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
                                                    subpool=250, cost_model=cost_model)
        elif args.student in "lambda":
            student = randomsampling.AnytimeLearnerDiff(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
                                                    subpool=250, cost_model=cost_model, lambda_value=args.lambda_value)
        elif args.student in "anyzero":
            student = randomsampling.AnytimeLearnerZeroUtility(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
                                                    subpool=250, cost_model=cost_model)
        else:
            raise ValueError("Oops! We do not know that anytime strategy. Try again.")

        print "\nStudent: %s " % student
        train_indices = []
        neutral_text = []  # save the raw text of the queries
        neutral_data = []  # save the xik vectors
        train_x = []
        train_y = []
        neu_x = [] # data to train the classifier
        neu_y = np.array([])

        pool = Bunch()
        pool.data = data.train.bow.tocsr()   # full words, for training
        pool.text = data.train.data
        # pool.fixk = data.train.bowk.tocsr()  # k words BOW for querying
        pool.target = data.train.target
        pool.predicted = []
        # pool.kwords = np.array(data.train.kwords)  # k words
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

        bootstrapped = False

        current_cost = 0
        iteration = 0
        query_index = None
        query_size = None
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
            util = []
            if not bootstrapped:
                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                query = pool.data[query_index]
                print "Bootstrap: %s " % bt.__class__.__name__
                print
            else:
                # print "pick instance"

                ## chose returns: index, k
                ## util returns: utility, k, unc
                query_chosen, util = student.pick_next(pool=pool, step_size=step_size)
                query_index = [a for a, b in query_chosen]
                query_size = [b for a, b in query_chosen]

                # query = pool.fixk[query_index]  # query with k words
                qk = []
                for q, k in query_chosen:
                    qk.append(" ".join(vct_analizer(pool.text[q])[0:int(k)]))
                query = vct.transform(qk)

            # query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]]

            ground_truth = pool.target[query_index]
            #labels, spent = expert.label(unlabeled=query, target=ground_truth)
            if iteration == 0: ## bootstrap uses ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth) ## bootstrap cost is ignored
            else:
                # print "ask labels"
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            ### accumulate the cost of the query
            query_cost = np.array(spent).sum()
            current_cost += query_cost
            # print query_index
            useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None])
            neutral_answers = np.array([[x, z] for x, y, z in zip(query_index, labels, query_size) if y is None]) \
                if iteration != 0 else np.array([])

            # print labels
            # print "label\tutility\tk\tunc"
            # print format_query(zip(labels, util))

            ## add data recent acquired to train
            if useful_answers.shape[0] != 0:
                # print "get training"
                # train_indices.extend(query_index)
                train_indices.extend(useful_answers[:, 0])

                # add labels to training
                train_x = pool.data[train_indices]  # # train with all the words

                # update labels with the expert labels
                #train_y = pool.target[train_indices]
                train_y.extend(useful_answers[:, 1])

            if neutral_answers.shape[0] != 0:
                # current query neutrals
                qlbl = []

                for xik, lbl in zip(query, labels):
                    # neutral_data.append(xik)
                    if isinstance(neutral_data, list):
                        neutral_data = xik
                    else:
                        neutral_data = vstack([neutral_data, xik], format='csr')
                    qlbl.append(neutral_label(lbl))

                ## append the labels of the current query
                neu_y = np.append(neu_y, qlbl)
                neu_x = neutral_data
                #end usefulanswers


            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # retrain the model
            # current_model = student.train(train_x, train_y)
            # print "train models"
            current_model = student.train_all(train_x, train_y, neu_x, neu_y)
            # print "evaluate"
            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            accu = metrics.accuracy_score(data.test.target, pred_y)

            print ("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}\tneu:{6}\t{7}".format(
                len(train_indices),
                accu,
                auc, query_cost,
                current_cost,
                format_spent(spent),
                len(neutral_answers), neu_y.shape[0]))

            ## the results should be based on the cost of the labeling
            if iteration > 0:   # bootstrap iteration

                student.budget -= query_cost ## Bootstrap doesn't count

                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
                # partial trial results
                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])

            iteration += 1
            # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
        #end trial loop
    if args.cost_function not in "uniform":
        accuracies = extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size)
        aucs = extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size)

    print("Elapsed time %.3f" % (time.time() - t0))
    print_extrapolated_results(accuracies, aucs)
Example #46
0
def load_files(container_path, description=None, categories=None,
               shuffle=True, encoding='utf-8', random_state=0,
               key_path_index=-2):
    """Load text files with categories as subfolder names.

    Individual samples are assumed to be files stored in a two-level folder
    structure such as the following:

        container_folder/
            category_1_folder/
                file_1.txt
                    line 1
                    line 2
                    ...
                    line n
            category_2_folder/
                file_2.txt
                    line 1
                    line 2
                    ...
                    line n
            ...

    The folder names are used as supervised signal label names. The
    individual file names are not important.

    Parameters
    ----------
    container_path : string or unicode
        Path to the main folder holding one subfolder per category

    description: string or unicode, optional (default=None)
        A paragraph describing the characteristic of the dataset: its source,
        reference, etc.

    categories : A collection of strings or None, optional (default=None)
        If None (default), load all the categories.
        If not None, list of category names to load (other categories ignored).

    shuffle : bool, optional (default=True)
        Whether or not to shuffle the data: might be important for models that
        make the assumption that the samples are independent and identically
        distributed (i.i.d.), such as stochastic gradient descent.

    random_state : int, RandomState instance or None, optional (default=0)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    key_path_index : int, optional (default=-2)
        Index into the `os.sep`-split path of each file that is used as the
        key prefix for its lines; the default (-2) uses the category folder
        name. If falsy, the full file path is used instead.

    Returns
    -------
    data : Bunch
        Dictionary-like object, the interesting attributes are: either
        data, the raw text data to learn, or 'filenames', the files
        holding it, 'target', the classification labels (integer index),
        'target_names', the meaning of the labels, and 'DESCR', the full
        description of the dataset.
    """
    target = list()
    target_names = list()
    filenames = list()
    filelines2data = dict()

    folders = [f for f in sorted(listdir(container_path))
               if isdir(join(container_path, f))]

    if categories is not None:
        folders = [f for f in folders if f in categories]

    for label, folder in enumerate(folders):
        target_names.append(folder)
        folder_path = join(container_path, folder)
        documents = [join(folder_path, d)
                     for d in sorted(listdir(folder_path))]
        for training_doc in documents:
            if key_path_index:
                category = training_doc.split(os.sep)[key_path_index]
            else:
                category = training_doc
            with codecs.open(training_doc, encoding=encoding) as td:
                for line_index, data in enumerate(td):
                    key4file = category + str(line_index)
                    filelines2data[key4file] = data
                    target.append(label)
                    filenames.append(key4file)

    # convert to array for fancy indexing
    filenames = np.array(filenames)
    target = np.array(target)

    if shuffle:
        random_state = check_random_state(random_state)
        indices = np.arange(filenames.shape[0])
        random_state.shuffle(indices)
        filenames = filenames[indices]
        target = target[indices]

    data = list()
    for filename in filenames:
        data.append(filelines2data.get(filename))

    return Bunch(data=data,
                 filenames=filenames,
                 target_names=target_names,
                 target=target,
                 DESCR=description)
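# Hedged usage sketch for the loader above; the 'corpus' directory and its
# 'ham'/'spam' sub-folders are hypothetical. Each *.txt file contributes one
# sample per line, keyed by its category folder:
#
#   corpus/
#       ham/part1.txt
#       spam/part2.txt
#
dataset = load_files('corpus', categories=['ham', 'spam'],
                     shuffle=True, random_state=42)
print(list(dataset.target_names))   # ['ham', 'spam'] (sorted folder names)
print(len(dataset.data))            # total number of lines loaded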
Example #47
0
def main():


    vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=False, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b')  #, tokenizer=StemTokenizer())

    vct_analizer = vct.build_tokenizer()

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = 10  # max(10, args.fixk)

    args.fixk = None

    data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))


    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## replace <br> with "." so sentence boundaries are recognized
    data.train.data = experiment_utils.clean_html(data.train.data)
    data.test.data = experiment_utils.clean_html(data.test.data)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
    ## Get the features of the sentence dataset

    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    train_test_data = Bunch()

    expert_data.sentence, train_test_data.pool = split_data(data.train)
    expert_data.oracle, train_test_data.test = split_data(data.test)

    data.train.data = train_test_data.pool.train.data
    data.train.target = train_test_data.pool.train.target

    data.test.data = train_test_data.test.train.data
    data.test.target = train_test_data.test.train.target

    ## convert document to matrix
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")

    labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector)

    expert_data.oracle.train.data = sent_train
    expert_data.oracle.train.target = np.array(labels)
    expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)

    exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)


    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector)

    expert_data.sentence.train.data = sent_train
    expert_data.sentence.train.target = np.array(labels)
    expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)

    sent_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    #### TESTING THE CLASSIFERS

    test_target, test_data = split_data_sentences(data.test,sent_detector)
    test_data_bow = vct.transform(test_data)

    #pred_sent = sent_clf.predict(test_data_bow)
    pred_ora = exp_clf.predict(test_data_bow)
    y_probas = sent_clf.predict_proba(test_data_bow)
    pred_sent = sent_clf.classes_[np.argmax(y_probas, axis=1)]
    ## just based on one class probability
    # order = np.argsort(y_probas[:,0])
    order = np.argsort(y_probas.max(axis=1))
    print "ORACLE\tSENTENCE\tMAX-SENT"
    # for i in order[:500]:
    #     print pred_ora[i],pred_sent[i], y_probas[i,0], test_data[i]
    for i in order[-500:]:
        print pred_ora[i],pred_sent[i], y_probas[i,0], test_data[i]
    print "Accuracy of Sentences Classifier", metrics.accuracy_score(test_target, pred_sent)
    print "Class distribution: %s" % pred_sent.sum()
    print "Size of data: %s" % pred_sent.shape[0]
    sizes = [50, 100, 500, 1000, 2000, 3000, 4000, 20000]
    clf = linear_model.LogisticRegression(penalty='l1', C=1)
    bootstrap = rand.permutation(len(test_data))
    x = []
    y = []
    for s in sizes:
        indices = bootstrap[:s]

        train_x = expert_data.sentence.train.bow[indices[:s]]
        train_y = expert_data.sentence.train.target[indices[:s]]

        clf.fit(train_x, train_y)

        predictions = clf.predict(test_data_bow)
        scores = metrics.accuracy_score(test_target,predictions)
        ## print clf.__class__.__name__
        print "Accuracy {0}: {1}".format(s, scores)
        y.append(scores)
    plt.clf()
    plt.title("Accuracy")
    plt.xlabel("Labels")
    plt.ylabel("Accuracy")
    plt.plot(sizes, y, '--bo', label="sent")
    plt.legend()
    plt.show()
Example #48
0
    def fetch(self, n_subjects=1, fetch_stimuli=False,
              url=None, resume=True, force=False, verbose=1):

        if self.simple:
            # URL of the dataset. It can be overridden so that a test can
            # exercise the dataset-downloading code against another location.
            if url is None:
                url = 'http://www.pymvpa.org/files/pymvpa_exampledata.tar.bz2'

            opts = {'uncompress': True}
            files = [
                (os.path.join('pymvpa-exampledata', 'attributes.txt'), url, opts),
                (os.path.join('pymvpa-exampledata', 'bold.nii.gz'), url, opts),
                (os.path.join('pymvpa-exampledata', 'mask.nii.gz'), url, opts),
                (os.path.join('pymvpa-exampledata', 'attributes_literal.txt'),
                     url, opts),
            ]

            files = self.fetcher.fetch(files, resume=resume, force=force, verbose=verbose)

            # return the data
            return Bunch(func=files[1], session_target=files[0], mask=files[2],
                         conditions_target=files[3])

        else:
            if n_subjects > 6:
                warnings.warn('Warning: there are only 6 subjects')
                n_subjects = 6

            # Dataset files
            if url is None:
                url = 'http://data.pymvpa.org/datasets/haxby2001/'
            md5sums = self.fetcher.fetch([('MD5SUMS', url + 'MD5SUMS', {})],
                                   resume=resume, force=force, verbose=verbose)[0]
            md5sums = readmd5_sum_file(md5sums)

            # definition of dataset files
            sub_files = ['bold.nii.gz', 'labels.txt',
                          'mask4_vt.nii.gz', 'mask8b_face_vt.nii.gz',
                          'mask8b_house_vt.nii.gz', 'mask8_face_vt.nii.gz',
                          'mask8_house_vt.nii.gz', 'anat.nii.gz']
            n_files = len(sub_files)

            files = [
                    (os.path.join('subj%d' % i, sub_file),
                     url + 'subj%d-2010.01.14.tar.gz' % i,
                     {'uncompress': True,
                      'md5sum': md5sums.get('subj%d-2010.01.14.tar.gz' % i, None)})
                    for i in range(1, n_subjects + 1)
                    for sub_file in sub_files
                    if not (sub_file == 'anat.nii.gz' and i == 6)  # no anat for sub. 6
            ]

            files = self.fetcher.fetch(files, resume=resume, force=force, verbose=verbose)

            if n_subjects == 6:
                files.append(None)  # None value because subject 6 has no anat

            kwargs = {}
            if fetch_stimuli:
                stimuli_files = [(os.path.join('stimuli', 'README'),
                                  url + 'stimuli-2010.01.14.tar.gz',
                                  {'uncompress': True})]
                readme = self.fetcher.fetch(stimuli_files, resume=resume,
                                            force=force, verbose=verbose)[0]
                kwargs['stimuli'] = _tree(os.path.dirname(readme), pattern='*.jpg',
                                          dictionary=True)

            # return the data
            return Bunch(
                    anat=files[7::n_files],
                    func=files[0::n_files],
                    session_target=files[1::n_files],
                    mask_vt=files[2::n_files],
                    mask_face=files[3::n_files],
                    mask_house=files[4::n_files],
                    mask_face_little=files[5::n_files],
                    mask_house_little=files[6::n_files],
                    **kwargs)
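# Hedged illustration (with made-up paths) of the stride slicing used above:
# the download list is ordered subject-by-subject, with n_files entries per
# subject, so files[k::n_files] collects the k-th file type for every subject.
demo_sub_files = ['bold.nii.gz', 'labels.txt', 'mask4_vt.nii.gz',
                  'mask8b_face_vt.nii.gz', 'mask8b_house_vt.nii.gz',
                  'mask8_face_vt.nii.gz', 'mask8_house_vt.nii.gz',
                  'anat.nii.gz']
demo_n_files = len(demo_sub_files)
demo_files = ['subj%d/%s' % (i, f) for i in (1, 2) for f in demo_sub_files]
print(demo_files[0::demo_n_files])   # ['subj1/bold.nii.gz', 'subj2/bold.nii.gz']
print(demo_files[7::demo_n_files])   # ['subj1/anat.nii.gz', 'subj2/anat.nii.gz']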
import os

from sklearn.datasets.base import Bunch

from sklearn.externals import joblib
import jieba
from sklearn.feature_extraction.text import HashingVectorizer

import sys
reload(sys)
# sys.setdefaultencoding('utf-8')

token_path = "token"+"/"
# word-bag corpus path
wordbag_path = "wordbag"+"/"
# the corpus is stored in a Bunch object
data_set = Bunch(target_name=[],label=[],filenames=[],contents=[])

dir_list = os.listdir(token_path)
data_set.target_name = dir_list

for file in dir_list:
    file_name = token_path+file
    file_read = open(file_name,"r")
    for line in file_read:
        data_set.label.append(data_set.target_name.index(file))
        data_set.contents.append(line.strip())
    file_read.close()
# persist to disk
joblib.dump(data_set, wordbag_path+"train_set1124.data", compress=3)

# verification
Example #50
0
def load_info(dataset_path, return_X_y=False):
    """Load and return the info dataset (classification).

    The info dataset is a small multi-class classification dataset
    (7 classes, 20 samples per class, 140 samples in total, 16 features).

    =================   ==============
    Classes                          7
    Samples per class               20
    Samples total                  140
    Dimensionality                   16
    Features            '铝', '沪铝', '伦铝', '氧化铝', '沪铜',
                        '伦铜', '铜', '铅', '伦铅', '沪铅', '铅锌',
                        '锌精矿', '锌', '铁', '钢铁', 'PVC'
    =================   ==============

    Read more in the :ref:`User Guide <datasets>`.

    Parameters
    ----------
    dataset_path : string
        Path to the CSV file holding the dataset. Its first row lists the
        number of samples, the number of features and then the target names;
        each following row holds the feature values followed by the integer
        class label.

    return_X_y : boolean, default=False.
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    Returns
    -------
    data : Bunch
        Dictionary-like object, the interesting attributes are:
        'data', the data to learn, 'target', the classification labels,
        'target_names', the meaning of the labels, 'feature_names', the
        meaning of the features, and 'DESCR', the
        full description of the dataset.

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.18

    """
    with open(dataset_path) as csv_file:
        data_file = csv.reader(csv_file)
        temp = next(data_file)
        n_samples = int(temp[0])
        n_features = int(temp[1])
        target_names = np.array(temp[2:])
        data = np.empty((n_samples, n_features))
        target = np.empty((n_samples,), dtype=np.int)

        for i, ir in enumerate(data_file):
            data[i] = np.asarray(ir[:-1], dtype=np.float64)
            target[i] = np.asarray(ir[-1], dtype=np.int)

    if return_X_y:
        return data, target

    return Bunch(data=data, target=target,
                 target_names=target_names,
                 DESCR="",
                 feature_names=['铝', '沪铝', '伦铝', '氧化铝', '沪铜',
                                '伦铜', '铜', '铅', '伦铅', '沪铅', '铅锌',
                                '锌精矿', '锌', '铁', '钢铁', 'PVC'])
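# Hedged usage sketch for load_info above: 'demo_info.csv' is a made-up file
# written here only to show the layout the parser expects (header row:
# n_samples, n_features, target names; data rows: feature values followed by
# the integer class label).
with open('demo_info.csv', 'w') as fh:
    fh.write('3,2,classA,classB\n')
    fh.write('1.0,2.0,0\n')
    fh.write('0.5,1.5,1\n')
    fh.write('2.2,0.3,0\n')

demo = load_info('demo_info.csv')
X, y = load_info('demo_info.csv', return_X_y=True)
print(list(demo.target_names))   # ['classA', 'classB']
print(X.shape)                   # (3, 2)
print(list(y))                   # [0, 1, 0]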
        X = read_file(fpath)
        data.append(X)
    return np.array(data, dtype=np.float32)

print "loading data from disk..."
species2id = lambda s: species_map.get(s, -1)
train = np.loadtxt('samples/alltrain.csv', converters={0: species2id},
                   skiprows=1, delimiter=",")
test = np.loadtxt('samples/alltest.csv', converters={0: species2id},
                  skiprows=1, delimiter=",")
# Load env variable grids
coverage = load_dir("coverages")

# Per species data
bv = Bunch(name=" ".join(species[0].split("_")[:2]),
           train=train[train[:, 0] == 0, 1:],
           test=test[test[:, 0] == 0, 1:])
mm = Bunch(name=" ".join(species[1].split("_")[:2]),
           train=train[train[:, 0] == 1, 1:],
           test=test[test[:, 0] == 1, 1:])


def get_coverages(points, coverages, xx, yy):
    """Get coverages (aka features) for each point.
    Returns
    -------
    array : shape = [points.shape[0], coverages.shape[0]]
        The feature vectors (coverages) for each data point.
    """
    rows = []
    cols = []
Example #52
0
def fetch_species_distributions(data_home=None, download_if_missing=True):
    """Loader for species distribution dataset from Phillips et. al. (2006)

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing: optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    --------
    The data is returned as a Bunch object with the following attributes:

    coverages : array, shape = [14, 1592, 1212]
        These represent the 14 features measured at each point of the map grid.
        The latitude/longitude values for the grid are discussed below.
        Missing data is represented by the value -9999.

    train : record array, shape = (1623,)
        The training points for the data.  Each point has three fields:

        - train['species'] is the species name
        - train['dd long'] is the longitude, in degrees
        - train['dd lat'] is the latitude, in degrees

    test : record array, shape = (619,)
        The test points for the data.  Same format as the training data.

    Nx, Ny : integers
        The number of longitudes (x) and latitudes (y) in the grid

    x_left_lower_corner, y_left_lower_corner : floats
        The (x,y) position of the lower-left corner, in degrees

    grid_size : float
        The spacing between points of the grid, in degrees

    Notes
    ------

    This dataset represents the geographic distribution of species.
    The dataset is provided by Phillips et. al. (2006).

    The two species are:

    - `"Bradypus variegatus"
      <http://www.iucnredlist.org/apps/redlist/details/3038/0>`_ ,
      the Brown-throated Sloth.

    - `"Microryzomys minutus"
      <http://www.iucnredlist.org/apps/redlist/details/13408/0>`_ ,
      also known as the Forest Small Rice Rat, a rodent that lives in
      Colombia, Ecuador, Peru, and Venezuela.

    References
    ----------

    * `"Maximum entropy modeling of species geographic distributions"
      <http://www.cs.princeton.edu/~schapire/papers/ecolmod.pdf>`_
      S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,
      190:231-259, 2006.

    Notes
    -----

    * See examples/applications/plot_species_distribution_modeling.py
      for an example of using this dataset with scikit-learn

    """
    data_home = get_data_home(data_home)
    if not exists(data_home):
        makedirs(data_home)

    # Define parameters for the data files.  These should not be changed
    # unless the data model changes.  They will be saved in the npz file
    # with the downloaded data.
    extra_params = dict(x_left_lower_corner=-94.8,
                        Nx=1212,
                        y_left_lower_corner=-56.05,
                        Ny=1592,
                        grid_size=0.05)
    dtype = np.int16

    if not exists(join(data_home, DATA_ARCHIVE_NAME)):
        print('Downloading species data from %s to %s' %
              (SAMPLES_URL, data_home))
        X = np.load(BytesIO(urlopen(SAMPLES_URL).read()))

        for f in X.files:
            fhandle = BytesIO(X[f])
            if 'train' in f:
                train = _load_csv(fhandle)
            if 'test' in f:
                test = _load_csv(fhandle)

        print('Downloading coverage data from %s to %s' %
              (COVERAGES_URL, data_home))

        X = np.load(BytesIO(urlopen(COVERAGES_URL).read()))

        coverages = []
        for f in X.files:
            fhandle = BytesIO(X[f])
            print(' - converting', f)
            coverages.append(_load_coverage(fhandle))
        coverages = np.asarray(coverages, dtype=dtype)

        bunch = Bunch(coverages=coverages,
                      test=test,
                      train=train,
                      **extra_params)
        joblib.dump(bunch, join(data_home, DATA_ARCHIVE_NAME), compress=9)
    else:
        bunch = joblib.load(join(data_home, DATA_ARCHIVE_NAME))

    return bunch
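# Hedged usage sketch for the fetcher above: derive longitude/latitude axes of
# the coverage grid from the Bunch fields, as in scikit-learn's
# plot_species_distribution_modeling example (variable names here are my own).
import numpy as np

species_data = fetch_species_distributions()
xgrid = species_data.x_left_lower_corner + species_data.grid_size * np.arange(species_data.Nx)
ygrid = species_data.y_left_lower_corner + species_data.grid_size * np.arange(species_data.Ny)
print(species_data.coverages.shape)   # (14, 1592, 1212): 14 features on the Ny x Nx grid
print(xgrid.shape)                    # (1212,) longitudes, starting at the lower-left corner
print(ygrid.shape)                    # (1592,) latitudes, starting at the lower-left corner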
Example #53
0
    def fetch(self,
              contrasts=None,
              n_subjects=None,
              get_tmaps=False,
              get_masks=False,
              get_anats=False,
              url=None,
              resume=True,
              force=False,
              verbose=1):
        if n_subjects is None:
            n_subjects = 94  # 94 subjects available
        if (n_subjects > 94) or (n_subjects < 1):
            warnings.warn("Wrong value for 'n_subjects' (%d). The maximum "
                          "value will be used instead ('n_subjects=94')"
                          % n_subjects)
            n_subjects = 94  # 94 subjects available

        if contrasts is None:
            contrasts = self.contrast_name_wrapper.values()
        elif isinstance(contrasts, _basestring):
            contrasts = [contrasts]

        allowed_contrasts = list(self.contrast_name_wrapper.values())
        # convert contrast names
        contrasts_wrapped = []
        # get a unique ID for each contrast. It is used to give a unique name to
        # each download file and avoid name collisions.
        contrasts_indices = []
        for contrast in contrasts:
            if contrast in allowed_contrasts:
                contrasts_wrapped.append(contrast)
                contrasts_indices.append(allowed_contrasts.index(contrast))
            elif contrast in self.contrast_name_wrapper:
                name = self.contrast_name_wrapper[contrast]
                contrasts_wrapped.append(name)
                contrasts_indices.append(allowed_contrasts.index(name))
            else:
                raise ValueError("Contrast \'%s\' is not available" % contrast)

        # It is better to perform several small requests than a big one because:
        # - Brainomics server has no cache (can lead to timeout while the archive
        #   is generated on the remote server)
        # - Local (cached) version of the files can be checked for each contrast
        opts = {'uncompress': True}
        subject_ids = ["S%02d" % s for s in range(1, n_subjects + 1)]
        subject_id_max = subject_ids[-1]
        data_types = ["c map"]
        if get_tmaps:
            data_types.append("t map")
        rql_types = str.join(", ", ["\"%s\"" % x for x in data_types])
        root_url = "http://brainomics.cea.fr/localizer/"

        base_query = ("Any X,XT,XL,XI,XF,XD WHERE X is Scan, X type XT, "
                      "X concerns S, "
                      "X label XL, X identifier XI, "
                      "X format XF, X description XD, "
                      'S identifier <= "%s", ' % (subject_id_max, ) +
                      'X type IN(%(types)s), X label "%(label)s"')

        urls = [
            "%sbrainomics_data_%d.zip?rql=%s&vid=data-zip" %
            (root_url, i,
             _urllib.parse.quote(base_query % {
                 "types": rql_types,
                 "label": c
             },
                                 safe=',()'))
            for c, i in zip(contrasts_wrapped, contrasts_indices)
        ]
        filenames = []
        for subject_id in subject_ids:
            for data_type in data_types:
                for contrast_id, contrast in enumerate(contrasts_wrapped):
                    name_aux = str.replace(
                        str.join('_', [data_type, contrast]), ' ', '_')
                    file_path = os.path.join("brainomics_data", subject_id,
                                             "%s.nii.gz" % name_aux)
                    file_tarball_url = urls[contrast_id]
                    filenames.append((file_path, file_tarball_url, opts))
        # Fetch masks if asked by user
        if get_masks:
            urls.append("%sbrainomics_data_masks.zip?rql=%s&vid=data-zip" %
                        (root_url,
                         _urllib.parse.quote(base_query % {
                             "types": '"boolean mask"',
                             "label": "mask"
                         },
                                             safe=',()')))
            for subject_id in subject_ids:
                file_path = os.path.join("brainomics_data", subject_id,
                                         "boolean_mask_mask.nii.gz")
                file_tarball_url = urls[-1]
                filenames.append((file_path, file_tarball_url, opts))
        # Fetch anats if asked by user
        if get_anats:
            urls.append("%sbrainomics_data_anats.zip?rql=%s&vid=data-zip" %
                        (root_url,
                         _urllib.parse.quote(base_query % {
                             "types": '"normalized T1"',
                             "label": "anatomy"
                         },
                                             safe=',()')))
            for subject_id in subject_ids:
                file_path = os.path.join("brainomics_data", subject_id,
                                         "normalized_T1_anat_defaced.nii.gz")
                file_tarball_url = urls[-1]
                filenames.append((file_path, file_tarball_url, opts))
        # Fetch subject characteristics (separated in two files)
        if url is None:
            url_csv = (
                "%sdataset/cubicwebexport.csv?rql=%s&vid=csvexport" %
                (root_url, _urllib.parse.quote("Any X WHERE X is Subject")))
            url_csv2 = ("%sdataset/cubicwebexport2.csv?rql=%s&vid=csvexport" %
                        (root_url,
                         _urllib.parse.quote(
                             "Any X,XI,XD WHERE X is QuestionnaireRun, "
                             "X identifier XI, X datetime "
                             "XD",
                             safe=',')))
        else:
            url_csv = "%s/cubicwebexport.csv" % url
            url_csv2 = "%s/cubicwebexport2.csv" % url
        filenames += [("cubicwebexport.csv", url_csv, {}),
                      ("cubicwebexport2.csv", url_csv2, {})]

        # Actual data fetching
        files = self.fetcher.fetch(filenames,
                                   resume=resume,
                                   force=force,
                                   verbose=verbose)
        anats = None
        masks = None
        tmaps = None
        # combine data from both covariates files into one single recarray
        from numpy.lib.recfunctions import join_by
        ext_vars_file2 = files[-1]
        csv_data2 = np.recfromcsv(ext_vars_file2, delimiter=';')
        files = files[:-1]
        ext_vars_file = files[-1]
        csv_data = np.recfromcsv(ext_vars_file, delimiter=';')
        files = files[:-1]
        # join_by sorts the output along the key
        csv_data = join_by('subject_id',
                           csv_data,
                           csv_data2,
                           usemask=False,
                           asrecarray=True)[:n_subjects]
        if get_anats:
            anats = files[-n_subjects:]
            files = files[:-n_subjects]
        if get_masks:
            masks = files[-n_subjects:]
            files = files[:-n_subjects]
        if get_tmaps:
            tmaps = files[1::2]
            files = files[::2]
        return Bunch(cmaps=files,
                     tmaps=tmaps,
                     masks=masks,
                     anats=anats,
                     ext_vars=csv_data)
Example #54
0
    'acc': 1,
    'good': 2,
    'vgood': 3,
}

X, y = data_utils.dispose_data(url, str2int)

# Split the dataset into a training set and a test set:
train_data, test_data, train_target, test_target = train_test_split(
    X, y, test_size=0.3, random_state=0)

# Build a default data structure (Bunch) for convenient use
dataArray = np.empty((len(train_target), 6))
for i in range(len(train_target)):
    dataArray[i] = np.asarray(train_data[i], dtype=np.float)
targetArray = np.asarray(train_target, dtype=np.int)
target_names = np.asarray(['unacc', 'acc', 'good', 'vgood'])
fdescr = "Train data for the car"
carSet = Bunch(data=dataArray,
               target=targetArray,
               target_names=target_names,
               DESCR=fdescr,
               feature_names=[
                   'buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'
               ])

ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                             n_estimators=1000,
                             learning_rate=1)
ada_clf.fit(carSet.data, carSet.target)
print ada_clf.score(test_data, test_target)
	file_obj.close()
	return bunch
# write a bunch object to disk
def writebunchobj(path,bunchobj):
	file_obj = open(path, "wb")
	pickle.dump(bunchobj,file_obj) 
	file_obj.close()	

# 1. Read the stop-word list
stopword_path = "train_word_bag/hlt_stop_words.txt"
stpwrdlst = readfile(stopword_path).splitlines()

# 2. Load the tokenized term-vector bunch object
path = "train_word_bag/train_set.dat"        # path where the term vector bunch is saved
bunch = readbunchobj(path)

# 3. Build the tf-idf term vector space object
tfidfspace = Bunch(target_name=bunch.target_name,label=bunch.label,filenames=bunch.filenames,tdm=[],vocabulary={})

# 4. Initialize the vector space model with TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=stpwrdlst,sublinear_tf = True,max_df = 0.5)
transformer=TfidfTransformer() # this class computes the tf-idf weight of each term
# convert the text into a term-frequency matrix; the vocabulary is saved separately
tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
tfidfspace.vocabulary = vectorizer.vocabulary_

# persist the word bag
space_path = "train_word_bag/tfdifspace.dat"        # path where the tf-idf space is saved
writebunchobj(space_path,tfidfspace)

print "if-idf词向量空间创建成功!!!"
Example #56
0
def main():
    accuracies = defaultdict(lambda: [])

    aucs = defaultdict(lambda: [])

    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='latin-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()
    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(10, args.fixk)

    if args.fixk < 0:
        args.fixk = None

    # data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5)
    # fixk_saved = "{0}{1}.p".format(args.train, args.fixk)

    data, vct = load_from_file(args.train, categories, args.fixk, min_size, vct)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    #### COST MODEL
    parameters = parse_parameters_mat(args.cost_model)
    print "Cost Parameters %s" % parameters
    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    #### ACCURACY MODEL
    accu_parameters = parse_parameters_mat(args.accu_model)

    #### CLASSIFIER
    clf = set_classifier(args.classifier)
    print "\nClassifier: %s" % clf

    #### EXPERT MODEL

    if "fixed" in args.expert:
        expert = baseexpert.FixedAccuracyExpert(accuracy_value=accu_parameters[0],
                                                cost_function=cost_model.cost_function)  #average value of accuracy of the experts
    elif "true" in args.expert:
        expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    elif "linear" in args.expert:
        #expert = baseexpert.LRFunctionExpert(model=[0.0019, 0.6363],cost_function=cost_model.cost_function)
        raise Exception("We do not know linear yet!!")
    elif "log" in args.expert:
        expert = baseexpert.LogFunctionExpert(model=accu_parameters, cost_function=cost_model.cost_function)
    elif "direct" in args.expert:
        expert = baseexpert.LookUpExpert(accuracy_value=accu_parameters, cost_function=cost_model.cost_function)
    elif "neutral" in args.expert:
        exp_clf = LogisticRegression(penalty='l1', C=1)
        exp_clf.fit(data.test.bow, data.test.target)
        expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                         cost_function=cost_model.cost_function)
    else:
        raise Exception("We need a defined cost function options [fixed|log|linear]")

    exp_clf = LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(data.test.bow, data.test.target)
    print "\nExpert: %s " % expert
    coef = exp_clf.coef_[0]
    # print_features(coef, vct.get_feature_names())
    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          50))

    t0 = time.time()
    tac = []
    tau = []
    ### experiment starts
    for t in range(args.trials):
        trial_accu = []

        trial_aucs = []

        print "*" * 60
        print "Trial: %s" % t
        if  args.student in "unc":
            student = randomsampling.UncertaintyLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t,
                                                        subpool=250)
        else:
            student = randomsampling.RandomSamplingLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t)

        print "\nStudent: %s " % student

        train_indices = []
        train_x = []
        train_y = []
        pool = Bunch()
        pool.data = data.train.bow.tocsr()   # full words, for training
        if args.fixk is None:
            pool.fixk = data.train.bow.tocsr()
        else:
            pool.fixk = data.train.bowk.tocsr()  # k words BOW for querying
        pool.target = data.train.target
        pool.predicted = []
        # pool.kwords = np.array(data.train.kwords)  # k words
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool


        bootstrapped = False

        current_cost = 0
        iteration = 0
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:

            if not bootstrapped:
                ## random bootstrap
                #bt = randomsampling.BootstrapRandom(random_state=t * 10)

                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                print "Bootstrap: %s " % bt.__class__.__name__
                print
            else:
                query_index = student.pick_next(pool=pool, k=step_size)

            # query = pool.fixk[query_index]  # query with k words
            query = pool.data[query_index]
            # print query_index
            # query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]]
            query_size = [1]*query.shape[0]

            ground_truth = pool.target[query_index]

            if iteration == 0: ## bootstrap uses ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth)
            else:
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            query_cost = np.array(spent).sum()
            current_cost += query_cost

            # train_indices.extend(query_index)

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # add labels to training
            # train_x = pool.data[train_indices]  ## train with all the words

            # update labels with the expert labels
            useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None])
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])
                # add labels to training
                train_x = pool.data[train_indices]  ## train with all the words
                # update labels with the expert labels
                train_y.extend(useful_answers[:, 1])


            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # retrain the model
            current_model = student.train(train_x, train_y)
            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            accu = metrics.accuracy_score(data.test.target, pred_y)

            print (
            "TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(len(train_indices), accu,
                                                                                              auc, query_cost,
                                                                                              current_cost, format_spent(spent)))

            ## the results should be based on the cost of the labeling
            if iteration > 0: # bootstrap iteration

                student.budget -= query_cost ## Bootstrap doesn't count

                #x_axis_range = int(current_cost / eval_range)
                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])

            iteration += 1
            # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
        #end trial loop
    if args.cost_function not in "uniform":
        accuracies = extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size)
        aucs = extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size)

    print("Elapsed time %.3f" % (time.time() - t0))
    print_extrapolated_results(accuracies, aucs)