def shuffleData(self, res):
    # Shuffle the (label, text) pairs in place, then split them into a Bunch
    # with parallel data/target lists.
    shuffle(res)
    train = Bunch()
    train.data = [x[1] for x in res]      # texts
    train.target = [x[0] for x in res]    # labels
    train.target_names = self.names
    return train
def gen_tf_idf_space():
    bunch = read_object(train_data)
    tf_idf_space = Bunch(target_name=bunch.target_name, label=bunch.label,
                         filenames=bunch.filenames, vocabulary={})
    vectorizer = TfidfVectorizer(stop_words=load_stop_words(), sublinear_tf=True,
                                 max_df=0.5)
    transformer = TfidfTransformer()  # unused: TfidfVectorizer already applies the tf-idf weighting
    tf_idf_space.tdm = vectorizer.fit_transform(bunch.contents)
    tf_idf_space.vocabulary = vectorizer.vocabulary_
    save_object(tf_idf_space_data, tf_idf_space)
def calc_tfidf(trainsetfile, stopwordfile, dstdir):
    data_set = joblib.load(trainsetfile)
    wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={})
    wordbag.target_name = data_set.target_name
    wordbag.label = data_set.label
    corpus = data_set.contents
    stopwordlist = read_stopword(stopwordfile)
    vectorize = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=3,
                                max_features=50000, stop_words=stopwordlist)
    feature_train = vectorize.fit_transform(corpus)
    wordbag.tdm = feature_train
    wordbag.vocabulary = vectorize.vocabulary_
    joblib.dump(wordbag, dstdir + "/" + "word_bag.data", compress=3)
def testset_tfidf(testsetfile, stopwordfile, myvocabulary):
    data_set = joblib.load(testsetfile)
    wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={})
    wordbag.target_name = data_set.target_name
    wordbag.label = data_set.label
    corpus = data_set.contents
    stopwordlist = read_stopword(stopwordfile)
    # Reuse the training vocabulary so the train and test matrices share columns
    vectorize = TfidfVectorizer(sublinear_tf=True, stop_words=stopwordlist,
                                vocabulary=myvocabulary)
    feature_train = vectorize.fit_transform(corpus)
    wordbag.tdm = feature_train
    joblib.dump(wordbag, "test_wordbag/test_word_bag.data", compress=3)
    return wordbag
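The two helpers above are meant to be chained: the training run builds the vocabulary and the test run reuses it, so both term-document matrices share the same columns. A sketch with hypothetical file paths:

# Sketch: chain calc_tfidf and testset_tfidf; all paths here are placeholders.
import joblib

calc_tfidf("train_set.data", "stopwords.txt", "wordbag")      # fit on the training corpus
train_bag = joblib.load("wordbag/word_bag.data")              # recover the fitted vocabulary
test_bag = testset_tfidf("test_set.data", "stopwords.txt",
                         train_bag.vocabulary)                # reuse it on the test corpus
print(train_bag.tdm.shape, test_bag.tdm.shape)                # same number of columns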
def train_bags(token_path, filename, wordbag_path):
    data_set = Bunch(target_name=[], label=[], filenames=[], contents=[])
    dir_list = os.listdir(token_path)
    data_set.target_name = dir_list
    for file in dir_list:
        file_name = token_path + "/" + file
        file_read = open(file_name, "r")
        for line in file_read:
            data_set.label.append(data_set.target_name.index(file))
            data_set.contents.append(line.strip())
        file_read.close()
    # Persist the Bunch to disk
    joblib.dump(data_set, wordbag_path + "/" + filename, compress=3)
def test_bunch_pickle_generated_with_0_16_and_read_with_0_17():
    bunch = Bunch(key='original')
    # This reproduces a problem when Bunch pickles have been created
    # with scikit-learn 0.16 and are read with 0.17. Basically there
    # is a surprising behaviour because reading bunch.key uses
    # bunch.__dict__ (which is non-empty for 0.16 Bunch objects)
    # whereas assigning into bunch.key uses bunch.__setattr__. See
    # https://github.com/scikit-learn/scikit-learn/issues/6196 for
    # more details.
    bunch.__dict__['key'] = 'set from __dict__'
    bunch_from_pkl = loads(dumps(bunch))
    # After loading from pickle the __dict__ should have been ignored
    assert_equal(bunch_from_pkl.key, 'original')
    assert_equal(bunch_from_pkl['key'], 'original')
    # Make sure that changing the attr does change the value
    # associated with __getitem__ as well
    bunch_from_pkl.key = 'changed'
    assert_equal(bunch_from_pkl.key, 'changed')
    assert_equal(bunch_from_pkl['key'], 'changed')
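The behaviour this test protects, a Bunch exposing the same values through attribute access and dict access before and after pickling, is easiest to see in a tiny standalone sketch. The sklearn.utils.Bunch import path assumes a recent scikit-learn; older code (as in most snippets here) imports it from sklearn.datasets.base.

# Minimal sketch of the dual access pattern the test above relies on.
from pickle import dumps, loads
from sklearn.utils import Bunch  # sklearn.datasets.base.Bunch in older releases

b = Bunch(key='original')
assert b.key == b['key'] == 'original'

b.key = 'changed'                # attribute write...
assert b['key'] == 'changed'     # ...is visible through __getitem__

restored = loads(dumps(b))       # round-trip through pickle
assert restored.key == restored['key'] == 'changed'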
def execute_NM_predict():
    test_bunch = read_object(test_data)
    test_space = Bunch(target_name=test_bunch.target_name, label=test_bunch.label,
                       filenames=test_bunch.filenames, tdm=[], vocabulary={})
    tf_idf_bunch = read_object(tf_idf_space_data)
    # Reuse the training vocabulary so test vectors share the training columns
    vectorizer = TfidfVectorizer(stop_words=load_stop_words(), sublinear_tf=True,
                                 max_df=0.5, vocabulary=tf_idf_bunch.vocabulary)
    transformer = TfidfTransformer()  # unused: TfidfVectorizer already applies the weighting
    test_space.tdm = vectorizer.fit_transform(test_bunch.contents)
    test_space.vocabulary = tf_idf_bunch.vocabulary
    clf = MultinomialNB(alpha=0.001).fit(tf_idf_bunch.tdm, tf_idf_bunch.label)
    # Prediction results
    predicted = clf.predict(test_space.tdm)
    # Print the results in a more readable form
    for label, file_name, expect_cate in zip(test_bunch.label,
                                             test_bunch.filenames, predicted):
        print(file_name, ' actual category:', label,
              ' predicted category:', expect_cate)
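Beyond the per-file printout, the same predictions can be summarized with standard scikit-learn metrics; a small sketch reusing the clf, test_space and test_bunch objects defined above:

# Sketch: overall accuracy and a per-class report for the predictions above.
from sklearn import metrics

predicted = clf.predict(test_space.tdm)
print(metrics.accuracy_score(test_bunch.label, predicted))
print(metrics.classification_report(test_bunch.label, predicted))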
def scatter3d(X, fig=None, ax=None, color='b', cs=None, colorsMap='jet'):
    if cs is not None:
        cm = plt.get_cmap(colorsMap)
        cNorm = matplotlib.colors.Normalize(vmin=min(cs), vmax=max(cs))
        scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cm)
    if ax is None:
        fig = plt.figure()
        ax = Axes3D(fig)
    if cs is None:
        ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color)
    else:
        ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=scalarMap.to_rgba(cs))
        scalarMap.set_array(cs)
        fig.colorbar(scalarMap)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')
    plt.show()
    # Return the figure and axes in a Bunch so callers can keep drawing on them
    b = Bunch()
    b.fig = fig
    b.ax = ax
    return b
def fetch_atlas_yeo_2011(data_dir=None, url=None, resume=True, verbose=1): """Download and return file names for the Yeo 2011 parcellation. The provided images are in MNI152 space. Parameters ---------- data_dir: string directory where data should be downloaded and unpacked. url: string url of file to download. resume: bool whether to resumed download of a partly-downloaded file. verbose: int verbosity level (0 means no message). Returns ------- data: sklearn.datasets.base.Bunch dictionary-like object, keys are: - "thin_7", "thick_7": 7-region parcellations, fitted to resp. thin and thick template cortex segmentations. - "thin_17", "thick_17": 17-region parcellations. - "colors_7", "colors_17": colormaps (text files) for 7- and 17-region parcellation respectively. - "anat": anatomy image. Notes ----- For more information on this dataset's structure, see http://surfer.nmr.mgh.harvard.edu/fswiki/CorticalParcellation_Yeo2011 Yeo BT, Krienen FM, Sepulcre J, Sabuncu MR, Lashkari D, Hollinshead M, Roffman JL, Smoller JW, Zollei L., Polimeni JR, Fischl B, Liu H, Buckner RL. The organization of the human cerebral cortex estimated by intrinsic functional connectivity. J Neurophysiol 106(3):1125-65, 2011. Licence: unknown. """ if url is None: url = "ftp://surfer.nmr.mgh.harvard.edu/" \ "pub/data/Yeo_JNeurophysiol11_MNI152.zip" opts = {'uncompress': True} dataset_name = "yeo_2011" keys = ("thin_7", "thick_7", "thin_17", "thick_17", "colors_7", "colors_17", "anat") basenames = ( "Yeo2011_7Networks_MNI152_FreeSurferConformed1mm.nii.gz", "Yeo2011_7Networks_MNI152_FreeSurferConformed1mm_LiberalMask.nii.gz", "Yeo2011_17Networks_MNI152_FreeSurferConformed1mm.nii.gz", "Yeo2011_17Networks_MNI152_FreeSurferConformed1mm_LiberalMask.nii.gz", "Yeo2011_7Networks_ColorLUT.txt", "Yeo2011_17Networks_ColorLUT.txt", "FSL_MNI152_FreeSurferConformed_1mm.nii.gz") filenames = [(os.path.join("Yeo_JNeurophysiol11_MNI152", f), url, opts) for f in basenames] data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose) sub_files = _fetch_files(data_dir, filenames, resume=resume, verbose=verbose) fdescr = _get_dataset_descr(dataset_name) params = dict([('description', fdescr)] + list(zip(keys, sub_files))) return Bunch(**params)
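This fetcher is part of nilearn; assuming nilearn is installed, the Bunch it returns plugs straight into the plotting helpers. A minimal usage sketch (the choice of the thick_17 parcellation is arbitrary):

# Sketch: fetch the Yeo 2011 atlas and plot one of its parcellations.
from nilearn import datasets, plotting

yeo = datasets.fetch_atlas_yeo_2011()
print(sorted(yeo.keys()))   # thin_7, thick_7, thin_17, thick_17, colors_7, colors_17, anat, description
plotting.plot_roi(yeo['thick_17'], title='Yeo 2011, 17 networks')
plotting.show()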
def fetch_atlas_msdl(data_dir=None, url=None, resume=True, verbose=1): """Download and load the MSDL brain atlas. Parameters ---------- data_dir: string, optional Path of the data directory. Used to force data storage in a specified location. Default: None url: string, optional Override download URL. Used for test only (or if you setup a mirror of the data). Returns ------- data: sklearn.datasets.base.Bunch Dictionary-like object, the interest attributes are : - 'maps': str, path to nifti file containing regions definition. - 'labels': string list containing the labels of the regions. - 'region_coords': tuple list (x, y, z) containing coordinates of each region in MNI space. - 'networks': string list containing names of the networks. - 'description': description about the atlas. References ---------- :Download: https://team.inria.fr/parietal/files/2015/01/MSDL_rois.zip :Paper to cite: `Multi-subject dictionary learning to segment an atlas of brain spontaneous activity <http://hal.inria.fr/inria-00588898/en>`_ Gael Varoquaux, Alexandre Gramfort, Fabian Pedregosa, Vincent Michel, Bertrand Thirion. Information Processing in Medical Imaging, 2011, pp. 562-573, Lecture Notes in Computer Science. :Other references: `Learning and comparing functional connectomes across subjects <http://hal.inria.fr/hal-00812911/en>`_. Gael Varoquaux, R.C. Craddock NeuroImage, 2013. """ url = 'https://team.inria.fr/parietal/files/2015/01/MSDL_rois.zip' opts = {'uncompress': True} dataset_name = "msdl_atlas" files = [(os.path.join('MSDL_rois', 'msdl_rois_labels.csv'), url, opts), (os.path.join('MSDL_rois', 'msdl_rois.nii'), url, opts)] data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose) files = _fetch_files(data_dir, files, resume=resume, verbose=verbose) csv_data = np.recfromcsv(files[0]) labels = [name.strip() for name in csv_data['name'].tolist()] labels = [label.decode("utf-8") for label in labels] with warnings.catch_warnings(): warnings.filterwarnings('ignore', module='numpy', category=FutureWarning) region_coords = csv_data[['x', 'y', 'z']].tolist() net_names = [net_name.strip() for net_name in csv_data['net_name'].tolist()] fdescr = _get_dataset_descr(dataset_name) return Bunch(maps=files[1], labels=labels, region_coords=region_coords, networks=net_names, description=fdescr)
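A common next step with the MSDL probabilistic maps is region-level signal extraction. The sketch below assumes nilearn's NiftiMapsMasker and a hypothetical 4D functional image, so the fit_transform call is left commented out:

# Sketch: extract per-region time series with the MSDL maps.
from nilearn import datasets
from nilearn.input_data import NiftiMapsMasker

msdl = datasets.fetch_atlas_msdl()
print(len(msdl.labels), msdl.labels[:3])
masker = NiftiMapsMasker(maps_img=msdl.maps, standardize=True)
# func_img = "subject_bold.nii.gz"                # hypothetical 4D functional image
# time_series = masker.fit_transform(func_img)    # shape: (n_timepoints, n_regions)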
def fetch_atlas_harvard_oxford(atlas_name, data_dir=None, symmetric_split=False, resume=True, verbose=1): """Load Harvard-Oxford parcellations from FSL. This function downloads Harvard Oxford atlas packaged from FSL 5.0 and stores atlases in NILEARN_DATA folder in home directory. This function can also load Harvard Oxford atlas from your local directory specified by your FSL installed path given in `data_dir` argument. See documentation for details. Parameters ---------- atlas_name: string Name of atlas to load. Can be: cort-maxprob-thr0-1mm, cort-maxprob-thr0-2mm, cort-maxprob-thr25-1mm, cort-maxprob-thr25-2mm, cort-maxprob-thr50-1mm, cort-maxprob-thr50-2mm, sub-maxprob-thr0-1mm, sub-maxprob-thr0-2mm, sub-maxprob-thr25-1mm, sub-maxprob-thr25-2mm, sub-maxprob-thr50-1mm, sub-maxprob-thr50-2mm, cort-prob-1mm, cort-prob-2mm, sub-prob-1mm, sub-prob-2mm data_dir: string, optional Path of data directory where data will be stored. Optionally, it can also be a FSL installation directory (which is dependent on your installation). Example, if FSL is installed in /usr/share/fsl/ then specifying as '/usr/share/' can get you Harvard Oxford atlas from your installed directory. Since we mimic same root directory as FSL to load it easily from your installation. symmetric_split: bool, optional, (default False). If True, lateralized atlases of cort or sub with maxprob will be returned. For subcortical types (sub-maxprob), we split every symmetric region in left and right parts. Effectively doubles the number of regions. NOTE Not implemented for full probabilistic atlas (*-prob-* atlases). Returns ------- data: sklearn.datasets.base.Bunch dictionary-like object, keys are: - "maps": nibabel.Nifti1Image, 4D maps if a probabilistic atlas is requested and 3D labels if a maximum probabilistic atlas was requested. - "labels": string list, labels of the regions in the atlas. """ atlas_items = ("cort-maxprob-thr0-1mm", "cort-maxprob-thr0-2mm", "cort-maxprob-thr25-1mm", "cort-maxprob-thr25-2mm", "cort-maxprob-thr50-1mm", "cort-maxprob-thr50-2mm", "sub-maxprob-thr0-1mm", "sub-maxprob-thr0-2mm", "sub-maxprob-thr25-1mm", "sub-maxprob-thr25-2mm", "sub-maxprob-thr50-1mm", "sub-maxprob-thr50-2mm", "cort-prob-1mm", "cort-prob-2mm", "sub-prob-1mm", "sub-prob-2mm") if atlas_name not in atlas_items: raise ValueError("Invalid atlas name: {0}. Please chose an atlas " "among:\n{1}".format( atlas_name, '\n'.join(atlas_items))) url = 'http://www.nitrc.org/frs/download.php/9902/HarvardOxford.tgz' # For practical reasons, we mimic the FSL data directory here. 
dataset_name = 'fsl' data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose) opts = {'uncompress': True} root = os.path.join('data', 'atlases') if atlas_name[0] == 'c': if 'cort-maxprob' in atlas_name and symmetric_split: split_name = atlas_name.split('cort') atlas_name = 'cortl' + split_name[1] label_file = 'HarvardOxford-Cortical-Lateralized.xml' lateralized = True else: label_file = 'HarvardOxford-Cortical.xml' lateralized = False else: label_file = 'HarvardOxford-Subcortical.xml' lateralized = False label_file = os.path.join(root, label_file) atlas_file = os.path.join(root, 'HarvardOxford', 'HarvardOxford-' + atlas_name + '.nii.gz') atlas_img, label_file = _fetch_files( data_dir, [(atlas_file, url, opts), (label_file, url, opts)], resume=resume, verbose=verbose) names = {} from xml.etree import ElementTree names[0] = 'Background' for label in ElementTree.parse(label_file).findall('.//label'): names[int(label.get('index')) + 1] = label.text names = list(names.values()) if not symmetric_split: return Bunch(maps=atlas_img, labels=names) if atlas_name in ("cort-prob-1mm", "cort-prob-2mm", "sub-prob-1mm", "sub-prob-2mm"): raise ValueError("Region splitting not supported for probabilistic " "atlases") atlas_img = check_niimg(atlas_img) if lateralized: return Bunch(maps=atlas_img, labels=names) atlas = atlas_img.get_data() labels = np.unique(atlas) # Build a mask of both halves of the brain middle_ind = (atlas.shape[0] - 1) // 2 # Put zeros on the median plane atlas[middle_ind, ...] = 0 # Split every zone crossing the median plane into two parts. left_atlas = atlas.copy() left_atlas[middle_ind:, ...] = 0 right_atlas = atlas.copy() right_atlas[:middle_ind, ...] = 0 new_label = 0 new_atlas = atlas.copy() # Assumes that the background label is zero. new_names = [names[0]] for label, name in zip(labels[1:], names[1:]): new_label += 1 left_elements = (left_atlas == label).sum() right_elements = (right_atlas == label).sum() n_elements = float(left_elements + right_elements) if (left_elements / n_elements < 0.05 or right_elements / n_elements < 0.05): new_atlas[atlas == label] = new_label new_names.append(name) continue new_atlas[right_atlas == label] = new_label new_names.append(name + ', left part') new_label += 1 new_atlas[left_atlas == label] = new_label new_names.append(name + ', right part') atlas_img = new_img_like(atlas_img, new_atlas, atlas_img.affine) return Bunch(maps=atlas_img, labels=new_names)
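A usage sketch for the fetcher above, assuming nilearn is installed; the particular atlas name and the lateralized split are arbitrary choices for illustration:

# Sketch: load a max-probability Harvard-Oxford atlas with left/right split labels.
from nilearn import datasets, plotting

ho = datasets.fetch_atlas_harvard_oxford('cort-maxprob-thr25-2mm',
                                         symmetric_split=True)
print(len(ho.labels))   # 'Background' plus the lateralized cortical regions
plotting.plot_roi(ho.maps, title='Harvard-Oxford, lateralized')
plotting.show()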
def fetch_WS353(which="all"): """ Fetch WS353 dataset for testing attributional and relatedness similarity Parameters ---------- which : 'all': for both relatedness and attributional similarity, 'relatedness': for relatedness similarity 'similarity': for attributional similarity 'set1': as divided by authors 'set2': as divided by authors References ---------- Finkelstein, Gabrilovich, "Placing Search in Context: The Concept Revisited†", 2002 Agirre, Eneko et al., "A Study on Similarity and Relatedness Using Distributional and WordNet-based Approaches", 2009 Returns ------- data : sklearn.datasets.base.Bunch dictionary-like object. Keys of interest: 'X': matrix of 2 words per column, 'y': vector with scores, 'sd': vector of std of scores if available (for set1 and set2) """ if which == "all": data = _get_as_pd( 'https://www.dropbox.com/s/eqal5qj97ajaycz/EN-WS353.txt?dl=1', 'similarity', header=0, sep="\t") elif which == "relatedness": data = _get_as_pd( 'https://www.dropbox.com/s/x94ob9zg0kj67xg/EN-WSR353.txt?dl=1', 'similarity', header=None, sep="\t") elif which == "similarity": data = _get_as_pd( 'https://www.dropbox.com/s/ohbamierd2kt1kp/EN-WSS353.txt?dl=1', 'similarity', header=None, sep="\t") elif which == "set1": data = _get_as_pd( 'https://www.dropbox.com/s/opj6uxzh5ov8gha/EN-WS353-SET1.txt?dl=1', 'similarity', header=0, sep="\t") elif which == "set2": data = _get_as_pd( 'https://www.dropbox.com/s/w03734er70wyt5o/EN-WS353-SET2.txt?dl=1', 'similarity', header=0, sep="\t") else: raise RuntimeError("Not recognized which parameter") # We basically select all the columns available X = data.values[:, 0:2] y = data.values[:, 2].astype(np.float) # We have also scores if data.values.shape[1] > 3: sd = np.std(data.values[:, 2:15].astype(np.float), axis=1).flatten() return Bunch(X=X.astype("object"), y=y, sd=sd) else: return Bunch(X=X.astype("object"), y=y)
#########################
#                       #
#  Text classification  #
#                       #
#########################
import pickle
from sklearn.svm import LinearSVC  # linear SVM

'''1. Load the data'''
with open("D:\\mywork\\test\\ML_Chinese\\tfidfspace.dat", "rb") as f1:
    train = pickle.load(f1)
with open("D:\\mywork\\test\\ML_Chinese\\test_set.dat", "rb") as f2:
    test = pickle.load(f2)

'''2. Build the TF-IDF (tdm) vectors for the test set'''
from sklearn.datasets.base import Bunch
tfidftest = Bunch(target_name=test.target_name, label=test.label,
                  filenames=test.filenames, tdm=[], vocabulary={})

from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer  # TF-IDF transformer class
from sklearn.feature_extraction.text import TfidfVectorizer   # TF-IDF vectorizer class

stoplist = readfile(stop_path).splitlines()  # helper function from Chapter 2

'''2-1 The test-set vectors must be built with the training-set vocabulary'''
vectorizer = TfidfVectorizer(stop_words=stoplist, sublinear_tf=True, max_df=0.5,
                             vocabulary=train.vocabulary)
transformer = TfidfTransformer()  # computes the TF-IDF weight of every term
text = [i.decode("GBK", "ignore") for i in test.contents]  # decode the bytes to unicode
tfidftest.tdm = vectorizer.fit_transform(text)
tfidftest.vocabulary = train.vocabulary

'''3. Train the model'''
svm = LinearSVC(penalty='l2', dual=False, tol=0.0001)
svm.fit(train.tdm, train.label)
pre = svm.predict(tfidftest.tdm)
from sklearn.datasets.base import Bunch

# Path of the segmented, classified corpus
seg_path = "text_corpus_segment/"
# Path for the word-bag corpus
wordbag_path = "text_corpus_wordbag/"
if not os.path.exists(wordbag_path):
    os.makedirs(wordbag_path)

# The Bunch class provides a key/value object with:
# target_name: list of all category names
# label: category label of each file
# filenames: file names
# contents: file contents
data_set = Bunch(target_name=[], label=[], filenames=[], contents=[])

# Get all sub-categories under seg_path
class_list = os.listdir(seg_path)
data_set.target_name = class_list

# Walk through every file in each sub-directory
for mydir in class_list:
    class_path = seg_path + mydir + "/"
    file_list = os.listdir(class_path)  # all files under class_path
    for file_name in file_list:
        file_path = class_path + file_name
        data_set.filenames.append(file_path)  # append the file path to the data set
        data_set.label.append(data_set.target_name.index(mydir))  # append the category label
        with open(file_path, 'r', encoding='gb18030') as file:
            seg_corpus = file.read()  # read the segmented text
import gzip
import pickle

import numpy as np
from skimage import io
from sklearn.datasets.base import Bunch

from dip.load_data import load_image_files, load_mask_images
from dip.mask import bounding_rect_of_mask

datasets = load_mask_images()

data = []
for f, mask in zip(
        datasets.filenames,
        load_image_files(datasets.filenames),
        ):
    # rect: (min_x, min_y, max_x, max_y)
    rect = bounding_rect_of_mask(mask, negative=True)
    data.append(list(rect))
    print('{0}: {1}'.format(f, rect))

bunch = Bunch(name='mask rects')
bunch.data = np.array(data)
bunch.filenames = datasets.filenames
bunch.target = datasets.target
bunch.target_names = datasets.target_names
bunch.description = 'mask rects: (min_x, min_y, max_x, max_y)'

with gzip.open('rects.pkl.gz', 'wb') as f:
    pickle.dump(bunch, f)
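For completeness, a sketch of reading the pickled Bunch back in a downstream script (same file name as above):

# Sketch: reload the bounding-rectangle Bunch for later use.
import gzip
import pickle

with gzip.open('rects.pkl.gz', 'rb') as f:
    rects = pickle.load(f)
print(rects.description)
print(rects.data.shape)   # (n_images, 4) bounding rectangles
print(rects.target_names)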
import sys
import pickle

from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer

reload(sys)  # Python 2 idiom kept from the original script

# Load the training corpus
data_set = {}
# Path of the persisted training corpus
train_path = 'text_corpus1_wordbag/train_set.data'
file_obj = open(train_path, 'rb')
# Read the persisted object
data_set = pickle.load(file_obj)
file_obj.close()

# Define the word-bag data structure
wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={})
wordbag.target_name = data_set.target_name
wordbag.label = data_set.label
wordbag.filenames = data_set.filenames

# Build the corpus
corpus = data_set.contents

# Load the stop-word list from file
stpwrdpath = 'extra_dict/hlt_stop_words.txt'
stpwrd_dic = open(stpwrdpath, 'rb')
stpwrd_content = stpwrd_dic.read()
# Convert the stop words to a list
stpwrdlst = stpwrd_content.splitlines()
stpwrd_dic.close()
def get_data(clf, train, cats, fixk, min_size, vct, raw, limit=2): import copy min_size = 10 args.fixk = None data, vct2 = load_from_file(train, cats, fixk, min_size, vct, raw=raw) print("Data %s" % args.train) print("Data size %s" % len(data.train.data)) ### SENTENCE TRANSFORMATION sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') ## delete <br> to "." to recognize as end of sentence data.train.data = clean_html(data.train.data) data.test.data = clean_html(data.test.data) print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0])) ## Get the features of the sentence dataset ## create splits of data: pool, test, oracle, sentences expert_data = Bunch() train_test_data = Bunch() expert_data.sentence, train_test_data.pool = split_data(data.train) expert_data.oracle, train_test_data.test = split_data(data.test) data.train.data = train_test_data.pool.train.data data.train.target = train_test_data.pool.train.target data.test.data = train_test_data.test.train.data data.test.target = train_test_data.test.train.target ## convert document to matrix data.train.bow = vct.fit_transform(data.train.data) data.test.bow = vct.transform(data.test.data) #### EXPERT CLASSIFIER: ORACLE print("Training Oracle expert") labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector, vct, limit=limit) print len(sent_train) expert_data.oracle.train.data = sent_train expert_data.oracle.train.target = np.array(labels) expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data) print expert_data.oracle.train.bow.shape # exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty) exp_clf = copy.copy(clf) exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target) #### EXPERT CLASSIFIER: SENTENCES print("Training sentence expert") labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector, vct, limit=limit) expert_data.sentence.train.data = sent_train expert_data.sentence.train.target = np.array(labels) expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data) sent_clf = None # if args.cheating: sent_clf = copy.copy(clf) # sent_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty) sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target) return exp_clf, data, vct, sent_clf, expert_data
def fetch_atlas_basc_multiscale_2015(version='sym', data_dir=None, resume=True, verbose=1): """Downloads and loads multiscale functional brain parcellations This atlas includes group brain parcellations generated from resting-state functional magnetic resonance images from about 200 young healthy subjects. Multiple scales (number of networks) are available, among 7, 12, 20, 36, 64, 122, 197, 325, 444. The brain parcellations have been generated using a method called bootstrap analysis of stable clusters called as BASC, (Bellec et al., 2010) and the scales have been selected using a data-driven method called MSTEPS (Bellec, 2013). Note that two versions of the template are available, 'sym' or 'asym'. The 'asym' type contains brain images that have been registered in the asymmetric version of the MNI brain template (reflecting that the brain is asymmetric), while the 'sym' type contains images registered in the symmetric version of the MNI template. The symmetric template has been forced to be symmetric anatomically, and is therefore ideally suited to study homotopic functional connections in fMRI: finding homotopic regions simply consists of flipping the x-axis of the template. .. versionadded:: 0.2.3 Parameters ---------- version: str, optional Available versions are 'sym' or 'asym'. By default all scales of brain parcellations of version 'sym' will be returned. data_dir: str, optional directory where data should be downloaded and unpacked. url: str, optional url of file to download. resume: bool whether to resumed download of a partly-downloaded file. verbose: int verbosity level (0 means no message). Returns ------- data: sklearn.datasets.base.Bunch dictionary-like object, Keys are: - "scale007", "scale012", "scale020", "scale036", "scale064", "scale122", "scale197", "scale325", "scale444": str, path to Nifti file of various scales of brain parcellations. - "description": details about the data release. References ---------- Bellec P, Rosa-Neto P, Lyttelton OC, Benali H, Evans AC, Jul. 2010. Multi-level bootstrap analysis of stable clusters in resting-state fMRI. NeuroImage 51 (3), 1126-1139. URL http://dx.doi.org/10.1016/j.neuroimage.2010.02.082 Bellec P, Jun. 2013. Mining the Hierarchy of Resting-State Brain Networks: Selection of Representative Clusters in a Multiscale Structure. Pattern Recognition in Neuroimaging (PRNI), 2013 pp. 54-57. Notes ----- For more information on this dataset's structure, see https://figshare.com/articles/basc/1285615 """ versions = ['sym', 'asym'] if version not in versions: raise ValueError('The version of Brain parcellations requested "%s" ' 'does not exist. Please choose one among them %s.' 
% (version, str(versions))) keys = ['scale007', 'scale012', 'scale020', 'scale036', 'scale064', 'scale122', 'scale197', 'scale325', 'scale444'] if version == 'sym': url = "https://ndownloader.figshare.com/files/1861819" elif version == 'asym': url = "https://ndownloader.figshare.com/files/1861820" opts = {'uncompress': True} dataset_name = "basc_multiscale_2015" data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose) folder_name = 'template_cambridge_basc_multiscale_nii_' + version basenames = ['template_cambridge_basc_multiscale_' + version + '_' + key + '.nii.gz' for key in keys] filenames = [(os.path.join(folder_name, basename), url, opts) for basename in basenames] data = _fetch_files(data_dir, filenames, resume=resume, verbose=verbose) descr = _get_dataset_descr(dataset_name) params = dict(zip(keys, data)) params['description'] = descr return Bunch(**params)
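A usage sketch for the fetcher above, assuming nilearn is installed; the 64-cluster scale is an arbitrary choice:

# Sketch: fetch the symmetric BASC templates and plot one scale.
from nilearn import datasets, plotting

basc = datasets.fetch_atlas_basc_multiscale_2015(version='sym')
print(sorted(k for k in basc.keys() if k.startswith('scale')))
plotting.plot_roi(basc.scale064, title='BASC, 64 clusters')
plotting.show()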
def main(): accuracies = defaultdict(lambda: []) aucs = defaultdict(lambda: []) x_axis = defaultdict(lambda: []) vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1), token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer()) vct_analizer = vct.build_tokenizer() print("Start loading ...") # data fields: data, bow, file_names, target_names, target ########## NEWS GROUPS ############### # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf categories = [['alt.atheism', 'talk.religion.misc'], ['comp.graphics', 'comp.windows.x'], ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'], ['rec.sport.baseball', 'sci.crypt']] min_size = max(50, args.fixk) if "imdb" in args.train: ########## IMDB MOVIE REVIEWS ########### data = load_imdb(args.train, shuffle=True, rnd=2356, vct=vct, min_size=min_size, fix_k=args.fixk) # should brind data as is elif "aviation" in args.train: raise Exception("We are not ready for that data yet") elif "20news" in args.train: ########## 20 news groups ###### data = load_20newsgroups(categories=categories[0], vectorizer=vct, min_size=min_size, fix_k=args.fixk) # for testing purposes elif "dummy" in args.train: ########## DUMMY DATA########### data = load_dummy("C:/Users/mramire8/Documents/code/python/data/dummy", shuffle=True, rnd=2356, vct=vct, min_size=0, fix_k=args.fixk) else: raise Exception("We do not know that dataset") print("Data %s" % args.train) print("Data size %s" % len(data.train.data)) #print(data.train.data[0]) #### COST MODEL parameters = parse_parameters(args.cost_model) print "Cost Parameters %s" % parameters cost_model = set_cost_model(parameters) print "\nCost Model: %s" % cost_model.__class__.__name__ #### ACCURACY MODEL # try: # # accu_parameters = parse_parameters(args.accu_model) # except ValueError: accu_parameters = parse_parameters_mat(args.accu_model) # else # print("Error: Accuracy parameters didn't work") print "Accuracy Parameters %s" % accu_parameters #if "fixed" in args.accu_function: # accuracy_model = base_models.FixedAccuracyModel(accuracy_value=.7) #elif "log" in args.accu_function: # accuracy_model = base_models.LogAccuracyModel(model=parameters) #elif "linear" in args.accu_function: # accuracy_model = base_models.LRAccuracyModel(model=parameters) #else: # raise Exception("We need a defined cost function options [fixed|log|linear]") # #print "\nAccuracy Model: %s " % accuracy_model #### CLASSIFIER #### Informed priors #feature_counts = np.ones(x_train.shape[0]) * x_train #feature_frequencies = feature_counts / np.sum(feature_counts) #alpha = feature_frequencies alpha = 1 clf = MultinomialNB(alpha=alpha) print "\nClassifier: %s" % clf #### EXPERT MODEL #expert = baseexpert.BaseExpert() if "fixed" in args.expert: expert = baseexpert.FixedAccuracyExpert(accuracy_value=accu_parameters[0], cost_function=cost_model.cost_function) #average value of accuracy of the experts elif "true" in args.expert: expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function) elif "linear" in args.expert: #expert = baseexpert.LRFunctionExpert(model=[0.0019, 0.6363],cost_function=cost_model.cost_function) raise Exception("We do not know linear yet!!") elif "log" in args.expert: expert = baseexpert.LogFunctionExpert(model=accu_parameters, cost_function=cost_model.cost_function) elif "direct" in args.expert: expert = baseexpert.LookUpExpert(accuracy_value=accu_parameters, cost_function=cost_model.cost_function) else: raise Exception("We need a 
defined cost function options [fixed|log|linear]") #expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function) print "\nExpert: %s " % expert #### ACTIVE LEARNING SETTINGS step_size = args.step_size bootstrap_size = args.bootstrap evaluation_points = 200 eval_range = 1 if (args.budget / evaluation_points) <= 0 else args.budget / evaluation_points print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size, evaluation_points, args.fixk, 50)) t0 = time.time() ### experiment starts for t in range(args.trials): print "*" * 60 print "Trial: %s" % t # TODO shuffle the data?? #student = baselearner.BaseLearner(model=clf, cost_model=cost_model, accuracy_model=accuracy_model, budget=args.budget, # seed=t) student = randomsampling.RandomSamplingLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t) print "\nStudent: %s " % student train_indices = [] train_x = [] train_y = [] pool = Bunch() pool.data = data.train.bow.tocsr() # full words, for training pool.fixk = data.train.bowk.tocsr() # k words BOW for querying pool.target = data.train.target pool.predicted = [] pool.kwords = np.array(data.train.kwords) # k words pool.remaining = set(range(pool.data.shape[0])) # indices of the pool #for x in pool.fixk: # print x.todense().sum() bootstrapped = False current_cost = 0 iteration = 0 while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter: if not bootstrapped: ## random bootstrap #bt = randomsampling.BootstrapRandom(random_state=t * 10) ## random from each bootstrap bt = randomsampling.BootstrapFromEach(t * 10) query_index = bt.bootstrap(pool=pool, k=bootstrap_size) bootstrapped = True print "Bootstrap: %s " % bt.__class__.__name__ print else: query_index = student.pick_next(pool=pool, k=step_size) query = pool.fixk[query_index] # query with k words query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]] #if query_size[0] >50: # print "*** %s" % pool.kwords[query_index] ground_truth = pool.target[query_index] #labels, spent = expert.label(unlabeled=query, target=ground_truth) if iteration == 0: ## bootstrap uses ground truth labels = ground_truth else: #labels = expert.label_instances(query, ground_truth) labels = expert.label_instances(query_size, ground_truth) #spent = expert.estimate_instances(pool.kwords[query_index]) spent = expert.estimate_instances(query_size) query_cost = np.array(spent).sum() current_cost += query_cost train_indices.extend(query_index) # remove labels from pool pool.remaining.difference_update(query_index) # add labels to training train_x = pool.data[train_indices] ## train with all the words # update labels with the expert labels #train_y = pool.target[train_indices] train_y.extend(labels) if train_x.shape[0] != len(train_y): raise Exception("Training data corrupted!") # retrain the model current_model = student.train(train_x, train_y) # evaluate and save results y_probas = current_model.predict_proba(data.test.bow) #auc = metrics.roc_auc_score(data.test.target, y_probas[:,1]) auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1]) pred_y = current_model.classes_[np.argmax(y_probas, axis=1)] accu = metrics.accuracy_score(data.test.target, pred_y) print ( "TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(len(train_indices), accu, auc, query_cost, current_cost, spent)) ## the results should be based on the cost of the labeling if iteration > 0: # bootstrap iteration student.budget -= query_cost ## Bootstrap 
doesn't count #x_axis_range = int(current_cost / eval_range) x_axis_range = current_cost x_axis[x_axis_range].append(current_cost) ## save results #accuracies[len(train_indices)].append(accu) #aucs[len(train_indices)].append(auc) accuracies[x_axis_range].append(accu) aucs[x_axis_range].append(auc) iteration += 1 print("Elapsed time %.3f" % (time() - t0)) print_results(x_axis, accuracies, aucs)
    return bunch

# Write a Bunch object to disk
def writebunchobj(path, bunchobj):
    file_obj = open(path, "wb")
    pickle.dump(bunchobj, file_obj)
    file_obj.close()

# 1. Read the stop-word list
stopword_path = "train_word_bag/hlt_stop_words.txt"
stpwrdlst = readfile(stopword_path).splitlines()

# 2. Load the segmented word-vector Bunch object of the test set
path = "test_word_bag/test_set.dat"  # path of the saved word-vector space
bunch = readbunchobj(path)

# 3. Build the TF-IDF vector space of the test set
testspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                  filenames=bunch.filenames, tdm=[], vocabulary={})

# 4. Load the training-set word bag
trainbunch = readbunchobj("train_word_bag/tfdifspace.dat")

# 5. Initialize the vector-space model with TfidfVectorizer, reusing the training vocabulary
vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5,
                             vocabulary=trainbunch.vocabulary)
transformer = TfidfTransformer()  # computes the tf-idf weight of every term

# Convert the texts to a tf-idf matrix; the vocabulary is kept separately
testspace.tdm = vectorizer.fit_transform(bunch.contents)
testspace.vocabulary = trainbunch.vocabulary

# Persist the word bag
space_path = "test_word_bag/testspace.dat"  # path of the saved vector space
writebunchobj(space_path, testspace)
print("Test word-vector space created successfully!")
def fetch_atlas_surf_destrieux(data_dir=None, url=None,
                               resume=True, verbose=1):
    """Download and load Destrieux et al, 2010 cortical atlas.

    This atlas returns 76 labels per hemisphere based on sulco-gyral
    patterns as distributed with Freesurfer in fsaverage5 surface space.

    .. versionadded:: 0.3

    Parameters
    ----------
    data_dir: str, optional
        Path of the data directory. Use to force data storage in a
        non-standard location. Default: None

    url: str, optional
        Download URL of the dataset. Overwrite the default URL.

    resume: bool, optional (default True)
        If True, try resuming download if possible.

    verbose: int, optional (default 1)
        Defines the level of verbosity of the output.

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, contains:
        - "labels": list
            Contains region labels
        - "map_left": numpy.ndarray
            Index into 'labels' for each vertex on the left hemisphere
            of the fsaverage5 surface
        - "map_right": numpy.ndarray
            Index into 'labels' for each vertex on the right hemisphere
            of the fsaverage5 surface
        - "description": str
            Details about the dataset

    References
    ----------
    Destrieux et al. (2010), Automatic parcellation of human cortical gyri and
    sulci using standard anatomical nomenclature. NeuroImage 53, 1-15.
    """
    if url is None:
        url = "https://www.nitrc.org/frs/download.php/"

    dataset_name = 'destrieux_surface'
    fdescr = _get_dataset_descr(dataset_name)
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)

    # Download annot files, fsaverage surfaces and sulcal information
    annot_file = '%s.aparc.a2009s.annot'
    annot_url = url + '%i/%s.aparc.a2009s.annot'
    annot_nids = {'lh annot': 9343, 'rh annot': 9342}

    annots = []
    for hemi in [('lh', 'left'), ('rh', 'right')]:
        annot = _fetch_files(data_dir,
                             [(annot_file % (hemi[1]),
                               annot_url % (annot_nids['%s annot' % hemi[0]],
                                            hemi[0]),
                               {'move': annot_file % (hemi[1])})],
                             resume=resume, verbose=verbose)[0]
        annots.append(annot)

    annot_left = nb.freesurfer.read_annot(annots[0])
    annot_right = nb.freesurfer.read_annot(annots[1])

    return Bunch(labels=annot_left[2], map_left=annot_left[0],
                 map_right=annot_right[0], description=fdescr)
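A usage sketch for the surface atlas, assuming a recent nilearn with surface plotting; the fetch_surf_fsaverage helper name may differ in older releases (fetch_surf_fsaverage5):

# Sketch: overlay the left-hemisphere parcellation on the fsaverage5 inflated surface.
from nilearn import datasets, plotting

destrieux = datasets.fetch_atlas_surf_destrieux()
fsaverage = datasets.fetch_surf_fsaverage()   # fsaverage5 meshes
plotting.plot_surf_roi(fsaverage.infl_left, roi_map=destrieux.map_left,
                       hemi='left', view='lateral', bg_map=fsaverage.sulc_left,
                       title='Destrieux, left hemisphere')
plotting.show()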
def create_dataset(subject_id): import numpy as np import os from nilearn import datasets from nilearn.datasets import _get_dataset_dir from nilearn.datasets import _get_dataset from sklearn.datasets.base import Bunch import pylab as pl import nibabel as nb from remove import remove_range, remove dataset_name = 'machine_learning' runs = 4 img_data = np.zeros((64,64,33,1)) lab_data = [] session_data = [] for r in range(runs): print 'RUN', r rv = None path = '/gablab/p/eegfmri/analysis/eeg/elists' path_all_codes = '/gablab/p/eegfmri/analysis/iaps/all_labels.txt' path_names2 = os.path.join(path, 'elist_IAPS_%s_%s_raw.txt' %(subject_id, r+1)) if subject_id == '009': path_names2 = os.path.join(path, 'elist_IAPS_%s_%s.txt' %(subject_id, r+1)) eegcodes = np.genfromtxt(path_all_codes, dtype=int) [:, 0] attributes = np.genfromtxt(path_all_codes, dtype=float) [:, 1:4] binary = attributes[:, 2] run_code = np.genfromtxt(path_names2, dtype=str) [:,3] clock = np.genfromtxt(path_names2, dtype=str) [:,4] cl = [] tp = [] for i in range(len(clock)): if run_code[i] == 'R128': timepoint = clock[i].lstrip('0123456789') tp.append(timepoint) if len(tp) > 0: clock[i] = clock[i].lstrip('0123456789') if clock[i] == tp[0]: cl.append([i]) if run_code[i] != 'R128': print i, run_code[i] if clock[i] != tp[0] and run_code[i] == 'R128': print 'TR at index', i, 'removed.' run_code[i] = 'remove' print 'Numbers of TR identical timepoints', len(cl) tr = [] for idx,i in enumerate(run_code): if i == 'R128': tr.append([idx]) print 'Number of TR counted from elist code', len(tr) rv = remove(run_code, 'R') rv = remove(rv, 'remove') rv = remove(rv, 'boundary') rv = remove(rv, 'SyncOn') rv = remove(rv, 'Start') rv = remove(rv, 'Userdefined') rv = remove(rv, 'LowCorrelation') rv = remove(rv, 'TSTART') rv = remove(rv, 'TPEAK') rv = remove(rv, 'TEND') for i in range(len(rv)): if rv[i] == 'R128': rv[i] = '-99' rv[i] = rv[i].lstrip('S') rv[i] = int(rv[i]) # remove stimulus codes for responses rv = remove_range(rv, 240) for idx, i in enumerate(rv): for idx2, i2 in enumerate(eegcodes): if i == i2: rv[idx] = binary[idx2] for idx, i in enumerate(rv): if i != -99: rv[idx-1] = i rv[idx] = 0 # remove last TR as it was apparently not recorded rv[-1] = 0 rv = remove(rv, 0) for idx, i in enumerate(rv): if i == -99: rv[idx] = 0 # until now the list with negative / neutral labels also contains zeros, which we will want to get rid of. # To do this, we will replace the zeros with the code shown prior # First two values will be deleted as well as first two TRs (after fmri_data_i gets assigned for idx, z in enumerate(rv): if idx <= 2 and z == 0: rv[idx] = -77 if idx > 2 and z == 0: rv[idx] = rv[idx-1] for idx, z in enumerate(rv): if idx <= 1 and z != -77: print 'Warning, non-empty first two TRs were deleted.' 
rv = remove(rv, -77) unique = sorted(list(set(rv))) print 'Unique values in RV', unique t = open('/gablab/p/eegfmri/analysis/iaps/pilot%s/machine_learning/neg-neutr_attributes_run%s.txt' %(subject_id, r), 'w') for i in range(len(rv)): t.write("%s %s" %(rv[i], r)) t.write('\n') t.close() print 'Labels Length:', len(rv) file_name = ['neg-neutr_attributes_run%s.txt' %(r), 'pilot%s_r0%s_bandpassed.nii.gz' %(subject_id, r)] fil = _get_dataset(dataset_name, file_name, data_dir='/gablab/p/eegfmri/analysis/iaps/pilot%s' %(subject_id), folder=None) ds_i = Bunch(func=fil[1], conditions_target=fil[0]) labels_i = np.loadtxt(ds_i.conditions_target, dtype=np.str) bold_i = nb.load(ds_i.func) fmri_data_i = np.copy(bold_i.get_data()) print 'Original fMRI data', fmri_data_i.shape fmri_data_i = fmri_data_i[...,2:] print fmri_data_i.shape affine = bold_i.get_affine() mean_img_i = np.mean(fmri_data_i, axis=3) session_data = np.append(session_data, labels_i[:,1]) lab_data = np.append(lab_data, labels_i[:,0]) img_data = np.concatenate((img_data, fmri_data_i), axis=3) print '__________________________________________________________________________________________________________' if r == 3: img_data = img_data[...,1:] print 'fMRI image', img_data.shape print 'Label Vector Length:', len(lab_data), 'Session Vector Length:', len(session_data) ni_img = nb.Nifti1Image(img_data, affine=None, header=None) nb.save(ni_img, '/gablab/p/eegfmri/analysis/iaps/pilot%s/machine_learning/all_runs.nii' %(subject_id)) f = open('/gablab/p/eegfmri/analysis/iaps/pilot%s/machine_learning/neg-neutr_attributes_all_runs.txt' %(subject_id), 'w') for i in range(len(lab_data)): f.write("%s %s" %(lab_data[i], session_data[i])) f.write('\n') f.close() # set up concatenated dataset in nilearn format file_names = ['neg-neutr_attributes_all_runs.txt', 'all_runs.nii'] files = _get_dataset(dataset_name, file_names, data_dir='/gablab/p/eegfmri/analysis/iaps/pilot%s' %(subject_id), folder=None) ds = Bunch(func=files[1], conditions_target=files[0]) print ds.keys(), ds labels = np.loadtxt(ds.conditions_target, dtype=np.str) bold = nb.load(ds.func) fmri_data = np.copy(bold.get_data()) print fmri_data.shape affine = bold_i.get_affine() # just choose one # Compute the mean EPI: we do the mean along the axis 3, which is time mean_img = np.mean(fmri_data, axis=3) return (ds, labels, bold, fmri_data, affine, mean_img) # later 'ds' will be sufficient
def main(): print args print accuracies = defaultdict(lambda: []) ora_accu = defaultdict(lambda: []) oracle_accuracies =[] ora_cm = defaultdict(lambda: []) lbl_dit = defaultdict(lambda: []) aucs = defaultdict(lambda: []) x_axis = defaultdict(lambda: []) vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1), token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer()) print("Start loading ...") # data fields: data, bow, file_names, target_names, target ########## NEWS GROUPS ############### # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf categories = [['alt.atheism', 'talk.religion.misc'], ['comp.graphics', 'comp.windows.x'], ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'], ['rec.sport.baseball', 'sci.crypt']] min_size = 10 args.fixk = None data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True) print("Data %s" % args.train) print("Data size %s" % len(data.train.data)) parameters = experiment_utils.parse_parameters_mat(args.cost_model) print "Cost Parameters %s" % parameters cost_model = experiment_utils.set_cost_model(args.cost_function, parameters=parameters) print "\nCost Model: %s" % cost_model.__class__.__name__ ### SENTENCE TRANSFORMATION if args.train == "twitter": sent_detector = TwitterSentenceTokenizer() else: sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') ## delete <br> to "." to recognize as end of sentence data.train.data = experiment_utils.clean_html(data.train.data) data.test.data = experiment_utils.clean_html(data.test.data) print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0])) ## Get the features of the sentence dataset ## create splits of data: pool, test, oracle, sentences expert_data = Bunch() if not args.fulloracle: train_test_data = Bunch() expert_data.sentence, train_test_data.pool = split_data(data.train) expert_data.oracle, train_test_data.test = split_data(data.test) data.train.data = train_test_data.pool.train.data data.train.target = train_test_data.pool.train.target data.test.data = train_test_data.test.train.data data.test.target = train_test_data.test.train.target ## convert document to matrix data.train.bow = vct.fit_transform(data.train.data) data.test.bow = vct.transform(data.test.data) #### EXPERT CLASSIFIER: ORACLE print("Training Oracle expert") exp_clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty) if not args.fulloracle: print "Training expert documents:%s" % len(expert_data.oracle.train.data) labels, sent_train = experiment_utils.split_data_sentences(expert_data.oracle.train, sent_detector, vct, limit=args.limit) expert_data.oracle.train.data = sent_train expert_data.oracle.train.target = np.array(labels) expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data) exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target) else: # expert_data.data = np.concatenate((data.train.data, data.test.data)) # expert_data.target = np.concatenate((data.train.target, data.test.target)) expert_data.data =data.train.data expert_data.target = data.train.target expert_data.target_names = data.train.target_names labels, sent_train = experiment_utils.split_data_sentences(expert_data, sent_detector, vct, limit=args.limit) expert_data.bow = vct.transform(sent_train) expert_data.target = labels expert_data.data = sent_train exp_clf.fit(expert_data.bow, expert_data.target) if "neutral" in 
args.expert: expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold, cost_function=cost_model.cost_function) elif "true" in args.expert: expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function) elif "pred" in args.expert: expert = baseexpert.PredictingExpert(exp_clf, #threshold=args.neutral_threshold, cost_function=cost_model.cost_function) elif "human" in args.expert: expert = baseexpert.HumanExpert(", ".join(["{}={}".format(a,b) for a,b in enumerate(data.train.target_names)])+"? > ") else: raise Exception("We need an expert!") print "\nExpert: %s " % expert #### EXPERT CLASSIFIER: SENTENCES print("Training sentence expert") sent_clf = None if args.cheating: labels, sent_train = experiment_utils.split_data_sentences(expert_data.sentence.train, sent_detector, vct, limit=args.limit) expert_data.sentence.train.data = sent_train expert_data.sentence.train.target = np.array(labels) expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data) sent_clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty) sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target) #### STUDENT CLASSIFIER clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty) print "\nStudent Classifier: %s" % clf print "\nSentence Classifier: %s" % sent_clf print "\nExpert Oracle Classifier: %s" % exp_clf print "\nPenalty Oracle:", exp_clf.C print "\nVectorizer: %s" % vct #### ACTIVE LEARNING SETTINGS step_size = args.step_size bootstrap_size = args.bootstrap evaluation_points = 200 print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size, evaluation_points, args.fixk, min_size)) print ("Anytime active learning experiment - use objective function to pick data") t0 = time.time() tac = [] tau = [] ### experiment starts for t in range(args.trials): trial_accu = [] trial_aucs = [] print "*" * 60 print "Trial: %s" % t student = get_student(clf, cost_model, sent_clf, sent_detector, vct) student.human_mode = args.expert == 'human' print "\nStudent: %s " % student train_indices = [] neutral_data = [] # save the xik vectors train_x = [] train_y = [] neu_x = [] # data to train the classifier neu_y = np.array([]) pool = Bunch() pool.data = data.train.bow.tocsr() # full words, for training pool.text = data.train.data pool.target = data.train.target pool.predicted = [] pool.remaining = set(range(pool.data.shape[0])) # indices of the pool bootstrapped = False current_cost = 0 iteration = 0 query_index = None query_size = None oracle_answers = 0 calibrated=args.calibrate while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter: util = [] if not bootstrapped: ## random from each bootstrap bt = randomsampling.BootstrapFromEach(t * 10) query_index = bt.bootstrap(pool=pool, k=bootstrap_size) bootstrapped = True query = pool.data[query_index] print "Bootstrap: %s " % bt.__class__.__name__ print else: chosen = student.pick_next(pool=pool, step_size=step_size) query_index = [x for x, y in chosen] # document id of chosen instances query = [y[0] for x, y in chosen] # sentence of the document query_size = [1] * len(query_index) ground_truth = pool.target[query_index] if iteration == 0: ## bootstrap uses ground truth labels = ground_truth spent = [0] * len(ground_truth) ## bootstrap cost is ignored else: # print "ask labels" labels = expert.label_instances(query, ground_truth) spent = expert.estimate_instances(query_size) ### 
accumulate the cost of the query query_cost = np.array(spent).sum() current_cost += query_cost useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None]) neutral_answers = np.array([[x, z] for x, y, z in zip(query_index, labels, query_size) if y is None]) \ if iteration != 0 else np.array([]) ## add data recent acquired to train if useful_answers.shape[0] != 0: train_indices.extend(useful_answers[:, 0]) # add labels to training train_x = pool.data[train_indices] # # train with all the words # update labels with the expert labels train_y.extend(useful_answers[:, 1]) neu_x, neu_y, neutral_data = update_sentence(neutral_data, neu_x, neu_y, labels, query_index, pool, vct) # neu_x, neu_y, neutral_data = update_sentence_query(neutral_data, neu_x, neu_y, query, labels) if neu_y.shape[0] != neu_x.shape[0]: raise Exception("Training data corrupted!") if train_x.shape[0] != len(train_y): raise Exception("Training data corrupted!") # remove labels from pool pool.remaining.difference_update(query_index) # retrain the model current_model = student.train_all(train_x, train_y, neu_x, neu_y) # evaluate and save results y_probas = current_model.predict_proba(data.test.bow) auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1]) pred_y = current_model.classes_[np.argmax(y_probas, axis=1)] correct_labels = (np.array(ground_truth) == np.array(labels).reshape(len(labels))).sum() accu = metrics.accuracy_score(data.test.target, pred_y) print ("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tGT:{5}\tneu:{6}\t{7}\tND:{8}\tTD:{9}\t ora_accu:{10}".format( len(train_indices), accu, auc, query_cost, current_cost, ground_truth, len(neutral_answers), neu_y.shape[0], neu_y.sum(), np.array(train_y).sum(), correct_labels)) ## the results should be based on the cost of the labeling if iteration > 0: # bootstrap iteration student.budget -= query_cost ## Bootstrap doesn't count # oracle accuracy (from queries) oracle_answers += correct_labels x_axis_range = current_cost x_axis[x_axis_range].append(current_cost) ## save results accuracies[x_axis_range].append(accu) aucs[x_axis_range].append(auc) ora_accu[x_axis_range].append(1. 
* correct_labels) ora_cm[x_axis_range].append(metrics.confusion_matrix(ground_truth, labels, labels=np.unique(train_y))) lbl_dit[x_axis_range].append(np.sum(train_y)) # partial trial results trial_accu.append([x_axis_range, accu]) trial_aucs.append([x_axis_range, auc]) # oracle_accuracies[x_axis_range].append(oracle_answers) iteration += 1 # end of budget loop tac.append(trial_accu) tau.append(trial_aucs) oracle_accuracies.append(1.*oracle_answers / (len(train_indices)-bootstrap_size)) print "Trial: {}, Oracle right answers: {}, Iteration: {}, Labels:{}, ACCU-OR:{}".format(t, oracle_answers, iteration, len(train_indices)-bootstrap_size,1.*oracle_answers / (len(train_indices)-bootstrap_size)) #end trial loop if args.cost_function not in "uniform": accuracies = experiment_utils.extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size) aucs = experiment_utils.extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size) print "\nAverage oracle accuracy: ", np.array(oracle_accuracies).mean() print("Elapsed time %.3f" % (time.time() - t0)) cheating = "CHEATING" if args.cheating else "NOCHEAT" experiment_utils.print_extrapolated_results(accuracies, aucs, file_name=args.train+"-"+cheating+"-"+args.prefix+"-"+args.classifier+"-"+args.student) experiment_utils.oracle_accuracy(ora_accu, file_name=args.train+"-"+cheating+"-"+args.prefix+"-"+args.classifier+"-"+args.student, cm=ora_cm, num_trials=args.trials)
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets.base import Bunch  # the Bunch class
import pickle  # for persistence
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

##################################################################
## Load the data
categories = ["alt.atheism", "soc.religion.christian", "comp.graphics", "sci.med"]  # news categories to fetch
data_set = fetch_20newsgroups(subset="train", categories=categories, shuffle=True,
                              random_state=42)  # everything is downloaded first, then the subset is extracted
print(data_set.target_names)  # ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

##################################################################
## Define the word-bag data structure
# tdm: term-document matrix after weighting
stpwrdlst = []  # the stop-word list is empty
wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={}, stpwrdlst=[])
wordbag.target_name = data_set.target_names
wordbag.label = data_set.target
wordbag.filenames = data_set.filenames
wordbag.stpwrdlst = stpwrdlst

vectorizer = CountVectorizer(stop_words=stpwrdlst)  # build the bag-of-words model with CountVectorizer
transformer = TfidfTransformer()  # this class computes the tf-idf weight of every term
fea_train = vectorizer.fit_transform(data_set.data)  # convert the texts to a term-frequency matrix
print(fea_train.shape)  # (2257, 35788): 2257 documents, 35788 words
wordbag.tdm = fea_train  # assign the term-document matrix
wordbag.vocabulary = vectorizer.vocabulary_

##################################################################
## Persist the word bag
file_obj = open("tmp.data", "wb")
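Assuming the script goes on to pickle `wordbag` into tmp.data (the file it opens at the end), a later script can reload it and train a classifier directly on the stored term-document matrix; a minimal sketch:

# Sketch: reload the persisted word bag and fit a Naive Bayes classifier on it.
import pickle
from sklearn.naive_bayes import MultinomialNB

with open("tmp.data", "rb") as f:
    wb = pickle.load(f)

clf = MultinomialNB(alpha=0.001).fit(wb.tdm, wb.label)
print(clf.score(wb.tdm, wb.label))  # training accuracy only; use a held-out set in practice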
def fetch_mixed_gambles(n_subjects=1, data_dir=None, url=None, resume=True,
                        return_raw_data=False, verbose=0):
    """Fetch Jimura "mixed gambles" dataset.

    Parameters
    ----------
    n_subjects: int, optional (default 1)
        The number of subjects to load. If None is given, all the
        subjects are used.

    data_dir: string, optional (default None)
        Path of the data directory. Used to force data storage in a specified
        location. Default: None.

    url: string, optional (default None)
        Override download URL. Used for test only (or if you setup a mirror of
        the data).

    resume: bool, optional (default True)
        If true, try resuming download if possible.

    verbose: int, optional (default 0)
        Defines the level of verbosity of the output.

    return_raw_data: bool, optional (default False)
        If false, the data is transformed into an (X, y) pair suitable for
        machine learning routines. X is a list of n_subjects * 48
        Nifti1Image objects (where 48 is the number of trials), and y is an
        array of shape (n_subjects * 48,).

    Returns
    -------
    data: Bunch
        Dictionary-like object, the interest attributes are :
        'zmaps': string list
            Paths to realigned gain betamaps (one nifti per subject), or, if
            return_raw_data is false, a list of n_subjects * 48 Nifti1Image
            objects (one per trial).
        'gain': array of shape (n_subjects * 48,)
            Only set if return_raw_data is false: the gain value of each trial.
        'mask_img': Nifti1Image
            Only set if return_raw_data is false: mask image common to all zmaps.

    References
    ----------
    [1] K. Jimura and R. Poldrack, "Analyses of regional-average activation
        and multivoxel pattern information tell complementary stories",
        Neuropsychologia, vol. 50, page 544, 2012
    """
    if n_subjects > 16:
        warnings.warn("Warning: there are only 16 subjects!")
        n_subjects = 16
    if url is None:
        url = ("https://www.nitrc.org/frs/download.php/7229/"
               "jimura_poldrack_2012_zmaps.zip")
    opts = dict(uncompress=True)
    files = [("zmaps%ssub%03i_zmaps.nii.gz" % (os.sep, (j + 1)), url, opts)
             for j in range(n_subjects)]
    data_dir = _get_dataset_dir("jimura_poldrack_2012_zmaps",
                                data_dir=data_dir)
    zmap_fnames = _fetch_files(data_dir, files, resume=resume, verbose=verbose)
    data = Bunch(zmaps=zmap_fnames)
    if not return_raw_data:
        X, y, mask_img = _load_mixed_gambles(
            [nibabel.load(z) for z in data.zmaps])
        data.zmaps, data.gain, data.mask_img = X, y, mask_img
    return data
def get_WS353_set2():
    data = pd.read_csv(WS353_set2_path, sep="\t", header=0).values
    return Bunch(X=data[:, 0:2].astype("object"),
                 y=data[:, 2].astype(float))
def main(): accuracies = defaultdict(lambda: []) aucs = defaultdict(lambda: []) x_axis = defaultdict(lambda: []) vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3), token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer()) vct_analizer = vct.build_tokenizer() print("Start loading ...") # data fields: data, bow, file_names, target_names, target ########## NEWS GROUPS ############### # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf categories = [['alt.atheism', 'talk.religion.misc'], ['comp.graphics', 'comp.windows.x'], ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'], ['rec.sport.baseball', 'sci.crypt']] min_size = max(100, args.fixk) fixk_saved = "{0}{1}.p".format(args.train, args.fixk) try: fixk_file = open(fixk_saved, "rb") data = pickle.load(fixk_file) except IOError: data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5) fixk_file = open(fixk_saved, "wb") pickle.dump(data, fixk_file) # data = load_dataset(args.train, args.fixk, categories[0], vct, min_size) print("Data %s" % args.train) print("Data size %s" % len(data.train.data)) parameters = parse_parameters_mat(args.cost_model) print "Cost Parameters %s" % parameters cost_model = set_cost_model(args.cost_function, parameters=parameters) print "\nCost Model: %s" % cost_model.__class__.__name__ #### STUDENT CLASSIFIER clf = linear_model.LogisticRegression(penalty="l1", C=1) print "\nStudent Classifier: %s" % clf #### EXPERT CLASSIFIER exp_clf = linear_model.LogisticRegression(penalty='l1', C=.3) exp_clf.fit(data.test.bow, data.test.target) expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold, cost_function=cost_model.cost_function) print "\nExpert: %s " % expert #### ACTIVE LEARNING SETTINGS step_size = args.step_size bootstrap_size = args.bootstrap evaluation_points = 200 print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size, evaluation_points, args.fixk, min_size)) print ("Cheating experiment - use full uncertainty query k words") t0 = time.time() ### experiment starts tx =[] tac = [] tau = [] for t in range(args.trials): trial_accu =[] trial_aucs = [] trial_x_axis = [] print "*" * 60 print "Trial: %s" % t student = randomsampling.UncertaintyLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t) print "\nStudent: %s " % student train_indices = [] train_x = [] train_y = [] pool = Bunch() pool.data = data.train.bow.tocsr() # full words, for training pool.fixk = data.train.bowk.tocsr() # k words BOW for querying pool.target = data.train.target pool.predicted = [] pool.kwords = np.array(data.train.kwords) # k words pool.remaining = set(range(pool.data.shape[0])) # indices of the pool bootstrapped = False current_cost = 0 iteration = 0 while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter: if not bootstrapped: ## random from each bootstrap bt = randomsampling.BootstrapFromEach(t * 10) query_index = bt.bootstrap(pool=pool, k=bootstrap_size) bootstrapped = True print "Bootstrap: %s " % bt.__class__.__name__ print else: query_index = student.pick_next(pool=pool, k=step_size) query = pool.fixk[query_index] # query with k words query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]] ground_truth = pool.target[query_index] #labels, spent = expert.label(unlabeled=query, target=ground_truth) if iteration == 0: ## bootstrap uses ground truth labels = 
ground_truth spent = [0] * len(ground_truth) ## bootstrap cost is ignored else: labels = expert.label_instances(query, ground_truth) spent = expert.estimate_instances(query_size) ## add data recent acquired to train ## CHANGE: if label is not useful, ignore and do not charge money for it useful_answers = np.array([[x, y, z] for x, y, z in zip(query_index, labels, spent) if y is not None]) # train_indices.extend(query_index) if useful_answers.shape[0] != 0: train_indices.extend(useful_answers[:, 0]) # add labels to training train_x = pool.data[train_indices] ## train with all the words # update labels with the expert labels train_y.extend(useful_answers[:, 1]) #count for cost ### accumulate the cost of the query # query_cost = np.array(spent).sum() # current_cost += query_cost query_cost = useful_answers[:, 2] query_cost = np.sum(query_cost) current_cost += query_cost if train_x.shape[0] != len(train_y): raise Exception("Training data corrupted!") # remove labels from pool pool.remaining.difference_update(query_index) # retrain the model current_model = student.train(train_x, train_y) # evaluate and save results y_probas = current_model.predict_proba(data.test.bow) auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1]) pred_y = current_model.classes_[np.argmax(y_probas, axis=1)] accu = metrics.accuracy_score(data.test.target, pred_y) print ("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(len(train_indices), accu, auc, query_cost, current_cost, spent)) ## the results should be based on the cost of the labeling if iteration > 0: # bootstrap iteration student.budget -= query_cost ## Bootstrap doesn't count x_axis_range = current_cost x_axis[x_axis_range].append(current_cost) ## save results accuracies[x_axis_range].append(accu) aucs[x_axis_range].append(auc) ## partial trial results trial_accu.append([x_axis_range, accu]) trial_aucs.append([x_axis_range, auc]) iteration += 1 # end of budget loop tac.append(trial_accu) tau.append(trial_aucs) #end trial loop accuracies = extrapolate_trials(tac) aucs = extrapolate_trials(tau) print("Elapsed time %.3f" % (time.time() - t0)) print_extrapolated_results(accuracies, aucs)
# Read a file def readfile(path): fp = open(path, "rb") content = fp.read() fp.close() return content # The Bunch class provides a key/value object # target_name: list of all category names # label: the category label of each file # filenames: file paths # contents: the tokenized contents of each file bunch = Bunch(target_name=[], label=[], filenames=[], contents=[]) wordbag_path = "test_word_bag/test_set.dat" # output path for the serialized test-set Bunch seg_path = "test_corpus_seg/" # path of the segmented, categorized corpus catelist = os.listdir(seg_path) # all subdirectories under seg_path bunch.target_name.extend(catelist) # walk every file of every category directory for mydir in catelist: class_path = seg_path + mydir + "/" # path of the category subdirectory file_list = os.listdir(class_path) # all files under class_path for file_path in file_list: # iterate over the files of this category fullname = class_path + file_path # full path of the file bunch.label.append(mydir) bunch.filenames.append(fullname) bunch.contents.append(readfile(fullname).strip()) # read the file contents
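The block above only fills the Bunch; a hedged sketch of the persistence step that `wordbag_path` suggests follows, using plain pickle (the serializer actually used elsewhere in this project is an assumption):

import pickle

# persist the Bunch built above so the TF-IDF / training stages can reload it
with open(wordbag_path, "wb") as file_obj:
    pickle.dump(bunch, file_obj)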
allBunch.tfidf = vector2tfidf.fit_transform(allBunch.vector, allBunch.tfidf) allBunch.multi_labels = MultiLabelBinarizer( classes=ACCU_LIST).fit_transform(allBunch.labels) joblib.dump(allBunch, ALL_BUNCH_FILE) # save the Bunch so that later programs can load it directly and save time # allBunch = joblib.load(ALL_BUNCH_FILE) # load the saved allBunch print(allBunch.vector.shape) print(np.max(allBunch.vector)) allBunch.vector = allBunch.vector / np.max(allBunch.vector) # cross-validate allBunch with multi-label training first, without feature selection trainBunch = Bunch(labels=[], multi_labels=[], contents=[], vector=[], selectVector=[], tfidf=[], selectTfidf=[]) testBunch = Bunch(labels=[], multi_labels=[], contents=[], vector=[], selectVector=[], tfidf=[], selectTfidf=[]) trainBunch.multi_labels, testBunch.multi_labels, trainBunch.vector, testBunch.vector, trainBunch.tfidf, testBunch.tfidf \ = train_test_split(allBunch.multi_labels, allBunch.vector, allBunch.tfidf, test_size=0.3) clf = DecisionTreeClassifier() print("Training in progress >>>>>>") st = time.time()
def fetch_imagesets( data_folder_path="../coral_labeling/Labels", funneled=True, resize=None, min_images_per_category=0, color=False, hue=0, restrict=None, #slice_=(slice(0, 255), slice(0, 318)), slice_=None, download_if_missing=True): """Loader for images. This dataset is a collection of JPEG pictures. Each pixel of each channel (color in RGB) is encoded by a float in range 0.0 - 1.0. Parameters ---------- data_folder_path: optional Path of the folder holding the labelled images. funneled: boolean, optional, default: True Download and use the funneled variant of the dataset. resize: float, optional, default None Ratio used to resize each image. min_images_per_category: int, optional, default 0 The extracted dataset will only retain pictures of people that have at least `min_images_per_category` different pictures. color: boolean, optional, default False Keep the 3 RGB channels instead of averaging them to a single gray level channel. If color is True the shape of the data has one more dimension than the shape with color = False. slice_: optional Provide a custom 2D slice (height, width) to extract the 'interesting' part of the jpeg files and avoid using statistical correlation from the background download_if_missing: optional, True by default If False, raise an IOError if the data is not locally available instead of trying to download the data from the source site. THIS IS NOT SUPPORTED SINCE IMAGES MUST ALWAYS COME FROM LOCAL FILES. hue: optional, 0 by default Return hsv images rather than intensity or RGB Returns ------- dataset : dict-like object with the following attributes: dataset.data : numpy array of shape (13233, 2914) Each row corresponds to a ravelled image of original size 62 x 47 pixels. dataset.images : numpy array of shape (13233, 62, 47) Each row is an image corresponding to one of the 5749 categories in the dataset. dataset.target : numpy array of shape (13233,) Labels associated with each image. Those labels range from 0-5748 and correspond to the category IDs. restrict : restrict category selection. If None, all category directories within the data folder are used, if a string, only the specified category is used, if 0 (zero) then files in the main data folder (only) are used, if 1 (unimplemented) all subcategories are merged into one. dataset.target_names : names of the categories (folders with images in them) dataset.paths : pathnames to the individual images dataset.huespaces : array of vectors summarizing hue information dataset.DESCR : string """ #images_home = "/Volumes/Macintosh_HD/Users/dudek/Code/coral_labeling/Labelsx" # wrap the loader in a memoizing function that will return memmaped data # arrays for optimal memory usage #GD m = Memory(cachedir=images_home, compress=6, verbose=0) #GD load_func = m.cache(_fetch_imagesets) # load and memoize the pairs as np arrays #GD images, target, target_names = load_func( images, target, target_names, paths, huespaces = _fetch_imagesets( data_folder_path, resize=resize, restrict=restrict, min_images_per_category=min_images_per_category, color=color, hue=hue, slice_=slice_) # pack the results as a Bunch instance # return Bunch(data=images.reshape(len(images), -1), images=images, return Bunch(data=images, images=images, target=target, target_names=target_names, paths=paths, huespaces=huespaces, DESCR="coral dataset")
# bunch_data bounch_path = '/Users/slade/Documents/YMM/Code/UCGPCG/src/jobs/terror_recognition/train_model/baseline/bounch_data' build_bounch(train_comment_data, bounch_path) wordbag_path = "/Users/slade/Documents/YMM/Code/UCGPCG/src/jobs/terror_recognition/train_model/baseline/model_data/bunch_set.dat" corpus2Bunch(wordbag_path, bounch_path) # tfidf/cut sequence stopword_path = "/Users/slade/Documents/YMM/Code/UCGPCG/src/jobs/terror_recognition/train_model/stop_words.txt" bunch_path = wordbag_path tri_space_path = '/Users/slade/Documents/YMM/Code/UCGPCG/src/jobs/terror_recognition/train_model/baseline/model_data/tri_space.dat' stpwrdlst = readfile(stopword_path).splitlines() bunch = _readbunchobj(bunch_path) tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[], vocabulary={}) vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, token_pattern=r"(?u)\b\w+\b", ngram_range=(1, 3), max_features=30000) tfidfspace.tdm = vectorizer.fit_transform(bunch.contents) tfidfspace.vocabulary = vectorizer.vocabulary_ _writebunchobj(tri_space_path, tfidfspace) tri_train_set = _readbunchobj(tri_space_path) mnb_tri = MultinomialNB(alpha=0.001) mnb_tri.fit(tri_train_set.tdm, tri_train_set.label)
def test_loads_dumps_bunch(): bunch = Bunch(x="x") bunch_from_pkl = loads(dumps(bunch)) bunch_from_pkl.x = "y" assert_equal(bunch_from_pkl['x'], bunch_from_pkl.x)
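For context, a tiny illustration of the attribute/item duality the test above relies on (a sketch, not part of the test suite):

b = Bunch(x="x")
assert b.x == b["x"]   # attribute access and item access share the same storage
b["y"] = 1             # new keys become reachable as attributes too
assert b.y == 1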
def fetch_multilingual_SimLex999(which="EN"): """ Fetch Multilingual SimLex999 dataset for testing attributional similarity Parameters ------- which : "EN", "RU", "IT" or "DE" for language Returns ------- data : sklearn.datasets.base.Bunch dictionary-like object. Keys of interest: 'X': matrix of 2 words per column, 'y': vector with scores, 'sd': vector of sd of scores, References ---------- Published at http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html. Notes ----- Scores for EN are different than the original SimLex999 dataset. Authors description: Multilingual SimLex999 resource consists of translations of the SimLex999 word similarity data set to three languages: German, Italian and Russian. Each of the translated datasets is scored by 13 human judges (crowdworkers) - all fluent speakers of its language. For consistency, we also collected human judgments for the original English corpus according to the same protocol applied to the other languages. This dataset allows to explore the impact of the "judgement language" (the language in which word pairs are presented to the human judges) on the resulted similarity scores and to evaluate vector space models on a truly multilingual setup (i.e. when both the training and the test data are multilingual). """ if which == "EN": data = _get_as_pd( 'https://www.dropbox.com/s/nczc4ao6koqq7qm/EN-MSIM999.txt?dl=1', 'similarity', header=None, encoding='utf-8', sep=" ") elif which == "DE": data = _get_as_pd( 'https://www.dropbox.com/s/ucpwrp0ahawsdtf/DE-MSIM999.txt?dl=1', 'similarity', header=None, encoding='utf-8', sep=" ") elif which == "IT": data = _get_as_pd( 'https://www.dropbox.com/s/siqjagyz8dkjb9q/IT-MSIM999.txt?dl=1', 'similarity', header=None, encoding='utf-8', sep=" ") elif which == "RU": data = _get_as_pd( 'https://www.dropbox.com/s/3v26edm9a31klko/RU-MSIM999.txt?dl=1', 'similarity', header=None, encoding='utf-8', sep=" ") else: raise RuntimeError("Not recognized which parameter") # We basically select all the columns available X = data.values[:, 0:2] scores = data.values[:, 2:].astype(np.float) y = np.mean(scores, axis=1) sd = np.std(scores, axis=1) return Bunch(X=X.astype("object"), y=y, sd=sd)
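A usage sketch for the fetcher above; the shapes follow from the SimLex999 description (999 pairs), so treat them as expected rather than guaranteed:

data = fetch_multilingual_SimLex999(which="DE")
print(data.X.shape)    # expected (999, 2): one word pair per row
print(data.y.shape)    # expected (999,):  mean of the 13 judge scores
print(data.sd.shape)   # expected (999,):  per-pair standard deviation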
def load_student_grades(return_X_y=False, y_type='G3'): # NOTE: duplicate dict keys are silently collapsed, so the three 'other' entries below all resolve to the last value (3) strings = { 'at_home': 1, 'health': 2, 'other': 5, 'services': 3, 'teacher': 4, 'GP': 1, 'MS': 2, 'course': 1, 'home': 2, 'other': 3, 'reputation': 4, 'father': 2, 'mother': 1, 'other': 3, 'F': 1, 'M': 0, 'yes': 1, 'no': 0 } file_path = os.path.join('datasets', 'student', 'student-por.csv') with open(file_path) as f: data_file = csv.reader(f, delimiter=';') temp = next(data_file) # header row n_features = 30 # n_samples = sum(1 for row in data_file) # data = np.empty((n_samples, n_features)) # target = np.empty((n_samples,)) feature_names = np.array(temp) data = [] target = [] with open(file_path) as f: data_file = csv.reader(f, delimiter=';') i = 0 firstline = True for d in data_file: if firstline: firstline = False else: d = np.array(d) d = np.delete(d, 3, 0) d = np.delete(d, 3, 0) d = np.delete(d, 3, 0) for key, val in strings.items(): d[d == key] = val #data.append(d[1:-3]) data.append(np.array(d[1:-3], dtype='float')) if y_type == 'G3': target.append(d[-1]) elif y_type == 'G2': target.append(d[-2]) elif y_type == 'G1': target.append(d[-3]) i += 1 data = np.array(data[:1000]) target = np.array(target[:1000]).astype(float) if return_X_y: return data, target return Bunch( data=data, target=target, # last column is the target value feature_names=feature_names)
def fetch_atlas_craddock_2012(data_dir=None, url=None, resume=True, verbose=1): """Download and return file names for the Craddock 2012 parcellation The provided images are in MNI152 space. Parameters ---------- data_dir: string directory where data should be downloaded and unpacked. url: string url of file to download. resume: bool whether to resumed download of a partly-downloaded file. verbose: int verbosity level (0 means no message). Returns ------- data: sklearn.datasets.base.Bunch dictionary-like object, keys are: scorr_mean, tcorr_mean, scorr_2level, tcorr_2level, random References ---------- Licence: Creative Commons Attribution Non-commercial Share Alike http://creativecommons.org/licenses/by-nc-sa/2.5/ Craddock, R. Cameron, G.Andrew James, Paul E. Holtzheimer, Xiaoping P. Hu, and Helen S. Mayberg. "A Whole Brain fMRI Atlas Generated via Spatially Constrained Spectral Clustering". Human Brain Mapping 33, no 8 (2012): 1914-1928. doi:10.1002/hbm.21333. See http://www.nitrc.org/projects/cluster_roi/ for more information on this parcellation. """ if url is None: url = "ftp://www.nitrc.org/home/groups/cluster_roi/htdocs" \ "/Parcellations/craddock_2011_parcellations.tar.gz" opts = {'uncompress': True} dataset_name = "craddock_2012" keys = ("scorr_mean", "tcorr_mean", "scorr_2level", "tcorr_2level", "random") filenames = [ ("scorr05_mean_all.nii.gz", url, opts), ("tcorr05_mean_all.nii.gz", url, opts), ("scorr05_2level_all.nii.gz", url, opts), ("tcorr05_2level_all.nii.gz", url, opts), ("random_all.nii.gz", url, opts) ] data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose) sub_files = _fetch_files(data_dir, filenames, resume=resume, verbose=verbose) fdescr = _get_dataset_descr(dataset_name) params = dict([('description', fdescr)] + list(zip(keys, sub_files))) return Bunch(**params)
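A minimal sketch of reading one of the returned parcellation files with nibabel; any of the keys listed above can be substituted for `scorr_mean`:

import nibabel

atlas = fetch_atlas_craddock_2012()
img = nibabel.load(atlas.scorr_mean)   # the Bunch stores file paths
print(img.shape)                       # 4D volume: one parcellation per clustering level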
rootPath = fileutils.getDataPath() + os.sep trainSetFolderPath = rootPath + 'train_word_bag' trainSetFilePath = trainSetFolderPath + os.sep + 'train_set.dat' trainSet = fileutils.readBatchObj(trainSetFilePath) # 2.get stop words stopWordFolderPath = rootPath + 'train_corpus_stop_word' stopWordFileName = 'corpus_stop_word_china.txt' stopWordFilePath = stopWordFolderPath + os.sep + stopWordFileName stopWordFile = open(stopWordFilePath, 'r') stopWordList = stopWordFile.read().splitlines() # print('stopWordList,', stopWordList) stopWordFile.close() # 3.build TF-IDF vector space tfidfSpace = Bunch(target_name=trainSet.target_name, lable=trainSet.lable, filenames=trainSet.filenames, tdm=[], vocabulary=[]) vectorizer = TfidfVectorizer( stop_words=stopWordList, sublinear_tf=True, max_df=0.5) transformer = TfidfTransformer() tfidfSpace.tdm = vectorizer.fit_transform(trainSet.contents) # print('tdm:', tfidfSpace.tdm) tfidfSpace.vocabulary = vectorizer.vocabulary_ # print('tfidfSpace:', tfidfSpace) # 3.save tfidfSpace vocabularyFolderPath = rootPath + 'train_word_bag' vocabularyFileName = 'tfidfSpace.dat' vocabularyPath = vocabularyFolderPath + os.sep + vocabularyFileName if os.path.exists(vocabularyPath): os.remove(vocabularyPath)
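The stale file is removed above, but nothing is written yet. A hedged sketch of the save step, assuming plain pickle is acceptable here (the project's own fileutils writer, if any, is not shown):

import pickle

with open(vocabularyPath, 'wb') as f:
    pickle.dump(tfidfSpace, f)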
def fetch_atlas_smith_2009(data_dir=None, mirror='origin', url=None, resume=True, verbose=1): """Download and load the Smith ICA and BrainMap atlas (dated 2009) Parameters ---------- data_dir: string, optional Path of the data directory. Used to force data storage in a non- standard location. Default: None (meaning: default) mirror: string, optional By default, the dataset is downloaded from the original website of the atlas. Specifying "nitrc" will force download from a mirror, with potentially higher bandwith. url: string, optional Download URL of the dataset. Overwrite the default URL. Returns ------- data: sklearn.datasets.base.Bunch dictionary-like object, contains: - 20-dimensional ICA, Resting-FMRI components: - all 20 components (rsn20) - 10 well-matched maps from these, as shown in PNAS paper (rsn10) - 20-dimensional ICA, BrainMap components: - all 20 components (bm20) - 10 well-matched maps from these, as shown in PNAS paper (bm10) - 70-dimensional ICA, Resting-FMRI components (rsn70) - 70-dimensional ICA, BrainMap components (bm70) References ---------- S.M. Smith, P.T. Fox, K.L. Miller, D.C. Glahn, P.M. Fox, C.E. Mackay, N. Filippini, K.E. Watkins, R. Toro, A.R. Laird, and C.F. Beckmann. Correspondence of the brain's functional architecture during activation and rest. Proc Natl Acad Sci USA (PNAS), 106(31):13040-13045, 2009. A.R. Laird, P.M. Fox, S.B. Eickhoff, J.A. Turner, K.L. Ray, D.R. McKay, D.C Glahn, C.F. Beckmann, S.M. Smith, and P.T. Fox. Behavioral interpretations of intrinsic connectivity networks. Journal of Cognitive Neuroscience, 2011 Notes ----- For more information about this dataset's structure: http://www.fmrib.ox.ac.uk/datasets/brainmap+rsns/ """ if url is None: if mirror == 'origin': url = "http://www.fmrib.ox.ac.uk/datasets/brainmap+rsns/" elif mirror == 'nitrc': url = [ 'https://www.nitrc.org/frs/download.php/7730/', 'https://www.nitrc.org/frs/download.php/7729/', 'https://www.nitrc.org/frs/download.php/7731/', 'https://www.nitrc.org/frs/download.php/7726/', 'https://www.nitrc.org/frs/download.php/7728/', 'https://www.nitrc.org/frs/download.php/7727/', ] else: raise ValueError('Unknown mirror "%s". Mirror must be "origin" ' 'or "nitrc"' % str(mirror)) files = [ 'rsn20.nii.gz', 'PNAS_Smith09_rsn10.nii.gz', 'rsn70.nii.gz', 'bm20.nii.gz', 'PNAS_Smith09_bm10.nii.gz', 'bm70.nii.gz' ] if isinstance(url, _basestring): url = [url] * len(files) files = [(f, u + f, {}) for f, u in zip(files, url)] dataset_name = 'smith_2009' data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose) files_ = _fetch_files(data_dir, files, resume=resume, verbose=verbose) fdescr = _get_dataset_descr(dataset_name) keys = ['rsn20', 'rsn10', 'rsn70', 'bm20', 'bm10', 'bm70'] params = dict(zip(keys, files_)) params['description'] = fdescr return Bunch(**params)
import os import re from sklearn.datasets.base import Bunch import jieba try: import xml.etree.cElementTree as ET except ImportError: import xml.etree.ElementTree as ET # save the training and test set data into Bunch objects root = fileutils.getDataPath() + os.sep trainDataPath = root + 'news' + os.sep + "train" testDataPath = root + 'news' + os.sep + "test" segPath = root + 'news' + os.sep + "seg" trainRawPath = segPath + os.sep + "trainRaw.dat" testRawPath = segPath + os.sep + "testRaw.dat" bunch = Bunch(target_name=[], lable=[], filenames=[], contents=[]) bunch.target_name = segPath contenttitle = '' # parse all training data and save it to the bunch for file in os.listdir(trainDataPath): filePath = trainDataPath + os.sep + file if os.path.isdir(filePath): print(file, ' is dir. continue') continue with open(filePath, 'r') as f: # distinct handle name so the loop variable 'file' is not shadowed text = f.read() text = re.sub(u"[\x00-\x08\x0b-\x0c\x0e-\x1f|&]+", u"", text) root = ET.fromstring(text) for child in root: # tag names and attributes of the second-level nodes; iterate the third level of the xml document
def fetch_atlas_aal(version='SPM12', data_dir=None, url=None, resume=True, verbose=1): """Downloads and returns the AAL template for SPM 12. This atlas is the result of an automated anatomical parcellation of the spatially normalized single-subject high-resolution T1 volume provided by the Montreal Neurological Institute (MNI) (D. L. Collins et al., 1998, Trans. Med. Imag. 17, 463-468, PubMed). Parameters ---------- version: string, optional The version of the AAL atlas. Must be SPM5, SPM8 or SPM12. Default is SPM12. data_dir: string directory where data should be downloaded and unpacked. url: string url of file to download. resume: bool whether to resumed download of a partly-downloaded file. verbose: int verbosity level (0 means no message). Returns ------- data: sklearn.datasets.base.Bunch dictionary-like object, keys are: - "maps": str. path to nifti file containing regions. - "labels": list of the names of the regions Notes ----- For more information on this dataset's structure, see http://www.gin.cnrs.fr/AAL-217?lang=en Automated Anatomical Labeling of Activations in SPM Using a Macroscopic Anatomical Parcellation of the MNI MRI Single-Subject Brain. N. Tzourio-Mazoyer, B. Landeau, D. Papathanassiou, F. Crivello, O. Etard, N. Delcroix, B. Mazoyer, and M. Joliot. NeuroImage 2002. 15 :273-28 Licence: unknown. """ versions = ['SPM5', 'SPM8', 'SPM12'] if version not in versions: raise ValueError('The version of AAL requested "%s" does not exist.' 'Please choose one among %s.' % (version, str(versions))) if url is None: baseurl = "http://www.gin.cnrs.fr/AAL_files/aal_for_%s.tar.gz" url = baseurl % version opts = {'uncompress': True} dataset_name = "aal_" + version # keys and basenames would need to be handled for each spm_version # for now spm_version 12 is hardcoded. basenames = ("AAL.nii", "AAL.xml") filenames = [(os.path.join('aal', 'atlas', f), url, opts) for f in basenames] data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose) atlas_img, labels_file = _fetch_files(data_dir, filenames, resume=resume, verbose=verbose) fdescr = _get_dataset_descr(dataset_name) # We return the labels contained in the xml file as a dictionary xml_tree = xml.etree.ElementTree.parse(labels_file) root = xml_tree.getroot() labels = [] indices = [] for label in root.getiterator('label'): indices.append(label.find('index').text) labels.append(label.find('name').text) params = {'description': fdescr, 'maps': atlas_img, 'labels': labels, 'indices': indices} return Bunch(**params)
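A short usage sketch: the AAL Bunch pairs each region name with the index used inside the NIfTI volume.

aal = fetch_atlas_aal(version='SPM12')
print(aal.maps)    # path to the AAL.nii volume
for idx, name in list(zip(aal.indices, aal.labels))[:5]:
    print(idx, name)   # region index as used in the volume, and its anatomical name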
def fetch_atlas_harvard_oxford(atlas_name, data_dir=None, symmetric_split=False, resume=True, verbose=1): """Load Harvard-Oxford parcellation from FSL if installed or download it. This function looks up for Harvard Oxford atlas in the system and load it if present. If not, it downloads it and stores it in NILEARN_DATA directory. Parameters ---------- atlas_name: string Name of atlas to load. Can be: cort-maxprob-thr0-1mm, cort-maxprob-thr0-2mm, cort-maxprob-thr25-1mm, cort-maxprob-thr25-2mm, cort-maxprob-thr50-1mm, cort-maxprob-thr50-2mm, sub-maxprob-thr0-1mm, sub-maxprob-thr0-2mm, sub-maxprob-thr25-1mm, sub-maxprob-thr25-2mm, sub-maxprob-thr50-1mm, sub-maxprob-thr50-2mm, cort-prob-1mm, cort-prob-2mm, sub-prob-1mm, sub-prob-2mm data_dir: string, optional Path of data directory. It can be FSL installation directory (which is dependent on your installation). symmetric_split: bool, optional If True, split every symmetric region in left and right parts. Effectively doubles the number of regions. Default: False. Not implemented for probabilistic atlas (*-prob-* atlases) Returns ------- data: sklearn.datasets.base.Bunch dictionary-like object, keys are: - "maps": nibabel.Nifti1Image, 4D maps if a probabilistic atlas is requested and 3D labels if a maximum probabilistic atlas was requested. - "labels": string list, labels of the regions in the atlas. """ atlas_items = ("cort-maxprob-thr0-1mm", "cort-maxprob-thr0-2mm", "cort-maxprob-thr25-1mm", "cort-maxprob-thr25-2mm", "cort-maxprob-thr50-1mm", "cort-maxprob-thr50-2mm", "sub-maxprob-thr0-1mm", "sub-maxprob-thr0-2mm", "sub-maxprob-thr25-1mm", "sub-maxprob-thr25-2mm", "sub-maxprob-thr50-1mm", "sub-maxprob-thr50-2mm", "cort-prob-1mm", "cort-prob-2mm", "sub-prob-1mm", "sub-prob-2mm") if atlas_name not in atlas_items: raise ValueError("Invalid atlas name: {0}. Please chose an atlas " "among:\n{1}".format(atlas_name, '\n'.join(atlas_items))) url = 'http://www.nitrc.org/frs/download.php/7700/HarvardOxford.tgz' # For practical reasons, we mimic the FSL data directory here. dataset_name = 'fsl' # Environment variables default_paths = [] for env_var in ['FSL_DIR', 'FSLDIR']: path = os.getenv(env_var) if path is not None: default_paths.extend(path.split(':')) data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, default_paths=default_paths, verbose=verbose) opts = {'uncompress': True} root = os.path.join('data', 'atlases') atlas_file = os.path.join(root, 'HarvardOxford', 'HarvardOxford-' + atlas_name + '.nii.gz') if atlas_name[0] == 'c': label_file = 'HarvardOxford-Cortical.xml' else: label_file = 'HarvardOxford-Subcortical.xml' label_file = os.path.join(root, label_file) atlas_img, label_file = _fetch_files(data_dir, [(atlas_file, url, opts), (label_file, url, opts)], resume=resume, verbose=verbose) names = {} from xml.etree import ElementTree names[0] = 'Background' for label in ElementTree.parse(label_file).findall('.//label'): names[int(label.get('index')) + 1] = label.text names = list(names.values()) if not symmetric_split: return Bunch(maps=atlas_img, labels=names) if atlas_name in ("cort-prob-1mm", "cort-prob-2mm", "sub-prob-1mm", "sub-prob-2mm"): raise ValueError("Region splitting not supported for probabilistic " "atlases") atlas_img = check_niimg(atlas_img) atlas = atlas_img.get_data() labels = np.unique(atlas) # Build a mask of both halves of the brain middle_ind = (atlas.shape[0] - 1) // 2 # Put zeros on the median plane atlas[middle_ind, ...] = 0 # Split every zone crossing the median plane into two parts. 
left_atlas = atlas.copy() left_atlas[middle_ind:, ...] = 0 right_atlas = atlas.copy() right_atlas[:middle_ind, ...] = 0 new_label = 0 new_atlas = atlas.copy() # Assumes that the background label is zero. new_names = [names[0]] for label, name in zip(labels[1:], names[1:]): new_label += 1 left_elements = (left_atlas == label).sum() right_elements = (right_atlas == label).sum() n_elements = float(left_elements + right_elements) if (left_elements / n_elements < 0.05 or right_elements / n_elements < 0.05): new_atlas[atlas == label] = new_label new_names.append(name) continue new_atlas[right_atlas == label] = new_label new_names.append(name + ', left part') new_label += 1 new_atlas[left_atlas == label] = new_label new_names.append(name + ', right part') atlas_img = new_img_like(atlas_img, new_atlas, get_affine(atlas_img)) return Bunch(maps=atlas_img, labels=new_names)
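A usage sketch of the symmetric split, which roughly doubles most regions into left/right parts (actual label counts depend on the atlas chosen):

ho = fetch_atlas_harvard_oxford('cort-maxprob-thr25-2mm', symmetric_split=True)
print(len(ho.labels))   # roughly twice the unsplit count, plus 'Background'
print(ho.labels[:3])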
def fetch_atlas_allen_2011(data_dir=None, url=None, resume=True, verbose=1): """Download and return file names for the Allen and MIALAB ICA atlas (dated 2011). The provided images are in MNI152 space. Parameters ---------- data_dir: str, optional directory where data should be downloaded and unpacked. url: str, optional url of file to download. resume: bool whether to resumed download of a partly-downloaded file. verbose: int verbosity level (0 means no message). Returns ------- data: sklearn.datasets.base.Bunch dictionary-like object, keys are: - "maps": T-maps of all 75 unthresholded components. - "rsn28": T-maps of 28 RSNs included in E. Allen et al. - "networks": string list containing the names for the 28 RSNs. - "rsn_indices": dict[rsn_name] -> list of int, indices in the "maps" file of the 28 RSNs. - "comps": The aggregate ICA Components. - "description": details about the data release. References ---------- E. Allen, et al, "A baseline for the multivariate comparison of resting state networks," Frontiers in Systems Neuroscience, vol. 5, p. 12, 2011. Notes ----- Licence: unknown See http://mialab.mrn.org/data/index.html for more information on this dataset. """ if url is None: url = "http://mialab.mrn.org/data/hcp/" dataset_name = "allen_rsn_2011" keys = ("maps", "rsn28", "comps") opts = {} files = ["ALL_HC_unthresholded_tmaps.nii", "RSN_HC_unthresholded_tmaps.nii", "rest_hcp_agg__component_ica_.nii"] labels = [('Basal Ganglia', [21]), ('Auditory', [17]), ('Sensorimotor', [7, 23, 24, 38, 56, 29]), ('Visual', [46, 64, 67, 48, 39, 59]), ('Default-Mode', [50, 53, 25, 68]), ('Attentional', [34, 60, 52, 72, 71, 55]), ('Frontal', [42, 20, 47, 49])] networks = [[name] * len(idxs) for name, idxs in labels] filenames = [(f, url + f, opts) for f in files] data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose) sub_files = _fetch_files(data_dir, filenames, resume=resume, verbose=verbose) fdescr = _get_dataset_descr(dataset_name) params = [('description', fdescr), ('rsn_indices', labels), ('networks', networks)] params.extend(list(zip(keys, sub_files))) return Bunch(**dict(params))
Input: high dimensional data Output: k-dimensional data (save to CSV files as train.csv and test.csv) """ # ### Load Haxby dataset ###################################################### import numpy as np import nibabel from os.path import expanduser from sklearn.datasets.base import Bunch # data_dir = expanduser('~') + '/workshops/aiml/data/pymvpa-exampledata/' data_dir = expanduser('~') + '/downloads/pymvpa-exampledata/' # create sklearn's Bunch of data dataset_files = Bunch(func=data_dir + 'bold.nii.gz', session_target=data_dir + 'attributes.txt', mask=data_dir + 'mask.nii.gz', conditions_target=data_dir + 'attributes_literal.txt') # fmri_data and mask are copied to break any reference to the original object bold_img = nibabel.load(dataset_files.func) fmri_data = bold_img.get_data().astype(float) affine = bold_img.get_affine() y, session = np.loadtxt(dataset_files.session_target).astype("int").T conditions = np.recfromtxt(dataset_files.conditions_target)['f0'] mask = dataset_files.mask # fmri_data.shape is (40, 64, 64, 1452) # and mask.shape is (40, 64, 64) # ### Preprocess data # Build the mean image because we have no anatomic data
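Continuing the preprocessing comment above, a minimal sketch (under the assumption that a mean EPI image and an in-mask voxel matrix are the intended next step; names below are not from the original script):

# mean over time gives a reference image, useful since no anatomical scan is available
mean_img = fmri_data.mean(axis=-1)

# keep only in-mask voxels: X has shape (n_scans, n_voxels)
mask_array = nibabel.load(mask).get_data().astype(bool)
X = fmri_data[mask_array].T
print(mean_img.shape, X.shape)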
import sys import datetime from sklearn.datasets.base import Bunch from sklearn.externals import joblib from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfVectorizer reload(sys) sys.setdefaultencoding('utf-8') # load the training set train_path = "wordbag" + "/" + "train_set1124.data" data_set = joblib.load(train_path) # print data_set.target_name # print data_set.contents[0] ####exit # sys.exit(0) # define the word-bag data structure # word bag holding the TF-IDF results wordbag = Bunch(target_name=[],label=[],filenames=[],tdm=[],vocabulary={}) wordbag.target_name = data_set.target_name wordbag.label = data_set.label # corpus corpus = data_set.contents # load stop words stopwordpath = "extra_dict/stop_words.txt" stopword_dic = open(stopwordpath,'r') stopword_content = stopword_dic.read() # convert the stop words to a list stopwordlist = stopword_content.splitlines() stopword_dic.close() # word-bag creation start time start = datetime.datetime.now()
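The block above stops right before vectorization; a hedged sketch of the step that typically follows, filling the word bag with the TF-IDF matrix and vocabulary and timing it (the output file name is a placeholder):

vectorizer = TfidfVectorizer(stop_words=stopwordlist, sublinear_tf=True, max_df=0.5)
wordbag.tdm = vectorizer.fit_transform(corpus)
wordbag.vocabulary = vectorizer.vocabulary_
end = datetime.datetime.now()
print('TF-IDF build time: %s' % (end - start))
joblib.dump(wordbag, "wordbag/tfidf_wordbag1124.data", compress=3)  # placeholder output name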
def load_dynacomp(preprocessing_folder='pipeline_1', prefix='swr'): """ Returns paths of Dynacomp preprocessed resting-state fMRI """ BASE_DIR = set_data_base_dir('Dynacomp') SUBJ_DIR = os.path.join(BASE_DIR, 'preprocessed', preprocessing_folder) subject_paths = sorted(glob.glob(os.path.join(SUBJ_DIR, '[A-Z][A-Z]*'))) mask_path = os.path.join(BASE_DIR, 'masks', 'all_subjects.nii.gz') description = pd.read_csv(os.path.join(BASE_DIR, 'subject_infos.csv')) session1_files = [] session2_files = [] session1_motion = [] session2_motion = [] anat_files = [] group = [] subjects = [] behavior = [] date = [] for f in subject_paths: # subject id _, subject_id = os.path.split(f) # set prefix # functional data session1_files.append( glob.glob( os.path.join(f, 'fMRI', 'acquisition1', prefix + 'rest1*.nii'))[0]) session2_files.append( glob.glob( os.path.join(f, 'fMRI', 'acquisition1', prefix + 'rest2*.nii'))[0]) # anatomical data anat_files.append( glob.glob(os.path.join(f, 't1mri', 'acquisition1', 'wanat*.nii'))[0]) # motion parameters session1_motion.append( glob.glob(os.path.join(f, 'fMRI', 'acquisition1', 'rp_rest1*.txt'))[0]) session2_motion.append( glob.glob(os.path.join(f, 'fMRI', 'acquisition1', 'rp_rest2*.txt'))[0]) # subject group gr = description[description.NIP == subject_id].GROUP.values if len(gr) > 0: group.append(gr[0]) # date acquisition dt = description[description.NIP == subject_id].DATE.values if len(dt) > 0: date.append(dt[0]) # subject id subjects.append(subject_id) behavior.append(get_behavior_scores(description, subject_id)) indices = set_group_indices(group) rois = load_dynacomp_rois() return Bunch(func1=session1_files, func2=session2_files, anat=anat_files, group_indices=indices, motion1=session1_motion, motion2=session2_motion, rois=rois, group=group, subjects=subjects, date=date, behavior=behavior, mask=mask_path)
def main(): accuracies = defaultdict(lambda: []) aucs = defaultdict(lambda: []) x_axis = defaultdict(lambda: []) vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3), token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer()) vct_analizer = vct.build_tokenizer() print("Start loading ...") # data fields: data, bow, file_names, target_names, target ########## NEWS GROUPS ############### # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf categories = [['alt.atheism', 'talk.religion.misc'], ['comp.graphics', 'comp.windows.x'], ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'], ['rec.sport.baseball', 'sci.crypt']] min_size = max(100, args.fixk) if args.fixk < 0: args.fixk = None fixk_saved = "{0}{1}.p".format(args.train, args.fixk) try: print "Loading existing file... %s " % args.train fixk_file = open(fixk_saved, "rb") data = pickle.load(fixk_file) fixk_file.close() vectorizer = open("{0}vectorizer.p".format(args.train), "rb") vct = pickle.load(vectorizer) vectorizer.close() except (IOError, ValueError): print "Loading from scratch..." data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5) fixk_file = open(fixk_saved, "wb") pickle.dump(data, fixk_file) fixk_file.close() vectorizer = open("{0}vectorizer.p".format(args.train), "wb") pickle.dump(vct, vectorizer) vectorizer.close() # data = load_dataset(args.train, args.fixk, categories[0], vct, min_size) print("Data %s" % args.train) print("Data size %s" % len(data.train.data)) parameters = parse_parameters_mat(args.cost_model) print "Cost Parameters %s" % parameters cost_model = set_cost_model(args.cost_function, parameters=parameters) print "\nCost Model: %s" % cost_model.__class__.__name__ #### STUDENT CLASSIFIER clf = linear_model.LogisticRegression(penalty="l1", C=1) # clf = set_classifier(args.classifier) print "\nStudent Classifier: %s" % clf #### EXPERT CLASSIFIER exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty) exp_clf.fit(data.test.bow, data.test.target) expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold, cost_function=cost_model.cost_function) print "\nExpert: %s " % expert #### ACTIVE LEARNING SETTINGS step_size = args.step_size bootstrap_size = args.bootstrap evaluation_points = 200 print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size, evaluation_points, args.fixk, min_size)) print ("Anytime active learning experiment - use objective function to pick data") t0 = time.time() tac = [] tau = [] ### experiment starts for t in range(args.trials): trial_accu = [] trial_aucs = [] print "*" * 60 print "Trial: %s" % t if args.student in "anyunc": student = randomsampling.AnytimeLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct, subpool=250, cost_model=cost_model) elif args.student in "lambda": student = randomsampling.AnytimeLearnerDiff(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct, subpool=250, cost_model=cost_model, lambda_value=args.lambda_value) elif args.student in "anyzero": student = randomsampling.AnytimeLearnerZeroUtility(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct, subpool=250, cost_model=cost_model) else: raise ValueError("Oops! We do not know that anytime strategy. 
Try again.") print "\nStudent: %s " % student train_indices = [] neutral_text = [] # save the raw text of the queries neutral_data = [] # save the xik vectors train_x = [] train_y = [] neu_x = [] # data to train the classifier neu_y = np.array([]) pool = Bunch() pool.data = data.train.bow.tocsr() # full words, for training pool.text = data.train.data # pool.fixk = data.train.bowk.tocsr() # k words BOW for querying pool.target = data.train.target pool.predicted = [] # pool.kwords = np.array(data.train.kwords) # k words pool.remaining = set(range(pool.data.shape[0])) # indices of the pool bootstrapped = False current_cost = 0 iteration = 0 query_index = None query_size = None while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter: util = [] if not bootstrapped: ## random from each bootstrap bt = randomsampling.BootstrapFromEach(t * 10) query_index = bt.bootstrap(pool=pool, k=bootstrap_size) bootstrapped = True query = pool.data[query_index] print "Bootstrap: %s " % bt.__class__.__name__ print else: # print "pick instance" ## chose returns: index, k ## util returns: utility, k, unc query_chosen, util = student.pick_next(pool=pool, step_size=step_size) query_index = [a for a, b in query_chosen] query_size = [b for a, b in query_chosen] # query = pool.fixk[query_index] # query with k words qk = [] for q, k in query_chosen: qk.append(" ".join(vct_analizer(pool.text[q])[0:int(k)])) query = vct.transform(qk) # query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]] ground_truth = pool.target[query_index] #labels, spent = expert.label(unlabeled=query, target=ground_truth) if iteration == 0: ## bootstrap uses ground truth labels = ground_truth spent = [0] * len(ground_truth) ## bootstrap cost is ignored else: # print "ask labels" labels = expert.label_instances(query, ground_truth) spent = expert.estimate_instances(query_size) ### accumulate the cost of the query query_cost = np.array(spent).sum() current_cost += query_cost # print query_index useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None]) neutral_answers = np.array([[x, z] for x, y, z in zip(query_index, labels, query_size) if y is None]) \ if iteration != 0 else np.array([]) # print labels # print "label\tutility\tk\tunc" # print format_query(zip(labels, util)) ## add data recent acquired to train if useful_answers.shape[0] != 0: # print "get training" # train_indices.extend(query_index) train_indices.extend(useful_answers[:, 0]) # add labels to training train_x = pool.data[train_indices] # # train with all the words # update labels with the expert labels #train_y = pool.target[train_indices] train_y.extend(useful_answers[:, 1]) if neutral_answers.shape[0] != 0: # current query neutrals qlbl = [] for xik, lbl in zip(query, labels): # neutral_data.append(xik) if isinstance(neutral_data, list): neutral_data = xik else: neutral_data = vstack([neutral_data, xik], format='csr') qlbl.append(neutral_label(lbl)) ## append the labels of the current query neu_y = np.append(neu_y, qlbl) neu_x = neutral_data #end usefulanswers if train_x.shape[0] != len(train_y): raise Exception("Training data corrupted!") # remove labels from pool pool.remaining.difference_update(query_index) # retrain the model # current_model = student.train(train_x, train_y) # print "train models" current_model = student.train_all(train_x, train_y, neu_x, neu_y) # print "evaluate" # evaluate and save results y_probas = current_model.predict_proba(data.test.bow) auc = 
metrics.roc_auc_score(data.test.target, y_probas[:, 1]) pred_y = current_model.classes_[np.argmax(y_probas, axis=1)] accu = metrics.accuracy_score(data.test.target, pred_y) print ("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}\tneu:{6}\t{7}".format( len(train_indices), accu, auc, query_cost, current_cost, format_spent(spent), len(neutral_answers), neu_y.shape[0])) ## the results should be based on the cost of the labeling if iteration > 0: # bootstrap iteration student.budget -= query_cost ## Bootstrap doesn't count x_axis_range = current_cost x_axis[x_axis_range].append(current_cost) ## save results accuracies[x_axis_range].append(accu) aucs[x_axis_range].append(auc) # partial trial results trial_accu.append([x_axis_range, accu]) trial_aucs.append([x_axis_range, auc]) iteration += 1 # end of budget loop tac.append(trial_accu) tau.append(trial_aucs) #end trial loop if args.cost_function not in "uniform": accuracies = extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size) aucs = extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size) print("Elapsed time %.3f" % (time.time() - t0)) print_extrapolated_results(accuracies, aucs)
def load_files(container_path, description=None, categories=None, shuffle=True, encoding='utf-8', random_state=0, key_path_index=-2): """Load text files with categories as subfolder names. Individual samples are assumed to be files stored a two levels folder structure such as the following: container_folder/ category_1_folder/ file_1.txt line 1 line 2 ... line n category_2_folder/ file_2.txt line 1 line 2 ... line n ... The folder names are used as supervised signal label names. The individual file names are not important. Parameters ---------- container_path : string or unicode Path to the main folder holding one subfolder per category description: string or unicode, optional (default=None) A paragraph describing the characteristic of the dataset: its source, reference, etc. categories : A collection of strings or None, optional (default=None) If None (default), load all the categories. If not None, list of category names to load (other categories ignored). shuffle : bool, optional (default=True) Whether or not to shuffle the data: might be important for models that make the assumption that the samples are independent and identically distributed (i.i.d.), such as stochastic gradient descent. random_state : int, RandomState instance or None, optional (default=0) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. key_name_index : int, category's index containing text file. Returns ------- data : Bunch Dictionary-like object, the interesting attributes are: either data, the raw text data to learn, or 'filenames', the files holding it, 'target', the classification labels (integer index), 'target_names', the meaning of the labels, and 'DESCR', the full description of the dataset. """ target = list() target_names = list() filenames = list() filelines2data = dict() folders = [f for f in sorted(listdir(container_path)) if isdir(join(container_path, f))] if categories is not None: folders = [f for f in folders if f in categories] for label, folder in enumerate(folders): target_names.append(folder) folder_path = join(container_path, folder) documents = [join(folder_path, d) for d in sorted(listdir(folder_path))] for training_doc in documents: if key_path_index: category = training_doc.split(os.sep)[key_path_index] else: category = training_doc with codecs.open(training_doc, encoding=encoding) as td: for line_index, data in enumerate(td): key4file = category + str(line_index) filelines2data[key4file] = data target.append(label) filenames.append(key4file) # convert to array for fancy indexing filenames = np.array(filenames) target = np.array(target) if shuffle: random_state = check_random_state(random_state) indices = np.arange(filenames.shape[0]) random_state.shuffle(indices) filenames = filenames[indices] target = target[indices] data = list() for filename in filenames: data.append(filelines2data.get(filename)) return Bunch(data=data, filenames=filenames, target_names=target_names, target=target, DESCR=description)
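A usage sketch for the line-level loader above, assuming a 'corpus' directory with one subfolder per category (hypothetical path):

dataset = load_files('corpus', encoding='utf-8')
print(dataset.target_names)                       # sub-folder names become class names
print(len(dataset.data), len(dataset.target))     # one sample per line of each file
for text, label in zip(dataset.data[:3], dataset.target[:3]):
    print(dataset.target_names[label], text.strip()[:40])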
def main(): vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=False, ngram_range=(1, 1), token_pattern='\\b\\w+\\b') #, tokenizer=StemTokenizer()) vct_analizer = vct.build_tokenizer() print("Start loading ...") # data fields: data, bow, file_names, target_names, target ########## NEWS GROUPS ############### # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf categories = [['alt.atheism', 'talk.religion.misc'], ['comp.graphics', 'comp.windows.x'], ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'], ['rec.sport.baseball', 'sci.crypt']] min_size = 10 # max(10, args.fixk) args.fixk = None data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True) print("Data %s" % args.train) print("Data size %s" % len(data.train.data)) ### SENTENCE TRANSFORMATION sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') ## delete <br> to "." to recognize as end of sentence data.train.data = experiment_utils.clean_html(data.train.data) data.test.data = experiment_utils.clean_html(data.test.data) print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0])) ## Get the features of the sentence dataset ## create splits of data: pool, test, oracle, sentences expert_data = Bunch() train_test_data = Bunch() expert_data.sentence, train_test_data.pool = split_data(data.train) expert_data.oracle, train_test_data.test = split_data(data.test) data.train.data = train_test_data.pool.train.data data.train.target = train_test_data.pool.train.target data.test.data = train_test_data.test.train.data data.test.target = train_test_data.test.train.target ## convert document to matrix data.train.bow = vct.fit_transform(data.train.data) data.test.bow = vct.transform(data.test.data) #### EXPERT CLASSIFIER: ORACLE print("Training Oracle expert") labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector) expert_data.oracle.train.data = sent_train expert_data.oracle.train.target = np.array(labels) expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data) exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty) exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target) #### EXPERT CLASSIFIER: SENTENCES print("Training sentence expert") labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector) expert_data.sentence.train.data = sent_train expert_data.sentence.train.target = np.array(labels) expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data) sent_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty) sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target) #### TESTING THE CLASSIFERS test_target, test_data = split_data_sentences(data.test,sent_detector) test_data_bow = vct.transform(test_data) #pred_sent = sent_clf.predict(test_data_bow) pred_ora = exp_clf.predict(test_data_bow) y_probas = sent_clf.predict_proba(test_data_bow) pred_sent = sent_clf.classes_[np.argmax(y_probas, axis=1)] ## just based on one class probability # order = np.argsort(y_probas[:,0]) order = np.argsort(y_probas.max(axis=1)) print "ORACLE\tSENTENCE\tMAX-SENT" # for i in order[:500]: # print pred_ora[i],pred_sent[i], y_probas[i,0], test_data[i] for i in order[-500:]: print pred_ora[i],pred_sent[i], y_probas[i,0], test_data[i] print "Accuracy of Sentences Classifier", metrics.accuracy_score(test_target, pred_sent) print 
"Class distribution: %s" % pred_sent.sum() print "Size of data: %s" % pred_sent.shape[0] sizes = [50, 100, 500, 1000, 2000, 3000, 4000, 20000] clf = linear_model.LogisticRegression(penalty='l1', C=1) bootstrap = rand.permutation(len(test_data)) x = [] y = [] for s in sizes: indices = bootstrap[:s] train_x = expert_data.sentence.train.bow[indices[:s]] train_y = expert_data.sentence.train.target[indices[:s]] clf.fit(train_x, train_y) predictions = clf.predict(test_data_bow) scores = metrics.accuracy_score(test_target,predictions) ## print clf.__class__.__name__ print "Accuracy {0}: {1}".format(s, scores) y.append(scores) plt.clf() plt.title("Accuracy") plt.xlabel("Labels") plt.ylabel("Accuracy") plt.legend() plt.plot(sizes, y, '--bo', label="sent") plt.show()
def fetch(self, n_subjects=1, fetch_stimuli=False, url=None, resume=True, force=False, verbose=1): if self.simple: # URL of the dataset. It is optional because a test uses it to test dataset # downloading if url is None: url = 'http://www.pymvpa.org/files/pymvpa_exampledata.tar.bz2' opts = {'uncompress': True} files = [ (os.path.join('pymvpa-exampledata', 'attributes.txt'), url, opts), (os.path.join('pymvpa-exampledata', 'bold.nii.gz'), url, opts), (os.path.join('pymvpa-exampledata', 'mask.nii.gz'), url, opts), (os.path.join('pymvpa-exampledata', 'attributes_literal.txt'), url, opts), ] files = self.fetcher.fetch(files, resume=resume, force=force, verbose=verbose) # return the data return Bunch(func=files[1], session_target=files[0], mask=files[2], conditions_target=files[3]) else: if n_subjects > 6: warnings.warn('Warning: there are only 6 subjects') n_subjects = 6 # Dataset files if url is None: url = 'http://data.pymvpa.org/datasets/haxby2001/' md5sums = self.fetcher.fetch([('MD5SUMS', url + 'MD5SUMS', {})], resume=resume, force=force, verbose=verbose)[0] md5sums = readmd5_sum_file(md5sums) # definition of dataset files sub_files = ['bold.nii.gz', 'labels.txt', 'mask4_vt.nii.gz', 'mask8b_face_vt.nii.gz', 'mask8b_house_vt.nii.gz', 'mask8_face_vt.nii.gz', 'mask8_house_vt.nii.gz', 'anat.nii.gz'] n_files = len(sub_files) files = [ (os.path.join('subj%d' % i, sub_file), url + 'subj%d-2010.01.14.tar.gz' % i, {'uncompress': True, 'md5sum': md5sums.get('subj%d-2010.01.14.tar.gz' % i, None)}) for i in range(1, n_subjects + 1) for sub_file in sub_files if not (sub_file == 'anat.nii.gz' and i == 6) # no anat for sub. 6 ] files = self.fetcher.fetch(files, resume=resume, force=force, verbose=verbose) if n_subjects == 6: files.append(None) # None value because subject 6 has no anat kwargs = {} if fetch_stimuli: stimuli_files = [(os.path.join('stimuli', 'README'), url + 'stimuli-2010.01.14.tar.gz', {'uncompress': True})] readme = self.fetcher.fetch(stimuli_files, resume=resume, force=force, verbose=verbose)[0] kwargs['stimuli'] = _tree(os.path.dirname(readme), pattern='*.jpg', dictionary=True) # return the data return Bunch( anat=files[7::n_files], func=files[0::n_files], session_target=files[1::n_files], mask_vt=files[2::n_files], mask_face=files[3::n_files], mask_house=files[4::n_files], mask_face_little=files[5::n_files], mask_house_little=files[6::n_files], **kwargs)
import os import sys from sklearn.datasets.base import Bunch from sklearn.externals import joblib import jieba from sklearn.feature_extraction.text import HashingVectorizer reload(sys) # sys.setdefaultencoding('utf-8') token_path = "token"+"/" # tokenized corpus path wordbag_path = "wordbag"+"/" # output directory for the Bunch data_set = Bunch(target_name=[],label=[],filenames=[],contents=[]) dir_list = os.listdir(token_path) data_set.target_name = dir_list for file in dir_list: file_name = token_path+file file_read = open(file_name,"r") for line in file_read: data_set.label.append(data_set.target_name.index(file)) data_set.contents.append(line.strip()) file_read.close() # persist to disk joblib.dump(data_set, wordbag_path+"train_set1124.data", compress=3) # verification
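Completing the verification comment above with a read-back check of the persisted Bunch (a sketch):

check = joblib.load(wordbag_path + "train_set1124.data")
print(check.target_name)
print(len(check.label), len(check.contents))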
def load_info(dataset_path, return_X_y=False): """Load and return the info dataset (classification). The info dataset is a small, easy multi-class classification dataset. ================= ============== Classes 7 Samples per class 20 Samples total 140 Dimensionality 16 Features '铝', '沪铝', '伦铝', '氧化铝', '沪铜', '伦铜', '铜', '铅', '伦铅', '沪铅', '铅锌', '锌精矿', '锌', '铁', '钢铁', 'PVC' ================= ============== Read more in the :ref:`User Guide <datasets>`. Parameters ---------- dataset_path : string Path to the CSV file holding the dataset; its first row stores the number of samples, the number of features and the target names. return_X_y : boolean, default=False. If True, returns ``(data, target)`` instead of a Bunch object. See below for more information about the `data` and `target` object. .. versionadded:: 0.18 Returns ------- data : Bunch Dictionary-like object, the interesting attributes are: 'data', the data to learn, 'target', the classification labels, 'target_names', the meaning of the labels, 'feature_names', the meaning of the features, and 'DESCR', the full description of the dataset. (data, target) : tuple if ``return_X_y`` is True .. versionadded:: 0.18 """ with open(dataset_path) as csv_file: data_file = csv.reader(csv_file) temp = next(data_file) n_samples = int(temp[0]) n_features = int(temp[1]) target_names = np.array(temp[2:]) data = np.empty((n_samples, n_features)) target = np.empty((n_samples,), dtype=np.int) for i, ir in enumerate(data_file): data[i] = np.asarray(ir[:-1], dtype=np.float64) target[i] = np.asarray(ir[-1], dtype=np.int) if return_X_y: return data, target # feature_names below are placeholders copied from load_iris; replace with the real 16 feature names return Bunch(data=data, target=target, target_names=target_names, DESCR="", feature_names=['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'])
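A small usage sketch for load_info; the CSV path is a placeholder:

import numpy as np

info = load_info('datasets/info.csv')
print(info.target_names)
print(info.data.shape, info.target.shape)
print(np.bincount(info.target))   # samples per class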
X = read_file(fpath) data.append(X) return np.array(data, dtype=np.float32) print "loading data from disk..." species2id = lambda s: species_map.get(s, -1) train = np.loadtxt('samples/alltrain.csv', converters={0: species2id}, skiprows=1, delimiter=",") test = np.loadtxt('samples/alltest.csv', converters={0: species2id}, skiprows=1, delimiter=",") # Load env variable grids coverage = load_dir("coverages") # Per species data bv = Bunch(name=" ".join(species[0].split("_")[:2]), train=train[train[:, 0] == 0, 1:], test=test[test[:, 0] == 0, 1:]) mm = Bunch(name=" ".join(species[1].split("_")[:2]), train=train[train[:, 0] == 1, 1:], test=test[test[:, 0] == 1, 1:]) def get_coverages(points, coverages, xx, yy): """Get coverages (aka features) for each point. Returns ------- array : shape = [points.shape[0], coverages.shape[0]] The feature vectors (coverages) for each data point. """ rows = [] cols = []
def fetch_species_distributions(data_home=None, download_if_missing=True): """Loader for species distribution dataset from Phillips et. al. (2006) Parameters ---------- data_home : optional, default: None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/scikit_learn_data' subfolders. download_if_missing: optional, True by default If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. Returns -------- The data is returned as a Bunch object with the following attributes: coverages : array, shape = [14, 1592, 1212] These represent the 14 features measured at each point of the map grid. The latitude/longitude values for the grid are discussed below. Missing data is represented by the value -9999. train : record array, shape = (1623,) The training points for the data. Each point has three fields: - train['species'] is the species name - train['dd long'] is the longitude, in degrees - train['dd lat'] is the latitude, in degrees test : record array, shape = (619,) The test points for the data. Same format as the training data. Nx, Ny : integers The number of longitudes (x) and latitudes (y) in the grid x_left_lower_corner, y_left_lower_corner : floats The (x,y) position of the lower-left corner, in degrees grid_size : float The spacing between points of the grid, in degrees Notes ------ This dataset represents the geographic distribution of species. The dataset is provided by Phillips et. al. (2006). The two species are: - `"Bradypus variegatus" <http://www.iucnredlist.org/apps/redlist/details/3038/0>`_ , the Brown-throated Sloth. - `"Microryzomys minutus" <http://www.iucnredlist.org/apps/redlist/details/13408/0>`_ , also known as the Forest Small Rice Rat, a rodent that lives in Peru, Colombia, Ecuador, Peru, and Venezuela. References ---------- * `"Maximum entropy modeling of species geographic distributions" <http://www.cs.princeton.edu/~schapire/papers/ecolmod.pdf>`_ S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling, 190:231-259, 2006. Notes ----- * See examples/applications/plot_species_distribution_modeling.py for an example of using this dataset with scikit-learn """ data_home = get_data_home(data_home) if not exists(data_home): makedirs(data_home) # Define parameters for the data files. These should not be changed # unless the data model changes. They will be saved in the npz file # with the downloaded data. extra_params = dict(x_left_lower_corner=-94.8, Nx=1212, y_left_lower_corner=-56.05, Ny=1592, grid_size=0.05) dtype = np.int16 if not exists(join(data_home, DATA_ARCHIVE_NAME)): print('Downloading species data from %s to %s' % (SAMPLES_URL, data_home)) X = np.load(BytesIO(urlopen(SAMPLES_URL).read())) for f in X.files: fhandle = BytesIO(X[f]) if 'train' in f: train = _load_csv(fhandle) if 'test' in f: test = _load_csv(fhandle) print('Downloading coverage data from %s to %s' % (COVERAGES_URL, data_home)) X = np.load(BytesIO(urlopen(COVERAGES_URL).read())) coverages = [] for f in X.files: fhandle = BytesIO(X[f]) print(' - converting', f) coverages.append(_load_coverage(fhandle)) coverages = np.asarray(coverages, dtype=dtype) bunch = Bunch(coverages=coverages, test=test, train=train, **extra_params) joblib.dump(bunch, join(data_home, DATA_ARCHIVE_NAME), compress=9) else: bunch = joblib.load(join(data_home, DATA_ARCHIVE_NAME)) return bunch
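A usage sketch reconstructing the map grid from the Bunch attributes, as is typically done before plotting the coverages; the extent values come directly from the fields documented above:

import numpy as np

data = fetch_species_distributions()
# rebuild the coordinate grid from the stored corner and spacing
xgrid = data.x_left_lower_corner + data.grid_size * np.arange(data.Nx)
ygrid = data.y_left_lower_corner + data.grid_size * np.arange(data.Ny)
print(data.coverages.shape)   # (14, Ny, Nx) environmental feature maps
print(xgrid.min(), xgrid.max(), ygrid.min(), ygrid.max())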
def fetch(self, contrasts=None, n_subjects=None, get_tmaps=False, get_masks=False, get_anats=False, url=None, resume=True, force=False, verbose=1): if n_subjects is None: n_subjects = 94 # 94 subjects available if (n_subjects > 94) or (n_subjects < 1): warnings.warn("Wrong value for \'n_subjects\' (%d). The maximum " "value will be used instead (\'n_subjects=94\')" % n_subjects) n_subjects = 94 # 94 subjects available if contrasts is None: contrasts = self.contrast_name_wrapper.values() elif isinstance(contrasts, _basestring): contrasts = [contrasts] allowed_contrasts = list(self.contrast_name_wrapper.values()) # convert contrast names contrasts_wrapped = [] # get a unique ID for each contrast. It is used to give a unique name to # each download file and avoid name collisions. contrasts_indices = [] for contrast in contrasts: if contrast in allowed_contrasts: contrasts_wrapped.append(contrast) contrasts_indices.append(allowed_contrasts.index(contrast)) elif contrast in self.contrast_name_wrapper: name = self.contrast_name_wrapper[contrast] contrasts_wrapped.append(name) contrasts_indices.append(allowed_contrasts.index(name)) else: raise ValueError("Contrast \'%s\' is not available" % contrast) # It is better to perform several small requests than a big one because: # - Brainomics server has no cache (can lead to timeout while the archive # is generated on the remote server) # - Local (cached) version of the files can be checked for each contrast opts = {'uncompress': True} subject_ids = ["S%02d" % s for s in range(1, n_subjects + 1)] subject_id_max = subject_ids[-1] data_types = ["c map"] if get_tmaps: data_types.append("t map") rql_types = str.join(", ", ["\"%s\"" % x for x in data_types]) root_url = "http://brainomics.cea.fr/localizer/" base_query = ("Any X,XT,XL,XI,XF,XD WHERE X is Scan, X type XT, " "X concerns S, " "X label XL, X identifier XI, " "X format XF, X description XD, " 'S identifier <= "%s", ' % (subject_id_max, ) + 'X type IN(%(types)s), X label "%(label)s"') urls = [ "%sbrainomics_data_%d.zip?rql=%s&vid=data-zip" % (root_url, i, _urllib.parse.quote(base_query % { "types": rql_types, "label": c }, safe=',()')) for c, i in zip(contrasts_wrapped, contrasts_indices) ] filenames = [] for subject_id in subject_ids: for data_type in data_types: for contrast_id, contrast in enumerate(contrasts_wrapped): name_aux = str.replace( str.join('_', [data_type, contrast]), ' ', '_') file_path = os.path.join("brainomics_data", subject_id, "%s.nii.gz" % name_aux) file_tarball_url = urls[contrast_id] filenames.append((file_path, file_tarball_url, opts)) # Fetch masks if asked by user if get_masks: urls.append("%sbrainomics_data_masks.zip?rql=%s&vid=data-zip" % (root_url, _urllib.parse.quote(base_query % { "types": '"boolean mask"', "label": "mask" }, safe=',()'))) for subject_id in subject_ids: file_path = os.path.join("brainomics_data", subject_id, "boolean_mask_mask.nii.gz") file_tarball_url = urls[-1] filenames.append((file_path, file_tarball_url, opts)) # Fetch anats if asked by user if get_anats: urls.append("%sbrainomics_data_anats.zip?rql=%s&vid=data-zip" % (root_url, _urllib.parse.quote(base_query % { "types": '"normalized T1"', "label": "anatomy" }, safe=',()'))) for subject_id in subject_ids: file_path = os.path.join("brainomics_data", subject_id, "normalized_T1_anat_defaced.nii.gz") file_tarball_url = urls[-1] filenames.append((file_path, file_tarball_url, opts)) # Fetch subject characteristics (separated in two files) if url is None: url_csv = (
"%sdataset/cubicwebexport.csv?rql=%s&vid=csvexport" % (root_url, _urllib.parse.quote("Any X WHERE X is Subject"))) url_csv2 = ("%sdataset/cubicwebexport2.csv?rql=%s&vid=csvexport" % (root_url, _urllib.parse.quote( "Any X,XI,XD WHERE X is QuestionnaireRun, " "X identifier XI, X datetime " "XD", safe=','))) else: url_csv = "%s/cubicwebexport.csv" % url url_csv2 = "%s/cubicwebexport2.csv" % url filenames += [("cubicwebexport.csv", url_csv, {}), ("cubicwebexport2.csv", url_csv2, {})] # Actual data fetching files = self.fetcher.fetch(filenames, resume=resume, force=force, verbose=verbose) anats = None masks = None tmaps = None # combine data from both covariates files into one single recarray from numpy.lib.recfunctions import join_by ext_vars_file2 = files[-1] csv_data2 = np.recfromcsv(ext_vars_file2, delimiter=';') files = files[:-1] ext_vars_file = files[-1] csv_data = np.recfromcsv(ext_vars_file, delimiter=';') files = files[:-1] # join_by sorts the output along the key csv_data = join_by('subject_id', csv_data, csv_data2, usemask=False, asrecarray=True)[:n_subjects] if get_anats: anats = files[-n_subjects:] files = files[:-n_subjects] if get_masks: masks = files[-n_subjects:] files = files[:-n_subjects] if get_tmaps: tmaps = files[1::2] files = files[::2] return Bunch(cmaps=files, tmaps=tmaps, masks=masks, anats=anats, ext_vars=csv_data)
'acc': 1, 'good': 2, 'vgood': 3, } X, y = data_utils.dispose_data(url, str2int) # split the dataset into training and test sets train_data, test_data, train_target, test_target = train_test_split( X, y, test_size=0.3, random_state=0) # build a default Bunch-style data structure for convenient access dataArray = np.empty((len(train_target), 6)) for i in range(len(train_target)): dataArray[i] = np.asarray(train_data[i], dtype=np.float) targetArray = np.asarray(train_target, dtype=np.int) target_names = np.asarray(['unacc', 'acc', 'good', 'vgood']) fdescr = "Training data for the car evaluation dataset" carSet = Bunch(data=dataArray, target=targetArray, target_names=target_names, DESCR=fdescr, feature_names=[ 'buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety' ]) ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=1000, learning_rate=1) ada_clf.fit(carSet.data, carSet.target) print ada_clf.score(test_data, test_target)
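A short evaluation sketch that could follow the AdaBoost snippet above; it reuses carSet, ada_clf, test_data and test_target from that snippet and only adds standard scikit-learn metrics (it assumes all four classes occur in the test split).

from sklearn.metrics import classification_report, confusion_matrix

pred = ada_clf.predict(test_data)
print(confusion_matrix(test_target, pred))
# target_names from the Bunch keep the report readable
print(classification_report(test_target, pred, target_names=carSet.target_names))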
file_obj.close() return bunch # write a bunch object to disk def writebunchobj(path,bunchobj): file_obj = open(path, "wb") pickle.dump(bunchobj,file_obj) file_obj.close() # 1. read the stop-word list stopword_path = "train_word_bag/hlt_stop_words.txt" stpwrdlst = readfile(stopword_path).splitlines() # 2. load the bunch object holding the tokenized documents path = "train_word_bag/train_set.dat" # path of the saved word-vector space bunch = readbunchobj(path) # 3. build the tf-idf vector-space bunch tfidfspace = Bunch(target_name=bunch.target_name,label=bunch.label,filenames=bunch.filenames,tdm=[],vocabulary={}) # 4. initialize the vector-space model with TfidfVectorizer vectorizer = TfidfVectorizer(stop_words=stpwrdlst,sublinear_tf = True,max_df = 0.5) transformer=TfidfTransformer() # this class computes the tf-idf weight of every term # convert the texts to a term-frequency matrix and keep the vocabulary separately tfidfspace.tdm = vectorizer.fit_transform(bunch.contents) tfidfspace.vocabulary = vectorizer.vocabulary_ # persist the word bag space_path = "train_word_bag/tfdifspace.dat" # path where the tf-idf space is saved writebunchobj(space_path,tfidfspace) print "tf-idf vector space created successfully!"
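A quick sanity-check sketch for the persisted space: it reloads tfdifspace.dat with the readbunchobj helper from above and prints the highest-weighted terms of the first document; the inverse-vocabulary lookup is only an illustration, not part of the original pipeline.

tfidfspace = readbunchobj("train_word_bag/tfdifspace.dat")
print("documents x terms: %s" % str(tfidfspace.tdm.shape))
print("vocabulary size: %d" % len(tfidfspace.vocabulary))
# rows of the sparse tdm line up with tfidfspace.label / tfidfspace.filenames
inv_vocab = dict((idx, term) for term, idx in tfidfspace.vocabulary.items())
row = tfidfspace.tdm[0].tocoo()
for weight, col in sorted(zip(row.data, row.col), reverse=True)[:10]:
    print("%s\t%.3f" % (inv_vocab[col], weight))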
def main(): accuracies = defaultdict(lambda: []) aucs = defaultdict(lambda: []) x_axis = defaultdict(lambda: []) vct = CountVectorizer(encoding='latin-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3), token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer()) vct_analizer = vct.build_tokenizer() print("Start loading ...") # data fields: data, bow, file_names, target_names, target ########## NEWS GROUPS ############### # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf categories = [['alt.atheism', 'talk.religion.misc'], ['comp.graphics', 'comp.windows.x'], ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'], ['rec.sport.baseball', 'sci.crypt']] min_size = max(10, args.fixk) if args.fixk < 0: args.fixk = None # data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5) # fixk_saved = "{0}{1}.p".format(args.train, args.fixk) data, vct = load_from_file(args.train, categories, args.fixk, min_size, vct) print("Data %s" % args.train) print("Data size %s" % len(data.train.data)) #### COST MODEL parameters = parse_parameters_mat(args.cost_model) print "Cost Parameters %s" % parameters cost_model = set_cost_model(args.cost_function, parameters=parameters) print "\nCost Model: %s" % cost_model.__class__.__name__ #### ACCURACY MODEL accu_parameters = parse_parameters_mat(args.accu_model) #### CLASSIFIER clf = set_classifier(args.classifier) print "\nClassifier: %s" % clf #### EXPERT MODEL if "fixed" in args.expert: expert = baseexpert.FixedAccuracyExpert(accuracy_value=accu_parameters[0], cost_function=cost_model.cost_function) #average value of accuracy of the experts elif "true" in args.expert: expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function) elif "linear" in args.expert: #expert = baseexpert.LRFunctionExpert(model=[0.0019, 0.6363],cost_function=cost_model.cost_function) raise Exception("We do not know linear yet!!") elif "log" in args.expert: expert = baseexpert.LogFunctionExpert(model=accu_parameters, cost_function=cost_model.cost_function) elif "direct" in args.expert: expert = baseexpert.LookUpExpert(accuracy_value=accu_parameters, cost_function=cost_model.cost_function) elif "neutral" in args.expert: exp_clf = LogisticRegression(penalty='l1', C=1) exp_clf.fit(data.test.bow, data.test.target) expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold, cost_function=cost_model.cost_function) else: raise Exception("We need a defined cost function options [fixed|log|linear]") exp_clf = LogisticRegression(penalty='l1', C=args.expert_penalty) exp_clf.fit(data.test.bow, data.test.target) print "\nExpert: %s " % expert coef = exp_clf.coef_[0] # print_features(coef, vct.get_feature_names()) #### ACTIVE LEARNING SETTINGS step_size = args.step_size bootstrap_size = args.bootstrap evaluation_points = 200 print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size, evaluation_points, args.fixk, 50)) t0 = time.time() tac = [] tau = [] ### experiment starts for t in range(args.trials): trial_accu = [] trial_aucs = [] print "*" * 60 print "Trial: %s" % t if args.student in "unc": student = randomsampling.UncertaintyLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t, subpool=250) else: student = randomsampling.RandomSamplingLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t) print "\nStudent: %s " % student train_indices = [] train_x = [] train_y = [] pool = Bunch() pool.data = 
data.train.bow.tocsr() # full words, for training if args.fixk is None: pool.fixk = data.train.bow.tocsr() else: pool.fixk = data.train.bowk.tocsr() # k words BOW for querying pool.target = data.train.target pool.predicted = [] # pool.kwords = np.array(data.train.kwords) # k words pool.remaining = set(range(pool.data.shape[0])) # indices of the pool bootstrapped = False current_cost = 0 iteration = 0 while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter: if not bootstrapped: ## random bootstrap #bt = randomsampling.BootstrapRandom(random_state=t * 10) ## random from each bootstrap bt = randomsampling.BootstrapFromEach(t * 10) query_index = bt.bootstrap(pool=pool, k=bootstrap_size) bootstrapped = True print "Bootstrap: %s " % bt.__class__.__name__ print else: query_index = student.pick_next(pool=pool, k=step_size) # query = pool.fixk[query_index] # query with k words query = pool.data[query_index] # print query_index # query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]] query_size = [1]*query.shape[0] ground_truth = pool.target[query_index] if iteration == 0: ## bootstrap uses ground truth labels = ground_truth spent = [0] * len(ground_truth) else: labels = expert.label_instances(query, ground_truth) spent = expert.estimate_instances(query_size) query_cost = np.array(spent).sum() current_cost += query_cost # train_indices.extend(query_index) # remove labels from pool pool.remaining.difference_update(query_index) # add labels to training # train_x = pool.data[train_indices] ## train with all the words # update labels with the expert labels useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None]) if useful_answers.shape[0] != 0: train_indices.extend(useful_answers[:, 0]) # add labels to training train_x = pool.data[train_indices] ## train with all the words # update labels with the expert labels train_y.extend(useful_answers[:, 1]) if train_x.shape[0] != len(train_y): raise Exception("Training data corrupted!") # retrain the model current_model = student.train(train_x, train_y) # evaluate and save results y_probas = current_model.predict_proba(data.test.bow) auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1]) pred_y = current_model.classes_[np.argmax(y_probas, axis=1)] accu = metrics.accuracy_score(data.test.target, pred_y) print ( "TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(len(train_indices), accu, auc, query_cost, current_cost, format_spent(spent))) ## the results should be based on the cost of the labeling if iteration > 0: # bootstrap iteration student.budget -= query_cost ## Bootstrap doesn't count #x_axis_range = int(current_cost / eval_range) x_axis_range = current_cost x_axis[x_axis_range].append(current_cost) ## save results accuracies[x_axis_range].append(accu) aucs[x_axis_range].append(auc) trial_accu.append([x_axis_range, accu]) trial_aucs.append([x_axis_range, auc]) iteration += 1 # end of budget loop tac.append(trial_accu) tau.append(trial_aucs) #end trial loop if args.cost_function not in "uniform": accuracies = extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size) aucs = extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size) print("Elapsed time %.3f" % (time.time() - t0)) print_extrapolated_results(accuracies, aucs)
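A minimal, self-contained sketch of the pool-as-Bunch pattern used in the loop above: remaining holds the indices that are still unlabeled, a sampler draws a batch from it, and the caller removes those indices before retraining; the pick_next shown here is only a stand-in for the learners used above, not their actual strategy.

import numpy as np
from sklearn.datasets.base import Bunch  # Bunch import path for the scikit-learn versions used here

rng = np.random.RandomState(0)
pool = Bunch(data=np.arange(20).reshape(10, 2),
             target=np.arange(10) % 2,
             remaining=set(range(10)))

def pick_next(pool, k):
    # draw k candidate indices from the unlabeled part of the pool
    return rng.choice(sorted(pool.remaining), size=k, replace=False)

query_index = pick_next(pool, 3)
pool.remaining.difference_update(query_index)  # mirror the bookkeeping in the loop above
print(sorted(pool.remaining))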