def process(raw_data_file, trainbunch_file, testbunch_file, split_value):
    '''
    Driver for the whole preprocessing pipeline.
    Inputs: the raw data file, the bunch files in which to store the processed
    training and test sets, and the train/test split ratio.
    Returns nothing; it finishes by dumping the processed bunches to file.
    '''
    # databunch holds the processed data from raw_data_file; it is later shuffled
    # and split into a training set and a test set.
    # contents stores the processed facts, accu the accusation labels (Chinese text),
    # and new_accu (added below) the binarized accusation labels.
    databunch = Bunch(contents=[], accu=[])

    print("Loading data >>>>")
    st = time.time()
    with open(raw_data_file, "r", encoding="utf-8") as rawdata_file:
        file_length = len(rawdata_file.readlines())
    cnt = 0  # tracks loading progress
    with open(raw_data_file, "r", encoding="utf-8") as rawdata_file:
        for line_json in rawdata_file:  # each line holds all the information we need
            line = json.loads(line_json)  # line is a dict
            fact = line["fact"]
            accusation = line["meta"]["accusation"]
            # new_fact is the processed criminal fact: only Chinese characters are
            # kept, then jieba segmentation produces the final token sequence.
            new_fact = process_fact(fact)
            databunch.contents.append(new_fact)
            databunch.accu.append(accusation)
            cnt += 1
            if cnt % 1000 == 0:
                print("Loading progress: {:.3f}%".format(cnt / file_length * 100))
    et = time.time()
    print("Finished loading data!!!! Elapsed: {:.3f}s".format(et - st))

    print("Processing data >>>>")
    st = time.time()
    # new_fact and accu are now in the bunch; shuffle databunch and split it
    # into trainbunch and testbunch.
    random_seed = np.random.randint(0, 100)
    # Re-seed before each shuffle so every field of databunch is permuted in the same order.
    for key in databunch.keys():
        np.random.seed(random_seed)
        np.random.shuffle(databunch[key])
    trainbunch, testbunch = split_train_test(databunch, split_value)

    # Binarize the accusation labels.
    multilabelbinarizer = MultiLabelBinarizer(classes=accu_list)
    trainbunch.new_accu = multilabelbinarizer.fit_transform(trainbunch.accu)
    testbunch.new_accu = multilabelbinarizer.transform(testbunch.accu)

    # Dump trainbunch and testbunch to trainbunch_file and testbunch_file.
    joblib.dump(trainbunch, trainbunch_file)
    joblib.dump(testbunch, testbunch_file)
    et = time.time()
    print("Finished processing data!!!! Elapsed: {:.3f}s".format(et - st))
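# Hedged usage sketch (not part of the original script): one way process() might be
# invoked. It assumes json, time, numpy (np), joblib, Bunch, MultiLabelBinarizer,
# process_fact, split_train_test, and accu_list are defined elsewhere in this module.
# The file paths below are hypothetical, and the meaning of split_value (train vs.
# test fraction) depends on split_train_test, which is not shown here.
if __name__ == "__main__":
    process(
        raw_data_file="data/data_train.json",  # hypothetical CAIL-style JSON-lines input
        trainbunch_file="data/train.bunch",    # hypothetical output path
        testbunch_file="data/test.bunch",      # hypothetical output path
        split_value=0.8,                       # assumed split ratio passed to split_train_test
    )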
# print total faces and labels
print("Total faces: ", len(faces))
print("Total labels: ", len(labels))

# eval_faces = [np.reshape(a, (64, 64)) for a in faces]

rostros = Bunch(DESCR="dataset description",
                keys=['target', 'DESCR', 'data', 'images'],
                images=faces,
                data=len(faces),
                target=np.asarray(faces))

# Print dataset properties: rostros.data holds the number of faces and
# rostros.target the array of images in question.
print(rostros.DESCR)
print(rostros.keys())
print(rostros.images)
print(rostros.data)
print(rostros.target.shape)

# create our LBPH face recognizer
# face_recognizer = cv2.face.createLBPHFaceRecognizer()

# or use EigenFaceRecognizer by replacing the line above with
# face_recognizer = cv2.face.createEigenFaceRecognizer()

# or use FisherFaceRecognizer by replacing the line above with
# face_recognizer = cv2.face.createFisherFaceRecognizer()

# train our face recognizer on our training faces
# train_and_evaluate(svc_3, X_train, X_test, y_train, y_test)
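# Hedged sketch (not in the original script): one way the commented-out LBPH
# recognizer above could be trained and queried. It assumes the opencv-contrib
# "face" module is installed, that `faces` is a list of grayscale numpy arrays,
# and that `labels` holds matching integer ids; newer OpenCV builds expose
# cv2.face.LBPHFaceRecognizer_create() instead of the createLBPHFaceRecognizer()
# name used in the comments above.
import cv2  # assumed to be available; likely already imported elsewhere in this script

if hasattr(cv2, 'face'):
    face_recognizer = cv2.face.LBPHFaceRecognizer_create()
    face_recognizer.train(faces, np.array(labels))
    predicted_label, confidence = face_recognizer.predict(faces[0])
    print("Predicted label:", predicted_label, "confidence:", confidence)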
def create_dataset(subject_id):
    import numpy as np
    import os
    from nilearn import datasets
    from nilearn.datasets import _get_dataset_dir
    from nilearn.datasets import _get_dataset
    from sklearn.datasets.base import Bunch
    import pylab as pl
    import nibabel as nb
    from remove import remove_range, remove

    dataset_name = 'machine_learning'
    runs = 4
    img_data = np.zeros((64, 64, 33, 1))
    lab_data = []
    session_data = []

    for r in range(runs):
        print 'RUN', r
        rv = None
        path = '/gablab/p/eegfmri/analysis/eeg/elists'
        path_all_codes = '/gablab/p/eegfmri/analysis/iaps/all_labels.txt'
        path_names2 = os.path.join(path, 'elist_IAPS_%s_%s_raw.txt' % (subject_id, r + 1))
        if subject_id == '009':
            path_names2 = os.path.join(path, 'elist_IAPS_%s_%s.txt' % (subject_id, r + 1))

        eegcodes = np.genfromtxt(path_all_codes, dtype=int)[:, 0]
        attributes = np.genfromtxt(path_all_codes, dtype=float)[:, 1:4]
        binary = attributes[:, 2]
        run_code = np.genfromtxt(path_names2, dtype=str)[:, 3]
        clock = np.genfromtxt(path_names2, dtype=str)[:, 4]

        cl = []
        tp = []
        for i in range(len(clock)):
            if run_code[i] == 'R128':
                timepoint = clock[i].lstrip('0123456789')
                tp.append(timepoint)
            if len(tp) > 0:
                clock[i] = clock[i].lstrip('0123456789')
                if clock[i] == tp[0]:
                    cl.append([i])
                    if run_code[i] != 'R128':
                        print i, run_code[i]
                if clock[i] != tp[0] and run_code[i] == 'R128':
                    print 'TR at index', i, 'removed.'
                    run_code[i] = 'remove'
        print 'Numbers of TR identical timepoints', len(cl)

        tr = []
        for idx, i in enumerate(run_code):
            if i == 'R128':
                tr.append([idx])
        print 'Number of TR counted from elist code', len(tr)

        rv = remove(run_code, 'R')
        rv = remove(rv, 'remove')
        rv = remove(rv, 'boundary')
        rv = remove(rv, 'SyncOn')
        rv = remove(rv, 'Start')
        rv = remove(rv, 'Userdefined')
        rv = remove(rv, 'LowCorrelation')
        rv = remove(rv, 'TSTART')
        rv = remove(rv, 'TPEAK')
        rv = remove(rv, 'TEND')

        for i in range(len(rv)):
            if rv[i] == 'R128':
                rv[i] = '-99'
            rv[i] = rv[i].lstrip('S')
            rv[i] = int(rv[i])

        # remove stimulus codes for responses
        rv = remove_range(rv, 240)

        for idx, i in enumerate(rv):
            for idx2, i2 in enumerate(eegcodes):
                if i == i2:
                    rv[idx] = binary[idx2]
        for idx, i in enumerate(rv):
            if i != -99:
                rv[idx - 1] = i
                rv[idx] = 0

        # remove last TR as it was apparently not recorded
        rv[-1] = 0
        rv = remove(rv, 0)
        for idx, i in enumerate(rv):
            if i == -99:
                rv[idx] = 0

        # Until now the list with negative / neutral labels also contains zeros,
        # which we want to get rid of. To do this, we replace each zero with the
        # code shown before it. The first two values will be deleted, as will the
        # first two TRs (after fmri_data_i gets assigned).
        for idx, z in enumerate(rv):
            if idx <= 2 and z == 0:
                rv[idx] = -77
            if idx > 2 and z == 0:
                rv[idx] = rv[idx - 1]
        for idx, z in enumerate(rv):
            if idx <= 1 and z != -77:
                print 'Warning, non-empty first two TRs were deleted.'

        rv = remove(rv, -77)
        unique = sorted(list(set(rv)))
        print 'Unique values in RV', unique

        t = open('/gablab/p/eegfmri/analysis/iaps/pilot%s/machine_learning/neg-neutr_attributes_run%s.txt' % (subject_id, r), 'w')
        for i in range(len(rv)):
            t.write("%s %s" % (rv[i], r))
            t.write('\n')
        t.close()
        print 'Labels Length:', len(rv)

        file_name = ['neg-neutr_attributes_run%s.txt' % (r),
                     'pilot%s_r0%s_bandpassed.nii.gz' % (subject_id, r)]
        fil = _get_dataset(dataset_name, file_name,
                           data_dir='/gablab/p/eegfmri/analysis/iaps/pilot%s' % (subject_id),
                           folder=None)
        ds_i = Bunch(func=fil[1], conditions_target=fil[0])

        labels_i = np.loadtxt(ds_i.conditions_target, dtype=np.str)
        bold_i = nb.load(ds_i.func)
        fmri_data_i = np.copy(bold_i.get_data())
        print 'Original fMRI data', fmri_data_i.shape
        fmri_data_i = fmri_data_i[..., 2:]
        print fmri_data_i.shape
        affine = bold_i.get_affine()
        mean_img_i = np.mean(fmri_data_i, axis=3)

        session_data = np.append(session_data, labels_i[:, 1])
        lab_data = np.append(lab_data, labels_i[:, 0])
        img_data = np.concatenate((img_data, fmri_data_i), axis=3)
        print '__________________________________________________________________________________________________________'

        if r == 3:
            img_data = img_data[..., 1:]
            print 'fMRI image', img_data.shape
            print 'Label Vector Length:', len(lab_data), 'Session Vector Length:', len(session_data)
            ni_img = nb.Nifti1Image(img_data, affine=None, header=None)
            nb.save(ni_img, '/gablab/p/eegfmri/analysis/iaps/pilot%s/machine_learning/all_runs.nii' % (subject_id))
            f = open('/gablab/p/eegfmri/analysis/iaps/pilot%s/machine_learning/neg-neutr_attributes_all_runs.txt' % (subject_id), 'w')
            for i in range(len(lab_data)):
                f.write("%s %s" % (lab_data[i], session_data[i]))
                f.write('\n')
            f.close()

    # set up concatenated dataset in nilearn format
    file_names = ['neg-neutr_attributes_all_runs.txt', 'all_runs.nii']
    files = _get_dataset(dataset_name, file_names,
                         data_dir='/gablab/p/eegfmri/analysis/iaps/pilot%s' % (subject_id),
                         folder=None)
    ds = Bunch(func=files[1], conditions_target=files[0])
    print ds.keys(), ds
    labels = np.loadtxt(ds.conditions_target, dtype=np.str)
    bold = nb.load(ds.func)
    fmri_data = np.copy(bold.get_data())
    print fmri_data.shape
    affine = bold_i.get_affine()  # just choose one
    # Compute the mean EPI: we do the mean along axis 3, which is time
    mean_img = np.mean(fmri_data, axis=3)
    return (ds, labels, bold, fmri_data, affine, mean_img)  # later 'ds' will be sufficient
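# Hedged usage sketch (not in the original module): how create_dataset() could be
# driven for a single pilot subject. '009' is the only subject id referenced above;
# any other id, and the presence of the expected files on disk, are assumptions.
if __name__ == '__main__':
    ds, labels, bold, fmri_data, affine, mean_img = create_dataset('009')
    print 'Dataset files:', ds
    print 'Labels shape:', labels.shape, 'fMRI shape:', fmri_data.shape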