Example #1
import json
import time

import joblib
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import Bunch

# process_fact, split_train_test and accu_list are assumed to be defined
# elsewhere in this module.


def process(raw_data_file, trainbunch_file, testbunch_file, split_value):
    '''
    Controls the whole preprocessing pipeline.
    Inputs: the raw data file, the bunch files in which to store the
    processed training and test sets, and the train/test split ratio.
    No return value; the processed bunches are written to file at the end.
    '''
    databunch = Bunch(
        contents=[],
        accu=[])  # databunch stores the processed contents of raw_data_file, later shuffled into training and test sets.
    # contents holds the processed facts, accu the crime labels (Chinese text), new_accu the binarized crime labels.
    print("Writing data >>>>")
    st = time.time()
    with open(raw_data_file, "r", encoding="utf-8") as rawdata_file:
        file_length = len(rawdata_file.readlines())
    cnt = 0  # tracks the processing progress
    with open(raw_data_file, "r", encoding="utf-8") as rawdata_file:
        for line_json in rawdata_file:  # each line is a JSON record with all the information we need
            line = json.loads(line_json)  # line is a dict
            fact = line["fact"]
            accusation = line["meta"]["accusation"]
            new_fact = process_fact(fact)  # the processed fact: only Chinese characters kept, then segmented with jieba
            databunch.contents.append(new_fact)
            databunch.accu.append(accusation)
            cnt += 1
            if cnt % 1000 == 0:
                print("Writing progress: {:.3f}%".format(cnt / file_length * 100))
    et = time.time()
    print("Done writing data! Took {:.3f}s".format(et - st))
    print("Processing data >>>>")
    st = time.time()
    # new_fact and accu are now in the bunch; shuffle databunch and split it into trainbunch and testbunch
    random_seed = np.random.randint(0, 100)  # reseeding with the same seed keeps every field of databunch in the same shuffled order
    for i in databunch.keys():
        np.random.seed(random_seed)
        np.random.shuffle(databunch[i])
    trainbunch, testbunch = split_train_test(databunch, split_value)
    # binarize accu into multi-hot label vectors
    multilabelbinarizer = MultiLabelBinarizer(accu_list)
    trainbunch.new_accu = multilabelbinarizer.fit_transform(trainbunch.accu)
    testbunch.new_accu = multilabelbinarizer.transform(testbunch.accu)
    # write trainbunch and testbunch to trainbunch_file and testbunch_file
    joblib.dump(trainbunch, trainbunch_file)
    joblib.dump(testbunch, testbunch_file)
    et = time.time()
    print("Done processing data! Took {:.3f}s".format(et - st))
Example #2
 
#print total faces and labels
print("Total faces: ", len(faces))
print("Total labels: ", len(labels))


#eval_faces = [np.reshape(a, (64, 64)) for a in faces]

#build a sklearn-style Bunch: images/data hold the face images, target holds one label per face
rostros = Bunch(DESCR="dataset description",
                images=faces,
                data=np.asarray(faces),     # image array (the sklearn convention would flatten this)
                target=np.asarray(labels))  # one label per face

#print dataset properties: rostros.data contains the image array and rostros.target the label list
print(rostros.DESCR)
print(rostros.keys())
print(rostros.images)
print(rostros.data)
print(rostros.target.shape)
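A Bunch is essentially a dictionary whose keys are also reachable as attributes, which is why both rostros['data'] and rostros.data work. A minimal sketch, assuming scikit-learn's sklearn.utils.Bunch:

from sklearn.utils import Bunch

b = Bunch(data=[1, 2, 3], target=[0, 1, 0])
print(b["data"] is b.data)  # True: item access and attribute access hit the same object
b.notes = "new keys can be added as attributes too"
print(sorted(b.keys()))     # ['data', 'notes', 'target']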
	
#create our LBPH face recognizer
#face_recognizer = cv2.face.createLBPHFaceRecognizer()

#or use EigenFaceRecognizer by replacing the line above with
#face_recognizer = cv2.face.createEigenFaceRecognizer()

#or use FisherFaceRecognizer by replacing the line above with
#face_recognizer = cv2.face.createFisherFaceRecognizer()

#train our face recognizer on our training faces
#train_and_evaluate(svc_3, X_train, X_test, y_train, y_test)
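For reference, a hedged sketch of what the commented-out training step might look like with a current OpenCV build: the factory functions were renamed to *_create in OpenCV 3.x, and cv2.face ships in the opencv-contrib-python package, not the base one. It assumes faces is a list of equal-sized grayscale crops and labels a list of ints:

import cv2
import numpy as np

# requires opencv-contrib-python; cv2.face is not in the base package
face_recognizer = cv2.face.LBPHFaceRecognizer_create()
face_recognizer.train(faces, np.array(labels))  # expects equal-sized gray images and an int label array

# predict returns a (label, confidence) pair for a single face crop
label, confidence = face_recognizer.predict(faces[0])
print("predicted:", label, "confidence:", confidence)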
Example #3
def create_dataset(subject_id):
    import os

    import numpy as np
    import nibabel as nb
    # _get_dataset is a private helper from an older nilearn release
    from nilearn.datasets import _get_dataset
    from sklearn.datasets.base import Bunch

    from remove import remove_range, remove

    dataset_name = 'machine_learning'
    runs = 4
    img_data = np.zeros((64,64,33,1))
    lab_data = []
    session_data = []
    for r in range(runs):
        print('RUN', r)
        rv = None
        path = '/gablab/p/eegfmri/analysis/eeg/elists'
        path_all_codes = '/gablab/p/eegfmri/analysis/iaps/all_labels.txt'
        path_names2 = os.path.join(path, 'elist_IAPS_%s_%s_raw.txt' %(subject_id, r+1))
        if subject_id == '009':
            path_names2 = os.path.join(path, 'elist_IAPS_%s_%s.txt' %(subject_id, r+1)) 
        eegcodes = np.genfromtxt(path_all_codes, dtype=int)[:, 0]
        attributes = np.genfromtxt(path_all_codes, dtype=float)[:, 1:4]
        binary = attributes[:, 2]
        run_code = np.genfromtxt(path_names2, dtype=str)[:, 3]
        clock = np.genfromtxt(path_names2, dtype=str)[:, 4]
        cl = []
        tp = []
        for i in range(len(clock)):
            if run_code[i] == 'R128':
                timepoint = clock[i].lstrip('0123456789')
                tp.append(timepoint)
            if len(tp) > 0:
                clock[i] = clock[i].lstrip('0123456789')
                if clock[i] == tp[0]:
                    cl.append([i])
                    if run_code[i] != 'R128':
                        print(i, run_code[i])
                if clock[i] != tp[0] and run_code[i] == 'R128':
                    print('TR at index', i, 'removed.')
                    run_code[i] = 'remove'
        print('Number of TRs with identical timepoints:', len(cl))
        tr = []
        for idx, i in enumerate(run_code):
            if i == 'R128':
                tr.append([idx])
        print('Number of TRs counted from elist code:', len(tr))
        # strip bookkeeping event codes from the list
        rv = remove(run_code, 'R')
        for code in ['remove', 'boundary', 'SyncOn', 'Start', 'Userdefined',
                     'LowCorrelation', 'TSTART', 'TPEAK', 'TEND']:
            rv = remove(rv, code)
        for i in range(len(rv)):
            if rv[i] == 'R128':
                rv[i] = '-99'
            rv[i] = rv[i].lstrip('S')
            rv[i] = int(rv[i])
        # remove stimulus codes for responses
        rv = remove_range(rv, 240)
        for idx, i in enumerate(rv):
            for idx2, i2 in enumerate(eegcodes):
                if i == i2:
                    rv[idx] = binary[idx2]            
        for idx, i in enumerate(rv):
            if i != -99:
                rv[idx-1] = i
                rv[idx] = 0
        # remove last TR as it was apparently not recorded
        rv[-1] = 0
        rv = remove(rv, 0)
        for idx, i in enumerate(rv):
            if i == -99:
                rv[idx] = 0
        
        # Until now the list of negative/neutral labels also contains zeros, which we want to get rid of.
        # To do this, we replace each zero with the code that appeared before it.
        # The first two values will be deleted, as will the first two TRs (after fmri_data_i is assigned).

        for idx, z in enumerate(rv):
            if idx <= 2 and z == 0:
                rv[idx] = -77
            if idx > 2 and z == 0:
                rv[idx] = rv[idx-1]

        for idx, z in enumerate(rv):
            if idx <= 1 and z != -77:
                print('Warning: non-empty first two TRs were deleted.')

        rv = remove(rv, -77)
        unique = sorted(set(rv))
        print('Unique values in RV:', unique)
        
        t = open('/gablab/p/eegfmri/analysis/iaps/pilot%s/machine_learning/neg-neutr_attributes_run%s.txt' % (subject_id, r), 'w')
        for i in range(len(rv)):
            t.write("%s %s" % (rv[i], r))
            t.write('\n')
        t.close()

        print('Labels length:', len(rv))
        file_name = ['neg-neutr_attributes_run%s.txt' % (r), 'pilot%s_r0%s_bandpassed.nii.gz' % (subject_id, r)]
        fil = _get_dataset(dataset_name, file_name, data_dir='/gablab/p/eegfmri/analysis/iaps/pilot%s' % (subject_id), folder=None)
        ds_i = Bunch(func=fil[1], conditions_target=fil[0])
        labels_i = np.loadtxt(ds_i.conditions_target, dtype=str)
        bold_i = nb.load(ds_i.func)
        fmri_data_i = np.copy(bold_i.get_data())
        print('Original fMRI data', fmri_data_i.shape)
        
        fmri_data_i = fmri_data_i[..., 2:]  # drop the first two TRs
        print(fmri_data_i.shape)

        affine = bold_i.get_affine()
        mean_img_i = np.mean(fmri_data_i, axis=3)
        session_data = np.append(session_data, labels_i[:, 1])
        lab_data = np.append(lab_data, labels_i[:, 0])
        img_data = np.concatenate((img_data, fmri_data_i), axis=3)
        print('__________________________________________________________________________________________________________')
        
        
        if r == 3:
            img_data = img_data[..., 1:]  # drop the all-zero initialization volume
            print('fMRI image', img_data.shape)
            print('Label vector length:', len(lab_data), 'Session vector length:', len(session_data))
            ni_img = nb.Nifti1Image(img_data, affine=None, header=None)
            nb.save(ni_img, '/gablab/p/eegfmri/analysis/iaps/pilot%s/machine_learning/all_runs.nii' % (subject_id))
            f = open('/gablab/p/eegfmri/analysis/iaps/pilot%s/machine_learning/neg-neutr_attributes_all_runs.txt' % (subject_id), 'w')
            for i in range(len(lab_data)):
                f.write("%s %s" % (lab_data[i], session_data[i]))
                f.write('\n')
            f.close()
            # set up the concatenated dataset in nilearn format
            file_names = ['neg-neutr_attributes_all_runs.txt', 'all_runs.nii']
            files = _get_dataset(dataset_name, file_names, data_dir='/gablab/p/eegfmri/analysis/iaps/pilot%s' % (subject_id), folder=None)
            ds = Bunch(func=files[1], conditions_target=files[0])
            print(ds.keys(), ds)
            labels = np.loadtxt(ds.conditions_target, dtype=str)
            bold = nb.load(ds.func)
            fmri_data = np.copy(bold.get_data())
            print(fmri_data.shape)
            affine = bold_i.get_affine()  # just choose one
            # Compute the mean EPI: take the mean along axis 3, which is time
            mean_img = np.mean(fmri_data, axis=3)

    return (ds, labels, bold, fmri_data, affine, mean_img)  # later 'ds' alone will be sufficient
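A hedged sketch of how the returned tuple might be consumed for decoding; NiftiMasker and SVC are standard nilearn/scikit-learn tools, and the masking parameters here are illustrative, not taken from the original analysis:

from nilearn.input_data import NiftiMasker
from sklearn.svm import SVC

ds, labels, bold, fmri_data, affine, mean_img = create_dataset('009')

# mask and standardize the 4D image into an (n_timepoints, n_voxels) matrix
masker = NiftiMasker(standardize=True)
X = masker.fit_transform(ds.func)
y = labels[:, 0]  # negative/neutral condition codes, one per retained TR

clf = SVC(kernel='linear')
clf.fit(X, y)
print('training accuracy:', clf.score(X, y))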