コード例 #1
0
ファイル: confidence.py プロジェクト: leolincoln/EDW_intern
def keydb_get_dbs(folderName='./keydb/'):
    '''
    Collect the names of the .data key-database files found on disk.

    Two locations are searched for files matching ``*.data``: the given
    folder and the current working directory. Every matching path is passed
    through ``get_name`` and the results are returned in that order
    (folder matches first, then current-directory matches).

    Args:
        folderName: directory to search. A trailing path separator is
            optional; './keydb' and './keydb/' are treated the same.

    Returns:
        A list with one ``get_name(path)`` result per matching file.

    E.g. (from the original notes) with a folder ./keydb/ holding a single
    file called test.data:
    keydb_get_dbs(folderName='./keydb/')  -> [test]
    '''
    import glob
    import os
    # os.path.join only inserts a separator when one is missing, so a folder
    # given without the trailing slash no longer turns into the prefix
    # pattern './keydb*.data'.
    paths = glob.glob(os.path.join(folderName, '*.data'))
    paths.extend(glob.glob('*.data'))
    return [get_name(path) for path in paths]
コード例 #2
0
ファイル: confidence.py プロジェクト: leolincoln/EDW_intern
def keydb_build():
    '''
    Batch-process every CSV under ./data to build the per-file key databases.

    For each ./data/*.csv file the rows are loaded with getData3 and
    reformatted with get_format_data; the formatted result is then scanned
    for keys that contain two or more underscores.

    Intended outcome, per the original author's notes: one <name>.data
    database per CSV (named after the cancer type), written next to this
    script rather than into the keydb directory.

    NOTE(review): as written, an unconditional `continue` directly after the
    key scan skips the remainder of the loop body, so the timing bookkeeping
    and the keydb_marginal_add_note calls below it are never reached and
    `times` is still empty when it is printed. This looks like leftover
    debugging state - confirm before relying on this function.

    Takes no arguments and returns nothing. Relies on `time`, `re`,
    `get_name` and `keydb_marginal_add_note` being available at module
    level (not visible in this block).
    '''
    from get_data_breast import get_format_data
    from file_utilities import getData3
    start = time.time()
    # start0 is never reset, so it marks the beginning of the whole run.
    start0 = start
    times = []
    # times collects [label, elapsed_seconds] entries, one per timed step.

    # Disabled code kept as a string literal: wiping the key and marginal
    # databases before rebuilding.
    '''
    #clean all databases
    keydb_destroy()    
    keydb_marginal_destroy()
    elapsed = time.time()-start
    print 'destroy keydb and marginal db finished. elapsed time=',elapsed,'s'
    times.append((['destroy keydb and marginal db',elapsed]))    
    start = time.time()    
    '''
    
    
    #get file list
    from glob import glob
    files = glob('./data/*.csv')
    
    #lengths = []
    for f in files:
        #destroy the keydb for each data
        #keydb_marginal_destroy(get_name(f)+'.data')
        #continue
        #get data
        data = getData3(f)     
        data,result = get_format_data(data)
        # Walk every top-level entry of result except 'content' and look for
        # keys containing at least two underscores.
        for cancer in result.keys():
            if cancer == 'content':
                continue
            for key in result[cancer].keys():
                # NOTE(review): result is used as a mapping above (.keys(),
                # result[cancer]); calling .append on a plain dict would
                # raise AttributeError - confirm what get_format_data returns.
                if len(re.findall('\_',key))>=2:
                    result.append([cancer,result[cancer][key]])
        # NOTE(review): unconditional continue - everything below this line
        # inside the loop is unreachable as written.
        continue
        #lengths.append([f,len(data)])
        #continue
        elapsed = time.time()-start
        print 'laoding data finished. elapsed time=',elapsed,'s'   
        times.append((['loading data '+get_name(f),elapsed]))
        start = time.time()
        
    
        
        ###################test
        #testNote = '\nUTERINE CANCER STAGING SUMMARY\nd0 d1:data1\nd0 d1 d3:data3\nd1 d2: data2\nd1 d2: data3\n\nAmerican Joint Committee on Cancer (2009) Tumor-Node-Metastasis (TNM) staging for endometrial cancer:\nTumor (T):\t\tpT1a\nNodes (N):\t\tpN0\nMetastasis (M):\tpMX\n\n'
        #testResult = keydb_get_note(testNote)
        #keydb_marginal_add_note(testNote)
        #realResult = testResult.copy()
        #for key in testResult.keys():
        #    realResult[key]= keydb_marginal_newkey(key)
        ###################test over
        

        #load value  
        i=0
        for value in data:
            i+=1
            tempStart = time.time()
            #adding the note to db. 
            #keydb_marginal_add_note(value[1])
            # Add the note (second field of each row) to the database named
            # after the current CSV file, e.g. breast.data.
            keydb_marginal_add_note(value[1],dbName = get_name(f)+'.data' )

            # Progress line: row counter, total rows, seconds spent on this row.
            print i,'/',len(data), time.time() - tempStart            
            #valdb_add_note(value[1])
            
        
        
        elapsed = time.time()-start
        print 'add note to marginal db finished. elapsed time=',elapsed,'s'   
        times.append((['adding data ' +get_name(f),elapsed]))
        start = time.time()
        
        # Disabled code kept as a string literal: scoring every key/value of
        # the formatted result against the marginal and value databases.
        '''
        #detect keys, excluding those of content
        marginal_result=result.copy()
        marginaldb = keydb_marginal_load()
        valuedb = valuedb_load()
        for key1,value1 in result.items():
            for key2,value2 in value1.items():
                if key2 == 'content':
                    continue
                else:
                    for key in value2.keys():
                        marginal_result[key1][key2][key]['keyscore'] = keydb_marginal_newkey(key,marginaldb=marginaldb)
                        marginal_result[key1][key2][key]['valuescore'] = valuedb_newvalue(value2[key],valuedb = valuedb)
                        
        elapsed = time.time()-start
        print 'detecting key finished. elapsed time=',elapsed,'s'  
        times.append((['detecting data',elapsed]))
        start = time.time()                               
        '''
        
        
        # Disabled code kept as a string literal: rebuilding the marginal
        # database from the key database and probing one sample key.
        '''
        keydb_add_result(result)
        keydb = keydb_load()
        keydb_marginal_destroy()
        keydb_marginal_add_db(keydb)
        marginaldb = keydb_marginal_load()
        cResult = {}
        test = result[1][result[1].keys()[1]].keys()[0]
        
        marginal = keydb_marginal_marginal(test)
        chained = keydb_marginal_chained(test)
        
        '''
        
        # The next two string literals are also disabled code: alternative
        # ways of populating the key database.
        ''' ALTERNATIVE WAYS TO GET DB'''
        ''' USING GET_KEY_FREQ ROUTINE
        db = {}
        for record in result.values():
            db=dict_add(db,get_key_freq(record))
        keydb_add(db)
        ### USING ADD_NOTE_KEYDB ROUTINE
        db = {}
        for value[1] in data.values() as record:
            db=dict_add(db,keydb_get_note(record))
        keydb_add(db)
        '''
        # Total is measured from start0, i.e. since the function began, not
        # since this file's iteration started.
        elapsed = time.time()-start0
        print 'finished exectuing. elapsed time=',elapsed,'s'
        times.append(['total time '+get_name(f),elapsed])
    print(times)