def readFile(self, fileName=None):
    """Load ``fileName`` and list the keys of the parsed result.

    When ``fileName`` is ``None`` the call is a no-op.  Otherwise both
    listboxes and the text widget are cleared, the file is parsed with
    ``get_format_data`` and the keys of the parsed result are inserted
    into the first listbox.

    Args:
        fileName: Value forwarded to ``get_format_data(fileName=...)``;
            presumably a path to the input file.  ``None`` skips loading.

    Side effects:
        Sets ``self.data`` and ``self.result`` and prints progress
        messages to stdout.
    """
    if fileName is None:
        return

    # Reset every widget that displays the previously loaded file.
    self.clear_listbox(self.list1)
    self.clear_listbox(self.list2)
    self.clear_text(self.txt)

    # Single-argument print calls emit the same text under Python 2 and 3.
    print('reading from  %s' % (fileName,))

    # data is the original data; result is the per-row processing result.
    self.data, self.result = get_format_data(fileName=fileName)
    print('read file finished, len of data: %s len of result  %s'
          % (len(self.data), len(self.result)))

    self.insert_to_listbox(self.result.keys(), self.list1)
def _collect_multi_underscore_entries(result):
    """Return ``[cancer, value]`` pairs whose key has two or more underscores.

    The top-level ``'content'`` entry of ``result`` is skipped.
    """
    entries = []
    for cancer in result:
        if cancer == 'content':
            continue
        for key, value in result[cancer].items():
            if key.count('_') >= 2:
                entries.append([cancer, value])
    return entries


def keydb_build():
    """Build one marginal key database per CSV file found in ``./data``.

    Every ``./data/*.csv`` file is loaded with ``getData3``, formatted
    with ``get_format_data``, and each formatted row's second element
    (the note) is added to the database named ``get_name(f) + '.data'``
    (presumably ``<cancer name>.data`` -- confirm against ``get_name``).
    The databases are created relative to the current working directory,
    not inside a separate keydb directory.

    Timing of each phase is printed as it completes, and the full list of
    ``[description, elapsed_seconds]`` pairs is printed at the end.

    Returns:
        None.
    """
    # Project helpers are imported lazily, at call time.
    from get_data_breast import get_format_data
    from file_utilities import getData3
    from glob import glob

    overall_start = time.time()
    phase_start = overall_start
    # Each entry is a [description, elapsed_seconds] pair.
    times = []
    # Stays empty when no CSV file is found, so the final summary still works.
    name = ''

    for f in glob('./data/*.csv'):
        name = get_name(f)

        # Phase 1: load and format the data of this file.
        data = getData3(f)
        data, result = get_format_data(data)

        # NOTE(review): nothing consumes these entries yet; they are gathered
        # in their own list so the parsed result mapping is left untouched.
        nested_entries = _collect_multi_underscore_entries(result)

        elapsed = time.time() - phase_start
        print('laoding data finished. elapsed time= %s s' % (elapsed,))
        times.append(['loading data ' + name, elapsed])
        phase_start = time.time()

        # Phase 2: add every note to the database dedicated to this file
        # (e.g. breast.data).
        db_name = name + '.data'
        total = len(data)
        for i, value in enumerate(data, 1):
            note_start = time.time()
            keydb_marginal_add_note(value[1], dbName=db_name)
            print('%s / %s %s' % (i, total, time.time() - note_start))

        elapsed = time.time() - phase_start
        print('add note to marginal db finished. elapsed time= %s s'
              % (elapsed,))
        times.append(['adding data ' + name, elapsed])
        phase_start = time.time()

    elapsed = time.time() - overall_start
    print('finished exectuing. elapsed time= %s s' % (elapsed,))
    # Labelled with the last processed file name (empty if there was none).
    times.append(['total time ' + name, elapsed])
    print(times)