def generate_pattern_occurrences_per_patient(out_dir, sequence_file, min_sup, mapping_dir):
    '''generates patterns, then checks for occurrences per patient and writes to csv

    out_dir:       directory receiving patterns.csv and temporal.csv
    sequence_file: either a path to a sequence csv, or an in-memory dict of
                   {id: {'data': row}} records
    min_sup:       minimum support passed through to generate()
    mapping_dir:   mapping directory passed through to generate()'''
    # generate patterns (positive/negative pattern sets are not used here)
    patterns, p_pos, p_neg = generate(sequence_file, min_sup, mapping_dir)

    # save patterns
    patterns2csv(patterns, out_dir + '/patterns.csv')

    # open writer
    out_f = out_dir + '/temporal.csv'
    out = in_out.write_csv(out_f)

    # open reader; isinstance() instead of type()==dict, and values() instead
    # of the Python-2-only iteritems() (the key was unused anyway)
    if isinstance(sequence_file, dict):
        rows = (v['data'] for v in sequence_file.values())
    else:
        rows = in_out.read_csv(sequence_file)

    # make & write header
    header = ['ID'] + ['p' + str(p) for p in range(len(patterns))]
    out.writerow(header)

    # check for each pattern whether it matches in the patient (1) or not (0)
    for row in rows:
        write_record(row, out, patterns)
def generate_pattern_occurrences_per_patient(out_dir, sequence_file, min_sup, mapping_dir):
    '''generates patterns, then checks for occurrences per patient and writes to csv

    out_dir:       directory receiving patterns.csv and temporal.csv
    sequence_file: either a path to a sequence csv, or an in-memory dict of
                   {id: {'data': row}} records
    min_sup:       minimum support passed through to generate()
    mapping_dir:   mapping directory passed through to generate()'''
    # generate patterns (positive/negative pattern sets are not used here)
    patterns, p_pos, p_neg = generate(sequence_file, min_sup, mapping_dir)

    # save patterns
    patterns2csv(patterns, out_dir + '/patterns.csv')

    # open writer
    out_f = out_dir + '/temporal.csv'
    out = in_out.write_csv(out_f)

    # open reader; isinstance() instead of the type()==dict anti-pattern,
    # values() since the key was unused
    if isinstance(sequence_file, dict):
        rows = (v['data'] for v in sequence_file.values())
    else:
        rows = in_out.read_csv(sequence_file)

    # make & write header
    header = ['ID'] + ['p' + str(p) for p in range(len(patterns))]
    out.writerow(header)

    # check for each pattern whether it matches in the patient (1) or not (0)
    for row in rows:
        write_record(row, out, patterns)
def save_output(self, benchmark=False, sequence_file=False, sub_dir='', name='unnamed', target=False):
    '''saves processed data to the specified output directory

    benchmark:     write only the first three columns (id/age/gender style)
    sequence_file: write rows untouched (no header, no slicing)
    target:        write only id + target column
    otherwise:     write id + all data columns except the target

    Removed the dead `if True or not self.from_sql:` guard — it was always
    true, so the body now runs unconditionally as before.'''
    print('...saving processed data')

    headers = self.headers

    # possibly make new directories
    out_dir = self.out_dir + '/' + sub_dir + '/'
    util.make_dir(out_dir)

    f_out = out_dir + name + '.csv'
    out = write_csv(f_out)

    # write headers where required
    if benchmark:
        out.writerow(headers[0:3])
    elif target:
        out.writerow([headers[0], headers[-1]])
    elif sequence_file:
        pass
    else:
        out.writerow([headers[0]] + headers[3:-1])

    # write data
    for value in self.id2data.values():
        data = value['data']
        if benchmark:
            data = data[0:3]
            # binarize gender column: 'V' -> 1, anything else -> 0
            data[2] = 1 if data[2] == 'V' else 0
        elif target:
            # deliberate strict comparison: only an explicit False means
            # "classification"; any other value (incl. None) keeps the raw target
            if self.survival == False:
                data = [data[0], 0 if data[-1] == 'negative' else 1]
            else:
                data = [data[0], data[-1]]
        elif sequence_file:
            pass
        else:
            data = [data[0]] + data[3:-1]
        out.writerow(data)
def export(self, folder, suffix):
    '''export significant abstractions with the specified suffix to a new file

    Reads <folder><suffix>.csv (code -> abstractions), keeps per code only the
    abstractions whose suffixed counterpart has a significance value below
    self.ALPHA, and writes the result to <folder><suffix>_frequent.csv.
    Ported py2-only print statement / iteritems() to version-portable forms.'''
    print('...exporting significance results')

    rows = io.read_csv(folder + suffix + '.csv')
    code2abstractions = {row[0]: row[1:] for row in rows}

    out = io.write_csv(folder + suffix + '_frequent.csv')
    for key, vals in code2abstractions.items():
        frequent_vals = []
        for abstraction in vals:
            suffixed_abstraction = abstraction + '_' + suffix
            # keep only abstractions whose last count entry (p-value) beats ALPHA
            if (suffixed_abstraction in self.abstraction2counts
                    and self.abstraction2counts[suffixed_abstraction][-1] < self.ALPHA):
                frequent_vals.append(abstraction)
        # only write codes that retained at least one significant abstraction
        if len(frequent_vals) > 0:
            out.writerow([key] + frequent_vals)
def export(self):
    '''exports the compiled result to the file configured in self.f_out'''
    writer = io.write_csv(self.f_out)

    # section 1: the source files this result was compiled from
    writer.writerow(['general source', 'predictor source', 'data source'])
    writer.writerow([self.f_general, self.f_predictors, self.f_data])
    writer.writerow([])

    # section 2: general statistics
    general = self.compiled_result['general']
    writer.writerow(general['headers'])
    writer.writerow(general['stats'])
    writer.writerow([])

    # section 3: per-record results under their own header row
    writer.writerow(self.compiled_result['headers'])
    for record in self.compiled_result['data']:
        writer.writerow(record)
def read_files_and_calculate_attributes(self, file, file_out, type=0):
    '''reads the raw csv at `file`, groups records per patient id, expands
    categorial attributes into one header per category, attaches a label per
    record, and returns self.aggregate_data(type).
    Opens a writer on `file_out` (stored on self.writer for later use).
    NOTE(review): Python 2 code (print statements, dict.has_key); the names
    `file`, `type` and `id` shadow builtins.'''
    self.writer = io.write_csv(file_out)
    print '====== reading the data'
    rows = io.read_csv(file, ',')
    print '====== pointer to data obtained'
    counter = 0
    ids = []  # patient ids seen so far
    dataset_headers = []  # lower-cased file headers, in file column order
    for row in rows:
        # progress indicator every 10k rows
        if counter % 10000 == 0:
            print '====== ' + str(counter)
        # Assuming the headers are in the first row.
        if counter == 0:
            temp_dataset_headers = row[1:len(row)]
            # Create all headers, also of derived categorial attributes
            # attributes over time and derivations of multiple attributes combined
            # will be derived later.
            for header in temp_dataset_headers:
                header = header.lower()
                # drops the first five chars when 'hold_' occurs
                # (intended as a prefix strip)
                if 'hold_' in header:
                    header = header[5:len(header)]
                # categorial attributes expand into one header per category
                if self.categorial_mapping.has_key(header):
                    for var in self.categorial_mapping[header]:
                        self.headers.append(var)
                else:
                    self.headers.append(header)
                dataset_headers.append(header)
            self.headers.append('label')
        else:
            # Assuming ID is the first attribute.
            id = row[0]
            if id not in ids:
                # first record for this patient: initialise empty value lists
                ids.append(id)
                self.patient_dict[id] = {}
                for header in self.headers:
                    self.patient_dict[id][header] = []
            # Get the time to order based upon it
            # NOTE(review): assumes timestamps look like '01-JAN-10 12.30'
            # (first 15 chars, "%d-%b-%y %H.%M") -- TODO confirm with the data
            timestamp = time.strptime(
                row[self.headers.index('charttime') + 1][0:15],
                "%d-%b-%y %H.%M")
            times = self.patient_dict[id]['charttime']
            # Currently no ordering of the times assumed. If they are, just
            # append at the end
            # find the insert position that keeps charttimes sorted
            index = 0
            while index < len(times) and times[index] < timestamp:
                index += 1
            for row_index in range(1, len(row)):
                if dataset_headers[row_index - 1] == 'charttime':
                    self.patient_dict[id]['charttime'].insert(
                        index, timestamp)
                else:
                    # Determine the values (there can be multiple in the case of categorial attributes)
                    [features, values] = self.process_value_individual(
                        dataset_headers[row_index - 1], row[row_index], type)
                    for i in range(0, len(values)):
                        self.patient_dict[id][features[i]].insert(
                            index, values[i])
            # Now assign the label
            self.patient_dict[id]['label'].insert(
                index, self.determine_class(
                    self.patient_dict[id]['daysfromdischtodeath'][index],
                    self.patient_dict[id]['expire_flg'][index]))
        counter += 1
    return self.aggregate_data(type)
def execute(in_dir, delim, out_file, age_gender=False, counts_med=False,
            counts_med_enrich=False, counts_consult=False,
            counts_consult_enrich=False, counts_referral=False,
            counts_lab=False, tmprl=False, enriched_tmprl=False,
            knowledge_driven=False, counts_no_knowledge=False,
            tmprl_no_knowledge=False, separate=False, HISes=()):
    '''merge the in files to produce the out file

    Each enabled flag merges the corresponding csv from in_dir; the CRC
    outcome file is always merged last. If separate=True, patients whose id
    occurs in the HISes practices are written to a separate test file.
    Fixed: mutable default argument HISes=[] -> immutable (); removed the
    unused `skip` counter; replaced the repetitive if-chain with a table.'''
    merged = defaultdict(list)
    headers = ['ID']  # we may not need this.
    ID2HIS = {}
    merged_test = defaultdict(list)

    # if we wish to separate, get dictionary of patient HIS sources using SQL.
    if separate:
        c = util.sql_connect().cursor()
        HISes_str = "','".join(HISes)
        q = '''SELECT patientnummer FROM patienten WHERE PRAKTIJKCODE IN ('{}')'''.format(HISes_str)
        c.execute(q)
        ID2HIS = {row[0]: row[0] for row in c}

    # (flag, file) pairs, merged in this fixed order when enabled
    sources = [
        (age_gender, '/AG.csv'),
        (counts_med, '/C_M.csv'),
        (counts_med_enrich, '/C_M_enrich.csv'),
        (counts_consult, '/C_C.csv'),
        (counts_consult_enrich, '/C_C_enrich.csv'),
        (counts_referral, '/C_R.csv'),
        (counts_lab, '/C_L.csv'),
        (tmprl, '/T.csv'),
        (enriched_tmprl, '/T_enrich.csv'),
        (knowledge_driven, '/K.csv'),
        (counts_no_knowledge, '/C_NK.csv'),
        (tmprl_no_knowledge, '/T_NK.csv'),
    ]
    for enabled, fname in sources:
        if enabled:
            headers = merge_file(in_dir + fname, merged, headers, delim,
                                 separate, ID2HIS, merged_test)

    # the outcome column is always merged
    headers = merge_file(in_dir + '/CRC.csv', merged, headers, delim,
                         separate, ID2HIS, merged_test)

    # now write to new file (also check whether all results have same length)
    make_dir(out_file)
    out = io.write_csv(out_file)
    out.writerow(headers)
    for key in merged:
        if len(headers) != 1 + len(merged[key]):
            print('unequal to header amount ({} vs {})! watch out.'.format(
                len(headers), len(merged[key])))
        out.writerow([key] + merged[key])

    if separate:
        # derive the test file name by inserting 'test/' before the basename
        out_file_test = out_file[:out_file.rfind('/') + 1] + 'test' + out_file[out_file.rfind('/'):]
        make_dir(out_file_test)
        out = io.write_csv(out_file_test)
        out.writerow(headers)
        for key in merged_test:
            if len(headers) != 1 + len(merged_test[key]):
                print('unequal to header amount ({} vs {})! watch out.'.format(
                    len(headers), len(merged_test[key])))
            out.writerow([key] + merged_test[key])

    print('## Done Merging ##')
def execute(in_dir, delim, out_file, age_gender=False, counts_med=False,
            counts_med_enrich=False, counts_consult=False,
            counts_consult_enrich=False, counts_referral=False,
            counts_lab=False, all_counts=False, tmprl=False,
            enriched_tmprl=False, knowledge_driven=False,
            counts_no_knowledge=False, tmprl_no_knowledge=False,
            separate=False, HISes=()):
    '''merge the in files to produce the out file

    Each enabled flag merges the corresponding csv from in_dir; the stroke
    outcome file is always merged last. If separate=True, patients whose id
    occurs in the HISes practices are written to a separate test file.
    Fixed: leftover debug print ('ja'); mutable default argument HISes=[] ->
    immutable (); removed the unused `skip` counter; replaced the repetitive
    if-chain with a table.'''
    merged = defaultdict(list)
    headers = ['ID']  # we may not need this.
    ID2HIS = {}
    merged_test = defaultdict(list)

    # if we wish to separate, get dictionary of patient HIS sources using SQL.
    if separate:
        c = util.sql_connect().cursor()
        HISes_str = "','".join(HISes)
        q = '''SELECT patientnummer FROM patienten WHERE PRAKTIJKCODE IN ('{}')'''.format(HISes_str)
        c.execute(q)
        ID2HIS = {row[0]: row[0] for row in c}

    # (flag, file) pairs, merged in this fixed order when enabled
    sources = [
        (age_gender, '/AG.csv'),
        (counts_med, '/C_M.csv'),
        (counts_med_enrich, '/C_M_enrich.csv'),
        (counts_consult, '/C_C.csv'),
        (counts_consult_enrich, '/C_C_enrich.csv'),
        (counts_referral, '/C_R.csv'),
        (counts_lab, '/C_L.csv'),
        (tmprl, '/T.csv'),
        (enriched_tmprl, '/T_enrich.csv'),
        (knowledge_driven, '/K.csv'),
        (counts_no_knowledge, '/C_NK.csv'),
        (tmprl_no_knowledge, '/T_NK.csv'),
        (all_counts, '/counts.csv'),
    ]
    for enabled, fname in sources:
        if enabled:
            headers = merge_file(in_dir + fname, merged, headers, delim,
                                 separate, ID2HIS, merged_test)

    # the outcome column is always merged
    headers = merge_file(in_dir + '/stroke.csv', merged, headers, delim,
                         separate, ID2HIS, merged_test)

    # now write to new file (also check whether all results have same length)
    make_dir(out_file)
    out = io.write_csv(out_file)
    out.writerow(headers)
    for key in merged:
        if len(headers) != 1 + len(merged[key]):
            print('unequal to header amount ({} vs {})! watch out.'.format(
                len(headers), len(merged[key])))
        out.writerow([key] + merged[key])

    if separate:
        # derive the test file name by inserting 'test/' before the basename
        out_file_test = out_file[:out_file.rfind('/') + 1] + 'test' + out_file[out_file.rfind('/'):]
        make_dir(out_file_test)
        out = io.write_csv(out_file_test)
        out.writerow(headers)
        for key in merged_test:
            if len(headers) != 1 + len(merged_test[key]):
                print('unequal to header amount ({} vs {})! watch out.'.format(
                    len(headers), len(merged_test[key])))
            out.writerow([key] + merged_test[key])

    print('## Done Merging ##')