def generate(sequence_file, min_sup, mapping_dir, verbose=False):
    '''Mine frequent temporal patterns for the positive and the negative class.

    sequence_file is either a path to a csv file or a dict whose values hold
    a patient sequence under the 'data' key.  Returns a 3-tuple:
    (all unique patterns, positive patterns, negative patterns).

    Fixes: the original used Python 2-only syntax (print statements,
    dict.iteritems) and duplicated the sequence-loading logic.
    '''
    global ENRICHMENT_DICT
    ENRICHMENT_DICT = import_enrichment_dicts(mapping_dir)

    def read_sequences():
        # Re-read for every pass: the generator/csv reader is exhausted after
        # one iteration, which is why the loading code was duplicated before.
        if isinstance(sequence_file, dict):
            return (v['data'] for v in sequence_file.values())
        return in_out.read_csv(sequence_file)

    sequences_pos = patients_with_class_val(read_sequences(), 'positive')
    sequences_neg = patients_with_class_val(read_sequences(), 'negative')

    if verbose:
        print("Patient dict:")
        print(sequences_neg)

    print('###### Mining positive CRC freq patterns ######')
    frequent_patterns_pos, MPTP_pos = mine(sequences_pos, min_sup, sequences_neg, verbose)
    print('###### Mining negative CRC freq patterns ######')
    frequent_patterns_neg, MPTP_neg = mine(sequences_neg, min_sup, sequences_pos, verbose)

    print('###### Done mining patterns ######')
    # Deduplicated union first, then the per-class lists for callers that
    # need to know which class a pattern came from.
    return (list(set(frequent_patterns_pos + frequent_patterns_neg)),
            frequent_patterns_pos,
            frequent_patterns_neg)
def fill_enrichment_dicts(mapping_files_dir):
    '''loads the enrichment mappings

    Each mapping csv has a code in the first column and the mapped values in
    the remaining columns.  For every mapping the pre-filtered '_frequent'
    variant is preferred; the unfiltered file is the fallback.

    Fixes: the original repeated the same try/except block five times and
    used bare excepts that would swallow any error.
    '''

    def load_rows(relative_paths, required):
        # Try each candidate file in order; return the rows of the first one
        # that can be read.  If none can be read, either re-raise (required)
        # or fall back to an empty mapping.
        for i, rel in enumerate(relative_paths):
            try:
                return io.read_csv(mapping_files_dir + rel)
            except (IOError, OSError):
                # NOTE(review): original used a bare except; narrowed to
                # file-system errors -- confirm io.read_csv raises these.
                if required and i == len(relative_paths) - 1:
                    raise
        return []

    # (result key, path stem, raise-if-no-file-readable)
    specs = [
        ('effects', '/atc/effect', True),
        ('indications', '/atc/indication', False),
        ('ingredients', '/atc/ingredient', False),
        ('manifestationof', '/icpc/manifestationof', False),
        ('association', '/icpc/association', False),
    ]
    result = dict()
    for key, stem, required in specs:
        rows = load_rows([stem + '_frequent.csv', stem + '.csv'], required)
        result[key] = {row[0]: row[1:] for row in rows}
    return result
def compile(self):
    '''compile the individual parts of the report'''
    # general segment
    general_rows = io.read_csv(self.f_general)
    self.compiled_result['general'] = self.compile_general(general_rows)

    # column headers of the result table
    self.compiled_result['headers'] = [
        'predictor',
        '# CRC', '% CRC',
        '# No CRC', '% No CRC',
        '# Total', '% Total',
        'P value', 'Model importance',
    ]

    # per-predictor results
    predictor_rows = io.read_csv(self.f_predictors)
    data_rows = io.read_csv(self.f_data)
    self.compiled_result['data'] = self.compile_data(predictor_rows, data_rows)
def generate_pattern_occurrences_per_patient(out_dir, sequence_file, min_sup, mapping_dir):
    '''generates pattern, then checks for occurrences per patient and writes to csv

    Fixes: dict.iteritems is Python 2-only; the sibling variant of this
    function already iterates with the Python 3-compatible form.
    '''
    # generate patterns
    patterns, p_pos, p_neg = generate(sequence_file, min_sup, mapping_dir)

    # save patterns for later inspection
    patterns2csv(patterns, out_dir + '/patterns.csv')

    # open writer
    out = in_out.write_csv(out_dir + '/temporal.csv')

    # open reader over the patient sequences
    if isinstance(sequence_file, dict):
        rows = (v['data'] for v in sequence_file.values())
    else:
        rows = in_out.read_csv(sequence_file)

    # make & write header: patient ID plus one indicator column per pattern
    header = ['ID'] + ['p' + str(p) for p in range(len(patterns))]
    out.writerow(header)

    # check for each pattern whether it matches in the patient (1) or not (0)
    for row in rows:
        write_record(row, out, patterns)
def merge_file(f, merged, headers, delim, separate, ID2HIS, merged_test):
    '''Append the columns of csv file f (minus its ID column) to each record.

    merged / merged_test map int patient IDs to row lists and are mutated in
    place; when separate is true, IDs present in ID2HIS go to merged and all
    others to merged_test.  Returns headers extended with f's column names.

    Fixes: Python 2-only print statement and rows.next(); bare except
    narrowed to file-system errors.
    '''
    try:
        rows = io.read_csv(f, delim)
    except (IOError, OSError):
        # NOTE(review): original used a bare except -- confirm io.read_csv
        # raises file-system errors here.
        print('{} does not exist, choose a different directory or exclude the specified file from merging. Skipped for now.'.format(f))
        return headers

    # first row holds the column names; drop the ID column
    headers = headers + next(rows)[1:]

    if not separate:
        for row in rows:
            ID = int(row[0])
            merged[ID] = merged[ID] + row[1:]
    else:
        # we separate test and training sets
        for row in rows:
            ID = int(row[0])
            target = merged if ID in ID2HIS else merged_test
            target[ID] = target[ID] + row[1:]
    return headers
def generate_pattern_occurrences_per_patient(out_dir, sequence_file, min_sup, mapping_dir):
    '''generates pattern, then checks for occurrences per patient and writes to csv'''
    # mine the patterns and persist them for later inspection
    patterns, _, _ = generate(sequence_file, min_sup, mapping_dir)
    patterns2csv(patterns, out_dir + '/patterns.csv')

    # writer for the per-patient occurrence matrix
    out = in_out.write_csv(out_dir + '/temporal.csv')

    # reader over the patient sequences
    if type(sequence_file) == dict:
        rows = (entry['data'] for entry in sequence_file.values())
    else:
        rows = in_out.read_csv(sequence_file)

    # header: patient ID plus one column per mined pattern
    out.writerow(['ID'] + ['p' + str(i) for i in range(len(patterns))])

    # one output record per patient: 1 if the pattern occurs, else 0
    for row in rows:
        write_record(row, out, patterns)
def import_data(f, delim=';'):
    '''import data and separates the column names from the data'''
    frame = read_csv(f, delim=delim)
    # pull the column labels out as a plain list
    column_names = list(frame.columns.values)
    return frame, column_names
def generate(sequence_file, min_sup, mapping_dir, verbose=False):
    '''Mine frequent temporal patterns for the positive and the negative class.

    sequence_file is either a path to a csv file or a dict whose values hold
    a patient sequence under the 'data' key.  mapping_dir is unused in this
    variant (enrichment is disabled).  Returns a 3-tuple:
    (all unique patterns, positive patterns, negative patterns).

    Fixes: `if bool(x) == False` anti-idiom, duplicated loading logic and
    dead commented-out code removed.
    '''

    def read_sequences():
        # A list (not a generator) so each pass gets fresh data without
        # duplicating the loading code.
        if isinstance(sequence_file, dict):
            return [v['data'] for v in sequence_file.values()]
        return in_out.read_csv(sequence_file)

    sequences_pos = patients_with_class_val(read_sequences(), ['positive', True])
    if not sequences_pos:
        print('No positive examples available in this subset of the data')

    sequences_neg = patients_with_class_val(read_sequences(), ['negative', False])

    if verbose:
        print("Patient dict:")
        print(sequences_neg)

    print('###### Mining positive stroke freq patterns ######')
    frequent_patterns_pos, MPTP_pos = mine(sequences_pos, min_sup, sequences_neg, verbose)
    print('###### Mining negative stroke freq patterns ######')
    frequent_patterns_neg, MPTP_neg = mine(sequences_neg, min_sup, sequences_pos, verbose)

    print('###### Done mining patterns ######')
    # Deduplicated union first, then the per-class lists.
    return (list(set(frequent_patterns_pos + frequent_patterns_neg)),
            frequent_patterns_pos,
            frequent_patterns_neg)
def compile(self):
    '''compile the individual parts of the report'''
    # general segment first
    self.compiled_result['general'] = self.compile_general(io.read_csv(self.f_general))

    # fixed column headers of the result table
    headers = ['predictor', '# CRC', '% CRC', '# No CRC', '% No CRC',
               '# Total', '% Total', 'P value', 'Model importance']
    self.compiled_result['headers'] = headers

    # per-predictor results (predictors read before data, as before)
    predictors = io.read_csv(self.f_predictors)
    data = io.read_csv(self.f_data)
    self.compiled_result['data'] = self.compile_data(predictors, data)
def enrich_from_file(self, in_dir):
    '''enrich using a data file as source

    Fixes: records.next() is Python 2-only; the sibling variant of this
    method already uses the built-in next().
    '''
    assert (in_dir != '')
    files = util.list_dir_csv(in_dir)
    # pick the medication export from the directory listing
    med_f = util.select_file(files, 'medicatie')
    records = io.read_csv(med_f)
    # first record holds the column names
    headers = util.get_headers(next(records))
    idx = headers.index('atc_code')
    return self.atc_enrichment(records, idx)
def enrich_from_file(self, in_dir):
    '''enrich using a data file as source'''
    assert (in_dir != '')
    # pick the journal export from the directory listing
    csv_files = util.list_dir_csv(in_dir)
    journal_file = util.select_file(csv_files, 'journaal')
    reader = io.read_csv(journal_file)
    # first record holds the column names
    header_row = util.get_headers(next(reader))
    icpc_idx = header_row.index('icpc')
    return self.icpc_enrichment(reader, icpc_idx)
def enrich_from_file(self, in_dir):
    '''enrich using a data file as source

    Fixes: records.next() is Python 2-only; the sibling variant of this
    method already uses the built-in next().
    '''
    assert(in_dir != '')
    files = util.list_dir_csv(in_dir)
    # pick the journal export from the directory listing
    med_f = util.select_file(files, 'journaal')
    records = io.read_csv(med_f)
    # first record holds the column names
    headers = util.get_headers(next(records))
    idx = headers.index('icpc')
    return self.icpc_enrichment(records, idx)
def generate(sequence_file, min_sup, mapping_dir, verbose=False):
    '''Mine frequent temporal patterns for the positive and the negative class.

    sequence_file is either a path to a csv file or a dict whose values hold
    a patient sequence under the 'data' key.  Returns a 3-tuple:
    (all unique patterns, positive patterns, negative patterns).

    Fixes: Python 2-only syntax (print statements, dict.iteritems) and
    duplicated sequence-loading logic.
    '''
    global ENRICHMENT_DICT
    ENRICHMENT_DICT = import_enrichment_dicts(mapping_dir)

    def read_sequences():
        # Re-read for every pass: the generator/csv reader is exhausted
        # after one iteration.
        if isinstance(sequence_file, dict):
            return (v['data'] for v in sequence_file.values())
        return in_out.read_csv(sequence_file)

    sequences_pos = patients_with_class_val(read_sequences(), 'positive')
    sequences_neg = patients_with_class_val(read_sequences(), 'negative')

    if verbose:
        print("Patient dict:")
        print(sequences_neg)

    print('###### Mining positive CRC freq patterns ######')
    frequent_patterns_pos, MPTP_pos = mine(sequences_pos, min_sup, sequences_neg, verbose)
    print('###### Mining negative CRC freq patterns ######')
    frequent_patterns_neg, MPTP_neg = mine(sequences_neg, min_sup, sequences_pos, verbose)

    print('###### Done mining patterns ######')
    return (list(set(frequent_patterns_pos + frequent_patterns_neg)),
            frequent_patterns_pos,
            frequent_patterns_neg)
def export(self, folder, suffix):
    '''export significant abstractions with the specified suffix to a new file

    Reads <folder><suffix>.csv (code, abstraction, ...) and writes
    <folder><suffix>_frequent.csv, keeping only abstractions whose last
    recorded count value is below self.ALPHA.

    Fixes: Python 2-only print statement and dict.iteritems; dead
    commented-out debug code removed.
    '''
    print('...exporting significance results')
    rows = io.read_csv(folder + suffix + '.csv')
    code2abstractions = {row[0]: row[1:] for row in rows}
    out = io.write_csv(folder + suffix + '_frequent.csv')
    for key, vals in code2abstractions.items():
        frequent_vals = []
        for abstraction in vals:
            # significance counts are keyed on '<abstraction>_<suffix>';
            # the last element appears to be the p-value compared to ALPHA
            suffixed_abstraction = abstraction + '_' + suffix
            if (suffixed_abstraction in self.abstraction2counts
                    and self.abstraction2counts[suffixed_abstraction][-1] < self.ALPHA):
                frequent_vals.append(abstraction)
        # only write codes that retain at least one significant abstraction
        if len(frequent_vals) > 0:
            out.writerow([key] + frequent_vals)
def merge_file(f, merged, headers, delim, separate, ID2HIS, merged_test):
    '''Append the columns of csv file f (minus its ID column) to each record.

    merged / merged_test map int patient IDs to row lists and are mutated in
    place; when separate is true, IDs present in ID2HIS go to merged and all
    others to merged_test.  Returns headers extended with f's column names.

    Fixes: Python 2-only print statement and rows.next(); bare except
    narrowed to file-system errors.
    '''
    try:
        rows = io.read_csv(f, delim)
    except (IOError, OSError):
        # NOTE(review): original used a bare except -- confirm io.read_csv
        # raises file-system errors here.
        print('{} does not exist, choose a different directory or exclude the specified file from merging. Skipped for now.'.format(f))
        return headers

    # first row holds the column names; drop the ID column
    headers = headers + next(rows)[1:]

    if not separate:
        for row in rows:
            ID = int(row[0])
            merged[ID] = merged[ID] + row[1:]
    else:
        # we separate test and training sets
        for row in rows:
            ID = int(row[0])
            target = merged if ID in ID2HIS else merged_test
            target[ID] = target[ID] + row[1:]
    return headers
def read_files_and_calculate_attributes(self, file, file_out, type=0):
    # Read a raw csv export (one measurement per row), group the rows per
    # patient into self.patient_dict ordered by charttime, derive the class
    # label for every measurement, and finally delegate to aggregate_data.
    # NOTE(review): uses Python 2-only constructs (print statements,
    # dict.has_key) -- this method will not run under Python 3.
    self.writer = io.write_csv(file_out)
    print '====== reading the data'
    rows = io.read_csv(file, ',')
    print '====== pointer to data obtained'
    counter = 0
    ids = []
    dataset_headers = []
    for row in rows:
        # progress indicator every 10k input rows
        if counter % 10000 == 0:
            print '====== ' + str(counter)
        # Assuming the headers are in the first row.
        if counter == 0:
            temp_dataset_headers = row[1:len(row)]
            # Create all headers, also of derived categorial attributes
            # attributes over time and derivations of multiple attributes combined
            # will be derived later.
            for header in temp_dataset_headers:
                header = header.lower()
                # strip the 'hold_' prefix from held measurements
                if 'hold_' in header:
                    header = header[5:len(header)]
                # a categorial attribute expands into one header per category
                if self.categorial_mapping.has_key(header):
                    for var in self.categorial_mapping[header]:
                        self.headers.append(var)
                else:
                    self.headers.append(header)
                dataset_headers.append(header)
            self.headers.append('label')
        else:
            # Assuming ID is the first attribute.
            id = row[0]
            # first time we see this patient: initialise empty value lists
            if id not in ids:
                ids.append(id)
                self.patient_dict[id] = {}
                for header in self.headers:
                    self.patient_dict[id][header] = []
            # Get the time to order based upon it.
            # assumes timestamps look like '01-JAN-12 13.37' -- TODO confirm
            timestamp = time.strptime(
                row[self.headers.index('charttime') + 1][0:15],
                "%d-%b-%y %H.%M")
            times = self.patient_dict[id]['charttime']
            # Currently no ordering of the times assumed; if they are
            # ordered, this just appends at the end.  Find the insertion
            # point that keeps the patient's series sorted by time.
            index = 0
            while index < len(times) and times[index] < timestamp:
                index += 1
            for row_index in range(1, len(row)):
                if dataset_headers[row_index - 1] == 'charttime':
                    self.patient_dict[id]['charttime'].insert(
                        index, timestamp)
                else:
                    # Determine the values (there can be multiple in the
                    # case of categorial attributes)
                    [features, values] = self.process_value_individual(
                        dataset_headers[row_index - 1], row[row_index], type)
                    for i in range(0, len(values)):
                        self.patient_dict[id][features[i]].insert(
                            index, values[i])
            # Now assign the label
            self.patient_dict[id]['label'].insert(
                index,
                self.determine_class(
                    self.patient_dict[id]['daysfromdischtodeath'][index],
                    self.patient_dict[id]['expire_flg'][index]))
        counter += 1
    return self.aggregate_data(type)
def fill_enrichment_dicts(mapping_files_dir):
    '''loads the enrichment mappings

    Every mapping csv has a code in column 0 and its mapped values in the
    remaining columns.  The pre-filtered '_frequent' file is preferred and
    the unfiltered file is the fallback.

    Fixes: five copy-pasted try/except blocks collapsed into one helper;
    bare excepts narrowed to file-system errors.
    '''

    def first_readable(candidates, required):
        # Return the rows of the first readable file among candidates; when
        # none is readable, re-raise for a required mapping, else return [].
        for position, path in enumerate(candidates):
            try:
                return io.read_csv(path)
            except (IOError, OSError):
                # NOTE(review): original had bare excepts -- confirm
                # io.read_csv raises file-system errors.
                if required and position == len(candidates) - 1:
                    raise
        return []

    mappings = [
        # (result key, subdirectory/file stem, required)
        ('effects', 'atc/effect', True),
        ('indications', 'atc/indication', False),
        ('ingredients', 'atc/ingredient', False),
        ('manifestationof', 'icpc/manifestationof', False),
        ('association', 'icpc/association', False),
    ]
    result = dict()
    for key, stem, required in mappings:
        base = mapping_files_dir + '/' + stem
        rows = first_readable([base + '_frequent.csv', base + '.csv'], required)
        result[key] = {row[0]: row[1:] for row in rows}
    return result