def generate(sequence_file, min_sup, mapping_dir, verbose=False):
	global ENRICHMENT_DICT
	ENRICHMENT_DICT = import_enrichment_dicts(mapping_dir)

	if type(sequence_file) == dict:
		sequences = (v['data'] for k, v in sequence_file.iteritems())
	else:
		sequences = in_out.read_csv(sequence_file)
	sequences_pos = patients_with_class_val(sequences, 'positive')

	if type(sequence_file) == dict:
		sequences = (v['data'] for k, v in sequence_file.iteritems())
	else:
		sequences = in_out.read_csv(sequence_file)
	sequences_neg = patients_with_class_val(sequences, 'negative')

	if verbose: 
		print "Patient dict:"
		print sequences_neg

	print '###### Mining positive CRC freq patterns ######'
	frequent_patterns_pos, MPTP_pos = mine(sequences_pos, min_sup, sequences_neg, verbose)
	print '###### Mining negative CRC freq patterns ######'
	frequent_patterns_neg, MPTP_neg = mine(sequences_neg, min_sup, sequences_pos, verbose)
	
	# print frequent_patterns_pos
	# print frequent_patterns_neg
	# save for later use !
	# MPTP_pos.update(MPTP_neg)
	# print len(list(set(frequent_patterns_pos+frequent_patterns_neg)))
	# print len(MPTP_pos)
	print '###### Done mining patterns ######'
	return list(set(frequent_patterns_pos+frequent_patterns_neg)), frequent_patterns_pos, frequent_patterns_neg
def fill_enrichment_dicts(mapping_files_dir):
	'''Load the enrichment mappings into a dict of dicts.

	Each mapping csv maps a code (first column) to its abstractions
	(remaining columns).  For every mapping the frequency-filtered
	*_frequent.csv variant is preferred; the unfiltered file is the
	fallback.  Returns a dict with the keys 'effects', 'indications',
	'ingredients', 'manifestationof' and 'association'.
	'''

	def read_optional(primary, fallback):
		# prefer the frequent file, fall back to the full file, and
		# tolerate both being absent (empty mapping in that case)
		for path in (primary, fallback):
			try:
				return io.read_csv(path)
			except Exception:
				continue
		return []

	def to_dict(rows):
		# code (first column) -> list of abstractions (rest of row)
		return {row[0]: row[1:] for row in rows}

	result = dict()

	# effects are mandatory: if the frequent file is missing, a failure
	# to read the unfiltered file propagates to the caller
	try:
		rows = io.read_csv(mapping_files_dir + '/atc/effect_frequent.csv')
	except Exception:
		rows = io.read_csv(mapping_files_dir + '/atc/effect.csv')
	result['effects'] = to_dict(rows)

	# the remaining mappings are optional
	result['indications'] = to_dict(read_optional(
		mapping_files_dir + '/atc/indication_frequent.csv',
		mapping_files_dir + '/atc/indication.csv'))
	result['ingredients'] = to_dict(read_optional(
		mapping_files_dir + '/atc/ingredient_frequent.csv',
		mapping_files_dir + '/atc/ingredient.csv'))
	result['manifestationof'] = to_dict(read_optional(
		mapping_files_dir + '/icpc/manifestationof_frequent.csv',
		mapping_files_dir + '/icpc/manifestationof.csv'))
	result['association'] = to_dict(read_optional(
		mapping_files_dir + '/icpc/association_frequent.csv',
		mapping_files_dir + '/icpc/association.csv'))

	return result
	def compile(self):
		'''Assemble the general segment, the column headers and the
		per-predictor data rows of the report into self.compiled_result.'''
		result = self.compiled_result

		# general segment of the report
		result['general'] = self.compile_general(io.read_csv(self.f_general))

		# column headers of the result table
		result['headers'] = [
			'predictor', '# CRC', '% CRC', '# No CRC', '% No CRC',
			'# Total', '% Total', 'P value', 'Model importance']

		# the actual per-predictor rows
		result['data'] = self.compile_data(
			io.read_csv(self.f_predictors), io.read_csv(self.f_data))
def generate_pattern_occurrences_per_patient(out_dir, sequence_file, min_sup, mapping_dir):
	'''generates pattern, then checks for occurrences per patient and writes to csv

	out_dir: directory receiving patterns.csv and temporal.csv.
	sequence_file: csv path or dict of {id: {'data': sequence}}.
	min_sup: minimum support forwarded to generate().
	mapping_dir: directory holding the enrichment mapping files.
	'''
	# generate and persist the patterns
	patterns, p_pos, p_neg = generate(sequence_file, min_sup, mapping_dir)
	patterns2csv(patterns, out_dir + '/patterns.csv')

	# open writer
	out_f = out_dir + '/temporal.csv'
	out = in_out.write_csv(out_f)

	# open reader (isinstance also accepts dict subclasses)
	if isinstance(sequence_file, dict):
		rows = (v['data'] for k, v in sequence_file.iteritems())
	else:
		rows = in_out.read_csv(sequence_file)

	# header: patient ID plus one column per pattern
	header = ['ID'] + ['p'+str(p) for p in range(len(patterns))]
	out.writerow(header)

	# check for each pattern whether it matches in the patient (1) or not (0)
	for row in rows:
		write_record(row, out, patterns)
Ejemplo n.º 5
0
def merge_file(f, merged, headers, delim, separate, ID2HIS, merged_test):
    try:
        rows = io.read_csv(f, delim)
    except:
        print '{} does not exist, choose a different directory or exclude the specified file from merging. Skipped for now.'.format(
            f)
        return headers

    headers = headers + rows.next()[1:]

    if not separate:
        for row in rows:
            ID = int(row[0])
            merged[ID] = merged[ID] + row[1:]
        return headers

    # we separate test and training sets
    else:
        for row in rows:
            ID = int(row[0])
            if ID in ID2HIS:
                merged[ID] = merged[ID] + row[1:]
            else:
                merged_test[ID] = merged_test[ID] + row[1:]
        return headers
def generate_pattern_occurrences_per_patient(out_dir, sequence_file, min_sup,
                                             mapping_dir):
    '''generates pattern, then checks for occurrences per patient and writes to csv

    out_dir: directory receiving patterns.csv and temporal.csv.
    sequence_file: csv path or dict of {id: {'data': sequence}}.
    min_sup: minimum support forwarded to generate().
    mapping_dir: directory holding the enrichment mapping files.
    '''
    # generate and persist the patterns
    patterns, p_pos, p_neg = generate(sequence_file, min_sup, mapping_dir)
    patterns2csv(patterns, out_dir + '/patterns.csv')

    # open writer
    out_f = out_dir + '/temporal.csv'
    out = in_out.write_csv(out_f)

    # open reader (isinstance also accepts dict subclasses)
    if isinstance(sequence_file, dict):
        rows = (v['data'] for k, v in sequence_file.items())
    else:
        rows = in_out.read_csv(sequence_file)

    # header: patient ID plus one column per pattern
    header = ['ID'] + ['p' + str(p) for p in range(len(patterns))]
    out.writerow(header)

    # check for each pattern whether it matches in the patient (1) or not (0)
    for row in rows:
        write_record(row, out, patterns)
Ejemplo n.º 7
0
def import_data(f, delim=';'):
    '''Read the delimited data file and return (rows, column names).'''
    frame = read_csv(f, delim=delim)
    column_names = frame.columns.values.tolist()
    return frame, column_names
Ejemplo n.º 8
0
def generate(sequence_file, min_sup, mapping_dir, verbose=False):
    '''Mine frequent temporal patterns for both class values.

    sequence_file: csv path or dict of {id: {'data': sequence}}.
    min_sup: minimum support threshold passed through to mine().
    mapping_dir: unused in this variant (enrichment import disabled).
    verbose: if True, dump the negative patient dict for debugging.
    Returns (all_patterns, positive_patterns, negative_patterns).
    '''

    def read_sequences():
        # the source must be re-read for each class value because the
        # csv reader is exhausted by the previous pass; the dict branch
        # builds a fresh list each time
        if isinstance(sequence_file, dict):
            return [v['data'] for v in sequence_file.values()]
        return in_out.read_csv(sequence_file)

    sequences_pos = patients_with_class_val(read_sequences(),
                                            ['positive', True])
    if not sequences_pos:
        print('No positive examples available in this subset of the data')

    sequences_neg = patients_with_class_val(read_sequences(),
                                            ['negative', False])

    if verbose:
        print("Patient dict:")
        print(sequences_neg)

    print('###### Mining positive stroke freq patterns ######')
    frequent_patterns_pos, MPTP_pos = mine(sequences_pos, min_sup,
                                           sequences_neg, verbose)
    print('###### Mining negative stroke freq patterns ######')
    frequent_patterns_neg, MPTP_neg = mine(sequences_neg, min_sup,
                                           sequences_pos, verbose)

    print('###### Done mining patterns ######')
    return (list(set(frequent_patterns_pos + frequent_patterns_neg)),
            frequent_patterns_pos, frequent_patterns_neg)
Ejemplo n.º 9
0
    def compile(self):
        '''Assemble the general segment, the column headers and the
        per-predictor data rows of the report into self.compiled_result.'''
        compiled = self.compiled_result

        # general segment of the report
        compiled['general'] = self.compile_general(
            io.read_csv(self.f_general))

        # column headers of the result table
        compiled['headers'] = [
            'predictor', '# CRC', '% CRC', '# No CRC', '% No CRC', '# Total',
            '% Total', 'P value', 'Model importance'
        ]

        # the actual per-predictor rows
        compiled['data'] = self.compile_data(io.read_csv(self.f_predictors),
                                             io.read_csv(self.f_data))
Ejemplo n.º 10
0
    def enrich_from_file(self, in_dir):
        '''enrich using a data file as source

        Picks the medication csv from in_dir and runs ATC enrichment
        on its records.
        '''
        assert (in_dir != '')
        files = util.list_dir_csv(in_dir)
        med_f = util.select_file(files, 'medicatie')
        records = io.read_csv(med_f)

        # next(records) (rather than records.next()) also works on
        # Python 3 readers; the first row holds the column names
        headers = util.get_headers(next(records))
        idx = headers.index('atc_code')

        return self.atc_enrichment(records, idx)
Ejemplo n.º 11
0
    def enrich_from_file(self, in_dir):
        '''Enrich using the journal csv found in in_dir as source.'''
        assert in_dir != ''

        # locate the journal file among the csv files in the directory
        csv_files = util.list_dir_csv(in_dir)
        journal_f = util.select_file(csv_files, 'journaal')
        reader = io.read_csv(journal_f)

        # first row contains the column names
        icpc_idx = util.get_headers(next(reader)).index('icpc')

        return self.icpc_enrichment(reader, icpc_idx)
Ejemplo n.º 12
0
	def enrich_from_file(self, in_dir):
		'''enrich using a data file as source

		Picks the journal csv from in_dir and runs ICPC enrichment on
		its records.
		'''
		assert(in_dir != '')
		files = util.list_dir_csv(in_dir)
		med_f = util.select_file(files, 'journaal')
		records = io.read_csv(med_f)

		# next(records) (rather than records.next()) also works on
		# Python 3 readers; the first row holds the column names
		headers = util.get_headers(next(records))
		idx = headers.index('icpc')

		return self.icpc_enrichment(records, idx)
def generate(sequence_file, min_sup, mapping_dir, verbose=False):
    global ENRICHMENT_DICT
    ENRICHMENT_DICT = import_enrichment_dicts(mapping_dir)

    if type(sequence_file) == dict:
        sequences = (v['data'] for k, v in sequence_file.iteritems())
    else:
        sequences = in_out.read_csv(sequence_file)
    sequences_pos = patients_with_class_val(sequences, 'positive')

    if type(sequence_file) == dict:
        sequences = (v['data'] for k, v in sequence_file.iteritems())
    else:
        sequences = in_out.read_csv(sequence_file)
    sequences_neg = patients_with_class_val(sequences, 'negative')

    if verbose:
        print "Patient dict:"
        print sequences_neg

    print '###### Mining positive CRC freq patterns ######'
    frequent_patterns_pos, MPTP_pos = mine(sequences_pos, min_sup,
                                           sequences_neg, verbose)
    print '###### Mining negative CRC freq patterns ######'
    frequent_patterns_neg, MPTP_neg = mine(sequences_neg, min_sup,
                                           sequences_pos, verbose)

    # print frequent_patterns_pos
    # print frequent_patterns_neg
    # save for later use !
    # MPTP_pos.update(MPTP_neg)
    # print len(list(set(frequent_patterns_pos+frequent_patterns_neg)))
    # print len(MPTP_pos)
    print '###### Done mining patterns ######'
    return list(set(
        frequent_patterns_pos +
        frequent_patterns_neg)), frequent_patterns_pos, frequent_patterns_neg
Ejemplo n.º 14
0
	def export(self, folder, suffix):
		'''export significant abstractions with the specified suffix to a new file

		folder: directory holding <suffix>.csv; the filtered result is
			written next to it as <suffix>_frequent.csv.
		suffix: abstraction type; also appended to each abstraction when
			looking it up in self.abstraction2counts.
		'''
		print '...exporting significance results'
		rows = io.read_csv(folder + suffix + '.csv')
		# code (first column) -> candidate abstractions (rest of row)
		code2abstractions = {row[0] : row[1:] for row in rows}

		out = io.write_csv(folder + suffix + '_frequent.csv')
		for key, vals in code2abstractions.iteritems(): 
			frequent_vals = []
			for abstraction in vals:
				# abstractions are keyed as '<abstraction>_<suffix>' in the
				# counts dict; keep only those whose last counts entry
				# (presumably a p-value -- TODO confirm) is below ALPHA
				suffixed_abstraction = abstraction+'_'+suffix
				if suffixed_abstraction in self.abstraction2counts and self.abstraction2counts[suffixed_abstraction][-1] < self.ALPHA:
					frequent_vals.append(abstraction)
				# if 'rectal discharge' in abstraction: 
				# 	print abstraction, suffixed_abstraction, suffixed_abstraction in self.abstraction2counts
				# 	print self.abstraction2counts.keys()[0]
				# 	print len(frequent_vals)
				# 	print self.abstraction2counts[suffixed_abstraction][-1], self.abstraction2counts[suffixed_abstraction][-1] < self.ALPHA

			# only codes retaining at least one significant abstraction
			# are written out
			if len(frequent_vals) > 0:
				out.writerow([key] + frequent_vals)
def merge_file(f, merged, headers, delim, separate, ID2HIS, merged_test):
	'''Merge the columns of csv file f into the merged record dict(s).

	f: path of the csv file to merge (first column is the patient ID).
	merged: dict of ID -> accumulated row, extended in place.
	headers: accumulated header list; the new column names are appended.
	delim: csv delimiter passed to the reader.
	separate: when True, rows whose ID is not in ID2HIS go to merged_test.
	ID2HIS: IDs that belong to the training set.
	merged_test: dict of ID -> accumulated row for the test set.
	Returns the extended header list (unchanged if f is unreadable).
	'''
	try:
		rows = io.read_csv(f, delim)
	except:
		# NOTE(review): bare except also hides errors other than a
		# missing file -- consider narrowing to IOError
		print '{} does not exist, choose a different directory or exclude the specified file from merging. Skipped for now.'.format(f)
		return headers

	# first row holds the column names; drop the ID column
	headers = headers + rows.next()[1:]
	
	if not separate:
		for row in rows:
			ID = int(row[0])
			merged[ID] = merged[ID] + row[1:]
		return headers

	# we separate test and training sets
	else:
		for row in rows:
			ID = int(row[0])
			if ID in ID2HIS:
				merged[ID] = merged[ID] + row[1:]
			else:
				merged_test[ID] = merged_test[ID] + row[1:] 
		return headers
Ejemplo n.º 16
0
    def read_files_and_calculate_attributes(self, file, file_out, type=0):
        '''Read the raw csv, build per-patient time-ordered attribute
        value lists in self.patient_dict, then return the aggregated
        data.

        file: comma-separated input; row 0 holds the headers, column 0
            the patient ID.
        file_out: output path on which self.writer is opened.
        type: mode flag forwarded to process_value_individual() and
            aggregate_data() -- semantics defined by those methods.
        '''

        self.writer = io.write_csv(file_out)

        print '====== reading the data'
        rows = io.read_csv(file, ',')
        print '====== pointer to data obtained'
        counter = 0
        ids = []               # patient ids seen so far
        dataset_headers = []   # normalized headers, in file column order

        for row in rows:

            # progress indicator every 10k rows
            if counter % 10000 == 0:
                print '====== ' + str(counter)
            # Assuming the headers are in the first row.
            if counter == 0:
                temp_dataset_headers = row[1:len(row)]

                # Create all headers, also of derived categorial attributes
                # attributes over time and derivations of multiple attributes combined
                # will be derived later.

                for header in temp_dataset_headers:
                    header = header.lower()
                    # drop the first 5 chars when 'hold_' occurs (intended
                    # as a prefix strip -- note the `in` test also fires
                    # when 'hold_' appears mid-string)
                    if 'hold_' in header:
                        header = header[5:len(header)]
                    # categorial attributes expand into one header per value
                    if self.categorial_mapping.has_key(header):
                        for var in self.categorial_mapping[header]:
                            self.headers.append(var)
                    else:
                        self.headers.append(header)
                    dataset_headers.append(header)
                self.headers.append('label')
            else:
                # Assuming ID is the first attribute.
                id = row[0]
                # first record for this patient: initialize empty value lists
                if id not in ids:
                    ids.append(id)
                    self.patient_dict[id] = {}
                    for header in self.headers:
                        self.patient_dict[id][header] = []
                # Get the time to order based upon it
                # (first 15 chars of the charttime cell, e.g. '01-Jan-12 10.30')
                timestamp = time.strptime(
                    row[self.headers.index('charttime') + 1][0:15],
                    "%d-%b-%y %H.%M")
                times = self.patient_dict[id]['charttime']
                # Currently no ordering of the times assumed. If they are, just append at the end
                # (linear scan gives the insertion-sort position of this record)
                index = 0
                while index < len(times) and times[index] < timestamp:
                    index += 1
                # insert every cell of the row at that position so all
                # per-attribute lists stay aligned with 'charttime'
                for row_index in range(1, len(row)):
                    if dataset_headers[row_index - 1] == 'charttime':
                        self.patient_dict[id]['charttime'].insert(
                            index, timestamp)
                    else:
                        # Determine the values (there can be multiple in the case of categorial attributes)
                        [features, values] = self.process_value_individual(
                            dataset_headers[row_index - 1], row[row_index],
                            type)
                        for i in range(0, len(values)):
                            self.patient_dict[id][features[i]].insert(
                                index, values[i])
                # Now assign the label
                self.patient_dict[id]['label'].insert(
                    index,
                    self.determine_class(
                        self.patient_dict[id]['daysfromdischtodeath'][index],
                        self.patient_dict[id]['expire_flg'][index]))
            counter += 1
        return self.aggregate_data(type)
Ejemplo n.º 17
0
def fill_enrichment_dicts(mapping_files_dir):
    '''Load the enrichment mappings into a dict of dicts.

    Each mapping csv maps a code (first column) to its abstractions
    (remaining columns).  For every mapping the frequency-filtered
    *_frequent.csv variant is preferred; the unfiltered file is the
    fallback.  Returns a dict with the keys 'effects', 'indications',
    'ingredients', 'manifestationof' and 'association'.
    '''

    def read_optional(primary, fallback):
        # prefer the frequent file, fall back to the full file, and
        # tolerate both being absent (empty mapping in that case)
        for path in (primary, fallback):
            try:
                return io.read_csv(path)
            except Exception:
                continue
        return []

    def to_dict(rows):
        # code (first column) -> list of abstractions (rest of row)
        return {row[0]: row[1:] for row in rows}

    result = dict()

    # effects are mandatory: if the frequent file is missing, a failure
    # to read the unfiltered file propagates to the caller
    try:
        rows = io.read_csv(mapping_files_dir + '/atc/effect_frequent.csv')
    except Exception:
        rows = io.read_csv(mapping_files_dir + '/atc/effect.csv')
    result['effects'] = to_dict(rows)

    # the remaining mappings are optional
    result['indications'] = to_dict(read_optional(
        mapping_files_dir + '/atc/indication_frequent.csv',
        mapping_files_dir + '/atc/indication.csv'))
    result['ingredients'] = to_dict(read_optional(
        mapping_files_dir + '/atc/ingredient_frequent.csv',
        mapping_files_dir + '/atc/ingredient.csv'))
    result['manifestationof'] = to_dict(read_optional(
        mapping_files_dir + '/icpc/manifestationof_frequent.csv',
        mapping_files_dir + '/icpc/manifestationof.csv'))
    result['association'] = to_dict(read_optional(
        mapping_files_dir + '/icpc/association_frequent.csv',
        mapping_files_dir + '/icpc/association.csv'))

    return result