def generate_pattern_occurrences_per_patient(out_dir, sequence_file, min_sup, mapping_dir):
	'''Generate patterns and write their per-patient occurrences to CSV.

	Two files are produced inside out_dir:
	  patterns.csv -- the patterns returned by generate(), via patterns2csv()
	  temporal.csv -- a header row ('ID', 'p0', 'p1', ...) followed by
	                  whatever write_record() emits for each input row

	Parameters:
	  out_dir       -- output directory, given without a trailing slash
	  sequence_file -- either a dict whose values are dicts carrying the row
	                   under the 'data' key, or something in_out.read_csv()
	                   accepts
	  min_sup       -- forwarded unchanged to generate()
	  mapping_dir   -- forwarded unchanged to generate()
	'''

	# generate patterns; the two additional return values are not used here
	patterns, _p_pos, _p_neg = generate(sequence_file, min_sup, mapping_dir)

	# save patterns
	patterns2csv(patterns, out_dir + '/patterns.csv')

	# open writer
	out = in_out.write_csv(out_dir + '/temporal.csv')

	# open reader
	# isinstance() also accepts dict subclasses (OrderedDict, defaultdict, ...),
	# and .values() exists on both Python 2 and Python 3 dicts.
	if isinstance(sequence_file, dict):
		rows = (entry['data'] for entry in sequence_file.values())
	else:
		rows = in_out.read_csv(sequence_file)

	# make & write header: one column per pattern, named by its position
	header = ['ID'] + ['p' + str(p) for p in range(len(patterns))]
	out.writerow(header)

	# per the original note: each pattern is marked as matching the patient (1) or not (0)
	for row in rows:
		write_record(row, out, patterns)
def generate_pattern_occurrences_per_patient(out_dir, sequence_file, min_sup,
                                             mapping_dir):
    '''Generate patterns and write their per-patient occurrences to CSV.

    Two files are produced inside out_dir:
      patterns.csv -- the patterns returned by generate(), via patterns2csv()
      temporal.csv -- a header row ('ID', 'p0', 'p1', ...) followed by
                      whatever write_record() emits for each input row

    Parameters:
      out_dir       -- output directory, given without a trailing slash
      sequence_file -- either a dict whose values are dicts carrying the row
                       under the 'data' key, or something in_out.read_csv()
                       accepts
      min_sup       -- forwarded unchanged to generate()
      mapping_dir   -- forwarded unchanged to generate()
    '''

    # generate patterns; the two additional return values are not used here
    patterns, _p_pos, _p_neg = generate(sequence_file, min_sup, mapping_dir)

    # save patterns
    patterns2csv(patterns, out_dir + '/patterns.csv')

    # open writer
    out = in_out.write_csv(out_dir + '/temporal.csv')

    # open reader
    # isinstance() also accepts dict subclasses (OrderedDict, defaultdict, ...)
    # which an exact type comparison would wrongly send to read_csv().
    if isinstance(sequence_file, dict):
        rows = (entry['data'] for entry in sequence_file.values())
    else:
        rows = in_out.read_csv(sequence_file)

    # make & write header: one column per pattern, named by its position
    header = ['ID'] + ['p' + str(p) for p in range(len(patterns))]
    out.writerow(header)

    # per the original note: each pattern is marked as matching the patient (1) or not (0)
    for row in rows:
        write_record(row, out, patterns)
Ejemplo n.º 3
0
    def save_output(self,
                    benchmark=False,
                    sequence_file=False,
                    sub_dir='',
                    name='unnamed',
                    target=False):
        '''Write the records held in self.id2data to <out_dir>/<sub_dir>/<name>.csv.

        The first truthy flag, checked in the order benchmark, target,
        sequence_file, decides which columns are written:
          benchmark     -- the first three columns; the third becomes 1 when
                           it equals 'V' and 0 otherwise
          target        -- the first and last column; when self.survival is
                           False the last column becomes 0 for 'negative'
                           and 1 otherwise
          sequence_file -- every record unchanged, without a header row
          (no flag)     -- the first column followed by columns 3 up to, but
                           not including, the last one
        '''
        print('...saving processed data')

        columns = self.headers

        # This guard is always true as written (self.from_sql is never
        # evaluated), so the data is always saved to .csv.
        if True or not self.from_sql:
            # possibly make new directories
            target_dir = self.out_dir + '/' + sub_dir + '/'
            util.make_dir(target_dir)

            writer = write_csv(target_dir + name + '.csv')

            # header row; sequence files get none
            if benchmark:
                writer.writerow(columns[0:3])
            elif target:
                writer.writerow([columns[0], columns[-1]])
            elif not sequence_file:
                writer.writerow([columns[0]] + columns[3:-1])

            # data rows
            for record in self.id2data.values():
                fields = record['data']
                if benchmark:
                    row = fields[0:3]
                    row[2] = 1 if row[2] == 'V' else 0
                elif target:
                    if self.survival == False:
                        outcome = 0 if fields[-1] == 'negative' else 1
                    else:
                        outcome = fields[-1]
                    row = [fields[0], outcome]
                elif sequence_file:
                    row = fields
                else:
                    row = [fields[0]] + fields[3:-1]
                writer.writerow(row)
Ejemplo n.º 4
0
	def export(self, folder, suffix):
		'''Export the significant abstractions for the given suffix to a new file.

		Reads <folder><suffix>.csv, where each row holds a code followed by
		its abstractions, and writes <folder><suffix>_frequent.csv. For every
		code only those abstractions are kept whose '<abstraction>_<suffix>'
		entry exists in self.abstraction2counts and whose last element there
		is below self.ALPHA. Codes without any remaining abstraction are not
		written.
		'''
		# print() with a single argument behaves the same on Python 2 and 3
		print('...exporting significance results')
		rows = io.read_csv(folder + suffix + '.csv')
		# if a code occurs on several rows, the last row wins
		code2abstractions = {row[0]: row[1:] for row in rows}

		out = io.write_csv(folder + suffix + '_frequent.csv')
		counts = self.abstraction2counts
		# .items() (instead of the Python 2 only .iteritems()) works on both versions
		for key, vals in code2abstractions.items():
			frequent_vals = []
			for abstraction in vals:
				suffixed_abstraction = abstraction + '_' + suffix
				if suffixed_abstraction in counts and counts[suffixed_abstraction][-1] < self.ALPHA:
					frequent_vals.append(abstraction)

			if frequent_vals:
				out.writerow([key] + frequent_vals)
	def export(self):
		'''Write the compiled result to self.f_out as CSV.

		Row layout: the three source labels and the matching source paths,
		a blank row, the general headers and general stats, a blank row,
		then the result headers followed by one row per compiled data entry.
		'''
		writer = io.write_csv(self.f_out)
		emit = writer.writerow

		# where the result came from
		emit(['general source', 'predictor source', 'data source'])
		emit([self.f_general, self.f_predictors, self.f_data])
		emit([])

		# overall figures
		general = self.compiled_result['general']
		emit(general['headers'])
		emit(general['stats'])
		emit([])

		# individual results, preceded by their column headers
		emit(self.compiled_result['headers'])
		for record in self.compiled_result['data']:
			emit(record)
Ejemplo n.º 6
0
    def export(self):
        '''Write the compiled result to self.f_out as CSV.

        Row layout: the three source labels and the matching source paths,
        a blank row, the general headers and general stats, a blank row,
        then the result headers followed by one row per compiled data entry.
        '''

        def output_rows():
            # Lazy on purpose: each row is looked up right before it is
            # written, in the same order as the rows appear in the file.
            yield ['general source', 'predictor source', 'data source']
            yield [self.f_general, self.f_predictors, self.f_data]
            yield []
            yield self.compiled_result['general']['headers']
            yield self.compiled_result['general']['stats']
            yield []
            yield self.compiled_result['headers']
            for entry in self.compiled_result['data']:
                yield entry

        sink = io.write_csv(self.f_out)
        for line in output_rows():
            sink.writerow(line)
Ejemplo n.º 7
0
    def read_files_and_calculate_attributes(self, file, file_out, type=0):
        '''Read the input rows, collect per-patient time-ordered values and aggregate them.

        The first row is treated as the header row and the first column of
        every row as the patient ID. For each data row the values are inserted
        into self.patient_dict[<id>][<attribute>] at the position that keeps
        that patient's 'charttime' list in ascending order, and a label
        computed by self.determine_class() is inserted at the same position.

        Parameters (names kept for compatibility although `file` and `type`
        shadow builtins):
          file     -- input handed to io.read_csv() together with ','
          file_out -- output handed to io.write_csv(); the writer is stored
                      in self.writer
          type     -- forwarded to self.process_value_individual() and
                      self.aggregate_data()

        Returns whatever self.aggregate_data(type) returns.
        '''

        self.writer = io.write_csv(file_out)

        # print() with a single argument behaves the same on Python 2 and 3
        print('====== reading the data')
        rows = io.read_csv(file, ',')
        print('====== pointer to data obtained')

        seen_ids = set()  # set instead of list: O(1) membership test per row
        dataset_headers = []
        charttime_col = None  # column of 'charttime' in a row, resolved on the first data row

        for counter, row in enumerate(rows):

            if counter % 10000 == 0:
                print('====== ' + str(counter))

            # Assuming the headers are in the first row.
            if counter == 0:
                # Create all headers, also of derived categorial attributes
                # attributes over time and derivations of multiple attributes combined
                # will be derived later.
                for header in row[1:]:
                    header = header.lower()
                    # NOTE(review): drops the first five characters whenever
                    # 'hold_' occurs anywhere in the name, not only as a
                    # prefix -- confirm that this is intended.
                    if 'hold_' in header:
                        header = header[5:]
                    if header in self.categorial_mapping:
                        self.headers.extend(self.categorial_mapping[header])
                    else:
                        self.headers.append(header)
                    dataset_headers.append(header)
                self.headers.append('label')
                continue

            # Assuming ID is the first attribute.
            patient_id = row[0]
            if patient_id not in seen_ids:
                seen_ids.add(patient_id)
                self.patient_dict[patient_id] = {
                    header: [] for header in self.headers
                }
            patient = self.patient_dict[patient_id]

            # Get the time to order based upon it. The column is looked up in
            # dataset_headers (one entry per file column) rather than in
            # self.headers, whose positions shift once a categorial attribute
            # has been expanded into several names.
            if charttime_col is None:
                charttime_col = dataset_headers.index('charttime') + 1
            timestamp = time.strptime(row[charttime_col][0:15],
                                      "%d-%b-%y %H.%M")
            times = patient['charttime']

            # Currently no ordering of the times assumed. If they are, just append at the end
            index = 0
            while index < len(times) and times[index] < timestamp:
                index += 1

            for row_index in range(1, len(row)):
                column_name = dataset_headers[row_index - 1]
                if column_name == 'charttime':
                    patient['charttime'].insert(index, timestamp)
                else:
                    # Determine the values (there can be multiple in the case of categorial attributes)
                    features, values = self.process_value_individual(
                        column_name, row[row_index], type)
                    for i, value in enumerate(values):
                        patient[features[i]].insert(index, value)

            # Now assign the label
            patient['label'].insert(
                index,
                self.determine_class(patient['daysfromdischtodeath'][index],
                                     patient['expire_flg'][index]))

        return self.aggregate_data(type)
def execute(in_dir, delim, out_file, age_gender=False, counts_med=False,
			counts_med_enrich=False, counts_consult=False, counts_consult_enrich=False,
			counts_referral=False, counts_lab=False, tmprl=False,
			enriched_tmprl=False, knowledge_driven=False, counts_no_knowledge=False,
			tmprl_no_knowledge=False, separate=False, HISes=()):
	'''Merge the selected input files from in_dir into out_file.

	Each boolean flag selects one CSV file in in_dir; the selected files are
	merged in a fixed order through merge_file(), and CRC.csv is always merged
	last. The merged rows are then written to out_file, one row per ID,
	preceded by the accumulated header row. A warning is printed for every
	row whose length does not match the header; the row is written anyway.

	Parameters:
	  in_dir    -- directory holding the input CSV files (no trailing slash)
	  delim     -- forwarded to merge_file()
	  out_file  -- path of the merged output file
	  separate  -- when true, patient numbers belonging to the practices in
	               HISes are fetched via SQL and handed to merge_file(), and
	               merged_test is additionally written to a 'test'
	               sub-directory next to out_file
	  HISes     -- practice codes used in the SQL query when separate is true
	               (the default is an immutable empty tuple)
	'''
	merged = defaultdict(list)
	merged_test = defaultdict(list)
	headers = ['ID']

	# we may not need this.
	ID2HIS = {}

	# if we wish to separate, get dictionary of patient HIS sources using SQL.
	if separate:
		c = util.sql_connect().cursor()
		# NOTE(review): the query is assembled by string formatting, so HISes
		# must come from a trusted source; switch to bound parameters once
		# the driver's parameter style is known.
		HISes_str = "','".join(HISes)
		q = '''SELECT patientnummer 
				FROM patienten
				WHERE PRAKTIJKCODE IN ('{}')'''.format(HISes_str)
		c.execute(q)

		ID2HIS = {row[0]: row[0] for row in c}

	# (flag, file name) pairs in merge order; the outcome file always comes last
	optional_inputs = [
		(age_gender, 'AG.csv'),
		(counts_med, 'C_M.csv'),
		(counts_med_enrich, 'C_M_enrich.csv'),
		(counts_consult, 'C_C.csv'),
		(counts_consult_enrich, 'C_C_enrich.csv'),
		(counts_referral, 'C_R.csv'),
		(counts_lab, 'C_L.csv'),
		(tmprl, 'T.csv'),
		(enriched_tmprl, 'T_enrich.csv'),
		(knowledge_driven, 'K.csv'),
		(counts_no_knowledge, 'C_NK.csv'),
		(tmprl_no_knowledge, 'T_NK.csv'),
	]
	selected = [file_name for wanted, file_name in optional_inputs if wanted]
	selected.append('CRC.csv')

	for file_name in selected:
		headers = merge_file(in_dir + '/' + file_name, merged, headers, delim,
							 separate, ID2HIS, merged_test)

	def write_merged(path, rows_by_id):
		'''write the header and all rows to path, warning about length mismatches'''
		make_dir(path)
		out = io.write_csv(path)
		out.writerow(headers)
		for key in rows_by_id:
			values = rows_by_id[key]
			if len(headers) != 1 + len(values):
				print('unequal to header amount ({} vs {})! watch out.'.format(len(headers), len(values)))
			out.writerow([key] + values)

	# now write to new file (also check whether all results have same length)
	write_merged(out_file, merged)

	if separate:
		# rpartition also yields a sensible path when out_file has no '/':
		# 'merged.csv' -> 'test/merged.csv', 'a/b/c.csv' -> 'a/b/test/c.csv'
		head, sep, tail = out_file.rpartition('/')
		write_merged(head + sep + 'test/' + tail, merged_test)

	print('## Done Merging ##')
Ejemplo n.º 9
0
def execute(in_dir,
            delim,
            out_file,
            age_gender=False,
            counts_med=False,
            counts_med_enrich=False,
            counts_consult=False,
            counts_consult_enrich=False,
            counts_referral=False,
            counts_lab=False,
            all_counts=False,
            tmprl=False,
            enriched_tmprl=False,
            knowledge_driven=False,
            counts_no_knowledge=False,
            tmprl_no_knowledge=False,
            separate=False,
            HISes=()):
    '''Merge the selected input files from in_dir into out_file.

    Each boolean flag selects one CSV file in in_dir; the selected files are
    merged in a fixed order through merge_file(), and stroke.csv is always
    merged last. The merged rows are then written to out_file, one row per
    ID, preceded by the accumulated header row. A warning is printed for
    every row whose length does not match the header; the row is written
    anyway.

    Parameters:
      in_dir    -- directory holding the input CSV files (no trailing slash)
      delim     -- forwarded to merge_file()
      out_file  -- path of the merged output file
      separate  -- when true, patient numbers belonging to the practices in
                   HISes are fetched via SQL and handed to merge_file(), and
                   merged_test is additionally written to a 'test'
                   sub-directory next to out_file
      HISes     -- practice codes used in the SQL query when separate is true
                   (the default is an immutable empty tuple)
    '''
    merged = defaultdict(list)
    merged_test = defaultdict(list)
    headers = ['ID']

    # we may not need this.
    ID2HIS = {}

    # if we wish to separate, get dictionary of patient HIS sources using SQL.
    if separate:
        c = util.sql_connect().cursor()
        # NOTE(review): the query is assembled by string formatting, so HISes
        # must come from a trusted source; switch to bound parameters once
        # the driver's parameter style is known.
        HISes_str = "','".join(HISes)
        q = '''SELECT patientnummer 
				FROM patienten
				WHERE PRAKTIJKCODE IN ('{}')'''.format(HISes_str)
        c.execute(q)

        ID2HIS = {row[0]: row[0] for row in c}

    # (flag, file name) pairs in merge order; the outcome file always comes last
    optional_inputs = [
        (age_gender, 'AG.csv'),
        (counts_med, 'C_M.csv'),
        (counts_med_enrich, 'C_M_enrich.csv'),
        (counts_consult, 'C_C.csv'),
        (counts_consult_enrich, 'C_C_enrich.csv'),
        (counts_referral, 'C_R.csv'),
        (counts_lab, 'C_L.csv'),
        (tmprl, 'T.csv'),
        (enriched_tmprl, 'T_enrich.csv'),
        (knowledge_driven, 'K.csv'),
        (counts_no_knowledge, 'C_NK.csv'),
        (tmprl_no_knowledge, 'T_NK.csv'),
        (all_counts, 'counts.csv'),
    ]
    selected = [file_name for wanted, file_name in optional_inputs if wanted]
    selected.append('stroke.csv')

    for file_name in selected:
        headers = merge_file(in_dir + '/' + file_name, merged, headers, delim,
                             separate, ID2HIS, merged_test)

    def write_merged(path, rows_by_id):
        '''write the header and all rows to path, warning about length mismatches'''
        make_dir(path)
        out = io.write_csv(path)
        out.writerow(headers)
        for key in rows_by_id:
            values = rows_by_id[key]
            if len(headers) != 1 + len(values):
                print('unequal to header amount ({} vs {})! watch out.'.format(
                    len(headers), len(values)))
            out.writerow([key] + values)

    # now write to new file (also check whether all results have same length)
    write_merged(out_file, merged)

    if separate:
        # rpartition also yields a sensible path when out_file has no '/':
        # 'merged.csv' -> 'test/merged.csv', 'a/b/c.csv' -> 'a/b/test/c.csv'
        head, sep, tail = out_file.rpartition('/')
        write_merged(head + sep + 'test/' + tail, merged_test)

    print('## Done Merging ##')