def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False, counter=0):
		'''Count coded events per patient and append the counts to the patient data.

		A row contributes when its index is a key of self.id2data, its code
		survives self.generate_code and self.marshall_predictor, the truncated
		code matches `regex_string`, and its date lies between
		stroke_dates[1] and stroke_dates[2] of that patient. The resulting
		per-attribute counts are appended to dct[ID]['data'] for every patient
		and written to disk with save_obj.

		rows -- pandas DataFrame, read with itertuples(); its index is used as patient key
		headers -- column names of `rows`, used to locate the code and date columns
		code_column -- name of the column holding the code
		date_column -- sequence whose first element names the date column
		regex_string -- regular expression the truncated code has to match
		limit -- passed on to generate_code, generate_attributes and calculate_minmax
		suffix -- tested with `in` and indexed with [0] to build file names.
			NOTE(review): presumably a list of source names; the default ''
			would fail on suffix[0] -- confirm against the callers.
		incorporate_SOEP -- name of the SOEP column or False; the SOEP filter
			itself is disabled, only the column lookup remains
		counter -- number embedded in the names of the saved files

		Returns a tuple (attribute names, number of rows visited, number of
		visited rows belonging to a non-'negative' patient, suffix).
		'''

		important_features = ['CHOLBMT', 'RRDIKA', 'RRSYKA']

		# make convenient reference to the dictionary
		dct = self.id2data
		rows = rows.where((pd.notnull(rows)), None)

		# itertuples() puts the index at position 0, hence the +1 on column positions
		code_idx = headers.index(code_column) + 1
		date_idx = headers.index(date_column[0]) + 1

		# regex pattern to match (ATC/ICPC standards)
		pattern = re.compile(regex_string)

		is_lab = 'lab_results' in suffix

		if is_lab:
			# first pass: collect all numeric values per code so that bounds can be
			# derived from the data for codes without predefined bounds
			values_dict = dict()
			for row in rows.itertuples():
				code = row[code_idx]
				if code in important_features:
					continue
				try:
					value = float(row.valuen)
				except (ValueError, TypeError):
					# non-numeric or missing measurement
					continue
				values_dict.setdefault(code, []).append(value)

			minmax_dict = self.calculate_minmax(values_dict, pattern, limit)

		if incorporate_SOEP:
			# the SOEP filter is disabled; the lookup is retained from the original code
			headers.index(incorporate_SOEP)

		# number of visited rows, and of visited rows that belong to a patient
		# whose target is not 'negative'
		num_pos = 0
		num_total = 0
		attribute_count = dict()
		# attribute name -> {patient ID -> occurrences}
		attribute2ids = dict()
		# patient -> lab measurement name -> list of (date, value) tuples.
		# Created up front so the trend step below also works when no lab row
		# was accepted.
		ID2abstractions = dict()

		for row in tqdm(rows.itertuples()):
			num_total += 1

			# if key is not in the data dictionary, we skip it
			key = row.Index
			if key not in dct:
				continue

			if dct[key]['stroke_dates'][0] != 'negative':
				num_pos += 1

			date = str2date(row[date_idx], give_default_begin=True, give_default_end=True)
			begin = dct[key]['stroke_dates'][1]
			end = dct[key]['stroke_dates'][2]

			if code_column == 'specialisme':
				end = end - four_weeks()

			original_code = row[code_idx]
			if original_code is None:
				continue

			# K90/K89 codes are always excluded
			truncated_code = self.generate_code(original_code, limit)
			if truncated_code is None or truncated_code in ['K90', 'K89', 'k90', 'k89']:
				continue

			if not self.marshall_predictor(truncated_code, code_column):
				continue

			# only rows inside the required interval with a valid code are counted
			if not ((begin <= date and date <= end) and pattern.match(truncated_code)):
				continue

			if is_lab:
				# lab result abstraction
				try:
					val = float(row.valuen)
					if original_code not in important_features:
						min_val = minmax_dict[truncated_code]['low_bound']
						max_val = minmax_dict[truncated_code]['high_bound']
					else:
						min_val, max_val = self.determine_minmax(original_code)
				except (ValueError, TypeError):
					continue

				util.init_key(ID2abstractions, key, dict())
				util.init_key(ID2abstractions[key], original_code, [])
				ID2abstractions[key][original_code].append((date, val))

				if '' not in [val, min_val, max_val]:
					attr = get_value(val, min_val, max_val, original_code)

					if attr not in attribute_count:
						attribute_count[attr] = 0

					# check if attribute name and ID instance already exist, if not, make them
					util.init_key(attribute2ids, attr, dict())
					util.init_key(attribute2ids[attr], key, 0)

					# add 1 to the occurrence of the attribute in the instance
					attribute2ids[attr][key] += 1
					attribute_count[attr] += 1
				continue

			# no lab result collection: regular aggregation
			if 'cardiometabolism' in suffix:
				value = str(row.valuec)
			else:
				value = None

			attributes = self.generate_attributes(original_code, limit, suffix, value, src=code_column)
			# generate_attributes may return several attributes, which allows
			# subclasses (e.g. StandardEnrichProcess) to enrich the data
			for attr in attributes:
				if attr not in attribute_count:
					attribute_count[attr] = 0

				# check if attribute name and ID instance already exist, if not, make them
				util.init_key(attribute2ids, attr, dict())
				util.init_key(attribute2ids[attr], key, 0)

				# smoking is binary: once set for a patient it is not counted again
				if 'smoking' in suffix:
					if attribute2ids[attr][key] == 1:
						continue

				if 'allergies' in suffix:
					# only a positive test marks the allergy as present; negative
					# or untested is taken as absent
					if row.flag == 'POS':
						attribute2ids[attr][key] = 1
					else:
						attribute2ids[attr][key] = 0
				else:
					attribute2ids[attr][key] += 1
					attribute_count[attr] += 1

		# NOTE(review): min_val/max_val are whatever the LAST accepted lab row
		# left behind, so every attribute gets the same bounds; when no lab row
		# was accepted they are unbound and only the count is stored.
		for attr, count in attribute_count.items():
			try:
				self.statistics[attr + '_count/min/max'] = [count, min_val, max_val]
			except UnboundLocalError:
				self.statistics[attr + '_count'] = count

		if is_lab:
			# convert to trends PER lab result
			for ID, measurements in ID2abstractions.items():
				for k, points in measurements.items():
					# duplicates are removed and the values sorted before abstraction
					points = sorted(set(points))

					# with a single measurement no time series analysis is possible
					if len(points) > 1 and ID in dct:
						for abstraction in get_trends(k, points):
							attr = abstraction[0]  # get the state
							util.init_key(attribute2ids, attr, dict())
							util.init_key(attribute2ids[attr], ID, 0)
							attribute2ids[attr][ID] += 1

		# add data to each instance; patients without occurrences get 0
		to_save = {ID: [] for ID in dct}
		for ID in dct:
			data = dct[ID]['data']
			for id2occurrences in attribute2ids.values():
				occurrences = id2occurrences[ID] if ID in id2occurrences else 0
				data.append(occurrences)
				to_save[ID].append(occurrences)

		file_prefix = self.in_dir + suffix[0]
		save_obj(self.statistics, file_prefix + '_statistics.pkl')

		if self.survival == True:
			dict_path = file_prefix + '_dict_marshall' + str(counter) + '_survival' + '.pkl'
		else:
			dict_path = file_prefix + '_dict_marshall' + str(counter) + '.pkl'
		save_obj(to_save, dict_path)
		save_obj(list(attribute2ids.keys()), file_prefix + '_headers' + str(counter) + '.pkl')

		# return the keys to be used as headers when writing the processed data
		return list(attribute2ids.keys()), num_total, num_pos, suffix
Exemple #2
0
	def insert_data(self, rows, headers, code_column, date_column, regex_string, limit, suffix='', incorporate_SOEP=False):
		'''Insert state intervals for the coded events found in `rows`.

		A row is used when generate_code yields a code, the code is NOT a
		Marshall predictor, the patient ID is a key of self.id2data, the begin
		or end date of the event lies inside the patient's registration period,
		the truncated code matches `regex_string` and (if requested) the SOEP
		code is 'E'. For suffix 'lab_results' the values are additionally
		collected per patient and measurement and turned into trend intervals.

		rows -- iterable of ';'-separated strings
		headers -- column names, used to locate the columns inside a split row
		code_column -- name of the column holding the code
		date_column -- sequence naming the begin-date and end-date columns
		regex_string -- regular expression the truncated code has to match
		limit -- passed on to generate_code and generate_attributes
		suffix -- attribute suffix; defaults to `code_column` when empty
		incorporate_SOEP -- name of the SOEP column, or False to ignore SOEP codes

		Returns ([], -1, -1) to satisfy the return value requirement of the
		method 'process' in the superclass.
		'''

		# make convenient reference to the dictionary
		dct = self.id2data

		# get the right suffix to append for the attribute name
		if suffix == '':
			suffix = code_column
		is_lab = suffix == 'lab_results'

		# get the index of the relevant columns
		ID_idx = headers.index(self.ID_column)
		code_idx = headers.index(code_column)
		b_date_idx = headers.index(date_column[0])
		e_date_idx = headers.index(date_column[1])
		if is_lab:
			val_idx = headers.index('waarde')
			min_idx = headers.index('referentie_minimum')
			max_idx = headers.index('referentie_maximum')
		if incorporate_SOEP:
			SOEP_idx = headers.index(incorporate_SOEP)

		# regex pattern to match (ATC/ICPC standards)
		pattern = re.compile(regex_string)

		# patient -> lab measurement name -> list of (date, value) tuples.
		# Created up front so the trend step below also works when no lab row
		# was accepted.
		ID2abstractions = defaultdict(dict)

		# NOTE(review): processing stops once more than 5000 rows passed the
		# patient/code checks -- looks like a leftover debugging cap; confirm.
		max_rows = 5000
		current = 0

		# iterate over all instances
		for row in rows:
			if current > max_rows:
				break

			row = row.split(';')

			original_code = row[code_idx]
			if original_code is None:
				continue
			truncated_code = self.generate_code(original_code, limit)
			if truncated_code is None:
				continue

			# Marshall predictors are handled elsewhere, so such rows are skipped here
			if self.marshall_predictor(truncated_code, code_column):
				continue

			# if key is not in the data dictionary, we skip it
			key = row[ID_idx]
			if key not in dct:
				continue

			b_date = str2date(row[b_date_idx], give_default_begin=True) # begin of event
			e_date = str2date(row[e_date_idx], give_default_end=True) # end of event
			b_reg = dct[key]['stroke_dates'][1] # beginning of registration
			e_reg = dct[key]['stroke_dates'][2] # ending of registration
			if code_column == 'specialisme':
				e_reg = e_reg - four_weeks()

			if is_lab:
				val, min_val, max_val = self.make_lab_values(row[val_idx], row[min_idx], row[max_idx])
				if val == '':
					continue

			# either the begin or the end date has to fall inside the registration period
			in_interval = (b_reg <= b_date <= e_reg) or (b_reg <= e_date <= e_reg)

			if in_interval and pattern.match(truncated_code):

				# if we need to take the SOEP code of consults into account
				if (not incorporate_SOEP) or row[SOEP_idx] == 'E':

					if is_lab:
						# remember the raw value for the trend abstraction below
						util.init_key(ID2abstractions, key, defaultdict(dict))
						util.init_key(ID2abstractions[key], original_code, [])
						ID2abstractions[key][original_code].append((b_date, val))

						# a value abstraction needs the value and both reference bounds
						if '' not in [val, min_val, max_val]:
							attributes = [abstracts.get_value(val, min_val, max_val, original_code)]
						else:
							attributes = []
					else:
						attributes = self.generate_attributes(original_code, limit, suffix, src=code_column)

					# generate_attributes may return several attributes, which allows
					# subclasses (e.g. SequenceEnrichProcess) to enrich the data
					for attr in attributes:
						self.insert_state_interval(key, attr, b_date, e_date, original_code, code_column)

			current += 1

		if is_lab:
			# convert to trends PER lab result
			for ID, measurements in ID2abstractions.items():
				for k, points in measurements.items():
					# duplicates are removed and the values sorted before abstraction
					points = sorted(set(points))

					# with a single measurement no time series analysis is possible
					if len(points) > 1 and ID in dct:
						for abstraction in abstracts.get_trends(k, points):
							# k is the measurement code the trend belongs to; the loop
							# variable original_code would be the code of the last row read
							self.insert_state_interval(ID, *abstraction, original_code=k, src=code_column)

		# to satisfy return value requirement for the method 'process' in the superclass
		return [], -1, -1
Exemple #3
0
    def insert_start_baseline(self, rows, headers):
        '''Append each patient's earliest dated row (2007 or later) to stroke_dates.

        Rows whose index is not a key of self.id2data, whose date lies before
        2007, or whose 'icpc_cat' value is missing are ignored. For every
        patient with at least one remaining row, the smallest date is appended
        to self.id2data[patient]['stroke_dates'].

        rows -- pandas DataFrame, read with itertuples(); its index is used as patient key
        headers -- column names of `rows`; must contain 'icpc_cat' and 'dicpc_startdate'
        '''

        dct = self.id2data
        rows = rows.where((pd.notnull(rows)), None)

        # itertuples() puts the index at position 0, hence the +1
        code_idx = headers.index('icpc_cat') + 1
        date_idx = headers.index('dicpc_startdate') + 1

        rows_known = 0         # rows belonging to a patient in dct
        rows_unknown = 0       # rows belonging to nobody in dct
        rows_without_code = 0  # known-patient rows from 2007 on without a code
        seen_keys = set()      # set instead of list: membership test is O(1)
        earliest_visit = dict()  # patient -> smallest accepted date so far

        for row in tqdm(rows.itertuples()):
            key = row.Index

            if key not in dct:
                rows_unknown += 1
                continue

            seen_keys.add(key)
            rows_known += 1

            date = str2date(row[date_idx],
                            give_default_begin=True,
                            give_default_end=True)

            # the leading component of the printed date is taken as the year
            if int(str(date).split('-')[0]) < 2007:
                continue

            if row[code_idx] is None:
                rows_without_code += 1
                continue

            # only the overall minimum per patient is needed, so it is tracked
            # directly instead of storing every date per code
            if key in earliest_visit:
                earliest_visit[key] = min(earliest_visit[key], date)
            else:
                earliest_visit[key] = date

        for patient, visit_date in earliest_visit.items():
            self.id2data[patient]['stroke_dates'].append(visit_date)
            print(self.id2data[patient]['stroke_dates'])

        # debug output, same lines as before; the constant zeros stand for
        # counters that were never incremented
        print(rows_known, len(earliest_visit), rows_unknown)
        print(0, 0, rows_without_code, rows_without_code)
        print(0)
        print(len(seen_keys))
Exemple #4
0
    def get_stroke_occurrences(self, rows, headers):
        '''Store, for every known patient, the earliest date of a stroke-coded row.

        The date is written to id2data[patient]['stroke_dates'][0] and the
        target in id2data[patient]['data'][0] is set to [True] (survival mode)
        or 'positive'. Afterwards id2data is saved and the number of updates
        is stored in self.statistics['stroke count'].

        rows -- pandas DataFrame, read with itertuples(); its index is used as patient key
        headers -- column names of `rows`; must contain 'icpc' and 'dicpc_startdate'
        '''
        print('...getting all target (stroke) occurrences')

        # column positions (+1 because itertuples() puts the index first);
        # stroke_idx is looked up but not used below
        stroke_idx = headers.index('icpc') + 1
        date_idx = headers.index('dicpc_startdate') + 1

        # a code counts as stroke when any of these matches at its start
        stroke_patterns = tuple(
            re.compile(expr)
            for expr in ('K90', 'K90.03', 'K90.02', 'K90.01', 'K89'))

        row_cap = 500000000000000000
        handled = 0
        n_updates = 0

        rows = rows.where((pd.notnull(rows)), None)
        print(len(rows))

        for row in tqdm(rows.itertuples()):
            if handled > row_cap:
                break

            # rows with a blank date are ignored
            if row[date_idx] == " ":
                continue

            key = row.Index
            if key not in self.id2data:
                continue

            record = self.id2data[key]
            stroke = record['stroke_dates'][0]

            # string codes are normalised to their first three characters
            code = row.icpc
            if code is None:
                continue
            if type(code) is str:
                code = code.strip().upper()[0:3]

            code_date = str2date(date_str=row.dicpc_startdate,
                                 mdy=False,
                                 give_default_begin=True,
                                 give_default_end=True)

            survival_mode = self.survival
            if any(p.match(code) for p in stroke_patterns):
                # update when nothing was recorded yet or this date is earlier
                if survival_mode:
                    replace = ((isinstance(stroke, list)
                                and stroke[0] == False)
                               or stroke > code_date)
                    target = [True]
                else:
                    replace = stroke == 'negative' or stroke > code_date
                    target = 'positive'

                if replace:
                    record['stroke_dates'][0] = code_date
                    record['data'][0] = target
                    n_updates += 1

            handled += 1

        save_obj(self.id2data, self.in_dir + 'stroke_dict')
        self.statistics['stroke count'] = n_updates
Exemple #5
0
    def get_IDs(self, rows, headers):
        '''Create an id2data entry for every eligible patient row.

        A patient is skipped when the birth year is after 2000, when the
        registration or unregistration date cannot be determined, when the
        unregistration year is before 2007, or when a ValueError/TypeError is
        raised while handling the row. Each stored entry has the keys 'data'
        ([target, ID, age, gender]) and 'stroke_dates' ([target, registration,
        unregistration]); the target starts as 'negative', or [False] in
        survival mode. Counts are written to self.statistics and id2data is
        saved with save_obj.

        rows -- pandas DataFrame, read with itertuples(); its index is used as
            patient key and the columns birthyear, dgender, dentrdate and
            dexitdate are read
        headers -- column names of `rows` (only printed)

        Returns the header names ['ID', 'age', 'gender'].
        '''
        print('...getting all record IDs')

        print(self.ID_column)
        print(headers)

        unique_ids = set()  # set instead of list: membership test is O(1)
        n_too_young = 0
        n_no_registration = 0
        n_no_unregistration = 0
        before_07 = 0
        avg_age = []

        rows = rows.where((pd.notnull(rows)), None)

        # pair IDs with a dict corresponding to data and dates
        for row in tqdm(rows.itertuples()):
            key = row.Index  # not converted to int: keys may contain letters
            unique_ids.add(key)

            try:
                birthyear = int(row.birthyear)
                if birthyear > 2000:
                    n_too_young += 1
                    continue

                # age relative to 2018.
                # NOTE(review): appended before the date checks below, so the
                # average also covers patients that are skipped afterwards.
                ID_age = 2018 - birthyear
                avg_age.append(ID_age)

                val = dict()
                if self.survival == False:
                    val['data'] = ['negative', key, ID_age, row.dgender]
                else:
                    val['data'] = [[False], key, ID_age, row.dgender]

                registration = str2date(row.dentrdate,
                                        give_default_begin=False)
                unregistration = str2date(row.dexitdate,
                                          ymd=False,
                                          give_default_end=True)

                if registration is None:
                    n_no_registration += 1
                    continue
                if unregistration is None:
                    n_no_unregistration += 1
                    continue

                # the leading component of the printed date is taken as the year
                if int(str(unregistration).split('-')[0]) < 2007:
                    before_07 += 1
                    continue

                if self.survival == False:
                    val['stroke_dates'] = [
                        'negative', registration, unregistration
                    ]
                else:
                    val['stroke_dates'] = [[False], registration,
                                           unregistration]

                # add key/value pair
                self.id2data[key] = val

            except (ValueError, TypeError):
                # unparsable birth year or date: skip the patient
                continue

        self.statistics['unique ids'] = len(unique_ids)
        # NOTE(review): the key says 'too old' but the value counts birth
        # years after 2000; key left unchanged for existing consumers.
        self.statistics['too old ids'] = n_too_young
        # This count used to be stored under 'in database before study
        # started' and was immediately overwritten by before_07 below; it now
        # has its own key.
        self.statistics['no registration date'] = n_no_registration
        self.statistics['in database before until'] = n_no_unregistration
        self.statistics['in database before study started'] = before_07
        self.statistics['len id2data '] = len(self.id2data)
        self.statistics['average age'] = np.mean(avg_age)

        save_obj(self.id2data, self.in_dir + 'patient_dict')

        print('it worked!')
        return ['ID', 'age', 'gender']
Exemple #6
0
    def insert_data(self,
                    rows,
                    headers,
                    code_column,
                    date_column,
                    regex_string,
                    limit,
                    suffix='',
                    incorporate_SOEP=False):
        '''Count generated attributes per patient and append the counts to the patient data.

        For every row whose ID is a key of self.id2data (and, if requested,
        whose SOEP code is 'E'), the attributes produced by
        self.generate_attributes are counted per patient. Afterwards one
        number per attribute is appended to dct[ID]['data'] for every patient
        (0 when the attribute never occurred for that patient).

        rows -- iterable of indexable rows
        headers -- column names, used to locate the columns inside a row
        code_column -- name of the column holding the code
        date_column -- sequence whose first element names the date column
        regex_string -- regular expression (compiled, currently not applied)
        limit -- passed on to generate_attributes
        suffix -- attribute suffix; defaults to `code_column` when empty
        incorporate_SOEP -- name of the SOEP column, or False to ignore SOEP codes

        Returns the attribute names (dict keys view) to be used as headers
        when writing the processed data.
        '''

        # make convenient reference to the dictionary
        dct = self.id2data

        # get the index of the relevant columns
        ID_idx = headers.index(self.ID_column)
        code_idx = headers.index(code_column)
        date_idx = headers.index(date_column[0])

        if incorporate_SOEP:
            SOEP_idx = headers.index(incorporate_SOEP)

        # get the right suffix to append for the attribute name
        if suffix == '':
            suffix = code_column

        # regex pattern to match (ATC/ICPC standards)
        pattern = re.compile(regex_string)

        # attribute name -> {patient ID -> occurrences}
        attribute2counts = defaultdict(dict)
        for row in rows:

            # if key is not in the data dictionary, we skip it
            key = row[ID_idx]
            if key not in dct:
                continue

            # NOTE(review): date, begin, end and pattern are computed but never
            # used -- an interval/code filter seems to be missing here. Left
            # in place until the intended condition is confirmed.
            date = str2date(row[date_idx])
            begin = dct[key]['stroke_dates'][3]
            end = dct[key]['stroke_dates'][4]
            original_code = row[code_idx]

            # when SOEP codes matter (journaal case), only 'E' rows are used
            if incorporate_SOEP and row[SOEP_idx] != 'E':
                continue

            # generate attribute names
            attributes = self.generate_attributes(original_code,
                                                  limit,
                                                  suffix,
                                                  src=code_column)

            # generate_attributes may return several attributes, which allows
            # subclasses (e.g. StandardEnrichProcess) to enrich the data
            for attr in attributes:

                # check if attribute name and ID instance already exist, if not, make them
                util.init_key(attribute2counts, attr, defaultdict(dict))
                util.init_key(attribute2counts[attr], key, 0)

                # add 1 to the occurrence of the attribute for this patient
                # (the counter lives one level down, below the attribute name)
                attribute2counts[attr][key] += 1

        # add data to each instance
        for ID in dct:
            data = dct[ID]['data']

            for id2occurrences in attribute2counts.values():
                # occurrences of the attribute for this patient, else 0
                data.append(id2occurrences[ID] if ID in id2occurrences else 0)

        # return the keys to be used as headers when writing the processed data
        return attribute2counts.keys()
Exemple #7
0
    def insert_data(self,
                    rows,
                    headers,
                    code_column,
                    date_column,
                    regex_string,
                    limit,
                    suffix='',
                    incorporate_SOEP=False,
                    counter=0):
        '''inserts data from the specified csv and corresponding columns'''

        important_features = ['CHOLBMT', 'RRDIKA', 'RRSYKA']

        # read rows into list to re-use
        rows = rows.where((pd.notnull(rows)), None)

        # make convenient reference to the dictionary
        dct = self.id2data

        # # get data and corresponding headers
        # rows, headers = util.import_data(f, delim=self.delim)

        # get the index of the relevant columns
        # ID_idx = headers.index(self.ID_column)
        code_idx = headers.index(code_column) + 1
        date_idx = headers.index(date_column[0]) + 1
        b_date_idx = headers.index(date_column[0]) + 1
        e_date_idx = headers.index(date_column[1]) + 1

        # if incorporate_SOEP:
        # 	SOEP_idx = headers.index(incorporate_SOEP)

        # regex pattern to match (ATC/ICPC standards)
        pattern = re.compile(regex_string)

        # regex pattern to match (ATC/ICPC standards)
        pattern = re.compile(regex_string)

        if 'lab_results' in suffix:
            values_dict = dict()
            # val_idx = headers.index('valuen') + 1

            # pair IDs with a dict corresponding to data and dates
            for row in rows.itertuples():  #line in de data
                code = row[code_idx]
                # if we do not know the high and low values, determine by data distribution
                if code not in important_features:
                    if not code in values_dict:
                        try:
                            values_dict[code] = [float(row.valuen)]
                        except ValueError:
                            continue
                        except TypeError:
                            continue
                    else:
                        try:
                            values_dict[code].append(float(row.valuen))
                        except ValueError:
                            continue
                        except TypeError:
                            continue

            minmax_dict = self.calculate_minmax(values_dict, pattern, limit)

        # keep track of number of times the row is attributed to a positive stroke patient (or patient where the target instance = 'positive')
        num_pos = 0
        num_total = 0
        attribute_count = dict()
        # iterate over all instances, making a new dict with the new attributes as keys
        attribute2ids = dict()

        max = 100000000000000000
        current = 0

        # iterate over all instances
        for row in tqdm(rows.itertuples()):
            current += 1
            # row = row.split(';')

            if current > max:
                break
            else:
                num_total += 1

                # if key is not in the data dictionary, we skip it
                key = row.Index

                if not key in dct:
                    continue

            # init other vars
            b_date = str2date(row[b_date_idx],
                              give_default_begin=True)  # begin of event
            e_date = str2date(row[e_date_idx],
                              give_default_end=True)  # end of event
            b_reg = dct[key]['stroke_dates'][1]  # beginning of registration
            e_reg = dct[key]['stroke_dates'][2]  # ending of registration
            # print('wddup')
            # print(b_reg, e_reg)
            # print('xxx')

            # print(dct[key]['stroke_dates'][3], dct[key]['stroke_dates'][4])
            original_code = row[code_idx]
            if original_code == None:
                continue

            truncated_code = self.generate_code(original_code, limit)
            if truncated_code == None or truncated_code in [
                    'K90', 'K89', 'k90', 'k89'
            ]:
                continue

            print(b_reg, b_date, e_date)
            # print(b_reg <= b_date)
            # print(b_date <= e_reg)
            # print(b_reg <= e_date)
            # print(e_date <= e_reg)
            # if in the required interval (either beginning or ending date) AND code is valid
            if ((b_reg <= b_date and b_date <= e_reg) or
                (b_reg <= e_date
                 and e_date <= e_reg)) and pattern.match(truncated_code):

                # if we need to take the SOEP code of consults into account
                # if (not incorporate_SOEP) or (incorporate_SOEP and row[SOEP_idx] == 'E'):

                # generate attribute names
                if 'lab_results' in suffix:  # if we prepare for lab result abstraction

                    try:
                        val = float(row.valuen)
                        if not original_code in important_features:
                            min_val = minmax_dict[truncated_code]['low_bound']
                            max_val = minmax_dict[truncated_code]['high_bound']

                        else:
                            min_val, max_val = self.determine_minmax(
                                original_code)

                    except ValueError:
                        continue

                    except TypeError:
                        continue

                    val, min_val, max_val = self.make_lab_values(
                        val, min_val, max_val)

                    if not 'ID2abstractions' in locals():
                        # dict (patient) of dict (lab measurement name) of list of tuples (all value/date combinations of measurement)
                        ID2abstractions = dict()

                    util.init_key(ID2abstractions, key, dict())
                    util.init_key(ID2abstractions[key], original_code, [])

                    ID2abstractions[key][original_code].append((b_date, val))

                    if '' not in [val, min_val, max_val]:
                        attributes = [
                            get_value(val, min_val, max_val, original_code)
                        ]

                        # # add value abstraction as state interval
                        # self.insert_state_interval(key, attr, b_date, e_date)
                    else:
                        attributes = []

                else:
                    if 'cardiometabolism' in suffix:
                        val_idx = headers.index('valuec')
                        value = str(row[val_idx])

                    else:
                        value = None

                    attributes = self.generate_attributes(original_code,
                                                          limit,
                                                          suffix,
                                                          value,
                                                          src=code_column)

                # this loop allows multiple attributes to be created in the previous code line
                # this allows for other classes to subclass this class, e.g. SequenceEnrichProcess
                for attr in attributes:
                    if 'allergies' in suffix:
                        # val_idx = headers.index('flag')
                        value = row.flag

                        # check if the person actually has the allergy that was tested for
                        if value == 'POS':
                            self.insert_state_interval(key, attr, b_date,
                                                       e_date, original_code,
                                                       code_column)
                        # if negative or not tested, it is assumed that the person does not have that particular allergy
                        else:
                            continue
                    # insert a StateInterval object with the specified parameters
                    self.insert_state_interval(key, attr, b_date, e_date,
                                               original_code, code_column)

        if suffix == 'lab_results':  # do funky stuff with trends and abstractions
            # convert to trends PER lab result
            for ID in ID2abstractions:
                # print ID2abstractions[ID]
                for k, points in ID2abstractions[ID].items():

                    # the values are sorted before abstraction
                    points = sorted(list(set(points)))

                    # abstract the values and append to the current patient's sequence
                    # if only 1 measurement was done, we cannot do time series analysis
                    if len(points) > 1 and ID in dct:
                        abstractions = get_trends(k, points)
                        for abstraction in abstractions:
                            self.insert_state_interval(
                                ID,
                                *abstraction,
                                original_code=original_code,
                                src=code_column)
                        # self.id2data[ID]['data'] = self.id2data[ID]['data'] + abstractions

        # add data to each instance
        to_save = {}

        for ID in dct:
            to_save[ID] = []

        for ID in dct:
            data = dct[ID]['data']
            # to_save[ID] = []

            for id2occurrences in attribute2ids.values():

                # if patient has occurrences for the attribute, add that number, else add 0
                if ID in id2occurrences:
                    data.append(id2occurrences[ID])
                    to_save[ID].append(id2occurrences[ID])

                else:
                    data.append(0)
                    to_save[ID].append(0)

        if self.survival == True:
            save_obj(
                to_save, self.in_dir + suffix[0] + '_dict_temporal' +
                str(counter) + '_survival' + '.pkl')
            save_obj(
                list(attribute2ids.keys()), self.in_dir + suffix[0] +
                'temporal_headers' + str(counter) + '.pkl')
        else:
            save_obj(
                to_save, self.in_dir + suffix[0] + '_dict_temporal' +
                str(counter) + '.pkl')
            save_obj(
                list(attribute2ids.keys()), self.in_dir + suffix[0] +
                'temporal_headers' + str(counter) + '.pkl')
        # to satisfy return value requirement for the method 'process' in the superclass
        return [], -1, -1