def get_clean_data(rsa_file_path, rsa_format, ano_file_path, ano_format, meta_data, only_first_month=False):
    """Read paired RSA/ANO files and one-hot encode the valid records.

    The two files are read in lockstep, one line from each per iteration,
    until both are exhausted. A pair of lines is kept only when
    ``ano_tools.is_ano_ok`` and ``rsa_tools.is_rsa_ok`` both accept it.

    Args:
        rsa_file_path: Path of the RSA text file.
        rsa_format: Format descriptor forwarded to ``rsa_tools``.
        ano_file_path: Path of the ANO text file.
        ano_format: Format descriptor forwarded to ``ano_tools``.
        meta_data: Mapping with the keys ``'cmd'``, ``'stay_type'`` and
            ``'stay_complexity'``; each value is the sequence of known codes
            (it must support ``len()`` and ``.index()``).
        only_first_month: When true, keep only records whose
            ``exit_month`` equals 1.

    Returns:
        A dict with three entries:
        ``'anos'`` -- list of the values returned by ``ano_tools.get_ano``,
        ``'rsas'`` -- sparse CSR matrix with one row per kept record,
        ``'exit_month_data'`` -- list of the kept records' exit months.

    Raises:
        ValueError: If a non-empty code of a record is missing from the
            matching ``meta_data`` sequence (raised by ``.index()``).
    """
    # Local import: only needed for the progress output below.
    import sys

    cmd_codes = meta_data['cmd']
    stay_type_codes = meta_data['stay_type']
    stay_complexity_codes = meta_data['stay_complexity']

    ano_data = []
    exit_month_data = []

    # Number of records buffered in the dense array before it is converted
    # to a sparse block.
    chunk = 1000

    # Column layout of the encoded matrix: each feature owns a contiguous
    # range of columns that starts where the previous one ends.
    sex_data_first_col = 0
    # Two columns are reserved for sex
    # (assumes rsa['sex'] is 0 or 1 -- TODO confirm against rsa_tools).
    age_in_year_data_first_col = sex_data_first_col + 2
    age_in_day_data_first_col = age_in_year_data_first_col + formats.age_in_year_cols_count
    stay_length_data_first_col = age_in_day_data_first_col + formats.age_in_day_cols_count
    cmd_codes_first_col = stay_length_data_first_col + formats.stay_length_cols
    stay_type_codes_first_col = cmd_codes_first_col + len(cmd_codes)
    stay_complexity_codes_first_col = stay_type_codes_first_col + len(stay_type_codes)
    cols_count = stay_complexity_codes_first_col + len(stay_complexity_codes)

    # The builtin ``int`` replaces the ``np.int`` alias, which was removed
    # from NumPy; both name the same dtype.
    np_data = np.zeros((chunk, cols_count), dtype=int)

    # Sparse blocks are collected and stacked once at the end, which avoids
    # re-copying every accumulated row on each flush. The empty matrix stays
    # as the first block so the shape and dtype of the result are the same
    # as when the stacking was done incrementally.
    sparse_blocks = [sparse.csr_matrix((0, cols_count))]

    index = 0          # next free row in np_data
    global_index = 0   # number of records kept so far
    lines_count = 0    # number of line pairs read so far

    with open(rsa_file_path) as rsa_file, open(ano_file_path) as ano_file:
        while True:
            if index == chunk:
                # Dense buffer is full: store it as a sparse block and reuse it.
                sparse_blocks.append(sparse.csr_matrix(np_data))
                np_data.fill(0)
                index = 0

            rsa_line = rsa_file.readline()
            ano_line = ano_file.readline()

            if ano_tools.is_ano_ok(ano_line, ano_format) and rsa_tools.is_rsa_ok(rsa_line, rsa_format):
                rsa = rsa_tools.get_rsa(rsa_line, rsa_format)
                exit_month = rsa['exit_month']
                if not only_first_month or exit_month == 1:
                    exit_month_data.append(exit_month)
                    ano_data.append(ano_tools.get_ano(ano_line, ano_format, global_index))

                    row = np_data[index]
                    row[sex_data_first_col + rsa['sex']] = 1
                    row[age_in_year_data_first_col + rsa['age_in_year_cat']] = 1
                    row[age_in_day_data_first_col + rsa['age_in_day_cat']] = 1
                    row[stay_length_data_first_col + rsa['stay_length_cat']] = 1
                    # An empty code leaves every column of that feature at 0.
                    if rsa['cmd'] != '':
                        row[cmd_codes_first_col + cmd_codes.index(rsa['cmd'])] = 1
                    if rsa['stay_type'] != '':
                        row[stay_type_codes_first_col + stay_type_codes.index(rsa['stay_type'])] = 1
                    if rsa['stay_complexity'] != '':
                        row[stay_complexity_codes_first_col
                            + stay_complexity_codes.index(rsa['stay_complexity'])] = 1

                    index += 1
                    global_index += 1

            # Every line pair is counted, including those filtered out above,
            # so the progress line reports lines read versus records added.
            if lines_count % 10000 == 0:
                # sys.stdout.write works on both Python 2 and Python 3; the
                # leading '\r' rewrites the same terminal line each time.
                sys.stdout.write('\rPorcessed %s \t added %s' % (lines_count, global_index))
                sys.stdout.flush()
            lines_count += 1

            # Stop only once BOTH files are exhausted.
            if not rsa_line and not ano_line:
                break

    if index != 0:
        # Store the partially filled last buffer (only the rows in use).
        sparse_blocks.append(sparse.csr_matrix(np_data[0:index, :]))

    if len(sparse_blocks) > 1:
        rsa_data = vstack(sparse_blocks)
    else:
        # No record was kept: return the empty matrix itself.
        rsa_data = sparse_blocks[0]

    return {'anos': ano_data, 'rsas': rsa_data, 'exit_month_data': exit_month_data}
def get_clean_anos_data(rsa_file_path, rsa_format, ano_file_path, ano_format):
    """Collect the ANO records whose RSA/ANO line pair is valid.

    The two files are read in lockstep, one line from each per iteration,
    until both are exhausted. A pair is kept only when
    ``ano_tools.is_ano_ok`` and ``rsa_tools.is_rsa_ok`` both accept it.

    Args:
        rsa_file_path: Path of the RSA text file.
        rsa_format: Format descriptor forwarded to ``rsa_tools``.
        ano_file_path: Path of the ANO text file.
        ano_format: Format descriptor forwarded to ``ano_tools``.

    Returns:
        List of the values returned by ``ano_tools.get_ano`` for every
        valid pair, in file order.
    """
    # Local import: only needed for the progress output below.
    import sys

    ano_data = []
    lines_read = 0

    with open(rsa_file_path) as rsa_file, open(ano_file_path) as ano_file:
        while True:
            rsa_line = rsa_file.readline()
            ano_line = ano_file.readline()

            if ano_tools.is_ano_ok(ano_line, ano_format) and rsa_tools.is_rsa_ok(rsa_line, rsa_format):
                ano_data.append(ano_tools.get_ano(ano_line, ano_format))

            if lines_read % 10000 == 0:
                # sys.stdout.write works on both Python 2 and Python 3. The
                # double space reproduces the text the former print statement
                # emitted (its literal ended with a space and print added a
                # separator); the leading '\r' rewrites the same terminal line.
                sys.stdout.write('\rPorcessed  %s' % lines_read)
                sys.stdout.flush()
            lines_read += 1

            # Stop only once BOTH files are exhausted.
            if not rsa_line and not ano_line:
                break

    return ano_data