sys.exit() for f in edgesFiles: content = SFrame.read_csv(path + f + '.csv', na_values='null', verbose=verbose) if 'src' in content.column_names() and 'dst' in content.column_names(): g = g.add_edges(content, src_field='src', dst_field='dst') elif 'source' in content.column_names( ) and 'target' in content.column_names(): g = g.add_edges(content, src_field='source', dst_field='target') else: print "Unknown src_id/dst_id field: ", content.column_names() sys.exit() print g.summary() def parseResult(row, columns): for c in columns: if isinstance(c, list): result = '' found = False for x in c: if row[x] != None and row[x] != '': result += row[x] + ' ' found = True if found: return result else: if row[c] != None and row[c] != '':
def build_data_graph(
        file_path="/Users/blahiri/healthcare/documents/recommendation_system/"):
    """Build an SGraph linking beneficiaries to conditions, procedures and drugs.

    Four CSV files are read from ``file_path``. Every retained row becomes an
    edge whose source vertex is the beneficiary's ``desynpuf_id`` and which
    carries a ``relation`` attribute:

    * ``had_chronic``    -> ``chronic_condition``
      (beneficiary_summary_2008_2009.csv, ``chron_*`` columns equal to 1)
    * ``diagnosed_with`` -> ``dgns_cd``
      (transformed_claim_diagnosis_codes.csv)
    * ``underwent``      -> ``prcdr_cd``
      (transformed_claim_prcdr_codes.csv)
    * ``had_drug``       -> ``substancename``
      (prescribed_drugs.csv)

    Diagnosis, procedure and drug edges are added only for beneficiaries that
    have at least one chronic-condition edge.

    Args:
        file_path: Prefix prepended verbatim to each CSV file name, so it
            must end with a path separator. Defaults to the location that
            used to be hard-coded in this function.

    Returns:
        The populated SGraph. A summary of the graph is printed after each
        batch of edges is added.
    """
    beneficiaries = SFrame.read_csv(
        file_path + "beneficiary_summary_2008_2009.csv")
    # Collapse all chron_* columns into one dict column per beneficiary.
    bene_packed = beneficiaries.pack_columns(
        column_prefix='chron_', dtype=dict,
        new_column_name='chronic_conditions', remove_prefix=False)

    # For each row, emit one [name, value, desynpuf_id] list per entry of the
    # packed dict: the (key, value) tuple is extended with the beneficiary id
    # and converted to a list, and the comprehension yields a list of lists.
    bene_chrons = bene_packed.flat_map(
        ["chronic_condition_name", "chronic_condition_value", "desynpuf_id"],
        lambda x: [list(pair + (x['desynpuf_id'],))
                   for pair in x['chronic_conditions'].items()])

    # Keep only the conditions flagged with value 1; the flag column is not
    # needed afterwards.
    bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
    del bene_chrons['chronic_condition_value']
    # NOTE(review): the return value is discarded, so this relies on rename()
    # updating the frame in place -- confirm before porting to a library
    # version where rename() returns a copy instead.
    bene_chrons.rename({'chronic_condition_name': 'chronic_condition'})
    bene_chrons['relation'] = 'had_chronic'

    g = SGraph()
    g = g.add_edges(bene_chrons, src_field='desynpuf_id',
                    dst_field='chronic_condition')
    print(g.summary())

    # Distinct ids of patients with chronic conditions, to avoid repetition
    # when joining against the claim and drug tables below.
    # NOTE(review): like rename() above, add_column() is used for its
    # in-place effect.
    bene_with_chrons = SFrame(None)
    bene_with_chrons.add_column(bene_chrons['desynpuf_id'].unique(),
                                'desynpuf_id')

    claim_cols_to_drop = ['clm_id', 'clm_from_dt', 'clm_thru_dt',
                          'claim_type', 'clm_thru_year']

    def _add_patient_edges(graph, records, relation, dst_field,
                           cols_to_drop=()):
        """Add one edge per distinct row of records for chronic patients."""
        for column in cols_to_drop:
            del records[column]
        # The same patient can appear with the same code many times, so
        # de-duplicate once the dropped columns are gone.
        records = records.unique()
        # The join uses the columns the two frames share; patients with
        # chronic conditions but no matching record contribute no edge.
        linked = bene_with_chrons.join(records)
        linked['relation'] = relation
        graph = graph.add_edges(linked, src_field='desynpuf_id',
                                dst_field=dst_field)
        print(graph.summary())
        return graph

    # Which patient was diagnosed with which condition.
    tcdc = SFrame.read_csv(file_path + "transformed_claim_diagnosis_codes.csv")
    g = _add_patient_edges(g, tcdc, 'diagnosed_with', 'dgns_cd',
                           claim_cols_to_drop)

    # Which patient underwent which procedure; the code is forced to str by
    # the column type hint.
    tcpc = SFrame.read_csv(file_path + "transformed_claim_prcdr_codes.csv",
                           column_type_hints={'prcdr_cd': str})
    g = _add_patient_edges(g, tcpc, 'underwent', 'prcdr_cd',
                           claim_cols_to_drop)

    # Which patient was prescribed which medicine.
    pde = SFrame.read_csv(file_path + "prescribed_drugs.csv")
    g = _add_patient_edges(g, pde, 'had_drug', 'substancename')

    return g
def build_data_graph(
        file_path="/Users/blahiri/healthcare/documents/recommendation_system/"):
    """Build an SGraph linking beneficiaries to conditions, procedures and drugs.

    Four CSV files are read from ``file_path``. Every retained row becomes an
    edge whose source vertex is the beneficiary's ``desynpuf_id`` and which
    carries a ``relation`` attribute:

    * ``had_chronic``    -> ``chronic_condition``
      (beneficiary_summary_2008_2009.csv, ``chron_*`` columns equal to 1)
    * ``diagnosed_with`` -> ``dgns_cd``
      (transformed_claim_diagnosis_codes.csv)
    * ``underwent``      -> ``prcdr_cd``
      (transformed_claim_prcdr_codes.csv)
    * ``had_drug``       -> ``substancename``
      (prescribed_drugs.csv)

    Diagnosis, procedure and drug edges are added only for beneficiaries that
    have at least one chronic-condition edge.

    Args:
        file_path: Prefix prepended verbatim to each CSV file name, so it
            must end with a path separator. Defaults to the location that
            used to be hard-coded in this function.

    Returns:
        The populated SGraph. A summary of the graph is printed after each
        batch of edges is added.
    """
    beneficiaries = SFrame.read_csv(
        file_path + "beneficiary_summary_2008_2009.csv")
    # Collapse all chron_* columns into one dict column per beneficiary.
    bene_packed = beneficiaries.pack_columns(
        column_prefix='chron_', dtype=dict,
        new_column_name='chronic_conditions', remove_prefix=False)

    # For each row, emit one [name, value, desynpuf_id] list per entry of the
    # packed dict: the (key, value) tuple is extended with the beneficiary id
    # and converted to a list, and the comprehension yields a list of lists.
    bene_chrons = bene_packed.flat_map(
        ["chronic_condition_name", "chronic_condition_value", "desynpuf_id"],
        lambda x: [list(pair + (x['desynpuf_id'],))
                   for pair in x['chronic_conditions'].items()])

    # Keep only the conditions flagged with value 1; the flag column is not
    # needed afterwards.
    bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
    del bene_chrons['chronic_condition_value']
    # NOTE(review): the return value is discarded, so this relies on rename()
    # updating the frame in place -- confirm before porting to a library
    # version where rename() returns a copy instead.
    bene_chrons.rename({'chronic_condition_name': 'chronic_condition'})
    bene_chrons['relation'] = 'had_chronic'

    g = SGraph()
    g = g.add_edges(bene_chrons, src_field='desynpuf_id',
                    dst_field='chronic_condition')
    print(g.summary())

    # Distinct ids of patients with chronic conditions, to avoid repetition
    # when joining against the claim and drug tables below.
    # NOTE(review): like rename() above, add_column() is used for its
    # in-place effect.
    bene_with_chrons = SFrame(None)
    bene_with_chrons.add_column(bene_chrons['desynpuf_id'].unique(),
                                'desynpuf_id')

    claim_cols_to_drop = ['clm_id', 'clm_from_dt', 'clm_thru_dt',
                          'claim_type', 'clm_thru_year']

    def _add_patient_edges(graph, records, relation, dst_field,
                           cols_to_drop=()):
        """Add one edge per distinct row of records for chronic patients."""
        for column in cols_to_drop:
            del records[column]
        # The same patient can appear with the same code many times, so
        # de-duplicate once the dropped columns are gone.
        records = records.unique()
        # The join uses the columns the two frames share; patients with
        # chronic conditions but no matching record contribute no edge.
        linked = bene_with_chrons.join(records)
        linked['relation'] = relation
        graph = graph.add_edges(linked, src_field='desynpuf_id',
                                dst_field=dst_field)
        print(graph.summary())
        return graph

    # Which patient was diagnosed with which condition.
    tcdc = SFrame.read_csv(file_path + "transformed_claim_diagnosis_codes.csv")
    g = _add_patient_edges(g, tcdc, 'diagnosed_with', 'dgns_cd',
                           claim_cols_to_drop)

    # Which patient underwent which procedure; the code is forced to str by
    # the column type hint.
    tcpc = SFrame.read_csv(file_path + "transformed_claim_prcdr_codes.csv",
                           column_type_hints={'prcdr_cd': str})
    g = _add_patient_edges(g, tcpc, 'underwent', 'prcdr_cd',
                           claim_cols_to_drop)

    # Which patient was prescribed which medicine.
    pde = SFrame.read_csv(file_path + "prescribed_drugs.csv")
    g = _add_patient_edges(g, pde, 'had_drug', 'substancename')

    return g