Ejemplo n.º 1
0
        sys.exit()

# Load every edge CSV listed in edgesFiles and attach its rows to the graph.
for f in edgesFiles:
    content = SFrame.read_csv(path + f + '.csv', na_values='null', verbose=verbose)
    _edge_columns = content.column_names()
    # Accepted (source, destination) column-name pairs, tried in priority order.
    for _src_col, _dst_col in (('src', 'dst'), ('source', 'target')):
        if _src_col in _edge_columns and _dst_col in _edge_columns:
            g = g.add_edges(content, src_field=_src_col, dst_field=_dst_col)
            break
    else:
        # Neither naming scheme is present: report the columns found and stop.
        print("%s %s" % ("Unknown src_id/dst_id field: ", _edge_columns))
        sys.exit()

print(g.summary())


def parseResult(row, columns):
    for c in columns:
        if isinstance(c, list):
            result = ''
            found = False
            for x in c:
                if row[x] != None and row[x] != '':
                    result += row[x] + ' '
                    found = True
            if found:
                return result
        else:
            if row[c] != None and row[c] != '':
Ejemplo n.º 2
0
def build_data_graph(file_path="/Users/blahiri/healthcare/documents/recommendation_system/"):
  """Build a graph linking patients to chronic conditions, diagnoses, procedures and drugs.

  Edges are added in four batches, each carrying a 'relation' column with the
  value 'had_chronic', 'diagnosed_with', 'underwent' or 'had_drug'. The last
  three batches are restricted to patients that have at least one chronic
  condition flagged with the value 1. The graph summary is printed after
  every batch.

  Args:
    file_path: Prefix (including the trailing separator) prepended to every
      input CSV file name. Defaults to the directory the original script used.

  Returns:
    The SGraph containing all four batches of edges.
  """
  beneficiaries = SFrame.read_csv(file_path + "beneficiary_summary_2008_2009.csv")
  # Collapse all 'chron_*' columns into one dict column; remove_prefix=False
  # keeps the full column name as the dict key.
  bene_packed = beneficiaries.pack_columns(column_prefix='chron_', dtype=dict,
                                           new_column_name='chronic_conditions',
                                           remove_prefix=False)

  # x is a row of bene_packed. Every (name, value) entry of its dict becomes a
  # [name, value, desynpuf_id] list; returning a list of lists makes flat_map
  # emit one output row per entry.
  # The first output column is named 'chronic_condition' directly instead of
  # being renamed afterwards: a discarded rename() result only takes effect on
  # SFrame versions where rename() mutates in place.
  # NOTE(review): Turi Create defaults to inplace=False - confirm installed version.
  # .items() (rather than .iteritems()) runs on both Python 2 and Python 3.
  bene_chrons = bene_packed.flat_map(
      ["chronic_condition", "chronic_condition_value", "desynpuf_id"],
      lambda x: [[name, value, x['desynpuf_id']]
                 for name, value in x['chronic_conditions'].items()])

  # Keep only the rows whose flag is 1; the flag column is then redundant.
  bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
  del bene_chrons['chronic_condition_value']

  g = SGraph()
  bene_chrons['relation'] = 'had_chronic'
  g = g.add_edges(bene_chrons, src_field='desynpuf_id', dst_field='chronic_condition')
  print(g.summary())

  # Take out the distinct IDs of patients with chronic conditions to avoid
  # repetition in the joins below. Built through the constructor so it does
  # not depend on add_column() mutating the frame in place.
  bene_with_chrons = SFrame({'desynpuf_id': bene_chrons['desynpuf_id'].unique()})

  # Claim bookkeeping columns that are dropped before de-duplicating claim rows.
  claim_columns_to_drop = ('clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year')

  # Add edges to the graph indicating which patient had which diagnosed condition.
  tcdc = SFrame.read_csv(file_path + "transformed_claim_diagnosis_codes.csv")
  for column in claim_columns_to_drop:
    del tcdc[column]
  # Same patient can be diagnosed with same condition multiple times a year, so take distinct.
  tcdc = tcdc.unique()
  # Take diagnosed conditions for only those patients who had some chronic
  # condition. It is possible that such a patient had no diagnosed condition.
  # join() is called without an explicit key, so it matches on the column
  # names both frames share (presumably only desynpuf_id - confirm against the CSV).
  bene_chrons_tcdc = bene_with_chrons.join(tcdc)
  bene_chrons_tcdc['relation'] = 'diagnosed_with'
  g = g.add_edges(bene_chrons_tcdc, src_field='desynpuf_id', dst_field='dgns_cd')
  print(g.summary())

  # Add edges to the graph indicating which patient had which procedure.
  # prcdr_cd is forced to str so the codes are not type-inferred from the CSV.
  tcpc = SFrame.read_csv(file_path + "transformed_claim_prcdr_codes.csv",
                         column_type_hints={'prcdr_cd': str})
  for column in claim_columns_to_drop:
    del tcpc[column]
  tcpc = tcpc.unique()
  # Only patients with some chronic condition; such a patient may have had no procedure.
  bene_chrons_tcpc = bene_with_chrons.join(tcpc)
  bene_chrons_tcpc['relation'] = 'underwent'
  g = g.add_edges(bene_chrons_tcpc, src_field='desynpuf_id', dst_field='prcdr_cd')
  print(g.summary())

  # Add edges to the graph indicating which patient had which medicine.
  pde = SFrame.read_csv(file_path + "prescribed_drugs.csv")
  pde = pde.unique()
  # Only patients with some chronic condition; such a patient may have had no medicine.
  bene_chrons_pde = bene_with_chrons.join(pde)
  bene_chrons_pde['relation'] = 'had_drug'
  g = g.add_edges(bene_chrons_pde, src_field='desynpuf_id', dst_field='substancename')
  print(g.summary())

  return g
Ejemplo n.º 3
0
def build_data_graph(
        file_path="/Users/blahiri/healthcare/documents/recommendation_system/"):
    """Build a graph linking patients to chronic conditions, diagnoses, procedures and drugs.

    Four batches of edges are added, each carrying a 'relation' column set to
    'had_chronic', 'diagnosed_with', 'underwent' or 'had_drug'. The last three
    batches only cover patients having at least one chronic condition flagged
    with the value 1. The graph summary is printed after every batch.

    Args:
        file_path: Prefix (including the trailing separator) prepended to
            every input CSV file name. Defaults to the directory the original
            script used.

    Returns:
        The SGraph containing all four batches of edges.
    """
    beneficiaries = SFrame.read_csv(file_path +
                                    "beneficiary_summary_2008_2009.csv")
    # Collapse all 'chron_*' columns into one dict column; remove_prefix=False
    # keeps the full column name as the dict key.
    bene_packed = beneficiaries.pack_columns(
        column_prefix='chron_',
        dtype=dict,
        new_column_name='chronic_conditions',
        remove_prefix=False)

    # x is a row of bene_packed. Every (name, value) entry of its dict becomes
    # a [name, value, desynpuf_id] list; returning a list of lists makes
    # flat_map emit one output row per entry.
    # The first output column is named 'chronic_condition' up front instead of
    # being renamed later: a discarded rename() result only takes effect on
    # SFrame versions where rename() mutates in place.
    # NOTE(review): Turi Create defaults to inplace=False - confirm installed version.
    # .items() (rather than .iteritems()) runs on both Python 2 and Python 3.
    bene_chrons = bene_packed.flat_map(
        ["chronic_condition", "chronic_condition_value", "desynpuf_id"],
        lambda x: [
            [name, value, x['desynpuf_id']]
            for name, value in x['chronic_conditions'].items()
        ])

    # Keep only the rows whose flag is 1; the flag column is then redundant.
    bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
    del bene_chrons['chronic_condition_value']

    g = SGraph()
    bene_chrons['relation'] = 'had_chronic'
    g = g.add_edges(bene_chrons,
                    src_field='desynpuf_id',
                    dst_field='chronic_condition')
    print(g.summary())

    # Take out the distinct IDs of patients with chronic conditions to avoid
    # repetition in the joins. Built through the constructor so it does not
    # depend on add_column() mutating the frame in place.
    bene_with_chrons = SFrame(
        {'desynpuf_id': bene_chrons['desynpuf_id'].unique()})

    # Claim bookkeeping columns dropped before de-duplicating claim rows.
    claim_columns_to_drop = ('clm_id', 'clm_from_dt', 'clm_thru_dt',
                             'claim_type', 'clm_thru_year')

    # Which patient had which diagnosed condition. The same patient can be
    # diagnosed with the same condition several times a year, hence the
    # de-duplication done by the helper.
    tcdc = SFrame.read_csv(file_path + "transformed_claim_diagnosis_codes.csv")
    for column in claim_columns_to_drop:
        del tcdc[column]
    g = _add_patient_edges(g, bene_with_chrons, tcdc, 'diagnosed_with',
                           'dgns_cd')

    # Which patient had which procedure. prcdr_cd is forced to str so the
    # codes are not type-inferred from the CSV.
    tcpc = SFrame.read_csv(file_path + "transformed_claim_prcdr_codes.csv",
                           column_type_hints={'prcdr_cd': str})
    for column in claim_columns_to_drop:
        del tcpc[column]
    g = _add_patient_edges(g, bene_with_chrons, tcpc, 'underwent', 'prcdr_cd')

    # Which patient had which medicine.
    pde = SFrame.read_csv(file_path + "prescribed_drugs.csv")
    g = _add_patient_edges(g, bene_with_chrons, pde, 'had_drug',
                           'substancename')

    return g


def _add_patient_edges(graph, patients, records, relation, dst_field):
    """Add one edge per distinct record of a selected patient and print the summary.

    `records` is de-duplicated and joined to `patients` without an explicit
    key, so the join matches on the column names both frames share
    (presumably only desynpuf_id - confirm against the CSV schemas). Patients
    without any record contribute no edge. Every joined row gets its
    'relation' column set to `relation` and becomes an edge from its
    'desynpuf_id' value to its `dst_field` value.

    Returns the graph returned by add_edges.
    """
    edges = patients.join(records.unique())
    edges['relation'] = relation
    graph = graph.add_edges(edges,
                            src_field='desynpuf_id',
                            dst_field=dst_field)
    print(graph.summary())
    return graph