Beispiel #1
0
def read_ndds(lines, digraph):
    """Read NDDs from a sequence of strings in the .ndd format.

    Layout of ``lines``:
      * line 0: ``<ndd_count> <edge_count>``
      * lines 1..edge_count: ``<ndd index> <vertex index> <weight>``
      * line edge_count + 1: a terminator whose first token is ``-1``

    Args:
        lines: Sequence of text lines making up the .ndd input.
        digraph: Graph the NDD edges point into; its ``vs`` (indexable
            vertices) and ``n`` (vertex count) attributes are used.

    Returns:
        A list of ``ndd_count`` ``Ndd`` objects (ids ``0..ndd_count - 1``),
        each carrying the edges declared for it.

    Raises:
        KidneyReadException: if there are fewer lines than the declared edge
            count requires, if the terminator line is missing or is not
            ``-1``, if an NDD or vertex index is out of range, or if an edge
            is listed twice.
    """
    ndd_count, edge_count = [int(x) for x in lines[0].split()]
    ndds = [Ndd(id=i) for i in range(ndd_count)]

    # Validate the length *before* touching lines[edge_count + 1]; otherwise
    # a truncated file surfaces as an IndexError instead of a read error.
    if len(lines) < edge_count + 2:
        raise KidneyReadException("Incorrect edge count")

    # (src, tgt) pairs already created, so that duplicates can be detected.
    seen_edges = set()

    for line in lines[1:edge_count + 1]:
        tokens = line.split()
        src_id = int(tokens[0])
        tgt_id = int(tokens[1])
        weight = float(tokens[2])
        if src_id < 0 or src_id >= ndd_count:
            raise KidneyReadException(
                "NDD index {} out of range.".format(src_id))
        if tgt_id < 0 or tgt_id >= digraph.n:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(tgt_id))
        if (src_id, tgt_id) in seen_edges:
            raise KidneyReadException(
                "Duplicate edge from NDD {0} to vertex {1}.".format(
                    src_id, tgt_id))
        ndds[src_id].add_edge(
            NddEdge(digraph.vs[tgt_id], weight, src_id=ndds[src_id].id))
        seen_edges.add((src_id, tgt_id))

    # A blank terminator line has no tokens; treat it like a wrong terminator.
    terminator = lines[edge_count + 1].split()
    if not terminator or terminator[0] != "-1":
        raise KidneyReadException("Incorrect edge count")

    return ndds
def read_from_kpd(edgeweights_filename, digraph, vtx_index):
    """Read the NDD (non-directed donor) edges from a KPD edge-weights CSV.

    The file's header row is skipped and its six columns are named
    ``match_run, patient_id, patient_pair_id, donor_id, donor_pair_id,
    weight``. Only rows with ``weight > 0`` and a null ``donor_pair_id`` are
    used; each distinct ``donor_id`` among them becomes one ``Ndd``.

    Duplicate edges are not detected here (that check was disabled in the
    original code), so a repeated row adds a repeated edge.

    Args:
        edgeweights_filename: Path (or file-like object) passed to
            ``pandas.read_csv``.
        digraph: Graph the NDD edges point into; ``vs`` and ``n`` are used.
        vtx_index: Mapping from ``patient_pair_id`` to vertex index in
            ``digraph``.

    Returns:
        ``(ndds, ndd_index)`` where ``ndds`` is a list of ``Ndd`` objects and
        ``ndd_index`` maps each donor id to its position in ``ndds``. When
        the file has no NDD edges both are empty lists (``ndd_index`` is
        ``[]`` rather than ``{}``; kept for backward compatibility).

    Raises:
        KidneyReadException: if a computed NDD or vertex index is out of
            range.
    """
    col_names = [
        'match_run', 'patient_id', 'patient_pair_id', 'donor_id',
        'donor_pair_id', 'weight'
    ]
    df = pandas.read_csv(edgeweights_filename, names=col_names, skiprows=1)

    # Last column is the edge weight -- only keep nonzero edges.
    nonzero_edges = df.loc[df['weight'] > 0]

    # A row without a donor pair id is an edge leaving an NDD.
    ndd_edges = nonzero_edges.loc[nonzero_edges['donor_pair_id'].isnull()]
    ndd_id = set(ndd_edges['donor_id'].unique())
    ndd_count = len(ndd_id)

    if ndd_count == 0:
        return [], []

    ndds = [Ndd(id=i) for i in range(ndd_count)]
    # ndd_index[donor_id] gives the position of that donor in ``ndds``.
    ndd_index = dict(zip(ndd_id, range(ndd_count)))

    for _, row in ndd_edges.iterrows():
        src_id = ndd_index[row['donor_id']]
        # NOTE(review): raises KeyError if the patient pair id is not a key
        # of vtx_index -- confirm callers guarantee this.
        tgt_id = vtx_index[row['patient_pair_id']]
        score = row['weight']
        if src_id < 0 or src_id >= ndd_count:
            raise KidneyReadException(
                "NDD index {} out of range.".format(src_id))
        if tgt_id < 0 or tgt_id >= digraph.n:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(tgt_id))
        ndds[src_id].add_edge(
            NddEdge(digraph.vs[tgt_id], score, src_id=ndds[src_id].id))

    return ndds, ndd_index
def read_unos_graph(directory, cycle_cap, chain_cap):
    """Read a UNOS-format exchange and return it as a ``GraphStructure``.

    Each UNOS-format exchange lives in a subdirectory named like
    'KPD_CSV_IO_######', which must contain exactly one file matching
    ``*edgeweights.csv``.

    Rows with ``weight > 0`` are used. Rows that have a "Donor's KPD Pair ID"
    become edges of the digraph (from the donor's paired patient to the
    candidate); rows without one become edges leaving an NDD.

    Args:
        directory: Path of the exchange subdirectory.
        cycle_cap: Passed through to ``GraphStructure``.
        chain_cap: Passed through to ``GraphStructure``.

    Returns:
        A ``GraphStructure`` built from the digraph and the list of NDDs,
        named after the directory.

    Raises:
        AssertionError: if the directory does not contain exactly one
            edge-weights file.
        KidneyReadException: if the CSV columns are not the expected ones,
            or an edge is a self-loop, has zero weight, or refers to an
            out-of-range index.
    """
    # look for edge files
    edge_files = glob.glob(os.path.join(directory, "*edgeweights.csv"))

    # basename() of a path ending in a separator is "", so drop it first.
    if directory.endswith(os.sep):
        name = os.path.basename(directory[:-1])
    else:
        name = os.path.basename(directory)

    # there should only be one edgeweights file
    assert len(edge_files) == 1

    edge_filename = edge_files[0]

    df = pd.read_csv(edge_filename)

    expected_columns = [
        "KPD Match Run ID",
        "KPD Candidate ID",
        "Candidate's KPD Pair ID",
        "KPD Donor ID",
        "Donor's KPD Pair ID",
        "Total Weight",
    ]

    # Compare as plain lists: ``Index == list`` is elementwise and cannot be
    # used as a single truth value.
    if list(df.columns) != expected_columns:
        raise KidneyReadException(
            "Unexpected columns in {}: {}".format(edge_filename,
                                                  list(df.columns)))

    col_names = [
        "match_run",
        "patient_id",
        "patient_pair_id",
        "donor_id",
        "donor_paired_patient_id",
        "weight",
    ]

    df.columns = col_names

    # last column is edge weights -- only take nonzero edges
    nonzero_edges = df.loc[df["weight"] > 0]

    # remove NDD edges
    kpd_edges = nonzero_edges.loc[
        ~nonzero_edges["donor_paired_patient_id"].isnull()]

    # get unique vertex ids
    # Note, in the *edgeweights.csv files:
    # - "KPD Candidate ID" (or "patient_id" here) is the patient/recipient's UNOS ID
    # - "Donor's KPD Pair ID" is the UNOS ID of the donor's associated patient (or None if the donor is an NDD)
    vtx_id = set(
        list(kpd_edges["patient_id"].unique()) +
        list(kpd_edges["donor_paired_patient_id"].unique()))

    vtx_count = len(vtx_id)
    digraph = Digraph(vtx_count)

    # vtx_index[id] gives the index in the digraph
    vtx_index = dict(zip(vtx_id, range(vtx_count)))

    warned = False  # only print the duplicate-edge warning once per file
    for _, row in kpd_edges.iterrows():
        src_id = vtx_index[row["donor_paired_patient_id"]]
        tgt_id = vtx_index[row["patient_id"]]
        weight = row["weight"]
        if src_id < 0 or src_id >= vtx_count:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(src_id))
        if tgt_id < 0 or tgt_id >= vtx_count:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(tgt_id))
        if src_id == tgt_id:
            raise KidneyReadException(
                "Self-loop from {0} to {0} not permitted".format(src_id))
        if not warned and digraph.edge_exists(digraph.vs[src_id],
                                              digraph.vs[tgt_id]):
            print(
                "# WARNING: Duplicate edge in file: {}".format(edge_filename))
            warned = True
        if weight == 0:
            raise KidneyReadException("Zero-weight edge from {} to {}".format(
                src_id, tgt_id))

        digraph.add_edge(weight, digraph.vs[src_id], digraph.vs[tgt_id])

    # now read NDDs - take only NDD edges
    ndd_edges = nonzero_edges.loc[
        nonzero_edges["donor_paired_patient_id"].isnull()]
    ndd_id = set(ndd_edges["donor_id"].unique())
    ndd_count = len(ndd_id)

    # With no NDD edges both the list and the loop below are simply empty.
    ndd_list = [Ndd(id=i) for i in range(ndd_count)]
    # ndd_index[id] gives the position of that donor in ndd_list
    ndd_index = dict(zip(ndd_id, range(ndd_count)))

    for _, row in ndd_edges.iterrows():
        src_id = ndd_index[row["donor_id"]]
        # NOTE(review): vertices are keyed by patient_id above but looked up
        # by patient_pair_id here; a KeyError results if they differ --
        # confirm the two ids coincide in this file format.
        tgt_id = vtx_index[row["patient_pair_id"]]
        weight = row["weight"]
        if src_id < 0 or src_id >= ndd_count:
            raise KidneyReadException(
                "NDD index {} out of range.".format(src_id))
        if tgt_id < 0 or tgt_id >= digraph.n:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(tgt_id))

        ndd_list[src_id].add_edge(
            NddEdge(
                digraph.vs[tgt_id],
                weight,
                src_id=ndd_list[src_id].id,
                src=ndd_list[src_id],
            ))

    graph = GraphStructure(digraph, ndd_list, cycle_cap, chain_cap, name=name)

    return graph
def read_unos_graph_with_data(directory, cycle_cap, chain_cap):
    """Read a UNOS-format exchange together with its donor/recipient data.

    Each UNOS-format exchange lives in a subdirectory named like
    'KPD_CSV_IO_######', which must contain exactly one file of each kind:
      - *edgeweights.csv
      - *donor.csv
      - *recipient.csv

    Donor and recipient column names are lower-cased. Missing ``cpra`` and
    ``highly_sensitized`` recipient columns are filled with
    ``NULL_KPD_DATA``; a missing recipient ``abo`` column is taken from
    ``abo blood group`` when present. The ``abo`` values of both tables are
    normalised with ``simple_string(x, non_numeric=True)``.

    Args:
        directory: Path of the exchange subdirectory (a trailing path
            separator is tolerated).
        cycle_cap: Passed through to ``GraphStructure``.
        chain_cap: Passed through to ``GraphStructure``.

    Returns:
        A ``GraphStructure`` holding the digraph, the NDD list and the donor
        and recipient data frames, named after the directory.

    Raises:
        AssertionError: if a required file or column is missing or the
            edge-weights columns are not the expected ones.
        Exception: if the recipient file has no usable ``abo`` column.
        KidneyReadException: if an edge is a self-loop, has zero weight, or
            refers to an out-of-range index.
    """
    # basename() of a path ending in a separator is "", so drop it first.
    if directory.endswith(os.sep):
        name = os.path.basename(directory[:-1])
    else:
        name = os.path.basename(directory)

    # look for files
    edge_files = glob.glob(os.path.join(directory, "*edgeweights.csv"))
    donor_files = glob.glob(os.path.join(directory, "*donor.csv"))
    recip_files = glob.glob(os.path.join(directory, "*recipient.csv"))

    # there should only be one of each file
    assert len(donor_files) == 1
    assert len(recip_files) == 1
    assert len(edge_files) == 1

    donor_file = donor_files[0]
    recip_file = recip_files[0]
    edge_filename = edge_files[0]

    df_donor = pd.read_csv(donor_file)
    df_recip = pd.read_csv(recip_file)

    # make all cols lowercase
    df_donor.columns = [c.lower() for c in df_donor.columns]
    df_recip.columns = [c.lower() for c in df_recip.columns]

    # -- add columns for missing data if they don't exist
    if "cpra" not in df_recip.columns:
        logger.info("CPRA column not found")
        df_recip["cpra"] = NULL_KPD_DATA

    if "highly_sensitized" not in df_recip.columns:
        logger.info("COL NOT FOUND: highly_sensitized")
        df_recip["highly_sensitized"] = NULL_KPD_DATA

    if "abo" not in df_recip.columns:
        if "abo blood group" in df_recip.columns:
            df_recip["abo"] = df_recip["abo blood group"]
        else:
            raise Exception("no abo column found")

    # validate donor data
    assert "abo" in df_donor.columns

    # validate recip data
    assert "abo" in df_recip.columns
    assert "cpra" in df_recip.columns
    assert "highly_sensitized" in df_recip.columns

    # remove abo subtypes and make lowercase
    df_donor["abo"] = df_donor["abo"].apply(
        lambda x: simple_string(x, non_numeric=True))
    df_recip["abo"] = df_recip["abo"].apply(
        lambda x: simple_string(x, non_numeric=True))

    df_edges = pd.read_csv(edge_filename)

    expected_columns = [
        "KPD Match Run ID",
        "KPD Candidate ID",
        "Candidate's KPD Pair ID",
        "KPD Donor ID",
        "Donor's KPD Pair ID",
        "Total Weight",
    ]

    assert len(expected_columns) == len(df_edges.columns)

    # Column names are compared after normalisation by simple_string.
    for i_col, expected in enumerate(expected_columns):
        assert simple_string(expected) == simple_string(
            df_edges.columns[i_col])

    col_names = [
        "match_run",
        "patient_id",
        "patient_pair_id",
        "donor_id",
        "donor_paired_patient_id",
        "weight",
    ]

    df_edges.columns = col_names

    # last column is edge weights -- only take nonzero edges
    nonzero_edges = df_edges.loc[df_edges["weight"] > 0]

    # remove NDD edges
    kpd_edges = nonzero_edges.loc[
        ~nonzero_edges["donor_paired_patient_id"].isnull()]

    # get unique vertex ids
    # Note, in the *edgeweights.csv files:
    # - "KPD Candidate ID" (or "patient_id" here) is the patient/recipient's UNOS ID
    # - "Donor's KPD Pair ID" is the UNOS ID of the donor's associated patient (or None if the donor is an NDD)
    vtx_id = set(
        list(kpd_edges["patient_id"].unique()) +
        list(kpd_edges["donor_paired_patient_id"].unique()))

    # vtx_index[id] gives the index in the digraph
    vtx_count = len(vtx_id)
    vtx_index = dict(zip(vtx_id, range(vtx_count)))
    vtx_index_to_id = {v: k for k, v in vtx_index.items()}

    digraph = Digraph(vtx_count, aux_vertex_id=vtx_index_to_id)

    warned = False  # only print the duplicate-edge warning once per file
    for _, row in kpd_edges.iterrows():
        src_id = vtx_index[row["donor_paired_patient_id"]]
        tgt_id = vtx_index[row["patient_id"]]
        weight = row["weight"]
        if src_id < 0 or src_id >= vtx_count:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(src_id))
        if tgt_id < 0 or tgt_id >= vtx_count:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(tgt_id))
        if src_id == tgt_id:
            raise KidneyReadException(
                "Self-loop from {0} to {0} not permitted".format(src_id))
        if not warned and digraph.edge_exists(digraph.vs[src_id],
                                              digraph.vs[tgt_id]):
            print(
                "# WARNING: Duplicate edge in file: {}".format(edge_filename))
            warned = True
        if weight == 0:
            raise KidneyReadException("Zero-weight edge from {} to {}".format(
                src_id, tgt_id))

        # record the donor ID on the donor pair's vertex (donor_set.add is
        # called for every edge, including repeated donors)
        digraph.vs[src_id].donor_set.add(row["donor_id"])

        digraph.add_edge(weight,
                         digraph.vs[src_id],
                         digraph.vs[tgt_id],
                         edge_data=row.to_dict())

    # now read NDDs - take only NDD edges
    ndd_edges = nonzero_edges.loc[
        nonzero_edges["donor_paired_patient_id"].isnull()]
    ndd_id = set(ndd_edges["donor_id"].unique())
    ndd_count = len(ndd_id)

    # With no NDD edges the list and both loops below are simply empty.
    ndd_list = [Ndd(id=i) for i in range(ndd_count)]
    # ndd_index[id] gives the position of that donor in ndd_list
    ndd_index = dict(zip(ndd_id, range(ndd_count)))

    # remember the original donor id on each NDD
    for donor_id, position in ndd_index.items():
        ndd_list[position].aux_id = donor_id

    for _, row in ndd_edges.iterrows():
        src_id = ndd_index[row["donor_id"]]
        # NOTE(review): vertices are keyed by patient_id above but looked up
        # by patient_pair_id here; a KeyError results if they differ --
        # confirm the two ids coincide in this file format.
        tgt_id = vtx_index[row["patient_pair_id"]]
        weight = row["weight"]
        if src_id < 0 or src_id >= ndd_count:
            raise KidneyReadException(
                "NDD index {} out of range.".format(src_id))
        if tgt_id < 0 or tgt_id >= digraph.n:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(tgt_id))

        ndd_list[src_id].add_edge(
            NddEdge(
                digraph.vs[tgt_id],
                weight,
                src_id=ndd_list[src_id].id,
                src=ndd_list[src_id],
            ))

    graph = GraphStructure(
        digraph,
        ndd_list,
        cycle_cap,
        chain_cap,
        name=name,
        df_donor=df_donor,
        df_recip=df_recip,
    )

    return graph
Beispiel #5
0
def read_unos_graph(directory):
    """Read a UNOS-format exchange from ``directory``.

    Each UNOS-format exchange lives in a subdirectory named like
    'KPD_CSV_IO_######', which must contain exactly one file matching
    ``*edgeweights.csv``. The file's header row is skipped and its six
    columns are named ``match_run, patient_id, patient_pair_id, donor_id,
    donor_pair_id, weight``.

    Rows with ``weight > 0`` are used. Rows with a ``donor_pair_id`` become
    digraph edges (from the donor's pair to the patient); rows without one
    become edges leaving an NDD.

    Args:
        directory: Path of the exchange subdirectory.

    Returns:
        ``(digraph, ndd_list, name)``: the ``Digraph``, the list of ``Ndd``
        objects (empty when the file has no NDD edges) and the directory's
        base name.

    Raises:
        AssertionError: if the directory does not contain exactly one
            edge-weights file.
        KidneyReadException: if an edge is a self-loop, has zero weight, or
            refers to an out-of-range index.
    """
    # look for edge files
    edge_files = glob.glob(os.path.join(directory, '*edgeweights.csv'))

    # basename() of a path ending in a separator is "", so drop it first.
    if directory.endswith(os.sep):
        name = os.path.basename(directory[:-1])
    else:
        name = os.path.basename(directory)

    # there should only be one edgeweights file
    assert len(edge_files) == 1

    edge_filename = edge_files[0]

    col_names = [
        'match_run', 'patient_id', 'patient_pair_id', 'donor_id',
        'donor_pair_id', 'weight'
    ]
    df = pd.read_csv(edge_filename, names=col_names, skiprows=1)

    # last column is edge weights -- only take nonzero edges
    nonzero_edges = df.loc[df['weight'] > 0]

    # remove NDD edges
    kpd_edges = nonzero_edges.loc[~nonzero_edges['donor_pair_id'].isnull()]

    # get unique vertex ids
    vtx_id = set(
        list(kpd_edges['patient_id'].unique()) +
        list(kpd_edges['donor_pair_id'].unique()))

    vtx_count = len(vtx_id)
    digraph = Digraph(vtx_count)

    # vtx_index[id] gives the index in the digraph
    vtx_index = dict(zip(vtx_id, range(vtx_count)))

    warned = False  # only print the duplicate-edge warning once per file
    for _, row in kpd_edges.iterrows():
        src_id = vtx_index[row['donor_pair_id']]
        tgt_id = vtx_index[row['patient_id']]
        weight = row['weight']
        if src_id < 0 or src_id >= vtx_count:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(src_id))
        if tgt_id < 0 or tgt_id >= vtx_count:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(tgt_id))
        if src_id == tgt_id:
            raise KidneyReadException(
                "Self-loop from {0} to {0} not permitted".format(src_id))
        if not warned and digraph.edge_exists(digraph.vs[src_id],
                                              digraph.vs[tgt_id]):
            print(
                "# WARNING: Duplicate edge in file: {}".format(edge_filename))
            warned = True
        if weight == 0:
            raise KidneyReadException("Zero-weight edge from {} to {}".format(
                src_id, tgt_id))

        digraph.add_edge(weight, digraph.vs[src_id], digraph.vs[tgt_id])

    # Now read NDDs - take only NDD edges. This must run once, after the
    # digraph is complete, and must run even when there are no KPD edges so
    # that ndd_list is always defined.
    ndd_edges = nonzero_edges.loc[nonzero_edges['donor_pair_id'].isnull()]
    ndd_id = set(ndd_edges['donor_id'].unique())
    ndd_count = len(ndd_id)

    # With no NDD edges both the list and the loop below are simply empty.
    ndd_list = [Ndd(id=i) for i in range(ndd_count)]
    # ndd_index[id] gives the position of that donor in ndd_list
    ndd_index = dict(zip(ndd_id, range(ndd_count)))

    for _, row in ndd_edges.iterrows():
        src_id = ndd_index[row['donor_id']]
        # NOTE(review): vertices are keyed by patient_id above but looked up
        # by patient_pair_id here; a KeyError results if they differ --
        # confirm the two ids coincide in this file format.
        tgt_id = vtx_index[row['patient_pair_id']]
        weight = row['weight']
        if src_id < 0 or src_id >= ndd_count:
            raise KidneyReadException(
                "NDD index {} out of range.".format(src_id))
        if tgt_id < 0 or tgt_id >= digraph.n:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(tgt_id))

        ndd_list[src_id].add_edge(
            NddEdge(digraph.vs[tgt_id],
                    weight,
                    src_id=ndd_list[src_id].id))

    return digraph, ndd_list, name
Beispiel #6
0
def read_new_format_unos_graph(edge_filename,
                               cycle_cap,
                               chain_cap,
                               logger=None):
    """Read a new-format UNOS edge-weights CSV into a ``GraphStructure``.

    The file must have exactly seven columns whose names, after
    normalisation by ``simple_string``, match ``KPD_Match_Run_ID,
    Candidate_pt_code, Candidates_pair_pt_code, Donor_pt_code,
    Donors_pair_pt_code, Total_Weight, non_directed`` in that order.

    Rows with ``weight > 0`` are used: those with ``non_directed == 0``
    become digraph edges (from the donor's paired patient to the candidate),
    those with ``non_directed == 1`` become edges leaving an NDD. Every edge
    carries ``donor_id`` and ``patient_id`` in its data, and after the graph
    is built each edge in ``graph.all_edge_list`` gets ``patient_ctr`` and
    ``donor_ctr`` data entries set to 0.

    Args:
        edge_filename: Path of the CSV file; also used as the graph's name.
        cycle_cap: Passed through to ``GraphStructure``.
        chain_cap: Passed through to ``GraphStructure``.
        logger: Optional logger, passed through to ``GraphStructure``.

    Returns:
        The constructed ``GraphStructure``.

    Raises:
        KidneyReadException: if the columns are not the expected ones, or an
            edge is a self-loop, has zero weight, or refers to an
            out-of-range index.
    """
    df_edges = pd.read_csv(edge_filename)

    expected_columns = [
        "KPD_Match_Run_ID",
        "Candidate_pt_code",
        "Candidates_pair_pt_code",
        "Donor_pt_code",
        "Donors_pair_pt_code",
        "Total_Weight",
        "non_directed",
    ]

    if len(expected_columns) != len(df_edges.columns):
        raise KidneyReadException(
            f"Edgeweights file {edge_filename} has {len(df_edges.columns)} columns. "
            f"Expected {len(expected_columns)}.")

    for i_col, expected in enumerate(expected_columns):
        if simple_string(expected) != simple_string(df_edges.columns[i_col]):
            # Note the trailing space: the two fragments are concatenated.
            raise KidneyReadException(
                f"Column {(i_col + 1)} in *edgeweights.csv should be {simple_string(expected)}. "
                f"Instead we found column {simple_string(df_edges.columns[i_col])}."
            )

    col_names = [
        "match_run",
        "patient_id",
        "patient_pair_id",
        "donor_id",
        "donor_paired_patient_id",
        "weight",
        "non_directed",
    ]

    df_edges.columns = col_names

    # only nonzero-weight edges are used
    has_weight = df_edges["weight"] > 0

    # edges between patient-donor pairs
    kpd_edges = df_edges.loc[has_weight & (df_edges["non_directed"] == 0)]

    vtx_id = set(
        list(kpd_edges["patient_id"].unique()) +
        list(kpd_edges["donor_paired_patient_id"].unique()))
    vtx_count = len(vtx_id)
    digraph = Digraph(vtx_count)

    # vtx_index[id] gives the index in the digraph
    vtx_index = dict(zip(vtx_id, range(vtx_count)))

    warned = False  # only print the duplicate-edge warning once per file
    for _, row in kpd_edges.iterrows():
        src_id = vtx_index[row["donor_paired_patient_id"]]
        tgt_id = vtx_index[row["patient_id"]]
        weight = row["weight"]
        if src_id < 0 or src_id >= vtx_count:
            raise KidneyReadException(f"Vertex index {src_id} out of range.")
        if tgt_id < 0 or tgt_id >= vtx_count:
            raise KidneyReadException(f"Vertex index {tgt_id} out of range.")
        if src_id == tgt_id:
            raise KidneyReadException(
                f"Self-loop from {src_id} to {src_id} not permitted")
        if not warned and digraph.edge_exists(digraph.vs[src_id],
                                              digraph.vs[tgt_id]):
            print(f"# WARNING: Duplicate edge in file: {edge_filename}")
            warned = True
        if weight == 0:
            raise KidneyReadException(
                f"Zero-weight edge from {src_id} to {tgt_id}")

        digraph.add_edge(
            weight,
            digraph.vs[src_id],
            digraph.vs[tgt_id],
            edge_data={
                "donor_id": row["donor_id"],
                "patient_id": row["patient_id"]
            },
        )

    # edges leaving non-directed donors
    ndd_edges = df_edges.loc[has_weight & (df_edges["non_directed"] == 1)]
    ndd_id = set(ndd_edges["donor_id"].unique())
    ndd_count = len(ndd_id)

    # With no NDD edges both the list and the loop below are simply empty.
    ndd_list = [Ndd(id=i) for i in range(ndd_count)]
    # ndd_index[id] gives the position of that donor in ndd_list
    ndd_index = dict(zip(ndd_id, range(ndd_count)))

    for _, row in ndd_edges.iterrows():
        src_id = ndd_index[row["donor_id"]]
        # NOTE(review): vertices are keyed by patient_id above but looked up
        # by patient_pair_id here; a KeyError results if they differ --
        # confirm the two ids coincide in this file format.
        tgt_id = vtx_index[row["patient_pair_id"]]
        weight = row["weight"]
        if src_id < 0 or src_id >= ndd_count:
            raise KidneyReadException(f"NDD index {src_id} out of range.")
        if tgt_id < 0 or tgt_id >= digraph.n:
            raise KidneyReadException(
                f"Vertex index {tgt_id} out of range.")

        ndd_list[src_id].add_edge(
            NddEdge(
                digraph.vs[tgt_id],
                weight,
                src_id=ndd_list[src_id].id,
                src=ndd_list[src_id],
                data={
                    "donor_id": row["donor_id"],
                    "patient_id": row["patient_id"]
                },
            ))

    print("ndd count", len(ndd_list))
    graph = GraphStructure(digraph,
                           ndd_list,
                           cycle_cap,
                           chain_cap,
                           name=edge_filename,
                           logger=logger)

    # initialise the per-edge counters
    for e in graph.all_edge_list:
        e.data["patient_ctr"] = 0
        e.data["donor_ctr"] = 0

    return graph