コード例 #1
0
def read_cmu_format(details_filename,
                    maxcard_filename,
                    drop_edge_frac=0.0,
                    seed=101):
    """Read a "cmu"-format exchange graph from its details and maxcard files.

    Args:
        details_filename: whitespace-delimited vertex file. The first line is
            skipped; the remaining lines are read with the columns id,
            abo_patient, abo_fonor, wife_patient, pra, in_deg, out_deg,
            is_ndd, is_marginalized.
        maxcard_filename: whitespace-delimited edge file. The first line is
            skipped; the remaining lines are read with the columns src_id,
            tgt_id, weight, c4, c5.
        drop_edge_frac: 0.0 keeps every positive-weight edge; any other value
            must lie strictly inside (0, 1) and is the fraction of
            positive-weight edges that is randomly discarded.
        seed: ``random_state`` handed to ``DataFrame.sample`` when edges are
            dropped.

    Returns:
        A tuple ``(digraph, ndd_list, name)`` where ``digraph`` holds one
        vertex per row with ``is_ndd == 0``, ``ndd_list`` holds one ``Ndd``
        per row with ``is_ndd == 1`` and ``name`` is the base name of
        ``maxcard_filename``.

    Raises:
        AssertionError: if ``drop_edge_frac`` is nonzero and outside (0, 1).
        KeyError: if an edge targets an id that is not a pair, or originates
            from an id that is neither a pair nor an NDD.
    """
    name = os.path.basename(maxcard_filename)

    # read details.input file
    # (sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True)
    df_details = pd.read_csv(
        details_filename,
        names=[
            "id",
            "abo_patient",
            "abo_fonor",
            "wife_patient",
            "pra",
            "in_deg",
            "out_deg",
            "is_ndd",
            "is_marginalized",
        ],
        skiprows=1,
        sep=r"\s+",
    )

    pair_details = df_details.loc[df_details["is_ndd"] == 0]

    # vtx_index[id] gives the index in the digraph
    vtx_index = {
        pair_id: i
        for i, pair_id in enumerate(pair_details["id"].unique())
    }
    digraph = Digraph(len(vtx_index))

    # label sensitized pairs
    for _, row in pair_details.iterrows():
        if row["is_marginalized"]:
            digraph.vs[vtx_index[row["id"]]].sensitized = True

    # read maxcard.input file (edges)
    df_edges = pd.read_csv(
        maxcard_filename,
        names=["src_id", "tgt_id", "weight", "c4", "c5"],
        skiprows=1,
        sep=r"\s+",
    )

    # NOTE: an earlier version called `df_edges.drop(df_edges.index[-1])`
    # here without using the result, so no row was ever removed. The
    # statement was dropped; rows with non-positive weight are excluded by
    # the filter below.
    nonzero_edges = df_edges.loc[df_edges["weight"] > 0]

    # optional: sample from the edges
    if drop_edge_frac != 0.0:
        assert (drop_edge_frac < 1.0) and (drop_edge_frac > 0.0)
        nonzero_edges = nonzero_edges.sample(frac=(1.0 - drop_edge_frac),
                                             random_state=seed)

    # find ndds if they exist; both containers are simply empty otherwise
    ndd_details = df_details.loc[df_details["is_ndd"] == 1]
    ndd_list = [Ndd(id=i) for i in range(len(ndd_details))]

    # ndd_index[id] gives the index in the ndd list
    ndd_index = {
        ndd_id: i
        for i, ndd_id in enumerate(ndd_details["id"].unique())
    }

    # add edges to pairs and ndds
    for _, row in nonzero_edges.iterrows():
        src = row["src_id"]
        tgt_id = vtx_index[row["tgt_id"]]
        weight = row["weight"]
        if src in ndd_index:  # this is an ndd edge
            ndd = ndd_list[ndd_index[src]]
            ndd.add_edge(
                NddEdge(
                    digraph.vs[tgt_id],
                    weight,
                    src_id=ndd.id,
                    src=ndd,
                ))
        else:  # this edge is a pair edge
            src_id = vtx_index[src]
            digraph.add_edge(weight, digraph.vs[src_id], digraph.vs[tgt_id])

    return digraph, ndd_list, name
コード例 #2
0
def read_unos_graph_with_data(directory, cycle_cap, chain_cap):
    """
    Read a unos-format exchange and return a GraphStructure that also carries
    the donor and recipient data frames.

    each unos-format exchange is contained in a subdirectory with the naming format 'KPD_CSV_IO_######'. Each exchange
     subdirectory must contain files of the format:
      - *edgeweights.csv
      - *donor.csv
      - *recipient.csv

    Args:
        directory: path of the exchange subdirectory; exactly one file must
            match each of the three patterns above.
        cycle_cap: forwarded unchanged to GraphStructure.
        chain_cap: forwarded unchanged to GraphStructure.

    Returns:
        A GraphStructure built from the pair digraph, the list of Ndd
        objects, the two caps, the directory base name and the donor and
        recipient data frames (column names lower-cased).

    Raises:
        AssertionError: if a required file is missing or duplicated, if the
            donor file has no "abo" column, or if the edge file's columns do
            not match the expected header.
        Exception: if the recipient file has neither an "abo" nor an
            "abo blood group" column.
        KidneyReadException: on a self-loop or other invalid edge.
        KeyError: if an NDD edge targets an id that is not a digraph vertex.
    """

    # strip one trailing separator so that basename() is not empty
    if directory.endswith(os.sep):
        name = os.path.basename(directory[:-1])
    else:
        name = os.path.basename(directory)

    # look for  files
    edge_files = glob.glob(os.path.join(directory, "*edgeweights.csv"))
    donor_files = glob.glob(os.path.join(directory, "*donor.csv"))
    recip_files = glob.glob(os.path.join(directory, "*recipient.csv"))

    # there should only be one of each file
    assert len(donor_files) == 1
    assert len(recip_files) == 1
    assert len(edge_files) == 1

    edge_filename = edge_files[0]

    df_donor = pd.read_csv(donor_files[0])
    df_recip = pd.read_csv(recip_files[0])

    # make all cols lowercase
    df_donor.columns = [c.lower() for c in df_donor.columns]
    df_recip.columns = [c.lower() for c in df_recip.columns]

    # -- add columns for missing data if they don't exist
    # (the cpra check used to appear twice; the second copy could never fire)
    if "cpra" not in df_recip.columns:
        logger.info("CPRA column not found")
        df_recip["cpra"] = NULL_KPD_DATA

    if "highly_sensitized" not in df_recip.columns:
        logger.info("COL NOT FOUND: highly_sensitized")
        df_recip["highly_sensitized"] = NULL_KPD_DATA

    if "abo" not in df_recip.columns:
        if "abo blood group" in df_recip.columns:
            df_recip["abo"] = df_recip["abo blood group"]
        else:
            raise Exception("no abo column found")

    # validate donor data
    assert "abo" in df_donor.columns

    # validate recip data
    assert "abo" in df_recip.columns
    assert "cpra" in df_recip.columns
    assert "highly_sensitized" in df_recip.columns

    # normalize the abo strings through simple_string(non_numeric=True)
    # (per the original comment: remove abo subtypes and make lowercase)
    df_donor["abo"] = df_donor["abo"].apply(
        lambda x: simple_string(x, non_numeric=True))
    df_recip["abo"] = df_recip["abo"].apply(
        lambda x: simple_string(x, non_numeric=True))

    df_edges = pd.read_csv(edge_filename)

    expected_columns = [
        "KPD Match Run ID",
        "KPD Candidate ID",
        "Candidate's KPD Pair ID",
        "KPD Donor ID",
        "Donor's KPD Pair ID",
        "Total Weight",
    ]

    assert len(expected_columns) == len(df_edges.columns)

    # headers are compared after passing both sides through simple_string
    for expected, found in zip(expected_columns, df_edges.columns):
        assert simple_string(expected) == simple_string(found)

    df_edges.columns = [
        "match_run",
        "patient_id",
        "patient_pair_id",
        "donor_id",
        "donor_paired_patient_id",
        "weight",
    ]

    # last column is edge weights -- only take nonzero edges
    nonzero_edges = df_edges.loc[df_edges["weight"] > 0]

    # an edge whose donor has no paired patient is an NDD edge
    is_ndd_edge = nonzero_edges["donor_paired_patient_id"].isnull()

    # remove NDD edges
    kpd_edges = nonzero_edges.loc[~is_ndd_edge]

    # get unique vertex ids
    # Note, in the *edgeweights.csv files:
    # - "KPD Candidate ID" (or "patient_id" here) is the patient/recipient's UNOS ID
    # - "Donor's KPD Pair ID" is the UNOS ID of the donor's associated patient (or None if the donor is an NDD)
    vtx_id = set(
        list(kpd_edges["patient_id"].unique()) +
        list(kpd_edges["donor_paired_patient_id"].unique()))

    # vtx_index[id] gives the index in the digraph
    vtx_count = len(vtx_id)
    vtx_index = {unos_id: i for i, unos_id in enumerate(vtx_id)}
    vtx_index_to_id = {v: k for k, v in vtx_index.items()}

    digraph = Digraph(vtx_count, aux_vertex_id=vtx_index_to_id)

    warned = False
    for _, row in kpd_edges.iterrows():
        src_id = vtx_index[row["donor_paired_patient_id"]]
        tgt_id = vtx_index[row["patient_id"]]
        weight = row["weight"]
        if src_id < 0 or src_id >= vtx_count:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(src_id))
        if tgt_id < 0 or tgt_id >= vtx_count:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(tgt_id))
        if src_id == tgt_id:
            raise KidneyReadException(
                "Self-loop from {0} to {0} not permitted".format(src_id))
        # warn only once per file; `and not` replaces the former `& ~warned`,
        # which relied on bitwise inversion of a bool
        if not warned and digraph.edge_exists(digraph.vs[src_id],
                                              digraph.vs[tgt_id]):
            print(
                "# WARNING: Duplicate edge in file: {}".format(edge_filename))
            warned = True
        if weight == 0:
            raise KidneyReadException("Zero-weight edge from {} to {}".format(
                src_id, tgt_id))

        # for the donor pair, add the donor ID to the vertex's set of donor IDs
        digraph.vs[src_id].donor_set.add(row["donor_id"])

        digraph.add_edge(weight,
                         digraph.vs[src_id],
                         digraph.vs[tgt_id],
                         edge_data=row.to_dict())

    # now read NDDs - take only NDD edges
    ndd_edges = nonzero_edges.loc[is_ndd_edge]
    ndd_id = set(ndd_edges["donor_id"].unique())
    ndd_count = len(ndd_id)

    ndd_list = [Ndd(id=i) for i in range(ndd_count)]

    # ndd_index[id] gives the index in ndd_list
    ndd_index = {donor_id: i for i, donor_id in enumerate(ndd_id)}

    # remember the original donor id on each Ndd
    for donor_id, i in ndd_index.items():
        ndd_list[i].aux_id = donor_id

    # (a write-only `edge_exists` matrix that was never consulted has been
    # removed; duplicate NDD edges are not detected here, as before)
    for _, row in ndd_edges.iterrows():
        src_id = ndd_index[row["donor_id"]]
        # NOTE(review): NDD targets are looked up by "patient_pair_id" while
        # pair edges use "patient_id" -- confirm this is intended.
        tgt_id = vtx_index[row["patient_pair_id"]]
        weight = row["weight"]
        if src_id < 0 or src_id >= ndd_count:
            raise KidneyReadException(
                "NDD index {} out of range.".format(src_id))
        if tgt_id < 0 or tgt_id >= digraph.n:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(tgt_id))

        ndd = ndd_list[src_id]
        ndd.add_edge(
            NddEdge(
                digraph.vs[tgt_id],
                weight,
                src_id=ndd.id,
                src=ndd,
            ))

    graph = GraphStructure(
        digraph,
        ndd_list,
        cycle_cap,
        chain_cap,
        name=name,
        df_donor=df_donor,
        df_recip=df_recip,
    )

    return graph
コード例 #3
0
def read_unos_graph(directory, cycle_cap, chain_cap):
    """Read a unos-format exchange and return it as a GraphStructure.

    each unos-format exchange is contained in a subdirectory with the naming format 'KPD_CSV_IO_######'. Each exchange
     subdirectory must contain a file of the format ########_edgeweights.csv

    Args:
        directory: path of the exchange subdirectory; exactly one file must
            match ``*edgeweights.csv``.
        cycle_cap: forwarded unchanged to GraphStructure.
        chain_cap: forwarded unchanged to GraphStructure.

    Returns:
        A GraphStructure built from the pair digraph, the list of Ndd
        objects, the two caps and the directory base name.

    Raises:
        AssertionError: if there is not exactly one edge-weights file or its
            header differs from the expected column names.
        KidneyReadException: on a self-loop or other invalid edge.
        KeyError: if an NDD edge targets an id that is not a digraph vertex.
    """
    # look for edge files
    edge_files = glob.glob(os.path.join(directory, "*edgeweights.csv"))

    # strip one trailing separator so that basename() is not empty
    if directory.endswith(os.sep):
        name = os.path.basename(directory[:-1])
    else:
        name = os.path.basename(directory)

    # there should only be one edgeweights file
    assert len(edge_files) == 1

    edge_filename = edge_files[0]

    df = pd.read_csv(edge_filename)

    expected_columns = [
        "KPD Match Run ID",
        "KPD Candidate ID",
        "Candidate's KPD Pair ID",
        "KPD Donor ID",
        "Donor's KPD Pair ID",
        "Total Weight",
    ]

    # Compare as plain lists: `df.columns == [...]` is an element-wise
    # comparison whose result cannot be used as a single truth value, so the
    # former assert raised ValueError for every input.
    assert list(df.columns) == expected_columns

    df.columns = [
        "match_run",
        "patient_id",
        "patient_pair_id",
        "donor_id",
        "donor_paired_patient_id",
        "weight",
    ]

    # last column is edge weights -- only take nonzero edges
    nonzero_edges = df.loc[df["weight"] > 0]

    # an edge whose donor has no paired patient is an NDD edge
    is_ndd_edge = nonzero_edges["donor_paired_patient_id"].isnull()

    # remove NDD edges
    kpd_edges = nonzero_edges.loc[~is_ndd_edge]

    # get unique vertex ids
    # Note, in the *edgeweights.csv files:
    # - "KPD Candidate ID" (or "patient_id" here) is the patient/recipient's UNOS ID
    # - "Donor's KPD Pair ID" is the UNOS ID of the donor's associated patient (or None if the donor is an NDD)
    vtx_id = set(
        list(kpd_edges["patient_id"].unique()) +
        list(kpd_edges["donor_paired_patient_id"].unique()))

    vtx_count = len(vtx_id)
    digraph = Digraph(vtx_count)

    # vtx_index[id] gives the index in the digraph
    vtx_index = {unos_id: i for i, unos_id in enumerate(vtx_id)}

    warned = False
    for _, row in kpd_edges.iterrows():
        src_id = vtx_index[row["donor_paired_patient_id"]]
        tgt_id = vtx_index[row["patient_id"]]
        weight = row["weight"]
        if src_id < 0 or src_id >= vtx_count:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(src_id))
        if tgt_id < 0 or tgt_id >= vtx_count:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(tgt_id))
        if src_id == tgt_id:
            raise KidneyReadException(
                "Self-loop from {0} to {0} not permitted".format(src_id))
        # warn only once per file; `and not` replaces the former `& ~warned`,
        # which relied on bitwise inversion of a bool
        if not warned and digraph.edge_exists(digraph.vs[src_id],
                                              digraph.vs[tgt_id]):
            print(
                "# WARNING: Duplicate edge in file: {}".format(edge_filename))
            warned = True
        if weight == 0:
            raise KidneyReadException("Zero-weight edge from {} to {}".format(
                src_id, tgt_id))

        digraph.add_edge(weight, digraph.vs[src_id], digraph.vs[tgt_id])

    # now read NDDs - take only NDD edges
    ndd_edges = nonzero_edges.loc[is_ndd_edge]
    ndd_id = set(ndd_edges["donor_id"].unique())
    ndd_count = len(ndd_id)

    ndd_list = [Ndd(id=i) for i in range(ndd_count)]

    # ndd_index[id] gives the index in ndd_list
    ndd_index = {donor_id: i for i, donor_id in enumerate(ndd_id)}

    # (a write-only `edge_exists` matrix that was never consulted has been
    # removed; duplicate NDD edges are not detected here, as before)
    for _, row in ndd_edges.iterrows():
        src_id = ndd_index[row["donor_id"]]
        # NOTE(review): NDD targets are looked up by "patient_pair_id" while
        # pair edges use "patient_id" -- confirm this is intended.
        tgt_id = vtx_index[row["patient_pair_id"]]
        weight = row["weight"]
        if src_id < 0 or src_id >= ndd_count:
            raise KidneyReadException(
                "NDD index {} out of range.".format(src_id))
        if tgt_id < 0 or tgt_id >= digraph.n:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(tgt_id))

        ndd = ndd_list[src_id]
        ndd.add_edge(
            NddEdge(
                digraph.vs[tgt_id],
                weight,
                src_id=ndd.id,
                src=ndd,
            ))

    graph = GraphStructure(digraph, ndd_list, cycle_cap, chain_cap, name=name)

    return graph
コード例 #4
0
def read_CMU_format(details_filename,
                    maxcard_filename,
                    frac_edges=1.0,
                    seed=101):
    """Read a "cmu"-format exchange graph from its details and maxcard files.

    Args:
        details_filename: whitespace-delimited vertex file. The first line is
            skipped; the remaining lines are read with the columns id,
            abo_patient, abo_fonor, wife_patient, pra, in_deg, out_deg,
            is_ndd, is_marginalized.
        maxcard_filename: whitespace-delimited edge file. The first line is
            skipped; the remaining lines are read with the columns src_id,
            tgt_id, weight, c4, c5.
        frac_edges: when below 1, the fraction of positive-weight edges that
            is randomly kept; otherwise every positive-weight edge is used.
        seed: ``random_state`` handed to ``DataFrame.sample`` when sampling.

    Returns:
        A tuple ``(digraph, ndds)``: the pair digraph (one vertex per row with
        ``is_ndd == 0``) and a list with one ``Ndd`` per row with
        ``is_ndd == 1``.

    Raises:
        KeyError: if an edge targets an id that is not a pair, or originates
            from an id that is neither a pair nor an NDD.
    """
    # read details.input file
    # (sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True)
    col_names = [
        'id', 'abo_patient', 'abo_fonor', 'wife_patient', 'pra', 'in_deg',
        'out_deg', 'is_ndd', 'is_marginalized'
    ]
    df_details = pandas.read_csv(details_filename,
                                 names=col_names,
                                 skiprows=1,
                                 sep=r"\s+")

    pair_details = df_details.loc[df_details['is_ndd'] == 0]

    # vtx_index[id] gives the index in the digraph
    vtx_index = {
        pair_id: i
        for i, pair_id in enumerate(pair_details['id'].unique())
    }
    digraph = Digraph(len(vtx_index))

    # label sensitized pairs
    for _, row in pair_details.iterrows():
        if row['is_marginalized']:
            digraph.vs[vtx_index[row['id']]].sensitized = True

    # read maxcard.input file (edges)
    df_edges = pandas.read_csv(maxcard_filename,
                               names=['src_id', 'tgt_id', 'weight', 'c4', 'c5'],
                               skiprows=1,
                               sep=r"\s+")

    # NOTE: an earlier version called `df_edges.drop(df_edges.index[-1])`
    # here without using the result, so no row was ever removed. The
    # statement was dropped; rows with non-positive weight are excluded by
    # the filter below.
    nonzero_edges = df_edges.loc[df_edges['weight'] > 0]

    # sample from the edges, if we are supposed to sparsify...
    if frac_edges < 1:
        final_edges = nonzero_edges.sample(frac=frac_edges, random_state=seed)
    else:
        final_edges = nonzero_edges

    # find ndds if they exist; both containers are simply empty otherwise
    ndd_details = df_details.loc[df_details['is_ndd'] == 1]
    ndds = [Ndd() for _ in range(len(ndd_details))]

    # ndd_index[id] gives the index in the ndd list
    ndd_index = {
        ndd_id: i
        for i, ndd_id in enumerate(ndd_details['id'].unique())
    }

    # add edges to pairs and ndds
    for _, row in final_edges.iterrows():
        src = row['src_id']
        tgt_id = vtx_index[row['tgt_id']]
        weight = row['weight']
        # `in` replaces dict.has_key(), which does not exist in Python 3
        if src in ndd_index:  # this is an ndd edge
            ndds[ndd_index[src]].add_edge(NddEdge(digraph.vs[tgt_id], weight))
        else:  # this edge is a pair edge
            src_id = vtx_index[src]
            digraph.add_edge(weight, digraph.vs[src_id], digraph.vs[tgt_id])

    return digraph, ndds
コード例 #5
0
    def er_randomgraph(cls, num_vertices, p, seed, cycle_cap=3, chain_cap=4):
        """
        Generate an Erdos-Renyi random graph with a specified number of
        vertices and edge probability.

        A random 0/1 adjacency matrix (no self-loops) is drawn first. Vertices
        with no incoming but at least one outgoing edge become NDDs, vertices
        with at least one incoming edge become pair vertices, and vertices
        with neither are dropped.

        Every edge is created with weight 1.0; the finished graph is then
        passed to ``initialize_random_edge_weights`` together with the same
        random state.

        Args:
            num_vertices: number of vertices drawn before NDDs and isolated
                vertices are separated out.
            p: probability of each directed edge.
            seed: seed of the ``numpy.random.RandomState`` used throughout.
            cycle_cap: forwarded to ``cls``.
            chain_cap: forwarded to ``cls``.

        Returns:
            The graph object produced by ``cls(...)``.
        """

        rs = np.random.RandomState(seed)

        name = f"random_n_{num_vertices}_p_{p}_s_{seed}"

        weight = 1.0

        # generate random adjmat
        # adjmat[i, j] is an edge from vertex i to vertex j
        adjmat = rs.choice([1, 0], (num_vertices, num_vertices), p=[p, 1 - p])

        # set the diagonal to 0 (no self-loops)
        for i in range(num_vertices):
            adjmat[i, i] = 0

        in_deg = adjmat.sum(axis=0)
        out_deg = adjmat.sum(axis=1)

        # any vertex with zero in-degree and positive out-degree is an ndd
        ndd_inds = [
            i for i in range(num_vertices)
            if in_deg[i] == 0 and out_deg[i] > 0
        ]

        # pair vertices are exactly those with incoming edges; built in
        # ascending id order so the numbering is the same on every run
        vertex_inds = [i for i in range(num_vertices) if in_deg[i] > 0]

        # take an id (0 to num_vertices - 1) to an index (0 to len(vertex_inds) - 1)
        vtx_id_to_ind = {vtx_id: ind for ind, vtx_id in enumerate(vertex_inds)}

        # create ndds
        ndd_list = [Ndd(id=i) for i in range(len(ndd_inds))]

        # create digraph
        digraph = Digraph(len(vertex_inds))

        # add ndd edges
        # adjmat is indexed by the original vertex id (ndd_ind), not by the
        # position of the ndd in ndd_list
        for ndd, ndd_ind in zip(ndd_list, ndd_inds):
            for recip_id in adjmat[ndd_ind, :].nonzero()[0]:
                ndd.add_edge(
                    NddEdge(
                        digraph.vs[vtx_id_to_ind[recip_id]],
                        weight,
                        src_id=ndd.id,
                        src=ndd,
                    ))

        # add pair-pair edges
        # adjmat is indexed by the original vertex id (v_ind), while the
        # digraph uses the compacted index
        for v_ind in vertex_inds:
            src_vertex = digraph.vs[vtx_id_to_ind[v_ind]]
            for recip_id in adjmat[v_ind, :].nonzero()[0]:
                digraph.add_edge(weight, src_vertex,
                                 digraph.vs[vtx_id_to_ind[recip_id]])

        graph = cls(digraph,
                    ndd_list,
                    cycle_cap=cycle_cap,
                    chain_cap=chain_cap,
                    name=name)

        # set random edge weights
        initialize_random_edge_weights(graph, rs)

        return graph
コード例 #6
0
def read_unos_graph(directory):
    """Read a unos-format exchange from ``directory``.

    each unos-format exchange is contained in a subdirectory with the naming format 'KPD_CSV_IO_######'. Each exchange
    subdirectory must contain a file of the format ########_edgeweights.csv

    Args:
        directory: path of the exchange subdirectory; exactly one file must
            match ``*edgeweights.csv``. Its first line is skipped and the
            columns are read as match_run, patient_id, patient_pair_id,
            donor_id, donor_pair_id, weight.

    Returns:
        A tuple ``(digraph, ndd_list, name)``: the pair digraph, the list of
        Ndd objects (empty when the file has no NDD edges) and the directory
        base name.

    Raises:
        AssertionError: if there is not exactly one edge-weights file.
        KidneyReadException: on a self-loop or other invalid edge.
        KeyError: if an NDD edge targets an id that is not a digraph vertex.
    """
    # look for edge files
    edge_files = glob.glob(directory + os.sep + '*edgeweights.csv')

    # strip one trailing separator so that basename() is not empty
    if directory.endswith(os.sep):
        name = os.path.basename(directory[:-1])
    else:
        name = os.path.basename(directory)

    # there should only be one edgeweights file
    assert len(edge_files) == 1

    edge_filename = edge_files[0]

    col_names = [
        'match_run', 'patient_id', 'patient_pair_id', 'donor_id',
        'donor_pair_id', 'weight'
    ]
    df = pd.read_csv(edge_filename, names=col_names, skiprows=1)

    # last column is edge weights -- only take nonzero edges
    nonzero_edges = df.loc[df['weight'] > 0]

    # an edge whose donor has no pair id is an NDD edge
    is_ndd_edge = nonzero_edges['donor_pair_id'].isnull()

    # remove NDD edges
    kpd_edges = nonzero_edges.loc[~is_ndd_edge]

    # get unique vertex ids
    vtx_id = set(
        list(kpd_edges['patient_id'].unique()) +
        list(kpd_edges['donor_pair_id'].unique()))

    vtx_count = len(vtx_id)
    digraph = Digraph(vtx_count)

    # vtx_index[id] gives the index in the digraph
    vtx_index = {unos_id: i for i, unos_id in enumerate(vtx_id)}

    warned = False
    for _, row in kpd_edges.iterrows():
        src_id = vtx_index[row['donor_pair_id']]
        tgt_id = vtx_index[row['patient_id']]
        weight = row['weight']
        if src_id < 0 or src_id >= vtx_count:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(src_id))
        if tgt_id < 0 or tgt_id >= vtx_count:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(tgt_id))
        if src_id == tgt_id:
            raise KidneyReadException(
                "Self-loop from {0} to {0} not permitted".format(src_id))
        # warn only once per file; `and not` replaces the former `& ~warned`,
        # which relied on bitwise inversion of a bool
        if not warned and digraph.edge_exists(digraph.vs[src_id],
                                              digraph.vs[tgt_id]):
            print(
                "# WARNING: Duplicate edge in file: {}".format(edge_filename))
            warned = True
        if weight == 0:
            raise KidneyReadException("Zero-weight edge from {} to {}".format(
                src_id, tgt_id))

        digraph.add_edge(weight, digraph.vs[src_id], digraph.vs[tgt_id])

    # now read NDDs - take only NDD edges
    # This section used to sit inside the loop above, so it was repeated for
    # every pair edge and `ndd_list` was never assigned when there were no
    # pair edges. It now runs exactly once.
    ndd_edges = nonzero_edges.loc[is_ndd_edge]
    ndd_id = set(ndd_edges['donor_id'].unique())
    ndd_count = len(ndd_id)

    ndd_list = [Ndd(id=i) for i in range(ndd_count)]

    # ndd_index[id] gives the index in ndd_list
    ndd_index = {donor_id: i for i, donor_id in enumerate(ndd_id)}

    # (a write-only `edge_exists` matrix that was never consulted has been
    # removed; duplicate NDD edges are not detected here, as before)
    for _, row in ndd_edges.iterrows():
        src_id = ndd_index[row['donor_id']]
        # NOTE(review): NDD targets are looked up by 'patient_pair_id' while
        # pair edges use 'patient_id' -- confirm this is intended.
        tgt_id = vtx_index[row['patient_pair_id']]
        weight = row['weight']
        if src_id < 0 or src_id >= ndd_count:
            raise KidneyReadException(
                "NDD index {} out of range.".format(src_id))
        if tgt_id < 0 or tgt_id >= digraph.n:
            raise KidneyReadException(
                "Vertex index {} out of range.".format(tgt_id))

        ndd_list[src_id].add_edge(
            NddEdge(digraph.vs[tgt_id],
                    weight,
                    src_id=ndd_list[src_id].id))

    return digraph, ndd_list, name
コード例 #7
0
def read_cmu_format(details_filename,
                    maxcard_filename,
                    frac_edges=None,
                    seed=101):
    """Read a "cmu"-format exchange graph from its details and maxcard files.

    Args:
        details_filename: whitespace-delimited vertex file. The first line is
            skipped; the remaining lines are read with the columns id,
            abo_patient, abo_fonor, wife_patient, pra, in_deg, out_deg,
            is_ndd, is_marginalized.
        maxcard_filename: whitespace-delimited edge file. The first line is
            skipped; the remaining lines are read with the columns src_id,
            tgt_id, weight, c4, c5.
        frac_edges: None keeps every positive-weight edge; otherwise a value
            strictly inside (0, 1) giving the fraction of positive-weight
            edges that is randomly kept.
        seed: ``random_state`` handed to ``DataFrame.sample`` when sampling.

    Returns:
        A tuple ``(digraph, ndd_list, name)`` where ``digraph`` holds one
        vertex per row with ``is_ndd == 0``, ``ndd_list`` holds one ``Ndd``
        per row with ``is_ndd == 1`` and ``name`` is the base name of
        ``maxcard_filename``.

    Raises:
        AssertionError: if ``frac_edges`` is given and outside (0, 1).
        KeyError: if an edge targets an id that is not a pair, or originates
            from an id that is neither a pair nor an NDD.
    """
    name = os.path.basename(maxcard_filename)

    # read details.input file
    # (sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True)
    col_names = [
        'id', 'abo_patient', 'abo_fonor', 'wife_patient', 'pra', 'in_deg',
        'out_deg', 'is_ndd', 'is_marginalized'
    ]
    df_details = pd.read_csv(details_filename,
                             names=col_names,
                             skiprows=1,
                             sep=r"\s+")

    pair_details = df_details.loc[df_details['is_ndd'] == 0]

    # vtx_index[id] gives the index in the digraph
    vtx_index = {
        pair_id: i
        for i, pair_id in enumerate(pair_details['id'].unique())
    }
    digraph = Digraph(len(vtx_index))

    # label sensitized pairs
    for _, row in pair_details.iterrows():
        if row['is_marginalized']:
            digraph.vs[vtx_index[row['id']]].sensitized = True

    # read maxcard.input file (edges)
    df_edges = pd.read_csv(maxcard_filename,
                           names=['src_id', 'tgt_id', 'weight', 'c4', 'c5'],
                           skiprows=1,
                           sep=r"\s+")

    # NOTE: an earlier version called `df_edges.drop(df_edges.index[-1])`
    # here without using the result, so no row was ever removed. The
    # statement was dropped; rows with non-positive weight are excluded by
    # the filter below.
    nonzero_edges = df_edges.loc[df_edges['weight'] > 0]

    # optional: sample from the edges
    if frac_edges is not None:
        assert (frac_edges < 1.0) and (frac_edges > 0.0)
        nonzero_edges = nonzero_edges.sample(frac=frac_edges,
                                             random_state=seed)

    # find ndds if they exist; both containers are simply empty otherwise
    ndd_details = df_details.loc[df_details['is_ndd'] == 1]
    ndd_list = [Ndd(id=i) for i in range(len(ndd_details))]

    # ndd_index[id] gives the index in the ndd list
    ndd_index = {
        ndd_id: i
        for i, ndd_id in enumerate(ndd_details['id'].unique())
    }

    # add edges to pairs and ndds
    for _, row in nonzero_edges.iterrows():
        src = row['src_id']
        tgt_id = vtx_index[row['tgt_id']]
        weight = row['weight']
        if src in ndd_index:  # this is an ndd edge
            ndd = ndd_list[ndd_index[src]]
            ndd.add_edge(NddEdge(digraph.vs[tgt_id], weight, src_id=ndd.id))
        else:  # this edge is a pair edge
            src_id = vtx_index[src]
            digraph.add_edge(weight, digraph.vs[src_id], digraph.vs[tgt_id])

    return digraph, ndd_list, name
コード例 #8
0
def read_new_format_unos_graph(edge_filename,
                               cycle_cap,
                               chain_cap,
                               logger=None):
    """Read a new-format UNOS edge-weights file into a GraphStructure.

    The file must have exactly seven columns whose headers, after being
    passed through ``simple_string``, match KPD_Match_Run_ID,
    Candidate_pt_code, Candidates_pair_pt_code, Donor_pt_code,
    Donors_pair_pt_code, Total_Weight and non_directed. Only rows with a
    positive weight are used; rows with ``non_directed == 0`` become digraph
    edges and rows with ``non_directed == 1`` become NDD edges.

    Args:
        edge_filename: path of the CSV file; also used as the graph name.
        cycle_cap: forwarded unchanged to GraphStructure.
        chain_cap: forwarded unchanged to GraphStructure.
        logger: forwarded unchanged to GraphStructure.

    Returns:
        A GraphStructure in which every edge of ``all_edge_list`` has its
        ``data["patient_ctr"]`` and ``data["donor_ctr"]`` set to 0.

    Raises:
        KidneyReadException: if the header does not match, or on a self-loop
            or other invalid edge.
        KeyError: if an NDD edge targets an id that is not a digraph vertex.
    """
    df_edges = pd.read_csv(edge_filename)

    expected_columns = [
        "KPD_Match_Run_ID",
        "Candidate_pt_code",
        "Candidates_pair_pt_code",
        "Donor_pt_code",
        "Donors_pair_pt_code",
        "Total_Weight",
        "non_directed",
    ]

    if len(expected_columns) != len(df_edges.columns):
        raise KidneyReadException(
            f"Edgeweights file {edge_filename} has {len(df_edges.columns)} columns. "
            f"Expected {len(expected_columns)}.")

    for i_col, expected in enumerate(expected_columns):
        found = simple_string(df_edges.columns[i_col])
        if simple_string(expected) != found:
            # a space now separates the two sentences of the message
            raise KidneyReadException(
                f"Column {(i_col + 1)} in *edgeweights.csv should be {simple_string(expected)}. "
                f"Instead we found column {found}.")

    df_edges.columns = [
        "match_run",
        "patient_id",
        "patient_pair_id",
        "donor_id",
        "donor_paired_patient_id",
        "weight",
        "non_directed",
    ]

    positive_weight = df_edges["weight"] > 0

    # directed (pair) edges
    kpd_edges = df_edges.loc[positive_weight
                             & (df_edges["non_directed"] == 0)]

    vtx_id = set(
        list(kpd_edges["patient_id"].unique()) +
        list(kpd_edges["donor_paired_patient_id"].unique()))
    vtx_count = len(vtx_id)
    digraph = Digraph(vtx_count)

    # vtx_index[id] gives the index in the digraph
    vtx_index = {pt_code: i for i, pt_code in enumerate(vtx_id)}

    warned = False
    for _, row in kpd_edges.iterrows():
        src_id = vtx_index[row["donor_paired_patient_id"]]
        tgt_id = vtx_index[row["patient_id"]]
        weight = row["weight"]
        if src_id < 0 or src_id >= vtx_count:
            raise KidneyReadException(f"Vertex index {src_id} out of range.")
        if tgt_id < 0 or tgt_id >= vtx_count:
            raise KidneyReadException(f"Vertex index {tgt_id} out of range.")
        if src_id == tgt_id:
            raise KidneyReadException(
                f"Self-loop from {src_id} to {src_id} not permitted")
        # warn only once per file; `and not` replaces the former `& ~warned`,
        # which relied on bitwise inversion of a bool
        if not warned and digraph.edge_exists(digraph.vs[src_id],
                                              digraph.vs[tgt_id]):
            print(f"# WARNING: Duplicate edge in file: {edge_filename}")
            warned = True
        if weight == 0:
            raise KidneyReadException(
                f"Zero-weight edge from {src_id} to {tgt_id}")

        digraph.add_edge(
            weight,
            digraph.vs[src_id],
            digraph.vs[tgt_id],
            edge_data={
                "donor_id": row["donor_id"],
                "patient_id": row["patient_id"]
            },
        )

    # non-directed donor edges
    ndd_edges = df_edges.loc[positive_weight
                             & (df_edges["non_directed"] == 1)]
    ndd_id = set(ndd_edges["donor_id"].unique())
    ndd_count = len(ndd_id)

    ndd_list = [Ndd(id=i) for i in range(ndd_count)]

    # ndd_index[id] gives the index in ndd_list
    ndd_index = {donor_id: i for i, donor_id in enumerate(ndd_id)}

    # (a write-only `edge_exists` matrix that was never consulted has been
    # removed; duplicate NDD edges are not detected here, as before)
    for _, row in ndd_edges.iterrows():
        src_id = ndd_index[row["donor_id"]]
        # NOTE(review): NDD targets are looked up by "patient_pair_id" while
        # pair edges use "patient_id" -- confirm this is intended.
        tgt_id = vtx_index[row["patient_pair_id"]]
        weight = row["weight"]
        if src_id < 0 or src_id >= ndd_count:
            raise KidneyReadException(f"NDD index {src_id} out of range.")
        if tgt_id < 0 or tgt_id >= digraph.n:
            raise KidneyReadException(
                f"Vertex index {tgt_id} out of range.")

        ndd = ndd_list[src_id]
        ndd.add_edge(
            NddEdge(
                digraph.vs[tgt_id],
                weight,
                src_id=ndd.id,
                src=ndd,
                data={
                    "donor_id": row["donor_id"],
                    "patient_id": row["patient_id"]
                },
            ))

    print("ndd count", len(ndd_list))
    graph = GraphStructure(digraph,
                           ndd_list,
                           cycle_cap,
                           chain_cap,
                           name=edge_filename,
                           logger=logger)

    # start every edge with zeroed patient/donor counters
    for e in graph.all_edge_list:
        e.data["patient_ctr"] = 0
        e.data["donor_ctr"] = 0

    return graph