Esempio n. 1
0
def assign_secondary_structure(pdb):
    """Returns the secondary structure element of every residue of a pdb

    The HELIX and SHEET records of the pdb file are expanded residue by
    residue; every node of the amino acid network that is not covered by one
    of those records is labelled as a numbered loop.

    Parameters
    ----------
    pdb : str
        pdb file path

    Returns
    -------
    dict
        maps node names ("<chain><position>") to "helix<id>-<chain>",
        "sheet<id>-<chain>" or "loop<n>"
    """

    ppdb = PandasPdb().read_pdb(pdb)
    others = ppdb.df['OTHERS']

    secondary_structure = {}

    # NOTE: the slices below index the 'entry' text, which is assumed to be
    # the record line without its 6-character record name, i.e.
    # entry index = PDB column - 7 (confirm against the biopandas version).
    helices_from_pdb = others[others['record_name'] == 'HELIX']['entry']
    for helix in helices_from_pdb:
        identifier_h = helix[5:8].strip()  # helixID, columns 12-14
        initial_chain_h = helix[13].strip()  # initChainID, column 20
        # initSeqNum / endSeqNum are four columns wide (22-25 and 34-37): a
        # three-character slice drops the leading digit of positions >= 1000
        initial_pos_h = helix[15:19].strip()
        final_pos_h = helix[27:31].strip()
        label_h = 'helix' + identifier_h + '-' + initial_chain_h
        for i in range(int(initial_pos_h), int(final_pos_h) + 1):
            secondary_structure[initial_chain_h + str(i)] = label_h

    sheets_from_pdb = others[others['record_name'] == 'SHEET']['entry']
    for sheet in sheets_from_pdb:
        # sheetID is three columns wide (12-14); taking only two characters
        # makes e.g. "AA1" and "BA1" collide
        identifier_s = sheet[5:8].strip()
        initial_chain_s = sheet[15].strip()  # initChainID, column 22
        initial_pos_s = sheet[16:20].strip()  # initSeqNum, columns 23-26
        final_pos_s = sheet[27:31].strip()  # endSeqNum, columns 34-37
        label_s = 'sheet' + identifier_s + '-' + initial_chain_s
        for i in range(int(initial_pos_s), int(final_pos_s) + 1):
            secondary_structure[initial_chain_s + str(i)] = label_s

    mol = bg.Pmolecule(pdb)
    net = mol.network()

    # Walk the nodes (assumed to be ordered along the sequence) and label
    # everything outside helices and sheets as a loop. The counter advances
    # on every new helix/sheet element and on every chain change inside a
    # loop, so consecutive loop residues share the same number.
    last_structure = None
    last_chain = None
    i = 0
    for residue in list(net.nodes):
        chain = residue[0]
        structure = secondary_structure.get(residue)
        if structure is None:
            if chain != last_chain:
                i += 1
            structure = 'loop' + str(i)
            secondary_structure[residue] = structure
        elif structure != last_structure:
            i += 1
        last_structure = structure
        last_chain = chain

    return secondary_structure
Esempio n. 2
0
 def create_id2name(self, pdb):
     """Builds self.id2name, mapping network node ids to readable labels.

     Each node id "<chain><position>" is mapped to
     "<residue code><position>:<chain>". Also stores the first model of the
     parsed structure in self.structure.

     Parameters
     ----------
     pdb : str
         pdb file path
     """
     mol = bg.Pmolecule(pdb)
     net = mol.network(cutoff=self.cutoff)
     self.structure = PDBParser().get_structure('X', pdb)[0]
     # Residue names missing from self.three2one are kept unchanged instead
     # of raising KeyError, which is what create() already does.
     residues = [
         self.three2one[residue.resname]
         if residue.resname in self.three2one else residue.resname
         for residue in self.structure.get_residues()
     ]
     old_labels = net.nodes
     # residues and nodes are paired by position, so both are assumed to be
     # listed in the same order
     labels = [a + b[1:] + ':' + b[0] for a, b in zip(residues, old_labels)]
     self.id2name = dict(zip(old_labels, labels))
 def create(self, pdb):
     """Creates the amino acid network using biographs.

     Nodes are relabelled from "<chain><position>" to
     "<residue code><position>:<chain>"; residue names that are not in
     self.three2one are used as they are. The weighted network is stored in
     self.net and the first model of the structure in self.structure.

     Parameters
     ----------
     pdb : str
         pdb file path

     Returns
     -------
     Graph
         the relabelled network (also available as self.net)
     """
     molecule = bg.Pmolecule(pdb)
     self.net = molecule.network(cutoff=self.cutoff, weight=True)
     self.structure = PDBParser().get_structure('X', pdb)[0]
     # NOTE: filtering of the nodes by self.pos1 / self.pos2 is disabled.
     codes = [
         self.three2one[res.resname]
         if res.resname in self.three2one else res.resname
         for res in self.structure.get_residues()
     ]
     node_ids = self.net.nodes
     # codes and nodes are paired by position
     mapping = {
         node: code + node[1:] + ':' + node[0]
         for code, node in zip(codes, node_ids)
     }
     self.net = nx.relabel_nodes(self.net, mapping)
     return self.net
Esempio n. 4
0
def create_network(pdb_id,
                   database=None,
                   net=None,
                   pdbs_path=None,
                   database_path=None,
                   pathogenic=None,
                   non_pathogenic=None,
                   both=None,
                   colors='mutations',
                   sizes='neighborhood_watch_sharp'):
    """Builds the labels and the color/size maps used to draw a network.

    Parameters
    ----------
    pdb_id : str
        pdb id of the protein
    database : DataFrame, optional
        database of nodes with the columns "Residue name" and
        "Type of residue". Replaced by the csv file at database_path when
        both pdbs_path and database_path are given.
    net : Graph, optional
        amino acid network. Rebuilt from the pdb file when both pdbs_path
        and database_path are given.
    pdbs_path : str, optional
        folder containing pdb<pdb_id>.pdb or pdb<pdb_id>.ent
    database_path : str, optional
        path of the csv file of the database of nodes
    pathogenic, non_pathogenic, both : list, optional
        sequence positions of each mutation class; they are compared with
        the node name without its first character (a string). Default is an
        empty list.
    colors : str, optional
        'mutations', 'degree', 'neighborhood_watch_sharp',
        'neighborhood_watch_smooth' or 'pairwise_sharp' (edge colors).
        Default is 'mutations'.
    sizes : str, optional
        'mutations', 'degree', 'neighborhood_watch_sharp' or
        'neighborhood_watch_smooth'. Default is 'neighborhood_watch_sharp'.

    Returns
    -------
    Graph
        NetworkX Graph object
    dict
        node labels ("<residue code><position>:<chain>")
    list
        node sizes (empty if sizes is not recognised)
    list
        node colors (empty if colors is not a node property)
    list
        edge colors (only filled when colors is 'pairwise_sharp')
    """
    pathogenic = [] if pathogenic is None else pathogenic
    non_pathogenic = [] if non_pathogenic is None else non_pathogenic
    both = [] if both is None else both

    if pdbs_path and database_path:
        pdb_file = 'pdb' + pdb_id + '.pdb'
        if pdb_file not in os.listdir(pdbs_path):
            pdb_file = 'pdb' + pdb_id + '.ent'
        pdb = os.path.join(pdbs_path, pdb_file)

        mol = bg.Pmolecule(pdb)
        net = mol.network()

        database = pd.read_csv(database_path)

    node_labels = {}
    for node in net.nodes:
        info = database[database['Residue name'] == 'pdb' + pdb_id + node]
        # Several rows can match (reason still unknown): keep the first one
        # as a one-row frame, so that .item() below still works. Selecting
        # it with iloc[0] would return a Series of scalars without .item().
        if len(info) > 1:
            info = info.iloc[:1]
        type_aa = amino_acids_conversion.three2one(
            info["Type of residue"].item())
        node_labels[node] = type_aa + node[1:] + ":" + node[0]

    def sharp_bin(value, step, labels):
        """Return labels[i] for the first i with value < step * (i + 1)."""
        for n_step, label in enumerate(labels[:-1], start=1):
            if value < step * n_step:
                return label
        return labels[-1]

    nw_colors = ['blue', 'cyan', 'greenyellow', 'yellow', 'orange', 'red']
    nw_sizes = [1000, 4000, 7000, 10000, 13000, 16000]

    # colors and sizes are handled independently: chaining them with elif
    # left size_map empty for the default arguments and mixed colors with
    # sizes when both asked for the same property.
    requested = (colors, sizes)
    want_mutations = 'mutations' in requested
    want_degree = 'degree' in requested
    want_smooth = 'neighborhood_watch_smooth' in requested
    want_nw = want_smooth or 'neighborhood_watch_sharp' in requested

    mutation_type = []
    degree = []
    nw_sharp_colors = []
    nw_sharp_sizes = []
    neighborhood_watch_smooth = []

    for node in net.nodes:
        if want_mutations:
            seq_pos = node[1:]
            if seq_pos in pathogenic:
                mutation_type.append('tomato')
            elif seq_pos in non_pathogenic:
                mutation_type.append('limegreen')
            elif seq_pos in both:
                mutation_type.append('gold')
            else:
                mutation_type.append('lightgrey')

        if want_degree or want_nw:
            k = nx.degree(net, node)
            if want_degree:
                degree.append(k)

        if want_nw:
            weight = nx.degree(net, node, weight='weight')
            # isolated nodes have no neighborhood: use the lowest value
            # instead of dividing by zero
            ratio = weight / k if k else 0
            if colors == 'neighborhood_watch_sharp':
                nw_sharp_colors.append(sharp_bin(ratio, 5, nw_colors))
            if sizes == 'neighborhood_watch_sharp':
                nw_sharp_sizes.append(sharp_bin(ratio, 5, nw_sizes))
            if want_smooth:
                neighborhood_watch_smooth.append(ratio)

    pairwise_sharp = []
    if colors == 'pairwise_sharp' or sizes == 'pairwise_sharp':
        for u, v in net.edges:
            wij = net.get_edge_data(u, v)['weight']
            pairwise_sharp.append(sharp_bin(wij, 10, nw_colors))

    color_map = []
    edge_color_map = []
    if colors == 'mutations':
        color_map = mutation_type
    elif colors == 'degree':
        color_map = degree
    elif colors == 'neighborhood_watch_sharp':
        color_map = nw_sharp_colors
    elif colors == 'neighborhood_watch_smooth':
        color_map = neighborhood_watch_smooth
    elif colors == 'pairwise_sharp':
        edge_color_map = pairwise_sharp

    # sizes == 'pairwise_sharp' has no edge size map yet
    size_map = []
    if sizes == 'mutations':
        size_map = mutation_type
    elif sizes == 'degree':
        size_map = degree
    elif sizes == 'neighborhood_watch_sharp':
        size_map = nw_sharp_sizes
    elif sizes == 'neighborhood_watch_smooth':
        size_map = neighborhood_watch_smooth

    return net, node_labels, size_map, color_map, edge_color_map
def create_database_nodes(pdbs,
                          folder_path,
                          pdbs_path,
                          cutoff=5,
                          save_csv=True,
                          db_name=None):
    """Creates a database of node properties from a list of pdbs.

    Only nodes that have at least one "1D" neighbor and at least one
    neighbor with any other relation are reported.

    Parameters
    ----------
    pdbs : list
        list of pdb id's
    folder_path: str
        path of the output folder
    pdbs_path: str
        path of the pdb files folder
    cutoff: int, optional
        cutoff threshold for the connection of nodes in the amino acid network (default is 5).
    save_csv: boolean, optional
        if True, saves the database as a csv file in the directory specified by folder_path (default is True).
    db_name: str, optional
        name of the database; ".csv" is appended if missing. If None, the name database_nodes.csv is used (default is None).

    Returns
    -------
    DataFrame
        pandas DataFrame object
    """

    # initialize database to report
    database = []
    for pdb_id in pdbs:

        pdb, _ = get_pdb_path(pdb_id, pdbs_path)

        mol = bg.Pmolecule(pdb)
        net = mol.network(cutoff=cutoff)

        secondary_structure = assign_secondary_structure(pdb)

        # file name without its 4-character extension
        pdb_name = os.path.split(pdb)[1][:-4]

        for residue in mol.model.get_residues():
            node_name = residue.parent.id + str(residue.id[1])
            deg = nx.degree(net, node_name)
            if deg == 0:
                net.remove_node(node_name)
                continue

            weight = nx.degree(net, node_name, weight="weight")
            restype = residue.resname
            resname = pdb_name + node_name
            size = len(residue)

            # keep only the kind of element, not its identifier
            structure = secondary_structure[node_name]
            if structure[0] == "h":
                structure = "helix"
            elif structure[0] == "s":
                structure = "sheet"
            else:
                structure = "loop"

            line = [
                resname, restype, deg, weight, weight / deg, size, structure
            ]

            # divide 1D from other neighbors
            w1D = 0
            k1D = 0  # can be 1 or 2
            wOTH = 0
            kOTH = 0

            for neighbor in list(nx.neighbors(net, node_name)):
                edge_weight = nx.edges(net)[(node_name, neighbor)]["weight"]
                relation = get_neighbor_structure_relation(
                    secondary_structure, node_name, neighbor)

                if relation == "1D":
                    w1D += edge_weight
                    k1D += 1
                else:
                    wOTH += edge_weight
                    kOTH += 1

            # both averages below need at least one neighbor of each kind
            if k1D == 0 or kOTH == 0:
                continue

            nw1D = w1D / k1D
            nwOTH = wOTH / kOTH

            not_terminal = k1D == 2

            line += [nw1D, nwOTH, not_terminal]
            database.append(line)

    if not db_name:
        db_name = "database_nodes.csv"
    elif not db_name.endswith(".csv"):
        # comparing only the last three characters with ".csv" never matched
        # and produced names such as "name.csv.csv"
        db_name += ".csv"

    columns = [
        "Residue name",
        "Type of residue",
        "Degree",
        "Weight",
        "Weight/Degree",
        "Atomic number",
        "Secondary structure",
        "NW1D",
        "NWothers",
        "Not terminal",
    ]

    if save_csv:
        db_path = os.path.join(folder_path, db_name)
        with open(db_path, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(columns)
            writer.writerows(database)

    db = pd.DataFrame(database, columns=columns)

    return db
def create_aa_network(pdb_id,
                      rel_list,
                      folder_path,
                      selected_positions=None,
                      cutoff=5,
                      kw_reprod=False,
                      k_w=None,
                      db_1_name=None,
                      db_2_name=None,
                      separate_jaccard=False,
                      separate_weights=False,
                      pdbs_path="pdbs",
                      save_csv=True,
                      remove_hydrogen_atoms=False):
    """Creates the amino acid network from a pdb id.

    Parameters
    ----------
    pdb_id : str
        pdb id of the protein
    rel_list: list
        list of relations (1D, 2D, 3D, 4D) to consider. Edges with any other relation are removed from the network.
    folder_path: str
        path of the output folder
    selected_positions: None or list, optional
        list of sequence positions to consider. If None, all positions are considered (default is None)
    cutoff: int, optional
        cutoff threshold for the connection of nodes in the amino acid network (default is 5).
    kw_reprod: boolean, optional
        if True, adds a column that checks if (k, w) of a node exist in a database of nodes of robust proteins (default is False).
    k_w: dict or None, optional
        dictionary of weight values associated to each k value in the database of nodes of robust proteins; it is indexed as k_w[residue type][k] and read only if kw_reprod is True (default is None).
    db_1_name: str, optional
        name of the database of type 1. If None, the name database_pos_1.csv is used (default is None).
    db_2_name: str, optional
        name of the database of type 2. If None, the name database_pos_2.csv is used (default is None).
    separate_jaccard: boolean, optional
        if True, separates the Jaccard vector in the database of type 2 based on the size of the neighbors (small, medium, large) (default is False).
    separate_weights: boolean, optional
        if True, separates the weight in the database of type 2 based on the size of the neighbors (small, medium, large). Ignored unless separate_jaccard is True (default is False).
    pdbs_path: str, optional
        path of the pdb files folder (default is "pdbs")
    save_csv: boolean, optional
        if True, saves the databases as csv files, sorted by sequence position, in the directory specified by folder_path (default is True).
    remove_hydrogen_atoms: boolean, optional
        if True, removes the hydrogen atoms from the pdb file (default is False).

    Returns
    -------
    Graph
        NetworkX Graph object
    DataFrame
        pandas DataFrame object (database of type 1, one group of neighbor columns per neighbor)
    DataFrame
        pandas DataFrame object (database of type 2, one column per amino acid type)
    Pmolecule
        Biograph Pmolecule object
    boolean
        True if the pdb file was downloaded
    """

    amino_acids = list_aa()

    if separate_jaccard:
        amino_acids_schl = dict_aa_schl(amino_acids)
    else:
        # separated weights are only reported next to the separated Jaccard
        # vector
        separate_weights = False

    pdb, downloaded = get_pdb_path(pdb_id, pdbs_path)

    if remove_hydrogen_atoms:
        remove_hydrogens(pdb)

    # initialize databases to report
    database_1 = []
    database_2 = []

    mol = bg.Pmolecule(pdb)
    net = mol.network(cutoff=cutoff)

    # take only selected positions:
    if selected_positions:
        for node in list(net.nodes):
            if int(node[1:]) not in selected_positions:
                net.remove_node(node)
    else:
        positions = [int(node[1:]) for node in net.nodes]
        selected_positions = range(min(positions), max(positions) + 1)

    secondary_structure = assign_secondary_structure(pdb)

    # three-letter type of every residue, by node name
    residues_dict = {}
    for residue in mol.model.get_residues():
        res_type = residue.resname.strip()
        if len(res_type) < 3:
            res_type = aaconv.one2three(res_type)
        residues_dict[residue.parent.id + str(residue.id[1])] = res_type

    # file name without its 4-character extension
    pdb_name = os.path.split(pdb)[1][:-4]

    for residue in mol.model.get_residues():
        node_name = residue.parent.id + str(residue.id[1])
        deg = nx.degree(net, node_name)
        if deg == 0:
            net.remove_node(node_name)
            continue

        seqpos = residue.id[1]
        if seqpos not in selected_positions:
            continue

        weight = nx.degree(net, node_name, weight="weight")
        restype = residue.resname
        resname = pdb_name + node_name
        size = len(residue)
        structure = secondary_structure[node_name]

        # check how many other aas can have the same k and w in the database
        # (k and w are the values before the edges are filtered by relation)
        if kw_reprod:
            n_others = 0
            for aa in amino_acids:
                if aa == restype:
                    continue
                try:
                    w_min, w_max = k_w[aa][deg]
                except KeyError:
                    continue
                if w_min <= weight <= w_max:
                    n_others += 1

        if separate_weights:
            w_separated = {"s": 0, "m": 0, "l": 0}

        # one counter per amino acid type, in the order of the columns that
        # are appended to the database of type 2
        adj_vector = [0] * len(amino_acids)
        line_neigh = []
        for neighbor in list(nx.neighbors(net, node_name)):
            neighbor_type = residues_dict[neighbor]
            edge_weight = nx.edges(net)[(node_name, neighbor)]["weight"]
            aa_num = amino_acids.index(neighbor_type)
            relation = get_neighbor_structure_relation(
                secondary_structure, node_name, neighbor)
            # select only edges of desired relations
            if relation in rel_list:
                adj_vector[aa_num] += 1
                line_neigh += [neighbor, neighbor_type, edge_weight, relation]
            else:
                net.remove_edge(neighbor, node_name)

            # separate weights
            # NOTE(review): this also adds the weight of the edges that were
            # just removed because of their relation - confirm it is intended
            if separate_weights:
                neigh_size = scl.dict_classif[aaconv.three2one(neighbor_type)]
                w_separated[neigh_size] += edge_weight

        # check if the residue became of degree zero:
        deg = nx.degree(net, node_name)
        if deg == 0:
            net.remove_node(node_name)
            continue

        weight = nx.degree(net, node_name, weight="weight")

        line = [
            resname,
            node_name,
            seqpos,
            restype,
            deg,
            weight,
            weight / deg,
            size,
            structure,
        ]
        if kw_reprod:
            line.append(n_others)

        # both databases share the leading columns
        line_2 = line + adj_vector
        line += line_neigh
        database_1.append(line)

        if separate_jaccard:
            sep_jaccard_dict = {"s": 0, "m": 0, "l": 0}
            for k, num in enumerate(adj_vector):
                sep_jaccard_dict[amino_acids_schl[k]] += num
            line_2 += [
                sep_jaccard_dict["s"],
                sep_jaccard_dict["m"],
                sep_jaccard_dict["l"],
            ]

            if separate_weights:
                line_2 += list(w_separated.values())

        database_2.append(line_2)

    sortedlist_pos = sorted(database_1, key=lambda row: row[2])

    sortedlist_pos_2 = sorted(database_2, key=lambda row: row[2])

    if not db_1_name:
        db_1_name = "database_pos_1.csv"
    if not db_2_name:
        db_2_name = "database_pos_2.csv"

    base_columns = [
        "Residue name",
        "Position",
        "Sequence position",
        "Type of residue",
        "Degree",
        "Weight",
        "Weight/Degree",
        "Atomic number",
        "Secondary structure",
    ]
    if kw_reprod:
        base_columns.append("N. others")

    neighbor_columns = [
        "Neighbor position",
        "Neighbor type",
        "Pairwise weight",
        "Relation",
    ]
    columns1 = base_columns + neighbor_columns
    columns2 = base_columns + amino_acids

    if separate_jaccard:
        columns2 += ["small neighbors", "medium neighbors", "large neighbors"]

        if separate_weights:
            columns2 += ["w_small", "w_medium", "w_large"]

    if save_csv:
        db_1_path = os.path.join(folder_path, db_1_name)
        with open(db_1_path, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(columns1)
            writer.writerows(sortedlist_pos)

        db_2_path = os.path.join(folder_path, db_2_name)
        with open(db_2_path, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(columns2)
            writer.writerows(sortedlist_pos_2)

    # database 1 has to have all rows of the same length to be read as a
    # dataframe; an empty database keeps just the header
    max_length = max((len(row) for row in database_1), default=len(columns1))

    # one extra group of neighbor columns for every neighbor beyond the first
    missing_header = max_length - len(columns1)
    columns1 += neighbor_columns * (missing_header // 4)

    for row in database_1:
        row += ["-"] * (max_length - len(row))

    db_1 = pd.DataFrame(database_1, columns=columns1)

    db_2 = pd.DataFrame(database_2, columns=columns2)

    return net, db_1, db_2, mol, downloaded
def assign_secondary_structure(pdb):
    """Returns the secondary structure elements of a pdb

    The HELIX and SHEET records of the pdb file are expanded residue by
    residue; every node of the amino acid network that is not covered by one
    of those records is labelled as a numbered loop.

    Parameters
    ----------
    pdb : str
        pdb file path

    Returns
    -------
    dict
        dictionary of secondary structure elements: maps node names
        ("<chain><position>") to "helix<id>-<chain>", "sheet<id>-<chain>" or
        "loop<n>"
    """

    ppdb = PandasPdb().read_pdb(pdb)
    records = ppdb.df["OTHERS"]

    secondary_structure = {}

    # NOTE: the slices below index the "entry" text, which is assumed to be
    # the record line without its 6-character record name, i.e.
    # entry index = PDB column - 7 (confirm against the biopandas version).
    helices_from_pdb = records[records["record_name"] == "HELIX"]["entry"]
    for helix in helices_from_pdb:
        identifier_h = helix[5:8].strip()  # helixID, columns 12-14
        initial_chain_h = helix[13].strip()  # initChainID, column 20
        # initSeqNum / endSeqNum are four columns wide (22-25 and 34-37): a
        # three-character slice drops the leading digit of positions >= 1000
        initial_pos_h = helix[15:19].strip()
        final_pos_h = helix[27:31].strip()
        label_h = "helix" + identifier_h + "-" + initial_chain_h
        for i in range(int(initial_pos_h), int(final_pos_h) + 1):
            secondary_structure[initial_chain_h + str(i)] = label_h

    sheets_from_pdb = records[records["record_name"] == "SHEET"]["entry"]
    for sheet in sheets_from_pdb:
        # sheetID is three columns wide (12-14); taking only two characters
        # makes e.g. "AA1" and "BA1" collide
        identifier_s = sheet[5:8].strip()
        initial_chain_s = sheet[15].strip()  # initChainID, column 22
        initial_pos_s = sheet[16:20].strip()  # initSeqNum, columns 23-26
        final_pos_s = sheet[27:31].strip()  # endSeqNum, columns 34-37
        label_s = "sheet" + identifier_s + "-" + initial_chain_s
        for i in range(int(initial_pos_s), int(final_pos_s) + 1):
            secondary_structure[initial_chain_s + str(i)] = label_s

    mol = bg.Pmolecule(pdb)
    net = mol.network()

    # Walk the nodes (assumed to be ordered along the sequence) and label
    # everything outside helices and sheets as a loop. The counter advances
    # on every new helix/sheet element and on every chain change inside a
    # loop, so consecutive loop residues share the same number.
    last_structure = None
    last_chain = None
    i = 0
    for residue in list(net.nodes):
        chain = residue[0]
        structure = secondary_structure.get(residue)
        if structure is None:
            if chain != last_chain:
                i += 1
            structure = "loop" + str(i)
            secondary_structure[residue] = structure
        elif structure != last_structure:
            i += 1
        last_structure = structure
        last_chain = chain

    return secondary_structure
Esempio n. 8
0
 def create_adj(self, pdb):
     """Returns the adjacency matrix of the amino acid network of a pdb.

     The id-to-name mapping is created first if self.id2name is still empty.

     Parameters
     ----------
     pdb : str
         pdb file path

     Returns
     -------
     ndarray
         array returned by nx.to_numpy_array for the network
     """
     mapping_missing = not self.id2name
     if mapping_missing:
         self.create_id2name(pdb)
     network = bg.Pmolecule(pdb).network(cutoff=self.cutoff)
     adjacency = nx.to_numpy_array(network)
     return adjacency
    else:
        pdb_id = current_path.rsplit('/', 1)[1]

pdbl = PDBList(obsolete_pdb=True)

# download the pdb files only when the pdb folder has no entries yet
# NOTE(review): pdbs is read here but only assigned further down in this
# fragment - presumably it is defined earlier in the script; confirm
if not glob(os.path.join(pdbs_path, '*')):
    pdbl.download_pdb_files(pdbs, file_format='pdb', pdir=pdbs_path)

# initialize databases to report
database_1 = []
database_2 = []

# from here on pdbs holds the paths of the entries found in pdbs_path
pdbs = glob(os.path.join(pdbs_path, '*'))

for pdb in pdbs:
    # amino acid network of the current pdb file
    mol = bg.Pmolecule(pdb)
    net = mol.network()

    # take only selected positions:
    if selected_positions:
        # iterate over a copy, because nodes are removed inside the loop
        for node in list(net.nodes):
            # node name without its first character, read as a position
            pos = int(node[1::])
            if pos not in selected_positions:
                net.remove_node(node)

    secondary_structure = assign_secondary_structure(pdb)

    residues_dict = {}
    for residue in mol.model.get_residues():
        res_type = residue.resname
        # parent id followed by the residue number
        res_pos = residue.parent.id + str(residue.id[1])
Esempio n. 10
0
def create_aa_network(pdb_id,
                      rel_list,
                      selected_positions=None,
                      cutoff=5,
                      pdbs_path="pdbs"):
    """Creates the amino acid network from a pdb id.

    Parameters
    ----------
    pdb_id : str
        pdb id of the protein
    rel_list: list
        list of relations (1D, 2D, 3D, 4D) to consider. Edges with any other relation are removed from the network.
    selected_positions: None or list, optional
        list of sequence positions to consider. If None, all positions are considered (default is None)
    cutoff: int, optional
        cutoff threshold for the connection of nodes in the amino acid network (default is 5).
    pdbs_path: str, optional
        path of the pdb files folder (default is "pdbs")

    Returns
    -------
    Graph
        NetworkX Graph object
    dict
        labels dictionary, mapping node names to "<residue code><position>:<chain>"
    """

    pdb, _ = get_pdb_path(pdb_id, pdbs_path)

    mol = bg.Pmolecule(pdb)
    net = mol.network(cutoff=cutoff)

    # take only selected positions:
    if selected_positions:
        for node in list(net.nodes):
            if int(node[1:]) not in selected_positions:
                net.remove_node(node)
    else:
        positions = [int(node[1:]) for node in net.nodes]
        selected_positions = range(min(positions), max(positions) + 1)

    secondary_structure = assign_secondary_structure(pdb)

    labels = {}
    for residue in mol.model.get_residues():
        res_type = residue.resname.strip()
        if len(res_type) > 1:
            res_type = aaconv.three2one(res_type)
        res_chain = residue.parent.id
        res_pos = str(residue.id[1])
        labels[res_chain + res_pos] = f"{res_type}{res_pos}:{res_chain}"

    for residue in mol.model.get_residues():
        node_name = residue.parent.id + str(residue.id[1])

        # labels are dropped with a default, so that two residues sharing
        # chain and number cannot raise KeyError on the second removal
        if nx.degree(net, node_name) == 0:
            net.remove_node(node_name)
            labels.pop(node_name, None)
            continue

        if residue.id[1] not in selected_positions:
            labels.pop(node_name, None)
            continue

        for neighbor in list(nx.neighbors(net, node_name)):
            relation = get_neighbor_structure_relation(
                secondary_structure, node_name, neighbor
            )
            # select only edges of desired relations
            if relation not in rel_list:
                net.remove_edge(neighbor, node_name)

        # check if the residue became of degree zero:
        if nx.degree(net, node_name) == 0:
            net.remove_node(node_name)
            labels.pop(node_name, None)

    return net, labels
def GetData(path, prot, mutations, csv_path=None):
    """Write perturbation network data of a set of mutations as CSV files.

    For every threshold between 3 and 10 (steps of 0.1) the network of the
    original protein is compared with the network of each mutant, and the
    nodes, edges, weight and distance of the resulting perturbation network
    are stored in one CSV file per measure. A last CSV file holds the same
    measures for the original network at every threshold.

    Parameters:
        path (string): path where original and mutated pdb files are located
        prot (string): name of original pdb file
        mutations (dict): keys are strings representing positions to mutate
                        (amino acid, chain and index), each contains a list of
                        mutations performed (original aa, chain, index and
                        mutated aa). Mutated pdb files should be found in path,
                        as prot_mutation.pdb according to mutations in said list
        csv_path (string): default None, path where CSV files will be saved,
                        if None, a dir named "perturbation_network_data" will
                        be created in 'path'

    Returns:
        None
    """
    # One row per amino acid (one letter code), one column per position
    amino_acids = list(Bio.PDB.Polypeptide.aa1)
    positions = list(mutations.keys())
    shape = (len(amino_acids), len(positions))

    wild_type = bg.Pmolecule(os.path.join(path, f"{prot}.pdb"))
    # Each threshold defines one network
    thresholds = [round(value, 1) for value in np.linspace(3, 10, 71)]

    # Default output dir, created on demand
    if csv_path is None:
        csv_path = os.path.join(path, "perturbation_network_data")
        if not os.path.exists(csv_path):
            os.makedirs(csv_path)
    assert os.path.exists(path), f"Directory {path} doesn't exist."
    assert os.path.exists(csv_path), f"Directory {csv_path} doesn't exist."

    # Measures in the order of the rows of original_data
    measures = (
        ("nodes", GetNodes),
        ("edges", GetEdges),
        ("weight", GetWeight),
        ("distance", GetDistance),
    )
    original_data = np.zeros((4, len(thresholds)))

    for column, threshold in enumerate(thresholds):
        tables = {name: np.zeros(shape) for name, _ in measures}

        # Network of the original protein for this threshold
        original = wild_type.network(cutoff=threshold)
        original_matrix = nx.adjacency_matrix(original).toarray()
        for row, (_, measure) in enumerate(measures):
            original_data[row][column] = measure(original)

        for index, position in enumerate(positions):
            for mutation in mutations[position]:
                mutant_path = os.path.join(path, f"{prot}_{mutation}.pdb")
                mutant = bg.Pmolecule(mutant_path).network(cutoff=threshold)

                # Perturbation network: absolute difference of the
                # adjacency matrices, without isolated nodes
                mutant_matrix = nx.adjacency_matrix(mutant).toarray()
                perturbation_network = nx.from_numpy_array(
                    np.abs(original_matrix - mutant_matrix))
                isolated = list(nx.isolates(perturbation_network))
                perturbation_network.remove_nodes_from(isolated)

                # The last character of the mutation selects the row
                assert mutation[-1] in amino_acids, \
                    f"{mutation[-1]} not one of {Bio.PDB.Polypeptide.aa1}"
                aa_index = amino_acids.index(mutation[-1])

                for name, measure in measures:
                    tables[name][aa_index][index] = measure(
                        perturbation_network)

        # One CSV file per measure and threshold
        for name, _ in measures:
            WriteCSV(csv_path, tables[name], positions,
                     f"{prot}_{threshold}_{name}.csv")

    # Thresholds become the first column of the data of the original network
    original_data = np.transpose(np.vstack([thresholds, original_data]))
    header = ['threshold', 'nodes', 'edges', 'weight', 'distance']
    WriteCSV(csv_path, original_data, header, f"{prot}_original.csv")

    return None