# Example no. 1
def calcDistances(pdb_name, input_file, output_dir):
    """Write the pairwise side-chain centroid distances of a PDB structure to CSV.

    For every residue in ``input_file`` the centroid (unweighted mean
    coordinate) of its side-chain atoms is computed, where "side chain" means
    every atom not named C, CA, O or N. The Euclidean distance between every
    pair of centroids is then written to ``<output_dir>/<pdb_name>_dist.csv``.

    The first CSV row is ``PDB_AA`` followed by one label per residue
    (residue number + ``resdict`` code). Each following row starts with the
    residue's one-letter code and holds its distance to every residue; the
    diagonal is 0.0.

    Args:
        pdb_name: Identifier used in messages and in the output file name.
        input_file: Path of the PDB file to parse.
        output_dir: Directory that receives the ``_dist.csv`` file.

    Raises:
        ValueError: If a residue has no alpha carbon (CA) atom.

    NOTE(review): parts of the original body were missing from the reviewed
    source. The handling of residues without a side chain ("NA" distances,
    inferred from the original ``== "NA"`` check) and of a missing CA (raise
    instead of only reporting) are reconstructions -- confirm against the
    intended behaviour.
    """
    import sys  # local import: only needed to choose the csv file mode below

    p = PDBParser()
    structure = p.get_structure("X", input_file)
    residues = list(structure.get_residues())

    resnam = []  # A list containing all Residue Names
    resnum = []  # A list containing all Residue Numbers
    crdSC = []  # Side-chain centroid per residue; None when there is no side chain

    backbone_names = ("C", "CA", "O", "N")
    for residue in residues:
        resnam.append(residue.get_resname())
        resnum.append(residue.get_id()[1])

        has_ca = False
        # Coordinates of all side chain atoms of the current residue.
        rescrd_SC = []
        # Iterate over this residue's own atoms. Matching atoms of the whole
        # structure by residue id is quadratic and mixes chains, because
        # residue ids repeat from one chain to the next.
        for atom in residue:
            # atom.name is equivalent to atom.get_id()
            if atom.name == "CA":
                has_ca = True
            elif atom.name not in backbone_names:
                rescrd_SC.append(atom.get_coord())

        if not has_ca:
            raise ValueError(
                "{0} has a missing alpha carbon at residue {1} {2}. Please manually check "
                "PDB file for extraneous structural information (e.g. partially resolved "
                "residues, bound DNA or ligands, etc.). Distances cannot be "
                "calculated.".format(pdb_name, resnum[-1], resnam[-1])
            )

        if not rescrd_SC:
            print("Missing side chain in residue: ", resnum[-1], resnam[-1], ", possibly a GLY:", pdb_name)
            crdSC.append(None)
        else:
            # Centroid of the side chain atoms.
            crdSC.append(sum(rescrd_SC) / float(len(rescrd_SC)))

    single_letter_aa = ph.convert_to_one_letter_code(resnam)
    distSC = [["PDB_AA"] + [str(res.get_id()[1]) + resdict[res.get_resname()] for res in residues]]
    n_res = len(crdSC)
    for i in range(n_res):
        dist_sc_per_aa = [single_letter_aa[i]]
        for j in range(n_res):
            if crdSC[i] is None or crdSC[j] is None:
                # No centroid for at least one of the two residues.
                distSCi = "NA"
            elif i == j:
                distSCi = 0.0
            else:
                distSCi = math.sqrt(
                    (crdSC[i][0] - crdSC[j][0]) ** 2
                    + (crdSC[i][1] - crdSC[j][1]) ** 2
                    + (crdSC[i][2] - crdSC[j][2]) ** 2
                )
            dist_sc_per_aa.append(distSCi)
        distSC.append(dist_sc_per_aa)

    out_path = output_dir + "/" + pdb_name + "_dist.csv"
    # The csv module needs a binary handle on Python 2 but a text handle
    # opened with newline="" on Python 3.
    if sys.version_info[0] < 3:
        output_file = open(out_path, "wb")
    else:
        output_file = open(out_path, "w", newline="")
    with output_file:
        csv_writer = csv.writer(output_file, delimiter=",")
        csv_writer.writerows(distSC)
def merge_df_by_aa(all_dfs, col_list, retain_gaps=False):
    """Merge dataframes row-wise by aligning their amino-acid sequences.

    The amino-acid column of every dataframe is joined into a sequence, the
    sequences are aligned with ``align_seqs_mafft``, and each dataframe's rows
    are re-indexed onto the alignment positions so that all inputs can be
    concatenated side by side.

    Args:
        all_dfs: Sequence of pandas DataFrames, one per input.
        col_list: For each dataframe in ``all_dfs``, the name of the column
            holding its amino acids. Entries are treated as three-letter or
            one-letter codes depending on the length of the first entry.
        retain_gaps: When ``False`` (default) rows containing NaN, i.e.
            alignment positions missing from at least one input, are dropped.

    Returns:
        A DataFrame whose first columns (``AA_0``, ``AA_1``, ...) hold the
        aligned sequences, followed by the columns of every input dataframe.

    Raises:
        ValueError: If the first entry of an amino-acid column is neither one
            nor three characters long.

    NOTE(review): a few statements were missing from the reviewed source
    (appending to ``all_seqs`` / ``partial_dfs``, the body of the
    ``j >= row_length`` guard). They are reconstructed here from the
    surrounding code and comments -- confirm against the intended behaviour.
    """
    # NOTE(review): fixed file name in the working directory; presumably
    # align_seqs_mafft writes the alignment here -- verify against that helper.
    align_outfile = "temp.txt"
    all_seqs = []
    all_headers = []

    # Reused for every pass over the dataframes.
    df_range = range(len(all_dfs))

    for i in df_range:
        # Grab AA sequence from column
        seq_list = all_dfs[i][col_list[i]].values
        if len(seq_list[0]) == 3:
            seq = ''.join(ph.convert_to_one_letter_code(seq_list))
        elif len(seq_list[0]) == 1:
            seq = ''.join(seq_list)
        else:
            raise ValueError("Could not recognized amino acid column in dataframe.")

        # Collect sequences and headers for mafft processing
        all_seqs.append(seq)
        all_headers.append('>AA_' + str(i))

    # Align sequences with mafft
    align_seqs_mafft(all_seqs, all_headers, align_outfile)

    # Extract the sequences from the aligned file
    (seqs, headers) = ph.get_sequences(align_outfile)

    # One column per aligned sequence (header minus its first character),
    # one row per alignment position.
    aligned_seqs = dict()
    for i in df_range:
        aligned_seqs[headers[i][1:]] = list(seqs[i])
    aligned_df = pd.DataFrame(data=aligned_seqs)

    # Re-indexed copies of the inputs, merged with aligned_df at the end.
    partial_dfs = []

    for s in df_range:
        j = 0
        # How many rows in the original dataframe?
        row_length = len(all_dfs[s].index)
        current_col = all_headers[s][1:]
        # Collect rows in a dict and convert once at the end: much faster
        # than appending rows to an existing DataFrame.
        new_df_dict = dict()
        for index, row in aligned_df[current_col].items():
            # Stop once every row of the original dataframe has been placed.
            if j >= row_length:
                break
            # Gap positions consume no row of the original dataframe.
            if row != '-':
                new_df_dict[index] = all_dfs[s].iloc[j, :].values
                j += 1

        new_df = pd.DataFrame.from_dict(new_df_dict, orient='index')
        # Add back in appropriate column names
        new_df.columns = all_dfs[s].columns
        partial_dfs.append(new_df)

    # Put the aligned sequences first, then concatenate by column
    # (i.e., like cbind in R).
    partial_dfs.insert(0, aligned_df)
    aligned_df = pd.concat(partial_dfs, axis=1)
    # Remove any rows containing NaN
    if retain_gaps is False:
        aligned_df = aligned_df.dropna()

    return aligned_df