import csv
import math
import os
import sys

import pandas as pd
from Bio.PDB import PDBParser

# ph (helper module), resdict, and align_seqs_mafft are assumed to be
# defined/imported elsewhere in this module.


def calcDistances(pdb_name, input_file, output_dir):
    """Compute pairwise distances between side-chain centroids for every
    residue in a PDB file and write them to <output_dir>/<pdb_name>_dist.csv."""
    p = PDBParser()
    structure = p.get_structure("X", input_file)

    resnam = []  # residue names (three-letter codes)
    resnum = []  # residue sequence numbers
    reschain = []  # chain ID of each residue
    crdCA = []  # alpha-carbon coordinates, one per residue
    crdSC = []  # side-chain centroid coordinates, one per residue
    sizeSC = []  # number of side-chain atoms per residue

    for residue in structure.get_residues():
        resnam.append(residue.resname)
        resnum.append(residue.get_full_id()[3][1])
        reschain.append(residue.get_full_id()[2])
        noSC = True
        noCA = True
        # Coordinates of all side-chain atoms of the current residue; used to
        # compute the centroid ("COM") of the side chain.
        rescrd_SC = []
        # Iterate over this residue's own atoms rather than scanning the whole
        # structure and matching on parent id, which is O(n^2) and can
        # mis-assign atoms because residue ids are not unique across chains.
        for atom in residue.get_atoms():
            # atom.name is equivalent to atom.get_id()
            if atom.name == "CA":
                noCA = False
                crdCA.append(atom.get_coord())

            elif atom.name not in ["C", "CA", "O", "N"]:
                noSC = False
                rescrd_SC.append(atom.get_coord())

        if noCA:
            print(
                "\nERROR: ",
                pdb_name,
                "has a missing alpha carbon at residue ",
                resnum[-1],
                ". Please manually check PDB file for extraneous structural information (e.g. partially resolved "
                "residues, bound DNA or ligands, etc.). Distances cannot be calculated.\n",
            )
            sys.exit()

        if noSC:
            print("Missing side chain in residue: ", resnum[-1], resnam[-1], ", possibly a GLY:", pdb_name)
            # No side-chain atoms (e.g. glycine): use the alpha carbon as a
            # stand-in for the side-chain position.
            crdSC.append(crdCA[-1])
            sizeSC.append(0)
        else:
            # Side-chain centroid: unweighted mean of the side-chain atom coordinates
            sizeSC.append(len(rescrd_SC))
            crdSC.append(sum(rescrd_SC) / float(sizeSC[-1]))

    single_letter_aa = ph.convert_to_one_letter_code(resnam)
    # Header row: residue number + one-letter code for every residue.
    # (resdict, a three-to-one-letter lookup, is assumed to be defined at
    # module level.)
    distSC = [["PDB_AA"] + [str(res.get_id()[1]) + resdict[res.get_resname()] for res in structure.get_residues()]]
    for i in range(len(resnam)):
        dist_sc_per_aa = [single_letter_aa[i]]
        for j in range(len(single_letter_aa)):
            distSCi = 0.0
            # Skip pairs for which a side-chain position could not be determined
            if crdSC[i][0] == "NA" or crdSC[j][0] == "NA":
                dist_sc_per_aa.append("NA")
                continue
            if i != j:
                # Euclidean distance between the two side-chain centroids
                distSCi = math.sqrt(
                    (crdSC[i][0] - crdSC[j][0]) ** 2
                    + (crdSC[i][1] - crdSC[j][1]) ** 2
                    + (crdSC[i][2] - crdSC[j][2]) ** 2
                )
            dist_sc_per_aa.append(distSCi)
        distSC.append(dist_sc_per_aa)

    # Open in text mode with newline="" (Python 3) so csv.writer does not emit
    # blank lines on Windows.
    with open(os.path.join(output_dir, pdb_name + "_dist.csv"), "w", newline="") as output_file:
        csv_writer = csv.writer(output_file, delimiter=",")
        csv_writer.writerows(distSC)
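

# A minimal usage sketch for calcDistances, assuming a parseable PDB file on
# disk; the id "1abc" and the paths are hypothetical placeholders.
def _example_calc_distances():
    calcDistances("1abc", "1abc.pdb", ".")
    # The first row and first column hold residue labels; the body is a
    # symmetric matrix of side-chain centroid distances in angstroms.
    return pd.read_csv("./1abc_dist.csv")
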
def merge_df_by_aa(all_dfs, col_list, retain_gaps=False):
    """Align the amino-acid sequences of several per-residue dataframes with
    mafft, then merge the dataframes row-wise along the alignment.

    all_dfs: list of dataframes, one per structure/sequence.
    col_list: for each dataframe, the name of the column holding the residue
        identity (one- or three-letter codes).
    retain_gaps: if False, rows containing alignment gaps (NaN) are dropped.
    """
    align_outfile = "temp.txt"
    all_seqs = []
    all_headers = []

    # We'll use this later for looping through all the dataframes/files
    df_range = range(len(all_dfs))

    for i in df_range:
        # Grab the AA sequence from the designated column of each dataframe
        seq_list = all_dfs[i][col_list[i]].values
        if len(seq_list[0]) == 3:
            seq = ''.join(ph.convert_to_one_letter_code(seq_list))
        elif len(seq_list[0]) == 1:
            seq = ''.join(seq_list)
        else:
            raise ValueError("Could not recognize amino acid column in dataframe.")

        # Append sequences and FASTA-style headers to lists for mafft processing
        all_seqs.append(seq)
        all_headers.append('>AA_' + str(i))

    # Align sequences with mafft
    align_seqs_mafft(all_seqs, all_headers, align_outfile)

    # Extract the sequences from the aligned file
    (seqs, headers) = ph.get_sequences(align_outfile)

    # Build a dataframe from the aligned sequences: one column per input
    # dataframe (named after its 'AA_i' header) and one row per alignment
    # position.
    aligned_seqs = dict()
    for i in df_range:
        aligned_seqs[headers[i][1:]] = list(seqs[i])
    aligned_df = pd.DataFrame(data=aligned_seqs)

    # Initialize list for storing newly constructed dataframes which will eventually be merged
    # with aligned_df.
    partial_dfs = []

    # Loop through the dataframes and, for each one, copy its rows into a new
    # dataframe in alignment order. This is where the core functionality of
    # this script happens.

    for s in df_range:
        j = 0
        # How many rows in the original dataframe?
        row_length = len(all_dfs[s].index)
        current_col = all_headers[s][1:]
        # Initialize a dict to store new rows; we convert it back to a
        # dataframe later. Building a dictionary first and then converting to
        # a DF is MUCH faster than appending rows to an existing DF.
        new_df_dict = dict()
        # Series.iteritems() was removed in pandas 2.0; items() is equivalent.
        for index, row in aligned_df[current_col].items():
            # Stop once every row of the original dataframe has been placed
            if j >= row_length:
                break
            # If this alignment position is not a gap, copy over the next row
            # of the original dataframe
            if row != '-':
                new_df_dict[index] = all_dfs[s].iloc[j, :].values
                j += 1

        # Append newly generated dataframe to a list to concatenate later
        new_df = pd.DataFrame.from_dict(new_df_dict, orient='index')
        # Add back in appropriate column names
        new_df.columns = all_dfs[s].columns
        partial_dfs.append(new_df)

    # Insert aligned_df to the beginning of the list of new dataframes
    partial_dfs.insert(0, aligned_df)
    # Concatenate dataframes together by column (i.e., like cbind in R)
    aligned_df = pd.concat(partial_dfs, axis=1)
    # Remove any rows containing NaN (i.e., alignment gaps)
    if not retain_gaps:
        aligned_df = aligned_df.dropna()

    return aligned_df
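

# A hedged usage sketch: merging two per-residue CSVs on their amino-acid
# columns. The file names and the "wt_aa" column are hypothetical
# placeholders; "PDB_AA" matches the label column written by calcDistances.
def _example_merge_by_aa():
    df_a = pd.read_csv("1abc_dist.csv")
    df_b = pd.read_csv("1abc_rates.csv")
    # col_list names, per dataframe, the column holding residue identities
    # (one- or three-letter codes).
    return merge_df_by_aa([df_a, df_b], ["PDB_AA", "wt_aa"])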