def calcDistances(pdb_name, input_file, output_dir):
    """Write the pairwise side-chain distance matrix of a PDB structure to CSV.

    For every residue in the structure the centroid (unweighted mean of the
    atom coordinates) of its side-chain atoms is computed.  Residues with no
    side-chain atoms (e.g. glycine) fall back to their alpha-carbon position.
    The Euclidean distance between every pair of centroids is then written to
    ``<output_dir>/<pdb_name>_dist.csv``.

    The CSV layout is: a header row starting with ``"PDB_AA"`` followed by one
    label per residue (residue number + ``resdict[resname]``), then one row per
    residue starting with its one-letter code followed by its distance to every
    residue (``0.0`` on the diagonal).

    Args:
        pdb_name: Name used in log messages and as the output file prefix.
        input_file: Path (or handle) of the PDB file handed to ``PDBParser``.
        output_dir: Directory the ``*_dist.csv`` file is written to.

    Raises:
        SystemExit: If any residue has no ``CA`` atom; a diagnostic is printed
            first.
    """
    backbone_atoms = ("C", "CA", "O", "N")

    structure = PDBParser().get_structure("X", input_file)
    residues = list(structure.get_residues())

    resnam = []  # Three-letter residue names, in structure order
    centroids = []  # One side-chain centroid (or CA fallback) per residue

    for residue in residues:
        resname = residue.resname
        resid = residue.get_id()[1]
        resnam.append(resname)

        ca_coord = None
        sidechain_coords = []
        # Walk only this residue's own atoms.  Residue ids are unique only
        # within a chain, so matching atoms across the whole structure by
        # parent id would pull in atoms from same-numbered residues of other
        # chains/models (and costs O(residues * atoms)).
        for atom in residue:
            if atom.name == "CA":
                ca_coord = atom.get_coord()
            elif atom.name not in backbone_atoms:
                sidechain_coords.append(atom.get_coord())

        if ca_coord is None:
            print(
                "\nERROR: ",
                pdb_name,
                "has a missing alpha carbon at residue ",
                resid,
                ". Please manually check PDB file for extraneous structural information "
                "(e.g. partially resolved residues, bound DNA or ligands, etc.). "
                "Distances cannot be calculated.\n",
            )
            sys.exit()

        if sidechain_coords:
            # Unweighted centroid of the side-chain atoms.
            centroids.append(sum(sidechain_coords) / float(len(sidechain_coords)))
        else:
            print("Missing side chain in residue: ", resid, resname, ", possibly a GLY:", pdb_name)
            centroids.append(ca_coord)

    # NOTE(review): ph.convert_to_one_letter_code and resdict are defined
    # outside this block; both are assumed to map three-letter residue names
    # to one-letter codes -- confirm against their definitions.
    single_letter_aa = ph.convert_to_one_letter_code(resnam)

    header = ["PDB_AA"] + [str(res.get_id()[1]) + resdict[res.get_resname()] for res in residues]
    distSC = [header]
    for i, ci in enumerate(centroids):
        row = [single_letter_aa[i]]
        for j, cj in enumerate(centroids):
            if i == j:
                row.append(0.0)
            else:
                row.append(
                    math.sqrt((ci[0] - cj[0]) ** 2 + (ci[1] - cj[1]) ** 2 + (ci[2] - cj[2]) ** 2)
                )
        distSC.append(row)

    # csv.writer needs a text-mode handle on Python 3; newline="" lets the csv
    # module control line endings itself.
    with open(output_dir + "/" + pdb_name + "_dist.csv", "w", newline="") as output_file:
        csv_writer = csv.writer(output_file, delimiter=",")
        csv_writer.writerows(distSC)
def merge_df_by_aa(all_dfs, col_list, retain_gaps=False):
    """Align dataframes on their amino-acid sequences and merge them column-wise.

    Each dataframe contributes the sequence found in its amino-acid column.
    The sequences are aligned with ``align_seqs_mafft`` and every dataframe is
    then re-indexed by alignment position, so rows that describe the same
    aligned residue end up on the same row of the result.

    Args:
        all_dfs: Sequence of pandas DataFrames, one row per residue.
        col_list: For each dataframe in ``all_dfs`` (same order), the name of
            the column holding the amino acids, as either one-letter or
            three-letter codes (decided from the first value in the column).
        retain_gaps: If false (default), rows containing any NaN -- i.e.
            alignment positions where at least one dataframe has no row --
            are dropped from the result.

    Returns:
        A DataFrame indexed by alignment position.  The first columns hold the
        aligned sequences (one column per input, named from the alignment
        headers), followed by the original columns of every input dataframe.

    Raises:
        ValueError: If the first entry of an amino-acid column is neither a
            one-letter nor a three-letter code.
    """
    # NOTE(review): the alignment is written to a fixed file name in the
    # current working directory, so concurrent calls would clobber each other.
    align_outfile = "temp.txt"
    all_seqs = []
    all_headers = []

    for i, df in enumerate(all_dfs):
        seq_list = df[col_list[i]].values

        # Build a one-letter sequence string from the amino-acid column.
        code_length = len(seq_list[0])
        if code_length == 3:
            seq = ''.join(ph.convert_to_one_letter_code(seq_list))
        elif code_length == 1:
            seq = ''.join(seq_list)
        else:
            raise ValueError("Could not recognize amino acid column in dataframe.")

        # Collect sequences and FASTA-style headers for mafft processing.
        all_seqs.append(seq)
        all_headers.append('>AA_' + str(i))

    # Align sequences with mafft, then read the aligned sequences back.
    align_seqs_mafft(all_seqs, all_headers, align_outfile)
    (seqs, headers) = ph.get_sequences(align_outfile)

    # One column per aligned sequence, one row per alignment position.
    # The leading character of each header is stripped to form the column name.
    aligned_seqs = dict()
    for i in range(len(all_dfs)):
        aligned_seqs[headers[i][1:]] = list(seqs[i])
    aligned_df = pd.DataFrame(data=aligned_seqs)

    # Re-index every input dataframe by alignment position.
    partial_dfs = []
    for s, df in enumerate(all_dfs):
        row_length = len(df.index)
        current_col = all_headers[s][1:]

        # Building a dict of rows and converting it once is much faster than
        # appending rows to an existing DataFrame.
        new_df_dict = dict()
        j = 0  # Next unconsumed row of the original dataframe
        for index, residue in aligned_df[current_col].items():
            if j >= row_length:
                break
            # Gap positions ('-') consume no row from the original dataframe.
            if residue != '-':
                new_df_dict[index] = df.iloc[j, :].values
                j += 1

        new_df = pd.DataFrame.from_dict(new_df_dict, orient='index')
        # Restore the original column names.
        new_df.columns = df.columns
        partial_dfs.append(new_df)

    # Aligned sequences go first, followed by each re-indexed dataframe
    # (column-wise concatenation, like cbind in R).
    partial_dfs.insert(0, aligned_df)
    aligned_df = pd.concat(partial_dfs, axis=1)

    # Remove any rows containing NaN unless gaps were requested.
    if not retain_gaps:
        aligned_df = aligned_df.dropna()

    return aligned_df