def wrapperProfileGraph(parentFile, contactFile):
    '''
    Draw a bar graph of the SCHEMA disruption at each recombination point.

    Every residue index of the first parent is tried as a single crossover
    point, once for each pairing of the first parent with another parent.
    The disruption values collected per residue are summarised as a mean and
    a standard deviation and handed to makeBarGraph.

    parentFile  -- path to the multiple sequence alignment of the parents.
    contactFile -- path to the contact file; the graph label is the four
                   characters preceding the first underscore in this path.
    '''
    pdbName = contactFile.split('_')[0][-4:]

    # Close the input files as soon as they are parsed instead of leaking
    # the handles. The parent list is built inside the block in case the
    # reader yields its records lazily.
    with open(parentFile, 'r') as msa_handle:
        parent_list = schema.readMultipleSequenceAlignmentFile(msa_handle)
        parents = [p for (k, p) in parent_list]
    # NOTE(review): assumes readContactFile fully consumes the handle before
    # returning -- confirm against the schema module.
    with open(contactFile, 'r') as contact_handle:
        pdb_contacts = schema.readContactFile(contact_handle)

    # One list of disruption values per residue of the first parent.
    clash_data = [[] for _ in parents[0]]

    for i in range(1, len(parents)):
        print(i)
        # This reshuffles the alignment to make the first and second
        # sequences the ones analysed. It was needed as SCHEMA is limited
        # to 9 sequences.
        others = [parents[x] for x in range(1, len(parents)) if x != i]
        newList = [parents[0], parents[i]] + others

        # Graphs for hotspots: one single-crossover evaluation per residue.
        for residue in range(len(parents[0])):
            crossovers = [residue]
            contacts = schema.getSCHEMAContactsWithCrossovers(
                pdb_contacts, newList, crossovers)
            fragments = schema.getFragments(crossovers, parents[0])
            # '21' is the chimera block pattern passed to the scorer: second
            # sequence for the first fragment, first sequence for the second.
            clash_data[residue].append(
                schema.getChimeraDisruption('21', contacts, fragments,
                                            newList))

    means = [np.mean(values) for values in clash_data]
    StDev = [np.std(values) for values in clash_data]
    makeBarGraph(means, StDev, pdbName)
def main(args):
    """Score randomly generated chimera libraries and report <E> and <m>.

    Reads the parent alignment and the contact file named in the parsed
    arguments, draws random crossover sets on the parents with identical
    sites collapsed (so the libraries are consistent with RASPP), and writes
    one tab-separated line per library: average disruption <E>, average
    mutation <m> and the crossover points. Output goes to ARG_OUTPUT_FILE
    when given, otherwise to standard output.

    Defaults: 1000 libraries, minimum fragment length 4, and every chimera of
    the library evaluated unless ARG_MAX_CHIMERAS_PER_LIBRARY caps it.

    args -- raw argument list; args[0] is the path of the invoking script.

    Returns None. Aborts with a printed message when the collapsed alignment
    is too short for the requested number of fragments.
    """
    arg_dict = parse_arguments(args)
    if not confirm_arguments(arg_dict):
        # Only show usage when run as the script itself.
        if args[0].split(os.path.sep)[-1] == "schemarandom.py":
            print_usage(args)
        return

    # Read the alignment file to create a list of parents, in file order.
    # The handles are closed as soon as parsing is done.
    with open(arg_dict[ARG_MULTIPLE_SEQUENCE_ALIGNMENT_FILE], "r") as msa_handle:
        parent_list = schema.readMultipleSequenceAlignmentFile(msa_handle)
        parents = [p for (k, p) in parent_list]

    # NOTE(review): assumes readContactFile reads eagerly -- confirm.
    with open(arg_dict[ARG_CONTACT_FILE], "r") as contact_handle:
        pdb_contacts = schema.readContactFile(contact_handle)

    # Establish the output: a file if requested, else standard output.
    writes_to_file = ARG_OUTPUT_FILE in arg_dict
    if writes_to_file:
        output_file = open(arg_dict[ARG_OUTPUT_FILE], "w")
    else:
        output_file = sys.stdout

    try:
        if ARG_NUM_LIBRARIES in arg_dict:
            num_libraries = int(arg_dict[ARG_NUM_LIBRARIES])
        else:
            num_libraries = int(1e3)

        if ARG_MIN_FRAGMENT_SIZE in arg_dict:
            min_length = int(arg_dict[ARG_MIN_FRAGMENT_SIZE])
        else:
            min_length = 4

        # One more fragment than there are crossovers.
        num_fragments = int(arg_dict[ARG_NUM_CROSSOVERS]) + 1
        num_parents = len(parents)
        library_size = num_parents ** num_fragments

        if ARG_MAX_CHIMERAS_PER_LIBRARY in arg_dict:
            max_chimeras = min(library_size,
                               int(arg_dict[ARG_MAX_CHIMERAS_PER_LIBRARY]))
        else:
            max_chimeras = library_size

        if ARG_RANDOM_SEED in arg_dict:
            random.seed(int(arg_dict[ARG_RANDOM_SEED]))

        # Make libraries consistent with RASPP.
        (new_parents, identical_sites) = raspp.collapse_parents(parents)
        collapsed_length = len(new_parents[0])
        if collapsed_length < num_fragments * min_length:
            error_msg = (
                "Minimum diversity length of %d is too large.\n%d "
                + "fragments with diversity %d cannot be found in a "
                + "sequence of length %d (with identities removed). Aborting..."
            )
            # Report the collapsed length: that is the value the check above
            # uses and the one the message describes.
            print(error_msg % (min_length, num_fragments, min_length,
                               collapsed_length))
            return

        def block_pattern(index):
            # Turn index into a chimera block pattern
            # (e.g., 0 -> '11111111', 1 -> '11111112', 2 -> '11111113'...).
            digits = schema.base(index, num_parents)
            padding = ["1"] * (num_fragments - len(digits))
            return "".join(padding + ["%d" % (int(x) + 1,) for x in digits])

        start_time = time.clock()
        output_file.write("# <E>\t<m>\tcrossover points\n")

        # Draw every crossover set first, then score them, so the order of
        # random-number consumption stays fixed for a given seed.
        random_crossovers = []
        for libnum in range(num_libraries):
            crossovers = schema.generateRandomCrossovers(
                collapsed_length, num_fragments - 1, min_length)
            crossovers = raspp.translate_collapsed_indices(
                crossovers, identical_sites)
            random_crossovers.append(crossovers)

        for crossovers in random_crossovers:
            fragments = schema.getFragments(crossovers, parents[0])
            filtered_contacts = schema.getSCHEMAContactsWithCrossovers(
                pdb_contacts, parents, crossovers)

            if max_chimeras < library_size:
                # Assemble a random sample of chimeras, with replacement.
                all_chimeras = []
                for n_chim in range(max_chimeras):
                    chim_index = random.randint(0, library_size - 1)
                    all_chimeras.append(block_pattern(chim_index))
            else:
                # Covering the whole library: the number of parents and
                # fragments specifies all possible chimeras regardless of
                # crossover positions, so generate them all, then shuffle.
                all_chimeras = [block_pattern(i) for i in range(library_size)]
                random.shuffle(all_chimeras)

            # Calculate average E and m for the library or subsample.
            E_values = []
            m_values = []
            for chimera_blocks in all_chimeras:
                E_values.append(schema.getChimeraDisruption(
                    chimera_blocks, filtered_contacts, fragments, parents))
                m_values.append(schema.getChimeraShortestDistance(
                    chimera_blocks, fragments, parents))
            average_E = schema.mean(E_values)
            average_m = schema.mean(m_values)

            xover_str = ("%d " * len(crossovers)) % tuple(crossovers)
            output_file.write("%1.4f\t%1.4f\t%s\n"
                              % (average_E, average_m, xover_str))
            output_file.flush()

        total_time = time.clock() - start_time
        output_file.write(
            "# Finished in %1.2f seconds (%d libraries, %d chimeras)\n"
            % (total_time, num_libraries, num_libraries * max_chimeras)
        )
    finally:
        # Close the output file on every exit path, including the early
        # abort above; never close standard output.
        if writes_to_file:
            output_file.close()
def main(args):
    """Write SCHEMA disruption (E) and mutation (m) values for chimeras.

    Reads the parent alignment, crossover file and contact file named in the
    parsed arguments. If ARG_CHIMERAS is given it may be a list of chimera
    block patterns, the path of a file holding one pattern per line, or a
    single pattern; each is passed to outputEnergies. Without ARG_CHIMERAS
    every possible chimera is enumerated and the averages of the requested
    values are appended. Output goes to ARG_OUTPUT_FILE when given, otherwise
    to standard output.

    args -- raw argument list; args[0] is the path of the invoking script.

    Returns None.
    """
    arg_dict = parse_arguments(args)
    if not confirm_arguments(arg_dict):
        # Only show usage when run as the script itself.
        if args[0].split(os.path.sep)[-1] == "schemaenergy.py":
            print_usage(args)
        return

    # Which columns the user asked for.
    print_E = ARG_PRINT_E in arg_dict
    print_m = ARG_PRINT_M in arg_dict

    # Read the alignment file to create a list of parents, in file order.
    # Input handles are closed as soon as parsing is done.
    with open(arg_dict[ARG_MULTIPLE_SEQUENCE_ALIGNMENT_FILE], 'r') as msa_handle:
        parent_list = schema.readMultipleSequenceAlignmentFile(msa_handle)
        parents = [p for (k, p) in parent_list]

    # NOTE(review): assumes the crossover and contact readers consume their
    # handles eagerly -- confirm against the schema module.
    with open(arg_dict[ARG_CROSSOVER_FILE], 'r') as crossover_handle:
        crossovers = schema.readCrossoverFile(crossover_handle)
    fragments = schema.getFragments(crossovers, parents[0])

    # Get the contacts.
    with open(arg_dict[ARG_CONTACT_FILE], 'r') as contact_handle:
        pdb_contacts = schema.readContactFile(contact_handle)
    contacts = schema.getSCHEMAContactsWithCrossovers(pdb_contacts, parents,
                                                      crossovers)

    writes_to_file = ARG_OUTPUT_FILE in arg_dict
    if writes_to_file:
        output_file = open(arg_dict[ARG_OUTPUT_FILE], 'w')
    else:
        output_file = sys.stdout

    try:
        # Build the per-chimera format string and the matching header line.
        output_string = '%s'
        output_file.write('# chimera')
        if print_E:
            output_string += '\t%d'
            output_file.write('\tE')
        if print_m:
            output_string += '\t%d'
            output_file.write('\tm')
        output_string += '\n'
        output_file.write('\n')

        if ARG_CHIMERAS in arg_dict:
            # Could be a) a chimera, b) a list of chimeras, or c) a file of
            # chimeras.
            chimeras = arg_dict[ARG_CHIMERAS]
            if isinstance(chimeras, list):
                for chimera_blocks in chimeras:
                    outputEnergies(chimera_blocks, contacts, fragments,
                                   parents, output_file, output_string,
                                   print_E, print_m)
            elif os.path.isfile(chimeras):
                # One chimera per line; close the list file when done.
                with open(chimeras, 'r') as chimera_handle:
                    for line in chimera_handle:
                        outputEnergies(line.strip(), contacts, fragments,
                                       parents, output_file, output_string,
                                       print_E, print_m)
            else:
                # It's a single chimera sequence.
                outputEnergies(chimeras, contacts, fragments, parents,
                               output_file, output_string, print_E, print_m)
        else:
            # Enumerate all possible chimeras with their disruption and
            # mutation values.
            p = len(parents)
            n = len(fragments)
            Es = []
            ms = []
            for i in xrange(p ** n):
                # The next lines turn i into a chimera block pattern
                # (e.g., 0 -> '11111111', 1 -> '11111112', 2 -> '11111113'...)
                n2c = schema.base(i, p)
                chimera_blocks = ''.join(
                    ['1'] * (n - len(n2c)) + ['%d' % (int(x) + 1,) for x in n2c])
                # NOTE(review): assumes outputEnergies returns an (E, m) pair
                # for every enumerated pattern -- confirm.
                (E, m) = outputEnergies(chimera_blocks, contacts, fragments,
                                        parents, output_file, output_string,
                                        print_E, print_m)
                if print_E:
                    Es.append(E)
                if print_m:
                    ms.append(m)
            if print_E:
                output_file.write("# Average disruption <E> = %1.4f\n"
                                  % schema.mean(Es))
            if print_m:
                output_file.write("# Average mutation <m> = %1.4f\n"
                                  % schema.mean(ms))
    finally:
        # Close the output file on every exit path; never close stdout.
        if writes_to_file:
            output_file.close()
def main(args):
    """Align the residues of a PDB structure with a parent sequence.

    Reads the parent multiple sequence alignment and the PDB file named in
    the parsed arguments and determines which aligned sequences relate the
    structure to a parent:

    * without ARG_PDB_ALIGNMENT_FILE, the ID code read from the PDB file must
      be the name of a sequence in the alignment;
    * with it, the PDB/parent alignment must hold a sequence whose name also
      appears in the parent alignment, plus the PDB sequence.

    After consistency checks the residues are aligned with
    schema.alignPDBResidues.

    args -- raw argument list; args[0] is the path of the invoking script.

    Returns None; every failure is reported with a printed message.
    NOTE(review): the aligned residues are not used after the final call in
    the code visible here -- confirm whether more of the function is missing.
    """
    arg_dict = parse_arguments(args)
    if not confirm_arguments(arg_dict):
        # Only show usage when run as the script itself.
        if args[0].split(os.path.sep)[-1] == "schemacontacts.py":
            print_usage(args)
        return

    # Inputs: the PDB file name and the alignment/fragment file name.
    pdb_file = arg_dict[ARG_PDB_FILE]
    msa_file = arg_dict[ARG_MULTIPLE_SEQUENCE_ALIGNMENT_FILE]

    # The optional alignment between the reference parent and the sequence
    # in the PDB file. Without it, the program assumes the ID of the PDB
    # structure corresponds to one of the sequence IDs in the MSA.
    parent_pdb_alignment_file = None
    if ARG_PDB_ALIGNMENT_FILE in arg_dict:
        if not os.path.isfile(arg_dict[ARG_PDB_ALIGNMENT_FILE]):
            print(" Can't find PDB/parent alignment file %s"
                  % arg_dict[ARG_PDB_ALIGNMENT_FILE])
            return
        parent_pdb_alignment_file = arg_dict[ARG_PDB_ALIGNMENT_FILE]
    else:
        with open(pdb_file, 'r') as pdb_handle:
            pdb_key = pdbmod.File().getIDCode(pdb_handle)

    # The PDB chains to evaluate. Most often chain 'A' (multiple chains) or
    # chain ' ' (only one chain) is the appropriate choice; ' ' is always
    # included.
    if ARG_CHAINS in arg_dict:
        chains = arg_dict[ARG_CHAINS]
        if isinstance(chains, list):
            chain_identifiers = chains + [' ']
        else:
            chain_identifiers = [chains, ' ']
    else:
        chain_identifiers = ['A', ' ']

    if ARG_FORMAT in arg_dict:
        msa_format = arg_dict[ARG_FORMAT]
    else:
        msa_format = 'fasta'

    # Read the parent alignment. dict() matches how the PDB/parent alignment
    # read by the same function is treated below, and makes the name lookups
    # valid whether the reader returns (name, sequence) pairs or a mapping.
    with open(msa_file, 'r') as msa_handle:
        parent_dict = dict(
            schema.readMultipleSequenceAlignmentFile(msa_handle, msa_format))

    # Read in the PDB file to create a list of residues.
    # NOTE(review): assumes read() consumes the handle eagerly -- confirm.
    with open(pdb_file, 'r') as pdb_handle:
        residues = pdbmod.File().read(pdb_handle)

    # Because the PDB file's residue sequence may differ from those of the
    # parents, we must align the PDB residues to one parent.
    if not parent_pdb_alignment_file:
        # Just get the PDB sequence from the multiple sequence alignment.
        try:
            aligned_pdb = parent_dict[pdb_key]
            aligned_prot = parent_dict[pdb_key]
        except KeyError:
            print("Could not find sequence %s in the multiple sequence alignment file %s. Aborting..."
                  % (pdb_key, msa_file))
            return
    else:
        # Find the sequence with the same key in both the parent MSA file and
        # the parent/PDB alignment file.
        with open(parent_pdb_alignment_file, 'r') as alignment_handle:
            pdb_parent_seq_dict = dict(
                schema.readMultipleSequenceAlignmentFile(alignment_handle,
                                                         msa_format))

        # Bail out if there are fewer than 2 sequences.
        found_names = list(pdb_parent_seq_dict)
        if len(found_names) < 2:
            if not found_names:
                # An empty alignment used to crash on keys()[0].
                print("Found no sequences in the PDB/parent alignment. Aborting...")
            else:
                print("Only found one uniquely named sequence in the PDB/parent alignment, %s. Aborting..."
                      % found_names[0])
            return

        # Find the matching key (the last match wins, as before).
        pdb_key = None
        for k in parent_dict.keys():
            if k in pdb_parent_seq_dict:
                pdb_key = k
        # Bail out if no matching key is found.
        if not pdb_key:
            # The message has two placeholders; supply both name lists so it
            # prints instead of raising TypeError.
            print("Could not find parents %s in PDB/parent aligned sequences %s. Aborting..."
                  % (list(parent_dict), found_names))
            return

        aligned_prot = pdb_parent_seq_dict[pdb_key]
        # Remove the matched sequence and take the first remaining one as
        # the PDB sequence.
        del pdb_parent_seq_dict[pdb_key]
        aligned_pdb = list(pdb_parent_seq_dict.values())[0]

    # Check that the parent sequence from both alignment files matches.
    if aligned_prot.replace('-', '') != parent_dict[pdb_key].replace('-', ''):
        print("The PDB-aligned parent and the named parent, %s, don't match! Aborting..."
              % (pdb_key,))
        return

    # Check that the aligned PDB sequence matches the residue sequence pulled
    # directly from the PDB file.
    if aligned_pdb.replace('-', '') != pdbmod.sequence(residues, chain_identifiers):
        print("The parent-aligned PDB sequence, %s, and the PDB file sequence, chain(s) %s in %s, don't match! Aborting..."
              % (pdb_key, chain_identifiers, pdb_file))
        return

    # Align the residues with the parent protein.
    try:
        residues = schema.alignPDBResidues(residues, aligned_prot, aligned_pdb,
                                           parent_dict[pdb_key],
                                           chain_identifiers)
    except ValueError as ve:
        print(ve)
        return
def main(args):
    """Run RASPP and write the resulting (<E>, <m>) curve.

    Reads the parent alignment and contact file named in the parsed
    arguments, computes average energies, runs raspp.RASPP for the requested
    number of crossovers and minimum fragment length, bins the results with
    raspp.curve and writes one tab-separated line per curve point: <E>, <m>
    and the crossover points. Timing and count comments are written as lines
    starting with '#'. Output goes to ARG_OUTPUT_FILE when given, otherwise
    to standard output.

    Defaults (announced in the output): minimum fragment length 4 and bin
    width 1.0.

    args -- raw argument list; args[0] is the path of the invoking script.

    Returns None. Aborts with a printed message when the alignment with
    identical sites collapsed is too short for the requested fragments.
    """
    arg_dict = parse_arguments(args)
    if not confirm_arguments(arg_dict):
        # Only show usage when run as the script itself.
        if args[0].split(os.path.sep)[-1] == "rasppcurve.py":
            print_usage(args)
        return

    # Read the alignment file to create a list of parents, in file order.
    # Input handles are closed as soon as parsing is done.
    with open(arg_dict[ARG_MULTIPLE_SEQUENCE_ALIGNMENT_FILE], 'r') as msa_handle:
        parent_list = schema.readMultipleSequenceAlignmentFile(msa_handle)
        parents = [p for (k, p) in parent_list]

    # NOTE(review): assumes readContactFile reads eagerly -- confirm.
    with open(arg_dict[ARG_CONTACT_FILE], 'r') as contact_handle:
        pdb_contacts = schema.readContactFile(contact_handle)

    # Establish the output: a file if requested, else standard output.
    writes_to_file = ARG_OUTPUT_FILE in arg_dict
    if writes_to_file:
        output_file = open(arg_dict[ARG_OUTPUT_FILE], 'w')
    else:
        output_file = sys.stdout

    try:
        # Get the minimum fragment size.
        if ARG_MIN_FRAGMENT_SIZE in arg_dict:
            min_length = int(arg_dict[ARG_MIN_FRAGMENT_SIZE])
        else:
            output_file.write(
                "# No minimum fragment length specified; using L=4.\n")
            min_length = 4

        # Get the bin width.
        if ARG_BIN_WIDTH in arg_dict:
            bin_width = float(arg_dict[ARG_BIN_WIDTH])
        else:
            output_file.write(
                "# No bin width specified; using bin width=1.0.\n")
            bin_width = 1.0

        # One more fragment than there are crossovers.
        num_fragments = int(arg_dict[ARG_NUM_CROSSOVERS]) + 1

        # Make libraries consistent with RASPP.
        (new_parents, _identical_sites) = raspp.collapse_parents(parents)
        collapsed_length = len(new_parents[0])
        if collapsed_length < num_fragments * min_length:
            error_msg = (
                "Minimum fragment length of %d is too large.\n%d "
                + "fragments with length %d cannot be found in a "
                + "sequence of length %d (with identities removed). Aborting..."
            )
            # Report the collapsed length: that is the value the check above
            # uses and the one the message describes.
            print(error_msg % (min_length, num_fragments, min_length,
                               collapsed_length))
            return

        contacts = schema.getSCHEMAContacts(pdb_contacts, parents)
        energies = raspp.make_4d_energies(contacts, parents)
        avg_energies = raspp.calc_average_energies(energies, parents)

        tstart = time.clock()
        res = raspp.RASPP(avg_energies, parents, num_fragments - 1, min_length)
        output_file.write("# RASPP took %1.2f secs\n" % (time.clock() - tstart,))
        output_file.write("# RASPP found %d results\n" % (len(res),))

        tstart = time.clock()
        curve = raspp.curve(res, parents, bin_width)
        output_file.write("# RASPP found %d unique (<E>,<m>) points\n"
                          % (len(curve),))
        output_file.write("# RASPP curve took %1.2f secs\n"
                          % (time.clock() - tstart,))

        output_file.write("# <E>\t<m>\tcrossover points\n")
        for (average_E, average_m, crossovers) in curve:
            xover_str = ('%d ' * len(crossovers)) % tuple(crossovers)
            output_file.write('%1.4f\t%1.4f\t%s\n'
                              % (average_E, average_m, xover_str))
    finally:
        # Close the output file on every exit path, including the early
        # abort above; never close standard output.
        if writes_to_file:
            output_file.close()
def main(args):
    """Align the residues of a PDB structure with a parent sequence.

    Reads the parent multiple sequence alignment and the PDB file named in
    the parsed arguments and determines which aligned sequences relate the
    structure to a parent:

    * without ARG_PDB_ALIGNMENT_FILE, the ID code read from the PDB file must
      be the name of a sequence in the alignment;
    * with it, the PDB/parent alignment must hold a sequence whose name also
      appears in the parent alignment, plus the PDB sequence.

    After consistency checks the residues are aligned with
    schema.alignPDBResidues.

    args -- raw argument list; args[0] is the path of the invoking script.

    Returns None; every failure is reported with a printed message.
    NOTE(review): the aligned residues are not used after the final call in
    the code visible here -- confirm whether more of the function is missing.
    """
    arg_dict = parse_arguments(args)
    if not confirm_arguments(arg_dict):
        # Only show usage when run as the script itself.
        if args[0].split(os.path.sep)[-1] == "schemacontacts.py":
            print_usage(args)
        return

    # Inputs: the PDB file name and the alignment/fragment file name.
    pdb_file = arg_dict[ARG_PDB_FILE]
    msa_file = arg_dict[ARG_MULTIPLE_SEQUENCE_ALIGNMENT_FILE]

    # The optional alignment between the reference parent and the sequence
    # in the PDB file. Without it, the program assumes the ID of the PDB
    # structure corresponds to one of the sequence IDs in the MSA.
    parent_pdb_alignment_file = None
    if ARG_PDB_ALIGNMENT_FILE in arg_dict:
        if not os.path.isfile(arg_dict[ARG_PDB_ALIGNMENT_FILE]):
            print(" Can't find PDB/parent alignment file %s"
                  % arg_dict[ARG_PDB_ALIGNMENT_FILE])
            return
        parent_pdb_alignment_file = arg_dict[ARG_PDB_ALIGNMENT_FILE]
    else:
        with open(pdb_file, 'r') as pdb_handle:
            pdb_key = pdbmod.File().getIDCode(pdb_handle)

    # The PDB chains to evaluate. Most often chain 'A' (multiple chains) or
    # chain ' ' (only one chain) is the appropriate choice; ' ' is always
    # included.
    if ARG_CHAINS in arg_dict:
        chains = arg_dict[ARG_CHAINS]
        if isinstance(chains, list):
            chain_identifiers = chains + [' ']
        else:
            chain_identifiers = [chains, ' ']
    else:
        chain_identifiers = ['A', ' ']

    if ARG_FORMAT in arg_dict:
        msa_format = arg_dict[ARG_FORMAT]
    else:
        msa_format = 'fasta'

    # Read the parent alignment. dict() matches how the PDB/parent alignment
    # read by the same function is treated below, and makes the name lookups
    # valid whether the reader returns (name, sequence) pairs or a mapping.
    with open(msa_file, 'r') as msa_handle:
        parent_dict = dict(
            schema.readMultipleSequenceAlignmentFile(msa_handle, msa_format))

    # Read in the PDB file to create a list of residues.
    # NOTE(review): assumes read() consumes the handle eagerly -- confirm.
    with open(pdb_file, 'r') as pdb_handle:
        residues = pdbmod.File().read(pdb_handle)

    # Because the PDB file's residue sequence may differ from those of the
    # parents, we must align the PDB residues to one parent.
    if not parent_pdb_alignment_file:
        # Just get the PDB sequence from the multiple sequence alignment.
        try:
            aligned_pdb = parent_dict[pdb_key]
            aligned_prot = parent_dict[pdb_key]
        except KeyError:
            print("Could not find sequence %s in the multiple sequence alignment file %s. Aborting..."
                  % (pdb_key, msa_file))
            return
    else:
        # Find the sequence with the same key in both the parent MSA file and
        # the parent/PDB alignment file.
        with open(parent_pdb_alignment_file, 'r') as alignment_handle:
            pdb_parent_seq_dict = dict(
                schema.readMultipleSequenceAlignmentFile(alignment_handle,
                                                         msa_format))

        # Bail out if there are fewer than 2 sequences.
        found_names = list(pdb_parent_seq_dict)
        if len(found_names) < 2:
            if not found_names:
                # An empty alignment used to crash on keys()[0].
                print("Found no sequences in the PDB/parent alignment. Aborting...")
            else:
                print("Only found one uniquely named sequence in the PDB/parent alignment, %s. Aborting..."
                      % found_names[0])
            return

        # Find the matching key (the last match wins, as before).
        pdb_key = None
        for k in parent_dict.keys():
            if k in pdb_parent_seq_dict:
                pdb_key = k
        # Bail out if no matching key is found.
        if not pdb_key:
            # The message has two placeholders; supply both name lists so it
            # prints instead of raising TypeError.
            print("Could not find parents %s in PDB/parent aligned sequences %s. Aborting..."
                  % (list(parent_dict), found_names))
            return

        aligned_prot = pdb_parent_seq_dict[pdb_key]
        # Remove the matched sequence and take the first remaining one as
        # the PDB sequence.
        del pdb_parent_seq_dict[pdb_key]
        aligned_pdb = list(pdb_parent_seq_dict.values())[0]

    # Check that the parent sequence from both alignment files matches.
    if aligned_prot.replace('-', '') != parent_dict[pdb_key].replace('-', ''):
        print("The PDB-aligned parent and the named parent, %s, don't match! Aborting..."
              % (pdb_key,))
        return

    # Check that the aligned PDB sequence matches the residue sequence pulled
    # directly from the PDB file.
    if aligned_pdb.replace('-', '') != pdbmod.sequence(residues, chain_identifiers):
        print("The parent-aligned PDB sequence, %s, and the PDB file sequence, chain(s) %s in %s, don't match! Aborting..."
              % (pdb_key, chain_identifiers, pdb_file))
        return

    # Align the residues with the parent protein.
    try:
        residues = schema.alignPDBResidues(residues, aligned_prot, aligned_pdb,
                                           parent_dict[pdb_key],
                                           chain_identifiers)
    except ValueError as ve:
        print(ve)
        return
def main(args):
    """Run RASPP and write the resulting (<E>, <m>) curve.

    Reads the parent alignment and contact file named in the parsed
    arguments, computes average energies, runs raspp.RASPP for the requested
    number of crossovers and minimum fragment length, bins the results with
    raspp.curve and writes one tab-separated line per curve point: <E>, <m>
    and the crossover points. Timing and count comments are written as lines
    starting with '#'. Output goes to ARG_OUTPUT_FILE when given, otherwise
    to standard output.

    Defaults (announced in the output): minimum fragment length 4 and bin
    width 1.0.

    args -- raw argument list; args[0] is the path of the invoking script.

    Returns None. Aborts with a printed message when the alignment with
    identical sites collapsed is too short for the requested fragments.
    """
    arg_dict = parse_arguments(args)
    if not confirm_arguments(arg_dict):
        # Only show usage when run as the script itself.
        if args[0].split(os.path.sep)[-1] == "rasppcurve.py":
            print_usage(args)
        return

    # Read the alignment file to create a list of parents, in file order.
    # Input handles are closed as soon as parsing is done.
    with open(arg_dict[ARG_MULTIPLE_SEQUENCE_ALIGNMENT_FILE], 'r') as msa_handle:
        parent_list = schema.readMultipleSequenceAlignmentFile(msa_handle)
        parents = [p for (k, p) in parent_list]

    # NOTE(review): assumes readContactFile reads eagerly -- confirm.
    with open(arg_dict[ARG_CONTACT_FILE], 'r') as contact_handle:
        pdb_contacts = schema.readContactFile(contact_handle)

    # Establish the output: a file if requested, else standard output.
    writes_to_file = ARG_OUTPUT_FILE in arg_dict
    if writes_to_file:
        output_file = open(arg_dict[ARG_OUTPUT_FILE], 'w')
    else:
        output_file = sys.stdout

    try:
        # Get the minimum fragment size.
        if ARG_MIN_FRAGMENT_SIZE in arg_dict:
            min_length = int(arg_dict[ARG_MIN_FRAGMENT_SIZE])
        else:
            output_file.write(
                "# No minimum fragment length specified; using L=4.\n")
            min_length = 4

        # Get the bin width.
        if ARG_BIN_WIDTH in arg_dict:
            bin_width = float(arg_dict[ARG_BIN_WIDTH])
        else:
            output_file.write(
                "# No bin width specified; using bin width=1.0.\n")
            bin_width = 1.0

        # One more fragment than there are crossovers.
        num_fragments = int(arg_dict[ARG_NUM_CROSSOVERS]) + 1

        # Make libraries consistent with RASPP.
        (new_parents, _identical_sites) = raspp.collapse_parents(parents)
        collapsed_length = len(new_parents[0])
        if collapsed_length < num_fragments * min_length:
            error_msg = (
                "Minimum fragment length of %d is too large.\n%d "
                + "fragments with length %d cannot be found in a "
                + "sequence of length %d (with identities removed). Aborting..."
            )
            # Report the collapsed length: that is the value the check above
            # uses and the one the message describes.
            print(error_msg % (min_length, num_fragments, min_length,
                               collapsed_length))
            return

        contacts = schema.getSCHEMAContacts(pdb_contacts, parents)
        energies = raspp.make_4d_energies(contacts, parents)
        avg_energies = raspp.calc_average_energies(energies, parents)

        tstart = time.clock()
        res = raspp.RASPP(avg_energies, parents, num_fragments - 1, min_length)
        output_file.write("# RASPP took %1.2f secs\n" % (time.clock() - tstart,))
        output_file.write("# RASPP found %d results\n" % (len(res),))

        tstart = time.clock()
        curve = raspp.curve(res, parents, bin_width)
        output_file.write("# RASPP found %d unique (<E>,<m>) points\n"
                          % (len(curve),))
        output_file.write("# RASPP curve took %1.2f secs\n"
                          % (time.clock() - tstart,))

        output_file.write("# <E>\t<m>\tcrossover points\n")
        for (average_E, average_m, crossovers) in curve:
            xover_str = ('%d ' * len(crossovers)) % tuple(crossovers)
            output_file.write('%1.4f\t%1.4f\t%s\n'
                              % (average_E, average_m, xover_str))
    finally:
        # Close the output file on every exit path, including the early
        # abort above; never close standard output.
        if writes_to_file:
            output_file.close()
def main(args):
    """Write SCHEMA disruption (E) and mutation (m) values for chimeras.

    Reads the parent alignment, crossover file and contact file named in the
    parsed arguments. If ARG_CHIMERAS is given it may be a list of chimera
    block patterns, the path of a file holding one pattern per line, or a
    single pattern; each is passed to outputEnergies. Without ARG_CHIMERAS
    every possible chimera is enumerated and the averages of the requested
    values are appended. Output goes to ARG_OUTPUT_FILE when given, otherwise
    to standard output.

    args -- raw argument list; args[0] is the path of the invoking script.

    Returns None.
    """
    arg_dict = parse_arguments(args)
    if not confirm_arguments(arg_dict):
        # Only show usage when run as the script itself.
        if args[0].split(os.path.sep)[-1] == "schemaenergy.py":
            print_usage(args)
        return

    # Which columns the user asked for.
    print_E = ARG_PRINT_E in arg_dict
    print_m = ARG_PRINT_M in arg_dict

    # Read the alignment file to create a list of parents, in file order.
    # Input handles are closed as soon as parsing is done.
    with open(arg_dict[ARG_MULTIPLE_SEQUENCE_ALIGNMENT_FILE], 'r') as msa_handle:
        parent_list = schema.readMultipleSequenceAlignmentFile(msa_handle)
        parents = [p for (k, p) in parent_list]

    # NOTE(review): assumes the crossover and contact readers consume their
    # handles eagerly -- confirm against the schema module.
    with open(arg_dict[ARG_CROSSOVER_FILE], 'r') as crossover_handle:
        crossovers = schema.readCrossoverFile(crossover_handle)
    fragments = schema.getFragments(crossovers, parents[0])

    # Get the contacts.
    with open(arg_dict[ARG_CONTACT_FILE], 'r') as contact_handle:
        pdb_contacts = schema.readContactFile(contact_handle)
    contacts = schema.getSCHEMAContactsWithCrossovers(pdb_contacts, parents,
                                                      crossovers)

    writes_to_file = ARG_OUTPUT_FILE in arg_dict
    if writes_to_file:
        output_file = open(arg_dict[ARG_OUTPUT_FILE], 'w')
    else:
        output_file = sys.stdout

    try:
        # Build the per-chimera format string and the matching header line.
        output_string = '%s'
        output_file.write('# chimera')
        if print_E:
            output_string += '\t%d'
            output_file.write('\tE')
        if print_m:
            output_string += '\t%d'
            output_file.write('\tm')
        output_string += '\n'
        output_file.write('\n')

        if ARG_CHIMERAS in arg_dict:
            # Could be a) a chimera, b) a list of chimeras, or c) a file of
            # chimeras.
            chimeras = arg_dict[ARG_CHIMERAS]
            if isinstance(chimeras, list):
                for chimera_blocks in chimeras:
                    outputEnergies(chimera_blocks, contacts, fragments,
                                   parents, output_file, output_string,
                                   print_E, print_m)
            elif os.path.isfile(chimeras):
                # One chimera per line; close the list file when done.
                with open(chimeras, 'r') as chimera_handle:
                    for line in chimera_handle:
                        outputEnergies(line.strip(), contacts, fragments,
                                       parents, output_file, output_string,
                                       print_E, print_m)
            else:
                # It's a single chimera sequence.
                outputEnergies(chimeras, contacts, fragments, parents,
                               output_file, output_string, print_E, print_m)
        else:
            # Enumerate all possible chimeras with their disruption and
            # mutation values.
            p = len(parents)
            n = len(fragments)
            Es = []
            ms = []
            for i in xrange(p ** n):
                # The next lines turn i into a chimera block pattern
                # (e.g., 0 -> '11111111', 1 -> '11111112', 2 -> '11111113'...)
                n2c = schema.base(i, p)
                chimera_blocks = ''.join(
                    ['1'] * (n - len(n2c)) + ['%d' % (int(x) + 1,) for x in n2c])
                # NOTE(review): assumes outputEnergies returns an (E, m) pair
                # for every enumerated pattern -- confirm.
                (E, m) = outputEnergies(chimera_blocks, contacts, fragments,
                                        parents, output_file, output_string,
                                        print_E, print_m)
                if print_E:
                    Es.append(E)
                if print_m:
                    ms.append(m)
            if print_E:
                output_file.write("# Average disruption <E> = %1.4f\n"
                                  % schema.mean(Es))
            if print_m:
                output_file.write("# Average mutation <m> = %1.4f\n"
                                  % schema.mean(ms))
    finally:
        # Close the output file on every exit path; never close stdout.
        if writes_to_file:
            output_file.close()