Python readMultipleSequenceAlignmentFileの例、schema.readMultipleSequenceAlignmentFile Pythonの例

コード例 #1

0

ファイルを表示

ファイル: Schema_profile.py プロジェクト: MarkCalcott/NRPS_evolution

def wrapperProfileGraph(parentFile, contactFile):
    '''
    Draws a graph of the number of clashes at each recombination point.

    parentFile  -- path of the multiple sequence alignment file of parents.
    contactFile -- path of the contact file; the last four characters of
                   the part before the first '_' are passed to makeBarGraph
                   as the PDB name.

    For every parent other than the first, and for every position of the
    first parent taken as a single crossover, the disruption of chimera
    '21' is collected.  The per-position mean and standard deviation over
    all parents are then handed to makeBarGraph.  Returns None.
    '''
    pdbName = contactFile.split('_')[0][-4:]

    # Context managers make sure both input handles are closed as soon as
    # their contents have been parsed.
    with open(parentFile, 'r') as parent_handle:
        parent_list = schema.readMultipleSequenceAlignmentFile(parent_handle)
    parents = [p for (k, p) in parent_list]

    with open(contactFile, 'r') as contact_handle:
        pdb_contacts = schema.readContactFile(contact_handle)

    reference = parents[0]
    clash_data = [[] for _ in reference]
    for i in range(1, len(parents)):
        # Progress output: index of the parent currently paired with the
        # reference sequence.
        print(i)
        # This reshuffles the alignment to make the first and second
        # sequences the ones analysed. It was needed as SCHEMA is limited
        # to 9 sequences.
        others = [parents[x] for x in range(1, len(parents)) if x != i]
        newList = [reference, parents[i]] + others
        # Graphs for hotspots: evaluate a single crossover at each position.
        for residue in range(len(reference)):
            crossovers = [residue]
            contacts = schema.getSCHEMAContactsWithCrossovers(
                pdb_contacts, newList, crossovers)
            fragments = schema.getFragments(crossovers, reference)
            clash_data[residue].append(
                schema.getChimeraDisruption('21', contacts, fragments,
                                            newList))

    means = [np.mean(values) for values in clash_data]
    StDev = [np.std(values) for values in clash_data]
    makeBarGraph(means, StDev, pdbName)

コード例 #2

0

ファイルを表示

ファイル: schemarandom.py プロジェクト: carat64/SCHEMA-RASPP

def main(args):
    """Report average disruption <E> and mutation <m> for random libraries.

    args is the command-line argument list; args[0] is the script path.
    The parsed arguments supply the alignment file, the contact file and
    the number of crossovers, plus optional output file, number of
    libraries (default 1000), minimum fragment size (default 4), maximum
    chimeras per library and random seed.

    One line "<E>, <m>, crossover points" is written per random library to
    the output file, or to standard output when no output file is given.
    Returns None; when the arguments are rejected or the minimum fragment
    length cannot be satisfied, the function returns early.
    """
    arg_dict = parse_arguments(args)
    if not confirm_arguments(arg_dict):
        # Usage is only shown when this file is the script being run.
        if args[0].split(os.path.sep)[-1] == "schemarandom.py":
            print_usage(args)
        return

    # Read the alignment file to create a list of parents.
    # The parents appear in the list in the order in which they appear in
    # the file.
    msa_file = arg_dict[ARG_MULTIPLE_SEQUENCE_ALIGNMENT_FILE]
    with open(msa_file, "r") as msa_handle:
        parent_list = schema.readMultipleSequenceAlignmentFile(msa_handle)
    parents = [p for (k, p) in parent_list]

    # Get the contacts
    with open(arg_dict[ARG_CONTACT_FILE], "r") as contact_handle:
        pdb_contacts = schema.readContactFile(contact_handle)

    # Establish connection to output, either file or, if no output file is
    # specified, to standard output.
    writes_to_file = ARG_OUTPUT_FILE in arg_dict
    if writes_to_file:
        output_file = open(arg_dict[ARG_OUTPUT_FILE], "w")
    else:
        output_file = sys.stdout

    # try/finally guarantees the output file is closed on every exit path,
    # including the early error return below.
    try:
        # Get the number of libraries to evaluate.
        if ARG_NUM_LIBRARIES in arg_dict:
            num_libraries = int(arg_dict[ARG_NUM_LIBRARIES])
        else:
            num_libraries = int(1e3)

        # Get the minimum fragment size.
        if ARG_MIN_FRAGMENT_SIZE in arg_dict:
            min_length = int(arg_dict[ARG_MIN_FRAGMENT_SIZE])
        else:
            min_length = 4

        # Get the number of fragments -- one more than the number of
        # crossovers.
        num_fragments = int(arg_dict[ARG_NUM_CROSSOVERS]) + 1

        num_parents = len(parents)
        library_size = num_parents ** num_fragments

        if ARG_MAX_CHIMERAS_PER_LIBRARY in arg_dict:
            max_chimeras = min(library_size,
                               int(arg_dict[ARG_MAX_CHIMERAS_PER_LIBRARY]))
        else:
            max_chimeras = library_size

        if ARG_RANDOM_SEED in arg_dict:
            random.seed(int(arg_dict[ARG_RANDOM_SEED]))

        # Make libraries consistent with RASPP
        (new_parents, identical_sites) = raspp.collapse_parents(parents)
        collapsed_length = len(new_parents[0])
        if collapsed_length < num_fragments * min_length:
            error_msg = (
                "Minimum diversity length of %d is too large.\n%d "
                + "fragments with diversity %d cannot be found in a "
                + "sequence of length %d (with identities removed).  Aborting..."
            )
            # Report the collapsed length, which is the quantity the check
            # above is actually made against.
            print(error_msg % (min_length, num_fragments, min_length,
                               collapsed_length))
            return

        def block_pattern(index):
            # Turn an integer into a chimera block pattern
            # (e.g., 0 -> '11111111', 1 -> '11111112', 2 -> '11111113'...)
            n2c = schema.base(index, num_parents)
            padding = ["1"] * (num_fragments - len(n2c))
            return "".join(padding + ["%d" % (int(x) + 1,) for x in n2c])

        start_time = time.clock()

        output_file.write("# <E>\t<m>\tcrossover points\n")
        random_crossovers = []
        for libnum in range(num_libraries):
            crossovers = schema.generateRandomCrossovers(
                len(new_parents[0]), num_fragments - 1, min_length)
            crossovers = raspp.translate_collapsed_indices(
                crossovers, identical_sites)
            random_crossovers.append(crossovers)

        for crossovers in random_crossovers:
            fragments = schema.getFragments(crossovers, parents[0])
            filtered_contacts = schema.getSCHEMAContactsWithCrossovers(
                pdb_contacts, parents, crossovers)
            if max_chimeras < library_size:
                # Assemble a random sample of chimeras, with replacement
                all_chimeras = []
                for n_chim in range(max_chimeras):
                    chim_index = random.randint(0, library_size - 1)
                    all_chimeras.append(block_pattern(chim_index))
            else:
                # We'll be covering all chimeras in the library; the number
                # of parents and fragments specifies all possible chimeras,
                # regardless of crossover point positions.
                all_chimeras = [block_pattern(i) for i in range(library_size)]
                # Randomly assort the chimeras
                random.shuffle(all_chimeras)

            # Calculate average E and m for the library or subsample
            E_values = []
            m_values = []
            for chimera_blocks in all_chimeras:
                E_values.append(schema.getChimeraDisruption(
                    chimera_blocks, filtered_contacts, fragments, parents))
                m_values.append(schema.getChimeraShortestDistance(
                    chimera_blocks, fragments, parents))
            average_E = schema.mean(E_values)
            average_m = schema.mean(m_values)
            xover_pat = "%d " * len(crossovers)
            xover_str = xover_pat % tuple(crossovers)
            output_file.write(
                ("%1.4f\t%1.4f\t%s\n") % (average_E, average_m, xover_str))
            output_file.flush()

        total_time = time.clock() - start_time
        output_file.write(
            "# Finished in %1.2f seconds (%d libraries, %d chimeras)\n"
            % (total_time, num_libraries, num_libraries * max_chimeras)
        )
    finally:
        # Never close sys.stdout; only close a file opened here.
        if writes_to_file:
            output_file.close()

コード例 #3

0

ファイルを表示

ファイル: schemaenergy.py プロジェクト: carat64/SCHEMA-RASPP

def main(args):
	"""Write SCHEMA disruption (E) and mutation (m) values for chimeras.

	args is the command-line argument list; args[0] is the script path.
	The parsed arguments supply the alignment, crossover and contact files,
	and optionally an output file, flags selecting E and/or m, and the
	chimeras to evaluate (a single pattern, a list of patterns, or the path
	of a file with one pattern per line).  When no chimeras are given,
	every possible chimera is enumerated and the averages of the requested
	values are appended.

	Output goes to the output file, or to standard output when none is
	given.  Returns None.
	"""
	arg_dict = parse_arguments(args)
	if not confirm_arguments(arg_dict):
		# Usage is only shown when this file is the script being run.
		if args[0].split(os.path.sep)[-1] == "schemaenergy.py":
			print_usage(args)
		return

	# Inputs:
	#   The alignment/fragment file name.
	msa_file = arg_dict[ARG_MULTIPLE_SEQUENCE_ALIGNMENT_FILE]

	# Flags: which values the user asked for.
	print_E = ARG_PRINT_E in arg_dict
	print_m = ARG_PRINT_M in arg_dict

	# Read the alignment file to create a list of parents.
	# The parents will appear in the list in the order in which they appear in the file.
	with open(msa_file, 'r') as msa_handle:
		parent_list = schema.readMultipleSequenceAlignmentFile(msa_handle)
	parents = [p for (k,p) in parent_list]

	with open(arg_dict[ARG_CROSSOVER_FILE], 'r') as crossover_handle:
		crossovers = schema.readCrossoverFile(crossover_handle)
	fragments = schema.getFragments(crossovers, parents[0])

	# Get the contacts
	with open(arg_dict[ARG_CONTACT_FILE], 'r') as contact_handle:
		pdb_contacts = schema.readContactFile(contact_handle)
	contacts = schema.getSCHEMAContactsWithCrossovers(pdb_contacts, parents, crossovers)

	writes_to_file = ARG_OUTPUT_FILE in arg_dict
	if writes_to_file:
		output_file = open(arg_dict[ARG_OUTPUT_FILE], 'w')
	else:
		output_file = sys.stdout

	# try/finally guarantees the output file is closed even if evaluating
	# a chimera raises.
	try:
		# Now, what does the user want?  Build the header line and the
		# matching per-chimera format string side by side.
		output_string = '%s'
		output_file.write('# chimera')
		if print_E:
			output_string += '\t%d'
			output_file.write('\tE')
		if print_m:
			output_string += '\t%d'
			output_file.write('\tm')
		output_string += '\n'
		output_file.write('\n')

		if ARG_CHIMERAS in arg_dict: # Print values for chimeras
			chimeras = arg_dict[ARG_CHIMERAS]
			# Could be a) a chimera, b) a list of chimeras, or c) a file of chimeras.
			if isinstance(chimeras, list):
				# It's a list of chimeras
				for chimera_blocks in chimeras:
					outputEnergies(chimera_blocks, contacts, fragments, parents, output_file, output_string, print_E, print_m)
			elif os.path.isfile(chimeras):
				# It's a file of chimeras, one pattern per line
				with open(chimeras, 'r') as chimera_handle:
					for line in chimera_handle:
						outputEnergies(line.strip(), contacts, fragments, parents, output_file, output_string, print_E, print_m)
			else:
				# It's a single chimera sequence
				outputEnergies(chimeras, contacts, fragments, parents, output_file, output_string, print_E, print_m)
		else:
			# Enumerates all possible chimeras and their disruption and mutation values.
			p = len(parents)
			n = len(fragments)
			Es = []
			ms = []
			for i in xrange(p**n):
				# The next two lines turn i into a chimera block pattern
				# (e.g., 0 -> '11111111', 1 -> '11111112', 2 -> '11111113'...)
				n2c = schema.base(i,p)
				chimera_blocks = ''.join(['1']*(n-len(n2c))+['%d'%(int(x)+1,) for x in n2c])
				(E, m) = outputEnergies(chimera_blocks, contacts, fragments, parents, output_file, output_string, print_E, print_m)
				if print_E:
					Es.append(E)
				if print_m:
					ms.append(m)
			if print_E:
				output_file.write("# Average disruption <E> = %1.4f\n" % schema.mean(Es))
			if print_m:
				output_file.write("# Average mutation <m> = %1.4f\n" % schema.mean(ms))
	finally:
		# Never close sys.stdout; only close a file opened here.
		if writes_to_file:
			output_file.close()

コード例 #4

0

ファイルを表示

ファイル: schemacontacts.py プロジェクト: dad/base

def main(args):
	"""Align the residues of a PDB structure with a parent sequence.

	args is the command-line argument list; args[0] is the script path.
	The parsed arguments supply the PDB file and the parent multiple
	sequence alignment, and optionally a PDB/parent alignment file, the
	chain identifiers and the alignment file format (default 'fasta').

	Every failure (missing file, no matching sequence, mismatching
	sequences, alignment error) prints a message and returns None.
	NOTE(review): as shown here the function stops after the residue
	alignment step; the aligned residues are not used further.
	"""
	arg_dict = parse_arguments(args)
	if not confirm_arguments(arg_dict):
		# Usage is only shown when this file is the script being run.
		if args[0].split(os.path.sep)[-1] == "schemacontacts.py":
			print_usage(args)
		return

	# Inputs:
	#	The PDB file name.
	pdb_file = arg_dict[ARG_PDB_FILE]
	#   The alignment/fragment file name.
	msa_file = arg_dict[ARG_MULTIPLE_SEQUENCE_ALIGNMENT_FILE]
	#	The alignment between the reference parent and the target protein
	#	sequence in the provided PDB file.  The amino acids in the aligned
	#	reference parent should correspond exactly to those in the msa_file above.
	# If you don't provide a PDB alignment file, the program will assume that the ID of the PDB structure
	# contained in the HEADER field corresponds to one of the sequence IDs in the MSA.
	parent_pdb_alignment_file = None
	if ARG_PDB_ALIGNMENT_FILE in arg_dict:
		if not os.path.isfile(arg_dict[ARG_PDB_ALIGNMENT_FILE]):
			print("  Can't find PDB/parent alignment file %s" % arg_dict[ARG_PDB_ALIGNMENT_FILE])
			return
		parent_pdb_alignment_file = arg_dict[ARG_PDB_ALIGNMENT_FILE]
	else:
		with open(pdb_file, 'r') as pdb_handle:
			pdb_key = pdbmod.File().getIDCode(pdb_handle)

	# The PDB chains
	# Many PDB files include multiple chains.  The chain_identifier list includes those
	# chains which correspond to the protein whose contacts are being evaluated.
	# Most often, chain 'A' (in the case of multiple chains) or chain ' ' (only one chain)
	# will be the appropriate choice.
	if ARG_CHAINS in arg_dict:
		chains = arg_dict[ARG_CHAINS]
		if isinstance(chains, list):
			chain_identifiers = chains + [' ']
		else:
			chain_identifiers = [chains, ' ']
	else:
		chain_identifiers = ['A',' ']

	# Local name chosen so the builtin format() is not shadowed.
	if ARG_FORMAT in arg_dict:
		seq_format = arg_dict[ARG_FORMAT]
	else:
		seq_format = 'fasta'

	# Read the alignment file holding the parents; the result is indexed
	# by sequence name below.
	with open(msa_file, 'r') as msa_handle:
		parent_dict = schema.readMultipleSequenceAlignmentFile(msa_handle, seq_format)

	# Generate the contacts
	# Read in the PDB file to create a list of residues.
	with open(pdb_file, 'r') as pdb_handle:
		residues = pdbmod.File().read(pdb_handle)
	# Because the PDB file's residue sequence may differ from those of the parents, we
	# must align the PDB residues to one parent.
	if not parent_pdb_alignment_file:  # Just get PDB sequence from the multiple sequence alignment
		try:
			aligned_pdb = parent_dict[pdb_key]
			aligned_prot = parent_dict[pdb_key]
		except KeyError:
			print("Could not find sequence %s in the multiple sequence alignment file %s.  Aborting..." % (pdb_key, msa_file))
			return
	else: # Pull information from the parent/PDB alignment file.
		# Our objective is to find the sequence with the same key in both the parent MSA file and
		# the parent/PDB alignment file.
		with open(parent_pdb_alignment_file, 'r') as alignment_handle:
			pdb_parent_seq_list = schema.readMultipleSequenceAlignmentFile(alignment_handle, seq_format)
		pdb_parent_seq_dict = dict(pdb_parent_seq_list)

		# Bail out if there are fewer than 2 sequences.
		if len(pdb_parent_seq_dict) < 2:
			# An empty alignment must not crash while building the message.
			found_names = list(pdb_parent_seq_dict.keys())
			only_name = found_names[0] if found_names else '(none)'
			print("Only found one uniquely named sequence in the PDB/parent alignment, %s.  Aborting..." % only_name)
			return

		# Find the matching key (the last match wins when several names are shared).
		pdb_key = None
		for k in parent_dict.keys():
			if k in pdb_parent_seq_dict:
				pdb_key = k

		# Bail out if no matching key is found
		if not pdb_key:
			# Both placeholders need a value: the parent names and the
			# names present in the PDB/parent alignment.
			print("Could not find parents %s in PDB/parent aligned sequences %s.  Aborting..." % (list(parent_dict.keys()), list(pdb_parent_seq_dict.keys())))
			return
		aligned_prot = pdb_parent_seq_dict[pdb_key]
		# Remove the parent sequence (stored under pdb_key), leaving the other aligned sequence(s).
		del pdb_parent_seq_dict[pdb_key]
		# Take the first remaining sequence, which should be the PDB sequence.
		aligned_pdb = list(pdb_parent_seq_dict.values())[0]

	# Check to make sure the parent sequence from both alignment files matches.
	if aligned_prot.replace('-','') != parent_dict[pdb_key].replace('-',''):
		print("The PDB-aligned parent and the named parent, %s, don't match!  Aborting..." % (pdb_key,))
		return
	# Check to ensure the aligned PDB sequence matches the residue sequence pulled directly from the PDB file.
	if aligned_pdb.replace('-','') != pdbmod.sequence(residues, chain_identifiers):
		print("The parent-aligned PDB sequence, %s, and the PDB file sequence, chain(s) %s in %s, don't match!  Aborting..." % (pdb_key, chain_identifiers, pdb_file))
		return

	# Align the residues with the parent protein.
	try:
		residues = schema.alignPDBResidues(residues, aligned_prot, aligned_pdb, parent_dict[pdb_key], chain_identifiers)
	except ValueError as ve:
		print(ve)
		return

コード例 #5

0

ファイルを表示

ファイル: rasppcurve.py プロジェクト: carat64/SCHEMA-RASPP

def main(args):
	"""Run RASPP and write the resulting (<E>, <m>) curve.

	args is the command-line argument list; args[0] is the script path.
	The parsed arguments supply the alignment file, the contact file and
	the number of crossovers, plus optional output file, minimum fragment
	size (default 4) and bin width (default 1.0).

	Timing comments and one line "<E>, <m>, crossover points" per curve
	point are written to the output file, or to standard output when no
	output file is given.  Returns None; when the arguments are rejected
	or the minimum fragment length cannot be satisfied, the function
	returns early.
	"""
	arg_dict = parse_arguments(args)
	if not confirm_arguments(arg_dict):
		# Usage is only shown when this file is the script being run.
		if args[0].split(os.path.sep)[-1] == "rasppcurve.py":
			print_usage(args)
		return

	# Inputs:
	#   The alignment/fragment file name.
	msa_file = arg_dict[ARG_MULTIPLE_SEQUENCE_ALIGNMENT_FILE]

	# Read the alignment file to create a list of parents.
	# The parents will appear in the list in the order in which they appear in the file.
	with open(msa_file, 'r') as msa_handle:
		parent_list = schema.readMultipleSequenceAlignmentFile(msa_handle)
	parents = [p for (k,p) in parent_list]

	# Get the contacts
	with open(arg_dict[ARG_CONTACT_FILE], 'r') as contact_handle:
		pdb_contacts = schema.readContactFile(contact_handle)

	# Establish connection to output, either file or, if no output file is
	# specified, to standard output.
	writes_to_file = ARG_OUTPUT_FILE in arg_dict
	if writes_to_file:
		output_file = open(arg_dict[ARG_OUTPUT_FILE], 'w')
	else:
		output_file = sys.stdout

	# try/finally guarantees the output file is closed on every exit path,
	# including the early error return below.
	try:
		# Get the minimum fragment size.
		if ARG_MIN_FRAGMENT_SIZE in arg_dict:
			min_length = int(arg_dict[ARG_MIN_FRAGMENT_SIZE])
		else:
			output_file.write("# No minimum fragment length specified; using L=4.\n")
			min_length = 4

		# Get the bin width
		if ARG_BIN_WIDTH in arg_dict:
			bin_width = float(arg_dict[ARG_BIN_WIDTH])
		else:
			output_file.write("# No bin width specified; using bin width=1.0.\n")
			bin_width = 1.0

		# Get the number of fragments -- one more than the number of crossovers.
		num_fragments = int(arg_dict[ARG_NUM_CROSSOVERS])+1

		# Make libraries consistent with RASPP
		(new_parents, identical_sites) = raspp.collapse_parents(parents)
		collapsed_length = len(new_parents[0])
		if collapsed_length < num_fragments*min_length:
			error_msg = "Minimum fragment length of %d is too large.\n%d " + \
						"fragments with length %d cannot be found in a " + \
						"sequence of length %d (with identities removed).  Aborting..."
			# Report the collapsed length, which is the quantity the check
			# above is actually made against.
			print(error_msg % (min_length, num_fragments, min_length, collapsed_length))
			return

		contacts = schema.getSCHEMAContacts(pdb_contacts, parents)
		energies = raspp.make_4d_energies(contacts, parents)
		avg_energies = raspp.calc_average_energies(energies, parents)

		tstart = time.clock()
		res = raspp.RASPP(avg_energies, parents, num_fragments-1, min_length)
		output_file.write("# RASPP took %1.2f secs\n" % (time.clock()-tstart,))
		output_file.write("# RASPP found %d results\n" % (len(res),))

		tstart = time.clock()
		curve = raspp.curve(res, parents, bin_width)
		output_file.write("# RASPP found %d unique (<E>,<m>) points\n" % (len(curve),))
		output_file.write("# RASPP curve took %1.2f secs\n" % (time.clock()-tstart,))
		output_file.write("# <E>\t<m>\tcrossover points\n")
		for (average_E, average_m, crossovers) in curve:
			xover_pat = '%d '*len(crossovers)
			xover_str = xover_pat % tuple(crossovers)
			output_file.write('%1.4f\t%1.4f\t%s\n' % (average_E, average_m, xover_str))
	finally:
		# Never close sys.stdout; only close a file opened here.
		if writes_to_file:
			output_file.close()

コード例 #6

0

ファイルを表示

def main(args):
    """Align the residues of a PDB structure with a parent sequence.

    args is the command-line argument list; args[0] is the script path.
    The parsed arguments supply the PDB file and the parent multiple
    sequence alignment, and optionally a PDB/parent alignment file, the
    chain identifiers and the alignment file format (default 'fasta').

    Every failure (missing file, no matching sequence, mismatching
    sequences, alignment error) prints a message and returns None.
    NOTE(review): as shown here the function stops after the residue
    alignment step; the aligned residues are not used further.
    """
    arg_dict = parse_arguments(args)
    if not confirm_arguments(arg_dict):
        # Usage is only shown when this file is the script being run.
        if args[0].split(os.path.sep)[-1] == "schemacontacts.py":
            print_usage(args)
        return

    # Inputs:
    #   The PDB file name.
    pdb_file = arg_dict[ARG_PDB_FILE]
    #   The alignment/fragment file name.
    msa_file = arg_dict[ARG_MULTIPLE_SEQUENCE_ALIGNMENT_FILE]
    #   The alignment between the reference parent and the target protein
    #   sequence in the provided PDB file.  The amino acids in the aligned
    #   reference parent should correspond exactly to those in the
    #   msa_file above.
    # If you don't provide a PDB alignment file, the program will assume
    # that the ID of the PDB structure contained in the HEADER field
    # corresponds to one of the sequence IDs in the MSA.
    parent_pdb_alignment_file = None
    if ARG_PDB_ALIGNMENT_FILE in arg_dict:
        if not os.path.isfile(arg_dict[ARG_PDB_ALIGNMENT_FILE]):
            print("  Can't find PDB/parent alignment file %s" % arg_dict[
                ARG_PDB_ALIGNMENT_FILE])
            return
        parent_pdb_alignment_file = arg_dict[ARG_PDB_ALIGNMENT_FILE]
    else:
        with open(pdb_file, 'r') as pdb_handle:
            pdb_key = pdbmod.File().getIDCode(pdb_handle)

    # The PDB chains
    # Many PDB files include multiple chains.  The chain_identifier list
    # includes those chains which correspond to the protein whose contacts
    # are being evaluated.  Most often, chain 'A' (in the case of multiple
    # chains) or chain ' ' (only one chain) will be the appropriate choice.
    if ARG_CHAINS in arg_dict:
        chains = arg_dict[ARG_CHAINS]
        if isinstance(chains, list):
            chain_identifiers = chains + [' ']
        else:
            chain_identifiers = [chains, ' ']
    else:
        chain_identifiers = ['A', ' ']

    # Local name chosen so the builtin format() is not shadowed.
    if ARG_FORMAT in arg_dict:
        seq_format = arg_dict[ARG_FORMAT]
    else:
        seq_format = 'fasta'

    # Read the alignment file holding the parents; the result is indexed
    # by sequence name below.
    with open(msa_file, 'r') as msa_handle:
        parent_dict = schema.readMultipleSequenceAlignmentFile(
            msa_handle, seq_format)

    # Generate the contacts
    # Read in the PDB file to create a list of residues.
    with open(pdb_file, 'r') as pdb_handle:
        residues = pdbmod.File().read(pdb_handle)
    # Because the PDB file's residue sequence may differ from those of the
    # parents, we must align the PDB residues to one parent.
    if not parent_pdb_alignment_file:
        # Just get PDB sequence from the multiple sequence alignment
        try:
            aligned_pdb = parent_dict[pdb_key]
            aligned_prot = parent_dict[pdb_key]
        except KeyError:
            print("Could not find sequence %s in the multiple sequence alignment file %s.  Aborting..." % (
                pdb_key, msa_file))
            return
    else:
        # Pull information from the parent/PDB alignment file.
        # Our objective is to find the sequence with the same key in both
        # the parent MSA file and the parent/PDB alignment file.
        with open(parent_pdb_alignment_file, 'r') as alignment_handle:
            pdb_parent_seq_list = schema.readMultipleSequenceAlignmentFile(
                alignment_handle, seq_format)
        pdb_parent_seq_dict = dict(pdb_parent_seq_list)

        # Bail out if there are fewer than 2 sequences.
        if len(pdb_parent_seq_dict) < 2:
            # An empty alignment must not crash while building the message.
            found_names = list(pdb_parent_seq_dict.keys())
            only_name = found_names[0] if found_names else '(none)'
            print("Only found one uniquely named sequence in the PDB/parent alignment, %s.  Aborting..." % only_name)
            return

        # Find the matching key (the last match wins when several names
        # are shared).
        pdb_key = None
        for k in parent_dict.keys():
            if k in pdb_parent_seq_dict:
                pdb_key = k

        # Bail out if no matching key is found
        if not pdb_key:
            # Both placeholders need a value: the parent names and the
            # names present in the PDB/parent alignment.
            print("Could not find parents %s in PDB/parent aligned sequences %s.  Aborting..." % (
                list(parent_dict.keys()), list(pdb_parent_seq_dict.keys())))
            return
        aligned_prot = pdb_parent_seq_dict[pdb_key]
        # Remove the parent sequence (stored under pdb_key), leaving the
        # other aligned sequence(s).
        del pdb_parent_seq_dict[pdb_key]
        # Take the first remaining sequence, which should be the PDB
        # sequence.
        aligned_pdb = list(pdb_parent_seq_dict.values())[0]

    # Check to make sure the parent sequence from both alignment files
    # matches.
    if aligned_prot.replace('-', '') != parent_dict[pdb_key].replace('-', ''):
        print("The PDB-aligned parent and the named parent, %s, don't match!  Aborting..." % (
            pdb_key, ))
        return
    # Check to ensure the aligned PDB sequence matches the residue sequence
    # pulled directly from the PDB file.
    if aligned_pdb.replace('-', '') != pdbmod.sequence(residues,
                                                       chain_identifiers):
        print("The parent-aligned PDB sequence, %s, and the PDB file sequence, chain(s) %s in %s, don't match!  Aborting..." % (
            pdb_key, chain_identifiers, pdb_file))
        return

    # Align the residues with the parent protein.
    try:
        residues = schema.alignPDBResidues(residues, aligned_prot, aligned_pdb,
                                           parent_dict[pdb_key],
                                           chain_identifiers)
    except ValueError as ve:
        print(ve)
        return

コード例 #7

0

ファイルを表示

ファイル: rasppcurve.py プロジェクト: Gienah/SCHEMA-RASPP

def main(args):
    """Run RASPP and write the resulting (<E>, <m>) curve.

    args is the command-line argument list; args[0] is the script path.
    The parsed arguments supply the alignment file, the contact file and
    the number of crossovers, plus optional output file, minimum fragment
    size (default 4) and bin width (default 1.0).

    Timing comments and one line "<E>, <m>, crossover points" per curve
    point are written to the output file, or to standard output when no
    output file is given.  Returns None; when the arguments are rejected
    or the minimum fragment length cannot be satisfied, the function
    returns early.
    """
    arg_dict = parse_arguments(args)
    if not confirm_arguments(arg_dict):
        # Usage is only shown when this file is the script being run.
        if args[0].split(os.path.sep)[-1] == "rasppcurve.py":
            print_usage(args)
        return

    # Inputs:
    #   The alignment/fragment file name.
    msa_file = arg_dict[ARG_MULTIPLE_SEQUENCE_ALIGNMENT_FILE]

    # Read the alignment file to create a list of parents.
    # The parents will appear in the list in the order in which they
    # appear in the file.
    with open(msa_file, 'r') as msa_handle:
        parent_list = schema.readMultipleSequenceAlignmentFile(msa_handle)
    parents = [p for (k, p) in parent_list]

    # Get the contacts
    with open(arg_dict[ARG_CONTACT_FILE], 'r') as contact_handle:
        pdb_contacts = schema.readContactFile(contact_handle)

    # Establish connection to output, either file or, if no output file is
    # specified, to standard output.
    writes_to_file = ARG_OUTPUT_FILE in arg_dict
    if writes_to_file:
        output_file = open(arg_dict[ARG_OUTPUT_FILE], 'w')
    else:
        output_file = sys.stdout

    # try/finally guarantees the output file is closed on every exit path,
    # including the early error return below.
    try:
        # Get the minimum fragment size.
        if ARG_MIN_FRAGMENT_SIZE in arg_dict:
            min_length = int(arg_dict[ARG_MIN_FRAGMENT_SIZE])
        else:
            output_file.write(
                "# No minimum fragment length specified; using L=4.\n")
            min_length = 4

        # Get the bin width
        if ARG_BIN_WIDTH in arg_dict:
            bin_width = float(arg_dict[ARG_BIN_WIDTH])
        else:
            output_file.write(
                "# No bin width specified; using bin width=1.0.\n")
            bin_width = 1.0

        # Get the number of fragments -- one more than the number of
        # crossovers.
        num_fragments = int(arg_dict[ARG_NUM_CROSSOVERS]) + 1

        # Make libraries consistent with RASPP
        (new_parents, identical_sites) = raspp.collapse_parents(parents)
        collapsed_length = len(new_parents[0])
        if collapsed_length < num_fragments * min_length:
            error_msg = "Minimum fragment length of %d is too large.\n%d " + \
               "fragments with length %d cannot be found in a " + \
               "sequence of length %d (with identities removed).  Aborting..."
            # Report the collapsed length, which is the quantity the check
            # above is actually made against.
            print(error_msg % (min_length, num_fragments, min_length,
                               collapsed_length))
            return

        contacts = schema.getSCHEMAContacts(pdb_contacts, parents)
        energies = raspp.make_4d_energies(contacts, parents)
        avg_energies = raspp.calc_average_energies(energies, parents)

        tstart = time.clock()
        res = raspp.RASPP(avg_energies, parents, num_fragments - 1,
                          min_length)
        output_file.write("# RASPP took %1.2f secs\n" %
                          (time.clock() - tstart, ))
        output_file.write("# RASPP found %d results\n" % (len(res), ))

        tstart = time.clock()
        curve = raspp.curve(res, parents, bin_width)
        output_file.write("# RASPP found %d unique (<E>,<m>) points\n" %
                          (len(curve), ))
        output_file.write("# RASPP curve took %1.2f secs\n" %
                          (time.clock() - tstart, ))
        output_file.write("# <E>\t<m>\tcrossover points\n")
        for (average_E, average_m, crossovers) in curve:
            xover_pat = '%d ' * len(crossovers)
            xover_str = xover_pat % tuple(crossovers)
            output_file.write('%1.4f\t%1.4f\t%s\n' %
                              (average_E, average_m, xover_str))
    finally:
        # Never close sys.stdout; only close a file opened here.
        if writes_to_file:
            output_file.close()

コード例 #8

0

ファイルを表示

ファイル: schemaenergy.py プロジェクト: Gienah/SCHEMA-RASPP

def main(args):
    """Write SCHEMA disruption (E) and mutation (m) values for chimeras.

    args is the command-line argument list; args[0] is the script path.
    The parsed arguments supply the alignment, crossover and contact files,
    and optionally an output file, flags selecting E and/or m, and the
    chimeras to evaluate (a single pattern, a list of patterns, or the path
    of a file with one pattern per line).  When no chimeras are given,
    every possible chimera is enumerated and the averages of the requested
    values are appended.

    Output goes to the output file, or to standard output when none is
    given.  Returns None.
    """
    arg_dict = parse_arguments(args)
    if not confirm_arguments(arg_dict):
        # Usage is only shown when this file is the script being run.
        if args[0].split(os.path.sep)[-1] == "schemaenergy.py":
            print_usage(args)
        return

    # Inputs:
    #   The alignment/fragment file name.
    msa_file = arg_dict[ARG_MULTIPLE_SEQUENCE_ALIGNMENT_FILE]

    # Flags: which values the user asked for.
    print_E = ARG_PRINT_E in arg_dict
    print_m = ARG_PRINT_M in arg_dict

    # Read the alignment file to create a list of parents.
    # The parents will appear in the list in the order in which they
    # appear in the file.
    with open(msa_file, 'r') as msa_handle:
        parent_list = schema.readMultipleSequenceAlignmentFile(msa_handle)
    parents = [p for (k, p) in parent_list]

    with open(arg_dict[ARG_CROSSOVER_FILE], 'r') as crossover_handle:
        crossovers = schema.readCrossoverFile(crossover_handle)
    fragments = schema.getFragments(crossovers, parents[0])

    # Get the contacts
    with open(arg_dict[ARG_CONTACT_FILE], 'r') as contact_handle:
        pdb_contacts = schema.readContactFile(contact_handle)
    contacts = schema.getSCHEMAContactsWithCrossovers(pdb_contacts, parents,
                                                      crossovers)

    writes_to_file = ARG_OUTPUT_FILE in arg_dict
    if writes_to_file:
        output_file = open(arg_dict[ARG_OUTPUT_FILE], 'w')
    else:
        output_file = sys.stdout

    # try/finally guarantees the output file is closed even if evaluating
    # a chimera raises.
    try:
        # Now, what does the user want?  Build the header line and the
        # matching per-chimera format string side by side.
        output_string = '%s'
        output_file.write('# chimera')
        if print_E:
            output_string += '\t%d'
            output_file.write('\tE')
        if print_m:
            output_string += '\t%d'
            output_file.write('\tm')
        output_string += '\n'
        output_file.write('\n')

        if ARG_CHIMERAS in arg_dict:  # Print values for chimeras
            chimeras = arg_dict[ARG_CHIMERAS]
            # Could be a) a chimera, b) a list of chimeras, or c) a file of
            # chimeras.
            if isinstance(chimeras, list):
                # It's a list of chimeras
                for chimera_blocks in chimeras:
                    outputEnergies(chimera_blocks, contacts, fragments,
                                   parents, output_file, output_string,
                                   print_E, print_m)
            elif os.path.isfile(chimeras):
                # It's a file of chimeras, one pattern per line
                with open(chimeras, 'r') as chimera_handle:
                    for line in chimera_handle:
                        outputEnergies(line.strip(), contacts, fragments,
                                       parents, output_file, output_string,
                                       print_E, print_m)
            else:
                # It's a single chimera sequence
                outputEnergies(chimeras, contacts, fragments, parents,
                               output_file, output_string, print_E, print_m)
        else:
            # Enumerates all possible chimeras and their disruption and
            # mutation values.
            p = len(parents)
            n = len(fragments)
            Es = []
            ms = []
            for i in xrange(p**n):
                # The next two lines turn i into a chimera block pattern
                # (e.g., 0 -> '11111111', 1 -> '11111112', 2 -> '11111113'...)
                n2c = schema.base(i, p)
                chimera_blocks = ''.join(['1'] * (n - len(n2c)) +
                                         ['%d' % (int(x) + 1, ) for x in n2c])
                (E, m) = outputEnergies(chimera_blocks, contacts, fragments,
                                        parents, output_file, output_string,
                                        print_E, print_m)
                if print_E:
                    Es.append(E)
                if print_m:
                    ms.append(m)
            if print_E:
                output_file.write(
                    "# Average disruption <E> = %1.4f\n" % schema.mean(Es))
            if print_m:
                output_file.write(
                    "# Average mutation <m> = %1.4f\n" % schema.mean(ms))
    finally:
        # Never close sys.stdout; only close a file opened here.
        if writes_to_file:
            output_file.close()