Ejemplo n.º 1
0
    def chgAlpha(self, newAlpha):
        """Re-type the stored sequence with a new alphabet.

        newAlpha -- one of the names 'DNA', 'RNA' or 'protein', or an
                    alphabet object that is used as-is.

        The chosen alphabet is stored in self.typ, self.seq is rebuilt
        as a Seq carrying that alphabet, and self.checkAlpha() is then
        called.

        Raises NameError if newAlpha is None or a string other than the
        three recognised names.
        """

        from Bio.Seq import Seq
        from Bio.Alphabet import IUPAC

        if newAlpha == "DNA":
            alpha = IUPAC.IUPACUnambiguousDNA()
        elif newAlpha == "RNA":
            # Previously this branch built the DNA alphabet by mistake.
            alpha = IUPAC.IUPACUnambiguousRNA()
        elif newAlpha == "protein":
            alpha = IUPAC.IUPACProtein()
        elif newAlpha is None or isinstance(newAlpha, str):
            # Unknown names (and a missing value) are still rejected with
            # the same exception type and message as before.
            raise NameError("type not 'DNA', 'RNA', or 'protein'")
        else:
            # Anything else is taken to be an alphabet object, as the
            # docstring has always promised.
            alpha = newAlpha

        # NOTE(review): self.typ is set for alphabet objects too, so it
        # always matches the alphabet on self.seq -- confirm this is what
        # checkAlpha() expects.
        self.typ = alpha
        self.seq = Seq(self.seq.tostring(), alpha)

        self.checkAlpha()
Ejemplo n.º 2
0
def cds_to_seqrecord(cds, parent_genome, gene_domains=None):
    """Creates a protein SeqRecord object from a Cds and its parent Genome.

    The record carries a "source" feature, a "Protein" feature and a "CDS"
    feature, each created with ``cds.create_seqfeature`` over
    ``0..cds.translation_length``, followed by the region features that
    ``get_cds_seqrecord_regions`` returns for ``gene_domains``.

    :param cds: A populated Cds object.
    :type cds: Cds
    :param parent_genome: Populated parent Genome object of the Cds object.
    :type parent_genome: Genome
    :param gene_domains:
        List of domain objects passed to ``get_cds_seqrecord_regions``.
        ``None`` (the default) is treated as an empty list.
    :type gene_domains: list
    :returns: Filled Biopython SeqRecord object.
    :rtype: SeqRecord
    """
    # A fresh list per call avoids sharing one mutable default object
    # between invocations.
    if gene_domains is None:
        gene_domains = []

    record = SeqRecord(cds.translation)
    record.seq.alphabet = IUPAC.IUPACProtein()
    record.name = cds.id
    # Without a locus tag, the record id falls back to the Cds id with a
    # "DRAFT " prefix.
    if cds.locus_tag == "" or cds.locus_tag is None:
        record.id = f"DRAFT {cds.id}"
    else:
        record.id = cds.locus_tag

    # Must run before cds.seqfeature is read for the description below.
    cds.set_seqfeature()

    protein_length = cds.translation_length
    source = f"{parent_genome.host_genus} phage {cds.genome_id}"

    source_feature = cds.create_seqfeature("source", 0, protein_length, 1)
    source_feature.qualifiers["organism"] = [source]

    protein_feature = cds.create_seqfeature("Protein", 0, protein_length, 1)

    cds_feature = cds.create_seqfeature("CDS", 0, protein_length, 1)
    format_cds_seqrecord_CDS_feature(cds_feature, cds, parent_genome)

    record.features = [source_feature, protein_feature, cds_feature]
    record.features.extend(get_cds_seqrecord_regions(gene_domains, cds))

    record.description = (f"{cds.seqfeature.qualifiers['product'][0]} "
                          f"[{source}]")
    record.annotations = get_cds_seqrecord_annotations(cds, parent_genome)

    return record
Ejemplo n.º 3
0
	# For every chain in the model, write a two-record FASTA file named
	# "<prefix>_<chain id>.fasta": the sequence read from conserved_fasta_file
	# (with lowercase letters masked as 'X') followed by the sequence
	# collected from the chain's residues.
	for chain in model:
		print("Working on chain %s." % (chain.get_id()))
		# Parallel lists: one-letter residue codes and the matching res_id[1]
		# values.
		seq = list()
		position = list()
		
		for residue in chain:
			res_id = residue.get_id()
			# Keep only residues whose first id field is a blank -- presumably
			# the Bio.PDB hetero flag, i.e. standard residues only; TODO confirm.
			if res_id[0] == ' ':
				seq.append(three_to_one(residue.get_resname()))
				position.append(res_id[1])
				#print("%s -- %d" % (three_to_one(residue.get_resname()), res_id[1]))


		# Sequence of this chain, built from the collected one-letter codes.
		my_prot = Seq(str(''.join(seq)), IUPAC.protein)

		# NOTE(review): the conserved FASTA file is re-read on every chain
		# iteration although nothing in this loop changes its path.
		tmp_seq1 = SeqIO.read(conserved_fasta_file, "fasta", IUPAC.IUPACProtein())
		
		# Replace every lowercase ASCII letter with 'X' (assumes `sub` is
		# re.sub -- confirm against the file's imports).
		my_x_seq = Seq(sub('[a-z]', 'X', str(tmp_seq1.seq)))

		seq1 = SeqRecord(my_x_seq, id=tmp_seq1.id, name="", description="")
		
		#print("%s" % seq1)
		
		seq2 = SeqRecord(my_prot, id="Chain_" + chain.get_id(), name="", description="")
		
		# Reference record first, chain record second.
		myseqs = [seq1, seq2]
		
		fasta_filename = prefix + "_" + chain.get_id() + ".fasta"
		# NOTE(review): align_filename and position are not used within the
		# visible part of this loop.
		align_filename = prefix + "_" + chain.get_id() + ".align"

		SeqIO.write(myseqs, fasta_filename, "fasta")
	input_file = args.input_file
	dataset_id = input_file.split("_")[0]
	contig_length_filter = "/hps/nobackup2/production/metagenomics/aalmeida/scripts/EMBL-EBI/filter_contigs_len.py"
	subprocess.call("%s -f %s -l 1" % (contig_length_filter, input_file), stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL, shell = True)
	new_name_search = re.search(r"(%s\w+)\.fna(\w+\.fasta)" % dataset_id, ",".join(os.listdir()))
	new_name = new_name_search.group(1) + new_name_search.group(2)
	os.rename(new_name_search.group(0), new_name)
	input_file = new_name
	predicted_viruses = virus_pred(input_file)
	SeqIO.write(predicted_viruses, "%s_viral_sequences.fna" % dataset_id, "fasta")
	input_file = "%s_viral_sequences.fna" % dataset_id
	hmmer_result = hmmer_domtbl(input_file)
	informative_df = ratio_evalue(hmmer_result)
	os.mkdir("%s_annotated_viral_sequences" % dataset_id)
	for contig in SeqIO.parse(input_file, "fasta", IUPAC.IUPACUnambiguousDNA()):
		for protein in SeqIO.parse("%s_viral_CDS.faa" % dataset_id, "fasta", IUPAC.IUPACProtein()):
			if contig.id in protein.id:
				protein_fields = protein.description.split(" # ")
				CDS_annotation = SeqFeature(FeatureLocation(int(protein_fields[1]), int(protein_fields[2])), type = "CDS", strand = int(protein_fields[3]))
				CDS_annotation.qualifiers["locus_tag"]  = [protein.id]
				CDS_annotation.qualifiers["transl_table"] = [11]
				CDS_annotation.qualifiers["translation"] = [str(protein.seq)]
				if protein.id in list(informative_df["query"].values):
					if list(informative_df["query"].values).count(protein.id) > 1:
						best_hit = max(informative_df[informative_df["query"] == protein.id]["Abs_Evalue_exp"].items(), key = operator.itemgetter(1))
						if best_hit[1] >= 10:
							CDS_annotation.qualifiers["result"] = ["high confidence"]
						else:
							CDS_annotation.qualifiers["result"] = ["low confidence"]
						CDS_annotation.qualifiers["taxon"] = [informative_df.loc[best_hit[0], "Taxon"], informative_df.loc[best_hit[0], "Abs_Evalue_exp"]]
					else: