Esempio n. 1
0
    def test_remove_site_with_partial_feature_overlap__downstream(self):
        """Test fixing a site that is only partially inside a feature.

        The CDS starts two bases into the restriction site, so the fix is
        expected to change only bases outside the feature: afterwards the
        site must be gone while the feature's extracted sequence is equal
        to what it was before.
        """
        enzyme = Restriction.BsmBI
        upstream = 'CCCCCCCCCCCCCCCCCCCCCCCC'
        site = enzyme.site
        downstream = 'AAAAAAAAAAAAAAAA'
        full_seq = Seq(upstream + site + downstream, generic_dna)
        seq_record = SeqRecord(full_seq)

        # CDS covering everything from two bases into the site to the end.
        cds = SeqFeature(
            FeatureLocation(len(upstream) + 2, len(full_seq), strand=1),
            type='CDS', id='1')
        seq_record.features.append(cds)
        cds_seq_before = cds.extract(str(seq_record.seq))

        # Precondition: exactly one site is present before the fix.
        sites_before = find_restriction_site_occurrences(seq_record, enzyme)
        self.assertEqual(1, len(sites_before))

        seq_record = _remove_site_with_partial_feature_overlap(
            seq_record, sites_before[0], cds)
        cds_seq_after = cds.extract(str(seq_record.seq))

        # Postconditions: site removed, feature sequence untouched.
        sites_after = find_restriction_site_occurrences(seq_record, enzyme)
        self.assertEqual(0, len(sites_after))
        self.assertEqual(cds_seq_before, cds_seq_after)
    def test_deletion__overlapping_features(self):
        """Delete the bases shared by two overlapping CDS features.

        After the deletion the record's sequence should be the two flanks
        joined together, feature 1 should extract to the leading flank and
        feature 2 to the trailing flank.
        """
        # Example based on intersection of nei and arbB genes in MG1655.
        head = 'GCCCTGGCTGCCAGCA'
        shared = 'CTAG'
        tail = 'GCCGACCGCTTCGG'
        full_seq_str = head + shared + tail
        seq_record = SeqRecord(Seq(full_seq_str, generic_dna))

        # Feature 1 runs from the start through the end of the shared bases.
        feature_1 = SeqFeature(
                FeatureLocation(0, len(head) + len(shared), strand=1),
                type='CDS', id='1')
        seq_record.features.append(feature_1)

        # Feature 2 runs from the start of the shared bases to the end.
        feature_2 = SeqFeature(
                FeatureLocation(len(head), len(full_seq_str), strand=1),
                type='CDS', id='2')
        seq_record.features.append(feature_2)

        # Replacing the shared bases with '' is the deletion under test.
        maker = VCFToGenbankMaker(seq_record, None, None)
        maker._update_genome_record_for_variant(len(head), shared, '')

        # The sequence itself.
        self.assertEqual(head + tail, str(seq_record.seq))

        # The feature annotations.
        self.assertEqual(head, str(feature_1.extract(seq_record.seq)))
        self.assertEqual(tail, str(feature_2.extract(seq_record.seq)))
    def test_update_genome_record_for_variant__overlapping_features(self):
        """Tests handling a record that lands in a region of overlapping
        features.

        The four bases shared by two CDS features are swapped for a
        same-length replacement; the sequence and both feature extractions
        are then expected to contain the replacement.
        """
        # Example based on intersection of nei and arbB genes in MG1655.
        head = 'GCCCTGGCTGCCAGCA'
        shared = 'CTAG'
        tail = 'GCCGACCGCTTCGG'
        full_seq_str = head + shared + tail
        seq_record = SeqRecord(Seq(full_seq_str, generic_dna))

        seq_record.features.append(SeqFeature(
                FeatureLocation(0, len(head) + len(shared), strand=1),
                type='CDS', id='1'))
        seq_record.features.append(SeqFeature(
                FeatureLocation(len(head), len(full_seq_str), strand=1),
                type='CDS', id='2'))

        replacement = 'TTAA'
        maker = VCFToGenbankMaker(seq_record, None, None)
        maker._update_genome_record_for_variant(len(head), shared,
                replacement)

        # Features changed, so look them up again by id on the record.
        # If an id occurs more than once the last occurrence is kept.
        found = {'1': None, '2': None}
        for feature in seq_record.features:
            if feature.id in found:
                found[feature.id] = feature
        feature_1 = found['1']
        feature_2 = found['2']
        assert feature_1
        assert feature_2

        # The sequence itself.
        self.assertEqual(head + replacement + tail, str(seq_record.seq))

        # Feature added to represent swap.
        # self.assertEqual(3, len(seq_record.features))

        # Both features span the swapped region, so both must show it.
        self.assertEqual(head + replacement,
                str(feature_1.extract(seq_record.seq)))
        self.assertEqual(replacement + tail,
                str(feature_2.extract(seq_record.seq)))
Esempio n. 4
0
    def test_remove_site_in_coding_feature(self):
        """Tests removing a restriction enzyme that falls in a coding
        region.

        The site is removed through ``_remove_site_in_coding_feature``; the
        test then checks that the call reports success, that no occurrence
        of the site remains, and that the CDS gives the same
        ``translate_custom`` result before and after.
        """
        RESTRICTION_ENZYME = Restriction.BsmBI
        BEFORE = 'ATGTTTGGGCCCAAATTTGGGAAATTTGGGAAATTTGGGAAATTTGGGAAATTTGGG'
        SITE_SEQ = RESTRICTION_ENZYME.site
        AFTER = 'TAGAAAAAAAAAAAAAAAA'
        SEQ = Seq(BEFORE + SITE_SEQ + AFTER, generic_dna)
        seq_record = SeqRecord(SEQ)

        refactor_context = RefactorContext(seq_record)

        # The CDS spans BEFORE, the site and the first three bases of AFTER.
        feature_1_loc = FeatureLocation(0,
                                        len(BEFORE) + len(SITE_SEQ) + 3,
                                        strand=1)
        feature_1 = SeqFeature(feature_1_loc, type='CDS', id='1')
        seq_record.features.append(feature_1)
        FEATURE_1_SEQ_ORIG = feature_1.extract(str(seq_record.seq))
        # Floor division keeps the codon count an int so it can be used as
        # a list repetition count under both Python 2 and Python 3.
        FEATURE_1_NUM_CODONS = len(feature_1) // 3

        # Compute fake feature profile: one constant value per codon for
        # each profile type.
        fake_profile_values_map = {
            feature_1.id: {
                GCContentFeatureProfile.get_name():
                [0.2] * FEATURE_1_NUM_CODONS,
                SecondaryStructureFeatureProfile.get_name():
                [-10] * FEATURE_1_NUM_CODONS,
                CodonRarityFeatureProfile.get_name():
                [0.5] * FEATURE_1_NUM_CODONS,
            },
        }
        refactor_context.set_feature_id_to_profile_values_map(
            fake_profile_values_map)

        # Precondition: exactly one site is present.
        occurrences = find_restriction_site_occurrences(
            seq_record, RESTRICTION_ENZYME)
        self.assertEqual(1, len(occurrences))

        result = _remove_site_in_coding_feature(refactor_context, seq_record,
                                                occurrences[0], feature_1)

        self.assertTrue(result['is_success'])

        seq_record = result['updated_genome_record']

        FEATURE_1_SEQ_UPDATED = feature_1.extract(str(seq_record.seq))

        # Postconditions: site gone, translation unchanged.
        occurrences = find_restriction_site_occurrences(
            seq_record, RESTRICTION_ENZYME)
        self.assertEqual(0, len(occurrences))
        self.assertEqual(translate_custom(FEATURE_1_SEQ_ORIG),
                         translate_custom(FEATURE_1_SEQ_UPDATED))
Esempio n. 5
0
    def _findGeneInSeq(self, record):
        """Extract gene sequence from larger sequence (e.g. genomes)
by searching features."""
        if not record.features:
            # if there aren't any features, just return the record
            return record
        for feature in record.features:
            feature_names = []
            if 'gene' in feature.qualifiers.keys():
                feature_names.extend(feature.qualifiers['gene'])
            if 'gene_synonym' in feature.qualifiers.keys():
                feature_names.extend(feature.qualifiers['gene_synonym'])
            if 'product' in feature.qualifiers.keys():
                feature_names.extend(feature.qualifiers['product'])
            gene_names = [e.lower() for e in self.gene_names]
            feature_names = [e.lower() for e in feature_names]
            if set(gene_names) & set(feature_names):
                try:
                    extractor = SeqFeature(feature.location)
                    found_seq = extractor.extract(record)
                except ValueError:
                    # catch value errors raised for sequences
                    #  with "fuzzy" positions
                    # TODO: what are fuzzy positions and can I use
                    #  them?
                    return record
                else:
                    return found_seq
        return record
Esempio n. 6
0
    def test__init(self):
        """Test FeatureMatch object creation.

        Checks the attributes of ``self.match`` (forward strand, set up
        outside this method) and of a FeatureMatch built here on the
        reverse complement of ``self.seq``.
        """
        # Forward strand.
        fwd = self.match
        expected_aas = self.feature.extract(self.seq).translate(to_stop=True)
        self.assertEqual(fwd.direction, "forward")
        self.assertEqual(str(fwd.dna), "ATGTACTCCACTATCTGCTGA")
        self.assertEqual(str(fwd.long_dna), str(self.feature_seq))
        self.assertEqual(str(fwd.promotor_region), "AAA")
        self.assertEqual(str(fwd.terminator_region), "TTT")
        self.assertEqual(str(fwd.mrna), str(self.feature_seq.transcribe()))
        self.assertEqual(str(fwd.aas), str(expected_aas))

        # Reverse strand: same checks on the reverse complement, with a
        # minus-strand feature.
        rc_seq = self.seq.reverse_complement()
        rc_window = rc_seq[3:-3]
        rc_feature = SeqFeature(FeatureLocation(6, 27),
                                type="gene",
                                strand=-1)
        rev = FeatureMatch(rc_feature, rc_window, -1, 3)
        self.assertEqual(rev.direction, "reverse")
        self.assertEqual(str(rev.dna),
                         str(rc_window[3:-3].reverse_complement()))
        self.assertEqual(str(rev.long_dna), str(self.feature_seq))
        self.assertEqual(str(rev.promotor_region), "AAA")
        self.assertEqual(str(rev.terminator_region), "TTT")
        self.assertEqual(str(rev.mrna),
                         str(rc_window.transcribe().reverse_complement()))
        self.assertEqual(
            str(rev.aas),
            str(rc_feature.extract(rc_seq).translate(to_stop=True)))
Esempio n. 7
0
def get_longest(seq_record, gene2isoforms):
    """Process the longest isoform of every gene.

    For each gene the isoform whose parts have the greatest summed length
    is selected. What happens next depends on the module-level
    ``args.format``:

    * ``'bed'``: the isoform is handed to ``compound_to_bed`` together
      with the adjusted chromosome name and the gene name.
    * ``'fasta'``: the isoform is extracted from ``seq_record`` and the
      result, with name, id and description set to the gene, is collected.

    Any other format does nothing for the gene.

    Returns:
        list: the extracted records (only populated in ``'fasta'`` mode).
    """
    extracted = []
    chrom = adjust_name(seq_record.name)

    for gene, isoforms in gene2isoforms.iteritems():
        longest = max(isoforms,
                      key=lambda iso: sum([len(part) for part in iso]))

        if args.format == 'bed':
            compound_to_bed(longest, chrom, gene)
            continue
        if args.format != 'fasta':
            continue

        # A single part is used directly; several parts are joined.
        if len(longest) > 1:
            location = CompoundLocation(longest, operator="join")
        else:
            location = longest[0]

        feature = SeqFeature(location=location, type='utr',
                             strand=longest[0].strand)
        record = feature.extract(seq_record)
        record.name = gene
        record.id = gene
        record.description = gene
        extracted.append(record)
    return extracted
Esempio n. 8
0
    def set_gap_features(self, len_cutoff=10):
        """Annotate long runs of ``N`` as ``assembly_gap`` features.

        Every record in ``self.seq_dict`` is scanned (upper-cased) for runs
        of at least ``len_cutoff`` consecutive ``N`` characters. Each run
        gets a plus-strand ``assembly_gap`` feature appended to the record,
        carrying ``estimated_length``, ``gap_type`` and ``linkage_evidence``
        qualifiers. The number of features added across all records is
        stored in ``self.report["num_assembly_gap"]``.

        Only true matches of the run pattern are annotated. The previous
        split-based scan also treated any leftover fragment that merely
        began with ``N`` (for example a record starting with a short run
        below the cutoff) as a gap.

        Args:
            len_cutoff (int): minimum run length that counts as a gap.
        """
        # Same run definition as before: len_cutoff - 1 Ns followed by N+.
        gap_pattern = re.compile("N" * len_cutoff + "+")
        num_assembly_gap = 0
        for record in self.seq_dict.values():
            seq = str(record.seq).upper()
            for gap in gap_pattern.finditer(seq):
                startPosition, endPosition = gap.span()
                fragment = gap.group()
                qualifiers = {
                    "estimated_length": [len(fragment)],
                    "gap_type": ["within scaffold"],
                    "linkage_evidence": [self.linkage_evidence]
                }
                location = FeatureLocation(startPosition,
                                           endPosition,
                                           strand=1)
                feature = SeqFeature(location,
                                     id=uuid4(),
                                     type="assembly_gap",
                                     qualifiers=qualifiers)

                # Sanity check that the location really covers the run.
                assert str(feature.extract(record).seq).upper() == fragment
                record.features.append(feature)
                num_assembly_gap += 1
        self.report["num_assembly_gap"] = num_assembly_gap
Esempio n. 9
0
def generate_genes(genbank):
    """
    Yield one row dict per usable feature of a genbank object.

    Features of type 'source' or 'gene' are skipped, as are features
    whose joined 'locus_tag' qualifier is missing or empty. Each yielded
    row holds the feature's location start/end, strand, ref and ref_db,
    every qualifier joined into a single string, a '_key' copied from
    'locus_tag', and the feature's DNA sequence under 'dna_sequence'.
    """
    skipped_types = ('source', 'gene')
    for feature in genbank.features:
        if feature.type in skipped_types:
            continue

        location = feature.location
        row = {
            'location_start': location.start,
            'location_end': location.end,
            'strand': feature.strand,
            'ref': feature.ref,
            'ref_db': feature.ref_db
        }
        # Qualifier values are lists; join them so multi-valued
        # qualifiers survive as a single comma-separated string.
        row.update(
            (name, ', '.join(val))
            for (name, val) in feature.qualifiers.items())

        locus_tag = row.get('locus_tag')
        if not locus_tag:
            # Only features with a locus tag can be used.
            continue
        row['_key'] = row['locus_tag']

        # Generate the DNA sequence using biopython
        # https://biopython.org/DIST/docs/api/Bio.SeqFeature.SeqFeature-class.html#extract
        extractor = SeqFeature(feature.location,
                               feature.type)  # type: SeqFeature
        row['dna_sequence'] = str(extractor.extract(genbank.seq))
        yield row
def get_gene_and_201bp_upstream(genefeature,genomeseq):
    """Return the gene's sequence extended by 201 bp on its upstream side.

    On the plus strand the window is widened before the feature's start,
    on the minus strand after its end; the widened feature is then
    extracted from ``genomeseq`` on the gene's strand.

    Args:
        genefeature: feature whose ``location`` gives start, end, strand.
        genomeseq: the sequence the feature's coordinates refer to.

    Returns:
        The result of ``SeqFeature.extract`` for the widened location.

    Raises:
        ValueError: if the feature's strand is neither 1 nor -1. This used
            to surface as an unbound-local error.
    """
    mystart = genefeature.location.start
    myend = genefeature.location.end
    mystrand = genefeature.location.strand
    if mystrand == 1:
        # Clamp at 0: a negative start would be read as an index counted
        # from the end of the sequence when the location is sliced.
        newlocation = FeatureLocation(max(0, mystart - 201), myend,
                                      strand=mystrand)
    elif mystrand == -1:
        # NOTE(review): an end past the sequence length is assumed to be
        # truncated by slicing - confirm for the sequence type in use.
        newlocation = FeatureLocation(mystart, myend + 201,
                                      strand=mystrand)
    else:
        raise ValueError(
            "Cannot determine upstream side for strand %r" % (mystrand,))
    return SeqFeature(newlocation).extract(genomeseq)
def find_cds():
    """Locate, translate and return the CDS of the current record.

    Reads the module-level ``record_dict[keys]`` record, splits its
    description on ``|`` and takes the field starting with ``CDS``
    (the last one, if several), which is split on ``:`` and ``-`` into
    a label, a start and an end. The CDS is extracted from the record's
    sequence and its translation is printed.

    Returns:
        tuple: ``(cds_start, cds_end, cds_sequence)`` where start and end
        are the strings parsed from the description.
    """
    record = record_dict[keys]
    for field in str(record.description).split("|"):
        if re.match("CDS", field):
            feature, cds_start, cds_end = re.split(":|-", field)

    # NOTE(review): both bounds are shifted down by one. If the header
    # coordinates are 1-based inclusive this drops the final base -
    # confirm against the input format.
    cds_location = FeatureLocation(int(cds_start) - 1, int(cds_end) - 1)
    cds_feature = SeqFeature(cds_location, type=str(feature))
    cds_sequence = cds_feature.extract(record_dict[keys].seq)
    print(cds_sequence.translate())
    return cds_start, cds_end, cds_sequence
Esempio n. 12
0
def create_fragmented_pseudo(args, fragments, seqrecord):
    """Take a list of features and merge them into a single pseudogene feature.

    The new feature spans from the smallest start to the largest end of
    ``fragments`` and is appended to ``seqrecord.features``. Its strand is
    the fragments' common non-zero strand, or 0 when no fragment has one
    or when they disagree (a warning is printed in the latter case). The
    fragments' hits and the locus tags / parents of the fragments are
    gathered into the new feature's qualifiers.
    """
    start = min(fragment.location.start for fragment in fragments)
    end = max(fragment.location.end for fragment in fragments)
    strands = [
        fragment.strand for fragment in fragments
        if (fragment.strand is not None and fragment.strand != 0)
    ]

    if not strands:
        # Occurs if two intergenic regions are being merged
        strand = 0
    elif all(candidate == strands[0] for candidate in strands):
        strand = strands[0]
    else:
        # TODO: This occurs if features from + and - strand are going to be combined. Very rarely does this happen
        # TODO: but there could be biologically relevant reasons for this to occur ie two fragments on + strand
        # TODO: separated by an ORF on the - strand which is an insertion sequence.
        # TODO: We should explore options to handle these cases in the future.
        tags = [fragment.qualifiers['locus_tag'][0] for fragment in fragments]
        parent_tags = [fragment.qualifiers.get('parents') for fragment in fragments]

        # TODO: understand why 'tags + parent_tags' results in a nested list and fix the source of the issue rather than forcibly flattening.
        # Order of operations: flatten -> remove duplicates -> remove 'None' -> convert to list
        all_tags = list(filter(None, set(flatten(tags + parent_tags))))

        common.print_with_time("WARNING: Pseudogene detected which traverses features on (+) and (-) strands.\n"
                               "We recommend manual inspection of this region.\n"
                               "Features involved: %s" % all_tags)

        strand = 0
        # raise RuntimeError("Trying to combine genes on opposite strands.")

    hits = [hit for fragment in fragments for hit in fragment.qualifiers['hits']]

    # The locus tags are re-added and the list re-flattened/deduplicated
    # on every pass, exactly as before.
    parents = []
    for fragment in fragments:
        inherited = fragment.qualifiers.get('parents')
        if inherited is not None:
            parents.extend(inherited)
        parents.extend([other.qualifiers['locus_tag'][0] for other in fragments])
        parents = list(set(flatten(parents)))

    pseudo = SeqFeature(location=FeatureLocation(start, end),
                        type='pseudogene',
                        strand=strand)

    pseudo.qualifiers['nucleotide_seq'] = pseudo.extract(seqrecord.seq)
    pseudo.qualifiers['contig_id'] = seqrecord.name
    pseudo.qualifiers['hits'] = hits
    pseudo.qualifiers['locus_tag'] = ''
    pseudo.qualifiers['parents'] = parents
    pseudo.qualifiers['pseudo_type'] = PseudoType.fragmented
    pseudo.qualifiers['note'] = "Pseudogene candidate. Reason: Predicted fragmentation of a single gene."

    seqrecord.features.append(pseudo)
Esempio n. 13
0
 def test_within(self):
     """Features: write/read simple within locations."""
     s = "N" * 100
     # Each case: (location, strand, feature type, expected location
     # string, whether len(f) is compared with len(f.extract(s))).
     cases = [
         (FeatureLocation(WithinPosition(2,6),10),
          +1, "CDS", "(3.9)..10", True),
         (FeatureLocation(WithinPosition(12,6), WithinPosition(20,8)),
          +1, "CDS", "(13.19)..(20.28)", True),
         (FeatureLocation(25,WithinPosition(30,3)),
          +1, "misc_feature", "26..(30.33)", True),
         # No length comparison is made for this case.
         (FeatureLocation(WithinPosition(35,4),40),
          -1, "rRNA", "complement((36.40)..40)", False),
         (FeatureLocation(WithinPosition(45,2), WithinPosition(50,3)),
          -1, "repeat_region", "complement((46.48)..(50.53))", True),
         (FeatureLocation(55,WithinPosition(60,5)),
          -1, "CDS", "complement(56..(60.65))", True),
     ]
     for location, strand, feature_type, expected, compare_len in cases:
         f = SeqFeature(location, strand=strand, type=feature_type)
         self.assertEqual(_insdc_feature_location_string(f,100),
                          expected)
         if compare_len:
             self.assertEqual(len(f), len(f.extract(s)))
         self.record.features.append(f)
     # Round-trip the accumulated features.
     self.write_read_checks()
def get_seq(genome, intervals):
    """Extract one SeqRecord per interval from a collection of records.

    Args:
        genome: iterable of records accepted by ``SeqIO.to_dict``.
        intervals: iterable of ``(seq_id, start, end)``; start and end are
            converted with ``int`` and used as ``FeatureLocation`` bounds.

    Returns:
        list: one ``SeqRecord`` per interval, with id
        ``"<seq_id>:<start + 1>-<end>"`` and an empty description.
    """
    # Built once, on first use. Rebuilding it per interval repeated the
    # full indexing work and broke on the second interval when ``genome``
    # was a one-shot iterator (already consumed by the first call).
    records_by_id = None
    seq_records = []
    for seq_id, start, end in intervals:
        start = int(start)
        end = int(end)
        feature = SeqFeature(FeatureLocation(start, end))
        if records_by_id is None:
            records_by_id = SeqIO.to_dict(genome)
        seq = feature.extract(records_by_id[seq_id].seq)
        seq_record = SeqRecord(seq)
        seq_record.id = '{0}:{1}-{2}'.format(seq_id, start + 1, end)
        seq_record.description = ''
        seq_records.append(seq_record)
    return seq_records
    def get_seq_record_by_id_location(self, identifier, start=None, end=None, strand_=None):
        """Return a sequence, or a slice of it, for ``identifier``.

        When ``start``, ``end`` and ``strand_`` are all given, the sequence
        returned by ``self.get_sequence_by_id`` is cut to ``start..end``
        and returned as a string. Otherwise the value from
        ``self.get_sequence_by_id`` is returned unchanged.

        Raises:
            ValueError: if ``self.indexed_data`` is None.
        """
        if self.indexed_data is None:
            raise ValueError("Genbank Handler not initialized")

        location_missing = start is None or end is None or strand_ is None
        if location_missing:
            return self.get_sequence_by_id(identifier)

        seq = self.get_sequence_by_id(identifier)
        # NOTE(review): dna_seq is built but never used, and strand_ is
        # required yet not applied to the location - confirm the intent.
        dna_seq = Seq(str(seq), generic_dna)
        feature = SeqFeature(FeatureLocation(start, end), type="exon")
        return str(feature.extract(seq))
Esempio n. 16
0
    def test_deletion__overlapping_features(self):
        """Delete the region shared by two overlapping CDS features.

        Expected outcome: the record's sequence is the two flanks joined,
        feature 1 extracts to the leading flank and feature 2 to the
        trailing flank.
        """
        # Example based on intersection of nei and arbB genes in MG1655.
        left_flank = 'GCCCTGGCTGCCAGCA'
        shared = 'CTAG'
        right_flank = 'GCCGACCGCTTCGG'
        whole = left_flank + shared + right_flank
        seq_record = SeqRecord(Seq(whole, generic_dna))

        shared_start = len(left_flank)
        shared_end = len(left_flank) + len(shared)

        # Feature 1 ends where the shared region ends.
        feature_1 = SeqFeature(FeatureLocation(0, shared_end, strand=1),
                               type='CDS',
                               id='1')
        seq_record.features.append(feature_1)

        # Feature 2 starts where the shared region starts.
        feature_2 = SeqFeature(FeatureLocation(shared_start, len(whole),
                                               strand=1),
                               type='CDS',
                               id='2')
        seq_record.features.append(feature_2)

        # An empty replacement string makes this variant a deletion.
        maker = VCFToGenbankMaker(seq_record, None, None)
        maker._update_genome_record_for_variant(shared_start, shared, '')

        # Sequence check.
        self.assertEqual(left_flank + right_flank, str(seq_record.seq))

        # Annotation checks.
        self.assertEqual(left_flank,
                         str(feature_1.extract(seq_record.seq)))
        self.assertEqual(right_flank,
                         str(feature_2.extract(seq_record.seq)))
Esempio n. 17
0
    def test_avoid_changes_in_shadows(self):
        """Avoid changing bases in CDS features.

        A random A/T-only sequence with one CDS is passed to
        ``automated_intergenic_gc_fixer`` with a local-window lower bound
        of 1.0. The test asserts the GC value of the 50..150 window
        afterwards and that the CDS sequence is unchanged.
        """
        # Sequence is all AT so some will have to change.
        at_only = ''.join(random.choice('AT') for _ in range(200))
        orig_seq_record = SeqRecord(Seq(at_only, generic_dna))

        # We will expect a 60 base shadow from 60 - 120. This is the CDS
        # and 20 bases upstream.
        cds = SeqFeature(FeatureLocation(80, 120, strand=1),
                         type='CDS', id=1)
        orig_seq_record.features.append(cds)

        # Work on a copy so the original stays available for comparison.
        seq_record = copy.deepcopy(orig_seq_record)
        self.assertTrue(GC(seq_record.seq) == 0)

        # Aim for 100% to hit all bases possible.
        constraints = GCContentConstraints()
        constraints.local_window_lower_bound = 1.0
        constraints.local_window_upper_bound = 1.1  # no upper bound

        # Hit just one position.
        automated_intergenic_gc_fixer(seq_record,
                                      [(100, 101)],
                                      gc_content_constraint_obj=constraints)

        # The window centered at 100 is asserted to end up at GC 40
        # (presumably the window's bases outside the shadow - confirm).
        self.assertEqual(40, GC(seq_record.seq[50:150]))

        # Make sure shadow seq is unchanged.
        self.assertEqual(str(cds.extract(orig_seq_record.seq)),
                         str(cds.extract(seq_record.seq)))
Esempio n. 18
0
def retrieveCompositeSequence(seq_record,seqList) :
    """Extract the region of ``seq_record`` spanned by all nodes together.

    Each node is a string of the form ``"<name>:<start>..<end>"``. The
    smallest and largest coordinate over all nodes bound the extracted
    region.

    Returns:
        SeqRecord: the extracted sequence, with id
        ``"<seq_record.id>|<start>_<end>"`` and an empty description.
    """
    positions = []
    for node in seqList:
        _name, coord = node.split(":")
        low, high = coord.split("..")
        # Coordinates may be written as floats; truncate them to ints.
        positions.append(int(float(low)))
        positions.append(int(float(high)))

    start = min(positions)
    end = max(positions)
    region = SeqFeature(FeatureLocation(start, end)).extract(seq_record)
    region_id = seq_record.id + "|" + str(start) + "_" + str(end)
    return SeqRecord(seq=region.seq, id=region_id, description="")
Esempio n. 19
0
    def __init__(self, coords, global_seq):
        """Store a region's coordinates, parent sequence and subsequence.

        Args:
            coords (tuple(int, int)): start and end coordinates of the
                region.
            global_seq: the original sequence to which the region
                belongs. The extracted slice must expose ``.seq``
                (presumably a SeqRecord - confirm against callers).
        """
        self.start = coords[0]
        self.end = coords[1]
        self.globalsequence = global_seq
        # FeatureLocation takes Python slicing-style positions, so the
        # start is shifted down by one while the end is used as is.
        window = FeatureLocation(self.start - 1, self.end)
        self.subsequence = SeqFeature(window).extract(global_seq).seq
Esempio n. 20
0
def check_sub(feature, sequence):
    """Build CDS features from the sub-feature tree of ``feature``.

    Children that themselves have sub-features are processed recursively.
    Childless children of type 'CDS' contribute their location, and are
    combined into one new CDS feature whose qualifiers are those on which
    all contributing children agree (minus 'Parent'), plus a
    'translation' computed from ``sequence.seq``.

    Returns:
        list: the features built by recursion if there are any; otherwise
        a list with the single combined CDS feature, or an empty list
        when no CDS child was found.
    """
    built = []
    cds_locations = []
    shared_quals = {}
    mismatched = []
    for child in feature.sub_features:
        if child.sub_features:  # Not a leaf yet: go deeper
            built.extend(check_sub(child, sequence))
            continue
        if child.type != 'CDS':
            continue
        bounds = [child.location.start.real, child.location.end.real]
        cds_locations.append(
            FeatureLocation(bounds[0], bounds[1], strand=child.strand))
        # A qualifier survives on the combined feature ONLY if every
        # child carries the same value for it (e.g. the same protein_ID).
        for key, value in child.qualifiers.items():
            if key not in shared_quals:
                shared_quals[key] = value
            elif not shared_quals[key] == value:
                mismatched.append(key)

    for key in mismatched:  # Drop qualifiers that differ between parts
        shared_quals.pop(key, None)
    shared_quals.pop('Parent', None)  # Drop the parent reference.

    # Anything produced by recursion is returned as is; a combined
    # feature is only built at the tip of the tree.
    if built:
        return built
    if len(cds_locations) == 0:
        return built

    if len(cds_locations) > 1:
        ordered = sorted(cds_locations, key=lambda loc: loc.start.real)
        if ordered[0].strand == 1:
            new_loc = CompoundLocation(ordered)
        else:
            # Any strand other than 1: parts listed in descending order.
            new_loc = CompoundLocation(list(reversed(ordered)))
    else:
        new_loc = cds_locations[0]

    combined = SeqFeature(new_loc)
    combined.qualifiers = shared_quals
    combined.type = 'CDS'
    protein = combined.extract(sequence.seq).translate(stop_symbol='')
    combined.qualifiers['translation'] = [str(protein)]
    built.append(combined)

    return built
def require_sd(data, record, chrom_start, sd_min, sd_max):
    """Yield the putative genes that have an SD hit in their search window.

    For each putative gene (indexed as ``[0]``, ``[1]``, ``[2]`` with
    ``[2]`` used as the strand) a window between ``sd_min`` and
    ``sd_max`` bases away is computed: before ``[0]`` for a positive
    strand, after ``[1]`` otherwise. The window is passed through
    ``__ensure_location_in_bounds``, extracted from ``record.seq`` and
    searched with ``NaiveSDCaller.list_sds``.

    Yields:
        The putative gene extended with ``(start, end)`` of the window,
        for every gene with at least one hit.
    """
    sd_finder = NaiveSDCaller()
    for putative_gene in data:
        strand = putative_gene[2]
        if strand > 0:
            anchor = chrom_start + putative_gene[0]
            start = anchor - sd_max
            end = anchor - sd_min
        else:
            anchor = chrom_start + putative_gene[1]
            start = anchor + sd_min
            end = anchor + sd_max

        (start, end) = __ensure_location_in_bounds(start=start, end=end,
                                                   parent_length=len(record))
        window = SeqFeature(FeatureLocation(start, end, strand=strand),
                            type='domain')
        # Get the sequence of the window on the gene's strand
        window_seq = str(window.extract(record.seq))
        if len(sd_finder.list_sds(window_seq)) > 0:
            yield putative_gene + (start, end)
Esempio n. 22
0
    def get_residue_annotations(self, start_resnum, end_resnum=None):
        """Retrieve letter annotations for a residue or a range of residues

        Args:
            start_resnum (int): Residue number
            end_resnum (int): Optional residue number, specify if a range is
                desired. Any falsy value selects the single residue
                ``start_resnum``.

        Returns:
            dict: Letter annotations for this residue or residues

        """
        last_resnum = end_resnum if end_resnum else start_resnum

        # Residue numbers are shifted to a slice: start - 1 up to the end.
        span = FeatureLocation(start_resnum - 1, last_resnum)

        # Extracting from self gives the sub-object whose annotations
        # are returned.
        return SeqFeature(span).extract(self).letter_annotations
Esempio n. 23
0
def extract_sequences_one_sample(args):
    '''
    Function for extracting protein sequences given annotated regions and
    produce fasta files that can be used for clustering.

    ``args`` is a 5-tuple ``(fasta_path, annotation_path, sample,
    output_dir, domain)``. The annotation is loaded with
    ``load_annotation_pfam``, the fasta is indexed with ``index_fasta``,
    and every annotated region is extracted and written to
    ``<output_dir>/forClustering/<sample>.fasta``.

    The output file is now closed even if extraction fails part-way.
    A region whose strand is not recognised is reported and skipped;
    previously it reused the feature of the preceding region, or failed
    on an unbound name if it was the first.
    '''
    (fasta_path, annotation_path, sample, output_dir, domain) = args
    domainInfo = load_annotation_pfam(annotation_path, domain)
    print("Generating domain fasta sequences for " + sample + " ...")
    from Bio import SeqIO
    from Bio.SeqFeature import SeqFeature, FeatureLocation
    from Bio.SeqRecord import SeqRecord
    (annot, start, stop, strand, evalue) = domainInfo
    record_dict = index_fasta(fasta_path)
    recordlist = []
    outfilename = output_dir + '/forClustering/' + sample + '.fasta'
    with open(outfilename, 'w') as outhandle:
        for domainID in annot:
            for i, domain in enumerate(annot[domainID]):
                try:
                    seq = record_dict[domain]
                except KeyError:
                    print("Error: " + domain + " not in fasta file.\n")
                    # NOTE(review): this abandons the remaining regions
                    # of this domainID - confirm 'continue' was not meant.
                    break
                a = start[domainID][i]
                b = stop[domainID][i]
                seq_strand = strand[domainID][i]
                seq_evalue = evalue[domainID][i]
                # Substring tests kept as written (an empty strand string
                # therefore counts as '+').
                if seq_strand in '+':
                    strand_value = 1
                elif seq_strand in '-':
                    strand_value = -1
                else:
                    print("Error: unrecognised strand " + seq_strand +
                          " for " + domain + ".\n")
                    continue
                # NOTE(review): both bounds are shifted down by one. If
                # the annotation is 1-based inclusive this drops the last
                # residue - confirm against the annotation format.
                domain_feature = SeqFeature(
                    FeatureLocation(a - 1, b - 1, strand=strand_value),
                    type="domain")
                feature_seq = domain_feature.extract(seq)
                feature_seq.id = (feature_seq.id + ' ' + domainID + ' ' +
                                  seq_evalue)
                recordlist.append(feature_seq)
        SeqIO.write(recordlist, outhandle, "fasta")
    print("Done")
Esempio n. 24
0
def retrievePartialSequence(seqList,seq2coord,gi2domain,gi2phylum,gi2species,seqId2seqRecord) :
    """Extract one sub-sequence per sequence id named in ``seqList``.

    Nodes are strings of the form ``"<seq id>:<coords>"``. For every
    distinct sequence id that has a record in ``seqId2seqRecord``, the
    first node whose id equals that record's id is extracted using the
    coordinates in ``seq2coord[node]`` and then removed from ``seqList``
    (the caller's list is modified in place). ``gi2domain`` is not used.

    Returns:
        list: SeqRecords with id
        ``"<gi>|<phylum>|<species>|<coords with '..' replaced by '_'>"``
        and an empty description.
    """
    # Distinct sequence ids mentioned by the nodes.
    wanted_ids = set()
    for node in seqList:
        seq_name, _coord = node.split(":")
        wanted_ids.add(seq_name)

    extracted = []
    for seqId in wanted_ids:
        if seqId not in seqId2seqRecord:
            continue
        seq_record = seqId2seqRecord[seqId]
        for node in seqList:
            if seq_record.id != node.split(":")[0]:
                continue
            # Cut out the sub-sequence given by the node's coordinates.
            start = seq2coord[node][0]
            end = seq2coord[node][1]
            window = SeqFeature(FeatureLocation(int(start), int(end)))
            piece = window.extract(seq_record)

            gi = node.split(":")[0]
            species = gi2species[gi]
            phylum = gi2phylum[gi]
            coord_tag = node.split(":")[1].replace("..", "_")
            new_id = gi + "|" + phylum + "|" + species + "|" + coord_tag

            extracted.append(
                SeqRecord(seq=piece.seq, id=new_id, description=""))
            # Removing during iteration is safe only because the loop is
            # left immediately afterwards.
            seqList.remove(node)
            break
    return extracted
Esempio n. 25
0
def extract_sequences_one_sample(args):
    '''
    Extract the annotated domain regions of one sample and write them to a
    fasta file that can be used for clustering.

    args is a 4-tuple (fasta_path, annotation_path, sample, output_dir).
    The annotation is loaded with load_annotation_pfam and the sequences with
    index_fasta; the extracted regions are written to
    <output_dir>/forClustering/<sample>.fasta.  The id of every written record
    is the id of the extracted record followed by the domain id and e-value.

    Hits whose strand is neither '+' nor '-' are reported and skipped.
    '''
    from Bio import SeqIO
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    (fasta_path, annotation_path, sample, output_dir) = args
    domainInfo = load_annotation_pfam(annotation_path)
    print("Generating domain fasta sequences for " + sample + " ...")
    (annot, start, stop, strand, evalue) = domainInfo
    record_dict = index_fasta(fasta_path)
    recordlist = []
    outfilename = output_dir + '/forClustering/' + sample + '.fasta'

    # The file is still opened up front (so a bad output path fails early),
    # but the context manager guarantees it is closed if extraction raises.
    with open(outfilename, 'w') as outhandle:
        for domainID in annot:
            for i, domain in enumerate(annot[domainID]):
                try:
                    seq = record_dict[domain]
                except KeyError:
                    print("Error: " + domain + " not in fasta file.\n")
                    # NOTE(review): this abandons the remaining hits of this
                    # domainID, not just the missing one; kept as in the
                    # original -- confirm whether `continue` was intended.
                    break
                a = start[domainID][i]
                b = stop[domainID][i]
                seq_strand = strand[domainID][i]
                seq_evalue = evalue[domainID][i]

                # `in` is a substring test, so an empty strand string is
                # treated as forward; kept for backward compatibility.
                if seq_strand in '+':
                    strand_value = 1
                elif seq_strand in '-':
                    strand_value = -1
                else:
                    # Previously this fell through and reused the feature of
                    # the previous hit (or raised on the very first hit).
                    print("Error: unknown strand " + repr(seq_strand) +
                          " for " + domain + ", skipping.\n")
                    continue

                # NOTE(review): if `stop` is 1-based inclusive, b-1 drops the
                # last residue of the region -- confirm against
                # load_annotation_pfam before changing.
                domain_feature = SeqFeature(FeatureLocation(a - 1, b - 1),
                                            type="domain",
                                            strand=strand_value)
                feature_seq = domain_feature.extract(seq)
                feature_seq.id = feature_seq.id + ' ' + domainID + ' ' + seq_evalue
                recordlist.append(feature_seq)

        SeqIO.write(recordlist, outhandle, "fasta")

    print("Done")
    def test_get_codon_feature_location__reverse_strand(self):
        """Codon locations of a reverse-strand CDS are counted from its end."""
        coding_seq = 'ATGTTTGGGTAG'
        record = SeqRecord(
                Seq('CCCCCC' + reverse_complement(coding_seq) + 'AGTA',
                    generic_dna))
        cds = SeqFeature(FeatureLocation(6, 18), type='CDS', id='1',
                strand=-1)
        add_feature_to_seq_record(record, cds)

        # Sanity check: extracting the feature gives back the coding strand.
        extracted = str(cds.extract(record.seq))
        self.assertEqual('ATG', extracted[0:3])
        self.assertEqual('TTT', extracted[3:6])

        # (codon index, expected start, expected end) on the parent sequence.
        expectations = ((0, 15, 18), (1, 12, 15))
        for codon_index, expected_start, expected_end in expectations:
            codon_location = get_codon_feature_location(cds, codon_index)
            self.assertEqual(expected_start, codon_location.start)
            self.assertEqual(expected_end, codon_location.end)
Esempio n. 27
0
    def testFeatureUpstream(self, feature, record, sd_min=5, sd_max=15):
        """Run list_sds on the window located just upstream of feature.

        For a feature with strand > 0 the window spans from sd_max to sd_min
        positions before the feature start; otherwise it spans from sd_min to
        sd_max positions after the feature end.  The window is passed through
        ensure_location_in_bounds together with len(record) before the
        sequence is extracted.

        Returns a tuple (self.list_sds(seq), start, end, seq), where seq is
        the strand-aware window sequence as a string.
        """
        location = feature.location
        # The strand decides on which side of the feature "upstream" lies.
        strand = location.strand
        if strand > 0:
            anchor = location.start
            window = (anchor - sd_max, anchor - sd_min)
        else:
            anchor = location.end
            window = (anchor + sd_min, anchor + sd_max)

        start, end = ensure_location_in_bounds(start=window[0],
                                               end=window[1],
                                               parent_length=len(record))

        # A throw-away feature lets extract() handle the strand for us.
        upstream = SeqFeature(FeatureLocation(start, end, strand=strand),
                              type="domain")
        upstream_seq = str(upstream.extract(record.seq))
        return self.list_sds(upstream_seq), start, end, upstream_seq
Esempio n. 28
0
def get_TIS(fullpath, filename):
    """Cut the TIS window out of every record of a FASTA file.

    The first run of digits in filename gives the number of bases that precede
    the CDS in each record.  The extracted window starts
    num_bp_upstream_start bases before that position and ends
    num_bp_downstream_start bases after it (both names are read from the
    enclosing module).  The trimmed records are written to
    "extracted_TIS_<filename>.TIS.fasta", relative to the working directory.

    Raises IndexError if filename contains no digits.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    numbers_in_filename = re.findall(r'\d+', filename)
    num_bp_upstreamcds = int(numbers_in_filename[0])

    # The same window applies to every record, so build the feature once.
    TIS_coordinates = SeqFeature(
        FeatureLocation(num_bp_upstreamcds - num_bp_upstream_start,
                        num_bp_upstreamcds + num_bp_downstream_start))

    # Read the FASTA file and keep only the TIS window of each record.
    extracted_TIS_list = [TIS_coordinates.extract(record)
                          for record in SeqIO.parse(fullpath, "fasta")]

    SeqIO.write(extracted_TIS_list,
                "extracted_TIS_" + filename + ".TIS.fasta", "fasta")
def annotate_like_HXB2(refname, VERBOSE=0):
    '''Copy the HXB2 annotations onto the reference called refname.

    HXB2 is loaded in 'gb' format and the target reference in 'fasta' format
    through load_custom_reference.  Every part of every HXB2 feature is
    trimmed to the reference with trim_to_refseq, located in the reference
    sequence with str.find, and re-created there as a forward-strand
    location.

    Raises ValueError when a mapped feature is shorter than 90% or longer
    than 110% of the HXB2 feature; the features LTR5', LTR3' and V4 are
    exempt from that check.

    Returns the reference record with the new features appended.
    '''
    hxb2 = load_custom_reference('HXB2', 'gb')
    ref = load_custom_reference(refname, 'fasta')
    refs = str(ref.seq)
    unchecked_ids = ["LTR5'", "LTR3'", 'V4']

    def map_part(part):
        '''Locate one HXB2 location part in the reference sequence.'''
        trimmed = trim_to_refseq(refs, part.extract(hxb2)).replace('-', '')
        offset = refs.find(trimmed)
        return FeatureLocation(offset, offset + len(trimmed), strand=+1)

    for fea in hxb2.features:
        if VERBOSE >= 1:
            print(fea.id)

        parts = [map_part(part) for part in fea.location.parts]
        if len(parts) == 1:
            new_location = parts[0]
        else:
            new_location = CompoundLocation(parts)

        feature = SeqFeature(new_location, type=fea.type, id=fea.id)

        # Compare the length of the mapped feature with the HXB2 original.
        if fea.id not in unchecked_ids:
            len_hxb2 = len(fea.extract(hxb2))
            len_ref = len(feature.extract(ref))
            summary = str(len_ref)+' vs '+str(len_hxb2)
            ratio = 1.0 * len_ref / len_hxb2
            if ratio < 0.9:
                raise ValueError('Feature: '+fea.id+' is too short: '+summary)
            elif ratio > 1.1:
                raise ValueError('Feature: '+fea.id+' is too long: '+summary)

        ref.features.append(feature)

    return ref
Esempio n. 30
0
def annotate_like_HXB2(refname, VERBOSE=0):
    '''Transfer the feature table of HXB2 to the reference named refname.

    Both records come from load_custom_reference (HXB2 as 'gb', the target
    as 'fasta').  Each location part of an HXB2 feature is extracted, trimmed
    with trim_to_refseq, stripped of '-' characters and searched for in the
    reference sequence; the hit becomes a forward-strand FeatureLocation.
    Features with several parts are rebuilt as a CompoundLocation.

    Except for LTR5', LTR3' and V4, the mapped feature must measure between
    90% and 110% of the HXB2 feature, otherwise ValueError is raised.

    Returns the annotated reference record.
    '''
    hxb2 = load_custom_reference('HXB2', 'gb')
    ref = load_custom_reference(refname, 'fasta')
    refs = str(ref.seq)

    def locate_in_reference(hxb2_part):
        '''Return where the given HXB2 location part sits in the reference.'''
        hxb2_piece = hxb2_part.extract(hxb2)
        ref_piece = trim_to_refseq(refs, hxb2_piece).replace('-', '')
        first = refs.find(ref_piece)
        last = first + len(ref_piece)
        return FeatureLocation(first, last, strand=+1)

    for fea in hxb2.features:
        if VERBOSE >= 1:
            print(fea.id)

        mapped = [locate_in_reference(p) for p in fea.location.parts]
        location = mapped[0] if len(mapped) == 1 else CompoundLocation(mapped)
        feature = SeqFeature(location, type=fea.type, id=fea.id)

        # Length sanity check, skipped for the regions listed here.
        if fea.id not in ["LTR5'", "LTR3'", 'V4']:
            old_length = len(fea.extract(hxb2))
            new_length = len(feature.extract(ref))
            lengths = str(new_length) + ' vs ' + str(old_length)
            relative = 1.0 * new_length / old_length
            if relative < 0.9:
                raise ValueError('Feature: ' + fea.id + ' is too short: ' + lengths)
            elif relative > 1.1:
                raise ValueError('Feature: ' + fea.id + ' is too long: ' + lengths)

        ref.features.append(feature)

    return ref
Esempio n. 31
0
    def test__init(self):
        "Test FeatureMatch object creation"
        # Forward strand: the match prepared by the fixture.
        forward = self.match
        self.assertEqual(forward.direction, "forward")
        self.assertEqual(str(forward.dna), "ATGTACTCCACTATCTGCTGA")
        self.assertEqual(str(forward.long_dna), str(self.feature_seq))
        self.assertEqual(str(forward.promotor_region), "AAA")
        self.assertEqual(str(forward.terminator_region), "TTT")
        self.assertEqual(str(forward.mrna),
                         str(self.feature_seq.transcribe()))
        self.assertEqual(
            str(forward.aas),
            str(self.feature.extract(self.seq).translate(to_stop=True)))

        # Reverse strand: same gene, seen on the reverse-complemented parent.
        rc_parent = self.seq.reverse_complement()
        rc_window = rc_parent[3:-3]
        rc_feature = SeqFeature(FeatureLocation(6, 27), type="gene",
                                strand=-1)
        reverse = FeatureMatch(rc_feature, rc_window, -1, 3)
        self.assertEqual(reverse.direction, "reverse")
        self.assertEqual(str(reverse.dna),
                         str(rc_window[3:-3].reverse_complement()))
        self.assertEqual(str(reverse.long_dna), str(self.feature_seq))
        self.assertEqual(str(reverse.promotor_region), "AAA")
        self.assertEqual(str(reverse.terminator_region), "TTT")
        self.assertEqual(str(reverse.mrna),
                         str(rc_window.transcribe().reverse_complement()))
        self.assertEqual(
            str(reverse.aas),
            str(rc_feature.extract(rc_parent).translate(to_stop=True)))
    def test_get_codon_feature_location__reverse_strand(self):
        """Check codon coordinates for a CDS annotated on the reverse strand."""
        antisense = reverse_complement('ATGTTTGGGTAG')
        parent = SeqRecord(Seq('CCCCCC' + antisense + 'AGTA', generic_dna))
        location = FeatureLocation(6, 18)
        reverse_cds = SeqFeature(location, type='CDS', id='1', strand=-1)
        add_feature_to_seq_record(parent, reverse_cds)

        # Sanity check: the extracted feature reads in coding orientation.
        coding = str(reverse_cds.extract(parent.seq))
        self.assertEqual('ATG', coding[0:3])
        self.assertEqual('TTT', coding[3:6])

        # The first codon sits at the high-coordinate end of the feature.
        first_codon = get_codon_feature_location(reverse_cds, 0)
        self.assertEqual(15, first_codon.start)
        self.assertEqual(18, first_codon.end)

        # The second codon is the three bases just below the first one.
        second_codon = get_codon_feature_location(reverse_cds, 1)
        self.assertEqual(12, second_codon.start)
        self.assertEqual(15, second_codon.end)
Esempio n. 33
0
 def test_oneof(self):
     """Features: write/read simple one-of locations."""
     parent = "N" * 100

     def one_of(*offsets):
         # A fuzzy position that may be any of the given exact offsets.
         return OneOfPosition([ExactPosition(x) for x in offsets])

     def check(start, end, strand, kind, expected):
         # Build the feature, verify its INSDC location string and its
         # extracted length, then queue it for the write/read round trip.
         feature = SeqFeature(FeatureLocation(start, end),
                              strand=strand, type=kind)
         self.assertEqual(_insdc_feature_location_string(feature, 100),
                          expected)
         self.assertEqual(len(feature), len(feature.extract(parent)))
         self.record.features.append(feature)

     check(one_of(0, 3, 6), 21, +1, "CDS",
           "one-of(1,4,7)..21")
     check(one_of(10, 13, 16), one_of(41, 44, 50), +1, "gene",
           "one-of(11,14,17)..one-of(41,44,50)")
     check(27, one_of(30, 33), +1, "gene",
           "28..one-of(30,33)")
     check(one_of(36, 40), 46, -1, "CDS",
           "complement(one-of(37,41)..46)")
     check(one_of(45, 60), one_of(70, 90), -1, "CDS",
           "complement(one-of(46,61)..one-of(70,90))")
     check(55, one_of(60, 63), -1, "tRNA",
           "complement(56..one-of(60,63))")
     self.write_read_checks()
Esempio n. 34
0
 def test_after(self):
     """Features: write/read simple after locations."""
     parent = "N" * 200

     def check(start, end, strand, expected):
         # Build a CDS feature, verify its INSDC location string and its
         # extracted length, then queue it for the write/read round trip.
         feature = SeqFeature(FeatureLocation(start, end),
                              strand=strand, type="CDS")
         self.assertEqual(_insdc_feature_location_string(feature, 100),
                          expected)
         self.assertEqual(len(feature), len(feature.extract(parent)))
         self.record.features.append(feature)

     # Forward strand: fuzzy start, both ends fuzzy, fuzzy end.
     check(AfterPosition(5), 10, +1, ">6..10")
     check(AfterPosition(15), AfterPosition(20), +1, ">16..>20")
     check(25, AfterPosition(30), +1, "26..>30")
     # Reverse strand: the same three shapes.
     check(AfterPosition(35), 40, -1, "complement(>36..40)")
     check(AfterPosition(45), AfterPosition(50), -1, "complement(>46..>50)")
     check(55, AfterPosition(60), -1, "complement(56..>60)")
     self.write_read_checks()
Esempio n. 35
0
 def test_before(self):
     """Features: write/read simple before locations."""
     parent = "N" * 200

     def check(start, end, strand, expected):
         # Build a CDS feature, verify its INSDC location string and its
         # extracted length, then queue it for the write/read round trip.
         feature = SeqFeature(FeatureLocation(start, end),
                              strand=strand, type="CDS")
         self.assertEqual(_insdc_feature_location_string(feature, 100),
                          expected)
         self.assertEqual(len(feature), len(feature.extract(parent)))
         self.record.features.append(feature)

     # Forward strand: fuzzy start, both ends fuzzy, fuzzy end.
     check(BeforePosition(5), 10, +1, "<6..10")
     check(BeforePosition(15), BeforePosition(20), +1, "<16..<20")
     check(25, BeforePosition(30), +1, "26..<30")
     # Reverse strand: the same three shapes.
     check(BeforePosition(35), 40, -1, "complement(<36..40)")
     check(BeforePosition(45), BeforePosition(50), -1, "complement(<46..<50)")
     check(55, BeforePosition(60), -1, "complement(56..<60)")
     self.write_read_checks()
Esempio n. 36
0
class TestFeatureMatch(unittest2.TestCase):
    """Tests for FeatureMatch, built around one short forward-strand gene."""

    def setUp(self):
        # Parent layout: CCC | AAA | 21 bp gene | TTT | GGG.  The match is
        # given the parent without its outer three bases on either side.
        self.seq = Seq("CCCAAAATGTACTCCACTATCTGCTGATTTGGG", generic_dna)
        gene_location = FeatureLocation(6, 27)
        self.feature = SeqFeature(gene_location, type="gene", strand=1)
        self.feature_seq = self.seq[3:-3]
        self.match = FeatureMatch(self.feature, self.feature_seq, 1, 3)

    def _expected_str(self, lines):
        # "Feature:" header followed by the given lines, each tab-indented.
        return "\n".join(["Feature:"] + ["\t" + line for line in lines])

    def _check_fasta(self, payload_attr, fasta_method):
        # A FASTA rendering is the shared header, a newline and the payload.
        expected = "%s\n%s" % (self.match.get_fasta_header(),
                               getattr(self.match, payload_attr))
        self.assertMultiLineEqual(getattr(self.match, fasta_method)(),
                                  expected)

    def test__init(self):
        "Test FeatureMatch object creation"
        # Forward strand: the match prepared by setUp.
        forward = self.match
        self.assertEqual(forward.direction, "forward")
        self.assertEqual(str(forward.dna), "ATGTACTCCACTATCTGCTGA")
        self.assertEqual(str(forward.long_dna), str(self.feature_seq))
        self.assertEqual(str(forward.promotor_region), "AAA")
        self.assertEqual(str(forward.terminator_region), "TTT")
        self.assertEqual(str(forward.mrna),
                         str(self.feature_seq.transcribe()))
        self.assertEqual(
            str(forward.aas),
            str(self.feature.extract(self.seq).translate(to_stop=True)))

        # Reverse strand: same gene, seen on the reverse-complemented parent.
        rc_parent = self.seq.reverse_complement()
        rc_window = rc_parent[3:-3]
        rc_feature = SeqFeature(FeatureLocation(6, 27), type="gene",
                                strand=-1)
        reverse = FeatureMatch(rc_feature, rc_window, -1, 3)
        self.assertEqual(reverse.direction, "reverse")
        self.assertEqual(str(reverse.dna),
                         str(rc_window[3:-3].reverse_complement()))
        self.assertEqual(str(reverse.long_dna), str(self.feature_seq))
        self.assertEqual(str(reverse.promotor_region), "AAA")
        self.assertEqual(str(reverse.terminator_region), "TTT")
        self.assertEqual(str(reverse.mrna),
                         str(rc_window.transcribe().reverse_complement()))
        self.assertEqual(
            str(reverse.aas),
            str(rc_feature.extract(rc_parent).translate(to_stop=True)))

    def test_get_fasta_header(self):
        "Test FeatureMatch FASTA header creation"
        # Without any qualifier the header falls back to a fixed label.
        self.assertEqual(self.match.get_fasta_header(), ">untagged")

        # Qualifiers are added one after another; each step lists the header
        # expected once that qualifier is present as well.
        steps = [
            ('gene', 'fake', ">fake"),
            ('locus_tag', 'FAKE_0001', ">FAKE_0001"),
            ('product', 'Mup1', ">FAKE_0001|Mup1"),
            ('protein_id', 'MUP_0001', ">FAKE_0001|Mup1|MUP_0001"),
        ]
        for key, value, header in steps:
            self.match.feature.qualifiers[key] = [value]
            self.assertEqual(self.match.get_fasta_header(), header)

    def test__str_(self):
        "Test FeatureMatch string representation"
        # These three lines close every expected representation.
        sequences = [
            "DNA: ATGTACTCCACTATCTGCTGA",
            "mRNA: AAAAUGUACUCCACUAUCUGCUGAUUU",
            "Protein: MYSTIC",
        ]

        expected = self._expected_str(
            ["untagged", "Strand: forward"] + sequences)
        self.assertMultiLineEqual(str(self.match), expected)

        self.match.feature.qualifiers['gene'] = ["fake"]
        expected = self._expected_str(
            ["Tag: fake", "Strand: forward"] + sequences)
        self.assertMultiLineEqual(str(self.match), expected)

        self.match.feature.qualifiers['locus_tag'] = ['FAKE_0001']
        expected = self._expected_str(
            ["Tag: FAKE_0001", "Strand: forward"] + sequences)
        self.assertMultiLineEqual(str(self.match), expected)

        self.match.feature.qualifiers['product'] = ['Mup1']
        expected = self._expected_str(
            ["Tag: FAKE_0001", "Strand: forward", "Product: Mup1"]
            + sequences)
        self.assertMultiLineEqual(str(self.match), expected)

        self.match.feature.qualifiers['protein_id'] = ['MUP_0001']
        expected = self._expected_str(
            ["Tag: FAKE_0001", "Strand: forward", "Product: Mup1",
             "Protein ID: MUP_0001"] + sequences)
        self.assertMultiLineEqual(str(self.match), expected)

    def test_dna_fasta(self):
        "Test FeatureMatch DNA FASTA output"
        self._check_fasta("dna", "dna_fasta")

    def test_long_dna_fasta(self):
        "Test FeatureMatch long DNA FASTA output"
        self._check_fasta("long_dna", "long_dna_fasta")

    def test_mrna_fasta(self):
        "Test FeatureMatch mRNA FASTA output"
        self._check_fasta("mrna", "mrna_fasta")

    def test_protein_fasta(self):
        "Test FeatureMatch protein FASTA output"
        self._check_fasta("aas", "protein_fasta")

    def test_promotor_fasta(self):
        "Test FeatureMatch promotor DNA FASTA output"
        self._check_fasta("promotor_region", "promotor_fasta")

    def test_terminator_fasta(self):
        "Test FeatureMatch terminator DNA FASTA output"
        self._check_fasta("terminator_region", "terminator_fasta")
        # if we had a TSS (not ever gene does)
        if tss:
            # open the fasta file sequence.fasta
            for seq_record in SeqIO.parse("sequence.fasta", "fasta"):
                # check from genestart to TSS is forward or tss to gene start if backwards with -1 or 1 strange appropraitely
                if direction is "-":
                    # print(tss)

                    test_feature = SeqFeature(FeatureLocation(int(geneStarts[i]), int(tss)), type="gene", strand=-1)
                else:
                    test_feature = SeqFeature(FeatureLocation(int(tss), int(geneStarts[i])), type="gene", strand=1)

                # test_feature=SeqFeature(FeatureLocation(40417,TSS),type="gene",strand=strandDirection)

                # example_seq now contains the sequence which we need to search through
                example_seq = test_feature.extract(seq_record)

                print (example_seq.seq)

                # need to transcribe the sequence
                sequence = str(example_seq.seq.transcribe())
                print sequence

                letters = ["U", "A", "C", "G"]

                # default the stalling position to x
                stallingPosition = ["x", "x", "x", "x"]

                # for each letter in UACG
                for letter in letters:
Esempio n. 38
0
def get_codon(cds: SeqFeature, seq: Seq, region_pos: int) -> Codon:
    """Return the codon of cds that contains position region_pos.

    region_pos is an offset into the sequence extracted for cds from seq.
    It is rounded down to a multiple of three to find the codon start, and
    the ``_data`` payload of the three-base slice starting there is returned
    (the slice is shorter when fewer than three bases remain).
    """
    codon_start = 3 * (region_pos // 3)
    extracted = cds.extract(seq)
    return extracted[codon_start:codon_start + 3]._data
Esempio n. 39
0
def _classify_nrpspks_type(domainlist, nrKSdomains, modKSscore, traKSscore,
                           eneKSscore, iterKSscore):
    """Return the NRPS/PKS subtype label for one gene.

    domainlist holds the names of the domains found in the gene, nrKSdomains
    the number of "PKS_KS" entries among them, and the four *KSscore
    arguments the number of KS-subtype hits of each kind.
    """
    condensation_names = ("Condensation_LCL", "Condensation_DCL",
                          "Condensation_Starter", "Cglyc",
                          "Condensation_Dual")
    has_condensation = any(name in domainlist for name in condensation_names)
    has_amp = "AMP-binding" in domainlist
    has_ks = "PKS_KS" in domainlist
    has_at = "PKS_AT" in domainlist
    has_nrps_domain = has_condensation or has_amp
    has_pks_domain = has_ks or has_at
    # KS and AT present without any NRPS domain: shared by the PKS subtypes.
    pks_with_at = not has_nrps_domain and has_ks and has_at

    if ("Cglyc" in domainlist and "Epimerization" in domainlist
            and has_amp and not has_pks_domain):
        return "Glycopeptide NRPS"
    if has_condensation and has_amp and not has_pks_domain:
        return "NRPS"
    # A hybrid needs an NRPS domain AND a PKS domain.  The previous test read
    # "condensation or (AMP-binding and PKS)" because `and` binds tighter
    # than `or`, so any condensation domain alone was labelled a hybrid.
    if has_nrps_domain and has_pks_domain:
        return "Hybrid PKS-NRPS"
    if (not has_nrps_domain and has_ks and not has_at
            and "Trans-AT_docking" in domainlist
            and traKSscore > modKSscore and traKSscore > iterKSscore
            and traKSscore > eneKSscore):
        return "Type I Trans-AT PKS"
    if (pks_with_at and iterKSscore > modKSscore
            and iterKSscore > traKSscore and iterKSscore > eneKSscore
            and nrKSdomains < 3):
        return "Type I Iterative PKS"
    if (pks_with_at and eneKSscore > modKSscore
            and eneKSscore > traKSscore and eneKSscore > iterKSscore
            and nrKSdomains < 3):
        return "Type I Enediyne PKS"
    if pks_with_at and ((modKSscore > eneKSscore
                         and modKSscore > traKSscore
                         and modKSscore > iterKSscore) or nrKSdomains > 3):
        return "Type I Modular PKS"
    if pks_with_at:
        return "PKS-like protein"
    if has_nrps_domain and not has_pks_domain:
        return "NRPS-like protein"
    return "PKS/NRPS-like protein"


def name_nrpspks(seq_record, pksnrpsvars, withinclustergenes, options):
    """Annotate every gene that has NRPS/PKS domains with its subtype.

    For each feature of withinclustergenes that has a non-empty entry in
    pksnrpsvars.domaindict, this:

    * derives the NRPS/PKS subtype from its domains and KS-subtype hits and
      stores it in pksnrpsvars.nrpspkstypedict (which is reset first);
    * appends the subtype and one note per domain to the feature's 'sec_met'
      qualifier;
    * creates one motif feature per entry of pksnrpsvars.motifdict for the
      gene (typed options.FeatureTags.pksnrpsmotifs_tag) and adds it to
      seq_record.features.

    Nothing is returned; all results are stored on the arguments.
    """
    pksnrpsvars.nrpspkstypedict = {}
    for feature in withinclustergenes:
        k = utils.get_gene_id(feature)
        # Genes without any detected NRPS/PKS domain are left untouched.
        if k not in pksnrpsvars.domaindict or not pksnrpsvars.domaindict[k]:
            continue
        #structure of domaindict: domaindict[genename] = [[name,start,end,evalue,score],[name,start,end,evalue,score], etc.]
        nrpspksdomains = pksnrpsvars.domaindict[k]
        domainlist = [domain[0] for domain in nrpspksdomains]
        nrKSdomains = domainlist.count("PKS_KS")

        ks_subtypes = []
        if k in pksnrpsvars.ksdomaindict:
            ks_subtypes = [hit[0] for hit in pksnrpsvars.ksdomaindict[k]]

        # The label depends only on the gene's domain set, so compute it once.
        nrpspkstype = _classify_nrpspks_type(
            domainlist, nrKSdomains,
            modKSscore=ks_subtypes.count("Modular-KS"),
            traKSscore=ks_subtypes.count("Trans-AT-KS"),
            eneKSscore=ks_subtypes.count("Enediyne-KS"),
            iterKSscore=ks_subtypes.count("Iterative-KS"))

        sec_met_notes = feature.qualifiers.setdefault('sec_met', [])
        sec_met_notes.append("NRPS/PKS subtype: " + nrpspkstype)

        #Write motifs to seq_record
        motifFeatures = []
        if k in pksnrpsvars.motifdict:
            for counter, motif in enumerate(pksnrpsvars.motifdict[k], 1):
                # motif[1] and motif[2] are scaled by 3 to get nucleotide
                # offsets, counted from the start of a forward-strand gene
                # and from the end otherwise.
                if feature.location.strand == 1:
                    start = feature.location.start + (3 * motif[1])
                    end = feature.location.start + (3 * motif[2])
                else:
                    end = feature.location.end - (3 * motif[1])
                    start = feature.location.end - (3 * motif[2])
                loc = FeatureLocation(start, end, strand=feature.strand)
                motifFeature = SeqFeature(
                    loc, type=options.FeatureTags.pksnrpsmotifs_tag)
                quals = defaultdict(list)

                quals['label'].append(str(motif[0]))
                if 'locus_tag' in feature.qualifiers:
                    quals['locus_tag'] = feature.qualifiers['locus_tag']
                else:
                    quals['locus_tag'] = [k]
                quals['motif'] = [motif[0]]
                quals['asDomain_id'] = [
                    'nrpspksmotif_' + '_'.join(quals['locus_tag']) + '_' +
                    '{:04d}'.format(counter)
                ]

                quals['evalue'] = [str("{:.2E}".format(float(motif[3])))]
                quals['score'] = [str(motif[4])]
                quals['aSTool'] = ["pksnrpsmotif"]
                quals['detection'] = ["hmmscan"]
                quals['database'] = ["abmotifs"]
                if 'transl_table' in feature.qualifiers:
                    [transl_table] = feature.qualifiers['transl_table']
                else:
                    transl_table = 1
                quals['translation'] = [
                    str(
                        motifFeature.extract(seq_record).seq.translate(
                            table=transl_table))
                ]

                quals['note'].append("NRPS/PKS Motif: " + motif[0] +
                                     " (e-value: " + str(motif[3]) +
                                     ", bit-score: " + str(motif[4]) + ")")

                motifFeature.qualifiers = quals

                motifFeatures.append(motifFeature)

        for domain in nrpspksdomains:
            sec_met_notes.append(
                "NRPS/PKS Domain: %s (%s-%s). E-value: %s. Score: %s;"
                % (domain[0], str(domain[1]), str(
                    domain[2]), str(domain[3]), str(domain[4])))

        seq_record.features.extend(motifFeatures)
        pksnrpsvars.nrpspkstypedict[k] = nrpspkstype
Esempio n. 40
0
def gene2features(r, gene, gene2position, gene2product, start, end, gcode,
                  partialyes, verbose):
    """Append the gene, mRNA and CDS features of `gene` to record `r`.

    An mRNA feature is always appended and, for identifiers ending in
    '.t1', a 'gene' feature as well. The CDS feature is appended only when
    its translation is non-empty and free of internal stop codons, and -
    if it lacks a leading 'M' or a terminal stop - only when `partialyes`
    is true.

    Args:
        r: record exposing `.seq` and a `.features` list; modified in place.
        gene: gene identifier, used as key into `gene2position` and as
            feature id / locus_tag.
        gene2position: maps gene -> (contig, CDSs, strand, function, frames).
            For minus-strand genes the stored CDSs list is reversed in place.
        gene2product: maps gene identifier -> product description; genes
            without an entry are labelled "hypothetical protein".
        start: first position of the gene; `start - 1` is used as the
            0-based feature start (presumably 1-based - confirm with caller).
        end: last position of the gene, used as the feature end.
        gcode: translation table handed to `translate` and stored as
            'transl_table'.
        partialyes: when false, partial CDSs are not appended.
        verbose: when true, warnings are written to stderr.

    Returns:
        The same record `r`.
    """
    # contig and frames are unpacked but not used by this function.
    contig, CDSs, gffstrand, function, frames = gene2position[gene]
    if gffstrand in ('1', '+'):
        strand = +1
    else:
        strand = -1
        CDSs.reverse()
    # (A former step that extended the terminal CDS by a stop codon is
    # disabled.)
    cdsloc, mrnaloc = get_locations(CDSs, start, end, strand)

    geneid = gene
    product = "hypothetical protein"
    if geneid in gene2product:
        product = gene2product[geneid]

    # The gene feature is emitted once per gene, with its first transcript.
    if gene.endswith('.t1'):
        sf = SeqFeature(FeatureLocation(BeforePosition(start - 1),
                                        AfterPosition(end),
                                        strand=strand),
                        type='gene',
                        id=geneid)
        sf.qualifiers = {
            "locus_tag": geneid,
            "gene": geneid,
            "product": product
        }
        r.features.append(sf)

    # mRNA feature
    sf = SeqFeature(mrnaloc, type='mRNA', id=gene)
    sf.qualifiers = {
        "locus_tag": geneid,
        "gene": geneid,
        "product": product
    }
    r.features.append(sf)

    # CDS feature and its translation
    sf = SeqFeature(cdsloc, type='CDS', id=gene)
    seq = sf.extract(r.seq)
    aa = str(seq.translate(table=gcode))

    # Trim the span to whole codons; only used if the location is rebuilt
    # below for a partial CDS.
    remainder = len(seq) % 3
    if remainder:
        if strand == 1:
            end -= remainder
        else:
            start += remainder

    # startswith/endswith instead of aa[0]/aa[-1] so that an empty
    # translation cannot raise IndexError.
    has_start = aa.startswith("M")
    has_stop = aa.endswith("*")

    # A CDS is partial when it lacks the initial M and/or the final stop.
    # The rebuilt location keeps the strand; without it a minus-strand
    # partial CDS would be written as a forward feature.
    # NOTE(review): the rebuilt location is a single span, so any multi-exon
    # structure returned by get_locations is dropped for partial CDSs.
    partial = 0
    if not has_start and not has_stop:
        # both ends partial
        partial = 1
        sf.location = FeatureLocation(BeforePosition(start - 1),
                                      AfterPosition(end),
                                      strand=strand)
    elif (not has_start and strand == 1) or (not has_stop and strand == -1):
        # left end partial
        partial = 1
        sf.location = FeatureLocation(BeforePosition(start - 1),
                                      end,
                                      strand=strand)
    elif (not has_stop and strand == 1) or (not has_start and strand == -1):
        # right end partial
        partial = 1
        sf.location = FeatureLocation(start - 1,
                                      AfterPosition(end),
                                      strand=strand)

    # Strip only the terminal stop codon(s); a leading '*' is a real
    # in-frame stop and must be caught by the check below.
    aa = aa.rstrip("*")
    if not aa:
        if verbose:
            sys.stderr.write("[Warning] Empty translation: %s. Skipped!\n" %
                             gene)
        return r
    # CDSs with internal stop codons are not reported
    if "*" in aa:
        if verbose:
            sys.stderr.write("[Warning] Stop codon(s) in: %s. Skipped!\n" %
                             gene)
        return r

    sf.qualifiers = {
        'transl_table': gcode,
        "locus_tag": geneid,
        "gene": geneid,
        "product": product,
        "translation": aa
    }
    if function:
        sf.qualifiers['note'] = function

    if partial:
        # skip partial entries unless they are explicitly allowed
        if not partialyes:
            return r
        if not has_start:
            sf.qualifiers['codon_start'] = 1
        sf.qualifiers['product'] += ", partial cds"
        if verbose:
            sys.stderr.write("[Warning] Partial sequence: %s\n" % (gene, ))

    r.features.append(sf)
    return r
def extract_upstream_and_CDS(fullpath, filename):
    """Write every valid CDS together with its upstream flank to a FASTA file.

    For each CDS feature accepted by `validate_cds`, the CDS span plus
    `num_bp_upstreamcds` bases on its 5' side is extracted (reverse
    complemented for minus-strand features). CDSs whose upstream window
    does not fit inside the record, or whose strand is undefined, are
    skipped. Output goes to
    `filename + str(num_bp_upstreamcds) + "upstream_and_CDS.fasta"`.

    Args:
        fullpath: path of the GenBank file to read.
        filename: prefix of the output FASTA file name.

    Relies on the module-level `num_bp_upstreamcds` and `validate_cds`.
    """
    extracted_cds_list = []

    for record in SeqIO.parse(fullpath, "genbank"):
        record_length = len(record.seq)
        for feature in record.features:
            # `== True` is kept as in the original: validate_cds is defined
            # elsewhere and its return type is not visible here.
            if feature.type != "CDS" or not (validate_cds(record, feature)
                                             == True):
                continue

            cds_start = int(feature.location.start)
            cds_end = int(feature.location.end)

            # Use the location's strand rather than searching its string
            # form for '+'/'-'; a strandless feature used to reuse the
            # previous iteration's sequence (or raise NameError).
            strand = feature.location.strand
            if strand == 1:
                window_start = cds_start - num_bp_upstreamcds
                window_end = cds_end
            elif strand == -1:
                # upstream of a minus-strand CDS lies past its end coordinate
                window_start = cds_start
                window_end = cds_end + num_bp_upstreamcds
            else:
                continue

            # Length culling: require the full upstream flank on the side
            # that is actually upstream for this strand.
            if window_start < 0 or window_end > record_length:
                continue

            extracted_seq = record.seq[window_start:window_end]
            if strand == -1:
                extracted_seq = extracted_seq.reverse_complement()

            # Renders ['ABC123'] as ABC123 (and a missing qualifier as None).
            cds_protein_id = str(
                feature.qualifiers.get('protein_id')).strip('\'[]')
            # record.name is passed as the id explicitly, because
            # SeqRecord.reverse_complement() does not carry the name over by
            # default and minus-strand entries ended up as "<unknown name>".
            annotated_record = SeqRecord(extracted_seq,
                                         record.name,
                                         description="|" + cds_protein_id +
                                         "|")
            extracted_cds_list.append(annotated_record)

    SeqIO.write(extracted_cds_list,
                filename + str(num_bp_upstreamcds) + "upstream_and_CDS.fasta",
                "fasta")
    return
Esempio n. 42
0
def subset_viruses_nextstrain_build(virus, subtype, gene, window, min_seqs,
                                    year_max, year_min):
    """Group aligned isolates of one gene into time windows.

    Reads the virus config, the metadata table and the alignment, bins the
    strains by year, and builds a consensus of the first window as outgroup.

    Args:
        virus: virus name, passed to the config reader and the file
            templates.
        subtype: subtype for the file templates; also selects the
            "location_<subtype>" config key (plain "location" when None).
        gene: gene or domain name.
        window: 'all' for a single window spanning every year, otherwise
            the window width in years (windows slide by one year).
        min_seqs: minimum number of strains for a window to be kept.
        year_max: if truthy, drop strains from later years.
        year_min: if truthy, drop strains from earlier years.

    Returns:
        Tuple of (virus_time_subset, alignment_time_subset, outgroup_seq,
        outgroup_seq_aa, year_windows, seqs_in_window), where the two dicts
        are keyed by "<first year>-<last year>" window labels.
    """
    configs = readin_virus_config(virus)
    standard_gene = standardize_gene_name(virus, gene)
    gene_config = configs[standard_gene]
    location_specified = 'specify_location' in gene_config

    # Sub-genic regions may live in the files of a parent gene / genome.
    if location_specified:
        file_gene = gene_config['specify_location']['parent_gene']
    else:
        file_gene = gene
    reference_file = configs['reference_file'].format(virus=virus,
                                                      subtype=subtype,
                                                      gene=file_gene)
    alignment_file = configs['alignment_file'].format(virus=virus,
                                                      subtype=subtype,
                                                      gene=file_gene)
    meta_file = configs['meta_file'].format(virus=virus,
                                            subtype=subtype,
                                            gene=file_gene)
    # some metadata files are comma-separated, some tab-separated
    metafile_sep = configs['metafile_sep']

    # Locate the gene: from the config for sub-genic domains, otherwise
    # from the CDS annotations of the reference file. Stays False when the
    # whole aligned sequence is to be used.
    gene_location = False
    if location_specified:
        if subtype is None:
            gene_location_key = "location"
        else:
            gene_location_key = "location_" + str(subtype)

        gene_location_list = ast.literal_eval(
            gene_config['specify_location'][gene_location_key])
        if len(gene_location_list) == 1:
            gene_location = SeqFeature(
                FeatureLocation(gene_location_list[0][0],
                                gene_location_list[0][1]))
        else:
            # non-contiguous domain
            gene_location = CompoundLocation([
                FeatureLocation(location[0], location[1])
                for location in gene_location_list
            ])
    else:
        gene_lower = gene.lower()
        for seq_record in SeqIO.parse(reference_file, "genbank"):
            for feature in seq_record.features:
                if feature.type != 'CDS':
                    continue
                # match on /gene when present, otherwise on /product
                if 'gene' in feature.qualifiers:
                    if feature.qualifiers['gene'][0].lower() == gene_lower:
                        gene_location = feature.location
                elif feature.qualifiers['product'][0].lower() == gene_lower:
                    gene_location = feature.location

    def extract_gene(sequence):
        """Return the gene portion of `sequence` (all of it if no location)."""
        if gene_location:
            return gene_location.extract(sequence)
        return sequence

    # Subset metadata by date
    meta = pd.read_csv(meta_file, sep=metafile_sep)
    meta.drop(meta[meta['date'] == '?'].index, inplace=True)
    meta.dropna(subset=['date'], inplace=True)
    meta['year'] = meta['date'].str[:4].astype('int')
    if year_max:
        meta.drop(meta[meta['year'] > year_max].index, inplace=True)
    if year_min:
        meta.drop(meta[meta['year'] < year_min].index, inplace=True)

    # Remove egg- and cell-passaged strains
    meta.drop(meta[meta['strain'].str[-4:] == '-egg'].index, inplace=True)
    meta.drop(meta[meta['strain'].str[-5:] == '-cell'].index, inplace=True)

    # Limit metadata to strains present in the alignment file
    with open(alignment_file, "r") as aligned_handle:
        aligned_isolates = [
            isolate.id for isolate in SeqIO.parse(aligned_handle, "fasta")
        ]
    aligned_isolates_df = pd.DataFrame(aligned_isolates, columns=['strain'])
    meta = meta.merge(aligned_isolates_df, on='strain', how='inner')

    # Group viruses by time windows
    first_year = meta['year'].min()
    last_year = meta['year'].max()
    virus_time_subset = {}
    if window == 'all':
        years = str(first_year) + '-' + str(last_year)
        virus_time_subset[years] = meta['strain'].tolist()
    else:
        date_window_start = first_year
        date_window_end = first_year + window
        # NOTE(review): windows are [start, end) and stop at end == last
        # year, so strains from the last year fall into no window - confirm
        # this is intended.
        while date_window_end <= last_year:
            years = str(date_window_start) + '-' + str(date_window_end)
            in_window = (meta['year'] >= date_window_start) & (
                meta['year'] < date_window_end)
            virus_time_subset[years] = meta[in_window]['strain'].tolist()

            # sliding window
            date_window_end += 1
            date_window_start += 1

    # Only use time points with enough data
    virus_time_subset = {
        k: v
        for k, v in virus_time_subset.items() if len(v) >= min_seqs
    }

    year_windows = []
    seqs_in_window = []
    alignment_time_subset = {}
    # Sequences of the first window; the outgroup consensus is built from
    # them. They are collected during the first pass only (previously they
    # were appended again on every later pass, which bloated the alignment
    # without changing the consensus).
    first_window_sequences = []

    for window_index, (years, subset_viruses) in enumerate(
            virus_time_subset.items()):
        year_windows.append(years)
        seqs_in_window.append(len(subset_viruses))
        window_sequences = []
        alignment_time_subset[years] = window_sequences

        is_first_window = window_index == 0
        wanted_strains = set(subset_viruses)  # O(1) membership tests

        with open(alignment_file, "r") as aligned_handle:
            for isolate in SeqIO.parse(aligned_handle, "fasta"):
                if isolate.id not in wanted_strains:
                    continue
                gene_seq = extract_gene(isolate.seq)
                if is_first_window:
                    first_window_sequences.append(
                        SeqRecord(seq=gene_seq,
                                  id=isolate.id,
                                  description=gene))
                window_sequences.append(gene_seq)

    first_window_alignment = MultipleSeqAlignment(first_window_sequences)
    outgroup_seq = AlignInfo.SummaryInfo(first_window_alignment).gap_consensus(
        ambiguous='N')
    outgroup_seq_aa = outgroup_seq.translate()

    return virus_time_subset, alignment_time_subset, outgroup_seq, outgroup_seq_aa, year_windows, seqs_in_window
Esempio n. 43
0
def find_lipoprotein(gff3_file,
                     fasta_genome,
                     lipobox_mindist=10,
                     lipobox_maxdist=60):
    """Yield GFF records reduced to the genes that carry a lipobox motif.

    The first CDS of every gene is translated (table 11) and searched for
    the lipobox pattern located `lipobox_mindist`..`lipobox_maxdist`
    residues from the N-terminus. Each hit adds a 'Lipobox' sub-feature
    covering the four motif residues to the gene.

    Args:
        gff3_file: GFF3 input handed to `GFF.parse`.
        fasta_genome: FASTA input providing the sequences for the GFF3.
        lipobox_mindist: minimum number of residues before the motif.
        lipobox_maxdist: maximum number of residues before the motif.

    Yields:
        A one-element list holding the record, whose `features` are
        replaced by the genes with a lipobox hit.
    """
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_genome, "fasta"))

    # Keep the cases mutually exclusive: a gene matching several cases
    # would be appended, and written to the GFF3, several times.
    CASES = [
        re.compile('^.{%s,%s}[ACGSILMFTV][^REKD][GASNL]C' %
                   (lipobox_mindist, lipobox_maxdist)),
    ]

    for record in GFF.parse(gff3_file, base_dict=seq_dict):
        good_features = []

        genes = list(
            feature_lambda(record.features,
                           feature_test_type, {'type': 'gene'},
                           subfeatures=True))
        for gene in genes:
            cdss = list(
                feature_lambda(gene.sub_features,
                               feature_test_type, {'type': 'CDS'},
                               subfeatures=False))
            if not cdss:
                continue

            # Only the first CDS of a gene is considered.
            cds = cdss[0]

            try:
                protein = cds.extract(record.seq).translate(table=11,
                                                            cds=True)
            except Exception:
                # translate(cds=True) rejects sequences that are not a
                # complete CDS; such genes are skipped. `Exception` (rather
                # than a bare except) lets KeyboardInterrupt/SystemExit
                # propagate.
                continue
            tmpseq = str(protein).replace("*", "")

            for case in CASES:
                m = case.search(tmpseq)
                if not m:
                    continue

                # The motif is the last four residues of the match; map
                # them to nucleotide coordinates on the CDS strand.
                if cds.location.strand > 0:
                    start = cds.location.start + (3 * (m.end() - 4))
                    end = cds.location.start + (3 * m.end())
                else:
                    start = cds.location.end - (3 * (m.end() - 4))
                    end = cds.location.end - (3 * m.end())

                lipobox_location = FeatureLocation(
                    min(start, end),
                    max(start, end),
                    strand=cds.location.strand)
                tmp = SeqFeature(lipobox_location,
                                 type='Lipobox',
                                 qualifiers={
                                     'source': 'CPT_LipoRy',
                                     'ID': '%s.lipobox' % get_id(gene),
                                 })
                tmp.qualifiers['sequence'] = str(
                    tmp.extract(record).seq.translate())

                gene.sub_features.append(tmp)
                good_features.append(gene)

        record.features = good_features
        yield [record]
Esempio n. 44
0
def gene2features(r, gene, gene2position, gene2product, start, end, gcode, partialyes, verbose):
    """Append the gene, mRNA and CDS features of `gene` to record `r`.

    An mRNA feature is always appended and, for identifiers ending in
    '.t1', a 'gene' feature as well. The CDS feature is appended only when
    its translation is non-empty and free of internal stop codons, and -
    if it lacks a leading 'M' or a terminal stop - only when `partialyes`
    is true.

    Args:
        r: record exposing `.seq` and a `.features` list; modified in place.
        gene: gene identifier, used as key into `gene2position` and as
            feature id / locus_tag.
        gene2position: maps gene -> (contig, CDSs, strand, function, frames).
            For minus-strand genes the stored CDSs list is reversed in place.
        gene2product: maps gene identifier -> product description; genes
            without an entry are labelled "hypothetical protein".
        start: first position of the gene; `start - 1` is used as the
            0-based feature start (presumably 1-based - confirm with caller).
        end: last position of the gene, used as the feature end.
        gcode: translation table handed to `translate` and stored as
            'transl_table'.
        partialyes: when false, partial CDSs are not appended.
        verbose: when true, warnings are written to stderr.

    Returns:
        The same record `r`.
    """
    # contig and frames are unpacked but not used by this function.
    contig, CDSs, gffstrand, function, frames = gene2position[gene]
    if gffstrand in ('1', '+'):
        strand = +1
    else:
        strand = -1
        CDSs.reverse()
    # (A former step that extended the terminal CDS by a stop codon is
    # disabled.)
    cdsloc, mrnaloc = get_locations(CDSs, start, end, strand)

    geneid = gene
    product = "hypothetical protein"
    if geneid in gene2product:
        product = gene2product[geneid]

    # The gene feature is emitted once per gene, with its first transcript.
    if gene.endswith('.t1'):
        sf = SeqFeature(FeatureLocation(BeforePosition(start - 1), AfterPosition(end), strand=strand),
                        type='gene', id=geneid)
        sf.qualifiers = {"locus_tag": geneid, "gene": geneid, "product": product}
        r.features.append(sf)

    # mRNA feature
    sf = SeqFeature(mrnaloc, type='mRNA', id=gene)
    sf.qualifiers = {"locus_tag": geneid, "gene": geneid, "product": product}
    r.features.append(sf)

    # CDS feature and its translation
    sf = SeqFeature(cdsloc, type='CDS', id=gene)
    seq = sf.extract(r.seq)
    aa = str(seq.translate(table=gcode))

    # Trim the span to whole codons; only used if the location is rebuilt
    # below for a partial CDS.
    remainder = len(seq) % 3
    if remainder:
        if strand == 1:
            end -= remainder
        else:
            start += remainder

    # startswith/endswith instead of aa[0]/aa[-1] so that an empty
    # translation cannot raise IndexError.
    has_start = aa.startswith("M")
    has_stop = aa.endswith("*")

    # A CDS is partial when it lacks the initial M and/or the final stop.
    # The rebuilt location keeps the strand; without it a minus-strand
    # partial CDS would be written as a forward feature.
    # NOTE(review): the rebuilt location is a single span, so any multi-exon
    # structure returned by get_locations is dropped for partial CDSs.
    partial = 0
    if not has_start and not has_stop:
        # both ends partial
        partial = 1
        sf.location = FeatureLocation(BeforePosition(start - 1), AfterPosition(end), strand=strand)
    elif (not has_start and strand == 1) or (not has_stop and strand == -1):
        # left end partial
        partial = 1
        sf.location = FeatureLocation(BeforePosition(start - 1), end, strand=strand)
    elif (not has_stop and strand == 1) or (not has_start and strand == -1):
        # right end partial
        partial = 1
        sf.location = FeatureLocation(start - 1, AfterPosition(end), strand=strand)

    # Strip only the terminal stop codon(s); a leading '*' is a real
    # in-frame stop and must be caught by the check below.
    aa = aa.rstrip("*")
    if not aa:
        if verbose:
            sys.stderr.write("[Warning] Empty translation: %s. Skipped!\n" % gene)
        return r
    # CDSs with internal stop codons are not reported
    if "*" in aa:
        if verbose:
            sys.stderr.write("[Warning] Stop codon(s) in: %s. Skipped!\n" % gene)
        return r

    sf.qualifiers = {'transl_table': gcode, "locus_tag": geneid, "gene": geneid, "product": product, "translation": aa}
    if function:
        sf.qualifiers['note'] = function

    if partial:
        # skip partial entries unless they are explicitly allowed
        if not partialyes:
            return r
        if not has_start:
            sf.qualifiers['codon_start'] = 1
        sf.qualifiers['product'] += ", partial cds"
        if verbose:
            sys.stderr.write("[Warning] Partial sequence: %s\n" % (gene,))

    r.features.append(sf)
    return r
Esempio n. 45
0
def write_data_to_seq_record(pksnrpsvars, seq_record, options):
    """Store the NRPS/PKS domain predictions of `pksnrpsvars` in `seq_record`.

    For every gene in `pksnrpsvars.pksnrpscoregenes`:
      * each "NRPS/PKS Domain:" entry of its 'sec_met' qualifier is parsed,
        extended with the matching specificity predictions, and turned into
        a new domain feature (type `options.FeatureTags.pksnrpsdomains_tag`)
        that is appended to `seq_record.features`;
      * the 'sec_met' qualifier is replaced by the extended entries;
      * the consensus product prediction, if any, is stored under
        `options.QualifierTags.product_prediction`.
    Finally the monomer prediction and structure image path are appended to
    the 'note' qualifier of every cluster that has a compound prediction.

    The process exits with status 1 if a domain entry cannot be parsed.
    """
    # Positional information for the domain features is recovered from the
    # sec_met qualifier strings (workaround).

    # NOTE(review): only the qualifiers of the last cluster feature are kept,
    # and cluster_info stays unbound when the record has no cluster features
    # - confirm that callers always provide exactly one cluster.
    for f in utils.get_cluster_features(seq_record):
        cluster_info = f.qualifiers

    # domain type, aa start, aa end, e-value, score
    domain_pattern = re.compile(
        r"NRPS/PKS Domain: ([\w-]+) \((\d+)\-(\d+)\)\. E-value: ([\de\.-]+)\. Score: ([\de\.a-]+);")
    nrps_template = (" NRPS/PKS Domain: %s; Substrate specificity predictions: "
                     "%s (NRPSPredictor2 SVM), %s (Stachelhaus code), %s (Minowa), %s (consensus);")
    pks_template = " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);"
    cal_template = " Substrate specificity predictions: %s (Minowa);"
    kr_template = " Predicted KR activity: %s; Predicted KR stereochemistry: %s;"

    for feature in pksnrpsvars.pksnrpscoregenes:
        # per-gene counters used to number the domains of each kind
        nrat = 0
        nra = 0
        nrcal = 0
        nrkr = 0
        nrXdom = 0
        secmetqualifiers = feature.qualifiers['sec_met']
        updated_secmetqualifiers = []
        # BiosynML: detailed substrate predictions (currently only collected,
        # the qualifier that would carry them is disabled below)
        updated_secmetqualifiers_predictions = []
        domainFeatures = []
        gene_id = utils.get_gene_id(feature)

        for qualifier in secmetqualifiers:
            if "NRPS/PKS Domain:" not in qualifier:
                updated_secmetqualifiers.append(qualifier)
                updated_secmetqualifiers_predictions.append(qualifier)
                continue

            match_pos_obj = domain_pattern.search(qualifier)
            if not match_pos_obj:
                # logging.error, not logging.exception: there is no active
                # exception whose traceback could be logged here.
                logging.error("Exception: could not extract domain string from qualifier %s:" % qualifier)
                sys.exit(1)
            domain_type = match_pos_obj.group(1)
            start_aa = int(match_pos_obj.group(2))
            end_aa = int(match_pos_obj.group(3))
            evalue = float(match_pos_obj.group(4))
            score = float(match_pos_obj.group(5))

            # convert aa coordinates into nucleotide positions on the gene
            if feature.location.strand == 1:
                start = feature.location.start + (3 * start_aa)
                end = feature.location.start + (3 * end_aa)
            else:
                end = feature.location.end - (3 * start_aa)
                start = feature.location.end - (3 * end_aa)
            loc = FeatureLocation(start, end, strand=feature.strand)

            # set up the new domain feature
            domainFeature = SeqFeature(loc, type=options.FeatureTags.pksnrpsdomains_tag)
            domainFeature.qualifiers['domain'] = [domain_type]
            if 'locus_tag' in feature.qualifiers:
                domainFeature.qualifiers['locus_tag'] = feature.qualifiers['locus_tag']
            else:
                domainFeature.qualifiers['locus_tag'] = [gene_id]
            domainFeature.qualifiers['detection'] = ["hmmscan"]
            domainFeature.qualifiers['database'] = ["nrpspksdomains.hmm"]
            domainFeature.qualifiers['evalue'] = [str("{:.2E}".format(evalue))]
            domainFeature.qualifiers['score'] = [score]
            if 'transl_table' in feature.qualifiers:
                [transl_table] = feature.qualifiers['transl_table']
            else:
                transl_table = 1
            domainFeature.qualifiers['translation'] = [
                str(domainFeature.extract(seq_record).seq.translate(table=transl_table))]

            domainFeature_specificity = []

            if domain_type == "AMP-binding":
                nra += 1
                domainname = gene_id + "_A" + str(nra)
                domainFeature.qualifiers['label'] = [domainname]
                domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                svm_pred = pksnrpsvars.nrps_svm_preds[domainname]
                code_pred = pksnrpsvars.nrps_code_preds[domainname]
                minowa_pred = pksnrpsvars.minowa_nrps_preds[domainname]
                consensus = pksnrpsvars.consensuspreds[domainname]
                domainFeature_specificity.append("NRPSpredictor2 SVM: %s" % svm_pred)
                domainFeature_specificity.append("Stachelhaus code: %s" % code_pred)
                domainFeature_specificity.append("Minowa: %s" % minowa_pred)
                domainFeature_specificity.append("consensus: %s" % consensus)

                newqualifier = qualifier + nrps_template % (
                    domainname, svm_pred, code_pred, minowa_pred, consensus)
                # BiosynML: detailed predictions. The SVM and Stachelhaus
                # values are passed in the order of their labels (they were
                # swapped before).
                newqualifier_detailed = qualifier + nrps_template % (
                    domainname,
                    pksnrpsvars.nrps_svm_preds_details[domainname],
                    pksnrpsvars.nrps_code_preds_details[domainname],
                    pksnrpsvars.minowa_nrps_preds_details[domainname],
                    consensus)
                updated_secmetqualifiers.append(newqualifier)
                updated_secmetqualifiers_predictions.append(newqualifier_detailed)
            elif domain_type == "PKS_AT":
                nrat += 1
                domainname = gene_id + "_AT" + str(nrat)
                domainFeature.qualifiers['label'] = [domainname]
                domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                code_pred = pksnrpsvars.pks_code_preds[domainname]
                minowa_pred = pksnrpsvars.minowa_pks_preds[domainname]
                domainFeature_specificity.append("PKS signature: %s" % code_pred)
                domainFeature_specificity.append("Minowa: %s" % minowa_pred)
                # trans-AT PKS clusters have their own consensus table; all
                # other PKS types (t1pks, t2pks, t3pks) share the general one
                if 'transatpks' in cluster_info['product'][0]:
                    consensus = pksnrpsvars.consensuspreds_transat[domainname]
                else:
                    consensus = pksnrpsvars.consensuspreds[domainname]
                domainFeature_specificity.append("consensus: %s" % consensus)
                newqualifier = qualifier + pks_template % (code_pred, minowa_pred, consensus)
                # BiosynML: detailed predictions
                newqualifier_detailed = qualifier + pks_template % (
                    pksnrpsvars.pks_code_preds_details[domainname],
                    pksnrpsvars.minowa_pks_preds_details[domainname],
                    consensus)
                updated_secmetqualifiers.append(newqualifier)
                updated_secmetqualifiers_predictions.append(newqualifier_detailed)
            elif domain_type == "CAL_domain":
                nrcal += 1
                domainname = gene_id + "_CAL" + str(nrcal)
                domainFeature.qualifiers['label'] = [domainname]
                domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                minowa_pred = pksnrpsvars.minowa_cal_preds[domainname]
                domainFeature_specificity.append("Minowa: %s" % minowa_pred)
                newqualifier = qualifier + cal_template % (minowa_pred)
                # BiosynML: detailed predictions
                newqualifier_detailed = qualifier + cal_template % (
                    pksnrpsvars.minowa_cal_preds_details[domainname])
                updated_secmetqualifiers.append(newqualifier)
                updated_secmetqualifiers_predictions.append(newqualifier_detailed)
            elif domain_type == "PKS_KR":
                nrkr += 1
                domainname = gene_id + "_KR" + str(nrkr)
                domainFeature.qualifiers['label'] = [domainname]
                domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                kr_activity = pksnrpsvars.kr_activity_preds[domainname]
                kr_stereo = pksnrpsvars.kr_stereo_preds[domainname]
                domainFeature_specificity.append("KR activity: %s" % kr_activity)
                domainFeature_specificity.append("KR stereochemistry: %s" % kr_stereo)
                newqualifier = qualifier + kr_template % (kr_activity, kr_stereo)
                # BiosynML: no separate detailed values exist for KR domains
                newqualifier_detailed = qualifier + kr_template % (kr_activity, kr_stereo)
                updated_secmetqualifiers.append(newqualifier)
                updated_secmetqualifiers_predictions.append(newqualifier_detailed)
            else:
                # any other domain type: numbered per gene, no predictions
                nrXdom += 1
                domainFeature.qualifiers['asDomain_id'] = [
                    "nrpspksdomains_" + gene_id.partition(".")[0] + "_Xdom" + '{:02d}'.format(nrXdom)]
                updated_secmetqualifiers.append(qualifier)

            domainFeature.qualifiers['specificity'] = domainFeature_specificity
            mapped_domain_type = _map_domaintype(domain_type)
            if mapped_domain_type:
                domainFeature.qualifiers['domain_subtype'] = [domain_type]
                domainFeature.qualifiers['domain'] = [mapped_domain_type]
            domainFeatures.append(domainFeature)

        feature.qualifiers['sec_met'] = updated_secmetqualifiers
        # BiosynML: the 'sec_met_predictions' qualifier is currently disabled
        #feature.qualifiers['sec_met_predictions'] = updated_secmetqualifiers_predictions
        seq_record.features.extend(domainFeatures)

        if gene_id in pksnrpsvars.consensuspred_gene_dict:
            feature.qualifiers[options.QualifierTags.product_prediction] = "-".join(
                pksnrpsvars.consensuspred_gene_dict[gene_id])

    # Save consensus structure + link to structure image to seq_record
    clusters = utils.get_cluster_features(seq_record)
    for cluster in clusters:
        clusternr = utils.get_cluster_number(cluster)
        if clusternr in pksnrpsvars.compound_pred_dict:
            structpred = pksnrpsvars.compound_pred_dict[clusternr]
            cluster.qualifiers['note'].append("Monomers prediction: " + structpred)
            cluster.qualifiers['note'].append("Structure image: structures/genecluster%s.png" % clusternr)
Esempio n. 46
0
def find_ORFs(seqBio,
              codonTable,
              startCodons=('ATG', 'GTG', 'TTG'),
              verbose=0):
    """
    Find all putative open reading frames (ORFs) in the nucleotide sequence.

    We assume that an ORF start with one of the start codons as defined in the argument
    list and ends with a stop codon as defined from the codon table.

    Both strands are scanned in all three frames. Within a frame, the codons
    are cut into chunks that each end on a stop codon; every start codon found
    in a chunk yields its own ORF ending at that chunk's stop codon, so
    several ORFs may share one stop codon. Codons located after the last stop
    codon of a frame are never reported.

    Note that the expression levels from other non-canonical start codons different from
    ATG, GTG or TTG is extremely low, as shown in E. coli [1].

    [1] Hecht, A., Glasgow, J., Jaschke, P. R., Bawazer, L. A., Munson, M. S., Cochran, J. R., … Salit, M. (2017).
    Measurements of translation initiation from all 64 codons in E. coli. Nucleic Acids Research, 1–12.
    http://doi.org/10.1093/nar/gkx070

    Args:
        seqBio: Nucleotide sequence to scan. It must support len(), slicing
            and reverse_complement(), and is handed to SeqFeature.extract()
            (presumably a Biopython Seq -- confirm with callers).
        codonTable: Codon table exposing a ``stop_codons`` collection; it is
            also passed as the ``table`` argument of ``translate``.
        startCodons: Codons (as strings) accepted as the start of an ORF.
        verbose: Debug information is printed when this is >= 2.

    Returns:
        A list of SeqFeature objects of type 'putative ORF'. Each location is
        expressed in coordinates of ``seqBio`` with strand +1 or -1 and
        includes the stop codon. The qualifiers 'translation', 'start_codon'
        and 'stop_codon' are filled in.
    """

    ORFList = []

    for strand, seq in [(+1, seqBio), (-1, seqBio.reverse_complement())]:
        seqLength = len(seq)

        # We define frame relative to the start of the seq
        for frame in range(3):
            if verbose >= 2:
                # The whole-frame translation is only shown in the debug
                # output, so it is not computed on normal runs.
                translatedSeq = seq[frame:].translate(codonTable)
                print("\n\n#### seq", seq, "\nframe", frame, "\ntranslatedSeq",
                      translatedSeq)

            codonList = list(
                extract_codons_list(seq,
                                    frame=frame,
                                    checkLengthMultipleOf3=False))

            # Find positions of all stop codons
            iCodonStopList = [
                iCodon for iCodon, codon in enumerate(codonList)
                if codon in codonTable.stop_codons
            ]
            if verbose >= 2:
                print(iCodonStopList)
                print([codonList[i] for i in iCodonStopList])

            # In each subsequence in between stop codons, search for start codons
            iCodonFirstInChunk = 0
            for iCodonStop in iCodonStopList:
                if verbose >= 2:
                    print("iCodonFirstInChunk", iCodonFirstInChunk,
                          "iCodonStop", iCodonStop)
                # The chunk runs up to and including its terminating stop codon
                codonListChunk = codonList[iCodonFirstInChunk:iCodonStop + 1]
                ORFStopCodon = codonList[iCodonStop]
                if verbose >= 2:
                    print("codonListChunk", [str(c) for c in codonListChunk])

                # Find the start codon positions inside the chunk
                iCodonStartList = [
                    i for i, c in enumerate(codonListChunk)
                    if str(c) in startCodons
                ]
                if verbose >= 2:
                    print("iCodonStartList", iCodonStartList)

                for iStart in iCodonStartList:
                    if verbose >= 2:
                        print("iStart", iStart)
                    iCodonInSeq = iCodonFirstInChunk + iStart
                    ORFStartCodon = codonList[iCodonInSeq]
                    if verbose >= 2:
                        print("iCodonInSeq", iCodonInSeq,
                              "codonList[iCodonInSeq]", codonList[iCodonInSeq])

                    # Position of start and stop codons in the nucleotide sequence
                    iStartInNucleotideSeq = iCodonInSeq * 3 + frame
                    iStopInNucleotideSeq = iCodonStop * 3 + frame
                    if verbose >= 2:
                        print("iStartInNucleotideSeq", iStartInNucleotideSeq,
                              "iStopInNucleotideSeq", iStopInNucleotideSeq)

                    # Define ORF region on the nucleotide sequence.
                    # Locations are always given in forward-strand
                    # coordinates as a half-open [ORFStart, ORFEnd) interval.
                    if strand == +1:
                        ORFStart = iStartInNucleotideSeq
                        # +2 reaches the last base of the stop codon,
                        # +1 makes the end exclusive
                        ORFEnd = iStopInNucleotideSeq + 2 + 1
                    else:
                        # Index p on the reverse complement is index
                        # (seqLength - 1) - p on the forward strand, so the
                        # last base of the stop codon becomes the start.
                        ORFStart = (seqLength - 1) - (iStopInNucleotideSeq + 2)
                        ORFEnd = (seqLength - 1) - iStartInNucleotideSeq + 1
                    if verbose >= 2:
                        print("ORFStart", ORFStart, "ORFEnd", ORFEnd, "strand",
                              strand)

                    # Define ORF as Biopython SeqFeature
                    ORFFeat = SeqFeature(location=FeatureLocation(
                        ORFStart, ORFEnd, strand=strand),
                                         type='putative ORF',
                                         id=None)

                    # Compute translation of ORF
                    # NOTE(review): cds=True asks Biopython to validate the
                    # ORF as a complete CDS; a start codon that the table does
                    # not list as a start presumably raises here -- confirm.
                    ORFFeat.qualifiers['translation'] = ORFFeat.extract(
                        seqBio).translate(table=codonTable, cds=True)
                    ORFFeat.qualifiers['start_codon'] = str(ORFStartCodon)
                    ORFFeat.qualifiers['stop_codon'] = str(ORFStopCodon)

                    if verbose >= 2:
                        print("ORFFeat", ORFFeat)
                        print("ORFFeat.extract(seqBio)",
                              ORFFeat.extract(seqBio))
                    ORFList.append(ORFFeat)

                # The next chunk starts right after this stop codon
                iCodonFirstInChunk = iCodonStop + 1

    return ORFList
Esempio n. 47
0
class TestFeatureMatch(unittest.TestCase):
    """Tests for FeatureMatch creation, string output and FASTA output."""

    def setUp(self):
        """Build a forward-strand FeatureMatch shared by all tests."""
        self.seq = Seq("CCCAAAATGTACTCCACTATCTGCTGATTTGGG", generic_dna)
        gene_location = FeatureLocation(6, 27)
        self.feature = SeqFeature(gene_location, type="gene", strand=1)
        # The sequence handed to FeatureMatch lacks three bases at each end
        self.feature_seq = self.seq[3:-3]
        self.match = FeatureMatch(self.feature, self.feature_seq, 1, 3)

    def _fasta_record(self, payload):
        """Return the current FASTA header and payload joined by a newline."""
        return "%s\n%s" % (self.match.get_fasta_header(), payload)

    def test__init(self):
        """Test FeatureMatch object creation"""
        # forward strand
        fwd = self.match
        trimmed = self.feature_seq
        self.assertEqual(fwd.direction, "forward")
        self.assertEqual(str(fwd.dna), "ATGTACTCCACTATCTGCTGA")
        self.assertEqual(str(fwd.long_dna), str(trimmed))
        self.assertEqual(str(fwd.promotor_region), "AAA")
        self.assertEqual(str(fwd.terminator_region), "TTT")
        self.assertEqual(str(fwd.mrna), str(trimmed.transcribe()))
        self.assertEqual(
            str(fwd.aas),
            str(self.feature.extract(self.seq).translate(to_stop=True)))

        # reverse strand
        rc_seq = self.seq.reverse_complement()
        rc_trimmed = rc_seq[3:-3]
        rc_feature = SeqFeature(FeatureLocation(6, 27),
                                type="gene",
                                strand=-1)
        rev = FeatureMatch(rc_feature, rc_trimmed, -1, 3)
        self.assertEqual(rev.direction, "reverse")
        self.assertEqual(str(rev.dna),
                         str(rc_trimmed[3:-3].reverse_complement()))
        self.assertEqual(str(rev.long_dna), str(self.feature_seq))
        self.assertEqual(str(rev.promotor_region), "AAA")
        self.assertEqual(str(rev.terminator_region), "TTT")
        self.assertEqual(str(rev.mrna),
                         str(rc_trimmed.transcribe().reverse_complement()))
        self.assertEqual(
            str(rev.aas),
            str(rc_feature.extract(rc_seq).translate(to_stop=True)))

    def test_get_fasta_header(self):
        """Test FeatureMatch FASTA header creation"""
        self.assertEqual(self.match.get_fasta_header(), ">untagged")

        # Each step adds one qualifier and states the header expected after it
        steps = (
            ('gene', 'fake', ">fake"),
            ('locus_tag', 'FAKE_0001', ">FAKE_0001"),
            ('product', 'Mup1', ">FAKE_0001|Mup1"),
            ('protein_id', 'MUP_0001', ">FAKE_0001|Mup1|MUP_0001"),
        )
        for qualifier, value, header in steps:
            self.match.feature.qualifiers[qualifier] = [value]
            self.assertEqual(self.match.get_fasta_header(), header)

    def test__str_(self):
        """Test FeatureMatch string representation"""
        # The three trailing lines are the same in every expected output
        sequence_lines = ("\tDNA: ATGTACTCCACTATCTGCTGA\n"
                          "\tmRNA: AAAAUGUACUCCACUAUCUGCUGAUUU\n"
                          "\tProtein: MYSTIC")

        self.assertMultiLineEqual(
            str(self.match),
            "Feature:\n\tuntagged\n\tStrand: forward\n" + sequence_lines)

        # Each step adds one qualifier and gives the lines expected between
        # the "Feature:" heading and the sequence lines afterwards
        steps = (
            ('gene', "fake",
             "\tTag: fake\n"
             "\tStrand: forward\n"),
            ('locus_tag', 'FAKE_0001',
             "\tTag: FAKE_0001\n"
             "\tStrand: forward\n"),
            ('product', 'Mup1',
             "\tTag: FAKE_0001\n"
             "\tStrand: forward\n"
             "\tProduct: Mup1\n"),
            ('protein_id', 'MUP_0001',
             "\tTag: FAKE_0001\n"
             "\tStrand: forward\n"
             "\tProduct: Mup1\n"
             "\tProtein ID: MUP_0001\n"),
        )
        for qualifier, value, detail_lines in steps:
            self.match.feature.qualifiers[qualifier] = [value]
            expected = "Feature:\n" + detail_lines + sequence_lines
            self.assertMultiLineEqual(str(self.match), expected)

    def test_dna_fasta(self):
        """Test FeatureMatch DNA FASTA output"""
        expected = self._fasta_record(self.match.dna)
        self.assertMultiLineEqual(self.match.dna_fasta(), expected)

    def test_long_dna_fasta(self):
        """Test FeatureMatch long DNA FASTA output"""
        expected = self._fasta_record(self.match.long_dna)
        self.assertMultiLineEqual(self.match.long_dna_fasta(), expected)

    def test_mrna_fasta(self):
        """Test FeatureMatch mRNA FASTA output"""
        expected = self._fasta_record(self.match.mrna)
        self.assertMultiLineEqual(self.match.mrna_fasta(), expected)

    def test_protein_fasta(self):
        """Test FeatureMatch protein FASTA output"""
        expected = self._fasta_record(self.match.aas)
        self.assertMultiLineEqual(self.match.protein_fasta(), expected)

    def test_promotor_fasta(self):
        """Test FeatureMatch promotor DNA FASTA output"""
        expected = self._fasta_record(self.match.promotor_region)
        self.assertMultiLineEqual(self.match.promotor_fasta(), expected)

    def test_terminator_fasta(self):
        """Test FeatureMatch terminator DNA FASTA output"""
        expected = self._fasta_record(self.match.terminator_region)
        self.assertMultiLineEqual(self.match.terminator_fasta(), expected)
Esempio n. 48
0
def _annotate(seq_record, options, results):
    """Annotate seq_record with CDS_motifs for the result.

    For every HSP of every entry in ``results`` that passes the score and
    e-value cut-offs and whose ``query_id`` is a key of the feature dict of
    ``seq_record``, a new SeqFeature of type
    ``options.FeatureTags.fullhmmer_tag`` is appended to
    ``seq_record.features``.

    Args:
        seq_record: Record whose features are looked up by id and which
            receives the new features (modified in place).
        options: Configuration object; read for the cut-offs (through
            _min_score / _max_evalue) and for ``FeatureTags.fullhmmer_tag``.
        results: Iterable of result objects exposing ``id``, ``target`` and
            ``hsps``.

    Returns:
        None.
    """
    logging.debug("generating feature objects for PFAM hits")
    min_score = _min_score(options)
    max_evalue = _max_evalue(options)

    feature_by_id = utils.get_feature_dict(seq_record)

    for r in results:
        # Running number used in asDomain_id; it restarts for every result
        # and only counts the HSPs that are actually annotated.
        i = 1
        for hsp in r.hsps:
            if hsp.bitscore <= min_score or hsp.evalue >= max_evalue:
                continue

            # "in" replaces dict.has_key(), which no longer exists on Python 3
            if hsp.query_id not in feature_by_id:
                continue

            feature = feature_by_id[hsp.query_id]

            start, end = _calculate_start_end(feature, hsp)
            loc = FeatureLocation(start, end, strand=feature.strand)

            newFeature = SeqFeature(location=loc, type=options.FeatureTags.fullhmmer_tag)

            quals = defaultdict(list)

            quals['label'].append(r.id)
            if 'locus_tag' in feature.qualifiers:
                quals['locus_tag'] = feature.qualifiers['locus_tag']
            else:
                quals['locus_tag'] = [hsp.query_id]
            quals['domain'] = [hsp.hit_id]
            quals['asDomain_id'] = ['fullhmmer_'+'_'.join(quals['locus_tag'])+'_'+'{:04d}'.format(i)]
            i += 1

            quals['evalue'] = [str("{:.2E}".format(float(hsp.evalue)))]
            quals['score'] = [str(hsp.bitscore)]
            quals['aSTool'] = ["fullhmmer"]
            quals['detection'] = ["hmmscan"]
            quals['database'] = [path.basename(r.target)]
            if 'transl_table' in feature.qualifiers:
                # The qualifier must hold exactly one value; unpacking
                # fails otherwise.
                [transl_table] = feature.qualifiers['transl_table']
            else:
                transl_table = 1
            quals['translation'] = [str(newFeature.extract(seq_record.seq).translate(table=transl_table))]

            quals['note'].append("%s-Hit: %s. Score: %s. E-value: %s. Domain range: %s..%s." % \
                    (path.basename(r.target), hsp.hit_id, hsp.bitscore, hsp.evalue,
                     hsp.hit_start, hsp.hit_end))

            quals['description'] = [hsp.hit_description]

            # Hits without an entry in name_to_pfamid get no db_xref. Only
            # the lookup itself is guarded so that unrelated KeyErrors are
            # not swallowed.
            try:
                pfamid = name_to_pfamid[hsp.hit_id]
            except KeyError:
                pass
            else:
                # quals is a defaultdict(list), so the list is created on
                # first access
                quals['db_xref'].append("PFAM: %s" % pfamid)

            newFeature.qualifiers = quals
            seq_record.features.append(newFeature)
Esempio n. 49
0
                                 type=typ,
                                 id=name)

            else:
                fea = SeqFeature(CompoundLocation([FeatureLocation(edge[0], edge[1], strand=+1)
                                                   for edge in edges]),
                                 type=typ,
                                 id=name)
            
            seqnew.features.append(fea)

    
    print 'Sanity checks'
    for fea in seqnew.features:
        if fea.type in ('gene', 'protein'):
            feaseq = fea.extract(seqnew)

            # vpr has an additional T in HXB2
            if fea.id == 'vpr':
                assert ((len(feaseq) - 1) % 3) == 0
                T_pos = 5771
                prot = (feaseq[: T_pos - fea.location.nofuzzy_start].seq + \
                        feaseq[T_pos + 1 - fea.location.nofuzzy_start:].seq).translate()
            else:
                assert (len(feaseq) % 3) == 0
                prot = feaseq.seq.translate()


            # These genes contain premature stops in HXB2
            if fea.id in ('tat', 'nef'):
                assert prot[-1] == '*'