Beispiel #1
0
    def _findGeneInSeq(self, record):
        """Extract gene sequence from larger sequence (e.g. genomes)
by searching features."""
        if not record.features:
            # if there aren't any features, just return the record
            return record
        for feature in record.features:
            feature_names = []
            if 'gene' in feature.qualifiers.keys():
                feature_names.extend(feature.qualifiers['gene'])
            if 'gene_synonym' in feature.qualifiers.keys():
                feature_names.extend(feature.qualifiers['gene_synonym'])
            if 'product' in feature.qualifiers.keys():
                feature_names.extend(feature.qualifiers['product'])
            gene_names = [e.lower() for e in self.gene_names]
            feature_names = [e.lower() for e in feature_names]
            if set(gene_names) & set(feature_names):
                try:
                    extractor = SeqFeature(feature.location)
                    found_seq = extractor.extract(record)
                except ValueError:
                    # catch value errors raised for sequences
                    #  with "fuzzy" positions
                    # TODO: what are fuzzy positions and can I use
                    #  them?
                    return record
                else:
                    return found_seq
        return record
 def t_write_from_recs(self):
     """Check the GFF3 lines written for a SeqRecord with nested features.

     A gene feature holding two exon sub-features is attached to a record,
     the record is written with ``GFF.write`` and the first four output
     lines are compared with the expected text.
     """
     record = SeqRecord(Seq("GATCGATCGATCGATCGATC"), "ID1")
     gene_quals = {"source": "prediction", "score": 10.0,
                   "other": ["Some", "annotations"], "ID": "gene1"}
     exon_quals = {"source": "prediction"}
     gene = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                       qualifiers=gene_quals)
     gene.sub_features = [
         SeqFeature(FeatureLocation(begin, end), type="exon", strand=1,
                    qualifiers=exon_quals)
         for begin, end in ((0, 5), (15, 20))
     ]
     record.features = [gene]

     sink = StringIO.StringIO()
     GFF.write([record], sink)
     written = sink.getvalue().split("\n")

     expected_gene = ['ID1', 'prediction', 'gene', '1', '20', '10.0', '+',
                      '.', 'other=Some,annotations;ID=gene1']
     expected_exon = ['ID1', 'prediction', 'exon', '1', '5', '.', '+', '.',
                      'Parent=gene1']
     assert written[0] == "##gff-version 3"
     assert written[1] == "##sequence-region ID1 1 20"
     assert written[2].split("\t") == expected_gene
     assert written[3].split("\t") == expected_exon
Beispiel #3
0
def mergeRecords(file): #adapted from SeqHandler by NF Alikhan (github.com/happykhan/seqhandler)
    """Merge every record of a multi-record sequence file into one record.

    SeqHandler is a script for merging, converting and splitting sequence
    files (Genbank, EMBL, fasta and others). Please use it to merge
    multi-Genbank files before running bwast.py.

    Each input record is marked with a ``fasta_record`` feature whose
    ``note`` qualifier holds the record name, the records are concatenated,
    every existing ``source`` feature is dropped and a single ``source``
    feature spanning the merged sequence is inserted first.

    Args:
        file: Path of the input file; its format comes from
            ``determineFileType``.

    Returns:
        Path of the written file: ``file`` with its final extension replaced
        by ``.merged.fa``.
    """
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    filetype = determineFileType(file) #determine file type
    # Close the input as soon as all records are in memory.
    with open(file, "r") as readInMultifasta:
        records = list(SeqIO.parse(readInMultifasta, filetype))

    mergingFile = records[0]
    contigs = SeqFeature(FeatureLocation(0, len(mergingFile)),
                         type="fasta_record", strand=1)
    contigs.qualifiers["note"] = records[0].name #pull out contig number of first contig
    mergingFile.features.append(contigs) #append first contig to mergingFile
    for nextRecord in records[1:]:
        contigs = SeqFeature(
            FeatureLocation(len(mergingFile),
                            len(mergingFile) + len(nextRecord)),
            type="fasta_record", strand=1)
        contigs.qualifiers["note"] = nextRecord.name
        mergingFile.features.append(contigs) #append subsequent contigs to mergingFile
        mergingFile += nextRecord
    mergingFile.name = records[0].name
    mergingFile.description = records[0].description
    mergingFile.annotations = records[0].annotations

    # Filter in one pass: calling remove() while iterating the same list
    # skips the element after each removal, leaving stale 'source' features.
    mergingFile.features[:] = [
        feature for feature in mergingFile.features
        if feature.type != 'source'
    ]
    contigs = SeqFeature(FeatureLocation(0, len(mergingFile)), type="source",
                         strand=1)
    mergingFile.features.insert(0, contigs)

    # NOTE(review): when `file` has no extension the substitution leaves the
    # name unchanged, so the output overwrites the input - confirm intent.
    merged_file = re.sub(r"\.\w+$", r".merged.fa", file)
    # The context manager guarantees the output is flushed and closed.
    with open(merged_file, "w") as out_handle:
        SeqIO.write(mergingFile, out_handle, filetype)
    return merged_file
def make_protein_feature(feature_name, feature_start, feature_end, feature_type):
    """Build a SeqFeature of ``feature_type`` from start and end positions.

    ``feature_start`` and ``feature_end`` are coerced with ``int``. Only a
    feature whose type is "Region" receives a ``name`` qualifier holding
    ``feature_name``; for any other type the name is not stored.
    """
    span = FeatureLocation(int(feature_start), int(feature_end))
    protein_feature = SeqFeature(span, type=feature_type)
    if feature_type != "Region":
        return protein_feature
    protein_feature.qualifiers = {'name': [feature_name]}
    return protein_feature
Beispiel #5
0
 def test_translation_checks_cds(self):
     """Translating this CDS-typed feature is expected to raise TranslationError."""
     dna = Seq.Seq("GGTTACACTTACCGATAATGTCTCTGATGA", generic_dna)
     cds = SeqFeature(FeatureLocation(0, 30), type="CDS")
     cds.qualifiers['transl_table'] = [11]
     # Callable form of assertRaises: the translation runs inside the check.
     self.assertRaises(TranslationError, cds.translate, dna)
Beispiel #6
0
def mergeMod(args):
    """Merge all records of ``args.input`` into one record and write it.

    Each input record is marked with a ``fasta_record`` feature whose
    ``note`` qualifier holds the record name, the records are concatenated,
    existing ``source`` features are dropped and a single ``source`` feature
    spanning the merged sequence is inserted first.

    Args:
        args: Object providing ``input`` and ``output`` paths, ``inFormat``,
            and the optional (possibly ``None``) ``outFormat``,
            ``accession`` and ``ver`` values.
    """
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    filetype = args.inFormat
    # Load file as SeqRecord; the handle is closed once everything is read.
    with open(args.input, "r") as int_handle:
        recs = list(SeqIO.parse(int_handle, filetype))
    # For each SeqRecord, I.e. complete gbk annotation obj in file
    fgbk = recs[0]

    d = SeqFeature(FeatureLocation(0, len(fgbk)), type="fasta_record", strand=1)
    d.qualifiers["note"] = recs[0].name
    fgbk.features.append(d)
    for l in recs[1:]:
        d = SeqFeature(FeatureLocation(len(fgbk), len(fgbk) + len(l)),
                       type="fasta_record", strand=1)
        d.qualifiers["note"] = l.name
        fgbk.features.append(d)
        fgbk += l
    fgbk.name = recs[0].name
    fgbk.description = recs[0].description
    fgbk.annotations = recs[0].annotations
    if args.accession is not None:
        fgbk.name = args.accession
    if args.ver is not None:
        fgbk.id = fgbk.name + "." + args.ver

    # Filter in one pass: calling remove() while iterating the same list
    # skips the element after each removal, leaving stale 'source' features.
    fgbk.features[:] = [f for f in fgbk.features if f.type != "source"]
    d = SeqFeature(FeatureLocation(0, len(fgbk)), type="source", strand=1)
    fgbk.features.insert(0, d)

    # The output format falls back to the input format when none is given.
    outtype = filetype if args.outFormat is None else args.outFormat
    # The context manager guarantees the output is flushed and closed.
    with open(args.output, "w") as out_handle:
        SeqIO.write(fgbk, out_handle, outtype)
Beispiel #7
0
def get_longest(seq_record, gene2isoforms):
	"""Process the longest isoform of every gene.

	The longest isoform is the one whose parts have the largest summed
	length. What happens with it depends on the module-level ``args.format``:
	for 'bed' it is passed to ``compound_to_bed``; for 'fasta' its sequence
	is extracted from ``seq_record`` and collected.

	Args:
		seq_record: Record the 'fasta' sequences are extracted from; its
			``name`` is passed through ``adjust_name``.
		gene2isoforms: Mapping of gene name -> iterable of isoforms, each
			isoform being a sequence of location objects.

	Returns:
		List of extracted records, each with name, id and description set to
		the gene name. Empty unless ``args.format`` is 'fasta'.
	"""
	extracted = []
	chrom = adjust_name(seq_record.name)

	# items() works on both Python 2 and 3, unlike iteritems().
	for gene, isoforms in gene2isoforms.items():
		longest = max(isoforms, key=lambda parts: sum(len(part) for part in parts))

		if args.format == 'bed':
			compound_to_bed(longest, chrom, gene)

		elif args.format == 'fasta':
			# A single part is used as-is; several parts are joined.
			if len(longest) > 1:
				location = CompoundLocation(longest, operator="join")
			else:
				location = longest[0]

			feature = SeqFeature(location=location, type='utr', strand=longest[0].strand)

			f = feature.extract(seq_record)
			f.name = gene
			f.id = gene
			f.description = gene
			extracted.append(f)
	return extracted
 def seqBlastToFeatures(self, blastDB, blastExe, seqFile, blastType = "blastn",scoreMin = 1e-3, logFile = None):
     '''
     Blast sequence file against blast database and convert the hits
     into features.

     This function may not work as well on very large blast comparisons because
     it does a full read of the result for the conversion to features.

     seqFile, blastType, scoreMin and logFile are forwarded to self.seqBlast.
     Only HSPs whose expect value is below scoreMin become features.

     NOTE(review): the blastDB and blastExe parameters are not used; the
     values printed come from self.blastDB / self.blastExe - confirm intent.

     Returns a list with one entry per blast record; each entry is the list
     of SeqFeature objects built from that record's accepted HSPs, carrying
     "query", "subject" and "alignment" qualifiers.
     '''
     print(">blast %s %s %s %s" % (self.blastDB, self.blastExe, seqFile, blastType))
     # Forward the caller's settings; previously hard-coded defaults were
     # passed here, so non-default arguments were silently ignored.
     blastRecords = self.seqBlast(seqFile, blastType = blastType, scoreMin = scoreMin, logFile = logFile)

     result = []
     for r in blastRecords:
         recordFeatures = []
         query = r.query
         for alignment in r.alignments:
             for hsp in alignment.hsps:
                 if hsp.expect < scoreMin:
                     # The second element of the frame tuple is used as strand.
                     (_, strand) = hsp.frame
                     location = FeatureLocation(hsp.sbjct_start, hsp.sbjct_end)
                     feature = SeqFeature(id=query,location=location,strand=strand)
                     aMatch = hsp.query + "\n" + hsp.match + "\n" + hsp.sbjct
                     feature.qualifiers["query"] = hsp.query
                     feature.qualifiers["subject"] = hsp.sbjct
                     feature.qualifiers["alignment"] = aMatch
                     recordFeatures.append(feature)
         result.append(recordFeatures)

     return result
def annotate_dna_reference(ref, protein):
    '''Annotate DNA reference with the protein secondary structures.

    Every row of the table returned by parse_secondary_structure(protein)
    becomes a forward-strand SeqFeature. Row 'start' and 'end' values are
    multiplied by 3 and offset by the start of the protein's annotation in
    ``ref``; the row's 'feature' value becomes the feature type.

    Returns the list of created features.
    '''
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    annotation_table = parse_secondary_structure(protein)

    is_gagpol = protein == 'gagpol'
    # gagpol is anchored on the 'gag' annotation.
    anchor = 'gag' if is_gagpol else protein
    start_protein = ref.annotation[anchor].location.nofuzzy_start
    if protein == 'vpu':
        # Uniprot starts one aa downstream of us
        start_protein = start_protein + 3

    features = []
    for _, datum in annotation_table.iterrows():
        bounds = [datum['start'] * 3 + start_protein,
                  datum['end'] * 3 + start_protein]

        # Notice ribosomal slippage site
        if is_gagpol:
            bounds = [pos - 1 if pos >= slippage_site else pos
                      for pos in bounds]

        anno = SeqFeature(FeatureLocation(bounds[0], bounds[1]), strand=+1)
        anno.type = datum['feature']
        features.append(anno)

    return features
	def parse(self):
		"""Build one SeqRecord whose features describe the FASTA entries of self._file.

		Each entry's description must match the header pattern below. Entries
		that do not match are reported in ``self.errors`` and skipped. The
		first matching entry supplies the record id and name. Every matching
		entry adds a "gene" feature carrying locus_tag, gene, note and
		sequence qualifiers.

		Returns:
			The assembled SeqRecord (its sequence is an empty UnknownSeq).
		"""
		header_pattern = re.compile(r"ref\|(?P<id>.*?)\|:(?P<start>[0-9]+)-(?P<end>[0-9]+)\|(?P<description>.*?)\|\s*\[gene=(?P<gene>\S+)\]\s*\[locus_tag=(?P<locus_tag>\S+)\]\s*")
		with open(self._file) as handle:
			genbank = SeqRecord(Seq.UnknownSeq(0))
			identified = False
			for record in SeqIO.parse(handle, "fasta"):
				header = record.description
				match = header_pattern.match(header)
				if match is None:
					self.errors.append("Invalid header: >" + header)
					continue

				if not identified:
					# The first valid header names the whole record.
					identified = True
					genbank.id = match.group("id")
					genbank.name = match.group("id")

				span = FeatureLocation(int(match.group("start")), int(match.group("end")))
				feature = SeqFeature(span, type = "gene")
				feature.qualifiers = {"locus_tag": match.group("locus_tag"),
							"gene": match.group("gene"),
							"note": match.group("description"),
							"sequence": record.seq}
				genbank.features.append(feature)

			return genbank
Beispiel #11
0
 def to_seqfeature(self):
     """Return a SeqFeature spanning self.start..self.end with id self.value.

     When the object has a ``confidence`` attribute, it is copied into the
     feature's ``confidence`` qualifier.
     """
     span = FeatureLocation(self.start, self.end)
     domain_feature = SeqFeature(location=span, id=self.value)
     if not hasattr(self, 'confidence'):
         return domain_feature
     domain_feature.qualifiers['confidence'] = self.confidence
     return domain_feature
    def test_deletion__overlapping_features(self):
        """Delete the bases shared by two overlapping CDS features.

        After the deletion the record sequence and the sequence extracted by
        each feature must no longer contain the shared bases.
        """
        # Example based on intersection of nei and arbB genes in MG1655.
        head = 'GCCCTGGCTGCCAGCA'
        shared = 'CTAG'
        tail = 'GCCGACCGCTTCGG'
        full_seq = head + shared + tail
        seq_record = SeqRecord(Seq(full_seq, generic_dna))

        shared_start = len(head)
        shared_end = shared_start + len(shared)

        # First CDS covers head + shared, second covers shared + tail.
        first_cds = SeqFeature(
                FeatureLocation(0, shared_end, strand=1), type='CDS', id='1')
        second_cds = SeqFeature(
                FeatureLocation(shared_start, len(full_seq), strand=1),
                type='CDS', id='2')
        seq_record.features.extend([first_cds, second_cds])

        maker = VCFToGenbankMaker(seq_record, None, None)
        maker._update_genome_record_for_variant(shared_start, shared, '')

        # Assert the sequence is correct.
        self.assertEqual(head + tail, str(seq_record.seq))

        # Assert the feature annotations are still correct.
        self.assertEqual(head, str(first_cds.extract(seq_record.seq)))
        self.assertEqual(tail, str(second_cds.extract(seq_record.seq)))
def create_gene_feature(gene_name, feature_location, feature_qualifiers):
    """Creates a minimal SeqFeature to represent a gene.

    Args:
        gene_name: Name stored as the single value of the 'gene' qualifier.
        feature_location: Location passed to the SeqFeature constructor.
        feature_qualifiers: Mapping of extra qualifiers; its entries take
            precedence over the default 'gene' entry. It is not modified.

    Returns:
        A SeqFeature of type 'gene' with the merged qualifiers.
    """
    # Build the merged dict with update(): concatenating items() with '+'
    # only works on Python 2, where items() returns lists.
    qualifiers = {'gene': [gene_name]}
    qualifiers.update(feature_qualifiers.items())
    gene_feature = SeqFeature(feature_location, type='gene')
    gene_feature.qualifiers = qualifiers
    return gene_feature
Beispiel #14
0
 def _get_feature(self, feature_dict):
     """Build a Biopython SeqFeature from a dictionary description.

     The dictionary supplies 'location' (unpacked into FeatureLocation),
     'type', 'id', 'strand' and 'quals' (assigned as the qualifiers).
     """
     span = FeatureLocation(*feature_dict['location'])
     biopython_feature = SeqFeature(
             span,
             feature_dict['type'],
             id=feature_dict['id'],
             strand=feature_dict['strand'])
     biopython_feature.qualifiers = feature_dict['quals']
     return biopython_feature
def get_gene_and_201bp_upstream(genefeature,genomeseq):
    """Extract a gene together with the 201 bp upstream of it.

    For a forward-strand gene the window is extended before the start; for
    a reverse-strand gene it is extended past the end.

    Args:
        genefeature: Feature whose ``location`` gives start, end and strand.
        genomeseq: Sequence the window is extracted from.

    Returns:
        The result of extracting the extended window from ``genomeseq``.

    Raises:
        ValueError: If the gene's strand is neither 1 nor -1.
    """
    mystart = genefeature.location.start
    myend = genefeature.location.end
    mystrand = genefeature.location.strand
    if mystrand == 1:
        # Clamp at the sequence origin: a negative start would be read as
        # an offset from the end of the sequence when slicing.
        window = FeatureLocation(max(0, mystart - 201), myend)
    elif mystrand == -1:
        window = FeatureLocation(mystart, myend + 201)
    else:
        # Without a strand "upstream" is undefined; fail with a clear
        # message instead of an unbound-variable error.
        raise ValueError(
            "cannot determine upstream region for strand %r" % (mystrand,))
    newfeature = SeqFeature(window, strand=mystrand)
    return newfeature.extract(genomeseq)
def makeSeqObjectsForTblastnNeighbors(tblastn_id, clusterrunid, cur, N=200000):
    """
    Given a TBLASTN ID, return a list of seq features for the genes around the
    hit, INCLUDING a feature for the TBLASTN hit itself (so that we can show
    it on the region drawing).

    The contig named in the TBLASTN ID is sanitized; it is translated back to
    the database ID through the mapping returned by getSanitizedContigList().

    N should be large enough to reach at least one gene: the genes within N
    of the hit are fetched, the closest one is picked, its neighbors come
    from makeSeqFeaturesForGeneNeighbors() and the TBLASTN feature is
    appended to them. If no gene lies in range a warning is written to
    stderr and only the TBLASTN feature is returned.
    """
    # Get the contig and start/stop locations (which tell us the strand) out
    # of the TBLASTN id. Per the original notes, splitTblastn signals failure
    # with a ValueError which the calling function can catch if needed.
    sanitizedToNot = getSanitizedContigList(cur)

    contig, start, stop = splitTblastn(tblastn_id)
    contig = sanitizedToNot.get(contig, contig)
    start, stop = int(start), int(stop)

    # Create a seq object for the TBLASTN hit itself
    strand = +1 if start < stop else -1
    tblastn_feature = SeqFeature(FeatureLocation(start, stop), strand=strand, id=tblastn_id)
    tblastn_feature.qualifiers["cluster_id"] = -1

    # Find the neighboring genes.
    neighboring_genes = getGenesInRegion(contig, start - N, stop + N, cur)
    if len(neighboring_genes) == 0:
        sys.stderr.write(
            "WARNING: No neighboring genes found for TBLASTN hit %s within %d nucleotides in contig %s\n"
            % (tblastn_id, N, contig)
        )
        return [tblastn_feature]
    neighboring_geneinfo = getGeneInfo(neighboring_genes, cur)

    # Find the closest gene to ours; its neighbors are then looked up for the
    # specific clusterrunid.
    closest_gene = None
    closest_distance = N
    for geneinfo in neighboring_geneinfo:
        gene_ends = (int(geneinfo[5]), int(geneinfo[6]))
        distance = min(
            abs(gene_end - hit_end)
            for hit_end in (start, stop)
            for gene_end in gene_ends
        )
        if distance < closest_distance:
            closest_gene = geneinfo[0]
            closest_distance = distance

    neighboring_features = makeSeqFeaturesForGeneNeighbors(closest_gene, clusterrunid, cur)
    # Add the TBLASTN itself and return it.
    neighboring_features.append(tblastn_feature)
    return neighboring_features
def find_cds ():
    """Extract and print the translated CDS of ``record_dict[keys]``.

    The record description is split on "|". A field starting with "CDS" is
    split on ":" and "-" into a feature name, a start and an end; when
    several fields match, the last one wins. The module-level names
    ``record_dict`` and ``keys`` select the record.

    Returns:
        Tuple ``(cds_start, cds_end, cds_sequence)``; start and end are the
        strings taken from the description.

    Raises:
        ValueError: If the description holds no field starting with "CDS".
    """
    description = str(record_dict[keys].description)
    feature = cds_start = cds_end = None
    for field in description.split("|"):
        if re.match("CDS", field):
            feature, cds_start, cds_end = re.split(":|-", field)
    if feature is None:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError("no CDS entry found in description: %r" % description)
    # NOTE(review): if the header coordinates are 1-based inclusive, the end
    # should not be decremented (the last base is dropped) - confirm.
    cds_feature = SeqFeature(FeatureLocation(int(cds_start)-1,int(cds_end)-1),
                type=str(feature))
    cds_sequence = cds_feature.extract(record_dict[keys].seq)
    print(cds_sequence.translate())
    return cds_start, cds_end, cds_sequence
Beispiel #18
0
def make_seq_feature(start, end, ftype, quals=None):
    '''
    create a forward-strand sequence feature from a start, end, and a type.
    additionally you may include other fields, like note, label, evidence,
    citation, as a dict.

    The dict passed as quals is copied, never modified. The 'source'
    qualifier of the returned feature is always set to ['splicemod'].
    '''
    # Copy so that neither a shared default nor the caller's dict is
    # mutated when the 'source' entry is added below.
    qualifiers = dict(quals) if quals else {}
    qualifiers['source'] = ['splicemod']

    seq_feature = SeqFeature(FeatureLocation(start, end), strand= +1, type=ftype)
    seq_feature.qualifiers = qualifiers
    return seq_feature
    def test_update_genome_record_for_variant__overlapping_features(self):
        """Replace the bases shared by two overlapping CDS features.

        After the substitution the record sequence and the sequence extracted
        by each (re-queried) feature must contain the replacement bases.
        """
        # Example based on intersection of nei and arbB genes in MG1655.
        head = 'GCCCTGGCTGCCAGCA'
        shared = 'CTAG'
        tail = 'GCCGACCGCTTCGG'
        full_seq = head + shared + tail
        seq_record = SeqRecord(Seq(full_seq, generic_dna))

        shared_start = len(head)
        shared_end = shared_start + len(shared)

        # First CDS covers head + shared, second covers shared + tail.
        seq_record.features.append(SeqFeature(
                FeatureLocation(0, shared_end, strand=1),
                type='CDS', id='1'))
        seq_record.features.append(SeqFeature(
                FeatureLocation(shared_start, len(full_seq), strand=1),
                type='CDS', id='2'))

        maker = VCFToGenbankMaker(seq_record, None, None)
        replacement = 'TTAA'
        maker._update_genome_record_for_variant(
                shared_start, shared, replacement)

        # Features changed, so requery them.
        by_id = {feature.id: feature for feature in seq_record.features}
        feature_1 = by_id.get('1')
        feature_2 = by_id.get('2')
        assert feature_1
        assert feature_2

        # Assert the sequence is correct.
        self.assertEqual(head + replacement + tail, str(seq_record.seq))

        # Feature added to represent swap (check currently disabled):
        # self.assertEqual(3, len(seq_record.features))

        # Assert the feature annotations are still correct.
        self.assertEqual(head + replacement,
                str(feature_1.extract(seq_record.seq)))
        self.assertEqual(replacement + tail,
                str(feature_2.extract(seq_record.seq)))
def attach_features(predictions, seqrecord):
    """Append a CDS feature to ``seqrecord`` for every confident prediction.

    Predictions are looked up under ``seqrecord.id``. Only those whose
    ``raw_score`` is at least 1.0 are attached. Each feature takes the
    prediction's location and strand and a ``locus_tag`` qualifier holding
    the prediction's ``cds_id``.
    """
    for prediction in predictions[seqrecord.id]:
        if not prediction.raw_score >= 1.0:
            continue
        cds_qualifiers = {'locus_tag': [prediction.cds_id]}
        cds_feature = SeqFeature(
            location=prediction.location,
            type='CDS',
            strand=prediction.strand,
            qualifiers=cds_qualifiers,
        )
        cds_feature.qualifiers = cds_qualifiers
        seqrecord.features.append(cds_feature)
def makeSeqFeature(geneid, cur):
    '''
    Make a BioPython SeqFeature object for a gene with ITEP ID geneid.

    Start, stop and strand are read from columns 5, 6 and 8 of the first row
    returned by getGeneInfo for this gene.
    '''

    gene_row = getGeneInfo( [ geneid ], cur )[0]
    start, stop, strand = (int(gene_row[column]) for column in (5, 6, 8))
    gene_feature = SeqFeature(FeatureLocation(start, stop), strand=strand, id=geneid)
    # This can be overwritten by other functions but we need a placeholder.
    gene_feature.qualifiers["cluster_id"] = -1
    return gene_feature
Beispiel #22
0
 def _add_gff_line(self, rec, gff_parts, parents, children):
     """Add details from a GFF line to the given SeqRecord.
     """
     gff_parts = [(None if p == '.' else p) for p in gff_parts]
     assert rec.id == gff_parts[0], "ID mismatch: %s %s" % (rec.id,
             gff_parts[0])
     # collect all of the base qualifiers for this item
     quals = collections.defaultdict(list)
     if gff_parts[1]:
         quals["source"].append(gff_parts[1])
     if gff_parts[5]:
         quals["score"].append(gff_parts[5])
     if gff_parts[7]:
         quals["phase"].append(gff_parts[7])
     for key, val in [a.split('=') for a in gff_parts[8].split(';')]:
         quals[key].extend(val.split(','))
     quals = dict(quals)
     # if we are describing a location, then we are a feature
     if gff_parts[3] and gff_parts[4]:
         #if quals.has_key('ID') or quals.has_key('Parent'):
         #    print gff_parts[1:6], quals
         location = FeatureLocation(int(gff_parts[3]) - 1, int(gff_parts[4]))
         new_feature = SeqFeature(location, gff_parts[2],
                 id = quals.get('ID', [''])[0],
                 strand = self._strand_map[gff_parts[6]])
         new_feature.qualifiers = quals
         # Handle flat features
         if not new_feature.id:
             rec.features.append(new_feature)
         # features that have parents need to link so we can pick up
         # the relationship
         elif new_feature.qualifiers.has_key('Parent'):
             for parent in new_feature.qualifiers['Parent']:
                 children[parent].append(new_feature)
         # top level features
         else:
             parents[rec.id].append(new_feature)
     # otherwise, associate these annotations with the full record
     else:
         # add these as a list of annotations, checking not to overwrite
         # current values
         for key, vals in quals:
             if rec.annotations.has_key(key):
                 try:
                     rec.annotations[key].extend(vals)
                 except AttributeError:
                     rec.annotations[key] = [rec.annotations[key]] + vals
             else:
                 rec.annotations[key] = vals
     return rec, parents, children
Beispiel #23
0
def make_join_feature(f_list, ftype="misc_feature"):
    """Wrap a list of features into one "join" feature.

    Every feature in ``f_list`` is modified in place: its type becomes
    ``ftype`` and its location operator "join". The returned feature spans
    from the start of the first to the end of the last feature and keeps
    ``f_list`` as its sub-features. Its strand is the common strand of the
    parts, or None when they disagree.
    """
    #NOTE - Does NOT reorder the sub-features (which you may
    #want to do for reverse strand features...)
    distinct_strands = set(part.strand for part in f_list)
    strand = f_list[0].strand if len(distinct_strands) == 1 else None
    for part in f_list:
        part.type = ftype
        part.location_operator = "join"
    span = FeatureLocation(f_list[0].location.start,
                           f_list[-1].location.end)
    jf = SeqFeature(span, type=ftype, strand=strand,
                    location_operator="join")
    jf.sub_features = f_list
    return jf
Beispiel #24
0
def nrpsSmash(dnaSeq):
    """Run ``specific_analysis`` on a DNA string wrapped as a one-CDS record.

    The whole sequence is annotated as a single forward-strand CDS with the
    gene qualifier 'gene'. After the analysis the raw prediction folder is
    removed.

    Returns:
        Whatever ``specific_analysis`` returns.
    """
    options = Namespace()
    options.outputfoldername = "/tmp/nrpspks_predictions_txt"
    options.record_idx = "" # used in NRPSPredictor2.nrpscodepred, check later what to set it to
    options.eukaryotic = 0

    whole_seq_cds = SeqFeature(FeatureLocation(0, len(dnaSeq)), type="CDS", strand=1)
    whole_seq_cds.qualifiers = {'gene':['gene']}

    sequenceRecord = SeqRecord(
        Seq(dnaSeq, IUPAC.unambiguous_dna),
        id = "seq_id",
        name = "seq_name",
        description = "seq_description")
    sequenceRecord.features = [whole_seq_cds]

    analysis = specific_analysis(sequenceRecord, options)
    # NOTE(review): raw_predictions_outputfolder is never assigned in this
    # function; presumably specific_analysis sets it on options - confirm.
    shutil.rmtree(options.raw_predictions_outputfolder)
    return analysis
def retrieveCompositeSequence(seq_record,seqList) :
    """Extract the region of ``seq_record`` covering every node of ``seqList``.

    Each node is a string of the form "<name>:<start>..<end>". The smallest
    and largest coordinate over all nodes delimit the extracted region.

    Returns:
        A SeqRecord holding the extracted sequence, with id
        "<record id>|<start>_<end>" and an empty description.
    """
    # true seq
    positions = []
    for node in seqList:
        _, coord = node.split(":")
        first, last = coord.split("..")
        positions.extend((int(float(first)), int(float(last))))

    start, end = min(positions), max(positions)
    region = SeqFeature(FeatureLocation(start, end)).extract(seq_record)
    seqId = seq_record.id+"|"+str(start)+"_"+str(end)
    return SeqRecord(seq=region.seq,id=seqId,description="")
Beispiel #26
0
 def __repr__(self):
     "Return the BioSeqFeature repr with the qualifiers added before its last character."
     base = BioSeqFeature.__repr__(self)
     # Drop the final character of the parent repr, append the qualifiers,
     # then close the representation again.
     pieces = [base[:-1], ", qualifiers=%s " % repr(self.qualifiers), ")"]
     return "".join(pieces)
Beispiel #27
0
 def convertReportToFeatures(self,report,conversionMap,startTag,endTag,strandTag,qualTag="gene"):
     '''
     Parse a 2d hash (report) into a list of features.

     One SeqFeature is built per row name of the report. Start and end are
     read from the startTag/endTag columns and converted to float; the
     strand is the strandTag value passed through self._parseConversion
     together with conversionMap. The row name is used as the feature id
     and stored under the qualTag qualifier.

     Returns the list of created features.
     '''
     result = []
     for rowName in report.returnRowNames():
         start = float(report.getElement(rowName,startTag))
         end = float(report.getElement(rowName,endTag))
         location = FeatureLocation(start,end)
         strand = self._parseConversion(report.getElement(rowName,strandTag),conversionMap)
         feature = SeqFeature(id = rowName,location=location,strand=strand)
         feature.qualifiers[qualTag] = [rowName]
         # NOTE(review): this iterates over exactly two items - the row name
         # and the whole list of column names - not over each column name.
         # Presumably a loop over report.returnColumnNames() was intended;
         # confirm against the report API before changing.
         for colName in [rowName,report.returnColumnNames()]:
             if colName in conversionMap.keys():
                 newName = conversionMap[colName]
                 # NOTE(review): indexes the report by colName only, unlike
                 # the getElement(row, column) calls above - verify.
                 value = report[colName]
                 feature.qualifiers[newName] = [value]
         result.append(feature)
     return result  
def parse_smart_domains(file_name):
    """Parse "KEY=VALUE" domain records from a file into Region features.

    Records are runs of non-blank lines separated by blank lines (lines of
    at most one character count as blank). The keys DOMAIN, START, END, TYPE
    and STATUS are recognised. A record yields a feature only when its TYPE
    is not "PFAM" and its name is not 'low_complexity_region'.

    Args:
        file_name: Path of the file to parse.

    Returns:
        List of SeqFeature objects of type "Region", each with a
        ``region_name`` qualifier holding the domain name.
    """
    # Plain "r": the "U" flag is rejected by Python 3.11+, and rstrip()
    # below already removes any trailing carriage return.
    with open(file_name, "r") as in_handle:
        lines = in_handle.readlines()
    # Sentinel blank line so that a final record is still emitted when the
    # file does not end with a blank line.
    lines.append("")

    domains = []
    domain_status = False
    domain_type = False
    is_domain = False
    for line in lines:
        line = line.rstrip()
        if len(line) > 1:
            pairs = line.split("=")
            is_domain = True
            if len(pairs) == 2:
                key, value = pairs
                if key == "DOMAIN":
                    domain_name = value
                elif key == "START":
                    domain_start = int(value)
                elif key == "END":
                    domain_end = int(value)
                elif key == "TYPE":
                    domain_type = value != "PFAM"
                elif key == "STATUS":
                    # Every status is accepted; the original code had the
                    # non-"visible|OK" rejection deliberately disabled.
                    domain_status = True
                else:
                    is_domain = False
        else:
            # Blank line: the current record is complete.
            if is_domain and domain_type and domain_status:
                d = SeqFeature(FeatureLocation(domain_start, domain_end), type="Region")
                d.qualifiers = {'region_name': [domain_name]}
                if domain_name != 'low_complexity_region':
                    domains.append(d)
                is_domain = False
                domain_type = False
                domain_status = False
    return domains
def main():
    """Annotate the input genome with one feature per SNP row and write it.

    The genome is read from INPUT_GENOME (GenBank). Every data row of
    SNP_CSV_DATA_FILE adds a forward-strand feature of type
    REALIGNED_SNP_TYPE starting at the row's position (converted to a
    0-based start) and as long as the alt allele with its square brackets
    removed. The unmodified 'ref' and 'alt' values are stored as qualifiers.
    The annotated genome is written to OUTPUT_GENOME (GenBank).
    """
    genome_record = SeqIO.read(INPUT_GENOME, 'genbank')

    with open(SNP_CSV_DATA_FILE) as input_fh:
        reader = csv.DictReader(input_fh, SNP_FIELD_NAMES)
        # Ignore header. next() works on Python 2 and 3; the default avoids
        # a StopIteration on an empty file.
        next(reader, None)
        for row in reader:
            feature_alt = row['alt'].replace('[', '').replace(']', '')
            feature_start = int(row['position']) - 1 # pythonic
            feature_end = feature_start + len(feature_alt)
            feature = SeqFeature(
                    location=FeatureLocation(feature_start, feature_end),
                    type=REALIGNED_SNP_TYPE,
                    strand=1)
            feature.qualifiers['ref'] = row['ref']
            feature.qualifiers['alt'] = row['alt']
            genome_record.features.append(feature)

    with open(OUTPUT_GENOME, 'w') as output_fh:
        SeqIO.write(genome_record, output_fh, 'genbank')
Beispiel #30
0
def extract_sequences_one_sample(args):
    '''
    Function for extracting protein sequences given annotated regions and
    produce fasta files that can be used for clustering.

    args is the tuple (fasta_path, annotation_path, sample, output_dir).
    The annotation loaded by load_annotation_pfam supplies, per domain ID,
    parallel lists of sequence names, starts, stops, strands and e-values.
    Every annotated region is extracted from the indexed fasta records and
    written to <output_dir>/forClustering/<sample>.fasta, with the domain ID
    and e-value appended to the record id.
    '''
    (fasta_path,annotation_path,sample, output_dir)=args
    domainInfo=load_annotation_pfam(annotation_path)
    print("Generating domain fasta sequences for "+sample+" ...")
    from Bio import SeqIO
    from Bio.SeqFeature import SeqFeature, FeatureLocation
    from Bio.SeqRecord import SeqRecord
    (annot,start,stop,strand,evalue)=domainInfo
    record_dict=index_fasta(fasta_path)
    recordlist=[]
    outfilename=output_dir +'/forClustering/'+ sample +'.fasta'
    # The context manager closes the output even if extraction fails.
    with open(outfilename,'w') as outhandle:
        for domainID in annot.keys():
            for i, domain in enumerate(annot[domainID]):
                try:
                    seq=record_dict[domain]
                except KeyError:
                    print("Error: " + domain + " not in fasta file.\n")
                    # As before, the remaining entries of this domain ID
                    # are abandoned once one sequence is missing.
                    break
                a=start[domainID][i]
                b=stop[domainID][i]
                seq_strand=strand[domainID][i]
                seq_evalue=evalue[domainID][i]
                # Substring tests are kept from the original code, so an
                # empty strand string still counts as '+'.
                if seq_strand in '+':
                    strand_value = 1
                elif seq_strand in '-':
                    strand_value = -1
                else:
                    # Previously an unknown strand silently reused the
                    # feature of the preceding entry; skip it instead.
                    print("Error: unknown strand " + repr(seq_strand) + " for " + domain + ".\n")
                    continue
                # NOTE(review): if a and b are 1-based inclusive, the end
                # should not be decremented - confirm against the annotation.
                domain_feature = SeqFeature(FeatureLocation(a-1, b-1), type="domain", strand=strand_value)
                feature_seq = domain_feature.extract(seq)
                feature_seq.id=feature_seq.id+' '+domainID+' '+seq_evalue
                recordlist.append(feature_seq)

        SeqIO.write(recordlist, outhandle, "fasta")

    print("Done")
Beispiel #31
0
 def add_track_with_sigils(self, **kwargs):
     """Add a track holding 18 features that cycle through three styles.

     The features are spread over positions 0-400, each 17 long, and cycle
     through strandless, forward and reverse styles. Extra keyword arguments
     are passed on to every ``add_feature`` call.
     """
     self.gdt_features = self.gdd.new_track(1, greytrack=False)
     self.gds_features = self.gdt_features.new_set()
     # (strand, name, color), selected by the feature index modulo 3.
     styles = (
         (None, "Strandless", colors.orange),
         (+1, "Forward", colors.red),
         (-1, "Reverse", colors.blue),
     )
     for i in range(18):
         start = int((400 * i) / 18.0)
         strand, name, color = styles[i % 3]
         feature = SeqFeature(FeatureLocation(start, start + 17), strand=strand)
         self.gds_features.add_feature(feature, name=name,
                                       color=color, label=True, **kwargs)
 def match_sequence_numbering(self):
     """ SequenceData.match_sequence_numbering
     Assign canonical sequence numbering to structural fragments.

     For every chain flagged in ``self.has_canonical``, each fragment of each
     model is located in the chain's canonical sequence. A feature with the
     resulting positions is appended to the fragment. The model's
     'match_numbering' flag becomes False when those positions differ from
     the fragment's first feature.

     Returns False when the object has no ``has_canonical`` attribute,
     True otherwise.
     """
     if not hasattr(self, 'has_canonical'):
         return False
     for ch_id, chain in self.data.items():
         if not (ch_id in self.has_canonical and self.has_canonical[ch_id]):
             continue
         for model in chain['pdb'].values():
             frgs = model['frgs']
             model['match_numbering'] = True
             for frag in frgs:
                 inic = chain['can'].seq.find(frag.seq) + 1
                 fin = inic + len(frag.seq) - 1
                 frag.features.append(SeqFeature(FeatureLocation(inic, fin)))
                 reference = frag.features[0].location
                 if inic != reference.start or fin != reference.end:
                     model['match_numbering'] = False
     return True
Beispiel #33
0
    def add_point_feature(self, resnum, feat_type=None, feat_id=None):
        """Append a single-residue feature to ``self.features``.

        Args:
            resnum (int): Protein sequence residue number; the feature covers
                positions ``resnum - 1`` to ``resnum``
            feat_type (str, optional): Description of the feature type
                (ie. 'catalytic residue'); a generic description is used when
                omitted
            feat_id (str, optional): ID of the feature type (ie. 'TM1')

        Raises:
            ValueError: If a feature file is associated with the sequence.

        """
        if self.feature_file:
            raise ValueError(
                'Feature file associated with sequence, please remove file association to append '
                'additional features.')

        residue_span = FeatureLocation(ExactPosition(resnum - 1),
                                       ExactPosition(resnum))
        label = feat_type if feat_type else 'Manually added protein sequence single residue feature'
        self.features.append(
            SeqFeature(location=residue_span, type=label, id=feat_id))
 def t_write_seqrecord(self):
     """Write a single SeqRecord with FASTA included and check its GFF line.

     The third line of the output must start with the record id.
     """
     gene = SeqFeature(
         FeatureLocation(0, 20),
         type="gene",
         strand=1,
         qualifiers={
             "source": "prediction",
             "score": 10.0,
             "other": ["Some", "annotations"],
             "ID": "gene1"
         })
     record = SeqRecord(Seq("GATCGATCGATCGATCGATC"), "ID1")
     record.features = [gene]

     sink = StringIO()
     GFF.write([record], sink, include_fasta=True)
     third_line = sink.getvalue().split("\n")[2]
     assert third_line.split("\t")[0] == "ID1"
    def long_sigils(self, glyph):
        """Check feature sigils within bounding box.

        Draws a forward, a reverse and a strandless feature, each first as a
        plain backdrop and then again with the given sigil, and finishes the
        diagram under a name derived from ``glyph``.
        """
        # BIGARROW sigils straddle the axis, so their backdrop is strandless
        # and forward/reverse must not be drawn on top of each other.
        straddles_axis = glyph in ["BIGARROW"]
        span = (25, 375)

        # Forward: a track of features, bigger height to emphasise any sigil
        # errors, with one feature set.
        self.gdt_features = self.gdd.new_track(1, greytrack=True, height=3)
        self.gds_features = self.gdt_features.new_set()
        if straddles_axis:
            backdrop = SeqFeature(FeatureLocation(*span), strand=None)
            forward = SeqFeature(FeatureLocation(*span), strand=+1)
        else:
            forward = SeqFeature(FeatureLocation(*span), strand=+1)
            backdrop = forward
        self.gds_features.add_feature(backdrop, color="lightblue")
        self.gds_features.add_feature(
            forward, name="Forward", sigil=glyph, color="blue", arrowhead_length=2.0
        )

        # Reverse: gets a track of its own only for axis-straddling sigils.
        if straddles_axis:
            self.gdt_features = self.gdd.new_track(1, greytrack=True, height=3)
            self.gds_features = self.gdt_features.new_set()
            backdrop = SeqFeature(FeatureLocation(*span), strand=None)
            reverse = SeqFeature(FeatureLocation(*span), strand=-1)
        else:
            reverse = SeqFeature(FeatureLocation(*span), strand=-1)
            backdrop = reverse
        self.gds_features.add_feature(backdrop, color="pink")
        self.gds_features.add_feature(
            reverse, name="Reverse", sigil=glyph, color="red", arrowhead_length=2.0
        )

        # Strandless: another track, again with bigger height.
        self.gdt_features = self.gdd.new_track(1, greytrack=True, height=3)
        self.gds_features = self.gdt_features.new_set()
        strandless = SeqFeature(FeatureLocation(*span), strand=None)
        self.gds_features.add_feature(strandless, color="lightgreen")
        self.gds_features.add_feature(
            strandless, name="Standless", sigil=glyph, color="green", arrowhead_length=2.0
        )
        self.finish(f"GD_sigil_long_{glyph}")
Beispiel #36
0
def check_sub(feature, sequence):
    """Build merged CDS features from the sub-feature tree of *feature*.

    Children that have sub-features of their own are processed recursively.
    Direct children of type 'CDS' are merged into a single CDS feature, but
    only when the recursion produced nothing (i.e. at the tip of the tree).

    :param feature: object exposing ``sub_features``.
    :param sequence: record whose ``.seq`` is used to compute the translation.
    :return: list of new CDS features (possibly empty).
    """
    collected = []
    cds_locations = []
    shared_quals = {}
    mismatched = []

    for child in feature.sub_features:
        if child.sub_features:
            # Not a leaf yet: descend and keep whatever the subtree yields.
            collected.extend(check_sub(child, sequence))
            continue
        if child.type != 'CDS':
            continue

        begin = child.location.start.real
        finish = child.location.end.real
        cds_locations.append(
            FeatureLocation(begin, finish, strand=child.strand))

        # The merged feature keeps a qualifier only when every CDS piece
        # carries the same value for it; remember the disagreeing keys.
        for key in child.qualifiers.keys():
            value = child.qualifiers[key]
            if key not in shared_quals:
                shared_quals[key] = value
            if not shared_quals[key] == value:
                mismatched.append(key)

    for key in mismatched:
        shared_quals.pop(key, None)
    shared_quals.pop('Parent', None)

    # Results coming back from a deeper level are returned untouched, and
    # without any CDS piece there is nothing to build.
    if collected or not cds_locations:
        return collected

    if len(cds_locations) == 1:
        merged_location = cds_locations[0]
    else:
        ordered = sorted(cds_locations, key=lambda piece: piece.start.real)
        if ordered[0].strand != 1:
            # Anything but the forward strand is joined in descending order.
            ordered = list(reversed(ordered))
        merged_location = CompoundLocation(ordered)

    cds = SeqFeature(merged_location)
    cds.qualifiers = shared_quals
    cds.type = 'CDS'
    protein = cds.extract(sequence.seq).translate(stop_symbol='')
    cds.qualifiers['translation'] = [str(protein)]
    collected.append(cds)

    return collected
Beispiel #37
0
    def setUp(self):
        """Create six minus-strand exon features plus fusion fixtures.

        The random generator is seeded so that runs are reproducible.
        """
        random.seed(1)

        def minus_exon(begin, finish, label):
            # All fixture exons share the strand and type; only the exact
            # bounds and the id differ.
            span = FeatureLocation(ExactPosition(begin),
                                   ExactPosition(finish),
                                   strand=-1)
            return SeqFeature(span, type='exon', id=label)

        self.sf1 = minus_exon(100, 500, 'exon_1')
        self.sf2 = minus_exon(800, 1000, 'exon_2')
        self.sf3 = minus_exon(1500, 2000, 'exon_3')
        self.sf4 = minus_exon(10000, 15000, 'exon_4')
        self.sf5 = minus_exon(17000, 20000, 'exon_5')
        self.sf6 = minus_exon(22000, 25000, 'exon_6')

        # One FusionExon wrapper per raw feature, built in the same order.
        self.exon1 = FusionExon(self.sf1)
        self.exon2 = FusionExon(self.sf2)
        self.exon3 = FusionExon(self.sf3)
        self.exon4 = FusionExon(self.sf4)
        self.exon5 = FusionExon(self.sf5)
        self.exon6 = FusionExon(self.sf6)

        # First three exons form one group, last three the other.
        self.sfs1 = [self.sf1, self.sf2, self.sf3]
        self.sfs2 = [self.sf4, self.sf5, self.sf6]
        self.exons1 = [self.exon1, self.exon2, self.exon3]
        self.exons2 = [self.exon4, self.exon5, self.exon6]

        # Two events over the same exon groups with opposite strand pairs.
        self.FE1 = FusionEvent(self.sfs1, self.sfs2, "+", "-")
        self.FE2 = FusionEvent(self.sfs1, self.sfs2, "-", "+")
Beispiel #38
0
def mergeMod(args):
    """Concatenate all records of a sequence file into one record and write it.

    Every input record is marked in the merged record by a ``fasta_record``
    feature whose ``note`` qualifier holds that record's name. All existing
    ``source`` features are dropped and replaced by a single ``source``
    feature spanning the whole merged sequence.

    Args:
        args: object providing ``input`` and ``output`` (file paths),
            ``inFormat``, ``outFormat`` (``None`` reuses ``inFormat``),
            ``accession`` (optional replacement for the record name) and
            ``ver`` (optional version appended to the name to form the id).

    Raises:
        IndexError: if the input file contains no records.
    """
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    filetype = args.inFormat
    # The context manager releases the handle even when parsing fails.
    with open(args.input, "r") as in_handle:
        recs = list(SeqIO.parse(in_handle, filetype))

    first = recs[0]
    fgbk = first
    marker = SeqFeature(FeatureLocation(0, len(fgbk)), type="fasta_record",
                        strand=1)
    marker.qualifiers["note"] = first.name
    fgbk.features.append(marker)
    for rec in recs[1:]:
        # The marker is placed where the record is about to be appended.
        marker = SeqFeature(
            FeatureLocation(len(fgbk), len(fgbk) + len(rec)),
            type="fasta_record", strand=1)
        marker.qualifiers["note"] = rec.name
        fgbk.features.append(marker)
        fgbk += rec

    # The merged record takes its descriptive fields from the first record.
    fgbk.name = first.name
    fgbk.description = first.description
    fgbk.annotations = first.annotations
    if args.accession is not None:
        fgbk.name = args.accession
    if args.ver is not None:
        fgbk.id = fgbk.name + '.' + args.ver

    # Filter instead of calling remove() inside a loop over the same list:
    # removing while iterating skips the element after each removal, so
    # adjacent 'source' features used to survive.
    fgbk.features[:] = [f for f in fgbk.features if f.type != 'source']
    whole = SeqFeature(FeatureLocation(0, len(fgbk)), type="source", strand=1)
    fgbk.features.insert(0, whole)

    outtype = filetype if args.outFormat is None else args.outFormat
    # Closing the handle guarantees the output is flushed to disk.
    with open(args.output, "w") as out_handle:
        SeqIO.write(fgbk, out_handle, outtype)
Beispiel #39
0
    def test_translate(self):
        """Check which annotations SeqRecord.translate() carries over."""
        record = SeqRecord(
            Seq("ATGGTGTAA"),
            id="TestID",
            name="TestName",
            description="TestDescription",
            dbxrefs=["TestDbxrefs"],
            features=[SeqFeature(FeatureLocation(0, 3), type="Site")],
            annotations={"organism": "bombyx"},
            letter_annotations={"test": "abcdefghi"},
        )

        # Default call: stop codon kept, every annotation left at its
        # "unknown"/empty default.
        plain = record.translate()
        self.assertEqual(plain.seq, "MV*")
        self.assertEqual(plain.id, "<unknown id>")
        self.assertEqual(plain.name, "<unknown name>")
        self.assertEqual(plain.description, "<unknown description>")
        for dropped in (plain.dbxrefs, plain.features,
                        plain.annotations, plain.letter_annotations):
            self.assertFalse(dropped)

        # Explicitly requested fields are copied; features and per-letter
        # annotations are expected to stay empty.
        requested = {
            "cds": True,
            "id": True,
            "name": True,
            "description": True,
            "dbxrefs": True,
            "annotations": True,
        }
        kept = record.translate(**requested)
        self.assertEqual(kept.seq, "MV")
        self.assertEqual(kept.id, "TestID")
        self.assertEqual(kept.name, "TestName")
        self.assertEqual(kept.description, "TestDescription")
        self.assertEqual(kept.dbxrefs, ["TestDbxrefs"])
        self.assertFalse(kept.features)
        self.assertEqual(kept.annotations, {"organism": "bombyx"})
        self.assertFalse(kept.letter_annotations)
Beispiel #40
0
    def execute(self, params: dict):
        """Write every assembly in ``params[0]`` to ``example.gb`` as GenBank.

        One SeqRecord is built per assembly from ``assembly.get_sequence()``;
        each entry of ``assembly.get_parts()`` becomes a feature of type
        ``'part'`` named after the part's identifier.

        Args:
            params: indexable container whose first item is the iterable of
                assemblies to export.
        """
        records: List[SeqRecord] = []

        assembly_list: AssemblyResultSet = params[0]
        for assembly in assembly_list:
            assembly_sequence = assembly.get_sequence()

            # NOTE(review): the part features are created without a location,
            # so they only carry the part identifier — confirm whether
            # per-part coordinates should be added.
            features = [
                SeqFeature(
                    id=part.identifier,
                    qualifiers={'name': part.identifier},
                    type='part')
                for part in assembly.get_parts()
            ]

            record = SeqRecord(
                assembly_sequence.seq,
                id='123456789',  # placeholder accession number
                name='Example',
                description='An example GenBank file generated by SuperGSL')
            record.features.extend(features)

            records.append(record)

        # The context manager closes (and flushes) the file even if the
        # writer raises; the original left the handle open.
        with open('example.gb', 'w') as output_file:
            SeqIO.write(records, output_file, 'genbank')
Beispiel #41
0
def _easy_seqrec(
    str_seq: str,
    id,
    annotation_type: str = "misc_feature",
    start=0,
    end=None,
    **qualifiers: list,
) -> SeqRecord:
    """Return an annotated SeqRecord from a string and id.

    The record carries exactly one forward-strand feature.

    Args:
        str_seq: sequence of SeqRecord.
        id : Identifier for new part.
        annotation_type (optional): Equivalent to Bio.SeqFeature type e.g. CDS,
            Defaults to "misc_feature".
        start (optional): start of the annotation, Defaults to 0.
        end (optional): end of the annotation, if None defaults to len(str_seq),
            Defaults to 'None'.
        **qualifiers: equivalent to Bio.SeqFeature.qualifiers for annotation e.g. standard_name=["LMP"].

    Return:
        SeqRecord: An annotated SeqRecord.
    """
    # Only a missing end falls back to the full length; an explicit 0 is
    # passed through instead of being silently replaced (``not end`` used
    # to treat 0 like None).
    if end is None:
        end = len(str_seq)
    annotation = SeqFeature(
        type=annotation_type,
        location=FeatureLocation(start=start, end=end, strand=+1),
        # Copy so the feature does not share the kwargs mapping.
        qualifiers=dict(qualifiers),
    )
    return SeqRecord(Seq(str_seq), id=id, features=[annotation])
Beispiel #42
0
    def to_record(self, record=None, record_id=None):
        """Return a Biopython SeqRecord describing this quote.

        When ``record`` is given, a deep copy of it is annotated; otherwise a
        fresh DNA record is created from ``self.sequence``. If the quote has
        an assembly plan, one feature per planned segment is put in front of
        the record's existing features.

        >>> record = to_record(solution)
        >>> # Let's plot with DnaVu:
        >>> from dnavu import create_record_plot
        >>> from bokeh.io import output_file, show
        >>> output_file("view.html")
        >>> plot = create_record_plot(record)
        >>> show(plot)
        """
        record_id = self.id if record_id is None else record_id

        if record is not None:
            # Work on a copy so the caller's record is left untouched.
            record = deepcopy(record)
        else:
            if has_dna_alphabet:  # Biopython <1.78
                sequence = Seq(self.sequence, DNAAlphabet())
            else:
                sequence = Seq(self.sequence)
            record = SeqRecord(sequence, id=record_id)
            record.annotations["molecule_type"] = "DNA"

        plan = self.assembly_plan
        if plan is None:
            return record

        plan_features = []
        for segment, quote in plan.items():
            span = FeatureLocation(segment[0], segment[1], 1)
            details = {
                "name": quote.id,
                "source": quote.source,
                "price": quote.price,
                "lead_time": quote.lead_time,
            }
            plan_features.append(
                SeqFeature(span, type="Feature", qualifiers=details))
        record.features = plan_features + record.features
        return record
Beispiel #43
0
    def test_replacer__TGA_codon_mid_seq(self):
        """Tests that a mid-sequence TGA codon is not replaced,
        as it codes for Selenocysteine.
        """
        codons_to_remove = ['TGA']

        # Minimal synonym table for the test: stop codons only.
        stop_codon_table = {
            '*': {'TGA': {}, 'TAG': {}},
        }
        usage_memex = CodonUsageMemex(stop_codon_table)

        # A single CDS covering the whole record; TGA occurs once in the
        # middle and once as the final codon.
        codons = ['TTG', 'GCT', 'TGA', 'TTG', 'TGA']
        feature_seq = ''.join(codons)
        seq_record = SeqRecord(Seq(feature_seq, generic_dna))

        cds_location = FeatureLocation(0, len(feature_seq), strand=1)
        cds = SeqFeature(cds_location, type='CDS', id='1')
        seq_record.features.append(cds)

        replacer = GraphSearchCodonReplacer(codons_to_remove, usage_memex)

        # Perform replacement.
        outcome = replacer.replace_codons_in_feature(cds.id, seq_record)

        # Assert successful fix.
        self.assertTrue(outcome['is_success'])

        # Only the terminal TGA is expected to change (to TAG); the
        # mid-sequence TGA must be left in place.
        expected_seq = ''.join(codons[:-1] + ['TAG'])
        self.assertEqual(expected_seq, str(outcome['new_feature_seq']))
Beispiel #44
0
def gb_create(sequence, nb_strain, inp, path, name_directory):
    """Write the device sequence of one strain to a GenBank file.

    The file is created at ``<path>/<name_directory>_Strain<nb_strain>.gb``.

    Args:
        sequence: re-iterable collection of items indexed as
            ``[0]`` feature name (used as both id and type), ``[1]`` DNA
            string and ``[2]`` strand. It is traversed twice.
        nb_strain: strain number as a string (it is concatenated into the
            record name, description and file name).
        inp: suffix appended to the record name.
        path: output directory.
        name_directory: prefix of the output file name.
    """
    # Full device sequence = concatenation of every part, in order.
    DNA_seq = ''.join(part[1] for part in sequence)

    # creation of the formatted DNA sequence for the genbank file
    seq_final = Seq(DNA_seq, IUPAC.unambiguous_dna)

    record = SeqRecord(
        seq_final,
        id='NA',  # placeholder accession
        name='Strain' + nb_strain + '_' + inp,
        description='Sequence of the computational device in the strain' +
        nb_strain + ' to implement the Boolean function required')

    # Running offset of the current part inside the concatenated sequence.
    offset = 0
    for feat in sequence:
        part_length = len(feat[1])
        feature = SeqFeature(FeatureLocation(start=offset,
                                             end=offset + part_length),
                             id=feat[0],
                             type=feat[0],
                             strand=feat[2])
        record.features.append(feature)
        offset += part_length

    # Save as GenBank file; the context manager closes the handle even if
    # the writer raises (the original leaked it in that case).
    with open(path + '/' + name_directory + '_Strain' + nb_strain + '.gb',
              'w') as output_file:
        SeqIO.write(record, output_file, 'genbank')
Beispiel #45
0
def bed2SeqFeature( bedlist,window,expcount,cnvlimit ):
    """Generate (SeqFeature, color) pairs for regions with a copy-number shift.

    For each ``(start, end, count)`` entry the log2 ratio of the observed
    count over the expected count is computed (the expected count is scaled
    down for regions shorter than ``window``; a zero count gives ``-inf``).
    Regions whose ratio passes 1x, 2x or 3x ``cnvlimit`` in either direction
    are returned with a blue (loss) or red (gain) shade; the others are
    left out.
    """
    flagged = []
    for start, end, count in bedlist:
        if count:
            expected = expcount
            span = end - start
            if span < window:
                # Region shorter than the window: scale the expectation.
                expected = 1.0 * span / window * expcount
            ratio = log(count / expected, 2)
        else:
            # No reads at all is treated as a deletion.
            ratio = float('-inf')

        region = SeqFeature(FeatureLocation(start, end))

        # Strongest thresholds are tested first, losses before gains.
        if ratio <= -3 * cnvlimit:
            shade = colors.darkblue
        elif ratio <= -2 * cnvlimit:
            shade = colors.blue
        elif ratio <= -1 * cnvlimit:
            shade = colors.lightblue
        elif ratio >= 3 * cnvlimit:
            shade = colors.darkred
        elif ratio >= 2 * cnvlimit:
            shade = colors.red
        elif ratio >= 1 * cnvlimit:
            shade = colors.lightsalmon
        else:
            shade = 0

        if shade:
            flagged.append((region, shade))
    return flagged
    def plan_step_to_record(plan_step, record=None, record_id=None):
        """Return a Biopython SeqRecord describing a plan step (quote).

        When ``record`` is given, a deep copy of it is annotated; otherwise a
        fresh DNA record is created from ``plan_step.sequence``. If the step
        has an assembly plan, one ``misc_feature`` per planned quote is put
        in front of the record's existing features.

        >>> record = to_SeqRecord(solution)
        >>> # Let's plot with DnaVu:
        >>> from dnavu import create_record_plot
        >>> from bokeh.io import output_file, show
        >>> output_file("view.html")
        >>> plot = create_record_plot(record)
        >>> show(plot)
        """
        record_id = plan_step.id if record_id is None else record_id

        if record is not None:
            # Annotate a copy so the caller's record is left untouched.
            record = deepcopy(record)
        else:
            if has_dna_alphabet:  # Biopython <1.78
                sequence = Seq(plan_step.sequence, DNAAlphabet())
            else:
                sequence = Seq(plan_step.sequence)
            record = SeqRecord(sequence, id=record_id)
            record.annotations["molecule_type"] = "DNA"

        plan = plan_step.assembly_plan
        if plan is None:
            return record

        plan_features = []
        for q in plan:
            span = FeatureLocation(q.segment_start, q.segment_end, 1)
            details = {
                "label": "%s - From %s" % (q.id, q.source),
                "name": q.id,
                "source": q.source,
                "price": q.price,
                "lead_time": q.lead_time,
            }
            plan_features.append(
                SeqFeature(span, type="misc_feature", qualifiers=details))
        record.features = plan_features + record.features
        return record
Beispiel #47
0
    def locations_to_biopython_features(
        self,
        feature_type="misc_feature",
        color="red",
        label_prefix="",
        merge_overlapping=False,
    ):
        """Convert ``self.locations`` into a list of Biopython features.

        Parameters
        ----------
        feature_type
          Genbank type given to every feature.

        color
          Value stored in each feature's ``color`` qualifier.

        label_prefix
          Each feature is labelled ``"<prefix> <specification>"``, where the
          second part is ``str(self.specification)``.

        merge_overlapping
          If true, overlapping locations (0-5, 2-9) are first merged into a
          single one (0-9).
        """
        if merge_overlapping:
            spans = Location.merge_overlapping_locations(self.locations)
        else:
            spans = self.locations

        annotations = []
        for span in spans:
            bio_location = span.to_biopython_location()
            # A fresh qualifier dict per feature so they never share state.
            tags = dict(
                label=label_prefix + " " + str(self.specification),
                color=color,
            )
            annotations.append(
                SeqFeature(bio_location, type=feature_type, qualifiers=tags))
        return annotations
Beispiel #48
0
def annotate_record(
    seqrecord,
    location="full",
    feature_type="misc_feature",
    margin=0,
    **qualifiers
):
    """Append one feature to a Biopython SeqRecord (in place).

    Parameters
    ----------

    seqrecord
      The biopython seqrecord to be annotated.

    location
      Either (start, end) or (start, end, strand); strand defaults to +1.
      The default "full" covers the whole record, shrunk by ``margin``.

    feature_type
      The type associated with the feature

    margin
      Number of bases left out at each end of the record. Only used when
      ``location`` is "full".

    qualifiers
      Keyword arguments that become the Biopython feature's `qualifiers`.
    """
    if location == "full":
        location = (margin, len(seqrecord) - margin)

    if len(location) == 3:
        strand = location[2]
    else:
        strand = 1

    span = FeatureLocation(location[0], location[1], strand)
    annotation = SeqFeature(span, qualifiers=qualifiers, type=feature_type)
    seqrecord.features.append(annotation)
Beispiel #49
0
def convert_feature(feature,
                    qualifier_transformers=DEFAULT_QUALIFIER_TRANSFORMERS):
    """Return a cleaned-up copy of *feature*.

    :param SeqFeature feature: the feature to convert.
    :param qualifier_transformers: a set of transformation functions used to
        clean up the qualifiers. Each one is called with the current
        qualifiers and a dict copy of them, and its result is fed to the
        next one.
    :return: a :class:`SeqFeature` with valid GenBank qualifiers and a
        feature type that is a Sequence Ontology term.
    """
    so_type = convert_feature_type(feature)

    # Thread the qualifiers through every transformer in turn.
    current = feature.qualifiers
    for transform in qualifier_transformers:
        current = transform(current, dict(current))

    # finally, remove all qualifiers that do not belong to a certain feature key
    cleaned = remove_qualifiers_inappropriate_for_feature(
        current, dict(current), genbank_feature_key(so_type))

    return SeqFeature(location=feature.location,
                      type=so_type,
                      id=feature.id,
                      qualifiers=cleaned)
Beispiel #50
0
def get_genome(inputfile):
    """Read a GFF file with an embedded ``##FASTA`` section.

    The first two lines are skipped; every following line up to the
    ``##FASTA`` marker is turned into a SeqFeature. The line after the
    marker is the FASTA header and all remaining lines form the sequence.

    :param inputfile: path of the file to read.
    :return: tuple ``(genome_id, genome_seq, genome_features)`` where
        ``genome_seq`` is a plain string.
    :raises ValueError: if the file has no ``##FASTA`` line.
    """
    with open(inputfile) as handle:
        lines = handle.readlines()
    fasta_marker = lines.index("##FASTA\n")
    # Header line without its leading '>' and surrounding whitespace.
    genome_id = lines[fasta_marker + 1][1:].strip()

    genome_features = []
    for row in lines[2:fasta_marker]:
        fields = row.split()
        begin = int(fields[3])
        finish = int(fields[4])
        strand = stranddict[fields[6]]
        # Locus tag = text after the first '=' of the attribute column.
        tag = fields[8].split('=')[1]
        genome_features.append(
            SeqFeature(FeatureLocation(start=begin, end=finish, strand=strand),
                       type=fields[2],
                       qualifiers={'locus_tag': [tag]}))

    genome_seq = "".join(chunk.strip() for chunk in lines[fasta_marker + 2:])
    return (genome_id, genome_seq, genome_features)
Beispiel #51
0
def gb_create(DNA_seq, list_feat, name, directory):
    """Write a DNA sequence and its features to ``<directory>/<name>.gb``.

    Args:
        DNA_seq: the sequence as a string.
        list_feat: iterable of items indexed as ``[0]`` start, ``[1]`` end,
            ``[2]`` feature name (used as both id and type) and ``[3]``
            strand.
        name: output file name without extension.
        directory: output directory.
    """
    # creation of the formatted DNA sequence for the genbank file
    seq_final = Seq(DNA_seq, IUPAC.unambiguous_dna)
    record = SeqRecord(
        seq_final,
        id='NA',  # placeholder accession
        name='',
        description='Synthetic sequence')

    # One GenBank feature per entry of the feature list.
    for feat in list_feat:
        record.features.append(
            SeqFeature(FeatureLocation(start=feat[0], end=feat[1]),
                       id=feat[2],
                       type=feat[2],
                       strand=feat[3]))

    # Save as GenBank file; the context manager closes the handle even if
    # the writer raises (the original leaked it in that case).
    with open(directory + '/' + name + '.gb', 'w') as output_file:
        SeqIO.write(record, output_file, 'genbank')
    def test_find_features_starting_at(self):
        """find_features_starting_at() only reports features that begin
        exactly at the queried position.
        """
        before = 'TACTAGTCGAT'
        feature_1_seq = 'ATGAAAGGGATG'
        after = 'CTATCTAGCTAGCT'
        whole_seq = Seq(before + feature_1_seq + after, generic_dna)
        seq_record = SeqRecord(whole_seq)

        # A single CDS placed right after the `before` flank.
        feature_1_loc = FeatureLocation(len(before),
                                        len(before) + len(feature_1_seq),
                                        strand=1)
        feature_1 = SeqFeature(feature_1_loc, type='CDS', id='1')
        seq_record.features.append(feature_1)

        # assertEqual replaces the deprecated assertEquals alias, which no
        # longer exists in Python 3.12.

        # Nothing starts at position 0.
        self.assertEqual(set(),
                         set(find_features_starting_at(0, seq_record)))
        # The CDS is found at its own start.
        self.assertEqual(
            set([feature_1]),
            set(find_features_starting_at(feature_1_loc.start, seq_record)))
        # Nothing is reported at the CDS end; the extra ['misc'] argument is
        # presumably a feature-type filter — confirm against the helper.
        self.assertEqual(
            set(),
            set(
                find_features_starting_at(feature_1_loc.end, seq_record,
                                          ['misc'])))
Beispiel #53
0
def bed2SeqFeature(bed1, expcount1, bed2, expcount2, window):
    """Generate (SeqFeature, color) pairs from two paired BED lists.

    Both lists are sorted in place and walked in lockstep; coordinates are
    taken from ``bed1``. For each pair:
    - if the log2 value of the ``bed2`` count is positive, the region is
      left out;
    - else if the log2 value of the ``bed1`` count is positive, it is
      coloured "orange";
    - otherwise it is coloured "darkgrey".

    NOTE(review): the original comments described the orange/darkgrey
    assignment the other way round — confirm which one is intended.
    """
    gdata = []
    bed1.sort()
    bed2.sort()
    for first, second in zip(bed1, bed2):
        s, e, c1 = first
        _, _, c2 = second

        if get_log2(s, e, c2, expcount2, window) > 0:
            # Enriched in the second track: not reported at all.
            continue
        if get_log2(s, e, c1, expcount1, window) > 0:
            shade = "orange"
        else:
            shade = "darkgrey"

        gdata.append((SeqFeature(FeatureLocation(s, e)), shade))
    return gdata
 def extract_target(self):
     """Fill the 'target' column of ``self.inter_df`` with extracted sequences.

     For each row the genome record named by 'chr' is fetched and the region
     between 'start' and 'end' on the given 'strand' is extracted from it.
     A running row counter is printed. ``self.inter_df`` is finally replaced
     by a copy of the updated frame.
     """
     inter_df = self.inter_df
     for counter, (index, row) in enumerate(inter_df.iterrows(), start=1):
         print(counter)
         genome = self.get_genome_by_id(row['chr'])

         sign = row['strand']
         assert sign == '+' or sign == '-', "strand value incorrect {}".format(
             sign)
         strand = 1 if sign == '+' else -1

         # 'start' is shifted by one so that the pair follows Python's
         # half-open numbering scheme.
         begin = row['start'] - 1
         finish = row['end']
         target_feature = SeqFeature(
             FeatureLocation(begin, finish, strand=strand))
         target = target_feature.location.extract(genome)
         inter_df.loc[index, 'target'] = str(target.seq)
     self.inter_df = inter_df.copy()
Beispiel #55
0
 def _translate_feature(fi, rec, table):
     f = rec.features[fi]
     srec = f.extract(rec)
     try:
         tsec = srec.seq.translate(table)
         if tsec[-1] == '*': tsec = tsec[:-1]
         try:
             fid = f.qualifiers['locus_tag'][0]
         except KeyError:
             fid = '%s_f%d' % (rec.id, fi)
         trec = SeqRecord(tsec,
                          id=fid,
                          name=rec.name,
                          description=rec.description,
                          annotations=rec.annotations)
         pf = SeqFeature(FeatureLocation(0, len(trec)),
                         id=f.id,
                         type='CDS',
                         qualifiers=f.qualifiers)
         trec.features.append(pf)
     except Exception, e:
         print e
         raise RuntimeError('Unable to translate: %s' % str(srec.seq))
Beispiel #56
0
def make_gbk_from_gff(infile, outfile):
    """Convert a GFF file with embedded FASTA into a GenBank file.

    The genome is read with get_genome(); the record gets one 'source'
    feature over its full length and, for every parsed feature, a 'gene'
    feature followed by a 'CDS' feature carrying a translation.

    :param infile: path of the GFF input.
    :param outfile: target passed to SeqIO.write.
    """
    g_id, g_seq, g_features = get_genome(infile)
    genome = Seq.Seq(g_seq, IUPAC.unambiguous_dna)
    record = SeqRecord(genome, id=g_id, name=g_id)
    record.features.append(
        SeqFeature(FeatureLocation(0, len(genome)), type="source"))

    for cds in g_features:
        cds.qualifiers['transl_table'] = [11]

        # NOTE(review): both slice bounds are shifted by one, so the last
        # base of the feature is left out, and translate() is called without
        # a table despite transl_table=11 — confirm both are intended.
        coding = genome[cds.location.start - 1:cds.location.end - 1]
        if cds.location.strand != 1:
            coding = coding.reverse_complement()
        cds.qualifiers['translation'] = [str(coding.translate())]
        cds.type = "CDS"
        cds.qualifiers['product'] = cds.qualifiers['locus_tag']

        # The gene feature shares the location but keeps only the locus tag.
        gene = copy.deepcopy(cds)
        gene.type = 'gene'
        gene.qualifiers = {'locus_tag': cds.qualifiers['locus_tag']}

        record.features.extend([gene, cds])
    SeqIO.write(record, outfile, 'genbank')
 def load_annotation(self, gff):
     """Attach features from a tab-separated GFF file to ``self.sequences``.

     Only lines containing the text 'gene' are considered, and only when
     their first column is a key of ``self.sequences``. Each such line must
     have exactly nine tab-separated columns. If the file cannot be opened
     or read, a message is printed and the program exits.
     """
     print("Loading annotation...")
     strand_codes = {'+': 1, '-': -1}
     try:
         with open(gff, 'r') as handle:
             for line in handle:
                 if 'gene' not in line:
                     continue
                 (scaffold, source, gfftype, start, end,
                  score, strand, phase, attributes) = line.rstrip().split('\t')
                 if scaffold not in self.sequences:
                     continue
                 # Any strand symbol other than '+'/'-' maps to None.
                 span = FeatureLocation(ExactPosition(start),
                                        ExactPosition(end),
                                        strand=strand_codes.get(strand))
                 self.sequences[scaffold].features.append(
                     SeqFeature(span, type=gfftype))
     except IOError:
         print("Can't load annotation from file {}!".format(gff))
         sys.exit()
Beispiel #58
0
def add_jaggies(contig_seq, offset, gd_contig_features):
    """Add a JAGGY feature for every run of N (or X) in the sequence.

    A run is reported when it contains at least MIN_GAP_JAGGY consecutive
    N/X characters (case-insensitive); the feature covers the whole run,
    shifted by ``offset``. Nothing is done when MIN_GAP_JAGGY is falsy.
    """
    if not MIN_GAP_JAGGY:
        # Jaggies are switched off.
        return

    # Normalise so that only "N" has to be searched for.
    contig_seq = contig_seq.upper().replace("X", "N")
    total = len(contig_seq)
    minimal_gap = "N" * MIN_GAP_JAGGY

    cursor = 0
    while cursor < total:
        gap_start = contig_seq.find(minimal_gap, cursor)
        if gap_start == -1:
            break

        # Walk to the first non-N character (or the end of the sequence).
        gap_end = gap_start
        while gap_end < total and contig_seq[gap_end] == "N":
            gap_end += 1

        gap = SeqFeature(FeatureLocation(offset + gap_start, offset + gap_end))
        gd_contig_features.add_feature(gap,
                                       sigil="JAGGY",
                                       color=colors.slategrey,
                                       border=colors.black)
        # The character at gap_end is known not to be N; resume after it.
        cursor = gap_end + 1
Beispiel #59
0
    def to_biopython_feature(self,
                             feature_type="misc_feature",
                             role="constraint",
                             colors_dict=None,
                             **qualifiers):
        """Return a Biopython feature representing the specification.

        The feature is located at the specification's location. Its
        qualifiers are the keyword arguments plus ``role``, a ``label``
        (built with ``self.label`` unless one was passed) and a ``color``
        (looked up by role in ``colors_dict`` unless one was passed).
        """
        palette = colors_dict
        if palette is None:
            palette = {"constraint": "#355c87", "objective": "#f9cd60"}

        qualifiers["role"] = role
        # Label and color are computed lazily: explicit values win and the
        # defaults are not even evaluated in that case.
        if "label" not in qualifiers:
            qualifiers['label'] = self.label(role=role,
                                             with_location=False,
                                             assignment=':')
        if "color" not in qualifiers:
            qualifiers['color'] = palette[role]

        bio_location = self.location.to_biopython_location()
        return SeqFeature(bio_location,
                          type=feature_type,
                          qualifiers=qualifiers)
Beispiel #60
0
    def organism_iterator(self, organism, seq_map=None):
        """Yield one SeqRecord per stored contig of *organism*.

        The sequence of each contig is taken from ``seq_map`` (looked up by
        contig name) when a map is given, otherwise from the stored contig
        itself. Features are annotated with data from the matching Protein
        entry.
        """
        for dbcontig in Contig.objects(organism=organism).no_cache():
            if seq_map:
                seq = str(seq_map[dbcontig.name].seq)
            else:
                seq = dbcontig.seq
            contig = SeqRecord(id=dbcontig.name, seq=Seq(seq))
            for dbfeature in dbcontig.features:
                qualifiers = {"locus_tag": [dbfeature.locus_tag]}
                # Proteins of this organism whose gene matches the feature id.
                p = list(
                    Protein.objects(organism=organism,
                                    gene=dbfeature.identifier))
                # NOTE(review): a feature is only added when a protein was
                # found; features without one are dropped — confirm intended.
                if p:
                    # Only the first matching protein is used.
                    p = p[0]
                    qualifiers["description"] = [p.description]
                    # NOTE(review): stored as-is, not wrapped in a list like
                    # the other qualifiers — confirm the type of p.gene.
                    qualifiers["gene_symbol"] = p.gene
                    qualifiers["Note"] = [p.description]

                    # Upper-cased ontology terms, split by their prefix.
                    ecs = [
                        x.upper() for x in p.ontologies if x.startswith("ec:")
                    ]
                    gos = [
                        x.upper() for x in p.ontologies if x.startswith("go:")
                    ]
                    if ecs:
                        qualifiers["EC"] = ecs
                    if gos:
                        qualifiers["GO"] = gos
                    feature = SeqFeature(id=dbfeature.identifier,
                                         type=dbfeature.type,
                                         qualifiers=qualifiers,
                                         location=FeatureLocation(
                                             start=dbfeature.location.start,
                                             end=dbfeature.location.end,
                                             strand=dbfeature.location.strand))
                    contig.features.append(feature)
            yield contig