Ejemplo n.º 1
0
    def to_biopython(self, qualifiers: Dict[str, Any] = None) -> List[SeqFeature]:
        """ Converts this feature into one or more SeqFeature instances.

            Subclasses must manage their own attributes and potential extra
            features.

            Arguments:
                qualifiers: optional extra qualifiers to add to the generated
                            SeqFeature; any "note" entries are merged with the
                            notes of this feature. The mapping that is passed
                            in is left unmodified.

            Returns:
                a list containing the single SeqFeature built from this feature
        """
        feature = SeqFeature(self.location, type=self.type)
        quals = self._qualifiers.copy()
        stored_notes = self._qualifiers.get("note", [])
        assert stored_notes is not None
        # work on a fresh list: extending the stored list in place would add
        # self.notes to the instance's own qualifiers again on every call
        notes = list(stored_notes)
        notes.extend(self.notes)
        if qualifiers:
            # shallow copy so that the caller's mapping keeps its "note" entry
            extras = dict(qualifiers)
            notes.extend(extras.pop("note", []))
            quals.update(extras)
        if notes:
            # sorting helps with consistency and comparison testing
            quals["note"] = sorted(notes)
        if self.created_by_antismash:
            quals["tool"] = ["antismash"]
        if self._original_codon_start is not None:
            start = int(self._original_codon_start)
            # the stored value is zero-based, the qualifier is one-based
            quals["codon_start"] = [str(start + 1)]
            # adjust location back if necessary
            if self.location.strand == -1:
                start *= -1
            if self._original_codon_start != 0:
                feature.location = _adjust_location_by_offset(feature.location, -start)
        # sorted here to match the behaviour of biopython
        for key, val in sorted(quals.items()):
            feature.qualifiers[key] = val
        assert isinstance(feature.qualifiers, dict)
        return [feature]
Ejemplo n.º 2
0
        def _track(track_level):
            """Create a track at `track_level` and fill it with features.

            One labelled forward-strand feature is drawn per (name, interval)
            pair, followed by the first eight features of `record`, which are
            rescaled, moved to the reverse strand and coloured alternately.
            """
            feature_set = diagram.new_track(track_level,
                                            greytrack=False).new_set()

            # named intervals, with their labels rotated by 90 degrees
            for label, bounds in zip(names, intervals):
                interval_feature = SeqFeature(FeatureLocation(*bounds, strand=1))
                feature_set.add_feature(interval_feature,
                                        name=label,
                                        label=True,
                                        label_angle=90)

            # NOTE: the locations of the record's own features are replaced
            # in place by their rescaled reverse-strand counterparts
            for index, record_feature in enumerate(record.features[:8]):
                eta = 4.8045
                previous = record_feature.location
                scaled_start = int(previous.start / eta)
                scaled_end = int(previous.end / eta)
                record_feature.location = FeatureLocation(scaled_start,
                                                          scaled_end,
                                                          strand=-1)
                shade = colors.lightblue if index % 2 else colors.blue
                feature_set.add_feature(record_feature, color=shade, label=True)
Ejemplo n.º 3
0
def gene2features(r, gene, gene2position, gene2product, start, end, gcode,
                  partialyes, verbose):
    """Append gene, mRNA and CDS features describing `gene` to record `r`.

    A 'gene' feature is added only for ids ending in '.t1'; an 'mRNA'
    feature is always added. The 'CDS' feature is skipped when its
    translation is empty or contains an in-frame stop codon before the
    terminal one, or when it is partial and `partialyes` is false.

    Args:
        r: record providing `.seq` and a `.features` list; modified in place.
        gene: gene/transcript id; key of `gene2position` and used as feature
            id, locus_tag and gene qualifier.
        gene2position: mapping id -> (contig, CDSs, strand, function, frames);
            strand '1' or '+' means forward, anything else means reverse.
        gene2product: mapping id -> product name; ids that are missing get
            "hypothetical protein".
        start: gene start; `start - 1` is used as feature start, so this is
            presumably 1-based - confirm against the caller.
        end: gene end (used unchanged as feature end).
        gcode: translation table, passed to `translate` and stored in the
            `transl_table` qualifier.
        partialyes: when false, CDSs lacking a start and/or stop codon are
            not added.
        verbose: when true, warnings are written to stderr.

    Returns:
        The record `r` that was passed in.
    """
    contig, CDSs, gffstrand, function, frames = gene2position[gene]
    if gffstrand in ('1', '+'):
        strand = +1
    else:
        strand = -1
        # reverse a copy: reversing in place would flip the order stored in
        # gene2position each time this gene is processed
        CDSs = list(reversed(CDSs))
    # (disabled upstream: extending the last CDS by 3 bases to include the
    # stop codon when the sequence is not partial)
    cdsloc, mrnaloc = get_locations(CDSs, start, end, strand)
    #add gene
    geneid = gene  #".".join(gene.split('.')[:-1])
    #get product
    product = "hypothetical protein"
    if geneid in gene2product:
        product = gene2product[geneid]
    if gene.endswith('.t1'):
        # strand is stored on the location itself rather than handed to
        # SeqFeature, which sets the very same attribute
        sf = SeqFeature(FeatureLocation(BeforePosition(start - 1),
                                        AfterPosition(end),
                                        strand=strand),
                        type='gene',
                        id=geneid)
        sf.qualifiers = {
            "locus_tag": geneid,
            "gene": geneid,
            "product": product
        }
        r.features.append(sf)
    #get mRNA sf
    sf = SeqFeature(mrnaloc, type='mRNA', id=gene)
    sf.qualifiers = {
        "locus_tag": geneid,
        "gene": geneid,
        "product": product
    }  #"protein_id": gene
    r.features.append(sf)
    #get CDS sf
    sf = SeqFeature(cdsloc, type='CDS', id=gene)
    #get translation
    seq = sf.extract(r.seq)
    aa = str(seq.translate(table=gcode))
    # only the terminal stop is removed; a leading '*' is an in-frame stop
    # and must be reported below instead of being silently dropped
    protein = aa.rstrip("*")
    if not protein:
        # nothing left to annotate; also protects the aa[0]/aa[-1] checks
        if verbose:
            sys.stderr.write("[Warning] Empty translation for: %s. Skipped!\n" %
                             gene)
        return r
    #solve non-triplets issue
    overhang = len(seq) % 3
    if overhang:
        if strand == 1:
            end -= overhang
        else:
            start += overhang
    ##check for partial sequence - no M as first or no * as last aa
    no_start = aa[0] != "M"
    no_stop = aa[-1] != "*"
    partial = no_start or no_stop
    # NOTE(review): a partial CDS is replaced by one span from start to end,
    # so any exon structure returned by get_locations is dropped - confirm
    # this is intended. The strand is passed explicitly, otherwise the new
    # location would carry no strand at all.
    #both ends partial
    if no_start and no_stop:
        sf.location = FeatureLocation(BeforePosition(start - 1),
                                      AfterPosition(end),
                                      strand=strand)
    #left end partial
    elif (no_start and strand == 1) or (no_stop and strand == -1):
        sf.location = FeatureLocation(BeforePosition(start - 1),
                                      end,
                                      strand=strand)
    #right end partial
    elif (no_stop and strand == 1) or (no_start and strand == -1):
        sf.location = FeatureLocation(start - 1,
                                      AfterPosition(end),
                                      strand=strand)
    #skip entries with internal stop codons
    if "*" in protein:
        if verbose:
            sys.stderr.write("[Warning] Stop codon(s) in: %s. Skipped!\n" %
                             gene)
        return r
    sf.qualifiers = {
        'transl_table': gcode,
        "locus_tag": geneid,
        "gene": geneid,
        "product": product,
        "translation": protein
    }  #"protein_id": gene,
    if function:
        sf.qualifiers['note'] = function
    #inform about partial entries
    if partial:
        #skip if not partial are allowed
        if not partialyes:
            return r
        if no_start:
            sf.qualifiers['codon_start'] = 1
        sf.qualifiers['product'] += ", partial cds"
        if verbose:
            sys.stderr.write("[Warning] Partial sequence: %s\n" % (gene, ))
    #add to features
    r.features.append(sf)
    return r
Ejemplo n.º 4
0
		# Stand-in record named after the destination; it is either inserted into
		# a template vector or used directly as the construct (see below).
		placeholder = SeqRecord(Seq("cgctatgcgaacaaaattgaactggaacgc", alphabet=IUPAC.unambiguous_dna), name=destination)


		if args.vector:
			# Output file is named "<vector basename>_<destination><vector extension>".
			base, ext = os.path.splitext(os.path.basename(args.vector))
			output_filename = base + "_" + destination + ext
			# load_template supplies the construct and the initial objectives and
			# constraints (presumably read from the vector file - confirm in load_template).
			naive_construct, objectives, constraints = load_template(args.vector, placeholder, destination)
		else:
			# No vector given: the placeholder itself is the construct, with no
			# initial objectives or constraints.
			output_filename = destination + ".gb"
			objectives = []
			constraints = []
			naive_construct = placeholder
			# Annotate the whole placeholder with a forward-strand misc_feature
			# labelled with the destination name.
			whole_seq_feat = SeqFeature()
			whole_seq_feat.type = "misc_feature"
			whole_seq_feat.qualifiers['label'] = [destination]
			whole_seq_feat.location = FeatureLocation(0,len(placeholder),strand=1)
			naive_construct.features.append(whole_seq_feat)

		# Look up the annotation for the placeholder's name in the construct and
		# convert its Biopython location into a Location.
		dest_feat = find_annotation(naive_construct, placeholder.name)
		dest_loc = Location.from_biopython_location(dest_feat.location)


		# Options derived from the command-line arguments for the destination location.
		user_objectives, user_constraints = load_user_options(args, dest_loc)
	

		# User-supplied options are appended to whatever was collected above.
		objectives += user_objectives
		constraints += user_constraints


		# The problem is built from the construct's sequence as a plain string.
		problem = DnaOptimizationProblem(str(naive_construct.seq), constraints=constraints, objectives=objectives)
Ejemplo n.º 5
0
def _finalize_group(group, group_number, strand, suffix):
    """Set description, ID and spanning location of a completed group."""
    names = {
        x.qualifiers["description"][0].split(suffix)[0]
        for x in group.sub_features
    }
    group.qualifiers = {
        "description": "_".join(sorted(names)),
        "ID": ["ILEXPARARR" + str(group_number)]
    }
    group.location = FeatureLocation(
        start=min(x.location.start for x in group.sub_features),
        end=max(x.location.end for x in group.sub_features),
        strand=strand)
    assert group.location.start < group.location.end
    # report unusually long groups
    if (group.location.end - group.location.start) > 5000:
        print(group.qualifiers["ID"])


def agrupar_sitios(
        gff_in="/data/organismos/ILEX_PARA2/regulation/ncbi_IP4.gff3.reg",
        gff_out="/data/organismos/ILEX_PARA2/regulation/grouped.gff"):
    """Group neighbouring regulatory features per record and strand.

    Features of each record parsed from `gff_in` are processed separately
    for strand 1 and -1, sorted by start. A feature joins the current group
    when its start lies less than 1500 positions from the largest end seen
    in the group, or when it overlaps the last feature of the group;
    otherwise the group is closed and a new one is started. Every group gets
    a sequential "ILEXPARARR<n>" ID and is written to `gff_out`.

    Args:
        gff_in: path of the GFF file to read.
        gff_out: path of the GFF file to write.
    """
    regiones = list(GFF.parse(gff_in))
    ids = 1
    groups = {}
    for c in tqdm(regiones):
        groups[c.id] = []
        for strand in [1, -1]:
            fs = sorted((f for f in c.features if f.strand == strand),
                        key=lambda x: x.location.start)
            if not fs:
                continue
            # NOTE(review): `id` receives the feature object itself, not its
            # id string - kept as in the original, confirm this is intended.
            group = SeqFeature(id=c.features[0],
                               type="grouped_transcription_regulatory_region",
                               location=c.features[0].location)
            group.sub_features = [fs[0]]

            for f in fs[1:]:
                end = max(x.location.end for x in group.sub_features)
                last = group.sub_features[-1]
                is_near = abs(f.location.start - end) < 1500
                # non-empty intersection of the two half-open intervals
                overlaps = (max(f.location.start, last.location.start) <
                            min(f.location.end, last.location.end))
                if is_near or overlaps:
                    group.sub_features.append(f)
                    continue

                # NOTE(review): groups closed here strip " regulatory region"
                # while the last group of a strand strips " binding site";
                # both are kept as found - confirm which one is intended.
                _finalize_group(group, ids, strand, " regulatory region")
                ids += 1
                groups[c.id].append(group)
                group = SeqFeature(
                    id=c.features[0],
                    type="grouped_transcription_regulatory_region",
                    location=f.location)
                group.sub_features = [f]

            # Close the last open group. The strand comes from the loop
            # itself: the inner loop variable is stale (or not yet bound)
            # when this strand has a single feature.
            _finalize_group(group, ids, strand, " binding site")
            ids += 1
            groups[c.id].append(group)

    records = [
        SeqRecord(id=k, name="", description="", seq=Seq(""), features=v)
        for k, v in groups.items()
    ]
    # the context manager guarantees the output is flushed and closed
    with open(gff_out, "w") as handle:
        GFF.write(tqdm(records), handle)
Ejemplo n.º 6
0
def gene2features(r, gene, gene2position, gene2product, start, end, gcode, partialyes, verbose):
    """Append gene, mRNA and CDS features describing `gene` to record `r`.

    A 'gene' feature is added only for ids ending in '.t1'; an 'mRNA'
    feature is always added. The 'CDS' feature is skipped when its
    translation is empty or contains an in-frame stop codon before the
    terminal one, or when it is partial and `partialyes` is false.

    Args:
        r: record providing `.seq` and a `.features` list; modified in place.
        gene: gene/transcript id; key of `gene2position` and used as feature
            id, locus_tag and gene qualifier.
        gene2position: mapping id -> (contig, CDSs, strand, function, frames);
            strand '1' or '+' means forward, anything else means reverse.
        gene2product: mapping id -> product name; ids that are missing get
            "hypothetical protein".
        start: gene start; `start-1` is used as feature start, so this is
            presumably 1-based - confirm against the caller.
        end: gene end (used unchanged as feature end).
        gcode: translation table, passed to `translate` and stored in the
            `transl_table` qualifier.
        partialyes: when false, CDSs lacking a start and/or stop codon are
            not added.
        verbose: when true, warnings are written to stderr.

    Returns:
        The record `r` that was passed in.
    """
    contig, CDSs, gffstrand, function, frames = gene2position[gene]
    if gffstrand in ('1','+'):
        strand = +1
    else:
        strand = -1
        # reverse a copy: reversing in place would flip the order stored in
        # gene2position each time this gene is processed
        CDSs = list(reversed(CDSs))
    # (disabled upstream: extending the last CDS by 3 bases to include the
    # stop codon when the sequence is not partial)
    cdsloc, mrnaloc = get_locations(CDSs, start, end, strand)
    #add gene
    geneid = gene #".".join(gene.split('.')[:-1])
    #get product
    product = "hypothetical protein"
    if geneid in gene2product:
        product = gene2product[geneid]
    if gene.endswith('.t1'):
        # strand is stored on the location itself rather than handed to
        # SeqFeature, which sets the very same attribute
        sf = SeqFeature(FeatureLocation(BeforePosition(start-1), AfterPosition(end), strand=strand), type='gene', id=geneid)
        sf.qualifiers = {"locus_tag": geneid, "gene": geneid, "product": product}
        r.features.append(sf)
    #get mRNA sf
    sf = SeqFeature(mrnaloc, type='mRNA', id=gene)
    sf.qualifiers = {"locus_tag": geneid, "gene": geneid, "product": product} #"protein_id": gene
    r.features.append(sf)
    #get CDS sf
    sf = SeqFeature(cdsloc, type='CDS', id=gene)
    #get translation
    seq = sf.extract(r.seq)
    aa = str(seq.translate(table=gcode))
    # only the terminal stop is removed; a leading '*' is an in-frame stop
    # and must be reported below instead of being silently dropped
    protein = aa.rstrip("*")
    if not protein:
        # nothing left to annotate; also protects the aa[0]/aa[-1] checks
        if verbose:
            sys.stderr.write("[Warning] Empty translation for: %s. Skipped!\n" % gene)
        return r
    #solve non-triplets issue
    overhang = len(seq) % 3
    if overhang:
        if strand==1:
            end   -= overhang
        else:
            start += overhang
    ##check for partial sequence - no M as first or no * as last aa
    no_start = aa[0]!="M"
    no_stop  = aa[-1]!="*"
    partial  = no_start or no_stop
    # NOTE(review): a partial CDS is replaced by one span from start to end,
    # so any exon structure returned by get_locations is dropped - confirm
    # this is intended. The strand is passed explicitly, otherwise the new
    # location would carry no strand at all.
    #both ends partial
    if no_start and no_stop:
        sf.location = FeatureLocation(BeforePosition(start-1), AfterPosition(end), strand=strand)
    #left end partial
    elif (no_start and strand==1) or (no_stop and strand==-1):
        sf.location = FeatureLocation(BeforePosition(start-1), end, strand=strand)
    #right end partial
    elif (no_stop and strand==1) or (no_start and strand==-1):
        sf.location = FeatureLocation(start-1, AfterPosition(end), strand=strand)
    #skip entries with internal stop codons
    if "*" in protein:
        if verbose:
            sys.stderr.write("[Warning] Stop codon(s) in: %s. Skipped!\n" % gene)
        return r
    sf.qualifiers = {'transl_table': gcode, "locus_tag": geneid, "gene": geneid, "product": product, "translation": protein} #"protein_id": gene,
    if function:
        sf.qualifiers['note'] = function
    #inform about partial entries
    if partial:
        #skip if not partial are allowed
        if not partialyes:
            return r
        if no_start:
            sf.qualifiers['codon_start'] = 1
        sf.qualifiers['product'] += ", partial cds"
        if verbose:
            sys.stderr.write("[Warning] Partial sequence: %s\n" % (gene,))
    #add to features
    r.features.append(sf)
    return r
Ejemplo n.º 7
0
def correct(_gff, _genome):
    """
    Re-check the CDS of every mRNA in a GFF file and try to repair those
    that fail to translate (start codon, first-CDS phase, stop codon).

    mRNAs that translate, or that could be repaired, are kept under a fresh
    copy of their gene; gene boundaries are widened to cover every kept mRNA.
    Errors are reported only for genes that end up without any usable mRNA.

    :param _gff: gff path
    :param _genome: fasta path
    :return: tuple ``(correct_list, gene_error_dict)``: a list with one
        SeqRecord per scaffold holding the kept genes, and a dict mapping an
        error category to a list of "<gene id> <mRNA id>" strings
    """
    _seqs = SeqIO.to_dict(SeqIO.parse(_genome, 'fasta'))
    scaffolds = list(GFF.parse(_gff, base_dict=_seqs))
    correct_list = []
    gene_error_dict = defaultdict(list)
    for scaffold in scaffolds:
        correct_scaffold = SeqRecord(seq="", id=scaffold.id, name=scaffold.name, description=scaffold.description)
        for gene in scaffold.features:
            correct_gene = SeqFeature(location=gene.location,
                                      type='gene',
                                      strand=gene.strand,
                                      id=gene.id,
                                      qualifiers={'ID': [gene.id]})
            correct_gene.sub_features = []
            error_dict = defaultdict(list)
            for mRNA in gene.sub_features:
                try:
                    get_cds(mRNA, scaffold)
                    correct_gene.sub_features.append(mRNA)
                except TranslationError as e:
                    message = e.args[0] if e.args else ""
                    try:
                        if message.startswith("First codon"):
                            _tmp_mrna = correct_start_codon(mRNA, scaffold)
                            get_cds(_tmp_mrna, scaffold)
                            correct_gene.sub_features.append(_tmp_mrna)
                            error_dict['corrected'].append(mRNA.id)
                        elif message.startswith('The phase of first CDS is not 0'):
                            # the translation was checked in function
                            _tmp_mrna = correct_phase(mRNA, scaffold)
                            correct_gene.sub_features.append(_tmp_mrna)
                            error_dict['corrected'].append(mRNA.id)
                        elif message.endswith("is not a stop codon"):
                            _tmp_mrna = correct_stop_codon(mRNA, scaffold)
                            get_cds(_tmp_mrna, scaffold)
                            correct_gene.sub_features.append(_tmp_mrna)
                            error_dict['corrected'].append(mRNA.id)
                        # can not handle for now
                        elif message.startswith("Extra in frame stop codon"):
                            error_dict['internal'].append(mRNA.id)
                        elif message.endswith("is not a multiple of three"):
                            error_dict['three'].append(mRNA.id)
                    except TranslationError as e2:
                        message2 = e2.args[0] if e2.args else ""
                        if message2.startswith('These mRNAs need another round correction'):
                            # the partially corrected mRNA travels in args[1]
                            correct_gene.sub_features.append(e2.args[1])
                            error_dict['phase'].append(mRNA.id)
                        # for second round
                        # prefix match so that the same message is recognised
                        # in both rounds, with or without trailing period
                        elif message2.startswith("Extra in frame stop codon"):
                            error_dict['internal'].append(mRNA.id)
                        elif message2.startswith('First codon'):
                            error_dict['first2'].append(mRNA.id)
                        elif message2.endswith("is not a stop codon"):
                            error_dict['final'].append(mRNA.id)
                        elif message2.endswith("is not a multiple of three"):
                            error_dict['three'].append(mRNA.id)
                except Exception as e:
                    # best effort: report and carry on with the next mRNA
                    print(e)
                    print(mRNA.id)
            # handle mRNA and gene relationship
            if not correct_gene.sub_features:
                # Report errors for genes whose all mRNAs have error.
                for _key, value in error_dict.items():
                    gene_error_dict[_key] += [gene.id + ' ' + mrna_id for mrna_id in value]
            else:
                # check boundary conflict between gene and mRNA: widen the
                # gene to the outermost mRNA (running min/max, so an earlier
                # wider mRNA is not overridden by a later narrower one)
                gene_start, gene_end = gene.location.start, gene.location.end
                for mRNA in correct_gene.sub_features:
                    if mRNA.location.start < gene_start:
                        gene_start = mRNA.location.start
                    if mRNA.location.end > gene_end:
                        gene_end = mRNA.location.end
                correct_gene.location = FeatureLocation(gene_start, gene_end, strand=correct_gene.strand)
                correct_scaffold.features.append(correct_gene)
        correct_list.append(correct_scaffold)
    # tidy correct list
    return correct_list, gene_error_dict
Ejemplo n.º 8
0
                'label': 'B0015 Double Terminator',
                'note': ['color: #ff8eff', '"iGEM Part: BBa_B0015"']
            }

        # Merge the Lux pL promoter: when a feature is found for 'R0063', the
        # 'Lux pL promoter' feature takes over its location and the R0063
        # feature is removed from the record.
        # NOTE(review): the second next() has no default, so it raises
        # StopIteration if no 'Lux pL promoter' feature is found (assumes
        # get_features returns an iterator - confirm).
        r0063 = next(get_features('R0063'), None)
        if r0063 is not None:
            luxpl = next(get_features('Lux pL promoter'))
            luxpl.location = r0063.location
            gb_archive.features.remove(r0063)

        # Add the LVA ssrA tag: if SSRA_TAG (presumably a compiled regular
        # expression - confirm at its definition) matches the record sequence,
        # annotate the matched span as a forward-strand CDS feature.
        ssra_match = SSRA_TAG.search(gb_archive.seq)
        if ssra_match is not None:
            ssra = SeqFeature(type="CDS")
            ssra.location = FeatureLocation(*ssra_match.span(), strand=1)
            # NOTE(review): "translation" is a plain string while the other
            # qualifier values are lists - confirm this is intended.
            ssra.qualifiers = {
                "label": ["ssrA tag (LVA)"],
                "product": [
                    "C-terminal peptide that mediates degradation in bacteria through the ClpXP and ClpAP proteases (McGinness et al., 2006)"
                ],
                "translation":
                "AANDENYALVA",
                "note": [
                    "mutant LVA variant that confers accelerated degradation under some conditions (Andersen et al., 1998)",
                    "color: #cc99b2",
                ],
            }
            gb_archive.features.append(ssra)

        # Replace E0040m with well annotated GFP
Ejemplo n.º 9
0
    # Exercise 5: read a FASTA record, rename it after its accession number,
    # add a reverse-strand gene feature and save the result as GenBank.
    print('\n---Exercise 5---')
    # NOTE: `id` and `format` shadow builtins; the names are kept because
    # code outside this fragment may read them.
    id = "PAX-6.5"
    format = "fasta"
    # context manager so that the input handle is closed deterministically
    with open(id + "." + format) as handle:
        record = SeqIO.read(handle, format)
    print("Record:\n", record)
    record.seq.alphabet = IUPAC.unambiguous_dna
    print("\nAlphabet altered to IUPAC.unambiguous_dna !!!")
    # the accession number is the fourth '|'-separated field of the record id
    accNb = record.id.split("|")[3]
    print("Access number: ", accNb)
    record.name = accNb
    record.id = accNb
    print("record.name and record.id have been altered !!!")

    feature = SeqFeature()
    feature.type = "gene"
    feature.location = FeatureLocation(18, 200)
    feature.strand = -1
    record.features.append(feature)
    print("record.features: ", record.features)
    print("\nRecord:\n", record)

    # closing the output handle guarantees the GenBank file is flushed to disk
    with open(id + ".gb", "w") as handle:
        count = SeqIO.write(record, handle, "genbank")
    print("Converted %i records" % count)

if 6 in _RunExercise:
    print('\n---ORF---')

    mail = ''
    id = "NC_009926"
    db = "nuccore"
    format = "fasta"