Ejemplo n.º 1
0
 def test_end_without_start(self):
     """Check the single location expected for the input "NNNTAGNNN"."""
     # The expected location has an open ("before") start at 0 and an exact
     # end at 6, on the forward strand.
     open_start = BeforePosition(0)
     exact_end = ExactPosition(6)
     location = FeatureLocation(open_start, exact_end, strand=1)
     self.run_both_dirs([location], "NNNTAGNNN")
Ejemplo n.º 2
0
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.SeqFeature import SeqFeature, FeatureLocation

# Feature names (CSV column 1) that are never written to the GenBank file.
exclude = ["E6*", "E1^E4", "E8^E2", "URR", "E1BS", "E2BS"]

# Read the CSV exactly once, grouping rows by accession (column 0) while
# remembering the order in which accessions first appear. The previous
# version re-opened and re-parsed the whole file for every accession and
# never closed any of those file handles.
accessions = []
rows_by_accession = {}
with open('PaVE.csv', "r") as csv_file:
    for r in csv.reader(csv_file):
        if r[0] not in rows_by_accession:
            accessions.append(r[0])
            rows_by_accession[r[0]] = []
        rows_by_accession[r[0]].append(r)


with open('PaVE.gb', "w") as genbank:
    for a in accessions:
        print(a)
        # Start every accession from a clean slate so that features can
        # never be attached to the record of the previous accession.
        record = None
        pending_features = []
        for r in rows_by_accession[a]:
            # Skip excluded feature names and compound ("join") locations.
            if r[1] in exclude or "join" in r[2]:
                continue
            if r[1] == 'CG':
                # The "CG" row carries the sequence itself in column 3.
                sequence_object = Seq(r[3], IUPAC.unambiguous_dna)
                record = SeqRecord(sequence_object,
                                   id=a,
                                   name=a,
                                   description='An example GenBank file generated by BioPython')
                # Features listed before the "CG" row belong to this record.
                record.features.extend(pending_features)
                pending_features = []
            else:
                # Column 2 holds "start..end" (1-based, inclusive); convert
                # to the 0-based, end-exclusive form FeatureLocation expects.
                bounds = r[2].split("..")
                s, e = int(bounds[0]) - 1, int(bounds[1])
                feature = SeqFeature(FeatureLocation(start=s, end=e),
                                     type='CDS',
                                     qualifiers={'gene': r[1]})
                if record is None:
                    pending_features.append(feature)
                else:
                    record.features.append(feature)
        if record is None:
            # Without a "CG" row there is no sequence to build a record from.
            print("No CG row found for %s; accession skipped" % a)
            continue
        print(record.format("gb"), file=genbank)
Ejemplo n.º 3
0
    def test_eq_not_identical(self):
        """Check that locations which differ do not compare as equal."""

        def two_part(strand):
            # Fresh two-part location (12..17 plus 23..42) on one strand.
            return (FeatureLocation(12, 17, strand) +
                    FeatureLocation(23, 42, strand))

        # One location has an additional third part.
        three_part = two_part(1) + FeatureLocation(50, 60, 1)
        self.assertNotEqual(two_part(1), three_part)

        # Same coordinates, opposite strands.
        self.assertNotEqual(two_part(1), two_part(-1))

        # Same parts, but one location is built with an explicit 'order'
        # operator and the other with the constructor's default.
        default_operator = CompoundLocation(
            [FeatureLocation(12, 17, 1), FeatureLocation(23, 42, 1)])
        order_operator = CompoundLocation(
            [FeatureLocation(12, 17, 1), FeatureLocation(23, 42, 1)], 'order')
        self.assertNotEqual(default_operator, order_operator)

        # A location compared against an object of an unrelated type.
        self.assertNotEqual(two_part(1), 5)
Ejemplo n.º 4
0
def copy_features(source_sr, target_sr, limit = 10):
    '''Copy features of source_sr to every place they occur in target_sr.

    The sequence of each feature in source_sr that is longer than limit
    is extracted, upper-cased and searched for literally on both strands
    of the upper-cased target sequence. One new feature is appended to
    target_sr.features for every hit.

    Parameters
    ----------

    source_sr : SeqRecord or Dseqrecord
        The sequence to copy features from

    target_sr : SeqRecord or Dseqrecord
        The sequence to copy features to. If it has a true ``circular``
        attribute it is handled as a circular sequence.

    limit : int, optional
        Only features with ``len(feature) > limit`` are copied
        (default 10).

    Returns
    -------
    bool : True
        This function acts on target_sr in place.
        No other data is returned.


    '''
    import re
    from Bio.Seq import reverse_complement as rc
    target_length    = len(target_sr)
    target_string    = str(target_sr.seq).upper()

    # Objects without a ``circular`` attribute are treated as linear.
    circular = bool(getattr(target_sr, "circular", False))

    def make_feature(location, template, strand, operator, subs):
        # Build a feature at ``location`` carrying the annotation of
        # ``template`` (type, id and qualifiers).
        return SeqFeature(location,
                          type=template.type,
                          location_operator=operator,
                          strand=strand,
                          id=template.id,
                          qualifiers=template.qualifiers,
                          sub_features=subs)

    newfeatures=[]

    trgt_string = target_string
    trgt_string_rc = rc(trgt_string)

    for feature in [f for f in source_sr.features if len(f)>limit]:
        fsr            = feature.extract(source_sr).upper()
        # NOTE(review): the extension length is hard-coded to 0 (the
        # alternative ``len(fsr)`` is commented out in the original), so a
        # circular target is currently not extended past its origin and
        # the origin-spanning branch below cannot trigger. Confirm intent.
        featurelength  = 0

        if circular:
            trgt_string = target_string+target_string[:featurelength]
            trgt_string_rc = rc(trgt_string)

        # The feature sequence is data, not a regular expression: escape it
        # so that characters such as '*' or '.' are matched literally.
        pattern = re.compile(re.escape(str(fsr.seq)))
        rc_length = len(trgt_string_rc)

        # Hits on the reverse-complement string are converted back to
        # coordinates on the forward string.
        positions = (
            [(m.start(), m.end(), 1) for m in pattern.finditer(trgt_string)]
            +
            [(rc_length-m.end(), rc_length-m.start(), -1)
             for m in pattern.finditer(trgt_string_rc)])

        for begin, end, strand in positions:
            if circular and begin<target_length<end:
                # The hit runs past the end of the target: describe it as
                # a join of the part before and the part after the origin.
                end = end-len(target_sr)
                # Fixed: this used the undefined name ``trgt_length``.
                sf1 = make_feature(FeatureLocation(begin, target_length),
                                   feature, strand,
                                   feature.location_operator, None)
                sf2 = make_feature(FeatureLocation(0, end),
                                   feature, strand,
                                   feature.location_operator, None)
                nf = make_feature(FeatureLocation(begin, end),
                                  feature, strand, "join", [sf1, sf2])
            else:
                nf = make_feature(FeatureLocation(begin, end),
                                  feature, strand,
                                  feature.location_operator, None)
            newfeatures.append(nf)
    target_sr.features.extend(newfeatures)
    return True
Ejemplo n.º 5
0
def run_glimmer(seq_record, options):
    """Run glimmer3 to annotate prokaryotic sequences.

    Runs long-orfs, extract, build-icm and glimmer3 in a temporary working
    directory and appends one CDS feature to ``seq_record.features`` for
    every prediction in the glimmer3 ``.predict`` file.

    Parameters:
        seq_record: record to annotate; modified in place.
        options: configuration object; read for the gene finding base
            directory and for ``record_idx`` (used in the locus tags).

    Returns:
        None. If one of the external tools fails, the error is logged and
        the function returns early without adding features.
    """
    basedir = utils.get_genefinding_basedir(options)
    with TemporaryDirectory(change=True):
        utils.fix_record_name_id(seq_record, options)
        # Leading dashes are stripped from the name before it is used to
        # build file names and is passed on the command line.
        name = seq_record.id
        while len(name) > 0 and name[0] == '-':
            name = name[1:]
        if name == "":
            name = "unknown"
        fasta_file = '%s.fasta' % name
        longorfs_file = '%s.longorfs' % name
        icm_file = '%s.icm' % name
        result_file = '%s.predict' % name

        # run long-orfs
        with open(fasta_file, 'w') as handle:
            seqio.write([seq_record], handle, 'fasta')
        long_orfs = [path.join(basedir, 'long-orfs')]
        long_orfs.extend([
            '-l', '-n', '-t', '1.15', '--trans_table', '11', fasta_file,
            longorfs_file
        ])
        out, err, retcode = execute(long_orfs)
        if err.find('ERROR') > -1:
            logging.error("Locating long orfs failed: %r" % err)
            return

        # run extract
        extract = [
            path.join(basedir, 'extract'), '-t', fasta_file, longorfs_file
        ]
        out, err, retcode = execute(extract)
        if out == '':
            logging.error("Failed to extract genes from model, aborting: %r" %
                          err)
            return

        # build the gene model from the extracted sequences
        build_icm = [path.join(basedir, 'build-icm'), '-r', icm_file]
        out, err, retcode = execute(build_icm, input=out)
        if err != '':
            logging.error("Failed to build gene model: %r" % err)
            return

        # run glimmer3
        glimmer = [path.join(basedir, 'glimmer3')]
        glimmer.extend([
            '-l', '-o', '50', '-g', '90', '-q', '3000', '-t', '30',
            '--trans_table', '11', fasta_file, icm_file, name
        ])

        out, err, retcode = execute(glimmer)
        if err.find('ERROR') > -1:
            logging.error("Failed to run glimmer3: %r" % err)
            return

        # The context manager closes the result file even if a line fails.
        with open(result_file, 'r') as predictions:
            for line in predictions:
                # skip header lines and blank lines
                if line.startswith('>') or not line.strip():
                    continue

                # Both a wrong number of fields and non-numeric fields
                # raise ValueError; such a line is reported and skipped
                # instead of being processed with unparsed strings.
                try:
                    orf_name, start, end, frame, score = line.split()
                    start = int(start)
                    end = int(end)
                    int(frame)  # validated only; strand is derived below
                except ValueError:
                    logging.error('Malformatted glimmer output line %r' %
                                  line.rstrip())
                    continue

                # start > end marks a reverse-strand prediction; locations
                # are stored with start <= end plus an explicit strand.
                if start > end:
                    bpy_strand = -1
                    start, end = end, start
                else:
                    bpy_strand = 1

                # Convert the 1-based start to a 0-based one.
                loc = FeatureLocation(start - 1, end, strand=bpy_strand)
                feature = SeqFeature(location=loc,
                                     id=orf_name,
                                     type="CDS",
                                     qualifiers={
                                         'locus_tag': [
                                             'ctg%s_%s' %
                                             (options.record_idx, orf_name)
                                         ],
                                         'note': ['Glimmer score: %s' % score]
                                     })
                seq_record.features.append(feature)
0
def gff2genbank(gff_path, seq_path, organism):
    """Build a GenBank-style SeqRecord from a GFF file and a FASTA sequence.

    Parameters:
        gff_path: path of the GFF annotation; only the first record parsed
            from it is used.
        seq_path: path of a FASTA file holding exactly one sequence.
        organism: organism name written to the header and source feature.

    Returns:
        A SeqRecord with the sequence and id of the FASTA record, one
        "source" feature followed by the gene and child features, and the
        header annotations built here.
    """
    gff_file = list(GFF.parse(gff_path))
    asm_seq = SeqIO.read(seq_path, 'fasta')
    genbank_header = defaultdict(
        str, {
            'organism': organism,
            'organelle': 'plastid:chloroplast',
            'molecule_type': 'DNA',
            'topology': 'circular',
            'data_file_division': 'PLN',
            'date': time.strftime("%d-%b-%Y", time.localtime()).upper(),
            'source': 'CGIR'
        })

    seqfeature_dict = defaultdict(MySeqFeature)
    # source feature
    # Biopython locations are 0-based and end-exclusive, so the feature
    # covering the whole sequence runs from 0 to len(asm_seq). It used to
    # start at 1, which is written to GenBank as "2..N".
    source = MySeqFeature(FeatureLocation(ExactPosition(0),
                                          ExactPosition(len(asm_seq))),
                          strand=1,
                          qualifiers={
                              'organism': organism,
                              'organelle': 'plastid:chloroplast',
                              'mol_type': "genomic DNA"
                          },
                          type='source')
    # Locations are kept as lists until the normalisation pass below.
    source.location = [source.location]
    seqfeature_dict['source'] = source

    # Key prefix of child features, chosen by the biotype of their gene.
    prefix_by_biotype = {
        'protein_coding': 'cds_',
        'rRNA': 'rrna_',
        'tRNA': 'trna_',
    }
    # gene
    for feature in gff_file[0].features:
        gene = MySeqFeature()
        gene.inherit(feature)
        gene_id = gene.qualifiers['ID'][0]
        # Features sharing an ID are merged into one entry whose location
        # accumulates every part.
        seqfeature_dict.setdefault(gene_id,
                                   gene).update_location(gene.location)
        # CDS / rRNA / tRNA / other children
        for subfeature in gene.sub_features:
            child = MySeqFeature()
            child.inherit(subfeature)
            _prefix = prefix_by_biotype.get(
                gene.qualifiers['gene_biotype'][0], 'other_')
            seqfeature_dict.setdefault(_prefix + gene_id,
                                       child).update_location(child.location)
    # reset information like NCBI
    for _key, _feature in seqfeature_dict.items():
        # parse location: a single part stays simple, several are joined
        if len(_feature.location) == 1:
            _feature.location = _feature.location[0]
        else:
            _feature.location = CompoundLocation(_feature.location)
        # qualifiers
        _old = _feature.qualifiers
        if _feature.type == 'gene':
            new_qualifiers = {
                'gene':
                _old.get('gene') if 'gene' in _old else _old.get('Name'),
                'locus_tag': _old['ID'][0],
                'db_xref': 'GeneID:' + _old['Accession'][0]
            }
        elif _feature.type in ['CDS', 'rRNA', 'tRNA']:
            # The parent gene is stored before its children, so its
            # qualifiers have already been rewritten when they are read.
            parent_id = _old['Parent'][0]
            parent_qualifiers = seqfeature_dict[parent_id].qualifiers
            new_qualifiers = {
                'gene': parent_qualifiers.get('gene'),
                'locus_tag': parent_id,
                'product':
                _old.get('product')[0] if _old.get('product') else '',
                'db_xref': parent_qualifiers.get('db_xref'),
                'transl_table': '11'
            }
            if _feature.type == 'CDS':
                new_qualifiers['protein_id'] = _old.get('Protein_Accession')
        else:
            new_qualifiers = _old
        _feature.qualifiers = new_qualifiers
    # output
    seqfeature_list = list(seqfeature_dict.values())
    return SeqRecord.SeqRecord(id=asm_seq.id,
                               seq=asm_seq.seq,
                               features=seqfeature_list,
                               annotations=genbank_header)
Ejemplo n.º 7
0
def seq_record_convert_nucl_to_prot(seq_records, options):
    """Build one protein SeqRecord per CDS feature of the first record.

    Each protein record takes its sequence from the CDS ``translation``
    qualifier and its id/name from the ``product`` qualifier. It carries
    CDS_motif features for the NRPS/PKS domains listed in the ``sec_met``
    qualifier and for the CDS_motif features of the nucleotide record
    that start inside the CDS, converted to protein coordinates.

    Parameters:
        seq_records: sequence of records; only ``seq_records[0]`` is used.
        options: passed through to ``utils.fix_record_name_id``.

    Returns:
        List of protein SeqRecord objects, one per CDS feature.

    Note:
        The CDS_motif features of the nucleotide record are re-used, so
        their ``location`` is overwritten in place.
    """
    subtype_marker = "NRPS/PKS subtype: "
    domain_marker = "NRPS/PKS Domain: "

    seq_record = seq_records[0]
    cdsfeatures = utils.get_cds_features(seq_record)
    cdsmotifs = utils.get_all_features_of_type(seq_record, ["CDS_motif"])
    #Find corresponding cdsmotifs for each cdsfeature
    cdsmotifdict = {}
    for cdsfeature in cdsfeatures:
        product = cdsfeature.qualifiers['product'][0]
        for cdsmotif in cdsmotifs:
            if cdsfeature.location.start <= cdsmotif.location.start <= cdsfeature.location.end:
                # setdefault replaces dict.has_key, which is gone in
                # Python 3.
                cdsmotifdict.setdefault(product, []).append(cdsmotif)
    #For each cdsfeature, write a protein SeqRecord with CDS_motif features (abMotifs AND sec_met)
    prot_seq_records = []
    for cdsfeature in cdsfeatures:
        product = cdsfeature.qualifiers['product'][0]
        cds_domains = []
        cds_description = "Unknown protein"
        #Extract sec_met info from feature
        if 'sec_met' in cdsfeature.qualifiers:
            sec_met = cdsfeature.qualifiers['sec_met']
            subtypes = [qual for qual in sec_met if subtype_marker in qual]
            if subtypes:
                # The description is the text after the marker in the
                # first matching qualifier.
                cds_description = subtypes[0].partition(subtype_marker)[2]
            cds_domains = [qual for qual in sec_met if domain_marker in qual]
        #Create protein seq_record
        prot_seq_record = SeqRecord(Seq(
            cdsfeature.qualifiers['translation'][0], IUPAC.protein),
                                    id=product,
                                    name=product,
                                    description=cds_description)
        utils.fix_record_name_id(prot_seq_record, options)
        #Add CDS_motif features based on NRPS/PKS domains
        cdsmotif_features = []
        for cds_domain in cds_domains:
            # The coordinates are the "start-end" text between " (" and
            # "). " in the qualifier.
            domainstart, domainend = cds_domain.partition(" (")[2].partition(
                "). ")[0].split("-")
            domainlocation = FeatureLocation(int(domainstart), int(domainend))
            domain_feature = SeqFeature(domainlocation, type="CDS_motif")
            domain_feature.qualifiers['note'] = [cds_domain]
            cdsmotif_features.append(domain_feature)
        #Add CDS_motif features based on NRPS/PKS abMotifs
        for cdsmotif in cdsmotifdict.get(product, []):
            oldstart, oldend = cdsmotif.location.start, cdsmotif.location.end
            # Floor division keeps the coordinates integral on Python 3 as
            # well as Python 2 (three nucleotides per residue).
            newstart = (oldstart - cdsfeature.location.start) // 3
            newend = (oldend - cdsfeature.location.start) // 3
            cdsmotif.location = FeatureLocation(newstart, newend)
            cdsmotif_features.append(cdsmotif)
        prot_seq_record.features.extend(cdsmotif_features)
        prot_seq_records.append(prot_seq_record)
    return prot_seq_records
Ejemplo n.º 8
0
# Merge every record of the input GenBank file into a single record,
# separating consecutive records with ``enter.nucleotides`` 'N' characters
# and shifting the feature coordinates accordingly.

# Refuse to overwrite the input before any file is opened.
if enter.input_file == enter.output_file:
    # Adjacent literals are used instead of a backslash continuation,
    # which embedded the indentation of the next line in the message.
    sys.exit('Sorry, but we can\'t edit input file. Plese give another name '
             'to output file!')

input_handle = open(enter.input_file, 'r')
# The context managers close both handles even when parsing or writing
# fails part-way through.
with input_handle:
    try:
        output_handle = open(enter.output_file, 'w')
    except IOError:
        sys.exit('Open error! Please check your genbank output path!')

    with output_handle:
        records = SeqIO.parse(input_handle, 'genbank')
        merged_seq = Seq('', generic_dna)
        merged_record = SeqRecord(merged_seq, features=[])
        # Number of bases already written before the current record.
        location_offset = 0
        for record in records:
            merged_record.seq += record.seq + 'N' * enter.nucleotides
            for feature in record.features:
                # Only the outer start/end of the location is carried over.
                my_feature = SeqFeature(location=FeatureLocation(
                    feature.location.start + location_offset,
                    feature.location.end + location_offset),
                                        type=feature.type,
                                        strand=feature.strand,
                                        qualifiers=feature.qualifiers)
                merged_record.features.append(my_feature)
            location_offset += len(record.seq) + enter.nucleotides
        # print() with one argument behaves the same on Python 2 and 3.
        print(merged_record)

        SeqIO.write(merged_record, output_handle, 'genbank')
print('Looks fine!')
Ejemplo n.º 9
0
        root = tree.getroot()
        for seq in root.iter('seq_data'):
            #record = seq.text + gbkFile
            sequence = seq.text
            sequence = sequence.replace("\n", "")
            sequence = Seq(sequence)

        allPartsSeqs[curPart] = sequence

# In[25]:

from Bio.SeqFeature import SeqFeature, FeatureLocation
# Build one record per entry of allPartsSeqs and collect them in a list.
sequenceRecords = list()
for x in allPartsSeqs:
    # Upper-case the part sequence and append gbkFile to it.
    # NOTE(review): presumably gbkFile is a SeqRecord, so that the result
    # has .features and .id - confirm against the cell defining it.
    record = allPartsSeqs[x].upper() + gbkFile
    # Forward-strand location spanning the part sequence itself.
    featLoc = FeatureLocation(0, len(allPartsSeqs[x]), 1)
    # Label that span with the key of the part.
    record.features.append(
        SeqFeature(featLoc, type='Region', qualifiers={"label": x}))
    record.id = x
    sequenceRecords.append(record)

# In[26]:

# Output directory (hard-coded, machine-specific path); it is created,
# together with any missing parent directories, if it does not exist yet.
outputPath = r"C:\Users\pakan\Documents\iGEM\referenceFiles"
if not os.path.exists(outputPath):
    os.makedirs(outputPath)

# In[27]:

from Bio.Alphabet import generic_dna
outputPath = 'C:/Users/pakan/Documents/iGEM/referenceFiles/'
Ejemplo n.º 10
0
                                      scale=True,
                                      height=1,
                                      scale_smallticks=0)
# Feature set that receives every feature drawn on the track.
gds_features = gd_track_for_features.new_set()

# Seed the generator so that the random colours are reproducible.
seed(int(arg.random_seed))

for series in series_features:
    # Use the predefined colour of the series while there is one,
    # otherwise fall back to a random colour.
    if series_indexes[series] < len(colors_list):
        current_color = colors_list[series_indexes[series]]
    else:
        current_color = colors.Color(random(), random(), random())

    # Each entry is indexed as (name, strand, start, end).
    for current_feature in series_features[series]:
        # Convert the coordinates once instead of at every use.
        feature_start = int(current_feature[2])
        feature_end = int(current_feature[3])
        feature = SeqFeature(FeatureLocation(feature_start, feature_end),
                             strand=current_feature[1])
        gds_features.add_feature(feature,
                                 name="{}".format(current_feature[0]),
                                 label=True,
                                 color=current_color)

        # Track the overall extent covered by the features.
        if feature_start < start:
            start = feature_start
        if feature_end > end:
            end = feature_end

# An explicit start takes precedence; otherwise 1000 positions are added
# before the first feature.
if arg.start is not None:
    start = int(arg.start)
else:
    start = start - 1000
Ejemplo n.º 11
0
def processed_record(detector_name='deepbgc',
                     detector_label='deepbgc',
                     score_threshold=0.5):
    """Return a small annotated SeqRecord.

    The record holds a 15 bp sequence, detector metadata in its structured
    comment, three CDS features (locus tags A, B, C), one scored Pfam
    feature per CDS and two cluster features: one carrying the detector
    score and one marked as 'annotated'.
    """
    meta_key = util.format_detector_meta_key(detector_label)
    detector_meta = collections.OrderedDict(name=detector_name,
                                            label=detector_label,
                                            score_threshold=score_threshold)

    record = SeqRecord(Seq('ACTGCTCGACTGATT'))
    record.annotations['molecule_type'] = 'DNA'
    record.annotations['structured_comment'] = collections.OrderedDict()
    record.annotations['structured_comment'][meta_key] = detector_meta

    def add_feature(start, end, feature_type, feature_qualifiers):
        # Append one feature spanning [start, end) to the record.
        record.features.append(
            SeqFeature(FeatureLocation(start, end),
                       type=feature_type,
                       qualifiers=feature_qualifiers))

    # Protein features: (locus tag, start, end)
    proteins = (('A', 0, 2), ('B', 2, 5), ('C', 5, 8))
    for locus_tag, start, end in proteins:
        add_feature(start, end, 'CDS', {'locus_tag': [locus_tag]})

    # Pfam features: one per protein, on the same span, with a score
    score_column = util.format_bgc_score_column(detector_name)
    pfams = ((0.4, 'PF00001'), (0.7, 'PF00002'), (0.6, 'PF00003'))
    for (score, pfam_id), (locus_tag, start, end) in zip(pfams, proteins):
        add_feature(start, end, util.PFAM_FEATURE, {
            score_column: [score],
            'db_xref': [pfam_id],
            'locus_tag': [locus_tag],
            'database': [PFAM_DB_VERSION]
        })

    # BGC features: a detected cluster (its score is stored as a string)
    # followed by an annotated one.
    add_feature(0, 5, 'cluster', {
        score_column: ['0.6'],
        'detector': [detector_name],
        'detector_label': [detector_label]
    })
    add_feature(2, 8, 'cluster', {
        'detector': ['annotated'],
        'detector_label': ['annotated']
    })
    return record
Ejemplo n.º 12
0
from Bio.Graphics import GenomeDiagram
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import SeqIO



# Load every FASTA record of the promoter file (hard-coded, machine-specific
# path) and work on the first one only.
records = list(SeqIO.parse('/Users/rkoch/Documents/Data_and_Scripts/MTB_genome_data_12/gene_promoters.fna', 'fasta'))

record = records[0]

# p1/p2: start and end positions of the 'GRE' features (each 24 long).
p1 = [100]
p2 = [x +24 for x in p1]


# Add one forward-strand 'GRE' feature per position pair, ids G1, G2, ...
for p,pp, i in zip(p1,p2, range(1,len(p1)+1)):
    record.features.append(SeqFeature(location = FeatureLocation(p, pp, strand = +1), type = 'GRE', id = 'G{}'.format(i)))

# p3/p4: start and end positions of the 'TF' features (each 15 long).
p3 = [180]
p4 = [x + 15 for x in p3]

# Add one reverse-strand 'TF' feature per position pair, ids T1, T2, ...
for p,pp, i in zip(p3,p4, range(1,len(p3)+1)):
    record.features.append(SeqFeature(location = FeatureLocation(p, pp, strand = -1), type = 'TF', id = 'T{}'.format(i)))


# p5/p6: for every GRE whose start lies strictly inside a TF span, record
# the GRE start (p5) and the end of that TF span (p6).
p5 = []
p6 = []
for p,pp in zip(p1,p2):
    for ppp,pppp in zip(p3,p4):
        if ppp < p < pppp:
            p5.append(p)
            p6.append(pppp)
Ejemplo n.º 13
0
def dispatch_args(args: argparse.Namespace) -> Sequence[str]:
    """Process the variants as requested by ``args`` and write the TSV.

    The variants are read with the processor selected by ``args.type``.
    If CDS coordinates or a GenBank file are given, each variant dict is
    extended with the fields of its translation result. If a BAM file is
    given, each variant (except those whose 'Alt Base' is '*') is merged
    with its bam-readcount entry. The rows are written to ``args.out``.

    Args:
        args: parsed command line arguments.

    Returns:
        A copy of the header names used for the TSV. The module-level
        ``HEADERS`` is updated as a side effect.

    Raises:
        AssertionError: if the argument combination is invalid, if the
            reference or GenBank file does not hold exactly one record, or
            if the translation results are out of step with the variants.
    """
    global HEADERS
    if args.bam:
        assert args.ref, "Reference required with Bam input!"
    if args.type == 'lofreq':
        dicts = lofreq_process(args.vcf_path, args.minp, args.mind, args.out)
    elif args.type == 'base_caller':
        dicts = base_caller_process(args.vcf_path, args.minp, args.mind,
                                    args.out)
    if args.cds_rev or args.cds_fwd or args.genbank:
        assert not ((args.cds_rev or args.cds_fwd) and args.genbank
                    ), "CDS and genank files cannot be used simultaneously!"
        assert args.ref, "Reference file required when passing CDS as an argument."
        # each seqfeature represents a CDS
        fwd_cdss = [] if not args.cds_fwd else [
            SeqFeature(location=FeatureLocation(start, end, strand=+1),
                       type='CDS') for (start, end) in args.cds_fwd
        ]
        rev_cdss = [] if not args.cds_rev else [
            SeqFeature(location=FeatureLocation(start, end, strand=-1),
                       type='CDS') for (start, end) in args.cds_rev
        ]
        with open(args.ref) as ref_file:
            refs = list(SeqIO.parse(ref_file, format='fasta'))
            assert len(
                refs) == 1, "Only one reference sequence currently supported."
            rec = refs[0]
            rec.features = sorted(fwd_cdss + rev_cdss,
                                  key=lambda x: x.location._start)
        # A GenBank record, when given, replaces the FASTA-based record.
        if args.genbank:
            with open(args.genbank) as genbank_file:
                genbanks = list(SeqIO.parse(genbank_file, format='genbank'))
                assert len(
                    genbanks
                ) == 1, "Only one genbank record currently supported."
                rec = genbanks[0]
        # The variants are iterated several times below, so make sure they
        # are a list rather than a one-shot iterator.
        dicts = list(dicts)
        variants = [(d['Position'], d['Alt Base']) for d in dicts]
        translation_results = translation.dispatch(rec, variants)
        # Note: dicts will have None values, but "None" is acceptable in the output TSV
        # dict.update() returns None, so the dicts are updated in place;
        # collecting the return values replaced every row with None.
        for d, tr in zip(dicts, translation_results):
            d.update(tr.__dict__)
        dupe_fields = ('position', 'alt')
        for d in dicts:
            assert d['position'] == d[
                'Position'], "Ordering of translation return values went bad."
            # delete fields that are now duplicated (exactly once each; a
            # second delete of the same key raises KeyError).
            for k in dupe_fields:
                del d[k]
        fields = list(translation.TResult.__dataclass_fields__.keys())
        fields = [f for f in fields if not (f in dupe_fields)]
        HEADERS = HEADERS[:] + fields

    if args.bam:
        listdicts = list((d for d in dicts if d['Alt Base'] != '*'))
        ref_id = listdicts[0][
            'Reference ID']  # TODO: multiple refs in a single vcf
        get_info = partial(bam_readcount_pos, args.bam, args.ref, args.mind,
                           ref_id)
        bam_info = map(get_info, (x['Position'] for x in listdicts))

        def to_dict(v_d: Dict[str, Any], brc: BRCRow) -> Dict[str, Any]:
            # Fields of the bam-readcount entry of the alternative base,
            # without the base itself.
            entry = brc.entries[str(v_d['Alt Base'])]
            #entry = brc.entries['='] # not reliable
            # below should actually be type safe
            return {k: getattr(entry, k) for k in entry._fields if k != 'base'}

        brc_dicts = map(to_dict, listdicts, bam_info)
        dicts = map(merge, listdicts, brc_dicts)
        # NOTE(review): this replaces HEADERS, dropping any translation
        # fields added above - confirm that this is intended.
        HEADERS = [
            'Reference ID', 'Position', 'Total Depth', 'Ref Base', 'Alt Base',
            'Ref Frequency', 'Alt Frequency', 'Codon', 'Codon Type'
        ]
        HEADERS = HEADERS[:] + [f for f in BRCEntry._fields if f != 'base']
    write_tsv(args.out, dicts, HEADERS[:])
    return HEADERS[:]
Ejemplo n.º 14
0
    # collect orfs
    record.features = []
    aa_record = load_multifasta(annot_aa)
    counter = 1
    for aa_rec in aa_record:
        this_prot = rec_name+"_"+str(counter)
        # get feature details from description line
        # because prodigal output fails to load as valid genbank
        defline = aa_rec.description
        pattern = re.compile('.+#\s(\d+)\s#\s(\d+)\s#\s(\S*1)\s#\sID.+')
        match = pattern.match(defline)
        start_pos = int(match.group(1))
        end_pos = int(match.group(2))
        strand_pos = int(match.group(3))
        feat_loc = FeatureLocation(start_pos, end_pos)
        l_tag = rec_name+"_"+str(counter)
        # consolidation feature annotations
        quals = {'note': defline, 'locus_tag': l_tag,
                 'translation': aa_rec.seq}
        feature = SeqFeature(location=feat_loc,
                             strand=strand_pos,
                             id='cds_'+str(counter),
                             type='CDS',
                             qualifiers=quals)
        record.features.append(feature)
        counter +=1

    # add annotations for Nx100 spacers
    sequence = str(record.seq)
    separator = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"
Ejemplo n.º 15
0
    def plot_motif_sites(self, cluster_num, motif_num):
        """Draw the sites of one motif of one cluster as a linear diagram.

        Work in progress. Still to do: multiple motifs on the same tracks,
        include ALL genes (i.e. in operons that were not included), do
        reverse-complement positioning correctly (based on gene strand),
        and use the MAST scan output (from b.tables['motif_annotations']).

        One track is drawn per site. The rendered PNG is displayed with
        matplotlib and the GenomeDiagram object is returned.
        """
        from Bio.SeqFeature import SeqFeature, FeatureLocation
        from Bio.Graphics import GenomeDiagram
        from reportlab.lib.units import cm
        from reportlab.lib import colors
        # To get this to work: download
        # http://www.reportlab.com/ftp/fonts/pfbfer.zip and unzip it into
        # /usr/lib/python2.7/dist-packages/reportlab/fonts/

        sites = self.get_motif_sites(cluster_num, motif_num)
        # (-log10(pvalue) - 4) is divided by this to get the alpha to use
        alpha_scale = np.max(-np.log10(sites.pvalue.values)) - 4
        track_end = np.max(sites.start.values) + 10

        title = 'Motif sites: %d, %d' % (cluster_num, motif_num)
        diagram = GenomeDiagram.Diagram(title)

        for row in range(sites.shape[0]):
            track = diagram.new_track(1,
                                      start=0,
                                      end=track_end,
                                      greytrack=True,
                                      greytrack_labels=1,
                                      name=sites.names.values[row],
                                      scale=True,
                                      greytrack_fontsize=4)
            feature_set = track.new_set()

            # Red, with an alpha derived from the p-value of the site.
            shade = colors.red.clone()
            log_pvalue = -np.log10(sites.pvalue.values[row])
            shade.alpha = (log_pvalue - 4) / alpha_scale

            site_start = sites.start.values[row]
            site_length = len(sites.seq.values[row])
            # A 'reverse' value of 0 is drawn on strand -1; any other value
            # is passed on unchanged.
            reverse_flag = sites.reverse.values[row]
            site_strand = -1 if reverse_flag == 0 else reverse_flag

            location = FeatureLocation(site_start,
                                       site_start + site_length - 1)
            feature_set.add_feature(SeqFeature(location, strand=site_strand),
                                    name=str(row + 1),
                                    label=False,
                                    color=shade)

        page = (15 * cm, sites.shape[0] * cm / 2)
        diagram.draw(format='linear',
                     pagesize=page,
                     fragments=1,
                     start=0,
                     end=track_end + 10)
        # The diagram is rendered to an in-memory PNG, wrapped in a
        # file-like object and read back as an image for display.
        png_data = diagram.write_to_string(output='png', dpi=300)
        png_stream = cStringIO.StringIO(png_data)
        image = mpimg.imread(png_stream)
        plt.axis('off')
        plt.imshow(image, interpolation='bicubic')
        png_stream.close()
        return diagram
Ejemplo n.º 16
0
def _count_features(contig_features, *feature_types):
    """Return how many dicts in *contig_features* have a ``'type'`` equal to
    one of *feature_types*."""
    return sum(1 for feat in contig_features if feat['type'] in feature_types)


def _rfam_inference(feature):
    """Return the ``profile:Rfam:<id>`` inference string for *feature*.

    Scans ``feature['db_xrefs']`` (treated as empty when the key is absent)
    for entries starting with ``RFAM:``; if several are present the last one
    wins. Returns ``None`` when there is no RFAM cross-reference.
    """
    inference = None
    for dbxref in feature.get('db_xrefs', []):
        parts = dbxref.split(':')
        if parts[0] == 'RFAM':
            inference = f'profile:Rfam:{parts[1]}'
    return inference


def write_insdc(genome, features, genbank_output_path, embl_output_path):
    """Write the annotated genome as GenBank and EMBL flat files.

    One record is built for every entry of ``genome['contigs']``. Each record
    carries a ``source`` feature spanning the whole contig followed by, for
    every feature dict whose ``'contig'`` equals the contig id, an INSDC
    feature (preceded by a ``gene`` feature when the dict has a non-empty
    ``'locus'``). The same list of records is then written to both outputs.

    :param genome: dict; the keys ``'contigs'``, ``'taxon'`` and ``'strain'``
        are read.
    :param features: re-iterable collection of feature dicts (it is scanned
        once per contig).
    :param genbank_output_path: object exposing ``open(mode, encoding=...)``
        (e.g. a ``pathlib.Path``) receiving the GenBank output.
    :param embl_output_path: same, receiving the EMBL output.
    """
    log.debug('prepare: genbank=%s, embl=%s', genbank_output_path,
              embl_output_path)

    contig_list = []
    for contig in genome['contigs']:
        contig_features = [
            feat for feat in features if feat['contig'] == contig['id']
        ]
        db_version = f"{cfg.db_info['major']}.{cfg.db_info['minor']}"
        # The comment annotation is handed over as a tuple of lines.
        comment = (
            f"Annotated with Bakta (v{bakta.__version__}): https://github.com/oschwengers/bakta\n",
            f"Database (v{db_version}): https://doi.org/10.5281/zenodo.4247252\n",
            '\n',
            "##Genome Annotation Summary:##\n",
            f"{'Annotation Date':<30} :: {datetime.now().strftime('%m/%d/%Y, %H:%M:%S')}\n",
            f"{'Annotation Pipeline':<30} :: Bakta\n",
            f"{'Annotation Software version':<30} ::  v{bakta.__version__}\n",
            f"{'Annotation Database version':<30} ::  v{db_version}\n",
            f"{'CDSs':<30} :: {_count_features(contig_features, bc.FEATURE_CDS, bc.FEATURE_SORF):5,}\n",
            f"{'tRNAs':<30} :: {_count_features(contig_features, bc.FEATURE_T_RNA):5,}\n",
            f"{'tmRNAs':<30} :: {_count_features(contig_features, bc.FEATURE_TM_RNA):5,}\n",
            # This row counts FEATURE_R_RNA, so it must be labelled rRNAs
            # (it used to duplicate the tRNAs label).
            f"{'rRNAs':<30} :: {_count_features(contig_features, bc.FEATURE_R_RNA):5,}\n",
            f"{'ncRNAs':<30} :: {_count_features(contig_features, bc.FEATURE_NC_RNA):5,}\n",
            f"{'regulatory ncRNAs':<30} :: {_count_features(contig_features, bc.FEATURE_NC_RNA_REGION):5,}\n",
            f"{'CRISPR Arrays':<30} :: {_count_features(contig_features, bc.FEATURE_CRISPR):5,}",
            f"{'oriCs/oriVs':<30} :: {_count_features(contig_features, bc.FEATURE_ORIC, bc.FEATURE_ORIV):5,}",
            f"{'oriTs':<30} :: {_count_features(contig_features, bc.FEATURE_ORIT):5,}",
            f"{'gaps':<30} :: {_count_features(contig_features, bc.FEATURE_GAP):5,}",
        )
        contig_annotations = {
            'molecule_type': 'DNA',
            'source': genome['taxon'],
            'date': date.today().strftime('%d-%b-%Y').upper(),
            'topology': contig['topology'],
            'data_file_division':
            'HGT' if contig['type'] == bc.REPLICON_CONTIG else 'BCT',
            'comment': comment
            # TODO: taxonomy
        }
        source_qualifiers = {
            'mol_type': 'genomic DNA'
            # 'molecule_type': 'DNA' #  might be necessary in BioPython > 1.78 along with removal of Seq(..., generic_dna)
        }

        description = ''
        if genome['taxon']:
            contig_annotations['organism'] = genome['taxon']
            source_qualifiers['organism'] = genome['taxon']
            description = genome['taxon']
        if genome['strain']:
            source_qualifiers['strain'] = genome['strain']

        if contig['type'] == bc.REPLICON_PLASMID:
            # Use one fallback for both the qualifier and the description so
            # an empty/None name never leaks into the description text.
            plasmid_name = contig.get('name') or 'unnamed'
            source_qualifiers['plasmid'] = plasmid_name
            description = f"{description} plasmid {plasmid_name}"
            description += ', complete sequence' if contig[
                'complete'] else ', whole genome shotgun sequence'
        elif contig['type'] == bc.REPLICON_CHROMOSOME:
            source_qualifiers['chromosome'] = contig.get('name') or contig['id']
            if contig['complete']:
                description = f'{description} chromosome, complete genome'
            else:
                description = f"{description} chromosome {contig['id']}, whole genome shotgun sequence"
        else:
            description += f" {contig['id']}, whole genome shotgun sequence"

        if description.startswith(' '):  # discard potential leading whitespace
            description = description[1:]

        contig_rec = SeqIO.SeqRecord(id=contig['id'],
                                     name=contig['id'],
                                     description=description,
                                     annotations=contig_annotations,
                                     seq=Seq(contig['sequence']))

        source = SeqFeature(FeatureLocation(0, contig['length'], strand=+1),
                            type='source',
                            qualifiers=source_qualifiers)
        seq_feature_list = [source]

        for feature in contig_features:
            # Stays None when the feature type matches none of the branches.
            insdc_feature_type = None
            qualifiers = {}
            if 'db_xrefs' in feature:
                qualifiers['db_xref'] = feature['db_xrefs']
            if 'product' in feature:
                qualifiers['product'] = feature['product']
            if 'locus' in feature:
                qualifiers['locus_tag'] = feature['locus']

            feature_type = feature['type']
            if feature_type == bc.FEATURE_GAP:
                insdc_feature_type = bc.INSDC_FEATURE_GAP
                qualifiers['estimated_length'] = feature['length']
            elif feature_type in (bc.FEATURE_ORIC, bc.FEATURE_ORIV):
                # TODO: Add fuzzy positions for oriC/oriV
                insdc_feature_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION
                qualifiers['inference'] = 'similar to DNA sequence'
            elif feature_type == bc.FEATURE_ORIT:
                # TODO: Add fuzzy positions for oriT
                insdc_feature_type = bc.INSDC_FEATURE_ORIGIN_TRANSFER
                qualifiers['inference'] = 'similar to DNA sequence'
            elif feature_type in (bc.FEATURE_CDS, bc.FEATURE_SORF):
                qualifiers['translation'] = feature['sequence']
                qualifiers['codon_start'] = 1
                qualifiers['transl_table'] = cfg.translation_table
                insdc_feature_type = bc.INSDC_FEATURE_CDS
                inference = [
                    'ab initio prediction:Prodigal:2.6'
                    if feature_type == bc.FEATURE_CDS else
                    'ab initio prediction:Bakta'
                ]
                if 'ups' in feature and 'ncbi_nrp_id' in feature['ups']:
                    qualifiers['protein_id'] = feature['ups']['ncbi_nrp_id']
                if 'ips' in feature and 'uniref100_id' in feature['ips']:
                    ips_subject_id = feature['ips']['uniref100_id']
                    inference.append(
                        f'similar to AA sequence:UniProtKB:{ips_subject_id}')
                if 'psc' in feature and 'uniref90_id' in feature['psc']:
                    psc_subject_id = feature['psc']['uniref90_id']
                    inference.append(
                        f'similar to AA sequence:UniProtKB:{psc_subject_id}')
                qualifiers['inference'] = inference
            elif feature_type == bc.FEATURE_T_RNA:
                # TODO: Position anticodon
                if 'amino_acid' in feature and 'anti_codon' in feature:
                    if 'anti_codon_pos' in feature:
                        anti_codon_pos = feature['anti_codon_pos']
                        qualifiers[
                            'anticodon'] = f"(pos:{anti_codon_pos[0]}..{anti_codon_pos[1]},aa:{feature['amino_acid']},seq:{feature['anti_codon']})"
                    else:
                        qualifiers[
                            'note'] = f"tRNA-{feature['amino_acid']} ({feature['anti_codon']})"
                qualifiers['inference'] = 'profile:tRNAscan:2.0'
                insdc_feature_type = bc.INSDC_FEATURE_T_RNA
                if 'pseudo' in feature:
                    qualifiers['pseudo'] = None
            elif feature_type == bc.FEATURE_TM_RNA:
                qualifiers['inference'] = 'profile:aragorn:1.2'
                insdc_feature_type = bc.INSDC_FEATURE_TM_RNA
            elif feature_type == bc.FEATURE_R_RNA:
                rfam_inference = _rfam_inference(feature)
                if rfam_inference is not None:
                    qualifiers['inference'] = rfam_inference
                insdc_feature_type = bc.INSDC_FEATURE_R_RNA
            elif feature_type == bc.FEATURE_NC_RNA:
                # TODO: ncRNA_class
                rfam_inference = _rfam_inference(feature)
                if rfam_inference is not None:
                    qualifiers['inference'] = rfam_inference
                qualifiers[bc.INSDC_FEATURE_NC_RNA_CLASS] = select_ncrna_class(
                    feature)
                insdc_feature_type = bc.INSDC_FEATURE_NC_RNA
            elif feature_type == bc.FEATURE_NC_RNA_REGION:
                rfam_inference = _rfam_inference(feature)
                if rfam_inference is not None:
                    qualifiers['inference'] = rfam_inference
                qualifiers[
                    bc.INSDC_FEATURE_REGULATORY_CLASS] = select_regulatory_class(
                        feature)
                insdc_feature_type = bc.INSDC_FEATURE_REGULATORY
                # The product text is emitted as a note instead of /product.
                qualifiers['note'] = feature['product']
                qualifiers.pop('product', None)
            elif feature_type == bc.FEATURE_CRISPR:
                qualifiers[bc.INSDC_FEATURE_REPEAT_FAMILY] = 'CRISPR'
                qualifiers[bc.INSDC_FEATURE_REPEAT_TYPE] = 'direct'
                qualifiers[bc.INSDC_FEATURE_REPEAT_UNIT_SEQ] = feature[
                    'repeat_consensus']
                qualifiers['inference'] = 'COORDINATES:alignment:pilercr:1.02'
                insdc_feature_type = bc.INSDC_FEATURE_REPEAT_REGION
                # The product text is emitted as a note instead of /product.
                qualifiers['note'] = feature['product']
                qualifiers.pop('product', None)

            strand = None
            if feature['strand'] == bc.STRAND_FORWARD:
                strand = 1
            elif feature['strand'] == bc.STRAND_REVERSE:
                strand = -1
            elif feature['strand'] == bc.STRAND_UNKNOWN:
                strand = 0

            # feature['start'] is shifted down by one to form the location
            # start; feature['stop'] is used unchanged as the location end.
            start = feature['start'] - 1
            stop = feature['stop']
            if 'edge' in feature:
                # Edge-spanning feature: one part runs to the contig end, the
                # other from the contig start.
                fl_1 = FeatureLocation(start, contig['length'], strand=strand)
                fl_2 = FeatureLocation(0, stop, strand=strand)
                feature_location = CompoundLocation([fl_1, fl_2])
            else:
                if 'truncated' in feature:
                    is_forward = feature['strand'] == bc.STRAND_FORWARD
                    if feature['truncated'] == bc.FEATURE_END_5_PRIME:
                        if is_forward:
                            start = BeforePosition(start)
                        else:
                            stop = AfterPosition(stop)
                    elif feature['truncated'] == bc.FEATURE_END_3_PRIME:
                        if is_forward:
                            stop = AfterPosition(stop)
                        else:
                            start = BeforePosition(start)
                    else:
                        # Any other truncation value marks both ends fuzzy.
                        start = BeforePosition(start)
                        stop = AfterPosition(stop)
                feature_location = FeatureLocation(start, stop, strand=strand)

            if feature.get('locus', None):
                gene_qualifier = {'locus_tag': feature['locus']}
                if feature.get('gene', None):
                    qualifiers['gene'] = feature['gene']
                    gene_qualifier['gene'] = feature['gene']
                gen_seqfeat = SeqFeature(feature_location,
                                         type='gene',
                                         qualifiers=gene_qualifier)
                seq_feature_list.append(gen_seqfeat)
            feat_seqfeat = SeqFeature(feature_location,
                                      type=insdc_feature_type,
                                      qualifiers=qualifiers)
            seq_feature_list.append(feat_seqfeat)
        contig_rec.features = seq_feature_list
        contig_list.append(contig_rec)

    with genbank_output_path.open('wt', encoding='utf-8') as fh:
        log.info('write GenBank: path=%s', genbank_output_path)
        SeqIO.write(contig_list, fh, format='genbank')

    with embl_output_path.open('wt', encoding='utf-8') as fh:
        log.info('write EMBL: path=%s', embl_output_path)
        SeqIO.write(contig_list, fh, format='embl')
Ejemplo n.º 17
0
    def merge_or_split(self, seq, feature, mindistance=100):
        """
        Merge or split a sequence feature that has a compound location.

        Consecutive parts of the compound location are merged when they
        overlap or are separated by at most ``mindistance``; otherwise the
        feature is split. ``seq.features`` is modified in place: the original
        entry receives the first resulting location and any further locations
        are appended as deep copies of the feature. The feature's ``product``
        qualifier is prefixed with ``Merged-or-split: ``.

        :param seq: the sequence object whose ``features`` list contains ``feature``
        :param feature: The feature with a compound location to merge or split
        :param mindistance: the distance with which they will be merged/split
        :return: None
        """
        thisid = " ".join(feature.qualifiers.get('locus_tag', [str(feature.location)]))
        # find the index of this in the list. We could also pass this as an arg
        idx = seq.features.index(feature)

        # do we need to do anything
        if not isinstance(feature.location, CompoundLocation):
            log_and_message(f"Error {thisid} does not appear to be a compound location\n",
                            c="RED", stderr=True, loglevel="WARNING")
            return

        if not feature.strand:
            # per the biopython docs, a compound feature with some parts on one strand
            # and some parts on the other are given a strand designation of None
            # https://biopython.org/DIST/docs/api/Bio.SeqFeature.CompoundLocation-class.html#__init__
            log_and_message(f"Error {thisid} compound location seems to be on both strands. We do not know how to handle this!\n",
                            c="RED", stderr=True, loglevel="WARNING")
            return

        log_and_message(f"merging/splitting {thisid} original location: {feature.location}")

        if 'product' in feature.qualifiers:
            feature.qualifiers['product'][0] = 'Merged-or-split: ' + feature.qualifiers['product'][0]
        else:
            feature.qualifiers['product'] = ['Merged-or-split: not assigned']

        # simplify our coding
        loc = feature.location
        # first we find the strand
        strand = loc.parts[0].strand

        # test that all locations are on the same strand
        # record an error if not
        if any(part.strand != strand for part in loc.parts):
            # This must be an f-string, otherwise the placeholders are logged verbatim.
            msg = f"Error: We can not handle compound locations on different strands. For {thisid} we have {loc}"
            log_and_message(msg, c="RED", stderr=True, loglevel="WARNING")
            return

        def warn_spans_origin(strand_label):
            # Emit the "spans the origin" warning; strand_label is "+ve" or "-ve".
            msg = (f"Feature {thisid} spans the origin: {loc}\n"
                   f"WARNING: THIS IS AN UNTESTED FEATURE!\n"
                   f"We have not thoroughly tested conditions where an ORF on the {strand_label} strand appears to cross "
                   f"the origin of the contig. We would appreciate you posting an issue on GitHub and sending "
                   f"Rob a copy of your genome to test!\n")
            log_and_message(msg, c="YELLOW", stderr=True, loglevel='WARNING')

        # all_locs collects finished locations; merged is the one still being grown
        all_locs = []
        merged = loc.parts[0]
        # handle features on the + strand
        if strand > 0:
            for p in loc.parts[1:]:
                if p.start < merged.start:
                    # this feature spans a break: the next part starts before the current one
                    warn_spans_origin("+ve")
                    all_locs.append(merged)
                    merged = p
                    continue
                if merged.start < p.start < merged.end:
                    # the parts overlap, so extend the current location
                    merged = FeatureLocation(merged.start, p.end, strand)
                elif p.start > merged.end:
                    if p.start - merged.end > mindistance:
                        # the gap is too large: close the current location and start a new one
                        all_locs.append(merged)
                        # NOTE(review): this is the only + strand branch that starts the
                        # new location at p.start - 1 rather than p.start -- confirm the
                        # offset is intended.
                        merged = FeatureLocation(p.start - 1, p.end, strand)
                    else:
                        merged = FeatureLocation(merged.start, p.end, strand)
                else:
                    # p.start equals merged.start or merged.end
                    all_locs.append(merged)
                    merged = FeatureLocation(p.start, p.end, strand)
        # handle features on the -ve strand
        else:
            for p in loc.parts[1:]:
                if merged.start < p.start:
                    # this feature spans a break: the next part starts after the current one
                    warn_spans_origin("-ve")
                    all_locs.append(merged)
                    merged = p
                    continue
                if merged.start < p.end < merged.end:
                    # trivial case, the ORFs overlap
                    merged = FeatureLocation(p.start, merged.end, strand)
                elif p.end < merged.start:
                    # more complex case, there is a gap
                    if merged.start - p.end > mindistance:
                        all_locs.append(merged)
                        # NOTE(review): this is the only -ve strand branch that ends the
                        # new location at p.end - 1 rather than p.end -- confirm the
                        # offset is intended.
                        merged = FeatureLocation(p.start, p.end - 1, strand)
                    else:
                        merged = FeatureLocation(p.start, merged.end, strand)
                else:
                    # p.end equals merged.start, or is at/after merged.end
                    all_locs.append(merged)
                    merged = FeatureLocation(p.start, p.end, strand)

        if all_locs:
            # we have multiple features, so we need to add features
            # make sure we add the last feature
            all_locs.append(merged)
            log_and_message(f"We could not join the whole {feature.type} feature into a single new feature.")
            # replace the existing compound feature with the first one of the split features
            feature.location = all_locs[0]
            seq.features[idx] = feature
            # append the other features
            for extra_loc in all_locs[1:]:
                newfeat = copy.deepcopy(feature)
                newfeat.location = extra_loc
                seq.features.append(newfeat)
                log_and_message(f"Appended part of a multiple {newfeat.type} feature {thisid} loc: {extra_loc}\n")
        else:
            # we just replace the old feature with the new
            # update the location
            feature.location = merged
            # add the new feature
            seq.features[idx] = feature
            log_and_message(f"Created a single {feature.type} feature: {thisid} loc: {merged}\n")
Ejemplo n.º 18
0
def extract_features(genbank_file=None, tag='CDS', translate=False,
                     n_bases_upstream=0, n_bases_downstream=0,
                     strip_stops=False, translation_table_id=11, informative=False):
    """Yield the sequence of every matching feature in a GenBank file.

    For each feature whose type is ``in tag``, the feature sequence is
    extracted, optionally with flanking bases, and optionally translated.
    Flanks are assembled in the feature's own reading direction: the upstream
    flank first, then the feature, then the downstream flank. For a feature
    that is not on the forward strand the upstream flank is the one at the
    higher coordinates.

    :param genbank_file: path or handle passed to ``SeqIO.parse``.
    :param tag: feature type(s) to export. Membership is tested with ``in``,
        so a plain string matches any substring of it; pass a list or tuple
        for exact matching.
    :param translate: translate each extracted region. Translation is first
        attempted with ``cds=True`` and, if that raises, with ``cds=False``;
        a region is dropped if both attempts fail.
    :param n_bases_upstream: number of bases to add before the feature.
    :param n_bases_downstream: number of bases to add after the feature.
    :param strip_stops: remove ``*`` characters from the result.
    :param translation_table_id: translation table passed to ``translate``.
    :param informative: include the ``product`` qualifier in the description.
    :return: generator yielding one single-element list ``[SeqRecord]`` per
        matching feature.
    """
    for record in SeqIO.parse(genbank_file, "genbank"):
        for feature in record.features:
            if feature.type not in tag:
                continue

            feature_start = int(feature.location.start)
            feature_end = int(feature.location.end)
            strand = feature.location.strand
            # A strand of None is treated as "not forward" instead of raising
            # TypeError on the comparison under Python 3.
            is_forward = strand is not None and strand > 0

            # Find new feature boundaries
            start, end = feature_start, feature_end
            if n_bases_downstream != 0:
                # If we want extra on the end we cannot listen to
                # stop_stripping requests
                if is_forward:
                    end += n_bases_downstream
                else:
                    start -= n_bases_downstream
            if is_forward:
                start -= n_bases_upstream
            else:
                end += n_bases_upstream
            # A negative start would make the flank slice silently empty;
            # keep whatever bases exist before the feature instead.
            start = max(start, 0)

            low_flank = (start, feature_start)
            high_flank = (feature_end, end)
            if is_forward:
                leading, trailing = low_flank, high_flank
            else:
                leading, trailing = high_flank, low_flank

            regions = []
            if leading[0] < leading[1]:
                regions.append(SeqFeature(FeatureLocation(leading[0],
                                                          leading[1],
                                                          strand=strand),
                                          type='domain'))
            regions.append(feature)
            if trailing[0] < trailing[1]:
                regions.append(SeqFeature(FeatureLocation(trailing[0],
                                                          trailing[1],
                                                          strand=strand),
                                          type='domain'))

            if translate:
                extracted_seqs = []
                for region in regions:
                    try:
                        protein = region.extract(record.seq).translate(
                            table=translation_table_id, cds=True)
                        extracted_seqs.append(protein)
                    except Exception as strict_error:
                        log.warning("WARN %s %s %s", record.name, get_id(region), strict_error)
                        try:
                            protein = region.extract(record.seq).translate(
                                table=translation_table_id, cds=False)
                            extracted_seqs.append(protein)
                        except Exception as lenient_error:
                            log.warning("ERROR %s %s %s", record.name, get_id(region), lenient_error)
            else:
                extracted_seqs = [region.extract(record.seq) for region in regions]

            if informative:
                defline = ' %s [start=%s,end=%s]' % (','.join(feature.qualifiers.get('product', [])), start, end)
            else:
                defline = ' [start=%s,end=%s]' % (start, end)

            extracted_seq = ''.join(map(str, extracted_seqs))

            if strip_stops:
                extracted_seq = extracted_seq.replace('*', '')

            yield [
                SeqRecord(
                    Seq(extracted_seq.strip()),
                    id='gb|%s|lcl|%s' % (record.name, get_id(feature)),
                    description=defline
                )
            ]
def mga_to_gff3(mga_output, genome):
    """Attach gene features parsed from MGA output to FASTA records.

    Lines starting with ``#`` are headers: ``# gc = `` and ``# self:`` lines
    are ignored, any other header names the sequence (text up to the first
    space) that the following data lines belong to. Every other non-blank
    line is a tab-separated record of exactly eleven fields describing one
    gene, which is appended to the current record as a ``gene`` feature with
    a ``CDS`` sub-feature and, when an RBS start is given, a
    ``Shine_Dalgarno_sequence`` sub-feature.

    :param mga_output: iterable of text lines.
    :param genome: FASTA path or handle passed to ``SeqIO.parse``.
    :return: generator yielding each record once all of its genes have been
        attached. Nothing is yielded when no sequence header is seen.
    :raises Exception: if a header names a sequence absent from the FASTA.
    """
    seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta"))

    current_record = None
    for line in mga_output:
        if not line.strip():
            # Blank lines carry no record and would break the field unpacking.
            continue

        if line.startswith("#"):
            if line.startswith("# gc = ") or line.startswith("# self:"):
                continue
            chrom_id = line.strip().replace("# ", "")
            if " " in chrom_id:
                chrom_id = chrom_id[0:chrom_id.index(" ")]

            if chrom_id not in seq_dict:
                raise Exception(
                    "Found results for sequence %s which was not in fasta file sequences (%s)"
                    % (chrom_id, ", ".join(seq_dict.keys())))

            # A new sequence starts: hand out the finished one first.
            if current_record is not None:
                yield current_record
            current_record = seq_dict[chrom_id]
            continue

        (
            gene_id,
            start,
            end,
            strand,
            phase,
            complete,
            score,
            model,
            rbs_start,
            rbs_end,
            rbs_score,
        ) = line.strip().split("\t")
        start = int(start)
        end = int(end)
        strand = +1 if strand == "+" else -1

        # Correct for gff3
        # (presumably converts a 1-based inclusive start to a 0-based
        # half-open location start -- confirm against the MGA format)
        start -= 1

        rbs_feat = None
        if rbs_start != "-":
            rbs_start = int(rbs_start)
            rbs_end = int(rbs_end)
            # NOTE(review): unlike the CDS start, rbs_start is not shifted
            # by one here -- confirm whether that is intended.
            rbs_feat = SeqFeature(
                FeatureLocation(rbs_start, rbs_end),
                type="Shine_Dalgarno_sequence",
                strand=strand,
                qualifiers={
                    "ID": "%s.rbs_%s" % (current_record.id, gene_id),
                    "Source": "MGA",
                },
            )

        cds_feat = SeqFeature(
            FeatureLocation(start, end),
            type="CDS",
            strand=strand,
            qualifiers={
                "Source": "MGA",
                "ID": "%s.cds_%s" % (current_record.id, gene_id),
            },
        )

        # The gene spans the CDS plus the RBS when there is one: the RBS
        # supplies the gene start on the + strand and the gene end otherwise.
        gene_start, gene_end = start, end
        if rbs_feat is not None:
            if strand > 0:
                gene_start = rbs_start
            else:
                gene_end = rbs_end

        gene = SeqFeature(
            FeatureLocation(gene_start, gene_end),
            type="gene",
            strand=strand,
            qualifiers={
                "Source": "MGA",
                "ID": "%s.%s" % (current_record.id, gene_id),
            },
        )

        gene.sub_features = [cds_feat]
        if rbs_feat is not None:
            gene.sub_features.append(rbs_feat)
        current_record.features.append(gene)

    # Emit the last record, but never a bare None for input without headers.
    if current_record is not None:
        yield current_record
Ejemplo n.º 20
0
        "WARNING - Consider using order_assembly.py instead for FASTA output\n"
    )
    fasta_handle = open(output_fasta, "w")
    fasta_saved_count = 0
    fasta_short_dropped = 0

# Build the comparison diagram with a single "reference" track spanning
# 0..max_len; all reference features below go into one feature set.
gd_diagram = GenomeDiagram.Diagram("Comparison")
gd_track_for_features = gd_diagram.new_track(1,
                                             name="reference",
                                             greytrack=False,
                                             height=0.5,
                                             start=0,
                                             end=max_len)
gd_feature_set = gd_track_for_features.new_set()
# Add a dark grey background
# NOTE(review): `record` is read here before the loop below binds it, so it
# must already be defined earlier in the script -- confirm it is the
# intended record.
gd_feature_set.add_feature(SeqFeature(FeatureLocation(0, len(record))),
                           sigil="BOX",
                           color="grey",
                           label=False)

offset = 0
ref_offsets = dict()
for record in reference_parser:
    if offset > 0:
        # Add Jaggy
        # print("Adding jaggy from %i to %i" % (offset, offset+SPACER))
        gd_feature_set.add_feature(SeqFeature(
            FeatureLocation(offset, offset + SPACER)),
                                   sigil="JAGGY",
                                   color=colors.slategrey,
                                   border=colors.black)
Ejemplo n.º 21
0
def _clean_var_seq_description(description):
    """Remove line-wrap spaces from the sequences in a VAR_SEQ description.

    During line carryover, the sequences in VARSPLIC/VAR_SEQ descriptions can
    get mangled with unwanted spaces, like:
    'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH'

    A description that does not split into exactly two parts around " -> " is
    returned unchanged. Anything from the first " (" in the second part onwards
    is kept verbatim (spaces included) and re-attached after the cleaned
    sequences.
    """
    try:
        first_seq, second_seq = description.split(" -> ")
    except ValueError:
        return description
    extra_info = ""
    # we might have more information at the end of the
    # second sequence, which should be in parenthesis
    extra_info_pos = second_seq.find(" (")
    if extra_info_pos != -1:
        extra_info = second_seq[extra_info_pos:]
        second_seq = second_seq[:extra_info_pos]
    # now clean spaces out of the first and second string
    first_seq = first_seq.replace(" ", "")
    second_seq = second_seq.replace(" ", "")
    # reassemble the description
    return first_seq + " -> " + second_seq + extra_info


def _read_ft(record, line):
    """Parse one FT (feature table) line and update ``record.features``.

    If columns 5-13 of ``line`` hold a feature name, a new FeatureTable is
    appended to ``record.features``. Otherwise the line is treated as a
    continuation of ``record.features[-1]`` and updates that feature's id or
    qualifiers in place.

    Both layouts are handled: "new-style" lines (location in columns 21-80,
    qualifiers such as /note=, /evidence= and /id= on continuation lines) and
    "old-style" lines (fixed-width from/to columns and a free-text description
    starting at column 34).

    Returns None. Raises AssertionError if a new-style qualifier value is not
    quoted as expected or if an /evidence= or /note= qualifier is repeated,
    and IndexError if a continuation line arrives before any feature.
    """
    name = line[5:13].rstrip()
    if name:
        if line[13:21] == "        ":  # new-style FT line
            location = line[21:80].rstrip()
            # an optional "isoform:" prefix becomes the location's reference
            try:
                isoform_id, location = location.split(":")
            except ValueError:
                isoform_id = None
            # a location without ".." is a single position
            try:
                from_res, to_res = location.split("..")
            except ValueError:
                from_res = location
                to_res = ""
            qualifiers = {}
        else:  # old-style FT line
            from_res = line[14:20].lstrip()
            to_res = line[21:27].lstrip()
            isoform_id = None
            description = line[34:75].rstrip()
            qualifiers = {"description": description}

        # start position: "?" unknown, "?N" uncertain, "<N" before, "N" exact
        if from_res == "?":
            from_res = UnknownPosition()
        elif from_res.startswith("?"):
            position = int(from_res[1:]) - 1  # Python zero-based counting
            from_res = UncertainPosition(position)
        elif from_res.startswith("<"):
            position = int(from_res[1:]) - 1  # Python zero-based counting
            from_res = BeforePosition(position)
        else:
            position = int(from_res) - 1  # Python zero-based counting
            from_res = ExactPosition(position)

        # end position: "" means a single-residue feature ending one past the
        # start; otherwise "?" unknown, "?N" uncertain, ">N" after, "N" exact
        if to_res == "":
            position = from_res + 1
            to_res = ExactPosition(position)
        elif to_res == "?":
            to_res = UnknownPosition()
        elif to_res.startswith("?"):
            position = int(to_res[1:])
            to_res = UncertainPosition(position)
        elif to_res.startswith(">"):
            position = int(to_res[1:])
            to_res = AfterPosition(position)
        else:
            position = int(to_res)
            to_res = ExactPosition(position)

        location = FeatureLocation(from_res, to_res, ref=isoform_id)
        feature = FeatureTable(location=location,
                               type=name,
                               id=None,
                               qualifiers=qualifiers)
        record.features.append(feature)
        return

    # this line is a continuation of the previous feature
    feature = record.features[-1]
    if line[5:34] == "                             ":  # old-style FT line
        description = line[34:75].rstrip()
        if description.startswith("/FTId="):
            # store the FTId as the feature ID
            feature.id = description[6:].rstrip(".")
            return
        # this line is a continuation of the description of the previous feature
        old_description = feature.qualifiers["description"]
        if old_description.endswith("-"):
            # a trailing hyphen means the text was split mid-word: no space
            description = "%s%s" % (old_description, description)
        else:
            description = "%s %s" % (old_description, description)
        if feature.type in ("VARSPLIC", "VAR_SEQ"):  # special case
            description = _clean_var_seq_description(description)
        feature.qualifiers["description"] = description
        return

    # new-style FT line
    value = line[21:].rstrip()
    if value.startswith("/id="):
        value = value[4:]
        assert value.startswith('"')
        assert value.endswith('"')
        feature.id = value[1:-1]
        return
    for key in ("evidence", "note"):
        prefix = "/%s=" % key
        if value.startswith(prefix):
            value = value[len(prefix):]
            assert value.startswith('"')
            if value.endswith('"'):
                value = value[1:-1]
            else:  # continues on the next line
                value = value[1:]
            assert key not in feature.qualifiers
            feature.qualifiers[key] = value
            return
    # this line is a continuation of the most recently added qualifier
    keys = list(feature.qualifiers.keys())
    key = keys[-1]
    description = value.rstrip('"')
    old_description = feature.qualifiers[key]
    if key == "evidence" or old_description.endswith("-"):
        # evidence lists and hyphenated words are joined without a space
        description = "%s%s" % (old_description, description)
    else:
        description = "%s %s" % (old_description, description)
    if feature.type == "VAR_SEQ":  # same clean-up as old-style VARSPLIC
        description = _clean_var_seq_description(description)
    feature.qualifiers[key] = description
Ejemplo n.º 22
0
    def to_biopython(self,
                     qualifiers: Dict[str, List] = None) -> List[SeqFeature]:
        """ Builds CDS_motif SeqFeatures for this peptide: always a core feature,
            preceded by a leader feature and followed by a tail feature when
            those parts exist.

            Arguments:
                qualifiers: optional qualifiers handed to the core feature;
                            a 'note' list is added if missing.
                            NOTE(review): the given dict and its 'note' list
                            appear to be updated in place - confirm callers
                            do not reuse them.

            Returns:
                a list of SeqFeatures in the order leader, core, tail
        """
        strand = self.location.strand
        outer_start = self.location.start
        outer_end = self.location.end

        # the core is what remains after removing the leader and tail, whose
        # lengths are scaled by 3 (presumably amino acids -> nucleotides)
        core_start = outer_start
        if self.leader:
            core_start += len(self.leader) * 3
        core_end = outer_end
        if self.tail:
            core_end -= len(self.tail) * 3
        core_location = FeatureLocation(core_start, core_end, strand)

        # make sure there is a qualifier dict with a 'note' list to extend
        qualifiers = qualifiers or {'note': []}
        qualifiers.setdefault('note', [])

        built = []

        # leader, if any, runs from the overall start up to the core
        if self.leader:
            leader_qualifiers = {
                "note": [
                    'leader peptide', self.peptide_class,
                    'predicted leader seq: %s' % self.leader
                ],
                'locus_tag': [self.locus_tag],
            }
            leader_feature = SeqFeature(
                FeatureLocation(outer_start, core_location.start, strand),
                type="CDS_motif",
                qualifiers=leader_qualifiers)
            leader_feature.translation = self.leader
            built.append(leader_feature)

        # the core itself, carrying the prediction details as notes
        core_feature = SeqFeature(core_location,
                                  type="CDS_motif",
                                  qualifiers=qualifiers)
        core_feature.qualifiers['locus_tag'] = [self.locus_tag]
        core_notes = [
            'core peptide',
            self.peptide_class,
            'predicted class: %s' % self.peptide_subclass,
            "predicted core seq: %s" % self.core,
            "score: %0.2f" % self.score,
            "molecular weight: %0.1f" % self.molecular_weight,
            "monoisotopic mass: %0.1f" % self.monoisotopic_mass,
        ]
        core_feature.qualifiers['note'].extend(core_notes)
        if self.alternative_weights:
            formatted = "; ".join("%0.1f" % weight
                                  for weight in self.alternative_weights)
            core_feature.qualifiers['note'].append(
                'alternative weights: %s' % formatted)
        built.append(core_feature)

        # tail, if any, runs from the end of the core to the overall end
        if self.tail:
            tail_feature = SeqFeature(
                FeatureLocation(core_location.end, outer_end, strand),
                type="CDS_motif")
            tail_feature.translation = self.tail
            tail_feature.qualifiers['locus_tag'] = [self.locus_tag]
            tail_feature.qualifiers['note'] = [
                'tail peptide', self.peptide_class
            ]
            built.append(tail_feature)

        return built
Ejemplo n.º 23
0
    assert record.name not in feature_sets
    feature_sets[record.name] = gd_track_for_features.new_set()

#We add dummy features to the tracks for each cross-link BEFORE we add the
#arrow features for the genes. This ensures the genes appear on top:
# NOTE(review): records, feature_sets, A_vs_B, B_vs_C, get_feature and the
# *_rec / *_colors names are defined outside this excerpt.
for X, Y, X_vs_Y in [("NC_002703", "AF323668", A_vs_B),
                     ("AF323668", "NC_003212", B_vs_C)]:
    features_X = records[X].features
    features_Y = records[Y].features
    set_X = feature_sets[X]
    set_Y = feature_sets[Y]
    # each entry pairs a score with a feature identifier in X and one in Y
    for score, x, y in X_vs_Y:
        # shade from white (score 0) to firebrick (score 100)
        color = colors.linearlyInterpolatedColor(colors.white, colors.firebrick, 0, 100, score)
        border = colors.lightgrey
        # strand=0 copies of the two linked features act as the link anchors
        f_x = get_feature(features_X, x)
        F_x = set_X.add_feature(SeqFeature(FeatureLocation(f_x.location.start, f_x.location.end, strand=0)),
                                color=color, border=border)
        f_y = get_feature(features_Y, y)
        F_y = set_Y.add_feature(SeqFeature(FeatureLocation(f_y.location.start,f_y.location.end, strand=0)),
                                color=color, border=border)
        gd_diagram.cross_track_links.append(CrossLink(F_x, F_y, color, border))


# Walk each record together with its colour list, handling "gene" features only.
for record, gene_colors in zip([A_rec, B_rec, C_rec], [A_colors, B_colors, C_colors]):
    gd_feature_set = feature_sets[record.name]

    i = 0
    for feature in record.features:
        if feature.type != "gene":
            #Exclude this feature
            continue
Ejemplo n.º 24
0
    def trim_overlapping(self):
        """ Shrinks the cluster, where possible, to exclude any features which
            overlap with the edges of the cluster.
            Any feature fully contained before shrinking will still be fully
            contained.

            Does nothing (beyond logging a warning) if the cluster has no
            parent record, and nothing at all if the record reports no CDS
            features within or overlapping the cluster. Otherwise replaces
            self.location with the trimmed location.
        """
        if not self.parent_record:
            logging.warning(
                "Trimming cluster which does not belong to a record")
            return
        # NOTE(review): the scans below walk this list from both ends, so they
        # presumably rely on it being ordered by position - verify in the callee
        features = self.parent_record.get_cds_features_within_location(
            self.location, with_overlapping=True)
        # don't trim if there's no features to trim by
        if not features:
            return

        # find the deepest feature that only overlaps at the beginning
        previous = None
        index = 0
        current = features[index]
        # track where to trim to
        start = self.location.start
        # advance past every leading feature that overlaps but isn't contained,
        # moving the new start to the furthest edge seen so far
        while current.overlaps_with(
                self) and not current.is_contained_by(self):
            start = max([start, current.location.start, current.location.end])
            previous = current
            index += 1
            if index >= len(features):
                # ran out of features: nothing is left to protect
                current = None
                break
            current = features[index]

        # don't cause a contained feature to now overlap only
        # (current is the first feature that ended the scan above)
        if previous and current:
            start = min([start, current.location.start, current.location.end])

        # find the deepest feature that only overlaps at the end
        # but skip any indices already covered in the lead search
        lead_index = index
        previous = None
        index = len(features) - 1
        current = features[index]
        # track where to trim to
        end = self.location.end
        # mirror of the lead scan, walking backwards from the last feature
        while index > lead_index and current.overlaps_with(
                self) and not current.is_contained_by(self):
            end = min([end, current.location.start, current.location.end])
            previous = current
            index -= 1
            if index < 0:
                current = None
                break
            current = features[index]

        # but don't cause a contained feature to now overlap only
        if previous and current:
            end = max([end, current.location.start, current.location.end])

        # finally, do the trim itself
        new_loc = FeatureLocation(start, end, self.location.strand)
        if self.location.start != start or self.location.end != end:
            logging.debug("Cluster %d trimming location from %s to %s",
                          self.get_cluster_number(), self.location, new_loc)
        # make sure the size is never increased
        assert self.location.start <= start < end <= self.location.end
        self.location = new_loc

        # sanity check: every child CDS must still lie inside the cluster
        for cds in self.cds_children:
            assert cds.is_contained_by(
                self), "cluster trimming removed wholly contained CDS"
Ejemplo n.º 25
0
    def _assemble(self):
        """Find overlaps between the fragments and build assembly products.

        Works on ``self.dsrecs`` in four steps:

        1. Every pair of fragments (and each fragment against the reverse
           complement of the other) is compared with the selected overlap
           algorithm, using ``self.limit`` as its third argument. Each match
           is recorded as an "overlap" feature, tagged with the seguid
           checksum of the shared sequence, on both records involved.
        2. A MultiDiGraph is built whose nodes are overlap checksums plus the
           special nodes '5' and '3'; each edge carries the Fragment that
           joins two overlaps within one record.
        3. Linear products are assembled along simple paths from '5' to '3'.
        4. Circular products are assembled along cycles of the graph with the
           '5' and '3' nodes removed.

        Sets ``self.no_of_olaps``, ``self.analyzed_dsrecs``, ``self.G``,
        ``self.linear_products``, ``self.cG`` and ``self.circular_products``.
        Existing "overlap" features on the input records are discarded and
        unnamed records are renamed in place.
        """
        # give placeholder-named fragments a name derived from their length
        for dr in self.dsrecs:
            if dr.name in ("", ".", "<unknown name>", None):
                dr.name = "frag{}".format(len(dr))

        if self.only_terminal_overlaps:
            algorithm = terminal_overlap
        else:
            algorithm = common_sub_strings

        # analyze_overlaps
        cols = {}  # checksum -> (forward colour, reverse colour)
        for dsrec in self.dsrecs:
            # start from a clean slate: drop overlaps from any earlier run
            dsrec.features = [f for f in dsrec.features if f.type != "overlap"]
            dsrec.seq = Dseq(dsrec.seq.todata)
        rcs = {dsrec: dsrec.rc() for dsrec in self.dsrecs}
        matches = []
        dsset = OrderedSet()

        for a, b in itertools.combinations(self.dsrecs, 2):
            # a against b, both as given
            match = algorithm(
                str(a.seq).upper(),
                str(b.seq).upper(), self.limit)
            if match:
                matches.append((a, b, match))
                dsset.add(a)
                dsset.add(b)
            # a against the reverse complement of b
            match = algorithm(
                str(a.seq).upper(),
                str(rcs[b].seq).upper(), self.limit)
            if match:
                matches.append((a, rcs[b], match))
                dsset.add(a)
                dsset.add(rcs[b])
                # the same matches seen from rc(a) against b, with the
                # coordinates mirrored onto the opposite strands
                matches.append(
                    (rcs[a], b, [(len(a) - sa - le, len(b) - sb - le, le)
                                 for sa, sb, le in match]))
                dsset.add(b)
                dsset.add(rcs[a])

        self.no_of_olaps = 0

        for a, b, match in matches:
            for start_in_a, start_in_b, length in match:
                self.no_of_olaps += 1
                chksum = a[start_in_a:start_in_a + length].seguid()

                # every distinct overlap gets one random light colour pair,
                # reused wherever that overlap occurs
                try:
                    fcol, rcol = cols[chksum]
                except KeyError:
                    fcol = '#%02X%02X%02X' % (random.randint(
                        175, 255), random.randint(
                            175, 255), random.randint(175, 255))
                    rcol = '#%02X%02X%02X' % (random.randint(
                        175, 255), random.randint(
                            175, 255), random.randint(175, 255))
                    cols[chksum] = fcol, rcol

                qual = {
                    "note": ["olp_{}".format(chksum)],
                    "chksum": [chksum],
                    "ApEinfo_fwdcolor": [fcol],
                    "ApEinfo_revcolor": [rcol]
                }

                # annotate each record at most once per overlap checksum
                if chksum not in [
                        f.qualifiers["chksum"][0]
                        for f in a.features if f.type == "overlap"
                ]:
                    a.features.append(
                        SeqFeature(FeatureLocation(start_in_a,
                                                   start_in_a + length),
                                   type="overlap",
                                   qualifiers=qual))
                if chksum not in [
                        f.qualifiers["chksum"][0]
                        for f in b.features if f.type == "overlap"
                ]:
                    b.features.append(
                        SeqFeature(FeatureLocation(start_in_b,
                                                   start_in_b + length),
                                   type="overlap",
                                   qualifiers=qual))
        for ds in dsset:
            ds.features = sorted(ds.features,
                                 key=operator.attrgetter("location.start"))

        self.analyzed_dsrecs = list(dsset)

        # Create graph

        self.G = nx.MultiDiGraph(multiedges=True,
                                 name="original graph",
                                 selfloops=False)
        self.G.add_node('5')
        self.G.add_node('3')

        for i, dsrec in enumerate(self.analyzed_dsrecs):

            # one feature per checksum, ordered along the record
            overlaps = sorted({
                f.qualifiers['chksum'][0]: f
                for f in dsrec.features if f.type == 'overlap'
            }.values(),
                              key=operator.attrgetter('location.start'))

            if overlaps:
                # bracket the real overlaps with zero-length pseudo overlaps
                # standing for the two ends of the record
                overlaps = ([
                    SeqFeature(FeatureLocation(0, 0),
                               type='overlap',
                               qualifiers={'chksum': ['5']})
                ] + overlaps + [
                    SeqFeature(FeatureLocation(len(dsrec), len(dsrec)),
                               type='overlap',
                               qualifiers={'chksum': ['3']})
                ])

                for olp1, olp2 in itertools.combinations(overlaps, 2):

                    n1 = olp1.qualifiers['chksum'][0]
                    n2 = olp2.qualifiers['chksum'][0]

                    # end-to-end would just be the unmodified fragment
                    if n1 == '5' and n2 == '3':
                        continue

                    s1, e1, s2, e2 = (
                        olp1.location.start.position,
                        olp1.location.end.position,
                        olp2.location.start.position,
                        olp2.location.end.position,
                    )

                    source_fragment = Fragment(dsrec, s1, e1, s2, e2, i)

                    # weight is s1 - e1, the negated length of the first overlap
                    self.G.add_edge(n1,
                                    n2,
                                    frag=source_fragment,
                                    weight=s1 - e1,
                                    i=i)

        #linear assembly

        linear_products = defaultdict(list)  # product length -> contigs

        for path in all_simple_paths_edges(self.G,
                                           '5',
                                           '3',
                                           data=True,
                                           cutoff=self.max_nodes):

            # list() keeps this working where dict.values() is a view
            pred_frag = copy(list(path[0][2].values()).pop()['frag'])
            source_fragments = [
                pred_frag,
            ]

            # NOTE(review): both branches reduce to the slice [end1:end2]
            if pred_frag.start2 < pred_frag.end1:
                result = pred_frag[pred_frag.start2 +
                                   (pred_frag.end1 -
                                    pred_frag.start2):pred_frag.end2]
            else:
                result = pred_frag[pred_frag.end1:pred_frag.end2]

            for first_node, second_node, edgedict in path[1:]:

                edgedict = list(edgedict.values()).pop()

                f = copy(edgedict['frag'])

                # position this fragment relative to its predecessor
                f.alignment = pred_frag.alignment + pred_frag.start2 - f.start1
                source_fragments.append(f)

                if f.start2 > f.end1:
                    result += f[f.end1:f.end2]
                else:
                    result += f[f.start2 + (f.end1 - f.start2):f.end2]

                pred_frag = f

            # skip products that duplicate (in either orientation) a product
            # of the same length already found, or one of the input fragments
            add = True
            for lp in linear_products[len(result)]:
                if (str(result.seq).lower() == str(lp.seq).lower()
                        or str(result.seq).lower() == str(
                            lp.seq.reverse_complement()).lower()):
                    add = False
            for dsrec in self.dsrecs:
                if (str(result.seq).lower() == str(dsrec.seq).lower()
                        or str(result.seq).lower() == str(
                            dsrec.seq.reverse_complement()).lower()):
                    add = False
            if add:
                linear_products[len(result)].append(
                    Contig(result, source_fragments))

        # longest products first
        self.linear_products = list(
            itertools.chain.from_iterable(
                linear_products[size]
                for size in sorted(linear_products, reverse=True)))

        # circular assembly

        self.cG = self.G.copy()
        self.cG.remove_nodes_from(('5', '3'))
        # keyed by cseguid(), so products with the same checksum overwrite
        # each other
        circular_products = {}

        for pth in all_circular_paths_edges(self.cG):

            # rotate the cycle so it starts at the edge with the lowest
            # fragment index
            ns = min(enumerate(pth), key=lambda x: x[1][2]['i'])[0]

            path = pth[ns:] + pth[:ns]

            pred_frag = copy(path[0][2]['frag'])

            source_fragments = [
                pred_frag,
            ]

            if pred_frag.start2 < pred_frag.end1:
                result = pred_frag[pred_frag.start2 +
                                   (pred_frag.end1 -
                                    pred_frag.start2):pred_frag.end2]
            else:
                result = pred_frag[pred_frag.end1:pred_frag.end2]

            result.seq = Dseq(str(result.seq))

            for first_node, second_node, edgedict in path[1:]:

                f = copy(edgedict['frag'])

                f.alignment = pred_frag.alignment + pred_frag.start2 - f.start1
                source_fragments.append(f)

                if f.start2 > f.end1:
                    nxt = f[f.end1:f.end2]
                else:
                    nxt = f[f.start2 + (f.end1 - f.start2):f.end2]
                nxt.seq = Dseq(str(nxt.seq))
                result += nxt

                pred_frag = f

            r = Dseqrecord(result, circular=True)
            circular_products[r.cseguid()] = Contig(r, source_fragments)

        # longest products first
        self.circular_products = sorted(circular_products.values(),
                                        key=len,
                                        reverse=True)
Ejemplo n.º 26
0
    def addFeat(self, feat, colorMax, percent):
        """Add ``feat`` to this track's feature set, shifted by ``self.diff``.

        The feature is added twice to ``self.gdFeature``: first as a labelled
        arrow in ``Track.backgroundColor``, then as an unlabelled arrow whose
        colour is interpolated between white and ``colorMax`` according to
        ``percent`` (on a scale from ``minSimilarityScore`` to 100).

        The label is built from the feature's first 'product' qualifier (all
        words if there are at most ``Track.maxLabelWord``, otherwise the first
        three) followed by the original start and end. A feature without a
        'product' qualifier is reported on stdout and labelled "No name".

        Side effects: replaces ``feat.location`` with the shifted location,
        raises ``Track.maxSize`` if the shifted end exceeds it, increments
        ``self.nbFeats`` and (re-)adds ``self.gdFeature`` to ``self.gdTrack``.
        """
        try:
            product = feat.qualifiers['product'][0]
        except KeyError:
            print(" NO PRODUCT FOUND FOR ")
            print(feat)
            product = "No name"

        labelTab = product.split(" ")
        if len(labelTab) > Track.maxLabelWord:
            # too many words: keep only the first three
            labelTab = labelTab[:3]
        # the location shown in the label is the one before shifting
        labelName = " ".join(labelTab) + " \n " + str(
            feat.location.start) + " - " + str(feat.location.end)

        #change location
        newStart = feat.location.start - self.diff
        newEnd = feat.location.end - self.diff
        if newEnd > Track.maxSize:
            Track.maxSize = newEnd
        feat.location = FeatureLocation(newStart, newEnd, feat.strand)

        # background arrow carrying the label; the first feature of a track
        # has its label at the start, later ones in the middle, and label
        # angle / drawing strand alternate between consecutive features
        if self.nbFeats == 0:
            labelPosition = "start"
        else:
            labelPosition = "middle"
        self.gdFeature.add_feature(feat,
                                   color=Track.backgroundColor,
                                   sigil="ARROW",
                                   name=labelName,
                                   label_position=labelPosition,
                                   label_angle=Track.angle[self.nbFeats % 2],
                                   label=True,
                                   strand=Track.strand[self.nbFeats % 2])

        # similarity-coloured arrow drawn on the feature's own strand
        similarityColor = colors.linearlyInterpolatedColor(
            white, colorMax, minSimilarityScore, 100, percent)
        if feat.strand == 1:
            self.gdFeature.add_feature(
                feat,
                border=colorMax,
                color=similarityColor,
                sigil="ARROW",
                name=product[0:11].replace(" ", "_") + " \n " +
                str(feat.location.start) + " - " + str(feat.location.end),
                label_position="middle",
                label_angle=0,
                label=False)
        else:
            self.gdFeature.add_feature(feat,
                                       border=colorMax,
                                       color=similarityColor,
                                       sigil="ARROW",
                                       label_position="middle",
                                       label_angle=180,
                                       label=False)
        self.nbFeats += 1
        self.gdTrack.add_set(self.gdFeature)
Ejemplo n.º 27
0
    def test_eq_not_identical(self):
        """Locations differing in any single attribute compare unequal."""
        # different start
        self.assertNotEqual(FeatureLocation(22, 42, 1),
                            FeatureLocation(23, 42, 1))

        # different end
        self.assertNotEqual(FeatureLocation(23, 42, 1),
                            FeatureLocation(23, 43, 1))

        # different strand
        self.assertNotEqual(FeatureLocation(23, 42, 1),
                            FeatureLocation(23, 42, -1))

        # a plain tuple with the same values is not a location
        self.assertNotEqual(FeatureLocation(23, 42, 1), (23, 42, 1))

        # different fourth positional argument
        self.assertNotEqual(FeatureLocation(23, 42, 1, 'foo'),
                            FeatureLocation(23, 42, 1, 'bar'))

        # different fifth positional argument
        self.assertNotEqual(FeatureLocation(23, 42, 1, 'foo', 'bar'),
                            FeatureLocation(23, 42, 1, 'foo', 'baz'))
def blastxml2gff3(blastxml,
                  min_gap=3,
                  trim=False,
                  trim_end=False,
                  include_seq=False):
    """Convert BLAST XML results into SeqRecords carrying match features.

    Args:
        blastxml: handle passed to ``Bio.Blast.NCBIXML.parse``.
        min_gap, trim, trim_end: accepted for interface compatibility; not
            used by this implementation.
        include_seq: when true, the aligned query, subject and match strings
            of every HSP are stored as ``blast_qseq`` / ``blast_sseq`` /
            ``blast_mseq`` qualifiers.

    Yields:
        One SeqRecord per BLAST record. Its id is the query name up to the
        first space, its sequence is the placeholder "ACTG", and it has one
        "match" feature per hit with one "match_part" sub-feature per HSP.
        The hit feature spans the smallest subject start to the largest
        subject end of its HSPs; each part is located at its query
        coordinates.
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    # HSP attributes copied verbatim into "blast_<name>" qualifiers
    hsp_props = (
        "score",
        "bits",
        "identities",
        "positives",
        "gaps",
        "align_length",
        "strand",
        "frame",
        "query_start",
        "query_end",
        "sbjct_start",
        "sbjct_end",
    )

    blast_records = NCBIXML.parse(blastxml)
    for idx_record, record in enumerate(blast_records):
        # only the first word of the query name is used as the record id
        recid = record.query.split(" ", 1)[0]

        rec = SeqRecord(Seq("ACTG"), id=recid)
        for idx_hit, hit in enumerate(record.alignments):
            qualifiers = {
                "ID": "b2g.%s.%s" % (idx_record, idx_hit),
                "source": "blast",
                "accession": hit.accession,
                "hit_id": hit.hit_id,
                "length": hit.length,
                "hit_titles": hit.title.split(" >"),
            }
            top_feature = SeqFeature(
                # placeholder span, overwritten once the HSPs are known
                FeatureLocation(1, 1000000000),
                type="match",
                strand=0,
                qualifiers=qualifiers,
            )
            top_feature.sub_features = []
            feat_min = None
            feat_max = None

            # description = first title minus its leading identifier word
            # (the separating space is kept); a title consisting of a single
            # word has no description instead of raising ValueError
            desc = hit.title.split(" >")[0]
            space_at = desc.find(" ")
            description = desc[space_at:] if space_at != -1 else ""

            for idx_hsp, hsp in enumerate(hit.hsps):
                # the hit qualifiers are layered on top, so "source" ends up
                # as "blast" while keeping its leading position
                part_qualifiers = {"source": "blastn"}
                part_qualifiers.update(qualifiers)
                part_qualifiers["ID"] += ".%s" % idx_hsp

                if include_seq:
                    part_qualifiers.update({
                        "blast_qseq": hsp.query,
                        "blast_sseq": hsp.sbjct,
                        "blast_mseq": hsp.match,
                    })

                for prop in hsp_props:
                    part_qualifiers["blast_" + prop] = getattr(hsp, prop, None)

                part_qualifiers["description"] = description
                # the "score" qualifier holds the HSP's expect value
                part_qualifiers["score"] = hsp.expect

                # track the overall subject span of this hit
                if feat_min is None or hsp.sbjct_start < feat_min:
                    feat_min = hsp.sbjct_start
                if feat_max is None or hsp.sbjct_end > feat_max:
                    feat_max = hsp.sbjct_end

                top_feature.sub_features.append(
                    SeqFeature(
                        FeatureLocation(hsp.query_start, hsp.query_end),
                        type="match_part",
                        strand=0,
                        qualifiers=copy.deepcopy(part_qualifiers),
                    ))

            # NOTE(review): writes the location's private attributes directly;
            # for a hit without HSPs both values are None
            top_feature.location._start = feat_min
            top_feature.location._end = feat_max
            rec.features.append(top_feature)
        rec.annotations = {}
        yield rec
Ejemplo n.º 29
0
def _parse_glimmerhmm_predictions(resultstext):
    """Turn GlimmerHMM's tab-separated prediction text into gene coordinates.

    The first two lines and the last line of the output are discarded
    (presumably the GFF header lines and a trailing empty line -- confirm
    against the GlimmerHMM version in use). Lines with fewer than two
    tab-separated columns are ignored. Rows whose text contains "mRNA" act
    as gene separators: a gene is closed on the last retained line, or on
    the line directly preceding an "mRNA" row.

    Args:
        resultstext: Raw text captured from the GlimmerHMM run.

    Returns:
        A tuple ``(orfnames, positions, strands)`` of three parallel lists,
        one entry per gene: the generated name (``orf00000``, ``orf00001``,
        ...), a list of ``[start, end]`` pairs (start decremented by one),
        and the strand as ``1`` or ``-1``.
    """
    # "\r" is blanked so carriage returns do not end up inside split lines.
    lines = resultstext.replace("\r", " ").split("\n")[2:-1]
    last_idx = len(lines) - 1
    orfnames = []
    positions = []
    strands = []
    starts = []
    ends = []
    for x, line in enumerate(lines):
        columns = line.split("\t")
        if len(columns) <= 1:
            continue
        if x > 0 and (x == last_idx or "mRNA" in lines[x + 1]):
            # This row is the final one of the current gene: record it and
            # flush everything collected so far.
            bpy_strand = 1 if columns[6] == "+" else -1
            starts.append(int(columns[3]))
            ends.append(int(columns[4]))
            if bpy_strand == -1:
                # Reverse-strand genes list their exons in the opposite order.
                starts.reverse()
                ends.reverse()
            # Pair each start with its own end by position (a lookup by value
            # breaks on duplicate starts). A coordinate of 0 is bumped to 1
            # before the start is shifted down by one.
            positions.append([[(start or 1) - 1, end or 1]
                              for start, end in zip(starts, ends)])
            strands.append(bpy_strand)
            # Fixed-width, zero-padded running number.
            orfnames.append("orf%05d" % len(orfnames))
            starts = []
            ends = []
        elif "mRNA" not in line:
            starts.append(int(columns[3]))
            ends.append(int(columns[4]))
    return orfnames, positions, strands


def run_glimmerhmm(seq_record, options):
    """Run GlimmerHMM on ``seq_record`` and add the predicted CDS features.

    The record is written as FASTA into a temporary working directory,
    ``glimmerhmm`` is executed on it with the training folder selected by
    ``options.glimmerhmm_train_folder`` and the ``-g`` flag, and every
    predicted gene is appended to ``seq_record.features`` as a ``CDS``
    feature with a ``locus_tag`` of the form ``ctg<record_idx>_<orfname>``.

    Args:
        seq_record: Record to annotate; its ``features`` list is extended
            in place.
        options: Configuration object; ``glimmerhmm_train_folder`` and
            ``record_idx`` are read here, and it is passed on to the
            ``utils`` helpers.

    Returns:
        None. Failures are reported through ``logging.error``; if the
        tool's error output contains ``ERROR`` the function returns without
        adding any features.
    """
    # NOTE(review): the result is not used below; the call is kept in case
    # it has side effects -- confirm and drop if it does not.
    basedir = utils.get_genefinding_basedir(options)
    with TemporaryDirectory(change=True):
        # Write FASTA file and run GlimmerHMM
        utils.fix_record_name_id(seq_record, options)
        # Leading dashes are stripped from the file name stem (presumably so
        # the file name cannot be mistaken for a command-line option).
        name = seq_record.id.lstrip('-') or "unknown"
        fasta_file = '%s.fasta' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([seq_record], handle, 'fasta')
        glimmerhmm = [
            'glimmerhmm',
            fasta_file,
            utils.get_full_path(__file__,
                                "train_%s" % options.glimmerhmm_train_folder),
            "-g",
        ]
        out, err, _retcode = execute(glimmerhmm)
        if 'ERROR' in err:
            logging.error("Failed to run GlimmerHMM: %r", err)
            return

        # Parse GlimmerHMM predictions
        if "CDS" not in out:
            logging.error("GlimmerHMM gene prediction failed: no genes found.")
        orfnames, positions, strands = _parse_glimmerhmm_predictions(out)
        if not orfnames:
            logging.error("GlimmerHMM gene prediction failed. Please check the "
                          "format of your input FASTA file.")

        # Create seq_record features for identified genes
        for orfname, bpy_strand, genepositions in zip(orfnames, strands,
                                                      positions):
            exons = [FeatureLocation(exonstart, exonend, strand=bpy_strand)
                     for exonstart, exonend in genepositions]
            # A single exon is used directly; several exons are combined
            # into one compound location.
            loc = exons[0] if len(exons) == 1 else CompoundLocation(exons)
            feature = SeqFeature(
                location=loc,
                id=orfname,
                type="CDS",
                qualifiers={
                    'locus_tag':
                    ['ctg%s_%s' % (options.record_idx, orfname)]
                })
            seq_record.features.append(feature)
Ejemplo n.º 30
0
 def test_start_without_end(self):
     """Expect one forward-strand location, exact at 3 and open past 9, for "NNNATGNNN"."""
     exact_start = ExactPosition(3)
     open_end = AfterPosition(9)
     wanted = [FeatureLocation(exact_start, open_end, strand=1)]
     self.run_both_dirs(wanted, "NNNATGNNN")