예제 #1
0
def rebase(parent, child, interpro=False, protein2dna=False):
    """Replace parent features by their rebased child features and print GFF.

    For every record parsed from ``parent``, each top-level feature whose
    ``id`` is a key of the child-feature mapping is replaced by the child
    features stored under that id, after their locations have been updated
    against the parent feature. Each resulting record is written to
    standard output.

    :param parent: input handed to ``GFF.parse``.
    :param child: input handed to ``__get_features``.
    :param interpro: forwarded to ``__get_features``; when true, the
        ``status`` and ``Target`` qualifiers are removed from every rebased
        feature.
    :param protein2dna: forwarded to ``__update_feature_location``.
    """
    child_features = __get_features(child, interpro=interpro)

    for rec in GFF.parse(parent):
        # TODO, replace with recursion in case it's matched against a
        # non-parent feature. We're cheating a bit here right now...
        replacement_features = []
        for feature in rec.features:
            if feature.id not in child_features:
                continue

            # TODO: update starts
            for x in child_features[feature.id]:
                # Then update the location of the actual feature
                __update_feature_location(x, feature, protein2dna)

                if interpro:
                    for y in ('status', 'Target'):
                        # Only a missing qualifier is expected here; any
                        # other error should surface instead of being
                        # silently discarded.
                        try:
                            del x.qualifiers[y]
                        except KeyError:
                            pass

                replacement_features.append(x)
        # We do this so we don't include the original set of features that we
        # were rebasing against in our result.
        rec.features = replacement_features
        GFF.write([rec], sys.stdout)
예제 #2
0
 def t_write_from_recs(self):
     """Check the GFF3 lines written for a gene holding two exon sub-features.
     """
     rec = SeqRecord(Seq("GATCGATCGATCGATCGATC"), "ID1")
     gene_quals = {"source": "prediction", "score": 10.0,
                   "other": ["Some", "annotations"], "ID": "gene1"}
     exon_quals = {"source": "prediction"}
     gene = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                       qualifiers=gene_quals)
     # Both exons share the same qualifier dictionary.
     gene.sub_features = [
         SeqFeature(FeatureLocation(first, last), type="exon", strand=1,
                    qualifiers=exon_quals)
         for first, last in ((0, 5), (15, 20))
     ]
     rec.features = [gene]

     out_handle = StringIO.StringIO()
     GFF.write([rec], out_handle)
     lines = out_handle.getvalue().split("\n")

     expected_gene = ['ID1', 'prediction', 'gene', '1', '20', '10.0', '+',
                      '.', 'other=Some,annotations;ID=gene1']
     expected_exon = ['ID1', 'prediction', 'exon', '1', '5', '.', '+', '.',
                      'Parent=gene1']
     assert lines[0] == "##gff-version 3"
     assert lines[1] == "##sequence-region ID1 1 20"
     assert lines[2].split("\t") == expected_gene
     assert lines[3].split("\t") == expected_exon
예제 #3
0
def main():
    """Command-line entry point: convert a SAM file to GFF3.

    Parses the command line, refuses input whose name does not end in
    ``.sam``, and writes the converted records next to the input using the
    ``.gff3`` extension. When ``--input_fasta`` is given, a mapping of
    sequence id to sequence length is passed to ``GMAPSAMReader`` as
    ``query_len_dict``.

    Exits with status -1 when the input name does not end in ``.sam``.
    """
    from argparse import ArgumentParser

    parser = ArgumentParser("Convert SAM to GFF3 format using BCBio GFF")
    parser.add_argument("sam_filename")
    parser.add_argument("-i", "--input_fasta", default=None, help="(Optional) input fasta. If given, coverage will be calculated.")
    parser.add_argument("-s", "--source", required=True, help="source name (ex: hg38, mm10)")

    args = parser.parse_args()

    if not args.sam_filename.endswith('.sam'):
        # sys.stderr.write behaves the same on Python 2 and Python 3,
        # unlike the ``print >>`` statement form.
        sys.stderr.write("Only accepts files ending in .sam. Abort!\n")
        sys.exit(-1)

    prefix = args.sam_filename[:-4]
    output_gff3 = prefix + '.gff3'

    q_dict = None
    if args.input_fasta is not None:
        # Close the FASTA handle as soon as the lengths have been collected.
        with open(args.input_fasta) as fasta_handle:
            q_dict = dict((r.id, len(r.seq)) for r in SeqIO.parse(fasta_handle, 'fasta'))

    with open(output_gff3, 'w') as f:
        recs = [convert_sam_rec_to_gff3_rec(r0, args.source) for r0 in GMAPSAMReader(args.sam_filename, True, query_len_dict=q_dict)]
        # Records that could not be converted come back as None; drop them.
        BCBio_GFF.write([x for x in recs if x is not None], f)

    sys.stderr.write("Output written to {0}.\n".format(output_gff3))
예제 #4
0
파일: io.py 프로젝트: ginkgobioworks/edge
    def to_gff_file(self, file):
        """
        Write every fragment of the genome, with its annotations, as GFF.

        One record is built per fragment and one feature per annotation;
        all records are then handed to ``GFF.write`` together with ``file``
        and ``include_fasta=True``.
        """
        def as_feature(annotation):
            # FeatureLocation first bp is AfterPosition, so -1
            location = FeatureLocation(annotation.base_first - 1,
                                       annotation.base_last)
            quals = {'name': annotation.feature.name}
            strand = annotation.feature.strand
            if strand is None:
                strand = 0
            return SeqFeature(location,
                              type=annotation.feature.type,
                              strand=strand,
                              qualifiers=quals)

        records = []
        for raw_fragment in self.__genome.fragments.all():
            indexed = raw_fragment.indexed_fragment()
            record = SeqRecord(Seq(indexed.sequence), "%s" % (indexed.name,))
            record.features = [as_feature(a) for a in indexed.annotations()]
            records.append(record)

        GFF.write(records, file, include_fasta=True)
예제 #5
0
파일: io.py 프로젝트: Chris7/edge
    def to_gff(self, filename):
        """
        Write all genome fragments and their annotations to a GFF file.

        A record is created for each fragment and a strand ``1`` feature for
        each of its annotations. The records are written, FASTA included, to
        the file named ``filename``.
        """
        def build_record(indexed):
            record = SeqRecord(Seq(indexed.sequence), "%s" % (indexed.name,))
            feats = []
            for ann in indexed.annotations():
                # FeatureLocation first bp is AfterPosition, so -1
                span = FeatureLocation(ann.base_first-1, ann.base_last)
                quals = {'name': ann.feature.name}
                feats.append(SeqFeature(span, type=ann.feature.type,
                                        strand=1, qualifiers=quals))
            record.features = feats
            return record

        records = [build_record(fragment.indexed_fragment())
                   for fragment in self.__genome.fragments.all()]

        with open(filename, "w") as out_handle:
            GFF.write(records, out_handle, include_fasta=True)
예제 #6
0
def CpGIslandsToGFF(island_location):
    """Output methylation regions (CpG Islands, namely) to a GFF3 file.

    The output path is the current working directory plus the module-level
    ``base`` name with its extension replaced by ``.gff``. A single
    ``region`` feature covering the module-level ``cur_record`` receives one
    ``CpG_island`` sub-feature per entry of ``island_location``; each entry
    ``i`` spans ``i[0] - i[1]/2`` to ``i[0] + i[1]/2`` (presumably a centre
    and a length -- confirm with the caller).
    """
    out_file = os.getcwd() + '/' + os.path.splitext(base)[0] + '.gff'

    rec = SeqRecord(cur_record.seq, "ID1")

    region_quals = {"source": "bssimulation", "score": '.', "ID": cur_record.name}
    island_quals = {"source": "bssimulation"}
    region = SeqFeature(FeatureLocation(0, len(cur_record)), type="region",
                        strand=0, qualifiers=region_quals)

    for i in island_location:
        half = i[1]/2
        span = FeatureLocation(int(i[0] - half), int(i[0] + half))
        island = SeqFeature(span, type="CpG_island", strand=0,
                            qualifiers=island_quals)
        region.sub_features.append(island)

    rec.features = [region]

    with open(out_file, "w") as out_handle:
        GFF.write([rec], out_handle)
예제 #7
0
def rebase(parent, child, interpro=False, protein2dna=False):
    """Replace parent features by their rebased child features and print GFF.

    For every record parsed from ``parent``, the features selected by
    ``feature_lambda`` (those passing ``feature_test_qual_value`` on the
    ``ID`` qualifier against the keys of the child-feature mapping) are
    replaced by the child features stored under ``feature.id``, after their
    locations have been updated against the parent feature. Record
    annotations are cleared and each record is written to standard output.

    :param parent: input handed to ``GFF.parse``.
    :param child: input handed to ``__get_features``.
    :param interpro: forwarded to ``__get_features``; when true, the
        ``status`` and ``Target`` qualifiers are removed from every rebased
        feature.
    :param protein2dna: forwarded to ``__update_feature_location``.
    """
    child_features = __get_features(child, interpro=interpro)

    for rec in GFF.parse(parent):
        replacement_features = []
        for feature in feature_lambda(
                rec.features,
                feature_test_qual_value,
                {
                    'qualifier': 'ID',
                    'attribute_list': child_features.keys(),
                },
                subfeatures=False):

            for x in child_features[feature.id]:
                # Then update the location of the actual feature
                __update_feature_location(x, feature, protein2dna)

                if interpro:
                    for y in ('status', 'Target'):
                        # Only a missing qualifier is expected here; any
                        # other error should surface instead of being
                        # silently discarded.
                        try:
                            del x.qualifiers[y]
                        except KeyError:
                            pass

                replacement_features.append(x)
        # We do this so we don't include the original set of features that we
        # were rebasing against in our result.
        rec.features = replacement_features
        rec.annotations = {}
        GFF.write([rec], sys.stdout)
예제 #8
0
 def genbank_to_gff(self, genbank_file):
     """Convert a GenBank file to a GFF file with embedded FASTA.

     The output name is the input name with its extension replaced by
     ``.gff``. Returns a dictionary holding that name under ``gff_file``.
     """
     from Bio import SeqIO
     from BCBio import GFF
     stem = os.path.splitext(genbank_file)[0]
     gff_file = "%s.gff" % (stem,)
     with open(gff_file, "w") as out_handle:
         records = SeqIO.parse(genbank_file, "genbank")
         GFF.write(records, out_handle, include_fasta=True)
     return {"gff_file": gff_file}
예제 #9
0
def main(gb_file,include_fasta=None):
    """Convert a GenBank file to GFF, next to the input file.

    ``include_fasta`` is an optional string; the FASTA section is requested
    only when it equals ``true``, ``yes`` or ``1`` (case-insensitive).
    """
    out_file = "%s.gff" % os.path.splitext(gb_file)[0]
    inc_fasta = (include_fasta is not None
                 and include_fasta.lower() in ("true","yes","1"))

    with open(out_file, "w") as out_handle:
        records = SeqIO.parse(gb_file, "genbank")
        GFF.write(records, out_handle, inc_fasta)
예제 #10
0
def embl2gff(dat, org, gff):
    """
    Parse an EMBL file and write the records of one organism as GFF.

    Records whose ``name`` starts with ``org`` are kept, their ``id`` is
    replaced by their ``name``, and they are written to ``gff``.
    """
    org_mirnas = []
    for mirna in SeqIO.parse(dat, "embl"):
        # keep organism specific miRNAs only
        if mirna.name.startswith(org):
            org_mirnas.append(mirna)
    for mirna in org_mirnas:
        mirna.id = mirna.name
    GFF.write(org_mirnas, gff)
예제 #11
0
def genbank_to_gff(gb_file):
    """Convert GenBank file to GFF for IGV display.

    The GFF3 file is written next to the input with a ``.gff3`` extension.
    Nothing is done when that file already exists. Records pass through
    ``_filter_features`` with a size limit of ``1e4`` before being written.
    """
    max_size = 1e4
    gff_file = "%s.gff3" % os.path.splitext(gb_file)[0]
    if os.path.exists(gff_file):
        return
    with open(gb_file) as in_handle, open(gff_file, "w") as out_handle:
        filtered = _filter_features(SeqIO.parse(in_handle, "genbank"),
                                    max_size)
        GFF.write(filtered, out_handle)
예제 #12
0
def to_GFF(args):
    """
    Convert a GenBank or EMBL file to GFF

    Biopython does not natively support GFF

    Can be useful for QUAST (Quality Assessment Tool for Genome Assemblies)

    :param args: an argparse args list; ``inFormat`` (lower-cased) selects
        the parser, ``input`` and ``output`` are the file paths
    """
    in_type = args.inFormat.lower()
    with open(args.input) as fin:
        with open(args.output, "w") as fout:
            records = SeqIO.parse(fin, in_type)
            GFF.write(records, fout)
예제 #13
0
 def t_write_seqrecord(self):
     """Write one SeqRecord and check the sequence id of its GFF line.
     """
     rec = SeqRecord(Seq("GATCGATCGATCGATCGATC"), "ID1")
     gene = SeqFeature(
         FeatureLocation(0, 20),
         type="gene",
         strand=1,
         qualifiers={"source": "prediction", "score": 10.0,
                     "other": ["Some", "annotations"], "ID": "gene1"},
     )
     rec.features = [gene]
     out_handle = StringIO.StringIO()
     GFF.write([rec], out_handle, include_fasta=True)
     # The two header lines come first; the feature is on the third line.
     gff_line = out_handle.getvalue().split("\n")[2]
     seq_id = gff_line.split("\t")[0]
     assert seq_id == "ID1"
예제 #14
0
    def t_gff3_to_gff3(self):
        """Read in and write out GFF3 without any loss of information.

        The test file is parsed, written to an in-memory handle, parsed
        again, and the features of one record are compared between the
        original and the re-read version.
        """
        recs = SeqIO.to_dict(GFF.parse(self._test_gff_file))
        out_handle = StringIO.StringIO()
        GFF.write(recs.values(), out_handle)
        wrote_handle = StringIO.StringIO(out_handle.getvalue())
        recs_two = SeqIO.to_dict(GFF.parse(wrote_handle))

        orig_rec = list(recs.values())[0]
        # Take the counterpart from the re-parsed output. Reading it from
        # ``recs`` again would compare the record with itself and the test
        # could never fail. Looking it up by id avoids relying on both
        # dictionaries iterating in the same order.
        re_rec = recs_two[orig_rec.id]
        assert len(orig_rec.features) == len(re_rec.features)
        for i, orig_f in enumerate(orig_rec.features):
            assert str(orig_f) == str(re_rec.features[i])
예제 #15
0
 def t_write_fasta(self):
     """Check the FASTA section appended to the GFF output.
     """
     seq = Seq("GATCGATCGATCGATCGATC")
     rec = SeqRecord(seq, "ID1")
     gene_quals = {"source": "prediction", "score": 10.0,
                   "other": ["Some", "annotations"], "ID": "gene1"}
     gene = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                       qualifiers=gene_quals)
     rec.features = [gene]

     out_handle = StringIO.StringIO()
     GFF.write([rec], out_handle, include_fasta=True)
     # Skip the two header lines and the single feature line.
     fasta_lines = out_handle.getvalue().split("\n")[3:]
     expected = ("##FASTA", ">ID1 <unknown description>", str(seq))
     assert fasta_lines[0] == expected[0]
     assert fasta_lines[1] == expected[1]
     assert fasta_lines[2] == expected[2]
예제 #16
0
def main():
    """Convert every ``*.gb`` file in the current directory to GFF.

    Each ``name.gb`` is parsed as GenBank and written to ``name.gff``; a
    progress line is printed after each conversion.
    """
    for j in glob.glob("*.gb"):
        # Strip only the trailing extension. Splitting on the first "."
        # would truncate names such as "sample.v2.gb" to "sample" and make
        # different inputs overwrite the same output file.
        out_file = j.rsplit(".", 1)[0] + ".gff"

        # Context managers close both handles even when parsing fails.
        with open(j) as in_handle, open(out_file, "w") as out_handle:
            GFF.write(SeqIO.parse(in_handle, "genbank"), out_handle)

        # Parenthesised form prints the same text on Python 2 and 3.
        print("converted..." + str(j))
예제 #17
0
def export(org_cn, seqs):
    """Export GFF3 (with FASTA) for an organism and re-emit it per record.

    The export text obtained from ``wa.io.write`` is parsed with
    ``GFF.parse``. Every record has its annotations cleared and its features
    sorted by start; it is written as GFF to ``args.gff`` and as FASTA to
    ``args.fasta`` when those module-level options are set. The process
    exits with status 2 when no record is found.

    :param org_cn: organism common name, used for the lookup and the export.
    :param seqs: sequences to export; when empty, all sequences are exported.
    :return: the result of ``wa.organisms.findOrganismByCn(org_cn)``.
    """
    org_data = wa.organisms.findOrganismByCn(org_cn)

    data = io.StringIO()

    kwargs = dict(
        exportType='GFF3',
        seqType='genomic',
        exportGff3Fasta=True,
        output="text",
        exportFormat="text",
        organism=org_cn,
    )

    # An empty selection means "export everything".
    have_selection = len(seqs) > 0
    exported = wa.io.write(
        exportAllSequences=not have_selection,
        sequences=seqs if have_selection else [],
        **kwargs
    )
    data.write(exported.encode('utf-8'))

    # Seek back to start
    data.seek(0)

    records = list(GFF.parse(data))
    if len(records) == 0:
        print("Could not find any sequences or annotations for this organism + reference sequence")
        sys.exit(2)

    for record in records:
        record.annotations = {}
        record.features = sorted(record.features, key=lambda x: x.location.start)
        if args.gff:
            GFF.write([record], args.gff)
        record.description = ""
        if args.fasta:
            SeqIO.write([record], args.fasta, 'fasta')

    return org_data
예제 #18
0
def gb2gff(gbname):
    """
    suppose the gb file end as prefix.gb
    write prefix.fasta and prefix.gff file out
    usage: python gb2gff.py "nigoni.gb"

    A name that does not end in ".gb" is used unchanged as the prefix.
    """
    # Drop only a trailing ".gb"; str.replace would also delete any other
    # ".gb" occurring earlier in the path.
    prefix = gbname[:-3] if gbname.endswith(".gb") else gbname

    # Context managers close the outputs even when parsing fails.
    with open(gbname) as in_handle, open(prefix + ".gff", "w") as out_gff:
        GFF.write(SeqIO.parse(in_handle, "genbank"), out_gff)
    # have to reopen the file: the parser above consumed the first handle
    with open(gbname) as in_handle, open(prefix + ".fasta", "w") as out_fasta:
        count = SeqIO.write(SeqIO.parse(in_handle, "genbank"), out_fasta, "fasta")

    print("Converted %i records" % count)
def parse_gff(id_start_end, gff3):
    """Keep only the features overlapping the requested ranges and print GFF.

    :param id_start_end: iterable of whitespace-separated lines holding a
        record id followed by two integer coordinates; blank lines are
        skipped.
    :param gff3: input handed to ``GFF.parse``.

    Every parsed record keeps the features whose start/end span overlaps at
    least one range listed for its id, and is written to standard output.
    """
    ids = {}
    for line in id_start_end:
        fields = line.split()
        if not fields:
            # A blank line carries no range.
            continue
        ids.setdefault(fields[0], []).append((int(fields[1]), int(fields[2])))

    for rec in GFF.parse(gff3):
        # NOTE(review): a record id missing from id_start_end raises
        # KeyError here, as it always did -- confirm that is intended.
        locs = ids[rec.id]
        feats = []
        for feat in rec.features:
            f_loc = (feat.location.start, feat.location.end)
            f_low, f_high = min(f_loc), max(f_loc)
            # any() keeps a feature once even if it overlaps several ranges;
            # appending per range duplicated it in the output.
            if any(f_low <= max(loc) and f_high >= min(loc) for loc in locs):
                feats.append(feat)

        rec.features = feats
        GFF.write([rec], sys.stdout)
예제 #20
0
def gb2gff(gbname):
    """
    suppose the gb file end as prefix.gb
    write prefix.fasta and prefix.gff file out
    usage: python gb2gff.py "nigoni.gb"

    A name that does not end in ".gb" is used unchanged as the prefix.
    """
    # Drop only a trailing ".gb"; str.replace would also delete any other
    # ".gb" occurring earlier in the path.
    prefix = gbname[:-3] if gbname.endswith(".gb") else gbname

    # Context managers close the outputs even when parsing fails.
    with open(gbname) as in_handle, open(prefix + ".gff", "w") as out_gff:
        GFF.write(SeqIO.parse(in_handle, "genbank"), out_gff)
    # have to reopen the file: the parser above consumed the first handle
    with open(gbname) as in_handle, \
            open(prefix + ".fasta", "w") as out_fasta:
        count = SeqIO.write(SeqIO.parse(in_handle, "genbank"), out_fasta,
                            "fasta")

    print("Converted %i records" % count)
예제 #21
0
 def t_gff2_to_gff3(self):
     """Read in GFF2 and write out as GFF3, checking three tricky lines.
     """
     recs = SeqIO.to_dict(GFF.parse(self._wormbase_file))
     out_handle = StringIO.StringIO()
     GFF.write(recs.values(), out_handle)
     wrote_handle = StringIO.StringIO(out_handle.getvalue())
     # check some tricky lines in the GFF2 file:
     # (marker, lowest accepted marker index, text required in that line)
     tricky = (
         ("Interpolated_map_position", 0, "RFLP=No"),
         ("Gene=WBGene00000138", 1, "ID=B0019.1"),
         ("translated_nucleotide_match\t12762127", 1,
          "Note=MSP:FADFSPLDVSDVNFATDDLAK"),
     )
     checks = 0
     for line in wrote_handle:
         for marker, lowest, required in tricky:
             if line.find(marker) >= lowest:
                 checks += 1
                 assert line.find(required) > 0
     assert checks == 3, "Missing check line"
예제 #22
0
def genbank2gff(inputfile):
    """
    This function transfer genbank file to gff file

    * inputfile: str. File name ends with gbk or gb

    return filename.gff
    """
    # Local import keeps this block self-contained.
    import os.path

    # splitext removes exactly the extension. It matches the former
    # fixed-width slicing for ".gb" and ".gbk" and no longer cuts into the
    # name for any other extension.
    out = os.path.splitext(inputfile)[0] + ".gff"

    # Context managers close both handles even when parsing fails.
    with open(inputfile, "r") as handle, open(out, "w") as out_handle:
        GFF.write(SeqIO.parse(handle, "genbank"), out_handle)

    return out
예제 #23
0
def write_gff_cluster(clusters,
                      header,
                      output_path,
                      sample_name='sample',
                      threads=1):
    """Write clusters as GFF entries.

    One empty-sequence record is created per entry of ``header.references``.
    ``get_feature`` is run on a thread pool for every cluster whose
    ``exclude`` flag is false; each result is a ``(tid, feature)`` pair and
    the feature is appended to the record with that index. All records are
    then written to ``output_path``.

    :param clusters: iterable of clusters exposing ``exclude``.
    :param header: object exposing ``references``.
    :param output_path: path of the GFF file to write.
    :param sample_name: forwarded to ``get_feature``.
    :param threads: number of worker threads.
    """
    with open(output_path, "w") as out_handle:
        records = OrderedDict((tid, SeqRecord(Seq(""), sn))
                              for tid, sn in enumerate(header.references))
        # The context manager shuts the pool down even when a future raises.
        with ThreadPoolExecutor(threads) as tp:
            futures = [
                tp.submit(partial(get_feature, cluster, sample_name, i))
                for i, cluster in enumerate(clusters)
                if not cluster.exclude
            ]
            # Collect in submission order so feature order is deterministic.
            for future in futures:
                tid, feature = future.result()
                records[tid].features.append(feature)
        GFF.write(records.values(), out_handle)
예제 #24
0
 def t_gff2_to_gff3(self):
     """Convert the GFF2 test file to GFF3 and verify three tricky lines.
     """
     recs = SeqIO.to_dict(GFF.parse(self._wormbase_file))
     out_handle = StringIO.StringIO()
     GFF.write(recs.values(), out_handle)
     wrote_handle = StringIO.StringIO(out_handle.getvalue())

     def has_after_start(text, needle):
         # True when the first occurrence of needle is past index 0.
         return text.find(needle) > 0

     # check some tricky lines in the GFF2 file
     checks = 0
     for line in wrote_handle:
         if line.find("Interpolated_map_position") >= 0:
             checks += 1
             assert has_after_start(line, "RFLP=No")
         if has_after_start(line, "Gene=WBGene00000138"):
             checks += 1
             assert has_after_start(line, "ID=B0019.1")
         if has_after_start(line, "translated_nucleotide_match\t12762127"):
             checks += 1
             assert has_after_start(line, "Note=MSP%3AFADFSPLDVSDVNFATDDLAK")
     assert checks == 3, "Missing check line"
예제 #25
0
def rebase(parent, child, interpro=False, protein2dna=False, map_by='ID'):
    """Replace parent features by their rebased child features and print GFF.

    For every record parsed from ``parent``, the features selected by
    ``feature_lambda`` are replaced by the child features stored under
    ``feature.id``, after their locations have been updated against the
    parent feature. Record annotations are cleared and each record is
    written to standard output.

    :param parent: input handed to ``GFF.parse``.
    :param child: input handed to ``__get_features``.
    :param interpro: forwarded to ``__get_features``; when true, the
        ``status`` and ``Target`` qualifiers are removed from every rebased
        feature.
    :param protein2dna: forwarded to ``__update_feature_location``.
    :param map_by: qualifier name given to ``feature_test_qual_value``.
    """
    # get all of the features we will be re-mapping in a dictionary, keyed by parent feature ID
    child_features = __get_features(child, interpro=interpro)

    for rec in GFF.parse(parent):
        replacement_features = []
        # Horrifically slow I believe
        for feature in feature_lambda(
                rec.features,
                # Filter features in the parent genome by those that are
                # "interesting", i.e. have results in child_features array.
                # Probably an unnecessary optimisation.
                feature_test_qual_value,
                {
                    'qualifier': map_by,
                    'attribute_list': child_features.keys(),
                },
                subfeatures=False):

            # Features which will be re-mapped.
            # NOTE(review): the lookup uses feature.id even when map_by is
            # not 'ID' -- confirm this cannot raise KeyError in that case.
            to_remap = child_features[feature.id]
            # TODO: update starts
            for x in to_remap:
                # Then update the location of the actual feature
                __update_feature_location(x, feature, protein2dna)

                if interpro:
                    for y in ('status', 'Target'):
                        # Only a missing qualifier is expected here; any
                        # other error should surface instead of being
                        # silently discarded.
                        try:
                            del x.qualifiers[y]
                        except KeyError:
                            pass

                replacement_features.append(x)
        # We do this so we don't include the original set of features that we
        # were rebasing against in our result.
        rec.features = replacement_features
        rec.annotations = {}
        GFF.write([rec], sys.stdout)
예제 #26
0
 def t_write_from_recs(self):
     """Check the GFF3 lines written for a gene holding two exon sub-features.
     """
     rec = SeqRecord(Seq("GATCGATCGATCGATCGATC"), "ID1")
     gene_quals = {"source": "prediction", "score": 10.0,
                   "other": ["Some", "annotations"], "ID": "gene1"}
     exon_quals = {"source": "prediction"}

     def exon(first, last):
         # Both exons share the same qualifier dictionary.
         return SeqFeature(FeatureLocation(first, last), type="exon",
                           strand=1, qualifiers=exon_quals)

     gene = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                       qualifiers=gene_quals)
     gene.sub_features = [exon(0, 5), exon(15, 20)]
     rec.features = [gene]

     out_handle = StringIO()
     GFF.write([rec], out_handle)
     lines = out_handle.getvalue().split("\n")
     assert lines[0] == "##gff-version 3"
     assert lines[1] == "##sequence-region ID1 1 20"
     gene_columns = lines[2].split("\t")
     print(gene_columns)
     assert lines[2].split("\t") == ['ID1', 'prediction', 'gene', '1', '20',
                                     '10.0', '+', '.',
                                     'ID=gene1;other=Some,annotations']
     assert lines[3].split("\t") == ['ID1', 'prediction', 'exon', '1', '5',
                                     '.', '+', '.', 'Parent=gene1']
예제 #27
0
def rebase(parent, child, interpro=False, protein2dna=False, map_by='ID'):
    """Replace parent features by their rebased child features and print GFF.

    For every record parsed from ``parent``, the features selected by
    ``feature_lambda`` are replaced by the child features stored under
    ``feature.id``, after their locations have been updated against the
    parent feature. Record annotations are cleared and each record is
    written to standard output.

    :param parent: input handed to ``GFF.parse``.
    :param child: input handed to ``__get_features``.
    :param interpro: forwarded to ``__get_features``; when true, the
        ``status`` and ``Target`` qualifiers are removed from every rebased
        feature.
    :param protein2dna: forwarded to ``__update_feature_location``.
    :param map_by: qualifier name given to ``feature_test_qual_value``.
    """
    # get all of the features we will be re-mapping in a dictionary, keyed by parent feature ID
    child_features = __get_features(child, interpro=interpro)

    for rec in GFF.parse(parent):
        replacement_features = []
        for feature in feature_lambda(
                rec.features,
                # Filter features in the parent genome by those that are
                # "interesting", i.e. have results in child_features array.
                # Probably an unnecessary optimisation.
                feature_test_qual_value,
                {
                    'qualifier': map_by,
                    'attribute_list': child_features.keys(),
                },
                subfeatures=False):

            # Features which will be re-mapped.
            # NOTE(review): the lookup uses feature.id even when map_by is
            # not 'ID' -- confirm this cannot raise KeyError in that case.
            to_remap = child_features[feature.id]
            # TODO: update starts
            for x in to_remap:
                # Then update the location of the actual feature
                __update_feature_location(x, feature, protein2dna)

                if interpro:
                    for y in ('status', 'Target'):
                        # Only a missing qualifier is expected here; a
                        # blanket ``except Exception`` would also hide
                        # genuine errors.
                        try:
                            del x.qualifiers[y]
                        except KeyError:
                            pass

                replacement_features.append(x)
        # We do this so we don't include the original set of features that we
        # were rebasing against in our result.
        rec.features = replacement_features
        rec.annotations = {}
        GFF.write([rec], sys.stdout)
예제 #28
0
파일: to-gff.py 프로젝트: mscook/to-gff
def to_GFF(args):
    """
    Convert a GenBank or EMBL file to GFF

    Mainly useful for QUAST (Quality Assessment Tool for Genome Assemblies)

    ``args.in_file`` and ``args.out_file`` are replaced by their
    user-expanded forms. When ``args.getfasta`` is set, the sequences are
    also written as FASTA next to the GFF output, using the ``.fa``
    extension.

    :param args: an argparse args list
    """
    args.in_file = os.path.expanduser(args.in_file)
    args.out_file = os.path.expanduser(args.out_file)
    # Flags are tested by truthiness rather than compared to True with ==.
    in_type = "embl" if args.embl else "genbank"

    with open(args.in_file) as fin, open(args.out_file, 'w') as fout:
        GFF.write(SeqIO.parse(fin, in_type), fout)

    if args.getfasta:
        # The FASTA path is derived where it is used, so it can never be
        # referenced without having been defined.
        base = os.path.dirname(args.out_file)
        fasta = os.path.splitext(os.path.basename(args.out_file))[0] + '.fa'
        fasta_out = os.path.join(base, fasta)
        with open(args.in_file) as fin, open(fasta_out, 'w') as opt_out:
            SeqIO.write(SeqIO.parse(fin, in_type), opt_out, "fasta")
예제 #29
0
def main(
    sam_filename: str = typer.Argument(...),
    input_fasta: Optional[str] = typer.Option(
        None,
        "--input_fasta",
        "-i",
        help="(Optional) input fasta. If given, coverage will be calculated.",
    ),
    source: str = typer.Option(
        ..., "--source", "-s", help="source name (ex: hg38, mm10)"
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
):
    """Convert SAM to GFF3 format using BCBio GFF."""
    sam_filename = Path(sam_filename)

    if sam_filename.suffix != (".sam"):
        raise RuntimeError("Only accepts files ending in .sam. Abort!")

    # NOTE(review): ``stem`` drops the directory part, so the output is
    # created relative to the current working directory -- confirm intended.
    prefix = sam_filename.stem
    output_gff3 = f"{prefix}.gff3"

    # Optional mapping of sequence id -> length, passed to the SAM reader.
    q_dict = None
    if input_fasta is not None:
        # Close the FASTA handle as soon as the lengths have been collected.
        with open(input_fasta) as fasta_handle:
            q_dict = {
                r.id: len(r.seq) for r in SeqIO.parse(fasta_handle, "fasta")
            }

    with open(output_gff3, "w") as f:
        recs = [
            convert_sam_rec_to_gff3_rec(r0, source)
            for r0 in GMAPSAMReader(sam_filename, True, query_len_dict=q_dict)
        ]
        # Records that could not be converted come back as None; drop them.
        BCBio_GFF.write([x for x in recs if x is not None], f)

    logger.info(f"Output written to {output_gff3}.")
def parse_gff(locations, gff3):
    """Keep only the features containing a listed base location; print GFF.

    :param locations: iterable of lines, each holding one integer base
        location; blank lines are ignored.
    :param gff3: input handed to ``GFF.parse``.

    Every parsed record keeps the features that contain at least one of the
    locations and is written to standard output.
    """
    locs = []
    for line in locations:
        # Consume lines from tabular list of base locations and convert into int list
        line = line.strip()
        if line:
            locs.append(int(line))
    # sort for speed
    locs.sort()

    for rec in GFF.parse(gff3):
        matched_features = []
        for feat in rec.features:
            for loc in locs:
                if loc in feat:
                    # base location is found within this feature's boundary;
                    # one hit is enough, so stop here instead of adding the
                    # same feature again for every further matching location
                    matched_features.append(feat)
                    break
                if loc > feat.location.end:
                    # locations are now beyond this feature, skip checking
                    break
        rec.features = matched_features
        GFF.write([rec], sys.stdout)
def check_and_dump_annotations(annotations, fasta, outfile_path):
    """Check each annotation and write the accepted ones as GFF records.

    ``annotation.check_annotation(fasta)`` yields an ``(orf, cause)`` pair.
    A falsy ``orf`` is reported on standard output together with ``cause``;
    otherwise a gene feature with a single CDS sub-feature spanning
    ``orf[0]`` to ``orf[1]`` is written to ``outfile_path``.
    """
    checked = ((ann, ann.check_annotation(fasta)) for ann in annotations)

    with open(outfile_path, 'w') as outfile:
        for annotation, (orf, cause) in checked:
            if not orf:
                print("Gene with id={0}: {1}".format(annotation.id(), cause))
                continue

            record = SeqRecord(fasta, str(annotation.id()))
            gene_qualifiers = {"source": "prediction", "score": 10.0,
                               "other": ["Some", "annotations"],
                               "ID": annotation.id()}
            gene_span = FeatureLocation(annotation.start(), annotation.end())
            gene_strand = 1 if annotation.is_forward() else -1
            gene = SeqFeature(gene_span, type='gene', strand=gene_strand,
                              qualifiers=gene_qualifiers)

            cds_qualifiers = {"source": "prediction"}
            cds_span = FeatureLocation(orf[0], orf[1])
            cds_strand = 1 if annotation.is_forward() else -1
            gene.sub_features = [SeqFeature(cds_span, type="CDS",
                                            strand=cds_strand,
                                            qualifiers=cds_qualifiers)]

            record.features = [gene]
            GFF.write([record], outfile)
예제 #32
0
 def t_write_seqrecord(self):
     """Write one SeqRecord and check the sequence id of its GFF line.
     """
     rec = SeqRecord(Seq("GATCGATCGATCGATCGATC"), "ID1")
     gene_quals = {"source": "prediction", "score": 10.0,
                   "other": ["Some", "annotations"], "ID": "gene1"}
     gene = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                       qualifiers=gene_quals)
     rec.features = [gene]

     out_handle = StringIO()
     GFF.write([rec], out_handle, include_fasta=True)
     # The two header lines come first; the feature is on the third line.
     written_lines = out_handle.getvalue().split("\n")
     first_column = written_lines[2].split("\t")[0]
     assert first_column == "ID1"
예제 #33
0
def generate_gff_from_genbank(ref_genome):
    """If this reference genome has a genbank but not a GFF, generate
    a GFF from the genbank.

    The GFF is written next to the genbank file with the extension changed
    to ``.gff`` and then registered through ``copy_and_add_dataset_source``.
    Returns ``None`` immediately when a GFF dataset already exists.

    Raises AssertionError when the reference genome has no genbank dataset.
    """

    # If a GFF already exists, then just return.
    if ref_genome.dataset_set.filter(
            type=Dataset.TYPE.REFERENCE_GENOME_GFF).exists():
        return

    # Check that a genbank exists.
    assert ref_genome.dataset_set.filter(
            type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).exists()

    # Get genbank path and filename components (for creating GFF file name).
    genbank_path = get_dataset_with_type(
            ref_genome,
            type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).get_absolute_location()

    genbank_dir, genbank_filename = os.path.split(genbank_path)
    genbank_noext = os.path.splitext(genbank_filename)[0]

    # Put the GFF file in the same dir, just change the extension to .gff.
    gff_filename = os.path.join(genbank_dir, (genbank_noext + '.gff'))

    # Get the individual records, each corresponding to a chromosome.
    genome_records = list(SeqIO.parse(genbank_path, 'genbank'))

    # SnpEFF takes the name attr, but the BioPython uses the id attr to make its
    # GFF file, so overwrite the id with the name when converting to GFF.
    # NOTE(review): the assignment below copies id -> name, the opposite of
    # what the comment above describes. Left unchanged; confirm which
    # direction is intended.
    for genome_record in genome_records:
        genome_record.name = genome_record.id

    # Close the file before it is handed on, so the GFF is fully flushed to
    # disk when copy_and_add_dataset_source reads it.
    with open(gff_filename, 'w') as gff_handle:
        GFF.write(genome_records, gff_handle)

    dataset_type = IMPORT_FORMAT_TO_DATASET_TYPE['gff']
    copy_and_add_dataset_source(ref_genome, dataset_type,
            dataset_type, gff_filename)
예제 #34
0
def generate_gff_from_genbank(ref_genome):
    """If this reference genome has a genbank but not a GFF, generate
    a GFF from the genbank.

    The GFF is written next to the genbank file with the extension changed
    to ``.gff`` and then registered through ``copy_and_add_dataset_source``.
    Returns ``None`` immediately when a GFF dataset already exists.

    Raises AssertionError when the reference genome has no genbank dataset.
    """

    # If a GFF already exists, then just return.
    if ref_genome.dataset_set.filter(
            type=Dataset.TYPE.REFERENCE_GENOME_GFF).exists():
        return

    # Check that a genbank exists.
    assert ref_genome.dataset_set.filter(
            type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).exists()

    # Get genbank path and filename components (for creating GFF file name).
    genbank_path = get_dataset_with_type(
            ref_genome,
            type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).get_absolute_location()

    genbank_dir, genbank_filename = os.path.split(genbank_path)
    genbank_noext = os.path.splitext(genbank_filename)[0]

    # Put the GFF file in the same dir, just change the extension to .gff.
    gff_filename = os.path.join(genbank_dir, (genbank_noext + '.gff'))

    # Get the individual records, each corresponding to a chromosome.
    genome_records = list(SeqIO.parse(genbank_path, 'genbank'))

    # SnpEFF takes the name attr, but the BioPython uses the id attr to make its
    # GFF file, so overwrite the id with the name when converting to GFF.
    # NOTE(review): the assignment below copies id -> name, the opposite of
    # what the comment above describes. Left unchanged; confirm which
    # direction is intended.
    for genome_record in genome_records:
        genome_record.name = genome_record.id

    # Close the file before it is handed on, so the GFF is fully flushed to
    # disk when copy_and_add_dataset_source reads it.
    with open(gff_filename, 'w') as gff_handle:
        GFF.write(genome_records, gff_handle)

    dataset_type = IMPORT_FORMAT_TO_DATASET_TYPE['gff']
    copy_and_add_dataset_source(ref_genome, dataset_type,
            dataset_type, gff_filename)
def split_into_frames(gff3):
    """Split the gene features of a GFF3 file into six per-frame GFF3 files.

    For every record parsed from ``gff3`` the ``gene`` features (sub-features
    included in the search) are bucketed by reading frame and each bucket is
    appended to ``rf1.gff3`` ... ``rf6.gff3`` in the current directory.
    Frames 1-3 are used when ``location.strand == 1``; frames 4-6 otherwise.

    Args:
        gff3: GFF3 input accepted by ``GFF.parse``.
    """
    for rec in GFF.parse(gff3):
        # One bucket per reading frame; index 0 is frame 1, index 5 is frame 6.
        # An explicit list replaces the former ``locals()['rf' + n]`` lookup,
        # which depended on implementation details of ``locals()``.
        frames = [[] for _ in range(6)]
        dummy_rec = copy.deepcopy(rec)
        dummy_rec.annotations = {}
        for gene in feature_lambda(rec.features,
                                   feature_test_type, {'types': 'gene'},
                                   subfeatures=True):
            if gene.location.strand == 1:
                frame = ((gene.location.start) % 3) + 1
            else:
                frame = (-(gene.location.start - 1) % 3) + 4
            frames[frame - 1].append(gene)

        for number, genes in enumerate(frames, start=1):
            dummy_rec.features = genes
            # Append mode: every record adds to the same six output files.
            with open('rf' + str(number) + '.gff3', 'a') as outfile:
                GFF.write([dummy_rec], outfile)
예제 #36
0
def main():
    """Command-line entry point: convert a GMAP SAM file to GFF3.

    Reads ``<name>.sam`` and writes ``<name>.gff3`` next to it. When an input
    FASTA is supplied, a mapping of sequence id to sequence length is passed
    to the SAM reader as ``query_len_dict``.

    Exits with status -1 when the input filename does not end in ``.sam``.
    """
    from argparse import ArgumentParser

    # The first positional parameter of ArgumentParser is ``prog``, not the
    # description, so the text must be passed by keyword to show up as help.
    parser = ArgumentParser(
        description="Convert SAM to GFF3 format using BCBio GFF")
    parser.add_argument("sam_filename")
    parser.add_argument(
        "-i",
        "--input_fasta",
        default=None,
        help="(Optional) input fasta. If given, coverage will be calculated.")
    parser.add_argument("-s",
                        "--source",
                        required=True,
                        help="source name (ex: hg38, mm10)")

    args = parser.parse_args()

    if not args.sam_filename.endswith('.sam'):
        # sys.stderr.write works on Python 2 and 3 alike; the former
        # ``print >> sys.stderr`` statement raises TypeError on Python 3.
        sys.stderr.write("Only accepts files ending in .sam. Abort!\n")
        sys.exit(-1)

    prefix = args.sam_filename[:-4]
    output_gff3 = prefix + '.gff3'

    q_dict = None
    if args.input_fasta is not None:
        # Close the FASTA handle as soon as the lengths have been collected.
        with open(args.input_fasta) as fasta_handle:
            q_dict = dict((r.id, len(r.seq))
                          for r in SeqIO.parse(fasta_handle, 'fasta'))

    with open(output_gff3, 'w') as f:
        recs = [
            convert_sam_rec_to_gff3_rec(r0, args.source) for r0 in
            GMAPSAMReader(args.sam_filename, True, query_len_dict=q_dict)
        ]
        # Records that converted to None are dropped before writing.
        BCBio_GFF.write([rec for rec in recs if rec is not None], f)

    sys.stderr.write("Output written to {0}.\n".format(output_gff3))
예제 #37
0
 def t_write_fasta(self):
     """Check that GFF output carries the record sequence as a FASTA section.
     """
     sequence = Seq("GATCGATCGATCGATCGATC")
     gene = SeqFeature(FeatureLocation(0, 20),
                       type="gene",
                       strand=1,
                       qualifiers={
                           "source": "prediction",
                           "score": 10.0,
                           "other": ["Some", "annotations"],
                           "ID": "gene1"
                       })
     record = SeqRecord(sequence, "ID1")
     record.features = [gene]

     buf = StringIO.StringIO()
     GFF.write([record], buf, include_fasta=True)

     # Skip the first three output lines (presumably the GFF directives and
     # the single gene line); what follows is expected to be the FASTA part.
     fasta_lines = buf.getvalue().split("\n")[3:]
     assert fasta_lines[0] == "##FASTA"
     assert fasta_lines[1] == ">ID1 <unknown description>"
     assert fasta_lines[2] == str(sequence)
예제 #38
0
def merge_interpro(gff3, interpro):
    """Copy selected InterPro qualifiers onto matching GFF3 features.

    Values of the ``Dbxref`` and ``Ontology_term`` qualifiers are gathered,
    per record id, from every feature in ``interpro``. Each feature in
    ``gff3`` (sub-features included) whose id equals one of those record ids
    gets the gathered values appended to its own qualifiers. Every ``gff3``
    record is then written to stdout with its annotations cleared.
    """
    wanted_keys = ("Dbxref", "Ontology_term")
    additions = {}

    for ipr_rec in GFF.parse(interpro):
        # Only the wanted keys are collected, so nothing has to be pruned
        # afterwards; values are de-duplicated through a set per key.
        collected = {}
        for ipr_feature in ipr_rec.features:
            for key, values in ipr_feature.qualifiers.items():
                if key in wanted_keys:
                    collected.setdefault(key, set()).update(values)
        additions[ipr_rec.id] = collected

    for rec in GFF.parse(gff3):
        all_features = feature_lambda(rec.features,
                                      feature_test_true,
                                      None,
                                      subfeatures=True)
        for feature in all_features:
            if feature.id not in additions:
                continue
            for key, values in additions[feature.id].items():
                feature.qualifiers.setdefault(key, [])
                feature.qualifiers[key] += list(values)
        rec.annotations = {}
        GFF.write([rec], sys.stdout)
예제 #39
0
def gb2gff(infile, outfile):
    """Translate a GenBank file into a GFF3 file.

    TODO: the procedure does not yet handle ``join`` locations correctly.

    Args:
        infile (str): input GenBank file
        outfile (str): output GFF3 file
    Returns:
        The value returned by ``GFF.write``.
        NOTE(review): the earlier docstring promised the number of records
        written; confirm that ``GFF.write`` actually returns a count.
    """
    # Both handles are closed on every exit path; the original never closed
    # the input handle and skipped closing the output if writing raised.
    with open(infile, 'r') as gb_handle, open(outfile, 'w') as gff_handle:
        res = GFF.write(SeqIO.parse(gb_handle, "gb"), gff_handle)
    return res
예제 #40
0
        for hit in hits:
            rec_a_hits_in_b.append(rec_b_map[hit])

    for feature in rec_b.features:
        hits = tree_a.find_range(
            (int(feature.location.start), int(feature.location.end)))
        for hit in hits:
            rec_b_hits_in_a.append(rec_a_map[hit])

    rec_a.features = set(rec_a_hits_in_b)
    rec_b.features = set(rec_b_hits_in_a)
    return rec_a, rec_b


if __name__ == '__main__':
    # Command line: two GFF3 inputs plus optional output paths for each side.
    parser = argparse.ArgumentParser(
        description='rebase gff3 features against parent locations',
        epilog="")
    parser.add_argument('a', type=argparse.FileType("r"))
    parser.add_argument('b', type=argparse.FileType("r"))
    parser.add_argument('--oa', default='a_hits_in_b.gff', type=str)
    parser.add_argument('--ob', default='b_hits_in_a.gff', type=str)
    args = parser.parse_args()

    # NOTE(review): the result is unpacked as (b, a). If intersect() returns
    # its records in (a, b) order this swaps them - confirm this is intended.
    b, a = intersect(args.a, args.b)

    # Write `a` to --oa first, then `b` to --ob, one record per file.
    for destination, record in ((args.oa, a), (args.ob, b)):
        with open(destination, 'w') as handle:
            GFF.write([record], handle)
예제 #41
0
    for record in parse_transterm(transterm_output):
        yield record


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Export corresponding sequence in genome from GFF3',
        epilog="")
    parser.add_argument('fasta', help='Fasta Genome')
    parser.add_argument('gff3', help='GFF3 File')

    parser.add_argument(
        '--min_conf',
        type=int,
        default=76,
        help='Only output terminators with confidence >= n')

    # Tuning options that are currently disabled:
    # parser.add_argument('--gc', type=float, default=-2.3, help='Score of a G-C pair')
    # parser.add_argument('--au', type=float, default=-0.9, help='Score of an A-U pair')
    # parser.add_argument('--gu', type=float, default=1.3, help='Score of a G-U pair')
    # parser.add_argument('--mm', type=float, default=3.5, help='Score of any other pair')
    # parser.add_argument('--gap', type=int, default=6, help='Score of a gap in the hairpin')
    # parser.add_argument('--max_hp_score', type=float, default=-2, help='Maximum allowable hairpin score')
    # parser.add_argument('--max_tail_score', type=float, default=-2.5, help='Maximum allowable tail score')
    # parser.add_argument('--max_len', type=int, default=59, help='Total extent of hairpin <= n NT long')
    # parser.add_argument('--min_stem', type=int, default=4, help='Stem must be n nucleotides long')
    # parser.add_argument('--max_loop', type=int, default=13, help='The loop portion can be no longer than n')
    # parser.add_argument('--min_loop', type=int, default=3, help='Loop portion of the hairpin must be at least n long')
    # parser.add_argument('--uwin_require', type=int, default=3, help='Number of "U" nucleotides in the --uwin_length long region.')
    # parser.add_argument('--loop_penalty', default='1,2,3,4,5,6,7,8,9,10,11', help='The cost of loops of various lengths can be set using --loop_penalty=f1,f2,f3,f4,f5,...fn, where f1 is the cost of a loop of length --min_loop, f2 is the cost of a loop of length --min_loop+1, as so on. If there are too few terms to cover up to max_loop, the last term is repeated.',)

    args = parser.parse_args()

    # Every parsed option is forwarded to main() as a keyword argument, along
    # with the expterm.dat file that sits in SCRIPT_PATH.
    expterm_path = os.path.join(SCRIPT_PATH, 'expterm.dat')
    cli_options = vars(args)
    for record in main(existing_expterm=expterm_path, **cli_options):
        GFF.write([record], sys.stdout)
예제 #42
0
    def write_gff(self):
        """Write the merged annotation to ``self.out_gff``.

        Genes from ``self.filtered_base_gff`` are kept unless their raw id is
        in ``self.blacklist_base`` or among the values of
        ``self.blacklist_merged``. Genes from ``self.apollo_gff`` are kept
        when their ID is a key of ``self.name_map`` and are renamed through
        it, with the old ID stored as ``Alias``. Records left without
        features are dropped. Updates the ``num_merged``, ``num_kept_base``,
        ``num_replaced_base`` and ``num_total_genes`` counters on ``self``.
        """
        recs = []

        all_merged = [j for i in self.blacklist_merged.values() for j in i]
        self.num_merged = len(all_merged)
        # Set for O(1) membership tests in the per-gene loop below; the count
        # above deliberately uses the list so duplicates are still counted.
        merged_ids = set(all_merged)

        # First write genes kept from the base annotation...
        with open(self.filtered_base_gff) as in_handle:
            for rec in GFF.parse(in_handle):
                rec.annotations = {}
                rec.seq = ""
                new_feats = []

                for f in rec.features:  # gene

                    gene_id = f.qualifiers['ID'][0]

                    # NOTE(review): if id_regex does not match, `search` is
                    # None and the next line raises AttributeError.
                    search = re.search(self.id_regex, gene_id)
                    raw_gene_id = self.id_syntax.replace('{id}', search.group(1))

                    if raw_gene_id not in self.blacklist_base and raw_gene_id not in merged_ids:
                        cleaned_f = self.clean_feature(f)
                        cleaned_f = self.guess_exons(cleaned_f)
                        cleaned_f = self.renumber_exons(cleaned_f)
                        new_feats.append(cleaned_f)
                        self.num_kept_base += 1
                        self.num_total_genes += 1
                    else:
                        self.num_replaced_base += 1

                rec.features = new_feats

                if rec.features:
                    recs.append(rec)

        # ... then write genes models from apollo
        with open(self.apollo_gff) as in_handle:
            for rec in GFF.parse(in_handle):
                rec.annotations = {}
                rec.seq = ""
                new_feats = []

                for f in rec.features:  # gene

                    gene_id = f.qualifiers['ID'][0]

                    if gene_id in self.name_map:
                        f.qualifiers['Alias'] = [gene_id]
                        f.qualifiers['ID'][0] = self.name_map[gene_id]
                        nfeat = self.clean_feature(f)
                        nfeat = self.guess_utrs(nfeat)
                        nfeat = self.renumber_exons(nfeat)
                        new_feats.append(nfeat)
                        self.num_total_genes += 1

                rec.features = new_feats

                if rec.features:
                    recs.append(rec)

        # Context managers close each handle even when parsing/writing fails.
        with open(self.out_gff, "w") as out_handle:
            GFF.write(recs, out_handle)
예제 #43
0
def main(gb_file):
    """Convert a GenBank file to GFF.

    The output path is ``gb_file`` with its extension replaced by ``.gff``.
    """
    base_name = os.path.splitext(gb_file)[0]
    out_file = "%s.gff" % base_name
    with open(out_file, "w") as out_handle:
        records = SeqIO.parse(gb_file, "genbank")
        GFF.write(records, out_handle)
예제 #44
0
# -*- coding: utf-8 -*-
# @Author: Marylette B. Roa
# @Date:   2018-03-08 20:16:26
# @Last Modified by:   Marylette B. Roa
# @Last Modified time: 2018-07-18 10:36:23

import sys

sys.path.append('/mnt/e/virtual_envs/windowsEnv/lib/python3.4/site-packages')

from Bio import SeqIO
from BCBio import GFF

# Convert the GenBank file `in_file` into the GFF file `out_file`.
in_file = "sequence.gb"
out_file = "sequence.gff"

# The context manager closes both handles even if parsing or writing raises;
# the explicit close() calls used before were skipped on any exception.
with open(in_file) as in_handle, open(out_file, "w") as out_handle:
    GFF.write(SeqIO.parse(in_handle, "genbank"), out_handle)
parser.add_argument("-o",
                    "--output_file",
                    action="store",
                    dest="output_file",
                    help="Output file with extracted_annotations")
parser.add_argument("-d",
                    "--ids_file",
                    action="store",
                    dest="ids_file",
                    help="File with ids of annotations to extract")
# The comma-separated value is split into a list; the default is ["gene"].
parser.add_argument("-t",
                    "--annotation_types",
                    action="store",
                    dest="annotation_types",
                    default=["gene"],
                    type=lambda s: s.split(","),
                    help="Comma-separated list of annotation types to extract")

args = parser.parse_args()

# Load the ids of the annotations to extract; lines starting with "#" are
# passed to IdList.read as comments.
annotation_ids = IdList()
annotation_ids.read(args.ids_file, comments_prefix="#")

# The context manager closes the output even when extraction or writing
# raises; the explicit close() used before was skipped in that case.
with open(args.output_file, "w") as out_fd:
    GFF.write(
        record_with_extracted_annotations_generator(args.input_gff,
                                                    args.annotation_types),
        out_fd)
예제 #46
0

def cigar_from_string(query, match, subject, strict_m=True):
    """Build a CIGAR string from aligned query/match/subject strings.

    The three strings are first reduced to a match line by
    ``_qms_to_matches``; an empty match line yields an empty string,
    otherwise the result of ``_matchline_to_cigar`` is returned.
    """
    matchline = _qms_to_matches(query, match, subject, strict_m=strict_m)
    if len(matchline) == 0:
        return ""
    return _matchline_to_cigar(matchline)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3',
                                     epilog='')
    # `type=open` makes argparse open the given path and pass on the handle.
    parser.add_argument('blastxml', type=open, help='Blast XML Output')
    parser.add_argument('--min_gap',
                        type=int,
                        default=3,
                        help='Maximum gap size before generating a new match_part')
    parser.add_argument('--trim',
                        action='store_true',
                        help='Trim blast hits to be only as long as the parent feature')
    parser.add_argument('--trim_end',
                        action='store_true',
                        help='Cut blast results off at end of gene')
    args = parser.parse_args()

    # Each parsed option is forwarded to blastxml2gff3 as a keyword argument
    # and the resulting records are written to stdout as GFF3.
    GFF.write(blastxml2gff3(**vars(args)), sys.stdout)
예제 #47
0
        record.features = []
        tmprec = copy.deepcopy(record)
        tmprec.annotations = {}
        tmprec.features = []
        record.features = record_features

        for feature in record.features:
            props = []
            if 'record_id' in args.keys:
                props.append(record.id)
            if 'source' in args.keys:
                props.append(feature.qualifiers['source'][0])
            if 'target' in args.keys:
                props.append(feature.qualifiers['Target'][0])

            propkey = '|'.join(map(str, props))

            if propkey not in file_handles:
                filename = args.joiner.join(props)
                path = os.path.join('out', filename + '.gff3')
                logging.info("Opening %s", path)
                file_handles[propkey] = open(path, 'a')

            tmprec.features = [feature]
            GFF.write([tmprec], file_handles[propkey])
        # SeqIO.write([record], args.fasta, 'fasta')
        # sys.exit()

    for key in file_handles:
        file_handles[key].close()
                gene_end = end

            gene = SeqFeature(FeatureLocation(gene_start, gene_end),
                              type="gene",
                              strand=strand,
                              qualifiers={
                                  'Source': 'MGA',
                                  'ID': '%s.%s' % (current_record.id, gene_id),
                              })

            gene.sub_features = [cds_feat]
            if rbs_feat is not None:
                gene.sub_features.append(rbs_feat)
            current_record.features.append(gene)
    yield current_record


if __name__ == '__main__':
    parser = argparse.ArgumentParser(epilog="",
                                     description='Convert MGA to GFF3')
    # Both positional inputs are opened for reading by argparse itself.
    parser.add_argument('mga_output',
                        help='MetaGeneAnnotator Output',
                        type=argparse.FileType("r"))
    parser.add_argument('genome',
                        help='Fasta Genome',
                        type=argparse.FileType("r"))
    args = parser.parse_args()

    # Parsed options are forwarded as keyword arguments; each record yielded
    # by mga_to_gff3 is written to stdout on its own.
    options = vars(args)
    for result in mga_to_gff3(**options):
        GFF.write([result], sys.stdout)
예제 #49
0
    def export_gff3(self, organism_id):
        """
        Export organism features as GFF3

        For each matching organism, all of its features (with their optional
        featureloc) are turned into SeqFeature objects, nested according to
        the ``part_of`` feature relationships, and written to stdout as one
        GFF3 record. Progress is reported on stderr.

        :type organism_id: int
        :param organism_id: Organism ID

        :rtype: str
        :return: An empty string (so that printing the result shows nothing)
        """
        # check if the organism exists
        res = self.session.query(self.model.organism).filter(
            self.model.organism.organism_id.in_([organism_id]))
        sys.stderr.write("Processing %s sequences\n" % res.count())

        for org in res:
            # TODO: can we do this properly?
            seq = Seq("A" * 1, IUPAC.unambiguous_dna)

            # Annotation features
            features = self.session.query(self.model.feature, self.model.featureloc) \
                .filter_by(organism_id=org.organism_id) \
                .join(self.model.featureloc, self.model.feature.feature_id == self.model.featureloc.feature_id, isouter=True)
            # Count once instead of re-running the count query for every
            # progress line.
            feature_total = features.count()
            sys.stderr.write("\tProcessing %s features\n" % feature_total)

            biopy_features = {}
            # Number of rows handled so far; stays 0 for an empty result so
            # the final progress line no longer reads "1 / 0".
            features_done = 0
            for idx, (feature, featureloc) in enumerate(features):
                features_done = idx + 1
                if idx % 5000 == 0:
                    sys.stderr.write("\t%s / %s\n" % (idx, feature_total))

                # [u'dbxref_id', u'feature_id', u'is_analysis', u'is_obsolete',
                # u'md5checksum', u'name', u'organism_id', u'residues', u'seqlen',
                # u'timeaccessioned', u'timelastmodified', u'type_id',
                # u'uniquename']
                # [u'feature_id', u'featureloc_id', u'fmax', u'fmin',
                # u'is_fmax_partial', u'is_fmin_partial', u'locgroup', u'phase',
                # u'rank', u'residue_info', u'srcfeature_id', u'strand']
                qualifiers = {
                    self.ci.get_cvterm_name(prop.type_id): prop.value
                    for prop in self.session.query(self.model.featureprop).
                    filter_by(feature_id=feature.feature_id).all()
                }

                qualifiers['ID'] = feature.uniquename

                # The outer join means featureloc may be missing; such
                # features get neither a location nor a strand.
                biopy_features[feature.feature_id] = SeqFeature(
                    location=BioFeatureLocation(featureloc.fmin,
                                                featureloc.fmax)
                    if featureloc else None,
                    id=feature.uniquename,
                    type=self.ci.get_cvterm_name(feature.type_id),
                    strand=featureloc.strand if featureloc else None,
                    qualifiers=qualifiers)
            sys.stderr.write("\t%s / %s\n" % (features_done, feature_total))

            # res = self.session.query(self.model.organism).filter(self.model.organism.organism_id.in_(organism_id))
            relationships = self.session.query(self.model.feature_relationship) \
                .filter(self.model.feature_relationship.subject_id.in_(biopy_features.keys()))
            relationship_total = relationships.count()
            sys.stderr.write("\tProcessing %s relationships\n" %
                             relationship_total)

            #  feature_relationship_id | subject_id | object_id | type_id | value | rank
            # -------------------------+------------+-----------+---------+-------+------
            #                        1 |          4 |         3 |      37 |       |    0
            #                        2 |          5 |         4 |      37 |       |    0
            #                        3 |          6 |         4 |      37 |       |    0
            #                        4 |          7 |         4 |      37 |       |    0

            # Top-level features of the record being assembled.
            features = []

            def findById(feature_list, wanted_id):
                # Depth-first search through features and their sub_features.
                for feature in feature_list:
                    if feature.id == wanted_id:
                        yield feature

                    if hasattr(feature, 'sub_features'):
                        for x in findById(feature.sub_features, wanted_id):
                            yield x

            # Now to re-parent things properly
            relationships_done = 0
            for idx, rel in enumerate(relationships):
                relationships_done = idx + 1
                if idx % 5000 == 0:
                    sys.stderr.write("\t%s / %s\n" %
                                     (idx, relationship_total))

                term = self.ci.get_cvterm_name(rel.type_id)
                if term != 'part_of':
                    sys.stderr.write(
                        "\tCannot handle non-part_of relationships (%s %s %s)\n"
                        % (rel.subject_id, term, rel.object_id))
                    continue

                # Try and find the features in features.
                child = list(
                    findById(features, biopy_features[rel.subject_id].id))
                parent = list(
                    findById(features, biopy_features[rel.object_id].id))

                assert len(child) <= 1
                assert len(parent) <= 1
                alreadyProcessedParent = False
                alreadyProcessedChild = False

                # If they aren't there, pull them from the complete set.
                if len(child) == 0:
                    child = biopy_features[rel.subject_id]
                else:
                    child = child[0]
                    alreadyProcessedChild = True

                if len(parent) == 0:
                    parent = biopy_features[rel.object_id]
                else:
                    parent = parent[0]
                    alreadyProcessedParent = True

                if not hasattr(parent, 'sub_features'):
                    parent.sub_features = []

                parent.sub_features.append(child)
                if alreadyProcessedChild and alreadyProcessedParent:
                    # Here we've seen both (they're BOTH in the list), so we need to remove
                    # child and not touch parent since we added to parent already
                    if child in features:
                        features.remove(child)
                elif alreadyProcessedChild and not alreadyProcessedParent:
                    # Here our child is already in features, so we need to remove it from
                    # the feature set, add to the parent (done) and re-place in features.
                    # The child may have been found nested below another
                    # feature rather than at the top level, in which case an
                    # unguarded remove() would raise ValueError.
                    if child in features:
                        features.remove(child)
                    features.append(parent)
                elif not alreadyProcessedChild and alreadyProcessedParent:
                    # In this case we've seen the parent before, already in list, no need to do anything
                    # features.append(parent)
                    pass
                else:
                    # Otherwise, completely new feature.
                    features.append(parent)

            sys.stderr.write("\t%s / %s\n" %
                             (relationships_done, relationship_total))

            n = org.common_name if org.common_name else 'org_%s' % org.organism_id
            record = SeqRecord(
                seq,
                id=n,
                name=n,
                description="%s %s" % (org.genus, org.species),
            )
            # NOTE(review): features built without a featureloc were given
            # location=None above; if one ends up at the top level this sort
            # key would presumably fail - confirm how those should be handled.
            record.features = sorted(features, key=lambda f: f.location.start)

            GFF.write([record], sys.stdout)

        return ""  # Return an empty string to avoid getting a "None" when printing on stdout
예제 #50
0
def shinefind(
    genbank_file,
    gff3_output=None,
    table_output=None,
    lookahead_min=5,
    lookahead_max=15,
    top_only=False,
    add=False,
):
    """Search upstream of every CDS for Shine-Dalgarno (SD) sites.

    Each GenBank record is written back to stdout in GenBank format. The SD
    features found are also written as GFF3 to ``gff3_output`` and described
    row by row in ``table_output``. A CDS without any SD gets a single row
    with ``None`` as the hit and ``-1`` as the spacing.

    Args:
        genbank_file: GenBank input accepted by ``SeqIO.parse``.
        gff3_output: Handle receiving the SD features as GFF3; skipped when
            ``None``.
        table_output: Handle receiving the tab-separated report; skipped
            when ``None``.
        lookahead_min: Passed to the SD caller as ``sd_min`` and added to the
            reported spacing.
        lookahead_max: Passed to the SD caller as ``sd_max``.
        top_only: Keep only the first SD reported for each CDS.
        add: Also append the SD features to the GenBank record itself.
    """

    def write_row(cells):
        # table_output defaults to None; only emit rows when a handle exists.
        if table_output is not None:
            table_output.write("\t".join(map(str, cells)) + "\n")

    write_row([
        "ID",
        "Name",
        "Terminus",
        "Terminus",
        "Strand",
        "Upstream Sequence",
        "SD",
        "Spacing",
    ])

    sd_finder = NaiveSDCaller()
    # Parse GenBank records
    for record in list(SeqIO.parse(genbank_file, "genbank")):
        # Sometimes you have a case where TWO CDS features have the same start. Only handle ONE.
        seen = set()
        # Shinefind's "gff3_output".
        gff3_output_record = SeqRecord(record.seq, record.id)
        # Loop over a snapshot of the features: with add=True the loop body
        # appends to record.features, which must not be iterated meanwhile.
        for feature in list(record.features):
            if feature.type != "CDS":
                continue

            seen_loc = (feature.location.start
                        if feature.strand > 0 else feature.location.end)
            if seen_loc in seen:
                continue
            seen.add(seen_loc)

            sds, start, end, seq = sd_finder.testFeatureUpstream(
                feature, record, sd_min=lookahead_min, sd_max=lookahead_max)

            feature_id = get_id(feature)
            sd_features = sd_finder.to_features(sds,
                                                feature.location.strand,
                                                start,
                                                end,
                                                feature_id=feature.id)

            human_strand = "+" if feature.location.strand == 1 else "-"

            log.debug("Found %s SDs", len(sds))

            if len(sds) == 0:
                # No SD at all: report the CDS with an empty hit. This used
                # to live in a for/else block, which also ran after every
                # loop that finished without `break` and therefore needed an
                # extra guard and a confusing debug message.
                write_row([
                    feature.id,
                    feature_id,
                    feature.location.start,
                    feature.location.end,
                    human_strand,
                    seq,
                    None,
                    -1,
                ])
                continue

            for (sd, sd_feature) in zip(sds, sd_features):
                write_row([
                    feature.id,
                    feature_id,
                    feature.location.start,
                    feature.location.end,
                    human_strand,
                    sd_finder.highlight_sd(seq, sd["start"], sd["end"]),
                    sd["hit"],
                    int(sd["spacing"]) + lookahead_min,
                ])

                if add:
                    # Append the RBS to the GenBank record as well
                    record.features.append(sd_feature)
                # Also register the feature with the separate GFF3 output
                gff3_output_record.features.append(sd_feature)

                if top_only:
                    # Only the first SD is wanted for this CDS.
                    break

        record.features = sorted(record.features,
                                 key=lambda x: x.location.start)
        SeqIO.write([record], sys.stdout, "genbank")

        # gff3_output defaults to None; only write GFF3 when a handle exists.
        if gff3_output is not None:
            gff3_output_record.features = sorted(
                gff3_output_record.features, key=lambda x: x.location.start)
            gff3_output_record.annotations = {}
            GFF.write([gff3_output_record], gff3_output)
예제 #51
0
                o_data.append(float(parts[3]))
                m_data.append(float(parts[4]))

    bigwig_store(bw_i, record.id, i_data)
    bigwig_store(bw_o, record.id, o_data)
    bigwig_store(bw_m, record.id, m_data)
    yield record


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Process TMHMM outputs in GFF3, BigWig')
    parser.add_argument('--bw_i', default='tmhmm_i.wig')
    parser.add_argument('--bw_o', default='tmhmm_o.wig')
    parser.add_argument('--bw_m', default='tmhmm_m.wig')
    args = parser.parse_args()

    # The three wiggle outputs are closed on every exit path; with the
    # explicit close() calls used before they stayed open (and possibly
    # unflushed) whenever convert() or GFF.write raised.
    with open(args.bw_i, 'w') as bw_i, \
            open(args.bw_o, 'w') as bw_o, \
            open(args.bw_m, 'w') as bw_m:
        bigwig_add_header(bw_i, 'i', name='TMHMM')
        bigwig_add_header(bw_o, 'o', name='TMHMM')
        bigwig_add_header(bw_m, 'm', name='TMHMM')

        # NOTE(review): convert() receives None as its first argument and the
        # parser defines no input option - confirm where the input comes from.
        for sequence in convert(None, bw_i, bw_o, bw_m):
            GFF.write([sequence], sys.stdout)
예제 #52
0
        if fasta:
            if len(rec.seq) == rec.seq.count("?"):
                log.error(
                    "ERROR: You have provided a fasta file but the sequence ID in the fasta file DID NOT MATCH THE GFF. THIS IS BAD."
                )

        yield rec


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Reopen a set of GFF3 annotations')
    parser.add_argument('gff3',
                        help='GFF3 annotations',
                        type=argparse.FileType("r"))
    parser.add_argument('--fasta',
                        help='Optional fasta file',
                        type=argparse.FileType("r"))
    parser.add_argument('--fasta_output',
                        default='reopened.fasta',
                        help='Optional fasta file output',
                        type=argparse.FileType("w"))
    parser.add_argument('index', help='Index to reopen genome at', type=int)
    args = parser.parse_args()

    # Every record goes to stdout as GFF3; the sequence is written to
    # --fasta_output as well, but only when an input fasta was supplied.
    reopened_records = gff_reopen(**vars(args))
    for rec in reopened_records:
        GFF.write([rec], sys.stdout)
        if not args.fasta:
            continue
        SeqIO.write([rec], args.fasta_output, 'fasta')
예제 #53
0
#!/usr/bin/env python
import sys
import argparse
from Bio import SeqIO
from BCBio import GFF

if __name__ == "__main__":
    # Command line: one GFF3 input plus two output files, all of which are
    # opened by argparse itself through FileType.
    parser = argparse.ArgumentParser(
        description=
        "Sample script to add an attribute to a feature via web services")
    parser.add_argument("data", type=argparse.FileType("r"), help="GFF3 File")
    parser.add_argument(
        "--gff",
        type=argparse.FileType("w"),
        help="Output Annotations",
        default="data.gff3",
    )
    parser.add_argument(
        "--fasta",
        type=argparse.FileType("w"),
        help="Output Sequence",
        default="data.fa",
    )
    args = parser.parse_args()

    for record in GFF.parse(args.data):
        # Write the annotations, then the sequence with its description
        # blanked out.
        GFF.write([record], args.gff)
        record.description = ""
        SeqIO.write([record], args.fasta, "fasta")
        # Exits inside the loop body, so only the first record is processed.
        sys.exit()