Example #1
    def t_gff3_to_gff3(self):
        """Read in and write out GFF3 without any loss of information.
        """
        recs = SeqIO.to_dict(GFF.parse(self._test_gff_file))
        out_handle = StringIO.StringIO()
        GFF.write(recs.values(), out_handle)
        wrote_handle = StringIO.StringIO(out_handle.getvalue())
        recs_two = SeqIO.to_dict(GFF.parse(wrote_handle))

        orig_rec = recs.values()[0]
        re_rec = recs_two.values()[0]
        assert len(orig_rec.features) == len(re_rec.features)
        for i, orig_f in enumerate(orig_rec.features):
            assert str(orig_f) == str(re_rec.features[i])
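A minimal Python 3 sketch of the same round-trip outside the test class, assuming bcbio-gff and Biopython are installed; "example.gff3" is a placeholder path:

from io import StringIO
from Bio import SeqIO
from BCBio import GFF

recs = SeqIO.to_dict(GFF.parse("example.gff3"))
out_handle = StringIO()
GFF.write(list(recs.values()), out_handle)
recs_two = SeqIO.to_dict(GFF.parse(StringIO(out_handle.getvalue())))
assert sorted(recs) == sorted(recs_two)  # same record ids survive the round-trip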
Example #2
    def not_t_full_celegans(self):
        """Test the full C elegans chromosome and GFF files.

        This is used to test GFF on large files and is not run as a standard
        test. You will need to download the files and adjust the paths
        to run this.
        """
        # read the sequence information
        seq_file = os.path.join(self._full_dir, "c_elegans.WS199.dna.fa")
        gff_file = os.path.join(self._full_dir, "c_elegans.WS199.gff3")
        seq_handle = open(seq_file)
        seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
        seq_handle.close()
        #with open(gff_file) as gff_handle:
        #    possible_limits = feature_adder.available_limits(gff_handle)
        #    pprint.pprint(possible_limits)
        rnai_types = [('Orfeome', 'PCR_product'),
                      ('GenePair_STS', 'PCR_product'),
                      ('Promoterome', 'PCR_product')]
        gene_types = [('Non_coding_transcript', 'gene'),
                      ('Coding_transcript', 'gene'),
                      ('Coding_transcript', 'mRNA'),
                      ('Coding_transcript', 'CDS')]
        limit_info = dict(gff_source_type = rnai_types + gene_types)
        for rec in GFF.parse(gff_file, seq_dict, limit_info=limit_info):
            pass
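The commented-out limit inspection above predates the current API; with recent bcbio-gff releases the same summary comes from GFFExaminer. A sketch, reusing the gff_file path defined above:

from BCBio.GFF import GFFExaminer
import pprint

examiner = GFFExaminer()
with open(gff_file) as gff_handle:
    # Summarize which gff_id / gff_source_type / gff_type limits the file offers
    pprint.pprint(examiner.available_limits(gff_handle))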
Example #3
def gene_to_TSS(gene_name):
	#Initialize variables
	transcription_start = ''
	strand = ''
	chromosome = ''
	found_gene = False

	#Open annotation file
	annotation_file = 'crispr_app/Homo_sapiens.GRCh38.84.gtf'
	limit_info = dict(
	         gff_type = ["transcript"])
	annotation_handle = open(annotation_file)

	#Parse through annotated data, searching for matching gene names
	for rec in GFF.parse(annotation_handle, limit_info=limit_info, target_lines=1):
		feature = rec.features[0]
		qualifiers = feature.qualifiers

		#Once matching gene is found, determine the transcription start site and chromosome
		if str(qualifiers['gene_name']).strip('[').strip(']').strip('\'') == gene_name:
			found_gene = True
			chromosome = rec.id
			strand = feature.strand
			if strand == 1:
				if not transcription_start:
					transcription_start = float('inf')
				transcription_start = min(int(feature.location.start), int(transcription_start))
			elif strand == -1:
				if not transcription_start:
					transcription_start = -1
				transcription_start = max(int(feature.location.end), int(transcription_start))
		elif found_gene == True:
			break
	annotation_handle.close()
	return (transcription_start, strand, chromosome)
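The strip('[').strip(']').strip('\'') chain above re-parses the printed form of a qualifier list; qualifier values are already plain lists of strings, so a small helper does the same job directly. A sketch (this helper is not part of the original code):

def first_qualifier(feature, key, default=''):
    # feature.qualifiers maps each key to a list of strings
    return feature.qualifiers.get(key, [default])[0]

With it, the comparison reads first_qualifier(feature, 'gene_name') == gene_name.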
Example #4
    def handle(self, *args, **options):
        organism, created = Organism.objects.get_or_create(
                common_name=options['organism_name'],
                taxon=options['taxon'],
                ebi_id=options['ebi_id']
        )

        for record in SeqIO.parse(options['fasta'], "fasta"):
            refseq, created = RefSeq.objects.get_or_create(
                name=record.id,
                length=len(record.seq),
                organism=organism
            )

        for rec in GFF.parse(options['gff3']):
            rs = RefSeq.objects.get(name=rec.id, organism=organism)
            for feat in rec.features:
                if feat.type != 'gene':
                    continue
                gene, created = Gene.objects.get_or_create(
                    start=feat.location.start,
                    end=feat.location.end,
                    strand=feat.location.strand,
                    refseq=rs,
                    db_object_id=feat.id,
                    db_object_symbol=feat.id
                )
Example #5
def get_gff_dict(gfffile):
    """Creates a dictionary with product information from given gff file.
    
    Returns dictionary. Dictionary key is the contig id, values are products for the contig."""
    out_dict = {}

    for rec in GFF.parse(gfffile):

        # Add features if there are any
        if len(rec.features) > 0:
            gff_info = None
                
            # Add all features
            # Features are separated by ,
            # example:
            # featuretype;product;product,featuretype;product
            # or
            # CDS;protein3;protein31,CDS;protein3
            for f in rec.features:
                if len(f.qualifiers['product']) > 0:
                    # if gff_info is None, do not add ',' separator
                    try:
                        gff_info += ",%s" % ";".join([f.type] + f.qualifiers['product'])
                    except TypeError:
                        gff_info = ";".join([f.type] + f.qualifiers['product'])

            # Test if there were any features with a product
            if gff_info == None:
                gff_info = "N/A"
        else:
            gff_info = "N/A"

        out_dict[rec.id] = gff_info

    return out_dict
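The None/TypeError dance above can be written without exception handling by collecting the per-feature strings in a list and joining once. A sketch with equivalent output (not the original code):

def products_for_record(rec):
    # One 'type;product[;product...]' entry per feature, comma-separated
    parts = [';'.join([f.type] + f.qualifiers['product'])
             for f in rec.features if f.qualifiers.get('product')]
    return ','.join(parts) if parts else 'N/A'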
Example #6
def gene_to_early_exons(gene_name, num_exons):
	#Initialize variables
	exons = {}
	exonCount = 0
	maxExons = 0
	chromosome = ''

	#Open annotation file
	annotation_file = 'crispr_app/Homo_sapiens.GRCh38.84.gtf'
	limit_info = dict(
	         gff_type = ["exon"])
	annotation_handle = open(annotation_file)

	#Parse through annotated data, searching for matching gene names & exons 
	strand = ''
	for rec in GFF.parse(annotation_handle, limit_info=limit_info, target_lines=1):
		feature = rec.features[0]
		qualifiers = feature.qualifiers

		#Once matching gene is found, determine the exon regions and chromosome
		if str(qualifiers['gene_name']).strip('[').strip(']').strip('\'') == gene_name:
			chromosome = rec.id
			strand = feature.strand

			#Get only first version of gene in annotated data
			exonNum = str(qualifiers['exon_number']).strip('[').strip(']').strip('\'')
			maxExons = max(maxExons, int(exonNum))
			exonCount += 1
			if exonCount > maxExons:
				break
			if exonCount > num_exons:
				break
			exons[exonNum] = [int(feature.location.start), int(feature.location.end), strand]
	annotation_handle.close()
	return exons, chromosome
Example #7
    def t_fasta_directive(self):
        """Parse FASTA sequence information contained in a GFF3 file.
        """
        recs = SeqIO.to_dict(GFF.parse(self._gff_file))
        assert len(recs) == 1
        test_rec = recs['chr17']
        assert str(test_rec.seq) == "GATTACAGATTACA"
Example #8
def load_gff(gff):
    """Parses a single GFF file and returns a chromosome-indexed dict for
       that file.

    Arguments
    ---------
    gff: str
        Filepath to GFF

    Returns
    -------
    dict: A dictionary representation of the GFF entries, indexed by
            chromosome ID
    """
    annotations = {}

    if gff.endswith('.gz'):
        import gzip
        from io import TextIOWrapper
        fp = TextIOWrapper(gzip.open(gff))
    else:
        fp = open(gff)

    for entry in GFF.parse(fp):
        if len(entry.features) > 0 and entry.features[0].type == 'chromosome':
            annotations[entry.id] = entry
    fp.close()

    return annotations
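On Python 3 the TextIOWrapper detour is unnecessary, since gzip.open accepts a text mode. A sketch under that assumption:

import gzip

def open_annotations(path):
    # 'rt' returns a text-mode handle, matching plain open() for GFF.parse
    return gzip.open(path, 'rt') if path.endswith('.gz') else open(path)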
Example #9
def gene_positions(genefile, include_chromosome=True, include_strand=True, coding_only=False, ignore_strange_cases=False):
    """ Return a gene_ID:(chromosome, strand, start_pos, end_pos) dictionary based on GFF input file. 
    
    The positions are 1-based, end-inclusive. 
    If include_chromosome and/or include_strand is False, the corresponding values are missing from the output tuples.

    If coding_only is True, the start/end positions are the start and end of the first and last exon (i.e. excluding the UTRs). 
     In that case, if  a gene doesn't have an mRNA with exons, or has multiple mRNAs, raise an Exception, 
      unless ignore_strange_cases is True, then just don't include it in the output.
    """
    gene_positions = {}
    with open(os.path.expanduser(genefile)) as GENEFILE:
        # if coding_only is False, only look at genes, not sub-features
        genefile_parsing_limits = {'gff_type': ['gene']} if not coding_only else {}
        for chromosome_record in GFF.parse(GENEFILE, limit_info=genefile_parsing_limits):
            for gene_record in chromosome_record.features:
                # BCBio uses 0-based and end-exclusive positions (first-third base is bases 0,1,2, i.e range 0-3) - 
                #  convert to 1-based end-inclusive (so first-third base is bases 1,2,3, i.e. range 1-3)
                if include_chromosome:      full_pos_info = (chromosome_record.id,)
                else:                       full_pos_info = ()
                if include_strand:          full_pos_info += (GFF_strands[gene_record.strand],)
                if not coding_only:
                    full_pos_info += get_feature_start_end(gene_record)
                else:
                    try:    start_end = get_gene_start_end_excluding_UTRs(gene_record)
                    except (NoRNAError, MultipleRNAError):
                        if ignore_strange_cases:    continue
                        else:                       raise
                    full_pos_info += start_end
                gene_positions[gene_record.id] = full_pos_info
    return gene_positions
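GFF_strands and get_feature_start_end are referenced but not shown. Given the 0-based, end-exclusive locations described in the comment, a plausible sketch of both, under the 1-based, end-inclusive convention the docstring states:

GFF_strands = {1: '+', -1: '-'}  # assumed strand mapping

def get_feature_start_end(feature):
    # 0-based end-exclusive (start, end) -> 1-based end-inclusive
    return (int(feature.location.start) + 1, int(feature.location.end))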
Example #10
def shortrna_regions(mirna_gff, star_csv, seq_file):
    """Return miRNA sequences with corresponding guide and star regions.
    """
    seq_index = SeqIO.index(seq_file, "fasta")
    mirna_seqs = dict()
    with open(star_csv) as in_handle:
        for name, guide, star in csv.reader(in_handle):
            mirna_seqs[name] = (guide.strip(), star.strip())

    for rec in GFF.parse(mirna_gff):
        cur_seq = str(seq_index[rec.id].seq)
        for f in rec.features:
            name = f.qualifiers["ID"][0]
            start, end = (f.location.nofuzzy_start, f.location.nofuzzy_end)
            yield (rec.id, start, end, name)
            #guide, star = mirna_seqs.get(name, ("", ""))
            for seq_name, guide, star in [(n, g, s) for n, (g, s) in
                    mirna_seqs.iteritems() if n.startswith(name)]:
                for find_seq, ext in [(guide, "guide"), (star, "star")]:
                    if find_seq:
                        if f.strand == -1:
                            find_seq = str(Seq(find_seq).reverse_complement())
                        region = cur_seq[start:end]
                        pos = region.find(find_seq)
                        if pos > -1:
                            yield (rec.id, start + pos, start + pos + len(find_seq),
                                    "%s_%s" % (seq_name, ext))
                        else:
                            print f.strand, name, ext, pos, find_seq, region
                            raise NotImplementedError
Example #11
def rebase(parent, child, interpro=False, protein2dna=False):
    child_features = __get_features(child, interpro=interpro)

    for rec in GFF.parse(parent):
        replacement_features = []
        for feature in feature_lambda(
                rec.features,
                feature_test_qual_value,
                {
                    'qualifier': 'ID',
                    'attribute_list': child_features.keys(),
                },
                subfeatures=False):

            new_subfeatures = child_features[feature.id]
            fixed_subfeatures = []
            for x in new_subfeatures:
                # Then update the location of the actual feature
                __update_feature_location(x, feature, protein2dna)

                if interpro:
                    for y in ('status', 'Target'):
                        try:
                            del x.qualifiers[y]
                        except:
                            pass

                fixed_subfeatures.append(x)
            replacement_features.extend(fixed_subfeatures)
        # We do this so we don't include the original set of features that we
        # were rebasing against in our result.
        rec.features = replacement_features
        rec.annotations = {}
        GFF.write([rec], sys.stdout)
Example #12
    def t_ensembl_nested_features(self):
        """Test nesting of features with GFF2 files using transcript_id.
        """
        rec_dict = SeqIO.to_dict(GFF.parse(self._ensembl_file))
        assert len(rec_dict["I"].features) == 2
        t_feature = rec_dict["I"].features[0]
        assert len(t_feature.sub_features) == 32
Example #13
def rebase(parent, child, interpro=False, protein2dna=False):
    child_features = __get_features(child, interpro=interpro)

    for rec in GFF.parse(parent):
        # TODO, replace with recursion in case it's matched against a
        # non-parent feature. We're cheating a bit here right now...
        replacement_features = []
        for feature in rec.features:
            if feature.id in child_features:
                new_subfeatures = child_features[feature.id]
                # TODO: update starts
                fixed_subfeatures = []
                for x in new_subfeatures:
                    # Then update the location of the actual feature
                    __update_feature_location(x, feature, protein2dna)

                    if interpro:
                        for y in ('status', 'Target'):
                            try:
                                del x.qualifiers[y]
                            except:
                                pass

                    fixed_subfeatures.append(x)
                replacement_features.extend(fixed_subfeatures)
        # We do this so we don't include the original set of features that we
        # were rebasing against in our result.
        rec.features = replacement_features
        GFF.write([rec], sys.stdout)
Example #14
def prepareSample(filter_matrix, gff_path):
	random.seed()
	candidate_list = []
	handle = open(gff_path, 'r')
	gene_count = 0
	for record in GFF.parse(handle):
		for feature in record.features:
			if feature.type == 'gene':
				locus_tag = feature.qualifiers['locus_tag'][0]
				isMatch = False
				gene_count += 1
				for key in filter_matrix:
					if key == locus_tag:
						isMatch = True
						break
				if isMatch == False:
					candidate_list.append(locus_tag)
	countToAdd = int(round(gene_count / 2)) - len(filter_matrix)
	if countToAdd > 0:
		for i in range(1, countToAdd):
			list_len = len(candidate_list)
			list_id = random.randint(0, list_len - 1)
			locus_str = candidate_list[ list_id ]
			filter_matrix[locus_str] = (0, 0)
			candidate_list.remove( locus_str )
				
	handle.close()
	return(filter_matrix)
Example #15
def read_gff_transcripts(fobj, fname="", min_exons=1, merge=0):
    
    # Setup logging
    logger = logging.getLogger('pita')
  
    if merge > 0:
        logger.warning("Merging exons not yet implemented for GFF files!")

    #limits = dict(gff_type = ["mRNA", "exon"])
    smap = {"1":"+",1:"+","-1":"-",-1:"-", None:"+"}
    transcripts = []
    for rec in GFF.parse(fobj):
        chrom = rec.id
        for feature in rec.features:
            #logger.debug("feature: {0}", feature)
            
            for gene in _gff_type_iterator(feature, ['mRNA', 'transcript', 'inferred_parent']):
                #logger.debug("Adding gene: {0}", gene)
                exons = []
                #logger.debug("subfeatures: {0}", gene.sub_features)
                for exon in [f for f in gene.sub_features if f.type == 'exon']:
                    #link[gene.id] = link.setdefault(gene.id, 0) + 1
                    start = int(exon.location.start.position)# - 1    
                    end = int(exon.location.end.position)
                    strand = smap[exon.strand]
                    exons.append([chrom, start, end, strand])
                logger.debug("%s: %s - %s exons", fname, gene.id, len(exons))
                if len(exons) >= min_exons:
                    transcripts.append([gene.id, fname, exons])

    return transcripts
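_gff_type_iterator is not shown above; a plausible recursive sketch that yields each feature, or nested sub-feature, whose type matches:

def _gff_type_iterator(feature, ftypes):
    # Yield a matching feature as-is, otherwise descend into sub_features
    if feature.type in ftypes:
        yield feature
    else:
        for sub in feature.sub_features:
            for match in _gff_type_iterator(sub, ftypes):
                yield match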
Example #16
def doWork( args ):
    panel=Panel(fig_width=900, padding = 25, grid=None, xmin=0)
    seq_length = 0
    for gff in args.gffs:
        seqrecord = GFF.parse(gff).next()
        if len(seqrecord) > seq_length:
            seq_length = len(seqrecord)
        #seqrecord = SeqIO.parse(args.infile, "genbank").next()
        cds_track = tracks.BaseTrack(sort_by = 'collapse')
        for feature in seqrecord.features:
            if feature.type == 'CDS':
                #print feature.qualifiers['product']
                if feature.qualifiers['product'][0] == 'hypothetical protein':
                    col = '#BDBDBD'
                else:
                    col = '#2B8CBE'
                feat = features.GenericSeqFeature(feature, color_by_cm=False,
                        fc=col )
                cds_track.append(feat)
            elif feature.type == 'source':
                cds_track.append(features.GenericSeqFeature(feature,
                    color_by_cm=False, alpha=0.0, fc='1.0', ec='1.0'))
            else:
                cds_track.append(features.GenericSeqFeature(feature,
                    color_by_cm=False, fc='0.0', ec='0.0'))
        panel.add_track(cds_track)
    panel.save(args.outfile, xmin=0,xmax=seq_length)
Example #17
def main(gff_file, fasta_file = None):
    # Use splitext to remove the extension of the original input file
    out_file = "%s.gb" % os.path.splitext(gff_file)[0]

    # Parser will differ slightly if fasta file is given
    if os.stat(gff_file).st_size == 0 or ((fasta_file is not None) and os.stat(fasta_file).st_size == 0):
        print "ERROR: Empty file provided or cannot stat files"
        exit(64)
    elif fasta_file is None:
        gff_iter = GFF.parse(gff_file) #Parser/generator object
    else:
        fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta", generic_dna)) # Process fasta file
        gff_iter = GFF.parse(gff_file, fasta_input) # Give fasta file to parser
    
    # One line to call all the checking function and to write in genbank format
    SeqIO.write(_check_gff(_fix_ncbi_id(gff_iter)), out_file, "genbank")
Example #18
def read_gff_file(gfffile):
    featureid_locations={}
    limits=dict(gff_type=["gene","mRNA","CDS"])
    with open(gfffile) as in_handle:
        for rec in GFF.parse(in_handle, limit_info=limits):
            for feature in rec.features:
                featureid_locations[feature.id] = rec.id
    return featureid_locations
Example #19
def parse_gff(fname, ftype='unknown'):
    from BCBio import GFF
    entries = []
    for e in GFF.parse(fname):
        for f in e.features:
            entries.append([e.id.lower(), int(f.location.start), int(f.location.end), f])
    print len(entries), "entries read from GFF file", fname
    return entries
Example #20
def main(argv):
  gtf_filename = ''
  seqfilename = ''
  feature = 'cds'
  seqtype = ''
  try:
      opts, args = getopt.getopt(argv, "hg:s:f:t:", ["gtffile=", "seqfile=", "feature=", "seqtype="])
  except getopt.GetoptError:
    print 'Type GetOrthologGroups.py -h for options'
    sys.exit(2)
  for opt, arg in opts:
    if opt == "-h":
       print 'GetOrthologGroups.py -g <gtf_file> -s <seqfile> -f <feature> -t <seqtype>'
       sys.exit()
    elif opt in ("-g", "--gtffile"):
       gtf_filename = arg
    elif opt in ("-s", "--seqfile"):
       seqfilename = arg
    elif opt in ("-f", "--feature"):
       feature = arg
    elif opt in ("-t", "--seqtype"):
       seqtype = arg
  if seqtype != "contigs" and seqtype != "consensus":
    sys.exit("seqtype must be either 'contigs' or 'consensus'")     
  seqfilehandle = open(seqfilename)
  seq_dict = SeqIO.to_dict(SeqIO.parse(seqfilehandle, "fasta"))
  seqfilehandle.close()
  gtf_filehandle = open(gtf_filename)
  for SeqRec in GFF.parse(gtf_filehandle, base_dict=seq_dict): 
    if not SeqRec.features:
      continue                                                  #Skip sequences that are not in the GFF
    #cluster_num = SeqRec.features[0]
    #print SeqRec.features[0].qualifiers['gene_id'][0]
    gene_id = SeqRec.features[0].qualifiers['gene_id'][0]
    cluster_num = gene_id.split('_')[1]
    if not cluster_num[0] =='c':                               #Skip sequences that match reference sequences that are not part of a cluster
      continue
    cluster_num = cluster_num[1:]
    if seqtype == "contigs":
      cluster_filename = path.join(path.expanduser("~"), "Bioinformatics", "Selaginella", "ContigClusters", "Cluster_" + cluster_num + ".fa")
    elif seqtype == "consensus":
      cluster_filename = path.join(path.expanduser("~"), "Bioinformatics", "Selaginella", "ConsensusCLusters", "Cluster_" + cluster_num + ".fa")     
    if feature == 'cds':
      subseq = SeqRec.features[0].extract(SeqRec)
    elif feature == 'contigs':
      if SeqRec.features[0].location.strand == -1:
        subseq = SeqRec.seq.reverse_complement()
      else:
        subseq = SeqRec.seq 
    else:
      sys.exit("feature %s not recognized" % feature)
      
    subseq.id = gene_id
    subseq.description = gene_id
    cluster_file = open(cluster_filename, "a")
    cluster_file.write(subseq.format("fasta"))
    cluster_file.close()
  gtf_filehandle.close()
Example #21
def main(infile, gff, outfile, ftype='CDS', use_phase=False, translate=False):
    ref_seq = SeqIO.to_dict(SeqIO.parse(infile, format="fasta"))
    # Parse GFF annotations.

    genome_with_features = GFF.parse(
        gff,
        base_dict=ref_seq
        )
    """ bcbio-gff codes exons, mRNA etc as subfeatures which is now
    depreciated in biopython, this code fixes that issue. """
    new_genome_with_features = list()
    for scaffold in genome_with_features:
        new_features = list()
        for feature in scaffold.features:
            gene_features = subfeatures(feature)
            new_features.extend(gene_features)
        scaffold.features = new_features
        new_genome_with_features.append(scaffold)
    """ Genome with features doesn't have scaffolds without any gff
    features. Here I update the existing records in genome with the
    new ones containing features. """
    ref_seq.update(SeqIO.to_dict(new_genome_with_features))

    sequences = list()
    for scaffold, sequence in ref_seq.items():
        for feature in sequence.features:
            if feature.type != ftype:
                continue
            start = feature.location.start
            end = feature.location.end
            try:
                phase = int(feature.qualifiers['phase'][0])
            except KeyError:
                phase = 0
            strand = feature.location.strand

            if use_phase:
                fseq = feature.extract(sequence)[phase:]
            else:
                fseq = feature.extract(sequence)

            fseq.id = feature.id
            fseq.name = feature.id

            strand = '-' if strand == -1 else '+'
            fseq.description = "{}:{}-{}[{}]".format(
                scaffold,
                start,
                end,
                strand,
                )
            if translate:
                tseq = fseq.seq.translate()
                fseq.seq = tseq
            sequences.append(fseq)

    SeqIO.write(sequences, outfile, 'fasta')
    return
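subfeatures() is referenced but not defined in this excerpt; a plausible sketch that flattens the deprecated sub_features hierarchy into one list:

def subfeatures(feature):
    # Depth-first flatten of a feature plus all nested sub_features
    feats = [feature]
    for sub in getattr(feature, 'sub_features', []):
        feats.extend(subfeatures(sub))
    return feats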
Example #22
def main(gff_file, ref_file, ofile, seq_type="CDS"):
    with open(ref_file) as in_handle:
        fasta_recs = SeqIO.to_dict(SeqIO.parse(in_handle, "fasta"))

    base, ext = os.path.splitext(gff_file)

    gff_iter = GFF.parse(gff_file, fasta_recs)
    recs = protein_recs(check_gff(gff_iter), fasta_recs, seq_type)
    SeqIO.write(recs, ofile, "fasta")
Example #23
def main(expterm, fasta, gff3):
    with open(fasta, 'r') as handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))

    # Build coords file
    with open(gff3, 'r') as handle:
        for rec in GFF.parse(handle, base_dict=seq_dict):
            with open('tmp.coords', 'w') as coords:
                for feat in rec.features:
                    if feat.type == 'gene':
                        coords.write('\t'.join([
                            feat.id,
                            str(feat.location.start + 1),
                            str(feat.location.end),
                            rec.id,
                        ]) + '\n')
            with open('tmp.fasta', 'w') as fasta_handle:
                SeqIO.write(rec, fasta_handle, 'fasta')

            cmd = ['transterm', '-p', expterm, fasta, 'tmp.coords']
            output = subprocess.check_output(cmd)
            #   TERM 1         4342 - 4366     + F    93 -11.5 -3.22878 | opp_overlap 4342, overlap 4340 4357
            ttre = re.compile(
                r'^  (?P<name>.*) (?P<start>\d+) - (?P<end>\d+)\s+'
                r'(?P<strand>[-+])\s+(?P<loc>[GFRTHNgfr]+)\s+'
                r'(?P<conf>\d+)\s+(?P<hp>[0-9.-]+)\s+(?P<tail>[0-9.-]+)'
            )

            rec.features = []
            batches = output.split('SEQUENCE ')
            for batch in batches[1:]:
                batch_lines = batch.split('\n')
                # Strip the header
                interesting = batch_lines[2:]
                unformatted = [x for x in interesting if x.startswith('  ')][0::2]
                for terminator in unformatted:
                    m = ttre.match(terminator)
                    if m:
                        start = int(m.group('start')) - 1
                        end = int(m.group('end'))
                        if m.group('strand') == '+':
                            strand = 1
                        else:
                            strand = 0

                        feature = SeqFeature(
                            FeatureLocation(start, end),
                            type="terminator",
                            strand=strand,
                            qualifiers={
                                "source": "TransTermHP_2.09",
                                "score": m.group('conf'),
                                "ID": m.group('name'),
                            }
                        )
                        rec.features.append(feature)
            yield rec
Example #24
def main( gff_file, fasta_file, outfile, oformat ):

    fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta", generic_dna))
    gff_iter = GFF.parse(gff_file, fasta_input)
    gff_iter = add_translation(gff_iter)
    if oformat in ['genbank', 'gb']:
        SeqIO.write(check_gff(fix_ncbi_id(gff_iter)), outfile, oformat)
    else:
        SeqIO.write(check_gff(gff_iter), outfile, oformat)
Example #25
def load_gff(gff_file):
	"""Returns a list of parsed gff."""
	with open(gff_file,"r") as f:
		print "Parsing file {}...".format(gff_file)
		rec = []
		for line in GFF.parse(f):
			rec.append(line)

	return rec
Example #26
def load_features(reference, feature_names=None):
    #read in appropriately whether GFF or Genbank
    #checks explicitly for GFF otherwise assumes Genbank
    if not os.path.isfile(reference):
        print("ERROR: reference sequence not found. looking for", reference)
        return None

    features = {}
    if '.gff' in reference.lower():
        #looks for 'gene' and 'locus_tag' as best for TB
        try:
            from BCBio import GFF #Package name is confusing - tell user exactly what they need!
        except ImportError:
            print("ERROR: Package BCBio.GFF not found! Please install using \'pip install bcbio-gff\' before re-running.")
            return None
        limit_info = dict( gff_type = ['gene'] )

        with open(reference) as in_handle:
            for rec in GFF.parse(in_handle, limit_info=limit_info):
                for feat in rec.features:
                    if feature_names is not None: #check both tags; user may have used either
                        if "gene" in feat.qualifiers and feat.qualifiers["gene"][0] in feature_names:
                            fname = feat.qualifiers["gene"][0]
                        elif "locus_tag" in feat.qualifiers and feat.qualifiers["locus_tag"][0] in feature_names:
                            fname = feat.qualifiers["locus_tag"][0]
                        else:
                            fname = None
                    else:
                        if "gene" in feat.qualifiers:
                            fname = feat.qualifiers["gene"][0]
                        else:
                            fname = feat.qualifiers["locus_tag"][0]
                    if fname:
                        features[fname] = feat

            if feature_names is not None:
                for fe in feature_names:
                    if fe not in features:
                        print("Couldn't find gene {} in GFF or GenBank file".format(fe))

    else:
        from Bio import SeqIO
        for feat in SeqIO.read(reference, 'genbank').features:
            if feat.type=='CDS':
                if "locus_tag" in feat.qualifiers:
                    fname = feat.qualifiers["locus_tag"][0]
                    if feature_names is None or fname in feature_names:
                        features[fname] = feat
                elif "gene" in feat.qualifiers:
                    fname = feat.qualifiers["gene"][0]
                    if feature_names is None or fname in feature_names:
                        features[fname] = feat
            elif feat.type=='source': #read 'nuc' as well for annotations - need start/end of whole!
                features['nuc'] = feat

    return features
Example #27
    def t_unescaped_semicolons(self):
        """Parse inputs with unescaped semi-colons.
        This is a band-aid to not fail rather than correct parsing, since
        the combined feature will not be maintained.
        """
        f = os.path.join(self._test_dir, "unescaped-semicolon.gff3")
        rec_dict = SeqIO.to_dict(GFF.parse(f))
        f = rec_dict['chr1'].features[0]
        assert f.qualifiers["Description"][0].startswith('osFTL6')
        assert f.qualifiers["Description"][0].endswith('protein, expressed')
Example #28
    def t_wb_cds_nested_features(self):
        """Nesting of GFF2 features with a flat CDS key value pair.
        """
        rec_dict = SeqIO.to_dict(GFF.parse(self._wb_alt_file))
        assert len(rec_dict) == 2
        features = rec_dict.values()[1].features
        assert len(features) == 1
        tfeature = features[0]
        assert tfeature.id == "cr01.sctg102.wum.2.1"
        assert len(tfeature.sub_features) == 7
Example #29
def load_gff(db, gff_file, fasta_file, fetch_taxonomy=False, taxid=None):
    from BCBio import GFF
    with open(fasta_file) as seq_handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))

    saved = []
    for rec in GFF.parse(gff_file, seq_dict ):
        saved.append(add_taxid(rec, taxid))

    db.load(saved, fetch_NCBI_taxonomy=fetch_taxonomy)
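add_taxid is referenced but not shown; a plausible sketch that stamps the taxon id where Biopython's BioSQL loader looks for it:

def add_taxid(rec, taxid):
    # BioSQL reads the taxon from annotations['ncbi_taxid'] when loading
    if taxid:
        rec.annotations['ncbi_taxid'] = taxid
    return rec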
Example #30
    def t_gff2_iteration(self):
        """Test iterated features with GFF2 files, breaking without parents.
        """
        recs = []
        for rec in GFF.parse(self._wormbase_file, target_lines=15):
            recs.append(rec)
        assert len(recs) == 4
        assert recs[0].features[0].type == 'region'
        assert recs[0].features[1].type == 'SAGE_tag'
        assert len(recs[0].features[2].sub_features) == 29
Example #31
#!/usr/bin/env python
import sys
import argparse
from BCBio import GFF
from gff3 import feature_lambda, feature_test_type

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('gff3',
                        type=argparse.FileType("r"),
                        help='GFF3 annotations')
    parser.add_argument('types',
                        type=str,
                        nargs='+',
                        help='Feature type to filter on')
    parser.add_argument('--invert', action='store_true')
    args = parser.parse_args()

    for rec in GFF.parse(args.gff3):
        rec.features = feature_lambda(
            rec.features,
            feature_test_type,
            {'types': args.types},
            invert=args.invert,
            subfeatures=False,
        )
        GFF.write([rec], sys.stdout)
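feature_lambda and feature_test_type come from a companion gff3 helper module. For readers without it, an equivalent top-level filter is short; a sketch that, like the subfeatures=False call above, ignores nesting:

def filter_by_type(features, types, invert=False):
    # Keep features whose type is in `types`; flip the test when invert=True
    return [f for f in features if (f.type in types) != invert]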
Example #32
	def parse_gff(self, inputGFF):
		'''
		get a list of contigs plus 0-indexed gene-coordinates and sense-ness of protein coding regions from a gff file.
		Only tested with prokka GFF files.
		'''
		from BCBio import GFF
		import Bio
		import re
		import warnings

		def rev_comp(string):
			string = string.upper()
			complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N' : 'N'} 
			bases = list(string) 
			bases = [complement[base] for base in bases]
			bases.reverse()
			return ''.join(bases)
 
		try:
			with open(inputGFF) as in_handle:
				_ = next(GFF.parse(in_handle))
		except:
			print ('Parsing of GFF failed. This is probably because your biopython version is too new. Try downgrading to 1.76 or older')
			sys.exit(1)

		with open(inputGFF) as in_handle:
		
			for rec in GFF.parse(in_handle):
				tmp = []
				for r in rec.features:
					if "minced" in r.qualifiers['source'][0] or "Minced" in r.qualifiers['source'][0]:
						# This catches CRISPR repeats.
						continue
					if r.sub_features:
						prodigal_bool = 'Prodigal' in r.sub_features[0].qualifiers['source'][0] or 'prodigal' in r.sub_features[0].qualifiers['source'][0]
					else:
						prodigal_bool = 'Prodigal' in r.qualifiers['source'][0] or 'prodigal' in r.qualifiers['source'][0]
					
					if prodigal_bool:
						# Prokka not only finds protein sequences, but also t-/r-RNA sequences. In order to only parse protein coding sequences,
						# I search for Prodigal/Prodigal in the source entry of the sub_features attribute.
						
						# the sub_features attribute of a seq_record object is apparently deprecated. I couldn't find any other way to access
						# the required information, though. Should probably be fixed when I can.
						indices = str(r.location).split('[')[1].split(']')[0].split(':')
						indices = [int(x) for x in indices]
						sense = str(r.location).split('(')[1].split(')')[0]
						if sense == "-":
							gene_seq = rev_comp(rec.seq[indices[0]:indices[1]])
						else:
							gene_seq = rec.seq[indices[0]:indices[1]]

						if (str(gene_seq[0:3]) == "ATG" or str(gene_seq[0:3]) == "GTG" or str(gene_seq[0:3]) == "TTG"):
							pass
						else:
							warnings.warn(str(r.id) + " doesn't start with a common start codon. Beware. Continuing.")

						if (str(gene_seq[-3:]) == "TAG" or str(gene_seq[-3:]) == "TAA" or str(gene_seq[-3:]) == "TGA"):
							pass
						else:
							warnings.warn(str(r.id) + " doesn't stop with a usual stop codon. Beware. Continuing.")
						tmp.append((indices, sense))
				
				if str(rec.id) in self.contigs:
					self.contigs[str(rec.id)].annotations.append(tmp)
				else:
					warnings.warn(str(rec.id) + " is not tracked by the BAMFile.")
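Parsing str(r.location) to recover coordinates and sense is fragile; the same values are available directly on the location object. A sketch of an equivalent extraction:

def location_bounds(feature):
    # 0-indexed [start, end) plus sense, straight from the location object
    sense = '-' if feature.location.strand == -1 else '+'
    return int(feature.location.start), int(feature.location.end), sense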
Example #33
def density_adjusted(fname, chr_sam, minlength, maxlength, path_wig, path_den,
                     path_gff):
    '''Density will be a size separated dictionary = {length : [reads at 0, reads at 1, ....]}
        this makes it easier to select a size range later for analysis'''

    GFFgen = GFF.parse(path_gff)

    # open chr aligned sam file
    f_samfile = open(chr_sam)
    samfile = csv.reader(f_samfile, delimiter='\t')

    # dictionaries to hold read counts
    density_plus = {}
    density_minus = {}
    density_plus_sizesep = {}
    density_minus_sizesep = {}

    if minlength < 0 or maxlength < 0:
        print "Error. Length input not valid."
        return (0)

    # Makes 2 sets of indices, one for all reads, and another for size separated:
    for sequence in GFFgen:
        density_plus[sequence.id] = [0 for x in range(len(sequence) + 20)]
        density_minus[sequence.id] = [0 for x in range(len(sequence) + 20)]

    for length in range(minlength, maxlength + 1):
        density_plus_sizesep[length] = [0 for x in range(len(sequence) + 20)]
        density_minus_sizesep[length] = [0 for x in range(len(sequence) + 20)]

    total_reads = 0
    mapped_reads = 0

    # Loop through the samfile.
    for read in samfile:
        if read[0][0] == '@':  # Ignore header lines.
            continue

        if read[1] == '4':  # A bowtie mismatch.
            continue

        chrom = read[2]  # chromosome identified for read in bowtie
        readid = read[0]  # read id
        startp = int(
            read[3]
        ) - 1  # start position. Need to subtract 1 since genomic sequence starts at 1,
        seq = Seq.Seq(read[9])  # sequence of the read
        length = len(seq)  # length of read

        if length < 23:
            length_shift = 24 - length
        else:
            length_shift = 0

        if chrom not in density_plus.keys():
            print "Error: Bowtie index and GFF do not match"

        total_reads += 1

        # Note that Bowtie reverse complements any sequence aligning to the reverse strand.
        # and so read[3] is the 3'-end of minus strand reads

        # Filter to get rid of reads of particular length. Or a particular strand.
        if (length < minlength or length > maxlength):
            continue

        mapped_reads += 1

        # 16 is the minus strand, 0 is the plus strand
        if (read[1] == '16'):
            start = startp - length_shift
            density_minus[chrom][start] += 1
            density_minus_sizesep[length][start] += 1

        if (read[1] == '0'):
            start = startp + length - 1 + length_shift
            density_plus[chrom][start] += 1
            density_plus_sizesep[length][start] += 1

    path_oldformat = path_den + "binary/"
    if not os.path.exists(path_oldformat):
        os.makedirs(path_oldformat)

    density_plus[sequence.id] = [
        float(i) * 1000000 / float(mapped_reads)
        for i in density_plus[sequence.id]
    ]
    density_minus[sequence.id] = [
        float(i) * 1000000 / float(mapped_reads)
        for i in density_minus[sequence.id]
    ]

    ribo_util.writebin(density_plus, path_oldformat + fname + "_plus_")
    ribo_util.makePickle(density_plus, path_den + "plus")
    ribo_util.makePickle(density_plus_sizesep, path_den + "plus_sizesep")
    ribo_util.countstowig(density_plus, path_wig + "_plus")

    ribo_util.writebin(density_minus, path_oldformat + fname + "_minus_")
    ribo_util.makePickle(density_minus, path_den + "minus")
    ribo_util.makePickle(density_minus_sizesep, path_den + "minus_sizesep")
    ribo_util.countstowig(density_minus, path_wig + "_minus")
Example #34
def find_lipoprotein(gff3_file,
                     fasta_genome,
                     lipobox_mindist=10,
                     lipobox_maxdist=60):
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_genome, "fasta"))

    CASES = [
        re.compile('^.{%s,%s}[ACGSILMFTV][^REKD][GASNL]C' %
                   (lipobox_mindist, lipobox_maxdist)),
        # re.compile('^.{%s,%s}AWAC' % (lipobox_mindist, lipobox_maxdist)),
        # Make sure to not have multiple cases that share matches, will introduce duplicate features into gff3 file
    ]

    for record in GFF.parse(gff3_file, base_dict=seq_dict):
        good_features = []

        genes = list(
            feature_lambda(record.features,
                           feature_test_type, {'type': 'gene'},
                           subfeatures=True))
        for gene in genes:
            cdss = list(
                feature_lambda(gene.sub_features,
                               feature_test_type, {'type': 'CDS'},
                               subfeatures=False))
            if len(cdss) == 0:
                continue

            # Someday this will bite me in the arse.
            cds = cdss[0]

            try:
                tmpseq = str(
                    cds.extract(record.seq).translate(table=11,
                                                      cds=True)).replace(
                                                          "*", "")
            except:
                continue

            for case in CASES:
                m = case.search(tmpseq)
                if m:
                    if cds.location.strand > 0:
                        start = cds.location.start + (3 * (m.end() - 4))
                        end = cds.location.start + (3 * m.end())
                    else:
                        start = cds.location.end - (3 * (m.end() - 4))
                        end = cds.location.end - (3 * m.end())

                    tmp = SeqFeature(FeatureLocation(
                        min(start, end),
                        max(start, end),
                        strand=cds.location.strand),
                                     type='Lipobox',
                                     qualifiers={
                                         'source': 'CPT_LipoRy',
                                         'ID': '%s.lipobox' % get_id(gene),
                                     })
                    tmp.qualifiers['sequence'] = str(
                        tmp.extract(record).seq.translate())

                    gene.sub_features.append(tmp)
                    good_features.append(gene)

        record.features = good_features
        yield [record]
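get_id is referenced but not shown; a plausible sketch that returns a usable identifier for the gene feature:

def get_id(feature):
    # Prefer an explicit ID qualifier, fall back to the parsed feature id
    return feature.qualifiers.get('ID', [feature.id])[0]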
Example #35
                full = r[2]

        genes.add(dwg.rect(insert=(10, y), size=(full, 10), fill='black'))
        y += 15

        for j, gene in enumerate(rec.features):
            # draw each gene in blue
            x = 10 + gene.location.start
            length = gene.location.end - gene.location.start
            genes.add(dwg.rect(insert=(x, y), size=(length, 10), fill='blue'))

            for sf in gene.sub_features:
                if sf.type == 'Shine_Dalgarno_sequence':
                    x = 10 + sf.location.start
                    length = sf.location.end - sf.location.start
                    genes.add(dwg.rect(insert=(x, y), size=(length, 10), fill='blue'))

            y += 15

            dwg.save()
        y += 20


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='output svgs for start sites')
    parser.add_argument('gff3', type=argparse.FileType("r"), help='gff3 file')
    args = parser.parse_args()

    records = GFF.parse(args.gff3)
    draw(records)
Example #36
    def load_gff3(
        self,
        organism,
        gff3,
        source=None,
        batch_size=1,
        test=False,
        use_name=False,
        disable_cds_recalculation=False,
        timing=False,
    ):
        """
        Load a full GFF3 into annotation track

        :type organism: str
        :param organism: Organism Common Name

        :type gff3: str
        :param gff3: GFF3 to load

        :type source: str
        :param source: URL where the input dataset can be found.

        :type batch_size: int
        :param batch_size: Size of batches before writing

        :type test: bool
        :param test: Run in dry run mode

        :type use_name: bool
        :param use_name: Use the given name instead of generating one.

        :type disable_cds_recalculation: bool
        :param disable_cds_recalculation: Disable CDS recalculation and instead use the one provided

        :type timing: bool
        :param timing: Output loading performance metrics

        :rtype: str
        :return: Loading report
        """
        organisms = self._wa.organisms.get_organisms()
        org_ids = []
        for org in organisms:
            if organism == org['commonName'] or organism == str(org['id']):
                org_ids.append(org['id'])

        if len(org_ids) == 0:
            raise Exception("Organism name or id not found [" + organism + "]")

        if len(org_ids) > 1:
            raise Exception("More than one organism found for [" + organism +
                            "].  Use an organism ID instead: " + str(org_ids))

        total_features_written = 0
        start_timer = default_timer()
        if timing:
            print(
                'Times are in seconds.  If batch-size > 1 then .(total_batch_time/avg_feature_time)'
            )

        all_processed = {'top-level': [], 'transcripts': []}
        loading_status = {}
        for rec in GFF.parse(gff3):
            self.set_sequence(organism, rec.id)
            try:
                log.info("Processing %s with features: %s" %
                         (rec.id, rec.features))
                processed = self._process_gff_entry(
                    rec,
                    source=source,
                    disable_cds_recalculation=disable_cds_recalculation,
                    use_name=use_name)
                all_processed['top-level'].extend(processed['top-level'])
                all_processed['transcripts'].extend(processed['transcripts'])
                total_features_written += 1
                written_top = self._check_write(batch_size, test,
                                                all_processed['top-level'],
                                                FeatureType.FEATURE, timing)
                written_transcripts = self._check_write(
                    batch_size, test, all_processed['transcripts'],
                    FeatureType.TRANSCRIPT, timing)

                if len(written_top):
                    all_processed['top-level'] = []
                    loading_status = {**loading_status, **written_top}
                if len(written_transcripts):
                    all_processed['transcripts'] = []
                    loading_status = {**loading_status, **written_transcripts}

            except Exception as e:
                msg = str(e)
                if '\n' in msg:
                    msg = msg[0:msg.index('\n')]
                log.error("Failed to load features from %s: %s" % (rec.id, msg))

        # Write the rest of things to write (ignore batch_size)
        written_top = self._check_write(0, test, all_processed['top-level'],
                                        FeatureType.FEATURE, timing)
        written_transcripts = self._check_write(0, test,
                                                all_processed['transcripts'],
                                                FeatureType.TRANSCRIPT, timing)

        if len(written_top):
            all_processed['top-level'] = []
            loading_status = {**loading_status, **written_top}
        if len(written_transcripts):
            all_processed['transcripts'] = []
            loading_status = {**loading_status, **written_transcripts}

        log.info("Finished loading")
        if timing:
            end_timer = default_timer()
            duration = end_timer - start_timer
            print(
                str(duration) + " seconds to write " +
                str(total_features_written) + " features")
            print("Avg write time (s) per feature: " +
                  str('{:.3f}'.format(duration / total_features_written)))

        return loading_status
Example #37
def catch_middle_stop(gff3_files, genome_assembly_file, output_dir):
    D_bad = defaultdict(bool)
    D_stop = defaultdict(int)
    D_toomanyX = defaultdict(int)
    D_gap = defaultdict(int)
    D_intron = defaultdict(int)
    for gff3_file in gff3_files:
        prefix = os.path.basename(os.path.splitext(gff3_file)[0])

        # Import genome sequence
        in_seq_handle = open(genome_assembly_file)
        seq_dict = SeqIO.to_dict(SeqIO.parse(in_seq_handle, 'fasta'))
        in_seq_handle.close()

        # Import GFF3
        in_handle = open(gff3_file)
        for rec in GFF.parse(in_handle, base_dict=seq_dict):
            gene_features = rec.features
            for gene_feature in gene_features:
                mrna_features = gene_feature.sub_features
                for mrna_feature in mrna_features:
                    mrna_sub_features = mrna_feature.sub_features
                    mrna_sub_features_s = sorted(
                        mrna_sub_features, key=lambda x: x.location.start)
                    seq_cds = []
                    coords = []
                    mrna_sub_features_s2 = []
                    for feature in mrna_sub_features_s:
                        if feature.type != 'CDS':
                            continue
                        mrna_sub_features_s2.append(feature)
                        seq_cds.append(rec.seq[feature.location.start:feature.
                                               location.end])
                        coords.append(
                            (feature.location.start, feature.location.end))

                    i = 1
                    while i < len(coords):
                        intron_start = coords[i - 1][1]
                        intron_end = coords[i][0]
                        intron_len = intron_end - intron_start
                        if intron_len < 10:
                            D_bad[(prefix, mrna_feature.id)] = True
                            D_intron[prefix] += 1
                        i += 1

                    gene_seq = reduce(operator.add, seq_cds)
                    # If strand is -, get reverse complementary sequence
                    if mrna_feature.strand == -1:
                        gene_seq = gene_seq.reverse_complement()
                        phase = mrna_sub_features_s2[-1].qualifiers['phase']
                    else:
                        phase = mrna_sub_features_s2[0].qualifiers['phase']

                    phase = int(phase[0])
                    gene_seq = gene_seq[phase:]
                    protein_seq = str(gene_seq.translate())

                    # Check protein seq has stop codon in the middle
                    protein_seq2 = re.sub('\*$', '', protein_seq)
                    count_stop = protein_seq2.count('*')
                    if count_stop > 0:
                        D_bad[(prefix, mrna_feature.id)] = True
                        D_stop[prefix] += 1

                    # Check if translation consists of more than 50% X residues
                    len_prot = len(protein_seq2)
                    len_X = protein_seq2.count('X')
                    if len_X / float(len_prot) > 0.5:
                        D_bad[(prefix, mrna_feature.id)] = True
                        D_toomanyX[prefix] += 1

                    # Check if feature begins or ends in gap
                    gene_seq2 = str(gene_seq).lower()
                    if gene_seq2.startswith('n') or gene_seq2.endswith('n'):
                        D_bad[(prefix, mrna_feature.id)] = True
                        D_gap[prefix] += 1
    outfile_stats = os.path.join(output_dir, 'bad_genes_stats.txt')
    outhandle_stats = open(outfile_stats, 'w')
    run_names = D_stop.keys()
    header_txt = '{}\t{}\n'.format('type', '\t'.join(run_names))
    outhandle_stats.write(header_txt)

    stop_list = [str(D_stop[x]) for x in run_names]
    toomanyX_list = [str(D_toomanyX[x]) for x in run_names]
    gap_list = [str(D_gap[x]) for x in run_names]
    intron_list = [str(D_intron[x]) for x in run_names]

    outhandle_stats.write('internal_stop\t{}\n'.format('\t'.join(stop_list)))
    outhandle_stats.write('start_with_gap\t{}\n'.format('\t'.join(gap_list)))
    outhandle_stats.write('toomanyX\t{}\n'.format('\t'.join(toomanyX_list)))
    outhandle_stats.write('short_intron\t{}\n'.format('\t'.join(intron_list)))
    D_bad_pickle = os.path.join(output_dir, 'D_bad.p')
    cPickle.dump(D_bad, open(D_bad_pickle, 'wb'))
Example #38
def main(gff_file, fasta_file):
    out_file = "%s.gb" % os.path.splitext(gff_file)[0]
    fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta", generic_dna))
    gff_iter = GFF.parse(gff_file, fasta_input)
    SeqIO.write(_check_gff(_fix_ncbi_id(gff_iter)), out_file, "genbank")
Example #39
def reformat(data):
    for record in GFF.parse(data):
        record.annotations = {}
        GFF.write([record], sys.stdout)
Example #40
def gff3_to_genbank(gff_file, fasta_file, transltbl):
    fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta", generic_dna))
    gff_iter = GFF.parse(gff_file, fasta_input)

    for record in gff_iter:
        yield handle_record(record, transltbl)
Example #41
def mutate(gff3, fasta, changes, customSeqs, new_id):
    # Change Language
    # - we can only accept ONE genome as an input. (TODO: support multiple?)
    # - we can only build ONE genome as an output. (TODO: support multiple?)
    # - must allow selection of various regions
    # '1,1000,+   40,100,-    custom_seq_1'
    try:
        custom_seqs = SeqIO.to_dict(SeqIO.parse(customSeqs, "fasta"))
    except:
        custom_seqs = {}
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
    # Pull the first and only record
    rec = list(GFF.parse(gff3, base_dict=seq_dict))[0]
    # Create a "clean" record
    new_record = copy.deepcopy(rec)
    new_record.id = new_id
    new_record.seq = Seq('')
    new_record.features = []
    new_record.annotations = {}
    # Process changes.
    chain = []
    for change in changes:
        if ',' in change:
            (start, end, strand) = change.split(',')
            start = int(start) - 1
            end = int(end)

            # Make any complaints
            broken_feature_start = list(feature_lambda(rec.features, feature_test_contains, {'index': start}, subfeatures=False))
            if len(broken_feature_start) > 0:
                pass
                # log.info("DANGER: Start index chosen (%s) is in the middle of a feature (%s %s). This feature will disappear from the output", start, broken_feature_start[0].id, broken_feature_start[0].location)
            broken_feature_end = list(feature_lambda(rec.features, feature_test_contains, {'index': end}, subfeatures=False))
            if len(broken_feature_end) > 0:
                pass
                # log.info("DANGER: End index chosen (%s) is in the middle of a feature (%s %s). This feature will disappear from the output", end, broken_feature_end[0].id, broken_feature_end[0].location)

            # Ok, fetch features
            if strand == '+':
                tmp_req = rec[start:end]
            else:
                tmp_req = rec[start:end].reverse_complement(
                    id=True, name=True, description=True, features=True,
                    annotations=True, letter_annotations=True, dbxrefs=True
                )

            def update_location(feature):
                feature.location._start += len(new_record)
                feature.location._end += len(new_record)

                if hasattr(feature, 'sub_features'):
                    for sf in feature.sub_features:
                        update_location(sf)

            for feature in tmp_req.features:
                update_location(feature)

            chain.append([
                rec.id,
                start + 1,
                end,
                strand,
                new_record.id,
                len(new_record) + 1,
                len(new_record) + (end - start),
                '+'
            ])

            new_record.seq += tmp_req.seq
            # NB: THIS MUST USE BIOPYTHON 1.67. 1.68 Removes access to
            # subfeatures, which means you will only get top-level features.
            new_record.features += tmp_req.features
        else:
            new_record.seq += custom_seqs[change].seq
    yield new_record, chain
Example #42
def readGFF(gff):

    genes = []
    with open(gff, 'r') as gff_file:
        for rec in GFF.parse(gff_file, limit_info=dict(gff_type = ["gene"])):
            # Accumulate gene features from every record, not just the last one
            genes.extend(rec.features)
    return genes
Example #43
db = MySQLdb.connect(host="localhost",
                     db="hg19",
                     read_default_file="~/.my.cnf")
cursor = db.cursor(MySQLdb.cursors.DictCursor)

entryNumber = args.entryNumber

#
# Read over the gff file collecting data for ID mapping.
# map the ID to what will be the displayed ID: the combination of the
# name and accession number.  For miRNAs, also record the ID of the
# pre-miRNA that it was derived from.
miRnaToPreMiRna = dict()
idToLabel = dict()
gffIter = GFF.parse(args.inputGff)
for chrom in gffIter:
    for hit in chrom.features:
        id = hit.id
        label = "%s|%s" % (hit.qualifiers["Name"][0], hit.qualifiers["ID"][0])
        idToLabel[id] = label
        if hit.type == "miRNA":
            miRnaToPreMiRna[hit.id] = hit.qualifiers["derives_from"][0]

#
# Read the bed file containing the GRCh37-lite coordinates.
# While converting each line to GAF format, replace the ID
# with the miRNA name, look up the name of the pre-miRNA that
# the miRNA is derived from, and note that in the featureInfo field.
miRnaBedFp = open(args.miRnaBed)
for line in miRnaBedFp:
Ejemplo n.º 44
0
def from_TriTrypDB(name, gff, fasta, tax, tmp_dir=None):
    genome = {x.id: x for x in sp(fasta)}
    from BCBio import GFF
    import re
    annotation = list(GFF.parse(gff, base_dict=genome))
    contig = annotation[0]

    seqCol = BioDocFactory.create_genome(name, contig, tax, Tax)
    seqCol.save()

    if not tmp_dir:
        tmp_dir = "/tmp/" + name + "/"
    mkdir(tmp_dir)
    gene_ids = {}
    with tqdm(annotation) as pbar:
        for contig in pbar:
            pbar.set_description(contig.id)
            if len(contig.seq) > 15000000:
                contig.seq = ""
            contigDoc, gene_ids2 = BioDocFactory.create_contig(
                contig,
                seqCol,
                type_map={
                    NCBI.f_mRNA: "gene",
                    "exon": "exon",
                    "gene": "gene",
                    NCBI.f_CDS: NCBI.f_CDS,
                    "rRNA": "rRNA",
                    "ncRNA": "ncRNA",
                    "tRNA": "tRNA",
                    "tmRNA": "tmRNA",
                    "snoRNA": "snoRNA",
                    "three_prime_UTR": "three_prime_UTR",
                    "five_prime_UTR": "five_prime_UTR"
                })
            gene_ids.update(gene_ids2)
            contigDoc.save()
    prots = []
    with tqdm(tritryp_protein_iter(annotation)) as pbar:
        for (protein, cds_f) in pbar:

            protDoc = Protein(seq=str(protein.seq), name=protein.id)

            if "description" in cds_f.qualifiers:
                protein_description = cds_f.qualifiers['description'][0]
            elif "Note" in cds_f.qualifiers:
                protein_description = cds_f.qualifiers['Note'][0]
            elif "product" in cds_f.qualifiers:
                protein_description = cds_f.qualifiers['product'][0]
            else:
                protein_description = ""

            protDoc.description = protein_description

            gos = []
            if "Ontology_term" in cds_f.qualifiers:
                gos = [
                    x.lower() for x in cds_f.qualifiers["Ontology_term"]
                    if "GO:" in x and (
                        x not in ["GO:0008150", "GO:0003674", "GO:0005575"])
                ]

            note = cds_f.qualifiers["Note"][0].split(
                " ")[0] if "Note" in cds_f.qualifiers else ""
            ecs = ["ec:" + note] if re.match(
                '^[0-9]+\.[0-9\-]+\.[0-9\-]+\.[0-9\-]$', note) else []
            ontologies = list(set(ecs + gos))

            protDoc.gene = [protein.id]
            protDoc.ontologies = ontologies
            protDoc.alias = [protein.id]

            if len(protDoc.seq) > 30000:
                raise Exception("No protein should be this long...")
            protDoc.gene_id = gene_ids[protein.id]
            protDoc.organism = name
            protDoc.auth = str(BioMongoDB.demo_id)
            protDoc.seq_collection_id = seqCol
            prots.append(protDoc)
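            # Flush accumulated proteins to the database in batches of 1000
            # to bound memory use.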
            if pbar.n and ((pbar.n % 1000) == 0):
                Protein.objects.insert(prots)
                prots = []
    if prots:
        Protein.objects.insert(prots)

    _common_annotations(name, tmp_dir)
Ejemplo n.º 45
0
     model_to_id = read_gene_annotation(args.gene_annotation)
 else:
     model_to_id = False
 #print model_to_id
 #exit()
 features = run(model_to_id, args.create_genes)
 '''Get gene filter if required'''
 if args.gene_filter:
     filt_genes = [x.strip("\n") for x in open(args.gene_filter)]
 else:
     filt_genes = False
 print len(features), "frameDP updates"
 in_file = args.input
 in_handle = open(in_file)
 rec_list = []
 for rec in GFF.parse(in_handle):
     #print rec
     new_features = []
     for gene in rec.features:
         if gene.id == "temp_gene_198":
             pass  #print "orig",gene
         gene = parse_id(gene.id, features, gene, model_to_id,
                         args.create_genes, filt_genes)
         if gene:
             for g in gene:
                 if g.id == "Potri.007N006900":
                     print g
                 if g.id == "temp_gene_198":
                     print g
                     exit()
         if args.create_genes and gene:
Ejemplo n.º 46
0
def main():
    """Main script body"""
    # Do a quick check of inputs
    gffs = sys.argv[1:]

    for gff in gffs:
        if not os.path.isfile(gff):
            sys.exit("Invalid input file specified: %s" % gff)

    # Open first input GFF
    if gffs[0].endswith('.gz'):
        fp = gzip.open(gffs[0])
    else:
        fp = open(gffs[0])

    # Create a list to store output entries
    combined = []

    # Get GFF header and chromosome entries from first input file (it's the
    # same for all input files)
    for line in fp:
        if line.startswith("#") or "\tchromosome\t" in line:
            combined.append(line)

    # Rewind the file handle so the GFF can be parsed from the start
    fp.seek(0)

    # Get chromosome entries for the first input file; as of TriTrypDB 29, the
    # GFF file no longer includes chromosome entries so we will use dicts
    # instead.
    chromosomes = {}

    for entry in GFF.parse(fp):
        if len(entry.features) > 0 and entry.features[0].type in [
                'chromosome', 'contig'
        ]:
            chromosomes[entry.id] = entry

    fp.close()

    # Add sites from all GFFs
    for gff in gffs:
        # Open input GFF
        if gff.endswith('.gz'):
            fp = gzip.open(gff)
        else:
            fp = open(gff)

        for entry in GFF.parse(fp):
            for feature in entry.features:
                # Add chromosome key if it doesn't already exist (needed for
                # TriTrypDB 29+)
                if entry.id not in chromosomes:
                    chromosomes[entry.id] = Chromosome()
                chromosomes[entry.id].features.append(feature)

    # combined sites
    sites = {}
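    # Nested mapping: chromosome id -> gene id -> site end coordinate -> site record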

    # Sort and combine sites and add to results
    for ch_id in chromosomes:
        print("Parsing sites for %s" % ch_id)
        chromosomes[ch_id].features.sort()

        sites[ch_id] = {}

        for site in chromosomes[ch_id].features:
            # skip chromosomes
            if site.type in ['chromosome', 'contig']:
                continue

            # site info
            gene_id = site.qualifiers['Name'].pop()
            desc = ",".join(site.qualifiers['description'])
            score = int(site.qualifiers['score'].pop())
            source = site.qualifiers['source'].pop()
            feature_type = site.type

            if gene_id not in sites[ch_id]:
                sites[ch_id][gene_id] = {}

            # new entry
            if site.location.end not in sites[ch_id][gene_id]:
                sites[ch_id][gene_id][site.location.end] = {
                    'Name': gene_id,
                    'source': source,
                    'type': feature_type,
                    'score': score,
                    'strand': site.strand,
                    'description': desc
                }
            else:
                # updating score
                sites[ch_id][gene_id][site.location.end]['score'] += score

    # Add combined rows
    for ch_id in sites:
        print("Combining sites for %s" % ch_id)
        # iterate over genes on each chromosome
        for gene_id in sites[ch_id]:
            site_counter = 1

            # iterate over sites for gene
            for loc in sorted(sites[ch_id][gene_id].keys()):
                site = sites[ch_id][gene_id][loc]

                # feature type abbreviation
                if site['type'] == 'trans_splice_site':
                    feature_type = 'sl'
                else:
                    feature_type = 'polya'

                # assign a new site id
                site_id = "%s.%s.%d" % (site['Name'], feature_type,
                                        site_counter)

                # description
                desc = "ID=%s;Name=%s;description=%s" % (site_id, site['Name'],
                                                         site['description'])

                # create output row
                strand = '+' if site['strand'] == 1 else '-'

                row = "\t".join([
                    ch_id, site['source'], site['type'],
                    str(loc),
                    str(loc),
                    str(site['score']), strand, '.', desc
                ])
                combined.append(row + "\n")
                site_counter += 1

    # write combined gff
    #gff_suffix = commonsuffix(gffs).replace('.gz', '')
    gff_suffix = ".gff"

    outfile = "".join([os.path.commonprefix(gffs), 'combined', gff_suffix])

    with open(outfile, 'w') as output:
        output.writelines(combined)

    print("Done!")
Ejemplo n.º 47
0
def GFF_to_dict(paths_in, gff_settings):
    '''Parse gff into dict:
        - feat_of_interest = what to look for in gff (protein_coding, tRNA, rRNA, etc)
        - name_qual        = qualifier for alias/gene name (Name, gene_id)
        - name_qual_alt    = alternative qualifier; if none, set as 'none'
        - biotype_qual     = qualifier for type of feature (biotype, etc)

        These values must correspond to values in the GFF'''
    '''Unload gff_settings'''

    path_out = gff_settings['path_out']
    feat_of_interest = gff_settings[
        'feat_of_interest']  #all, protein_coding, tRNA, rRNA
    name_qual = gff_settings['name_qual']
    name_qual_alt = gff_settings['name_qual_alt']
    remove_genes = gff_settings['remove_genes']
    #    aSD_seq          = gff_settings['aSD_seq']
    path_badgenes = paths_in['path_badgenes']
    '''Output path can be defined, or use 0 to set as the annotation file for my main pipeline'''

    if path_out == 0:
        path_gff_dict = paths_in['path_gff_dict']
    else:
        path_gff_dict = path_out
    '''Parse GFF using BCBio'''

    GFFgen = GFF.parse(paths_in['path_gff'])
    feat_num = 0
    '''Define data arrays: will be used as columns for pandas DataFrame'''

    gff_dict = {}
    chromosome = []
    aliaslist = []
    startlist = []
    stoplist = []
    seqlist = []
    typelist = []
    strandlist = []
    startcodon = []
    stopcodon = []
    SDaffinity = []
    G_content = []
    C_content = []
    A_content = []
    T_content = []

    aa_code, codon_code = ribo_util.get_genetic_code()
    aa_comp_dict = {}
    '''Make list of bad genes'''

    # from Gene-Wei-Li

    #    bad_genes = pd.read_csv(path_badgenes)
    #    bad_genes = bad_genes.to_dict(orient='list')
    #    bad_genes = bad_genes['GeneName']
    '''Sift through GFF for relevant information'''

    for chromosome_number in range(1, 50):
        chr = next(GFFgen, None)

        if chr is None:
            break

        for feature in chr.features:
            chromosome_id = chr.id

            if feature.sub_features == []:
                feat_num += 1
                continue

            if remove_genes == 'yes':
                '''Skip over non-CDS annotations'''

                if not feature.sub_features[0].type == feat_of_interest:
                    feat_num += 1
                    continue
                elif feature.qualifiers.has_key('pseudo') == True:
                    feat_num += 1
                    continue
                else:
                    feature_type = 'CDS'
            else:
                '''Add feat type to GFF, noting pseudogenes'''

                if feature.qualifiers.has_key('pseudo') == True:
                    feature_type = 'pseudo'
                else:
                    feature_type = feature.sub_features[0].type
            '''Get feature name'''

            if name_qual in feature.qualifiers:
                feat_name = feature.qualifiers[name_qual][0]
            elif name_qual_alt in feature.qualifiers:
                feat_name = feature.qualifiers[name_qual_alt][0]
            else:
                feat_name = 'None'
                feat_num += 1
                continue
            '''Remove feature if bad'''

            #            if remove_genes == 'yes':
            #                if feat_name in bad_genes:
            #                    feat_num+=1
            #                    continue
            #            else:
            #                if feat_name in bad_genes:
            #                    feature_type = 'bad'
            '''Get start, end, and strand position'''

            start = feature.location.start.position
            end = feature.location.end.position
            strand = feature.strand
            '''Analyze features of interest for feat information'''

            alias = feat_name
            '''Each strand is treated differently, + strand == 1'''

            if strand == 1:
                '''I save gene sequence + 50 bp from each end:
                makes it easier to analyze start and stop sequence 
                context without using whole genome sequence'''

                if start < 50:  # gene is near the beginning of the genome sequence
                    # TB GFF starts at 0; pad with Ns so there are always 50 nt upstream
                    sequence = 'N' * (50 - start)
                    sequence = sequence + chr[0:end + 50].seq  # gene sequence + 50 nt at each end
                else:
                    sequence = chr[start - 50:end + 50].seq  # gene sequence + 50 nt at each end

                strand_val = '+'
                startcodon_pos = start
                stopcodon_pos = end - 1

                if start > 200:
                    upstream_seq = chr[start - 200:start + 100].seq

            else:
                '''For the minus strand, 'end' is the start codon, 'start' is the stop codon,
                and the sequence is the reverse complement of the gene sequence.'''

                sequence_rc = chr[start - 50:end + 50].seq
                sequence = sequence_rc.reverse_complement()

                strand_val = '-'
                startcodon_pos = end - 1
                stopcodon_pos = start

                if end + 200 > len(chr.seq):
                    upstream_seq = 'none'
                else:
                    upstream_seq_rc = chr[end - 100:end + 200].seq
                    upstream_seq = upstream_seq_rc.reverse_complement()

            sequence = str(sequence)
            start_codon = sequence[50:53:1]
            stop_codon = sequence[-53:-50]
            '''get sequence from start to stop for GC analysis'''

            CDS_seq = sequence[50:-50:1]

            G, C, A, T = GC_of_CDS(CDS_seq)
            '''Calculate SD affinity'''

            #            SD_seq = sequence[30:50:1]    # analyze 20 nt upstream of start codons
            #            SD_affinity = shine_dalgarno_affinity(aSD_seq, SD_seq)
            '''Append data to lists'''

            if alias == 'trmD':
                print sequence

            chromosome.append(chromosome_id)
            typelist.append(feature_type)
            aliaslist.append(alias)
            seqlist.append(sequence)
            strandlist.append(strand_val)
            startlist.append(startcodon_pos)
            stoplist.append(stopcodon_pos)
            startcodon.append(start_codon)
            stopcodon.append(stop_codon)
            #            SDaffinity.append(SD_affinity)
            G_content.append(G)
            C_content.append(C)
            A_content.append(A)
            T_content.append(T)

            feat_num += 1
    '''Append lists to gff_dict'''
    gff_dict['Chromosome'] = chromosome
    gff_dict['Alias'] = aliaslist
    gff_dict['Strand'] = strandlist
    gff_dict['Start'] = startlist
    gff_dict['Stop'] = stoplist
    gff_dict['Sequence'] = seqlist
    gff_dict['Start_Codon'] = startcodon
    gff_dict['Stop_Codon'] = stopcodon
    gff_dict['Type'] = typelist
    #    gff_dict['SD_affinity'] = SDaffinity
    gff_dict['G_content'] = G_content
    gff_dict['C_content'] = C_content
    gff_dict['A_content'] = A_content
    gff_dict['T_content'] = T_content
    '''Pickle dict for use later'''
    ribo_util.makePickle(gff_dict, path_gff_dict)
    '''print dataframe, and save as .csv for use later'''
    ## Print GFF to check
    gff_df = pd.DataFrame(gff_dict)
    display(gff_df)
    gff_df.to_csv(path_gff_dict + '.csv')

    return
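
A minimal sketch of the gff_settings dict this function expects; the keys come
from the code above, while the values shown are illustrative assumptions:

gff_settings = {
    'path_out': 0,                 # 0 = write to paths_in['path_gff_dict']
    'feat_of_interest': 'CDS',     # sub-feature type kept when remove_genes == 'yes'
    'name_qual': 'Name',           # primary qualifier for the gene alias
    'name_qual_alt': 'locus_tag',  # fallback qualifier (illustrative)
    'remove_genes': 'yes',         # skip non-CDS and pseudogene annotations
}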
Ejemplo n.º 48
0
def SD_affinity_genome(paths_in):
    '''This function takes octamers of genomic sequence and calculates shine dalgarno affinity:
    output is a size separated dict that can be used like a density dict.'''
    # load octamers
    SD_affinity = paths_in['SD_affinity']

    affinity_list = pd.read_csv(SD_affinity)
    affinity_list = pd.Series(affinity_list.SD_affinity.values,
                              index=affinity_list.Octamer).to_dict()

    length_range = range(10, 46)
    GFFgen = GFF.parse(paths_in['path_gff'])
    chr = GFFgen.next()
    feat_num = 0

    affinity_plus = []
    affinity_minus = []
    density_plus_sizesep = {}
    density_minus_sizesep = {}

    sequence = chr.seq
    sequence_rc = sequence.reverse_complement()
    genome_size = len(sequence)

    for position in range(0, genome_size):
        if position < 8:
            motif = 'AAAAAAAA'
            motif_rc = 'AAAAAAAA'
        elif genome_size - position < 8:
            motif_rc = 'AAAAAAAA'
            motif = 'AAAAAAAA'
        else:

            motif = sequence[position - 8:position].transcribe()
            motif_rc = sequence[position:position + 8].transcribe()

            motif_rc = motif_rc.reverse_complement()

        if len(motif) == 8 and len(motif_rc) == 8:
            SD_affinity_plus = affinity_list[motif]
            SD_affinity_minus = affinity_list[motif_rc]
        else:
            SD_affinity_plus = 0.0
            SD_affinity_minus = 0.0

        # Coarse progress reporting
        if position in (100000, 500000, 1000000, 2000000):
            print position

        affinity_plus.append(SD_affinity_plus)
        affinity_minus.append(SD_affinity_minus)

    for length in length_range:
        density_plus_sizesep[length] = affinity_plus
        density_minus_sizesep[length] = affinity_minus

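    # NOTE: 'inpath' is not defined in this function; it is assumed to be a module-level path.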
    path_den = inpath + 'density/density/SD1/'
    ribo_util.makePickle(density_plus_sizesep, path_den + "plus_sizesep")
    ribo_util.makePickle(density_minus_sizesep, path_den + "minus_sizesep")

    return
Ejemplo n.º 49
0
                        help="increase verbosity")
    parser.add_argument("-q",
                        "--quiet",
                        action="count",
                        default=0,
                        help="decrease verbosity")

    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                        level=(5 - args.verbose + args.quiet) * 10,
                        datefmt="%H:%M:%S")

    try:
        from BCBio import GFF
        for record in GFF.parse(args.gff_file):
            break

        for gff_feature in record.features:
            print gff_feature
            print "_" * 80
            feature = Feature(gff_feature,
                              args.translation_file,
                              1,
                              feature_definition_dir="features",
                              qualifier_definition_dir="qualifiers")
            print "_" * 80
            print feature
            break
    except Exception as e:
        import traceback
Ejemplo n.º 50
0
#!/usr/bin/env python
import logging
import sys

from BCBio import GFF

logging.basicConfig(level=logging.INFO)
log = logging.getLogger()

if __name__ == "__main__":
    attr = sys.argv[2]

    for record in GFF.parse(sys.argv[1]):
        if len(record.features) == 0:
            continue

        for feature in sorted(record.features, key=lambda x: x.location.start):
            # chrom chromStart chromEnd
            # name score strand
            # thickStart thickEnd itemRgb

            kv = {
                "strand":
                0 if not feature.location.strand else feature.location.strand,
                "value": feature.qualifiers.get("score", [0])[0],
            }

            if attr not in feature.qualifiers:
                continue

            name = feature.qualifiers[attr][0]
Ejemplo n.º 51
0
def extract_pos(gff, filter=None):
    """
    retrieves information from gff files, also is able to filter on certain genes.
    :param gff:
    :param filter:
    :return:
    """
    print("Processing gencode annotation..")
    in_handle = open(gff)
    limit_info = dict(gff_type=['exon', 'CDS'])
    if filter:
        print('Filtering on: ', filter)
        if filter[0].startswith('#'):
            chrfilter = filter[0].split()[1].strip().split(',')
            limit_info['gff_id'] = ['chr' + (str(x)) for x in chrfilter]
            filter = filter[1::]
    genlist = set()
    for rec in GFF.parse(in_handle, limit_info=limit_info):
        for feature in rec.features:
            if feature.type == 'inferred_parent':
                for f in feature.sub_features:
                    if containsTR(f.qualifiers['gene_name'][0], filter):
                        if f.type == 'CDS' and 'chr' in rec.id:
                            if containCDSGENE(genlist,
                                              f.qualifiers['gene_name'][0],
                                              'gene'):
                                genlist = discarditem(
                                    genlist, f.qualifiers['gene_name'][0])
                            genlist.add((f.qualifiers['gene_name'][0],
                                         (f.location.start + 1, f.location.end,
                                          f.location.strand), f.type, rec.id,
                                         f.qualifiers['exon_number'][0]))
                            entries = [
                                genname for genname in genlist
                                if genname[0] == f.qualifiers['gene_name'][0]
                            ]
                            for entry in entries:
                                if entry[2] == 'exon':
                                    genlist.remove(entry)
                        elif f.type == 'exon' and 'chr' in rec.id:
                            if not containCDSGENE(genlist,
                                                  f.qualifiers['gene_name'][0],
                                                  'CDS'):
                                genlist.add(
                                    (f.qualifiers['gene_name'][0],
                                     (f.location.start + 1, f.location.end,
                                      f.location.strand), f.type, rec.id,
                                     f.qualifiers['exon_number'][0]))
            else:
                if containsTR(feature.qualifiers['gene_name'][0], filter):
                    try:
                        ex_number = feature.qualifiers['exon_number'][0]
                    except (KeyError, IndexError):
                        ex_number = 1
                    if feature.type == 'CDS' and 'chr' in rec.id:
                        if containCDSGENE(genlist,
                                          feature.qualifiers['gene_name'][0],
                                          'gene'):
                            genlist = discarditem(
                                genlist, feature.qualifiers['gene_name'][0])
                        genlist.add(
                            (feature.qualifiers['gene_name'][0],
                             (feature.location.start + 1, feature.location.end,
                              feature.location.strand), feature.type, rec.id,
                             ex_number))
                        entries = [
                            genname for genname in genlist
                            if genname[0] == feature.qualifiers['gene_name'][0]
                        ]
                        for entry in entries:
                            if entry[2] == 'exon':
                                genlist.remove(entry)
                    elif feature.type == 'exon' and 'chr' in rec.id:
                        if not containCDSGENE(
                                genlist, feature.qualifiers['gene_name'][0],
                                'CDS'):
                            genlist.add((feature.qualifiers['gene_name'][0],
                                         (feature.location.start + 1,
                                          feature.location.end,
                                          feature.location.strand),
                                         feature.type, rec.id, ex_number))
        #for item in genlist:
        #    if "chr" not in item[3]:
        #        genlist.remove(item)
    return sorted(genlist)
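
A hedged usage sketch; the file name and gene filter are illustrative, with the
filter format taken from the parsing logic above:

positions = extract_pos('gencode.annotation.gff3',
                        filter=['# 7,14', 'TRBC1', 'TRAC'])
for name, (start, end, strand), ftype, chrom, exon in positions:
    print("%s %s %d-%d" % (name, chrom, start, end))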
Ejemplo n.º 52
0
def load_features(reference, feature_names=None):
    #read in appropriately whether GFF or Genbank
    #checks explicitly for GFF otherwise assumes Genbank
    if not os.path.isfile(reference):
        print("ERROR: reference sequence not found. looking for", reference)
        return None

    features = {}
    if '.gff' in reference.lower():
        #looks for 'gene' and 'locus_tag' qualifiers, as best for TB
        try:
            from BCBio import GFF  #Package name is confusing - tell user exactly what they need!
        except ImportError:
            print(
                "ERROR: Package BCBio.GFF not found! Please install using \'pip install bcbio-gff\' before re-running."
            )
            return None
        limit_info = dict(gff_type=['gene'])

        with open(reference) as in_handle:
            for rec in GFF.parse(in_handle, limit_info=limit_info):
                for feat in rec.features:
                    if feature_names is not None:  #check both tags; user may have used either
                        if "gene" in feat.qualifiers and feat.qualifiers[
                                "gene"][0] in feature_names:
                            fname = feat.qualifiers["gene"][0]
                        elif "locus_tag" in feat.qualifiers and feat.qualifiers[
                                "locus_tag"][0] in feature_names:
                            fname = feat.qualifiers["locus_tag"][0]
                        else:
                            fname = None
                    else:
                        if "gene" in feat.qualifiers:
                            fname = feat.qualifiers["gene"][0]
                        else:
                            fname = feat.qualifiers["locus_tag"][0]
                    if fname:
                        features[fname] = feat

            if feature_names is not None:
                for fe in feature_names:
                    if fe not in features:
                        print("Couldn't find gene {} in GFF or GenBank file".
                              format(fe))

    else:
        from Bio import SeqIO
        for feat in SeqIO.read(reference, 'genbank').features:
            if feat.type == 'CDS':
                if "locus_tag" in feat.qualifiers:
                    fname = feat.qualifiers["locus_tag"][0]
                    if feature_names is None or fname in feature_names:
                        features[fname] = feat
                elif "gene" in feat.qualifiers:
                    fname = feat.qualifiers["gene"][0]
                    if feature_names is None or fname in feature_names:
                        features[fname] = feat
            elif feat.type == 'source':  #read 'nuc' as well for annotations - need start/end of whole!
                features['nuc'] = feat

    return features
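
A minimal usage sketch (the reference path and gene names are hypothetical):

features = load_features("reference.gff3", feature_names=["dnaA", "gyrB"])
if features:
    for name, feat in features.items():
        print("%s: %s" % (name, feat.location))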
Ejemplo n.º 53
0
logging.basicConfig(level=logging.INFO)
log = logging.getLogger()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Renumber genbank files")
    parser.add_argument("data", type=argparse.FileType("r"), help="Annotations")
    parser.add_argument("--filetype", type=str, help="Filetype")
    parser.add_argument("--new_name", type=str, help="New Name")

    args = parser.parse_args()

    # Iterate over our input data
    if args.filetype == "gff3":
        it = GFF.parse(args.data)
    else:
        it = SeqIO.parse(args.data, args.filetype)

    for idx, record in enumerate(it):
        # Can only handle a single sequence
        if idx > 0:
            raise Exception("Too many sequences")

        # Update name
        record.id = args.new_name
        record.name = args.new_name

        # Output according to type
        if args.filetype == "gff3":
            GFF.write([record], sys.stdout)
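
Assuming the script above is saved as renumber.py (a hypothetical name), a
typical invocation would be:

    python renumber.py annotations.gff3 --filetype gff3 --new_name chr1 > renamed.gff3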
Ejemplo n.º 54
0
#NODE_10 		1165936 1166465

# internal
# ['Reference', ['2434777', '2434933', 'ancestral']],
# ['2016-17702', ['2483546', '2485711', 'ancestral']],
# ['2016-17705', ['2483546', '2485711', 'ancestral']],
# ['2016-11776', ['2483546', '2485711', 'ancestral']],
# ['2016-11778', ['2483546', '2485711', 'ancestral']],
# ['2016-11777', ['2483546', '2485711', 'ancestral']],
# ['2016-17701', ['2483546', '2485711', 'ancestral']],
# ['Reference', ['2483546', '2485711', 'ancestral']]]
 
if args.gubbins:
	msg('Parsing Gubbins file: {}'.format(coordfn))
	with open(coordfn) as gff:
		for rec in GFF.parse(gff):
			for f in rec.features:
				taxa = f.qualifiers['taxa'][0].strip().split()
				state = ('ancestral', 'extant')[ len(taxa)==1 ]
				for taxon in taxa:
					seqLIST.append( [ taxon, [ int(f.location.start), int(f.location.end), state ] ] )
else:
	msg('Parsing ClonalFrameML file: {}'.format(coordfn))
	with open(coordfn) as csvfile:
		RCseqs = csv.reader(csvfile, delimiter='\t')
		next(csvfile)
		for row in RCseqs:
			seq = row[0]
			RCstart = row[1]
			RCstop = row[2]
			node = t.search_nodes(name=seq)[0]
Ejemplo n.º 55
0
def armar_grafo_completo():
    regiones = list(
        GFF.parse("/data/organismos/ILEX_PARA2/regulation/ncbi_IP4.gff3.reg"))
    scaffolds_dict = defaultdict(lambda: [])
    for c in regiones:
        for f in c.features:
            motifs = [sf.qualifiers["description"][0] for sf in f.sub_features]
            motif = motifs[0]
            if len(set(motifs)) > 1:
                print set(motifs)
            scaffolds_dict[c.id].append({
                "start": int(f.location.start),
                "end": int(f.location.end),
                "strand": f.location.strand,
                "tf": f.qualifiers["TF_id"][0],
                "motif": motif,
                "count": len(f.sub_features)
            })
    ann = list(GFF.parse("/data/organismos/ILEX_PARA2/ncbi_IP4.gff3.whole"))
    tf_gene = []
    # dist = defaultdict(lambda: {})
    contig_map = {}
    for contig in ann:
        for f in contig.features:
            if f.type == "gene" and f.sub_features:
                contig_map[f.sub_features[0].id] = contig.id

    for contig in ann:
        if contig.id in scaffolds_dict:
            # for f1 in contig.features:
            #     for f2 in contig.features:
            #         if (f1.type == "gene" and f1.sub_features) and (f2.type == "gene" and f2.sub_features):
            #             dist[f1.sub_features[0].id][f2.sub_features[0].id] = abs(f1.location.start - f2.location.start)

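            # For each regulatory region, pick the nearest gene downstream of it
            # in the direction of transcription.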
            for reg in scaffolds_dict[contig.id]:
                if reg["strand"] == 1:
                    genes = [
                        x for x in contig.features
                        if int(x.location.start + 100) >= reg["end"]
                        if x.type == "gene" and x.sub_features
                    ]
                    if not genes:
                        print([contig.id, reg["end"]])
                        continue
                    gene = sorted(
                        genes,
                        key=lambda x: x.location.start)[0].sub_features[0]
                else:
                    genes = [
                        x for x in contig.features
                        if (x.location.end - 100) <= reg["start"]
                        if x.type == "gene" and x.sub_features
                    ]
                    try:
                        gene = sorted(
                            genes,
                            key=lambda x: x.location.end)[-1].sub_features[0]
                    except IndexError:
                        print([contig.id, reg["start"]])
                        continue
                tf_gene.append({
                    "scaffold": contig.id,
                    "tf": reg["tf"],
                    "motif": reg["motif"],
                    "count": reg["count"],
                    "gene": gene.id
                })
    nodes = {}
    edges = defaultdict(lambda: [])
    tfs = {}
    for row in tf_gene:
        nodes[row["tf"]] = 1
        tfs[row["tf"]] = [row["motif"], row["count"]]
        nodes[row["gene"]] = 1
        edges[row["tf"]].append(row["gene"])

    diG = nx.DiGraph()
    for node, scaffold in nodes.items():
        node = node.replace("*", "")
        if node == "unknown":
            continue
        diG.add_node(node,
                     scaffold=contig_map[node],
                     count=tfs[node][1] if node in tfs else "",
                     motif=tfs[node][0] if node in tfs else "")

    for ft, genes in edges.items():
        ft = ft.replace("*", "")
        if ft == "unknown":
            continue
        for regulated in genes:
            if regulated == "unknown":
                continue
            regulated = regulated.replace("*", "")
            diG.add_edge(ft, regulated)

    assert "unknown" not in diG
    nx.write_gml(diG, "/data/organismos/ILEX_PARA2/regulation/graph.gml")
Ejemplo n.º 56
0
def agrupar_sitios():
    regiones = list(
        GFF.parse("/data/organismos/ILEX_PARA2/regulation/ncbi_IP4.gff3.reg"))
    ids = 1
    groups = {}
    for c in tqdm(regiones):
        groups[c.id] = []
        for strand in [1, -1]:
            group = SeqFeature(id=c.features[0].id,
                               type="grouped_transcription_regulatory_region",
                               location=c.features[0].location)
            group.sub_features = []

            fs = sorted([f for f in c.features if f.strand == strand],
                        key=lambda x: x.location.start)
            if not fs:
                continue
            group.sub_features += [fs[0]]

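            # Merge a feature into the current group if it starts within 1500 bp
            # of the group's end or overlaps the group's last member; otherwise
            # close the group and start a new one.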
            for f in fs[1:]:
                end = max([x.location.end for x in group.sub_features])
                if ((abs(f.location.start - end) < 1500)
                        or (set(range(f.location.start, f.location.end)) & set(
                            range(group.sub_features[-1].location.start,
                                  group.sub_features[-1].location.end)))):
                    group.sub_features.append(f)
                else:
                    group.qualifiers = {
                        "description":
                        "_".join(
                            sorted(
                                set([
                                    x.qualifiers["description"][0].split(
                                        " regulatory region")[0]
                                    for x in group.sub_features
                                ]))),
                        "ID": ["ILEXPARARR" + str(ids)]
                    }
                    ids += 1

                    group.location = FeatureLocation(
                        start=min(
                            [x.location.start for x in group.sub_features]),
                        end=max([x.location.end for x in group.sub_features]),
                        strand=f.location.strand)
                    assert group.location.start < group.location.end
                    if (group.location.end - group.location.start) > 5000:
                        print(group.qualifiers["ID"])

                    groups[c.id].append(group)
                    group = SeqFeature(
                        id=c.features[0].id,
                        type="grouped_transcription_regulatory_region",
                        location=f.location)
                    group.sub_features = [f]
            if group:
                group.qualifiers = {
                    "description":
                    "_".join(
                        sorted(
                            set([
                                x.qualifiers["description"][0].split(
                                    " binding site")[0]
                                for x in group.sub_features
                            ]))),
                    "ID": ["ILEXPARARR" + str(ids)]
                }
                ids += 1
                group.location = FeatureLocation(
                    start=min([x.location.start for x in group.sub_features]),
                    end=max([x.location.end for x in group.sub_features]),
                    strand=f.location.strand)
                assert group.location.start < group.location.end
                if (group.location.end - group.location.start) > 5000:
                    print(group.qualifiers["ID"])

                groups[c.id].append(group)

    # for _, v in groups.items():
    #     for x in v:
    #         x.sub_features = []
    records = [
        SeqRecord(id=k, name="", description="", seq=Seq(""), features=v)
        for k, v in groups.items()
    ]
    with open("/data/organismos/ILEX_PARA2/regulation/grouped.gff", "w") as out_handle:
        GFF.write(tqdm(records), out_handle)
Ejemplo n.º 57
0
def AreInitSitesUsed():

    hitlist = []

    with open(
            '/users/buskirk/documents/profiling/projects/sORFs/init_sites.csv',
            'rU') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            hitlist.append(row)

    GFFgen = GFF.parse(
        '/users/buskirk/documents/profiling/GFF/MG1655/coli3.gff')
    chrom = GFFgen.next()

    genes_plus = geneplot(chrom, 1)
    genes_minus = geneplot(chrom, -1)

    pathi = "/users/buskirk/documents/profiling/projects/sORFs/wigfiles/"
    filelist = ['r_control', 'o_control']

    for fname in filelist:
        density_filestring = pathi + fname
        counts_plus = readwig(density_filestring + "_plus")
        counts_minus = readwig(density_filestring + "_minus")

        for iH in hitlist:

            if iH[0] == 'start':
                iH.append(fname)
                continue

            start = int(iH[0]) - 1  # back to 0-based coordinates
            stop = int(iH[1]) - 1
            rpm = 0.0
            rpkm = 0.0  # default if the site is skipped before any overlap is found
            counter = 0
            skip = False

            if iH[7] == 'plus':
                for iL in range(start - 15, stop + 1 + 15):
                    if iL >= len(genes_plus):
                        skip = True
                        break

                    # no overlap allowed for 15 nt on either side of the new sORF
                    if genes_plus[iL] != '0':
                        rpkm = genes_plus[iL]
                        skip = True
                        break

                if not skip:
                    for iP in range(start, stop +
                                    1):  # + 1 to include last nt in stop codon
                        rpm += counts_plus[iP + 15]
                        counter += 1

            elif iH[7] == 'minus':
                for iL in range(
                        start - 15, stop + 1 +
                        15):  # start is left and stop right in chromosome

                    if genes_minus[iL] != '0':
                        rpkm = genes_minus[iL]
                        skip = True
                        break

                if not skip:
                    for iP in range(start, stop + 1):
                        rpm += counts_minus[iP - 15]
                        counter += 1

            if counter != 0:
                rpkm = rpm * 1000 / counter
            iH.append(rpkm)

    writelisttoexcel(
        hitlist,
        '/users/buskirk/documents/profiling/projects/sORFs/init_sites_rpkm')
Ejemplo n.º 58
0
    def load_legacy_gff3(self, organism, gff3, source=None):
        """
        Load a full GFF3 into annotation track (legacy version, kept for compatibility only)

        :type organism: str
        :param organism: Organism Common Name

        :type gff3: str
        :param gff3: GFF3 to load

        :type source: str
        :param source: URL where the input dataset can be found.

        :rtype: str
        :return: Loading report
        """

        sys.stdout.write('# ')
        sys.stdout.write('\t'.join(
            ['Feature ID', 'Apollo ID', 'Success', 'Messages']))
        sys.stdout.write('\n')

        bad_quals = [
            'date_creation', 'source', 'owner', 'date_last_modified', 'Name',
            'ID'
        ]
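        # Qualifiers managed by Apollo itself; these are not copied over as extra attributes.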

        for rec in GFF.parse(gff3):
            self.set_sequence(organism, rec.id)
            for feature in rec.features:
                # We can only handle genes right now
                if feature.type not in ('gene', 'terminator'):
                    continue
                # Convert the feature into a presentation that Apollo will accept
                feature_data = features_to_feature_schema([feature])
                # TODO: do we handle all top-types here?
                if 'children' in feature_data[0] and any([
                        child['type']['name'] == 'tRNA'
                        for child in feature_data[0]['children']
                ]):
                    # We're experiencing a (transient?) problem where gene_001 to
                    # gene_025 will be rejected. Thus, hardcode to a known working
                    # gene name and update later.

                    feature_data[0]['name'] = 'tRNA_000'
                    tRNA_sf = [
                        child for child in feature.sub_features
                        if child.type == 'tRNA'
                    ][0]
                    tRNA_type = 'tRNA-' + tRNA_sf.qualifiers.get(
                        'Codon', ["Unk"])[0]

                    if 'Name' in feature.qualifiers:
                        if feature.qualifiers['Name'][0].startswith('tRNA-'):
                            tRNA_type = feature.qualifiers['Name'][0]

                    newfeature = self.add_feature(feature_data[0])

                    def func0():
                        self.set_name(
                            newfeature['features'][0]['uniquename'],
                            tRNA_type,
                        )

                    retry(func0)

                    if source:
                        gene_id = newfeature['features'][0]['parent_id']

                        def setSource():
                            self.add_attribute(gene_id, 'DatasetSource',
                                               source)

                        retry(setSource)

                    sys.stdout.write('\t'.join([
                        feature.id,
                        newfeature['features'][0]['uniquename'],
                        'success',
                    ]))
                elif feature_data[0]['type']['name'] == 'terminator':
                    # We're experiencing a (transient?) problem where gene_001 to
                    # gene_025 will be rejected. Thus, hardcode to a known working
                    # gene name and update later.
                    feature_data[0]['name'] = 'terminator_000'
                    newfeature = self.add_feature(feature_data[0])

                    def func0():
                        self.set_name(newfeature['features'][0]['uniquename'],
                                      'terminator')

                    retry(func0)

                    if source:
                        gene_id = newfeature['features'][0]['parent_id']

                        def setSource():
                            self.add_attribute(gene_id, 'DatasetSource',
                                               source)

                        retry(setSource)

                    sys.stdout.write('\t'.join([
                        feature.id,
                        newfeature['features'][0]['uniquename'],
                        'success',
                    ]))
                else:
                    try:
                        # We're experiencing a (transient?) problem where gene_001 to
                        # gene_025 will be rejected. Thus, hardcode to a known working
                        # gene name and update later.
                        feature_data[0]['name'] = 'gene_000'
                        # Create the new feature
                        newfeature = self.add_feature(feature_data[0])
                        # Extract the UUIDs that apollo returns to us
                        mrna_id = newfeature['features'][0]['uniquename']
                        gene_id = newfeature['features'][0]['parent_id']
                        # Sleep to give it time to actually persist the feature. Apollo
                        # is terrible about writing + immediately reading back written
                        # data.
                        time.sleep(1)

                        # Extract CDS feature from the feature data, this will be used
                        # to set the CDS location correctly (apollo currently screwing
                        # this up (2.0.6))
                        min_cds = None
                        max_cds = None

                        for feat in feature_data[0]['children']:
                            # mRNA level
                            for subfeat in feat['children']:
                                # Can be exon or CDS
                                if subfeat['type']['name'] == 'CDS':
                                    if min_cds is None:
                                        min_cds = subfeat['location']['fmin']
                                        max_cds = subfeat['location']['fmax']
                                    else:
                                        min_cds = min(
                                            min_cds,
                                            subfeat['location']['fmin'])
                                        max_cds = max(
                                            max_cds,
                                            subfeat['location']['fmax'])
                                if 'children' in subfeat:
                                    for subsubfeat in subfeat['children']:
                                        if subsubfeat['type']['name'] == 'CDS':
                                            if min_cds is None:
                                                min_cds = subsubfeat[
                                                    'location']['fmin']
                                                max_cds = subsubfeat[
                                                    'location']['fmax']
                                            else:
                                                min_cds = min(
                                                    min_cds,
                                                    subsubfeat['location']
                                                    ['fmin'])
                                                max_cds = max(
                                                    max_cds,
                                                    subsubfeat['location']
                                                    ['fmax'])

                        # Correct the translation start, with strand-specific logic
                        if feature_data[0]['location']['strand'] == 1:
                            self.set_translation_start(mrna_id,
                                                       min(min_cds, max_cds))
                        else:
                            self.set_translation_start(
                                mrna_id,
                                max(min_cds, max_cds) - 1)

                        # Finally we set the name, this should be correct.
                        def func():
                            self.set_name(
                                mrna_id,
                                feature.qualifiers.get(
                                    'product',
                                    feature.qualifiers.get(
                                        'Name', ["Unknown"]))[0])

                        retry(func)

                        def func():
                            self.set_name(
                                gene_id,
                                feature.qualifiers.get(
                                    'product',
                                    feature.qualifiers.get(
                                        'Name', ["Unknown"]))[0])

                        retry(func)

                        if source:
                            gene_id = newfeature['features'][0]['parent_id']

                            def setSource():
                                self.add_attribute(gene_id, 'DatasetSource',
                                                   source)

                            retry(setSource)
                        extra_attr = {}
                        for (key, values) in feature.qualifiers.items():
                            if key in bad_quals:
                                continue

                            if key == 'Note':

                                def func2():
                                    self.add_comments(gene_id, values)

                                retry(func2)
                            else:
                                extra_attr[key] = values

                        for key in extra_attr:

                            def func3():
                                self.add_attribute(gene_id, key,
                                                   extra_attr[key])

                            retry(func3)

                        sys.stdout.write('\t'.join([
                            feature.id,
                            gene_id,
                            'success',
                        ]))
                    except Exception as e:
                        msg = str(e)
                        if '\n' in msg:
                            msg = msg[0:msg.index('\n')]
                        sys.stdout.write('\t'.join(
                            [feature.id, '', 'ERROR', msg]))
                sys.stdout.write('\n')
                sys.stdout.flush()
Ejemplo n.º 59
0
# Set static parameters
method = 'genebuild'
pANDe = ROOT.split('/')[-1]
primary_assembly, ensembl_build = pANDe.split('.')

# Create an accession to name dictionary
ac_to_name_list = []
ac_to_name = {}
name_to_ac = {}
alignment_file = os.path.join(
    ALIGNMENTS, 'Homo_sapiens.%s.%s.chr_patch_hapl_scaff.gff3') % (
        primary_assembly, ensembl_build)
aln_handle = open(alignment_file)
# Limiter which keeps only lines whose GFF source column is "GRCh37"
limit_info = dict(gff_source=["GRCh37"])
for rec in GFF.parse(aln_handle, limit_info=limit_info):
    ac_to_name_list.append(rec.features[0].qualifiers.get('Alias'))
# MT accession is not provided in the GFF file
# Contig is 16569 nt so NC_012920.1
ac_to_name_list.append(['', 'NC_012920.1'])

for a_n in ac_to_name_list:
    if a_n is None:
        continue
    # Need to work back from the refseq Accession
    workback = name_to_accession.to_name(a_n[-1])
    a_n = [workback] + a_n
    # Filter out primary chromosomes
    try:
        ac_to_name[a_n[2]] = str(a_n[0]).replace('chr', '')
        name_to_ac[str(a_n[0]).replace('chr', '')] = str(a_n[2])
Ejemplo n.º 60
0

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_gff", action="store", dest="input_gff",
                    help="Gff file with annotations to extract")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file with information about transcripts")
parser.add_argument("-l", "--longest_isoforms", action="store", dest="longest_isoforms",
                    help="File to write longest isoforms")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

for record in GFF.parse(open(args.input_gff)):
    for feature in record.features:
        #print feature
        if feature.type == "gene":
            transcript_id_list = []
            transcript_len_list = []
            CDS_len_list = []
            for subfeature in feature.sub_features:
                #print subfeature
                #print(subfeature.type)
                if subfeature.type == "mRNA" or subfeature.type == "transcript":
                    transcript_id_list.append(subfeature.id)
                    transcript_len_list.append(len(subfeature))
                    CDS_len = 0
                    for subsubfeature in subfeature.sub_features:
                        if subsubfeature.type == "CDS":