Ejemplo n.º 1
0
def BEDIterator(handle):
  """Generator function yielding one SeqFeature per tab-separated record.

  handle - input file object.  It must support readline(); its ``name``
           attribute, when present, is used in error messages.

  Despite the function's name, every data line must hold exactly nine
  tab-separated columns (ref, source, type, start, end, score, strand,
  frame, attributes).  Blank lines and lines starting with "#" are
  skipped wherever they occur; iteration stops only at end of file.

  The attributes column is parsed as ";"-separated ``key=value`` pairs.
  The ``ID`` and ``Name`` keys (when present) populate the feature's
  ``id`` and ``name``; the whole mapping is stored on ``attributes``.

  Raises FormatError if a data line does not have exactly nine columns,
  and ValueError if a non-empty attribute entry contains no "=".
  """
  line_no = 0
  while True:
    raw_line = handle.readline()
    # readline() returns "" only at end of file; a blank line still
    # carries its newline, so EOF must be tested before stripping.
    if not raw_line:
      return
    line_no += 1
    line = raw_line.strip()
    if not line or line.startswith("#"):
      continue
    try:
      (ref, source, feature_type, start, end,
       score, strand, frame, attributes) = line.split("\t")
    except ValueError:
      raise FormatError("Problem with line %d in %s.  Line was\n%s" %
        (line_no, getattr(handle, "name", "<unknown>"), line))

    attr_dict = {}
    # "." conventionally marks an empty column in GFF-style files.
    if attributes != ".":
      for pair in attributes.split(";"):
        if not pair:
          # Tolerate leading/trailing/doubled ";" separators.
          continue
        # Split on the first "=" only so values may themselves contain "=".
        key, value = pair.split("=", 1)
        attr_dict[key] = value

    # NOTE(review): start/end are passed through unchanged; if the input is
    # 1-based inclusive the start may need a -1 for FeatureLocation - confirm.
    result = SeqFeature(location=FeatureLocation(int(start), int(end)),
      type=feature_type, strand=_gff3_strand_to_numeric[strand], ref=ref,
      ref_db=source)
    result.id = attr_dict.get("ID")
    result.name = attr_dict.get("Name")
    result.attributes = attr_dict  # not an official property of SeqFeature.
    yield result
Ejemplo n.º 2
0
    def ncrna_gene(self, ncrna):
        """Wrap an ncRNA feature in a parent feature of type ``ncRNA_gene``.

        The parent shares the ncRNA's location, copies its ``source``
        qualifier, takes the ncRNA's id, and holds the ncRNA as its only
        sub-feature.  Returns the new parent feature.
        """
        origin = ncrna.qualifiers["source"]

        parent = SeqFeature(ncrna.location, type="ncRNA_gene")
        parent.id = ncrna.id
        parent.sub_features = [ncrna]
        parent.qualifiers["source"] = origin
        return parent
Ejemplo n.º 3
0
    def cds_gene(self, cds):
        """Build a gene > mRNA > (CDS, exon) hierarchy around a lone CDS.

        Every created feature reuses the CDS location and its ``source``
        qualifier.  The gene receives a fresh id from
        ``self.generate_stable_id()``.  Returns the gene feature.
        """
        span = cds.location
        origin = cds.qualifiers["source"]

        def wrap(kind, children=None):
            # New feature over the CDS span, tagged with the CDS source.
            node = SeqFeature(span, type=kind)
            node.qualifiers["source"] = origin
            if children is not None:
                node.sub_features = children
            return node

        # Transcript holds the original CDS plus a matching exon.
        transcript = wrap("mRNA", [cds])
        transcript.sub_features.append(wrap("exon"))

        # Gene sits on top and is the only feature given a generated id.
        gene = wrap("gene", [transcript])
        gene.id = self.generate_stable_id()
        return gene
Ejemplo n.º 4
0
def gbk2gff(genbank_path, new_gff_path, species_id):
    """Convert a single-record GenBank file to a nine-column tab-separated file.

    Args:
        genbank_path: Path handed to ``SeqIO.read(..., "genbank")``; the file
            must therefore contain exactly one record.
        new_gff_path: Destination path for the tab-separated output.
        species_id: String prefix used to build the generated feature IDs
            (``<species_id>NNN``, ``cds_<species_id>NNN_k``, ...).

    Returns:
        The sequence (``genome.seq``) of the loaded record.

    Side effects:
        Prints a progress line, mutates the loaded record's features
        (sort order, ``id`` and ``type``), and writes ``new_gff_path``.
    """
    print('Start change', os.path.basename(new_gff_path))
    records_list = []
    genome = SeqIO.read(genbank_path, "genbank")
    # Helper defined elsewhere; presumably drops features without a
    # location so the sort below cannot fail - TODO confirm.
    remove_none_location(genome)
    genome.features.sort(key=lambda x: x.location.start)
    gene_count = 0
    IR_count = 0
    for ele in genome.features:
        if ele.type == 'gene':
            # Genes named rps12 are skipped entirely (and do not consume a
            # gene number).
            if ele.qualifiers['gene'][0] == 'rps12':
                continue
            gene_count += 1
            # Gene ID: prefix plus counter zero-padded to three digits.
            ele.id = species_id + '%03d' % gene_count
            # Look at the features found in the record slice spanning the gene.
            for child_feature in genome[ele.location.start:ele.location.
                                        end].features:
                # Helper defined elsewhere; presumably shifts the sliced
                # feature back to genome coordinates - TODO confirm.
                fix_location(child_feature, ele.location.start)
                # Only non-gene features covering exactly the gene's span
                # are treated as that gene's product.
                if child_feature.type != 'gene' and \
                        child_feature.location.start == ele.location.start and \
                        child_feature.location.end == ele.location.end:
                    # A CDS child is relabelled 'mRNA'; other types are kept.
                    child_feature.type = 'mRNA' if child_feature.type == 'CDS' else child_feature.type
                    # The child must carry the same gene name as the gene.
                    if child_feature.qualifiers['gene'][0] == ele.qualifiers[
                            'gene'][0]:
                        # Protein-coding gene: emit the gene record and one
                        # CDS record per location part. No record is emitted
                        # for the relabelled mRNA feature itself.
                        # NOTE(review): the gene record is appended once per
                        # matching child, so several matching children would
                        # duplicate it - confirm this cannot happen.
                        if child_feature.type == 'mRNA':
                            gene_attributes = [
                                'ID=' + ele.id,
                                'Name=' + ele.qualifiers['gene'][0],
                                'gene_biotype=protein_coding'
                            ]
                            records_list.append(
                                get_record(ele, gene_attributes))
                            cds_count = 0
                            # Location parts are visited in reverse order and
                            # numbered from 1 in that order.
                            for cds in reversed(child_feature.location.parts):
                                cds_count += 1
                                # Each part becomes its own CDS feature that
                                # copies the parent's first codon_start value.
                                cds_feature = SeqFeature(
                                    cds,
                                    type='CDS',
                                    qualifiers={
                                        'codon_start':
                                        child_feature.qualifiers['codon_start']
                                        [0]
                                    })
                                cds_feature.id = 'cds_' + species_id + '%03d' % gene_count + '_' + '%d' % cds_count
                                # CDS records point straight at the gene.
                                cds_attributes = [
                                    'ID=' + cds_feature.id, 'Parent=' + ele.id,
                                    'product=' +
                                    child_feature.qualifiers['product'][0]
                                ]
                                records_list.append(
                                    get_record(cds_feature, cds_attributes))
                        # Any other child type (the original comment names
                        # rRNA and tRNA): emit gene, RNA and exon records.
                        else:
                            # Gene record; biotype is the child's feature type.
                            gene_attributes = [
                                'ID=' + ele.id,
                                'Name=' + ele.qualifiers['gene'][0],
                                'gene_biotype=' + child_feature.type
                            ]
                            records_list.append(
                                get_record(ele, gene_attributes))
                            # RNA record, parented to the gene.
                            child_feature.id = 'rna_' + species_id + '%03d' % gene_count
                            child_attributes = [
                                'ID=' + child_feature.id, 'Parent=' + ele.id,
                                'product=' +
                                child_feature.qualifiers['product'][0]
                            ]
                            records_list.append(
                                get_record(child_feature, child_attributes))
                            exon_list = []
                            exon_count = 0
                            # Exon records, one per location part (reverse
                            # order), parented to the RNA record.
                            for exon in reversed(child_feature.location.parts):
                                exon_count += 1
                                exon_feature = SeqFeature(exon, type='exon')
                                exon_feature.id = 'exon_' + species_id + '%03d' % gene_count + '_' + '%d' % exon_count
                                exon_attributes = [
                                    'ID=' + exon_feature.id,
                                    'Parent=' + child_feature.id
                                ]
                                exon_list.append(
                                    get_record(exon_feature, exon_attributes))
                            # Exons are written only for multi-part features;
                            # a single-part RNA gets no exon record.
                            if exon_count > 1:
                                records_list += exon_list
        elif ele.type == 'repeat_region':
            # Every repeat_region is recorded as a numbered inverted repeat.
            IR_count += 1
            gene_attributes = [
                'ID=IR' + str(IR_count), 'note=Inverted repeats'
            ]
            records_list.append(get_record(ele, gene_attributes))
    # One DataFrame row per record, keyed by its position in records_list.
    # get_record (defined elsewhere) is expected to supply the type, start,
    # end, strand, phase and attributes columns selected below - TODO confirm.
    records_dict = {index: record for index, record in enumerate(records_list)}
    result_gff = pd.DataFrame.from_dict(records_dict, 'index')
    # Constant columns shared by every row.
    result_gff['seqid'] = genome.id
    result_gff['score'] = '.'
    result_gff['source'] = 'PGA'
    # Put the columns into the nine-column output order.
    result_gff = result_gff[[
        "seqid", "source", "type", "start", "end", "score", "strand", "phase",
        "attributes"
    ]]
    # Written without a header row or index column.
    result_gff.to_csv(new_gff_path,
                      sep='\t',
                      header=False,
                      index=False,
                      encoding='utf8')
    return genome.seq
Ejemplo n.º 5
0
		# st and en are unmodified from the blast file
		# they can be backwards
		# original row as of nov23 read
		#feature = SeqFeature(FeatureLocation(st-1,en), strand=framepart2,type="repeathit")
		# start location should not be modified by -1 in this case


		# changed Featurelocation start, older line:
		#feature = SeqFeature(FeatureLocation(st,en), strand=framepart2,type="repeathit")

		# Genbank magic adds one to start position?

		feature = SeqFeature(FeatureLocation(st-1,en), strand=framepart2,type="repeathit")
		#feature.id=rename
		#feature.qualifiers["hit"]=rename
		feature.id=isname
		feature.qualifiers["hit"]=isname
		feature.qualifiers["sequence_length"]=str(len(contigseq[myhsp.sbjct_start-1:myhsp.sbjct_end]))
#
		feature.qualifiers["score"]=str(myhsp.score)
		if myhsp.score>bestscorehsp.score:
			bestscorehsp=myhsp

		if myhsp.expect<bestexphsp.expect:
			bestexphsp=myhsp

		# hit length is the inclusive distance between the hit's start and end
		feature.qualifiers["hitlength"]=str(1+max(st,en)-min(st,en))
#
		feature.qualifiers["expect"]=str(myhsp.expect)
		feature.qualifiers["query_start"]=str(myhsp.query_start)
Ejemplo n.º 6
0
 for feature in feature_lambda(record.features, feature_test_true, {}):
     if feature.type in args.changeList:
         #if "Parent" in feature.qualifiers.keys():
         #endOfChain = False
         #while endOfChain == False:
         # parentFeat = record.features(feature.qualifiers["Parent"][0])
         #for x in range(0, len(args.changeList)):
         #feature.qualifiers["Parent"] = []
         newChain = [feature]
         tempParent = ""
         if "Parent" in feature.qualifiers.keys():
             tempParent = feature.qualifiers["Parent"][0]
         for x in range(0, len(args.changeTo)):
             tempFeat = SeqFeature(location=feature.location)
             tempFeat.type = args.changeTo[len(args.changeTo) - 1 - x]
             tempFeat.id = feature.id + "_p" + str(
                 len(args.changeTo) - x)
             tempFeat.ref_db = feature.ref_db
             tempFeat.ref = feature.ref
             #tempFeat.sub_features.append(newChain[x])
             if "Parent" in newChain[x].qualifiers.keys():
                 newChain[x].qualifiers["Parent"][
                     0] = feature.id + "_p" + str(
                         len(args.changeTo) - x)
             else:
                 newChain[x].qualifiers["Parent"] = [
                     feature.id + "_p" + str(len(args.changeTo) - x)
                 ]
             tempFeat.qualifiers["ID"] = [tempFeat.id]
             if "Name" in newChain[x].qualifiers.keys():
                 tempFeat.qualifiers["Name"] = [
                     feature.qualifiers["Name"][0] + "_p" +