コード例 #1
0
def splicejunctions_to_gff3(inputBedFile, chrom_sizes, output):
    """
    Convert a tab-separated splice junction BED file to GFF3.

    Every non-blank input line is written as one 'junction' feature via
    subtools.write_features, then handed to subtools.child_blocks with the
    child type 'exon_junction'. A "##sequence-region" pragma is written the
    first time each sequence name is encountered.

    :param inputBedFile: path of the input file; each record must have at
        least 13 tab-separated columns (indices 0-12 are read).
    :param chrom_sizes: passed to subtools.sequence_region; the returned
        mapping is indexed by sequence name to get the region end.
    :param output: path of the GFF3 file to create (truncated if present).
    :raises IndexError: if a record has fewer columns than are read.
    :raises ValueError: if the start column (index 1) is not an integer.
    """
    # Context managers guarantee that both files are closed (and the output
    # flushed) even when a malformed record raises part-way through.
    with open(output, 'w') as gff3:
        gff3.write("##gff-version 3\n")
        sizes_dict = subtools.sequence_region(chrom_sizes)
        # Sequence names whose ##sequence-region pragma was already written.
        seq_regions = dict()
        with open(inputBedFile, 'r') as bed:
            for line in bed:
                # Blank lines (e.g. a trailing newline) hold no record.
                if not line.strip():
                    continue
                field = OrderedDict()
                attribute = OrderedDict()
                li = line.rstrip().split("\t")
                field['seqid'] = li[0]
                if field['seqid'] not in seq_regions:
                    # A name missing from the sizes mapping fails here
                    # (KeyError if the mapping is a plain dict).
                    end_region = sizes_dict[field['seqid']]
                    gff3.write("##sequence-region " + field['seqid'] +
                               ' 1 ' + str(end_region) + '\n')
                    seq_regions[field['seqid']] = end_region
                field['source'] = li[3]
                field['type'] = 'junction'
                # The first base in a chromosome is numbered 0 in BED
                # format, so shift the start by one for 1-based GFF3.
                # NOTE: the start is kept as an int here, not stringified.
                field['start'] = int(li[1]) + 1
                field['end'] = li[2]
                # The score is read from column index 12, not the usual BED
                # score column (index 4) -- presumably a tool-specific
                # extra column; confirm against the producer of the file.
                field['score'] = li[12]
                field['strand'] = li[5]
                field['phase'] = '.'
                attribute['ID'] = li[0] + '_' + li[3]
                attribute['Name'] = li[3]
                attribute['blockcount'] = li[9]
                attribute['blocksizes'] = li[10]
                attribute['chromstarts'] = li[11]
                subtools.write_features(field, attribute, gff3)
                subtools.child_blocks(field, attribute, gff3,
                                      'exon_junction')
コード例 #2
0
def gtfToGff3(gtf_file, gff3_file, chrom_sizes):
    """
    Convert gtf file output from StringTie to gff3 format

    Comment lines (starting with '#') and blank lines are skipped. Every
    other line is written as one feature via subtools.write_features, and a
    "##sequence-region" pragma is written the first time each sequence name
    is encountered.

    :param gtf_file: path of the input GTF file; each record must have at
        least 9 tab-separated columns.
    :param gff3_file: path of the GFF3 file to create (truncated if present).
    :param chrom_sizes: passed to subtools.sequence_region; the returned
        mapping is indexed by sequence name to get the region end.
    :raises IndexError: if a record has too few columns or attributes.
    :raises KeyError: if an 'exon' record appears before any 'transcript'
        record carrying the same first attribute value.
    """
    # Context managers guarantee that both files are closed (and the output
    # flushed) even when a malformed record raises part-way through.
    with open(gff3_file, 'w') as gff3:
        gff3.write("##gff-version 3\n")
        sizes_dict = subtools.sequence_region(chrom_sizes)
        # Sequence names whose ##sequence-region pragma was already written.
        seq_regions = dict()
        # Maps the first attribute value (taken to be the gene_id) to the ID
        # of the most recent 'transcript' feature seen with that value.
        parents = dict()
        with open(gtf_file, 'r') as gtf:
            for line in gtf:
                if line.startswith('#') or not line.strip():
                    continue
                field = OrderedDict()
                attribute = OrderedDict()
                li = line.rstrip().split("\t")
                field['seqid'] = li[0]
                if field['seqid'] not in seq_regions:
                    # A name missing from the sizes mapping fails here
                    # (KeyError if the mapping is a plain dict).
                    end_region = sizes_dict[field['seqid']]
                    gff3.write("##sequence-region " + field['seqid'] +
                               ' 1 ' + str(end_region) + '\n')
                    seq_regions[field['seqid']] = end_region
                field['source'] = li[1]
                field['type'] = li[2]
                # GTF coordinates are already 1-based and inclusive, like
                # GFF3, so start and end are copied without any offset.
                field['start'] = li[3]
                field['end'] = li[4]
                field['score'] = li[5]
                field['strand'] = li[6]
                field['phase'] = li[7]
                # Attributes are parsed by position, not by key: the value
                # is the second whitespace-separated token of each
                # ';'-separated entry, with surrounding quotes removed.
                # NOTE(review): this relies on the attribute order of the
                # producing tool -- confirm it is stable for your input.
                attr_li = li[8].split(';')
                gene_id = attr_li[0].split()[1].strip('"')
                attribute['ID'] = gene_id + '_' + field['type'] + '_' + str(
                    field['start']) + '_' + str(field['end'])
                if field['type'] == 'transcript':
                    parents[gene_id] = attribute['ID']
                    attribute['transcript_id'] = attr_li[1].split()[
                        1].strip('"')
                    attribute['coverage'] = attr_li[2].split()[1].strip('"')
                    attribute['fpkm'] = attr_li[3].split()[1].strip('"')
                    attribute['tpm'] = attr_li[4].split()[1].strip('"')
                elif field['type'] == 'exon':
                    # Exons attach to the last transcript recorded for the
                    # same gene_id, so transcripts must precede their exons.
                    attribute['Parent'] = parents[gene_id]
                    attribute['transcript_id'] = attr_li[1].split()[
                        1].strip('"')
                    # Entry 2 is skipped for exons (presumably the exon
                    # number); coverage is read from entry 3.
                    attribute['coverage'] = attr_li[3].split()[1].strip('"')
                subtools.write_features(field, attribute, gff3)
コード例 #3
0
 def bigpsl_to_gff3(self):
      """
      Convert the tab-separated bigPsl-style file at self.inputFile to GFF3.

      Every non-blank input line is written as one 'match' feature via
      subtools.write_features, then handed to subtools.child_blocks with
      the child type 'match_part'. A "##sequence-region" pragma is written
      the first time each sequence name is encountered.

      Reads self.inputFile (input path), self.chromSizesFile (passed to
      subtools.sequence_region) and self.gff3_file.name (output path, which
      is truncated if present). Each record must have at least 24
      tab-separated columns (indices 0-23 are read).

      :raises IndexError: if a record has fewer columns than are read.
      :raises ValueError: if the start column (index 1) is not an integer.
      """
      # Context managers guarantee that both files are closed (and the
      # output flushed) even when a malformed record raises part-way.
      with open(self.gff3_file.name, 'w') as gff3:
          gff3.write("##gff-version 3\n")
          sizes_dict = subtools.sequence_region(self.chromSizesFile)
          # Sequence names whose ##sequence-region pragma was written.
          seq_regions = dict()
          with open(self.inputFile, 'r') as bed:
              for line in bed:
                  # Blank lines (e.g. a trailing newline) hold no record.
                  if not line.strip():
                      continue
                  field = OrderedDict()
                  attribute = OrderedDict()
                  li = line.rstrip().split("\t")
                  field['seqid'] = li[0]
                  if field['seqid'] not in seq_regions:
                      # A name missing from the sizes mapping fails here
                      # (KeyError if the mapping is a plain dict).
                      end_region = sizes_dict[field['seqid']]
                      gff3.write("##sequence-region " + field['seqid'] +
                                 ' 1 ' + str(end_region) + '\n')
                      seq_regions[field['seqid']] = end_region
                  field['source'] = 'UCSC BLAT alignment tool'
                  field['type'] = 'match'
                  # The first base in a chromosome is numbered 0 in BED
                  # format, so shift the start by one for 1-based GFF3.
                  field['start'] = str(int(li[1]) + 1)
                  field['end'] = li[2]
                  field['score'] = li[4]
                  field['strand'] = li[5]
                  field['phase'] = '.'
                  attribute['ID'] = li[0] + '_' + li[3]
                  attribute['Name'] = li[3]
                  attribute['blockcount'] = li[9]
                  attribute['blocksizes'] = li[10]
                  attribute['chromstarts'] = li[11]
                  attribute['ochrom_start'] = li[12]
                  attribute['ochrom_end'] = li[13]
                  attribute['ochrom_strand'] = li[14]
                  attribute['ochrom_size'] = li[15]
                  attribute['ochrom_starts'] = li[16]
                  attribute['sequence on other chromosome'] = li[17]
                  attribute['cds in ncbi format'] = li[18]
                  attribute['size of target chromosome'] = li[19]
                  attribute['number of bases matched'] = li[20]
                  attribute['number of bases that don\'t match'] = li[21]
                  attribute['number of bases that match but are part of '
                            'repeats'] = li[22]
                  attribute['number of \'N\' bases'] = li[23]
                  subtools.write_features(field, attribute, gff3)
                  subtools.child_blocks(field, attribute, gff3,
                                        'match_part')
コード例 #4
0
def trfbig_to_gff3(inputBedFile, chrom_sizes, output):
    """
    Convert a tab-separated tandem repeat BED file to GFF3.

    Every non-blank input line is written as one 'tandem_repeat' feature
    via subtools.write_features; the strand is always '+' and no ID
    attribute is emitted. A "##sequence-region" pragma is written the first
    time each sequence name is encountered.

    :param inputBedFile: path of the input file; each record must have at
        least 16 tab-separated columns (indices 0-15 are read).
    :param chrom_sizes: passed to subtools.sequence_region; the returned
        mapping is indexed by sequence name to get the region end.
    :param output: path of the GFF3 file to create (truncated if present).
    :raises IndexError: if a record has fewer columns than are read.
    :raises ValueError: if the start column (index 1) is not an integer.
    """
    # Context managers guarantee that both files are closed (and the output
    # flushed) even when a malformed record raises part-way through.
    with open(output, 'w') as gff3:
        gff3.write("##gff-version 3\n")
        sizes_dict = subtools.sequence_region(chrom_sizes)
        # Sequence names whose ##sequence-region pragma was already written.
        seq_regions = dict()
        with open(inputBedFile, 'r') as bed:
            for line in bed:
                # Blank lines (e.g. a trailing newline) hold no record.
                if not line.strip():
                    continue
                field = OrderedDict()
                attribute = OrderedDict()
                li = line.rstrip().split("\t")
                field['seqid'] = li[0]
                if field['seqid'] not in seq_regions:
                    # A name missing from the sizes mapping fails here
                    # (KeyError if the mapping is a plain dict).
                    end_region = sizes_dict[field['seqid']]
                    gff3.write("##sequence-region " + field['seqid'] +
                               ' 1 ' + str(end_region) + '\n')
                    seq_regions[field['seqid']] = end_region
                field['source'] = li[3]
                field['type'] = 'tandem_repeat'
                # The first base in a chromosome is numbered 0 in BED
                # format, so shift the start by one for 1-based GFF3.
                field['start'] = str(int(li[1]) + 1)
                field['end'] = li[2]
                # Column index 9 is used as the score; indices 4-8 and
                # 10-15 are exported as attributes below.
                field['score'] = li[9]
                field['strand'] = '+'
                field['phase'] = '.'
                attribute['length of repeat unit'] = li[4]
                attribute['mean number of copies of repeat'] = li[5]
                attribute['length of consensus sequence'] = li[6]
                attribute['percentage match'] = li[7]
                attribute['percentage indel'] = li[8]
                attribute['percent of a\'s in repeat unit'] = li[10]
                attribute['percent of c\'s in repeat unit'] = li[11]
                attribute['percent of g\'s in repeat unit'] = li[12]
                attribute['percent of t\'s in repeat unit'] = li[13]
                attribute['entropy'] = li[14]
                attribute['sequence of repeat unit element'] = li[15]
                subtools.write_features(field, attribute, gff3)