Ejemplo n.º 1
0
def splicejunctions_to_gff3(inputBedFile, chrom_sizes, output):
    """Convert a tab-separated splice-junction BED file to GFF3.

    Every non-blank input row becomes one ``junction`` feature written with
    ``subtools.write_features``; its blocks are then written with
    ``subtools.child_blocks`` as ``exon_junction`` children.  A
    ``##sequence-region`` pragma is emitted the first time each seqid is
    encountered.

    Args:
        inputBedFile: path of the BED file to read.
        chrom_sizes: handed to ``subtools.sequence_region``; the returned
            object is indexed by seqid to obtain the region end.
        output: path of the GFF3 file to write (opened in ``'w'`` mode).

    Raises:
        IndexError: if a non-blank row has fewer than 13 tab-separated
            columns (column 12 is used as the score).
    """
    # The context managers guarantee both handles are closed even when a
    # malformed row makes parsing raise part-way through the file.
    with open(output, 'w') as gff3:
        gff3.write("##gff-version 3\n")
        sizes_dict = subtools.sequence_region(chrom_sizes)
        seq_regions = dict()
        with open(inputBedFile, 'r') as bed:
            for line in bed:
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no feature; skip them instead of failing on the column
                # lookups below.
                if not line.strip():
                    continue
                field = OrderedDict()
                attribute = OrderedDict()
                li = line.rstrip().split("\t")
                field['seqid'] = li[0]
                # Announce each sequence once, before its first feature.
                if field['seqid'] not in seq_regions:
                    end_region = sizes_dict[field['seqid']]
                    gff3.write("##sequence-region " + field['seqid'] +
                               ' 1 ' + str(end_region) + '\n')
                    seq_regions[field['seqid']] = end_region
                field['source'] = li[3]
                field['type'] = 'junction'
                # BED numbers the first base of a chromosome 0, so the start
                # is shifted by one; the end column is copied unchanged.
                field['start'] = int(li[1]) + 1
                field['end'] = li[2]
                field['score'] = li[12]
                field['strand'] = li[5]
                field['phase'] = '.'
                attribute['ID'] = li[0] + '_' + li[3]
                attribute['Name'] = li[3]
                attribute['blockcount'] = li[9]
                attribute['blocksizes'] = li[10]
                attribute['chromstarts'] = li[11]
                subtools.write_features(field, attribute, gff3)
                subtools.child_blocks(field, attribute, gff3,
                                      'exon_junction')
Ejemplo n.º 2
0
def _gtf_attribute(attr_li, key, position):
    """Return the unquoted value of the GTF attribute tagged *key*.

    The entry whose tag equals *key* is preferred, so extra attributes that
    shift the ordering do not cause the wrong value to be picked.  When no
    entry carries that tag, the value of the entry at index *position* is
    returned instead.
    """
    for entry in attr_li:
        parts = entry.split()
        if len(parts) >= 2 and parts[0] == key:
            return parts[1].strip('"')
    return attr_li[position].split()[1].strip('"')


def gtfToGff3(gtf_file, gff3_file, chrom_sizes):
    """
    Convert gtf file output from StringTie to gff3 format

    Comment lines (starting with ``#``) and blank lines are skipped.  Each
    remaining row is written with ``subtools.write_features``; a
    ``##sequence-region`` pragma is emitted the first time each seqid is
    encountered.  ``transcript`` rows register themselves as the parent of
    later ``exon`` rows that share the same gene id.

    Args:
        gtf_file: path of the GTF file to read.
        gff3_file: path of the GFF3 file to write (opened in ``'w'`` mode).
        chrom_sizes: handed to ``subtools.sequence_region``; the returned
            object is indexed by seqid to obtain the region end.

    Raises:
        KeyError: if an ``exon`` row appears before any ``transcript`` row
            with the same gene id.
        IndexError: if a row has fewer than 9 tab-separated columns.
    """
    # The context managers guarantee both handles are closed even when a
    # malformed row makes parsing raise part-way through the file.
    with open(gff3_file, 'w') as gff3:
        gff3.write("##gff-version 3\n")
        sizes_dict = subtools.sequence_region(chrom_sizes)
        seq_regions = dict()
        # gene id -> ID of the most recent transcript feature of that gene.
        parents = dict()
        with open(gtf_file, 'r') as gtf:
            for line in gtf:
                if line.startswith('#') or not line.strip():
                    continue
                field = OrderedDict()
                attribute = OrderedDict()
                li = line.rstrip().split("\t")
                field['seqid'] = li[0]
                # Announce each sequence once, before its first feature.
                if field['seqid'] not in seq_regions:
                    end_region = sizes_dict[field['seqid']]
                    gff3.write("##sequence-region " + field['seqid'] +
                               ' 1 ' + str(end_region) + '\n')
                    seq_regions[field['seqid']] = end_region
                field['source'] = li[1]
                field['type'] = li[2]
                # GTF coordinates are copied unchanged; unlike the BED
                # converters, no 0-based shift is applied here.
                field['start'] = li[3]
                field['end'] = li[4]
                field['score'] = li[5]
                field['strand'] = li[6]
                field['phase'] = li[7]
                attr_li = li[8].split(';')
                gene_id = _gtf_attribute(attr_li, 'gene_id', 0)
                attribute['ID'] = gene_id + '_' + field['type'] + '_' + str(
                    field['start']) + '_' + str(field['end'])
                if field['type'] == 'transcript':
                    parents[gene_id] = attribute['ID']
                    attribute['transcript_id'] = _gtf_attribute(
                        attr_li, 'transcript_id', 1)
                    attribute['coverage'] = _gtf_attribute(attr_li, 'cov', 2)
                    attribute['fpkm'] = _gtf_attribute(attr_li, 'FPKM', 3)
                    attribute['tpm'] = _gtf_attribute(attr_li, 'TPM', 4)
                elif field['type'] == 'exon':
                    attribute['Parent'] = parents[gene_id]
                    attribute['transcript_id'] = _gtf_attribute(
                        attr_li, 'transcript_id', 1)
                    attribute['coverage'] = _gtf_attribute(attr_li, 'cov', 3)
                subtools.write_features(field, attribute, gff3)
Ejemplo n.º 3
0
 def bigpsl_to_gff3(self):
     """Convert the tab-separated bigPsl rows of ``self.inputFile`` to GFF3.

     The result is written to the path ``self.gff3_file.name``.  Every
     non-blank row becomes one ``match`` feature written with
     ``subtools.write_features``; its blocks are then written with
     ``subtools.child_blocks`` as ``match_part`` children.  A
     ``##sequence-region`` pragma is emitted the first time each seqid is
     encountered, using the sizes obtained from
     ``subtools.sequence_region(self.chromSizesFile)``.

     Raises:
         IndexError: if a non-blank row has fewer than 24 tab-separated
             columns.
     """
     # The context managers guarantee both handles are closed even when a
     # malformed row makes parsing raise part-way through the file.
     with open(self.gff3_file.name, 'w') as gff3:
         gff3.write("##gff-version 3\n")
         sizes_dict = subtools.sequence_region(self.chromSizesFile)
         seq_regions = dict()
         with open(self.inputFile, 'r') as bed:
             for line in bed:
                 # Blank lines (e.g. a trailing newline at end of file)
                 # carry no feature; skip them instead of failing on the
                 # column lookups below.
                 if not line.strip():
                     continue
                 field = OrderedDict()
                 attribute = OrderedDict()
                 li = line.rstrip().split("\t")
                 field['seqid'] = li[0]
                 # Announce each sequence once, before its first feature.
                 if field['seqid'] not in seq_regions:
                     end_region = sizes_dict[field['seqid']]
                     gff3.write("##sequence-region " + field['seqid'] +
                                ' 1 ' + str(end_region) + '\n')
                     seq_regions[field['seqid']] = end_region
                 field['source'] = 'UCSC BLAT alignment tool'
                 field['type'] = 'match'
                 # BED numbers the first base of a chromosome 0, so the
                 # start is shifted by one; the end column is copied
                 # unchanged.
                 field['start'] = str(int(li[1]) + 1)
                 field['end'] = li[2]
                 field['score'] = li[4]
                 field['strand'] = li[5]
                 field['phase'] = '.'
                 attribute['ID'] = li[0] + '_' + li[3]
                 attribute['Name'] = li[3]
                 attribute['blockcount'] = li[9]
                 attribute['blocksizes'] = li[10]
                 attribute['chromstarts'] = li[11]
                 attribute['ochrom_start'] = li[12]
                 attribute['ochrom_end'] = li[13]
                 attribute['ochrom_strand'] = li[14]
                 attribute['ochrom_size'] = li[15]
                 attribute['ochrom_starts'] = li[16]
                 attribute['sequence on other chromosome'] = li[17]
                 attribute['cds in ncbi format'] = li[18]
                 attribute['size of target chromosome'] = li[19]
                 attribute['number of bases matched'] = li[20]
                 attribute['number of bases that don\'t match'] = li[21]
                 attribute[
                     'number of bases that match but are part of repeats'
                 ] = li[22]
                 attribute['number of \'N\' bases'] = li[23]
                 subtools.write_features(field, attribute, gff3)
                 subtools.child_blocks(field, attribute, gff3, 'match_part')
Ejemplo n.º 4
0
def trfbig_to_gff3(inputBedFile, chrom_sizes, output):
    """Convert a tab-separated tandem-repeat BED file to GFF3.

    Every non-blank input row becomes one ``tandem_repeat`` feature on the
    ``+`` strand, written with ``subtools.write_features``.  A
    ``##sequence-region`` pragma is emitted the first time each seqid is
    encountered.

    Args:
        inputBedFile: path of the BED file to read.
        chrom_sizes: handed to ``subtools.sequence_region``; the returned
            object is indexed by seqid to obtain the region end.
        output: path of the GFF3 file to write (opened in ``'w'`` mode).

    Raises:
        IndexError: if a non-blank row has fewer than 16 tab-separated
            columns.
    """
    # The context managers guarantee both handles are closed even when a
    # malformed row makes parsing raise part-way through the file.
    with open(output, 'w') as gff3:
        gff3.write("##gff-version 3\n")
        sizes_dict = subtools.sequence_region(chrom_sizes)
        seq_regions = dict()
        with open(inputBedFile, 'r') as bed:
            for line in bed:
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no feature; skip them instead of failing on the column
                # lookups below.
                if not line.strip():
                    continue
                field = OrderedDict()
                attribute = OrderedDict()
                li = line.rstrip().split("\t")
                field['seqid'] = li[0]
                # Announce each sequence once, before its first feature.
                if field['seqid'] not in seq_regions:
                    end_region = sizes_dict[field['seqid']]
                    gff3.write("##sequence-region " + field['seqid'] +
                               ' 1 ' + str(end_region) + '\n')
                    seq_regions[field['seqid']] = end_region
                field['source'] = li[3]
                field['type'] = 'tandem_repeat'
                # BED numbers the first base of a chromosome 0, so the start
                # is shifted by one; the end column is copied unchanged.
                field['start'] = str(int(li[1]) + 1)
                field['end'] = li[2]
                field['score'] = li[9]
                field['strand'] = '+'
                field['phase'] = '.'
                attribute['length of repeat unit'] = li[4]
                attribute['mean number of copies of repeat'] = li[5]
                attribute['length of consensus sequence'] = li[6]
                attribute['percentage match'] = li[7]
                attribute['percentage indel'] = li[8]
                attribute['percent of a\'s in repeat unit'] = li[10]
                attribute['percent of c\'s in repeat unit'] = li[11]
                attribute['percent of g\'s in repeat unit'] = li[12]
                attribute['percent of t\'s in repeat unit'] = li[13]
                attribute['entropy'] = li[14]
                attribute['sequence of repeat unit element'] = li[15]
                subtools.write_features(field, attribute, gff3)
def _strand_from_frame(frame):
    """Return '+' or '-' for a two-element reading-frame pair.

    Same-sign frames give '+', opposite-sign frames give '-'.  When at least
    one frame is 0 the sign of their sum decides, with (0, 0) mapping to '+'.
    """
    product = frame[1] * frame[0]
    if product > 0:
        return '+'
    if product < 0:
        return '-'
    return '+' if frame[0] + frame[1] >= 0 else '-'


def gff3_writer(blast_records, gff3_file):
    """Append BLAST alignments to *gff3_file* as GFF3 features.

    For every alignment of every record one parent ``match`` feature is
    written, followed by one ``match_part`` feature per HSP, sorted by
    (start, end).  Each ``match_part`` carries an ``overlap`` attribute that
    is ``"start,prev_end"`` when it begins at or before the end of the
    previously written part and an empty string otherwise.  A
    ``##sequence-region`` pragma is emitted the first time each contig name
    is encountered.

    Args:
        blast_records: iterable of records exposing ``query``,
            ``application``, ``matrix`` and ``alignments``; each alignment
            exposes ``title``, ``length`` and ``hsps``.
            NOTE(review): this looks like Biopython's NCBIXML record
            objects -- confirm against the caller.
        gff3_file: path of the GFF3 file; it is opened in append mode and
            the ``##gff-version 3`` header is written on every call.
    """
    seq_regions = dict()
    # The context manager guarantees the handle is closed even when a
    # record makes processing raise part-way through.
    with open(gff3_file, 'a') as gff3:
        gff3.write("##gff-version 3\n")
        for blast_record in blast_records:
            query_name = blast_record.query.split(" ")[0]
            source = blast_record.application
            method = blast_record.matrix
            for alignment in blast_record.alignments:
                # The contig name is the last space-separated word of the
                # alignment title.
                contig_name = alignment.title.split(" ")[-1]
                length = alignment.length
                parent_id = contig_name + '_' + query_name

                parent_field = OrderedDict()
                parent_field['seqid'] = contig_name
                parent_field['source'] = source
                parent_field['type'] = 'match'
                parent_attribute = OrderedDict()
                parent_attribute['ID'] = parent_id
                parent_attribute['Name'] = query_name
                parent_attribute['method'] = method
                parent_attribute['length'] = length

                if contig_name not in seq_regions:
                    gff3.write("##sequence-region " + contig_name + ' 1 ' +
                               str(length) + '\n')
                    seq_regions[contig_name] = length

                # Extent of the parent feature: smallest start and largest
                # end over all HSPs, seeded with (length, 0).
                span_start, span_end = length, 0
                alignments = []
                for match_num, hsp in enumerate(alignment.hsps):
                    field = OrderedDict()
                    attribute = OrderedDict()
                    ref = hsp.sbjct
                    query = hsp.query
                    field['seqid'] = contig_name
                    field['source'] = source
                    field['type'] = 'match_part'

                    field['start'] = hsp.sbjct_start
                    span_start = min(span_start, field['start'])
                    ref_length = len(ref.replace('-', ''))
                    # if run tblastn, the actual length of reference should
                    # be multiplied by 3
                    if source.lower() == "tblastn":
                        ref_length *= 3
                    field['end'] = field['start'] + ref_length - 1
                    span_end = max(span_end, field['end'])
                    field['score'] = hsp.score
                    field['strand'] = _strand_from_frame(hsp.frame)
                    field['phase'] = '.'

                    target_start = hsp.query_start
                    target_len = len(query.replace('-', ''))
                    # if run blastx, the actual length of query should be
                    # multiplied by 3
                    if source.lower() == "blastx":
                        target_len *= 3
                    target_end = target_start + target_len - 1
                    attribute['ID'] = parent_id + '_match_' + str(match_num)
                    attribute['Parent'] = parent_id
                    attribute['Target'] = query_name + " " + str(
                        target_start) + " " + str(target_end)
                    attribute['Gap'] = align2cigar(query, ref)
                    # store the query sequence and match string in the file
                    # in order to display alignment with BlastAlignment
                    # plugin
                    attribute['subject'] = hsp.sbjct
                    attribute['query'] = hsp.query
                    attribute['match'] = hsp.match
                    # 'gaps' counts the blank positions of the match line;
                    # '+' marks similar (positive but not identical) ones.
                    attribute['gaps'] = attribute['match'].count(' ')
                    similar = attribute['match'].count('+')
                    attribute['identities'] = len(
                        attribute['match']) - similar - attribute['gaps']
                    attribute['positives'] = attribute['identities'] + similar
                    attribute['expect'] = hsp.expect
                    # The second frame value is recorded for every HSP,
                    # including frame (0, 0).
                    attribute['frame'] = hsp.frame[1]
                    alignments.append({'field': field,
                                       'attribute': attribute})

                parent_field['start'] = span_start
                parent_field['end'] = span_end
                parent_field['score'] = '.'
                parent_field['strand'] = '.'
                parent_field['phase'] = '.'
                parent_attribute['match_num'] = len(alignments)
                alignments.sort(
                    key=lambda x: (x['field']['start'], x['field']['end']))
                subtools.write_features(parent_field, parent_attribute, gff3)

                prev_end = -1
                for align in alignments:
                    overlap = ''
                    if align['field']['start'] <= prev_end:
                        overlap += str(
                            align['field']['start']) + ',' + str(prev_end)
                    prev_end = align['field']['end']
                    align['attribute']['overlap'] = overlap
                    subtools.write_features(align['field'],
                                            align['attribute'], gff3)