Example #1
0
def intron_from_line(line):
    """Build an Intron from a parsed record, attaching the strand when one is present.

    :param line: Indexable record. Items 0-2 are passed to ``Intron`` as its first three positional
    arguments. Item 3, when it exists and is exactly '+' or '-', is passed as the fourth.
    :return: The constructed Intron.
    """
    # Only the optional lookup is guarded, so an IndexError raised inside
    # Intron itself is not silently turned into the strandless fallback.
    try:
        strand = line[3]
    except IndexError:
        strand = None

    # Compare against the two exact symbols: a substring test such as
    # ``strand in '+-'`` would also accept '' and '+-'.
    if isinstance(strand, str) and strand in ('+', '-'):
        return Intron(line[0], line[1], line[2], strand)
    return Intron(line[0], line[1], line[2])
def _line_to_intron(line):
    """Convert one RegTools junction record into an Intron.

    The second-to-last item of the record is a comma-separated string whose first two values are
    read as integers (presumably the BED12 block sizes of the junction -- TODO confirm). The first
    value is added to item 1 and the second is subtracted from item 2 to obtain the intron bounds.

    :param line: Parsed junction record; items 1 and 2 must already be numeric.
    :return: Intron built from item 0, the adjusted bounds, and item 4 as ``support``.
    """
    overhangs = line[-2].split(',')
    left_overhang = int(overhangs[0])
    right_overhang = int(overhangs[1])
    return Intron(line[0],
                  line[1] + left_overhang,
                  line[2] - right_overhang,
                  support=line[4])
Example #3
0
def extract_introns_from_gtf(file, file_out):
    """Derive introns from consecutive exon records and write the unique ones to a file.

    Records come from ``process_file``. A record whose item 2 is 'transcript' starts a new exon
    chain and supplies the gene (item 9). For every 'exon' record after the first one in a chain,
    an intron is built from item 4 of the previous exon up to item 3 of the current exon minus one
    (presumably the standard GTF end/start columns -- TODO confirm). Introns are de-duplicated on
    (scaffold, start, end), keeping the first occurrence.

    NOTE: an 'exon' record seen before any 'transcript' record raises UnboundLocalError, because
    the chain state has not been initialised yet.

    :param file: (str) Path handed to ``process_file``.
    :param file_out: (str) Path of the tab-separated output: scaffold, start, end, gene, strand.
    """
    collected = []
    seen_keys = set()
    for fields in process_file(file):
        feature = fields[2]
        if feature == 'transcript':
            first_exon = True
            gene = fields[9]
            continue
        if feature != 'exon':
            continue

        if first_exon:
            first_exon = False
        else:
            end = fields[3] - 1
            scaffold = fields[0]
            strand = fields[6]
            candidate = Intron(scaffold, start, end, gene=gene, strand=strand)
            key = ' '.join([scaffold, str(start), str(end)])
            if key not in seen_keys:
                seen_keys.add(key)
                collected.append(candidate)
        # The end of this exon is where the next intron of the chain begins.
        start = fields[4]

    with open(file_out, 'w') as f_out:
        for intron in collected:
            columns = (intron.scaffold, intron.start, intron.end, intron.gene,
                       intron.strand)
            f_out.write('\t'.join(map(str, columns)))
            f_out.write('\n')
def intron_stats(file):
    """Print the number of introns in a file and the mean and median of their support.

    :param file: (str) Path handed to ``process_file``. Items 0-2 of each record are passed to
    ``Intron`` positionally and item 3 is passed as ``support``.
    :return: None. For an empty input only the count is printed, since mean and median are
    undefined.
    """
    introns = [
        Intron(line[0], line[1], line[2], support=line[3])
        for line in process_file(file)
    ]
    print('Number of introns: ', len(introns))
    if not introns:
        # Nothing to average; avoid the division by zero below.
        return

    supports = [intron.support for intron in introns]
    print('Mean support: ', sum(supports) / len(supports))
    print('Median support: ', median(supports))
def choose_best_introns(file_in, file_out, cutoff):
    """
    Choose one best intron over every position.

    Introns with support below the cutoff are ignored for selection. The remaining introns are
    scanned in file order: an intron that lies on the same scaffold as the current best and starts
    before the current best ends competes with it, and the one with strictly higher support is
    kept. Any other intron closes the current group, whose best intron is then written out.

    :param file_in: (str) Path to the .bed file with introns in format: scaffold start end support. All introns from
    a scaffold must come one after another in the file, and within one scaffold introns have to be sorted by start.
    :param file_out: (str) Path to the out file with best introns.
    :param cutoff: (int) Minimum support of the best intron.
    :return: Two dictionaries where key is scaffold and value is the list of introns on the scaffold:
    one containing all the introns from the input file and one with the best introns.
    """
    best_introns = defaultdict(list)
    all_introns = defaultdict(list)

    # Best intron of the group currently being scanned, or None before the
    # first intron that passes the cutoff. Keeping the Intron object itself
    # guarantees that the scaffold, the coordinates and the recorded object
    # always belong to the same intron.
    best = None

    with open(file_out, 'w') as f_out:

        def write_junction(chosen):
            chosen_chrom, chosen_start, chosen_end, chosen_score, chosen_intron = chosen
            junction = '\t'.join(
                str(x) for x in (chosen_chrom, chosen_start, chosen_end, chosen_score))
            f_out.write(junction)
            f_out.write('\n')
            best_introns[chosen_chrom].append(chosen_intron)

        for line in process_file(file_in):
            chrom, start, end, score = line
            intron = Intron(chrom, start, end, support=score)
            all_introns[chrom].append(intron)
            # only consider introns with high enough support
            if score < cutoff:
                continue

            candidate = (chrom, start, end, score, intron)
            if best is None:
                best = candidate
            elif chrom == best[0] and start < best[2]:
                # still in the same position: keep only the better-supported one
                if score > best[3]:
                    best = candidate
            else:
                # new position or new scaffold: the previous best is final
                write_junction(best)
                best = candidate

        # the last group has no successor to trigger its write
        if best is not None:
            write_junction(best)

    return all_introns, best_introns
Example #6
0
def intron_dict(file):
    """Group the introns of a file by scaffold.

    :param file: (str) Path handed to ``process_file``. Items 0-2 of each record are passed to
    ``Intron`` positionally, item 3 as ``gene`` and item 4 as ``strand``.
    :return: defaultdict mapping each intron's ``scaffold`` attribute to the list of introns that
    carry it, in file order.
    """
    by_scaffold = defaultdict(list)
    for fields in process_file(file):
        record = Intron(fields[0], fields[1], fields[2],
                        gene=fields[3], strand=fields[4])
        by_scaffold[record.scaffold].append(record)
    return by_scaffold
def file_to_seq_introns(file, margin):
    """Extract introns with sequences from a fasta file.

    Headers are expected in the form ``>scaffold:start-end``. The header is split at its last ':'
    and the coordinates at the first '-' after it, so scaffold names may themselves contain ':'
    or '-'. Blank lines are skipped.

    NOTE: every non-header line yields its own Intron, so a sequence wrapped over several lines is
    not merged into one record.

    :param file: (str) Path to the fasta file.
    :param margin: Value passed to ``Intron`` as both ``margin_left`` and ``margin_right``.
    :return: List of Introns in file order.
    :raises ValueError: If a header does not match the expected form, if the coordinates are not
    integers, or if a sequence line appears before any header.
    """
    introns = []
    scaffold = start = end = None
    with open(file) as f_in:
        # Iterate the file lazily instead of loading every line up front.
        for raw_line in f_in:
            line = raw_line.strip()
            if not line:
                continue

            if line.startswith('>'):
                scaffold, _, span = line[1:].rpartition(':')
                start, _, end = span.partition('-')
                if not (scaffold and start and end):
                    raise ValueError(
                        'Malformed fasta header: {!r}'.format(line))
            else:
                if scaffold is None:
                    raise ValueError(
                        'Sequence line found before any fasta header')
                introns.append(
                    Intron(scaffold,
                           int(start),
                           int(end),
                           margin_left=margin,
                           margin_right=margin,
                           sequence=line))
    return introns