def intron_from_line(line):
    """Build an Intron from the leading fields of a parsed line.

    The first three fields are always passed to ``Intron`` positionally.
    A fourth field (presumably the strand) is forwarded as well, but only
    when it is exactly ``'+'`` or ``'-'``.

    :param line: Indexable sequence of parsed fields with at least three
        items.
    :return: The constructed Intron.
    :raises IndexError: If the line has fewer than three fields.
    """
    try:
        fourth = line[3]
    except IndexError:
        # Short line: there is no fourth field to inspect.
        fourth = None

    # Compare against the two symbols explicitly. A substring test such as
    # `fourth in '+-'` would also accept '' and '+-'.
    if isinstance(fourth, str) and fourth in ('+', '-'):
        return Intron(line[0], line[1], line[2], fourth)
    return Intron(line[0], line[1], line[2])
def _line_to_intron(line):
    """Convert one RegTools junction record into an Intron.

    The second-to-last field is a comma-separated list; its first entry is
    added to field 1 to get the intron start, and its second entry is
    subtracted from field 2 to get the intron end. Field 4 is stored as the
    intron's support.
    """
    scaffold = line[0]
    # Split once and index, rather than unpacking, so that any extra
    # entries after the first two are ignored.
    sizes = line[-2].split(',')
    intron_start = line[1] + int(sizes[0])
    intron_end = line[2] - int(sizes[1])
    return Intron(scaffold, intron_start, intron_end, support=line[4])
def extract_introns_from_gtf(file, file_out):
    """Derive introns from consecutive exon records and write them to a file.

    Records come from ``process_file(file)``. A record whose third field is
    'transcript' starts a new transcript and supplies the gene label
    (field 9). For every 'exon' record after the first one of a transcript,
    an intron is built that spans from field 4 of the previous exon to
    field 3 of the current exon minus one. Introns are de-duplicated on
    (scaffold, start, end), keeping the first occurrence.

    :param file: Input handed to ``process_file``.
    :param file_out: Path of the tab-separated output file with the columns
        scaffold, start, end, gene, strand.
    """
    collected = []
    seen_keys = set()

    for fields in process_file(file):
        feature = fields[2]
        if feature == 'transcript':
            new_gene = True
            gene = fields[9]
            continue
        if feature != 'exon':
            continue

        if new_gene:
            # First exon of the transcript: there is no preceding exon, so
            # no intron can be closed yet.
            new_gene = False
        else:
            end = fields[3] - 1
            scaffold = fields[0]
            strand = fields[6]
            intron = Intron(scaffold, start, end, gene=gene, strand=strand)
            key = ' '.join([scaffold, str(start), str(end)])
            if key not in seen_keys:
                seen_keys.add(key)
                collected.append(intron)
        # The next intron of this transcript starts where this exon ends.
        start = fields[4]

    with open(file_out, 'w') as f_out:
        for intron in collected:
            row = (intron.scaffold, intron.start, intron.end,
                   intron.gene, intron.strand)
            f_out.write('\t'.join(map(str, row)))
            f_out.write('\n')
def intron_stats(file):
    """Calculate basic statistics of introns from a file.

    Prints the number of introns and, when there is at least one, the mean
    and median of their support values.

    :param file: Input handed to ``process_file``; the first three fields
        of each line are passed to ``Intron`` positionally and field 3 is
        used as its support.
    """
    introns = [
        Intron(line[0], line[1], line[2], support=line[3])
        for line in process_file(file)
    ]
    # The original label carried a stray '0' ('Number of introns:0').
    print('Number of introns: ', len(introns))
    if not introns:
        # Without data the mean would divide by zero and the median of an
        # empty list is undefined, so only the count is reported.
        return

    supports = [intron.support for intron in introns]
    print('Mean support: ', sum(supports) / len(supports))
    print('Median support: ', median(supports))
def choose_best_introns(file_in, file_out, cutoff):
    """
    Choose one best intron over every position.

    Introns with support below ``cutoff`` are recorded in the "all introns"
    dictionary but otherwise ignored. Among the remaining ones, an intron
    that starts before the end of the current best intron on the same
    scaffold competes with it, and the one with the strictly higher support
    wins. Any other intron closes the current group: the current best is
    written out and the new intron becomes the next candidate.

    :param file_in: (str) Path to the .bed file with introns in format:
        scaffold start end support. All introns from a scaffold must come
        one after another in the file, and within one scaffold introns have
        to be sorted by start.
    :param file_out: (str) Path to the out file with best introns.
    :param cutoff: (int) Minimum support of the best intron.
    :return: Two dictionaries where key is scaffold and value is the list of
        introns on the scaffold: one containing all the introns from the
        input file and one with the best introns.
    """
    best_introns = defaultdict(list)
    all_introns = defaultdict(list)
    # Best candidate of the group currently being scanned, stored as
    # (chrom, start, end, score, intron). None until a line passes the
    # cutoff, so no placeholder row is ever written.
    pending = None

    with open(file_out, 'w') as f_out:

        def write_junction(record):
            # Write the record's own scaffold and Intron object, never the
            # values of the line that happened to trigger the flush.
            chrom, start, end, score, intron = record
            f_out.write('\t'.join(str(x) for x in (chrom, start, end, score)))
            f_out.write('\n')
            best_introns[chrom].append(intron)

        for line in process_file(file_in):
            chrom, start, end, score = line
            intron = Intron(chrom, start, end, support=score)
            all_introns[chrom].append(intron)

            # only consider introns with high enough support
            if score < cutoff:
                continue

            candidate = (chrom, start, end, score, intron)
            if pending is None:
                pending = candidate
            elif chrom == pending[0] and start < pending[2]:
                # still in the same intron: keep one best intron per position
                if score > pending[3]:
                    pending = candidate
            else:
                # new intron or new scaffold: the old best is final
                write_junction(pending)
                pending = candidate

        # now we need to write the last one (if there was any)
        if pending is not None:
            write_junction(pending)

    return all_introns, best_introns
def intron_dict(file):
    """Group the introns described in a file by scaffold.

    For each line from ``process_file(file)``, the first three fields are
    passed to ``Intron`` positionally, field 3 as ``gene`` and field 4 as
    ``strand``.

    :param file: Input handed to ``process_file``.
    :return: ``defaultdict(list)`` mapping each intron's ``scaffold``
        attribute to the introns on it, in input order.
    """
    by_scaffold = defaultdict(list)
    for fields in process_file(file):
        entry = Intron(
            fields[0],
            fields[1],
            fields[2],
            gene=fields[3],
            strand=fields[4],
        )
        by_scaffold[entry.scaffold].append(entry)
    return by_scaffold
def file_to_seq_introns(file, margin):
    """Extract introns with sequences from a fasta file.

    Header lines (first character '>') are split on '>', ':' and '-' and
    must yield exactly three fields after the leading empty one: scaffold,
    start and end (e.g. ``>scaffold:start-end``). Every other line is taken
    as a sequence belonging to the most recent header, and one Intron is
    created per such line.

    :param file: Path to the fasta file.
    :param margin: Value used for both ``margin_left`` and ``margin_right``
        of every Intron.
    :return: List of Introns in file order.
    """
    introns = []
    with open(file) as f_in:
        # Iterate the file lazily instead of loading it with readlines().
        for line in f_in:
            if line[0] == '>':
                # Raw string: '\-' in a normal literal is an invalid escape
                # sequence. The pattern itself is unchanged.
                scaffold, start, end = re.split(r'[>:\-]', line.strip())[1:]
            else:
                introns.append(
                    Intron(
                        scaffold,
                        int(start),
                        int(end),
                        margin_left=margin,
                        margin_right=margin,
                        sequence=line.strip(),
                    )
                )
    return introns