def read_insertions(telocate_out, sample_name, chromosomes, rp_threshold=0):
    """Parse a telocate result table into a list of Insertion records.

    The first two lines of the file are skipped. For every remaining
    tab-separated row the chromosome (column 0), start (column 1), family
    (second "/"-separated token of column 3), orientation (column 12) and
    read-pair support (column 6) are read.

    Args:
        telocate_out: path of the telocate output file.
        sample_name: sample label embedded in each record name.
        chromosomes: container of chromosome names to keep.
        rp_threshold: minimum read-pair support required to keep a record.

    Returns:
        The list of records that pass both the support and chromosome filters.
    """
    strand_by_orientation = {"parallel": "+", "uncertain": "."}
    kept = []

    with open(telocate_out, "r") as handle:
        for line_number, row in enumerate(handle):
            # Lines 0 and 1 are not data rows.
            if line_number <= 1:
                continue

            record = output.Insertion(output.Telocate())
            fields = row.split("\t")
            record.chromosome = fields[0]
            record.start = int(fields[1])
            family = fields[3].split("/")[1]
            record.family = family

            # Rows flagged "old" in column 15 are reference copies; column 2
            # is then added to the start to obtain the end coordinate.
            if "old" in fields[15]:
                kind = "reference"
                end = record.start + int(fields[2])
            else:
                kind = "non-reference"
                end = record.start
            record.type = kind
            record.end = end
            record.name = family + "|" + kind + "|NA|" + sample_name + "|telocate|rp|"

            # Anything other than "parallel"/"uncertain" maps to the minus strand.
            record.strand = strand_by_orientation.get(fields[12], "-")

            read_pairs = int(fields[6])
            record.support_info.support['read_pair_support'].value = read_pairs

            if read_pairs >= rp_threshold and record.chromosome in chromosomes:
                kept.append(record)

    return kept
def get_ref_tes(gff, taxon, chroms):
    """Build reference-TE Insertion records from a GFF and a taxonomy table.

    Args:
        gff: path of the reference TE annotation in GFF layout. Lines that
            contain "#" anywhere are skipped.
        taxon: path of a tab-separated table whose first column is a TE id
            and whose second column is the family of that TE.
        chroms: container of chromosome names to keep.

    Returns:
        A list of records of type "reference" carrying chromosome, start,
        end, strand and family for every annotated TE on a kept chromosome.

    Raises:
        KeyError: if a GFF feature type (column 2) has no taxonomy entry.
    """
    te_family = {}
    with open(taxon, "r") as t:
        for line in t:
            # Drop the line ending before splitting: when the family is the
            # last column it would otherwise keep a trailing newline.
            split_line = line.rstrip("\r\n").split("\t")
            te_family[split_line[0]] = split_line[1]

    ref_inserts = []
    with open(gff, "r") as g:
        for line in g:
            if "#" in line:
                continue
            split_line = line.split("\t")
            insert = output.Insertion(output.Popoolationte2())
            insert.type = "reference"
            insert.chromosome = split_line[0]
            insert.start = int(split_line[3])
            insert.end = int(split_line[4])
            insert.strand = split_line[6]
            # The GFF feature-type column holds the TE id used by the taxonomy.
            insert.family = te_family[split_line[2]]
            if insert.chromosome in chroms:
                ref_inserts.append(insert)

    return ref_inserts
def read_insertions(bed, te_to_family, sample_name, te_pos_to_family, chromosomes, reference=False):
    """Read a tepid BED file into Insertion records.

    Args:
        bed: path of the tab-separated BED file.
        te_to_family: mapping from TE name to family (used when
            ``reference`` is true).
        sample_name: sample label embedded in each record name.
        te_pos_to_family: mapping from "chrom_start_end" of the source TE to
            its family (used when ``reference`` is false).
        chromosomes: container of chromosome names to keep.
        reference: when true, rows are read as reference TE calls
            (strand in column 3, TE names in column 4); otherwise as
            non-reference calls (source TE position in columns 3-5).

    Returns:
        List of records located on one of ``chromosomes``.
    """
    records = []
    with open(bed, "r") as handle:
        for row in handle:
            record = output.Insertion(output.Tepid())
            columns = row.split("\t")
            record.chromosome = columns[0]
            if record.chromosome not in chromosomes:
                continue

            record.start = int(columns[1])
            record.end = int(columns[2])

            if not reference:
                source_chrom = columns[3]
                source_start = columns[4]
                source_end = columns[5]
                position_key = source_chrom + "_" + source_start + "_" + source_end
                record.family = te_pos_to_family[position_key]
                record.type = "non-reference"
                record.name = record.family + "|non-reference|NA|" + sample_name + "|tepid|sr|"
            else:
                # Only the first of the comma-separated TE names is used.
                first_te = columns[4].split(",")[0]
                record.family = te_to_family[first_te]
                record.strand = columns[3]
                record.type = "reference"
                record.name = record.family + "|reference|NA|" + sample_name + "|tepid|nonab|"

            # NOTE(review): the original indentation was lost; the id from
            # the last column is assumed to be stored for both kinds of
            # record - confirm against the upstream file.
            record.support_info.id = columns[-1].replace("\n", "")
            records.append(record)

    return records
def get_non_absent_ref_tes(te_gff, absence_bed, sample, out, log):
    """Return reference TEs that do not overlap any interval in a BED file.

    Runs ``bedtools subtract -A`` to write the surviving features of
    ``te_gff`` to a temporary GFF under ``out``, parses that file and
    removes it afterwards.

    Args:
        te_gff: path of the reference TE GFF.
        absence_bed: path of the BED file passed to bedtools as ``-b``.
        sample: sample label embedded in each record name.
        out: directory in which the temporary GFF is written.
        log: log target forwarded to ``mccutils.run_command_stdout``.

    Returns:
        List of records of type "reference".
    """
    scratch_gff = out + "/tmp.ref_nonabs.gff"
    subtract_cmd = ["bedtools", "subtract", "-A", "-a", te_gff, "-b", absence_bed]
    mccutils.run_command_stdout(subtract_cmd, scratch_gff, log=log)

    present = []
    with open(scratch_gff, "r") as handle:
        for row in handle:
            if "#" in row:
                continue

            # Turning ";" into tabs exposes the attribute entries as extra
            # columns; column 9 is the second attribute.
            columns = row.replace(";", "\t").split("\t")
            record = output.Insertion(output.Temp())
            record.chromosome = columns[0]
            record.start = int(columns[3])
            record.end = int(columns[4])
            label = columns[9].split("=")[1]
            record.name = label + "|reference|NA|" + sample + "|temp|nonab|"
            record.strand = columns[6]
            record.type = "reference"
            present.append(record)

    mccutils.remove(scratch_gff)
    return present
def read_insertions(insert_bed, sample_name, chromosomes, config):
    """Parse a TEMP2 insertion table into filtered Insertion records.

    The first line is skipped and only rows with exactly 15 tab-separated
    columns are used.

    Args:
        insert_bed: path of the TEMP2 insertion file.
        sample_name: sample label embedded in each record name.
        chromosomes: container of chromosome names to keep.
        config: object whose ``PARAMS`` mapping provides
            "frequency_threshold" and "acceptable_insertion_support_classes".

    Returns:
        List of non-reference records passing the chromosome, frequency and
        support-class filters.
    """
    plain_float_columns = (
        ("supportreads", 7),
        ("referencereads", 8),
        ("fiveprimesupport", 9),
        ("threeprimesupport", 10),
    )
    kept = []

    with open(insert_bed, "r") as handle:
        for index, row in enumerate(handle):
            if index == 0:
                continue

            record = output.Insertion(output.Temp2())
            fields = row.split("\t")
            if len(fields) != 15:
                continue

            record.chromosome = fields[0]
            record.start = int(fields[1]) + 1
            record.end = int(fields[2])
            record.family = fields[3].split(":")[0]
            record.type = "non-reference"

            support = record.support_info.support
            support["frequency"].value = float(fields[4])
            record.strand = fields[5]
            support["class"].value = fields[6]
            for key, column in plain_float_columns:
                support[key].value = float(fields[column])
            # rare entries have a % sign for some reason
            support["reliability"].value = float(fields[12].replace("%", ""))
            support["fiveprimejunctionsupport"].value = float(fields[13])
            support["threeprimejunctionsupport"].value = float(fields[14])

            # Junction support on both ends marks a split-read call,
            # anything else is labelled read-pair.
            both_junctions = (
                support["fiveprimejunctionsupport"].value > 0
                and support["threeprimejunctionsupport"].value > 0
            )
            evidence_tag = "sr|" if both_junctions else "rp|"
            record.name = (
                record.family + "|non-reference|"
                + str(support["frequency"].value)
                + "|" + sample_name + "|temp2|"
            )
            record.name += evidence_tag

            if (record.chromosome in chromosomes
                    and support["frequency"].value >= config.PARAMS["frequency_threshold"]
                    and support["class"].value in config.PARAMS["acceptable_insertion_support_classes"]):
                kept.append(record)

    return kept
def read_insertions(tebreak_out, sample_name, chromosomes, config):
    """Parse a TEBreak result table into filtered Insertion records.

    The first line is the header; its column names are used to locate the
    fields of every following row.

    Args:
        tebreak_out: path of the tab-separated TEBreak table.
        sample_name: sample label embedded in each record name.
        chromosomes: container of chromosome names to keep.
        config: object providing the ``MIN_*`` threshold attributes that
            each support value is compared against.

    Returns:
        List of non-reference records that are on a kept chromosome and meet
        every minimum threshold.
    """
    # (support key, header column name, config attribute holding the minimum)
    support_spec = (
        ("five_p_elt_match", '5p_Elt_Match', "MIN_5P_ELT_MATCH"),
        ("three_p_elt_match", '3p_Elt_Match', "MIN_3P_ELT_MATCH"),
        ("five_p_genome_match", '5p_Genome_Match', "MIN_5P_GENOME_MATCH"),
        ("three_p_genome_match", '3p_Genome_Match', "MIN_3P_GENOME_MATCH"),
        ("split_reads_5prime", 'Split_reads_5prime', "MIN_SPLIT_READS_5P"),
        ("split_reads_3prime", 'Split_reads_3prime', "MIN_SPLIT_READS_3P"),
        ("remapped_discordant", 'Remapped_Discordant', "MIN_REMAPPED_DISCORDANT"),
        ("remap_disc_fraction", 'Remap_Disc_Fraction', "MIN_REMAP_DISC_FRACTION"),
        ("remapped_splitreads", 'Remapped_Splitreads', "MIN_REMAPPED_SPLITREADS"),
        ("remap_split_fraction", 'Remap_Split_Fraction', "MIN_REMAP_SPLIT_FRACTION"),
    )

    kept = []
    column_of = {}
    with open(tebreak_out, "r") as handle:
        for line_number, raw in enumerate(handle):
            fields = raw.replace("\n", "").split("\t")
            if line_number == 0:
                for position, title in enumerate(fields):
                    column_of[title] = position
                continue

            record = output.Insertion(output.Tebreak())
            record.chromosome = fields[column_of['Chromosome']]
            record.start = int(fields[column_of['3_Prime_End']]) + 1
            record.end = int(fields[column_of['5_Prime_End']])
            record.family = fields[column_of['Superfamily']]
            record.type = "non-reference"

            # The strand is only trusted when both ends agree on it.
            orient_5p = fields[column_of['Orient_5p']]
            if orient_5p == fields[column_of['Orient_3p']]:
                record.strand = orient_5p
            else:
                record.strand = "."

            # Minus-strand calls have their two coordinates exchanged.
            if record.strand == "-":
                record.start, record.end = record.end, record.start

            support = record.support_info.support
            for key, title, _ in support_spec:
                support[key].value = float(fields[column_of[title]])

            record.name = record.family + "|non-reference|NA|" + sample_name + "|tebreak|sr|"

            if record.chromosome in chromosomes and all(
                    support[key].value >= getattr(config, minimum)
                    for key, _, minimum in support_spec):
                kept.append(record)

    return kept
def read_insertions(predictions, ref_tes, chroms, sample, both_end_support_needed=True, support_threshold=0.1):
    """Convert PoPoolationTE2 predictions into Insertion records.

    Each tab-separated row supplies chromosome (column 1), position
    (column 2), strand (column 3), family (column 4), supported flanks
    (column 6) and frequency (column 8). A prediction whose position falls
    inside a reference TE on the same chromosome takes over that TE's family;
    the first prediction hitting a given reference TE also takes over its
    coordinates and strand and becomes a "reference" record, later hits on
    the same TE are discarded.

    Args:
        predictions: path of the prediction table.
        ref_tes: list of reference TE records; their ``support_info.added``
            flag is set to True here once a prediction has claimed them.
        chroms: container of chromosome names to keep.
        sample: sample label embedded in each record name.
        both_end_support_needed: when true, only predictions whose supported
            flanks equal "FR" are used.
        support_threshold: frequency a prediction must exceed (strictly).

    Returns:
        List of reference and non-reference records.
    """
    insertions = []
    with open(predictions, "r") as tsv:
        for line in tsv:
            split_line = line.split("\t")
            insert = output.Insertion(output.Popoolationte2())
            insert.chromosome = split_line[1]
            insert.start = int(split_line[2])
            insert.end = int(split_line[2])
            insert.strand = split_line[3]
            insert.family = split_line[4]
            support = insert.support_info.support
            support['flanks_supported'].value = split_line[6]
            support['frequency'].value = float(split_line[8])

            # `chroms` was accepted but never applied; filter here like the
            # other result parsers do.
            if insert.chromosome not in chroms:
                continue

            flanks_ok = (support['flanks_supported'].value == "FR"
                         or not both_end_support_needed)
            if not (flanks_ok and support['frequency'].value > support_threshold):
                continue

            # determine if insert is a ref insert
            for ref_te in ref_tes:
                # Coordinates are only comparable within one chromosome;
                # without this check a TE elsewhere in the genome could match.
                if ref_te.chromosome != insert.chromosome:
                    continue
                if ref_te.start <= insert.start <= ref_te.end:
                    insert.family = ref_te.family
                    insert.support_info.added = ref_te.support_info.added
                    if not ref_te.support_info.added:
                        ref_te.support_info.added = True
                        insert.type = "reference"
                        insert.start = ref_te.start
                        insert.end = ref_te.end
                        insert.strand = ref_te.strand

            if insert.type != "reference":
                insert.type = "non-reference"
            insert.name = (insert.family + "|" + insert.type + "|"
                           + str(support['frequency'].value)
                           + "|" + sample + "|popoolationte2|rp|")

            # A prediction that landed on an already claimed reference TE is
            # a duplicate and is dropped.
            if not insert.support_info.added:
                insertions.append(insert)

    return insertions
def read_insertions(ref_bed, nonref_bed, chromosomes, sample_name, out_dir):
    """Read ngs_te_mapper2 reference and non-reference BED files.

    Args:
        ref_bed: path of the BED file with reference calls; column 3 is used
            as the family.
        nonref_bed: path of the BED file with non-reference calls; column 3
            is a "|"-separated label holding the family (item 0), frequency
            (item 2), 3' support (item 3), 5' support (item 4) and reference
            read count (item 5).
        chromosomes: container of chromosome names to keep.
        sample_name: sample label embedded in each record name.
        out_dir: not used by this function.

    Returns:
        List with the reference records first, followed by the
        non-reference records.
    """
    name_tail = "|ngs_te_mapper2|sr|"

    def start_record(row, kind):
        # Fill the fields shared by both files; returns the record together
        # with the raw label from column 3.
        columns = row.replace("\n", "").split("\t")
        record = output.Insertion(output.Ngs_te_mapper2())
        record.chromosome = columns[0]
        record.start = int(columns[1]) + 1
        record.end = int(columns[2])
        record.type = kind
        record.strand = columns[5]
        return record, columns[3]

    collected = []

    with open(ref_bed, "r") as handle:
        for row in handle:
            record, label = start_record(row, "reference")
            record.family = label
            record.name = record.family + "|" + record.type + "|NA|" + sample_name + name_tail
            if record.chromosome in chromosomes:
                collected.append(record)

    with open(nonref_bed, "r") as handle:
        for row in handle:
            record, label = start_record(row, "non-reference")
            parts = label.split("|")
            record.family = parts[0]
            support = record.support_info.support
            support['frequency'].value = float(parts[2])
            support['three_prime_support'].value = int(parts[3])
            support['five_prime_support'].value = int(parts[4])
            support['reference_reads'].value = int(parts[5])
            record.name = (record.family + "|" + record.type + "|"
                           + str(support['frequency'].value)
                           + "|" + sample_name + name_tail)
            if record.chromosome in chromosomes:
                collected.append(record)

    return collected
def get_insertions(gff, sample_name, chromosomes, ref_l_threshold=0, ref_r_threshold=0, nonref_l_threshold=0, nonref_r_threshold=0):
    """Parse a RelocaTE GFF into filtered Insertion records.

    Lines containing "#" are skipped. The attribute column (column 8) is
    scanned for the ID, TE_Name, Note and flanking read count entries. A
    Note containing "Shared" marks a reference call, one containing
    "Non-reference" a non-reference call; any other Note yields the type
    "missing", which is never returned.

    Args:
        gff: path of the GFF file.
        sample_name: sample label embedded in each record name.
        chromosomes: container of chromosome names to keep.
        ref_l_threshold: minimum left flanking reads for reference calls.
        ref_r_threshold: minimum right flanking reads for reference calls.
        nonref_l_threshold: minimum left flanking reads for non-reference calls.
        nonref_r_threshold: minimum right flanking reads for non-reference calls.

    Returns:
        List of reference and non-reference records passing their thresholds.
    """
    kept = []
    with open(gff, "r") as handle:
        for row in handle:
            if "#" in row:
                continue

            columns = row.split("\t")
            attributes = columns[8].split(";")
            record = output.Insertion(output.Relocate())
            record.chromosome = columns[0]
            record.start = int(columns[3])
            record.end = int(columns[4])
            record.strand = columns[6]
            support = record.support_info.support

            feature_id = ""
            te_name = ""
            for attribute in attributes:
                if "ID=" in attribute:
                    feature_id = attribute.split("=")[1]
                elif "TE_Name=" in attribute:
                    te_name = attribute.split("=")[1]
                elif "Note=" in attribute:
                    if "Shared" in attribute:
                        record.type = "reference"
                    elif "Non-reference" in attribute:
                        record.type = "non-reference"
                    else:
                        record.type = "missing"
                elif "left_flanking_read_count=" in attribute:
                    support['left_flanking_reads'].value = int(attribute.split("=")[1])
                elif "right_flanking_read_count=" in attribute:
                    support['right_flanking_reads'].value = int(attribute.split("=")[1])

            if record.type == "reference":
                family = te_name
                left_minimum, right_minimum = ref_l_threshold, ref_r_threshold
            elif record.type == "non-reference":
                # Non-reference ids start with the TE name followed by ".".
                family = feature_id.split(".")[0]
                left_minimum, right_minimum = nonref_l_threshold, nonref_r_threshold
            else:
                continue

            record.family = family
            record.name = family + "|" + record.type + "|NA|" + sample_name + "|relocate|sr|"

            if (support['left_flanking_reads'].value >= left_minimum
                    and support['right_flanking_reads'].value >= right_minimum
                    and record.chromosome in chromosomes):
                kept.append(record)

    return kept
def read_insertions(retroseq_vcf, sample_name, chromosomes, support_threshold=0, breakpoint_threshold=6):
    """Parse a RetroSeq VCF into filtered Insertion records.

    Lines containing "#" are skipped. Family and coordinates come from the
    MEINFO entry of the INFO column (column 7); read-pair support, clip
    counts and call status come from the SP, CLIP3, CLIP5 and FL keys of the
    FORMAT/sample columns (columns 8 and 9).

    Args:
        retroseq_vcf: path of the VCF file.
        sample_name: sample label embedded in each record name.
        chromosomes: container of chromosome names to keep.
        support_threshold: minimum read-pair support.
        breakpoint_threshold: minimum call status (FL value).

    Returns:
        List of non-reference records passing all three filters.
    """
    kept = []
    with open(retroseq_vcf, "r") as handle:
        for raw in handle:
            if "#" in raw:
                continue

            record = output.Insertion(output.Retroseq())
            fields = raw.replace("\n", "").split("\t")
            record.chromosome = fields[0]

            # INFO entries without "=" carry no value and are ignored.
            info = {}
            for entry in fields[7].split(";"):
                if "=" in entry:
                    pieces = entry.split("=")
                    info[pieces[0]] = pieces[1]

            meinfo = info['MEINFO'].split(",")
            record.family = meinfo[0].split("-")[0]
            record.start = int(meinfo[1])
            record.end = int(meinfo[2])

            format_keys = fields[8].split(":")
            sample_values = fields[9].split(":")
            sample_format = {
                key: sample_values[position]
                for position, key in enumerate(format_keys)
            }

            support = record.support_info.support
            support['read_pair_support'].value = int(sample_format['SP'])
            support['clip3'].value = int(sample_format['CLIP3'])
            support['clip5'].value = int(sample_format['CLIP5'])
            support['call_status'].value = int(sample_format['FL'])

            record.type = "non-reference"
            record.name = record.family + "|non-reference|NA|" + sample_name + "|retroseq|rp|"

            if (support['read_pair_support'].value >= support_threshold
                    and support['call_status'].value >= breakpoint_threshold
                    and record.chromosome in chromosomes):
                kept.append(record)

    return kept
def get_non_absent_ref_tes(deletions, te_gff, te_to_family, sample_name):
    """Return the reference TEs that were not reported as deleted.

    Every line of ``te_gff`` is turned into a reference record; a record is
    dropped when a deletion with the same chromosome, start, end, strand and
    family exists.

    Args:
        deletions: iterable of records describing absent reference TEs.
        te_gff: path of the reference TE GFF.
        te_to_family: mapping from the TE id found in the ``ID=`` attribute
            to its family.
        sample_name: sample label embedded in each record name.

    Returns:
        List of reference records, in file order, that match no deletion.

    Raises:
        KeyError: if a TE id is missing from ``te_to_family``.
    """
    def identity(te):
        # Key under which a reference TE and a deletion are considered equal.
        return "_".join(
            [te.chromosome, str(te.start), str(te.end), te.strand, te.family])

    ref_tes = []
    with open(te_gff, "r") as gff:
        for line in gff:
            ref_te = output.Insertion(output.Tepid())
            split_line = line.split("\t")
            ref_te.chromosome = split_line[0]
            ref_te.start = int(split_line[3])
            ref_te.end = int(split_line[4])
            ref_te.strand = split_line[6]

            # The last attribute containing "ID=" wins.
            te_id = ""
            for feat in split_line[8].split(";"):
                if "ID=" in feat:
                    te_id = feat.split("=")[1]

            ref_te.family = te_to_family[te_id]
            ref_te.type = "reference"
            ref_te.name = ref_te.family + "|reference|NA|" + sample_name + "|tepid|nonab|"
            ref_tes.append(ref_te)

    # A set gives constant-time membership tests; the former list made the
    # filter below quadratic in the number of TEs.
    absent = {identity(deletion) for deletion in deletions}

    return [te for te in ref_tes if identity(te) not in absent]
def read_insertions(bed, chromosomes, sample_name, out_dir, min_read_cutoff=0):
    """Parse an ngs_te_mapper BED file into filtered Insertion records.

    Every ";" in a line is treated as an additional column separator before
    the line is split on tabs.

    Args:
        bed: path of the BED file.
        chromosomes: container of chromosome names to keep.
        sample_name: sample label embedded in each record name.
        out_dir: not used by this function.
        min_read_cutoff: a record is kept only when its supporting read count
            is strictly greater than this value.

    Returns:
        List of records passing the read and chromosome filters.
    """
    kept = []
    with open(bed, "r") as handle:
        for row in handle:
            record = output.Insertion(output.Ngs_te_mapper())
            columns = row.replace(";", "\t").split("\t")

            record.chromosome = columns[0]
            record.start = int(columns[1]) + 1
            record.end = int(columns[2])
            # The type is taken verbatim from column 8, minus the newline.
            record.type = columns[8].replace("\n", "")
            record.strand = columns[4]
            record.family = columns[5]
            record.name = (record.family + "|" + record.type + "|NA|"
                           + sample_name + "|ngs_te_mapper|sr|")

            reads = int(columns[7])
            record.support_info.support['supportingreads'].value = reads

            if reads > min_read_cutoff and record.chromosome in chromosomes:
                kept.append(record)

    return kept
def get_insertions(gff, sample_name, chromosomes, l_support_threshold=0, r_support_threshold=0, l_junction_threshold=0, r_junction_threshold=0, insert_type="ref"):
    """Parse a RelocaTE2 GFF into filtered Insertion records.

    Lines containing "#" are skipped and every ";" is treated as a column
    separator. For ``insert_type == "ref"`` the four read counts are read
    from columns 11-14 as "key:value"; for any other value they are read from
    columns 12-15 as "key=value" and the family is taken from column 9.

    Args:
        gff: path of the GFF file.
        sample_name: sample label embedded in non-reference record names.
        chromosomes: container of chromosome names to keep.
        l_support_threshold: minimum left support reads.
        r_support_threshold: minimum right support reads.
        l_junction_threshold: minimum left junction reads.
        r_junction_threshold: minimum right junction reads.
        insert_type: "ref" for reference calls, anything else for
            non-reference calls.

    Returns:
        List of records meeting all thresholds; non-reference rows whose
        family is the literal "repeat_name" are excluded.
    """
    # Order in which the read counts appear in consecutive columns.
    count_keys = (
        'right_junction_reads',
        'left_junction_reads',
        'right_support_reads',
        'left_support_reads',
    )

    kept = []
    with open(gff, "r") as handle:
        for row in handle:
            if "#" in row:
                continue

            columns = row.replace(";", "\t").split("\t")
            record = output.Insertion(output.Relocate2())
            record.chromosome = columns[0]
            record.start = int(columns[3])
            record.end = int(columns[4])
            record.strand = columns[6]
            record.name = columns[8].split("=")[1]

            family = ""
            if insert_type == "ref":
                record.type = "reference"
                first_count_column, separator = 11, ":"
            else:
                record.type = "non-reference"
                family = columns[9].split("=")[1].split("/")[0]
                record.family = family
                record.name = family + "|non-reference|NA|" + sample_name + "|relocate2|sr|"
                first_count_column, separator = 12, "="

            support = record.support_info.support
            for offset, key in enumerate(count_keys):
                raw_count = columns[first_count_column + offset]
                support[key].value = int(raw_count.split(separator)[1])

            if (support['right_junction_reads'].value >= r_junction_threshold
                    and support['left_junction_reads'].value >= l_junction_threshold
                    and support['right_support_reads'].value >= r_support_threshold
                    and support['left_support_reads'].value >= l_support_threshold
                    and record.chromosome in chromosomes
                    and family != "repeat_name"):
                kept.append(record)

    return kept
def read_insertions(predictions, chroms, sample, ref_tes, min_presence=3, max_absence=None, min_presence_fraction=0.1, require_tsd=False, require_both_breakpoints=False):
    """Parse TEFLoN genotype output into filtered Insertion records.

    Columns used per tab-separated row: chromosome (0), the two breakpoint
    estimates (1 and 2, "-" when unknown), family (3), strand (5), reference
    TE ids (6, "-" for non-reference calls), the two breakpoint support
    flags (7 and 8), presence/absence/ambiguous read counts (9-11) and
    frequency (12). Rows on chromosomes outside ``chroms`` are ignored.

    Args:
        predictions: path of the TEFLoN output table.
        chroms: container of chromosome names to keep.
        sample: sample label embedded in each record name.
        ref_tes: mapping from reference TE id to a sequence whose first
            three items are chromosome, start and end.
        min_presence: minimum number of presence reads.
        max_absence: maximum number of absence reads, or None for no limit.
        min_presence_fraction: minimum frequency.
        require_tsd: when true, keep only calls flagged as having a TSD.
        require_both_breakpoints: when true, keep only calls with both
            breakpoints.

    Returns:
        List of records passing all filters.

    Raises:
        SystemExit: when a reference call names no TE present in ``ref_tes``.
    """
    insertions = []

    with open(predictions, "r") as tsv:
        for line in tsv:
            split_line = line.split("\t")
            insert = output.Insertion(output.Teflon())
            insert.chromosome = split_line[0]
            if insert.chromosome not in chroms:
                continue

            both_ends = False
            tsd = False
            if split_line[1] != "-" and split_line[2] != "-":
                left = int(split_line[1])
                right = int(split_line[2])
                both_ends = True
            elif split_line[1] == "-":
                left = right = int(split_line[2])
            else:
                left = right = int(split_line[1])

            if left > right:
                # Crossed breakpoints are treated as a TSD; put them in order.
                tsd = True
                left, right = right, left
            elif left == right:
                tsd = True
                right += 1

            insert.start = left - 1
            insert.end = right
            insert.family = split_line[3]
            insert.strand = split_line[5]

            # if reference prediction, uses ref TE coordinates
            if split_line[6] != "-":
                tsd = True
                both_ends = True
                insert.type = "reference"
                # First listed id that is a known reference TE.
                te_name = next(
                    (name for name in split_line[6].split(",")
                     if name and name in ref_tes),
                    "")
                if te_name == "":
                    sys.exit("TEFLON ERROR: can't find:" + split_line[6] +
                             " in reference TEs...\n")
                insert.chromosome = ref_tes[te_name][0]
                insert.start = ref_tes[te_name][1]
                insert.end = ref_tes[te_name][2]
            else:
                insert.type = "non-reference"

            support = insert.support_info.support
            support['five_prime_supported'].value = split_line[7]
            # Column 8 holds the 3' flag in TEFLoN's documented layout; this
            # value used to be read from column 7, duplicating the 5' flag.
            support['three_prime_supported'].value = split_line[8]
            support['presence_reads'].value = int(split_line[9])
            support['absence_reads'].value = int(split_line[10])
            support['ambiguous_reads'].value = int(split_line[11])
            support['frequency'].value = float(split_line[12])

            insert.name = (insert.family + "|" + insert.type + "|"
                           + str(support['frequency'].value)
                           + "|" + sample + "|teflon|rp|")

            if (support['presence_reads'].value >= min_presence
                    and (max_absence is None
                         or support['absence_reads'].value <= max_absence)
                    and support['frequency'].value >= min_presence_fraction
                    and (tsd or not require_tsd)
                    and (both_ends or not require_both_breakpoints)):
                insertions.append(insert)

    return insertions
def read_insertions(popoolationte, sample_name, chromosomes, require_both_end_support=True, percent_read_support_threshold=0.1):
    """Parse a PoPoolationTE result table into filtered Insertion records.

    Numeric columns are converted with ``to_number``. Columns 8-13 hold the
    forward-side values and columns 15-20 the reverse-side values (start,
    end, frequency, coverage, presence reads, absence reads).

    Args:
        popoolationte: path of the tab-separated result table.
        sample_name: sample label embedded in each record name.
        chromosomes: container of chromosome names to keep.
        require_both_end_support: when true, only rows whose supported
            flanks contain "FR" can be kept.
        percent_read_support_threshold: minimum frequency used by the filter.

    Returns:
        List of reference and non-reference records passing the filter.
    """
    # (key suffix, whether the value is converted with to_float=True)
    side_layout = (
        ("insert_start", False),
        ("insert_end", False),
        ("insert_freq", True),
        ("insert_cov", False),
        ("presence_reads", False),
        ("absence_reads", False),
    )

    kept = []
    with open(popoolationte, "r") as handle:
        for row in handle:
            record = output.Insertion(output.Popoolationte())
            fields = row.split("\t")
            support = record.support_info.support

            record.chromosome = fields[0]
            reference_position = to_number(fields[1])
            support["flanks_supported"].value = fields[2]
            record.family = fields[3]
            support["frequency"].value = to_number(fields[4], to_float=True)
            ref_te_id = fields[6]

            for side, first_column in (("forward", 8), ("reverse", 15)):
                for offset, (suffix, as_float) in enumerate(side_layout):
                    raw_value = fields[first_column + offset]
                    if as_float:
                        converted = to_number(raw_value, to_float=True)
                    else:
                        converted = to_number(raw_value)
                    support[side + "_" + suffix].value = converted

            # A side whose start is 0 is replaced by the reported position.
            if support["forward_insert_start"].value == 0:
                record.start = reference_position
                record.end = support["reverse_insert_start"].value
            elif support["reverse_insert_start"].value == 0:
                record.start = support["forward_insert_end"].value
                record.end = reference_position
            else:
                record.start = support["forward_insert_end"].value
                record.end = support["reverse_insert_start"].value

            record.type = "non-reference" if "-" == ref_te_id else "reference"
            record.name = (record.family + "|" + record.type + "|"
                           + str(support["frequency"].value)
                           + "|" + sample_name + "|popoolationte|rp|")

            flanks = support["flanks_supported"].value
            if require_both_end_support:
                keep = ("FR" in flanks
                        and (support["forward_insert_freq"].value >= percent_read_support_threshold
                             or support["reverse_insert_freq"].value >= percent_read_support_threshold)
                        and record.chromosome in chromosomes)
            else:
                # The alternatives are tried in order: both flanks, then the
                # forward side, then the reverse side.
                keep = (("FR" in flanks
                         and support["frequency"].value >= percent_read_support_threshold
                         and record.chromosome in chromosomes)
                        or ("F" in flanks
                            and support["forward_insert_freq"].value >= percent_read_support_threshold
                            and record.chromosome in chromosomes)
                        or (support["reverse_insert_freq"].value >= percent_read_support_threshold
                            and record.chromosome in chromosomes))

            if keep:
                kept.append(record)

    return kept
def read_insertions(jitterbug_gff, taxonomy, chroms, sample_name, min_fwd_read_support=0, min_rev_read_support=0, min_sr_support=0, min_zygosity=0.0):
    """Parse a Jitterbug GFF into filtered Insertion records.

    Only lines with exactly nine tab-separated columns on a chromosome in
    ``chroms`` are used. Coordinates default to columns 3 and 4 and are
    replaced by the ``softclipped_pos`` attribute when both of its values
    are non-negative (after subtracting 1 from the first); such records are
    tagged "sr", all others "rp".

    Args:
        jitterbug_gff: path of the Jitterbug GFF.
        taxonomy: path of a tab-separated table mapping a TE id (column 0)
            to its family (column 1).
        chroms: container of chromosome names to keep.
        sample_name: sample label embedded in each record name.
        min_fwd_read_support: minimum supporting forward reads.
        min_rev_read_support: minimum supporting reverse reads.
        min_sr_support: minimum soft-clipped support.
        min_zygosity: minimum zygosity.

    Returns:
        List of non-reference records passing all thresholds.

    Raises:
        KeyError: if a predicted superfamily is missing from the taxonomy.
    """
    te_family = {}
    with open(taxonomy, "r") as tsv:
        for line in tsv:
            split_line = line.replace("\n", "").split("\t")
            te_family[split_line[0]] = split_line[1]

    insertions = []
    with open(jitterbug_gff, "r") as gff:
        for line in gff:
            split_line = line.replace("\n", "").split("\t")
            if len(split_line) != 9:
                continue

            insert = output.Insertion(output.Jitterbug())
            insert.chromosome = split_line[0]
            if insert.chromosome not in chroms:
                continue

            insert.start = int(split_line[3])
            insert.end = int(split_line[4])
            insert.type = "non-reference"
            support = insert.support_info.support

            sr = False
            family = "NONE"
            for feat in split_line[8].replace(" ", "").split(";"):
                if "softclipped_pos" in feat:
                    pos = feat.split("=")[1].replace("(", "").replace(")", "").split(",")
                    start = int(pos[0]) - 1
                    end = int(pos[1])
                    if start > -1 and end > -1:
                        insert.start = start
                        insert.end = end
                        sr = True
                if "predicted_superfam" in feat:
                    family = te_family[feat.split("=")[1]]
                    insert.family = family
                if "supporting_fwd_reads" in feat:
                    support['supporting_fwd_reads'].value = int(feat.split("=")[1])
                if "supporting_rev_reads" in feat:
                    support['supporting_rev_reads'].value = int(feat.split("=")[1])
                if "softclipped_support" in feat:
                    support['softclipped_support'].value = int(feat.split("=")[1])
                if "zygosity" in feat:
                    support['zygosity'].value = float(feat.split("=")[1])

            insert.name = (family + "|non-reference|"
                           + str(support['zygosity'].value)
                           + "|" + sample_name + "|jitterbug|")
            # Append the evidence tag. The read-pair case used to assign
            # "rp|" and thereby discarded the rest of the name.
            insert.name += "sr|" if sr else "rp|"

            if (support['supporting_fwd_reads'].value >= min_fwd_read_support
                    and support['supporting_rev_reads'].value >= min_rev_read_support
                    and support['softclipped_support'].value >= min_sr_support
                    and support['zygosity'].value >= min_zygosity):
                insertions.append(insert)

    return insertions
def read_insertion_summary(infile, sample):
    """Parse a TEMP insertion summary into non-reference Insertion records.

    The first line is skipped. Rows that do not have exactly 14
    tab-separated columns, or whose coordinates fail the sanity check, are
    reported on stdout and left out.

    Args:
        infile: path of the insertion summary file.
        sample: sample label embedded in each record name.

    Returns:
        List of records; names end in "sr|" when both junctions have
        support (the junction positions then replace start and end) and in
        "rp|" otherwise.
    """
    malformed_notice = "<TEMP POST> Omitting malformed line from insertion summary results:"
    parsed = []

    with open(infile, "r") as handle:
        for index, row in enumerate(handle):
            if index == 0:
                continue

            record = output.Insertion(output.Temp())
            fields = row.split("\t")
            if len(fields) != 14:
                print(malformed_notice, row)
                continue

            record.chromosome = fields[0]
            record.start = int(fields[1]) - 1
            record.end = int(fields[2])
            record.family = fields[3]
            record.name = (record.family + "|non-reference|" + fields[7]
                           + "|" + sample + "|temp|")
            record.strand = "-" if "antisense" in fields[4] else "+"

            support = record.support_info.support
            support['class'].value = fields[5]
            support['variantsupport'].value = int(float(fields[6]))
            support['frequency'].value = float(fields[7])
            support['junction1'].value = int(fields[8])
            support['junction1support'].value = int(fields[9])
            support['junction2'].value = int(fields[10])
            support['junction2support'].value = int(fields[11])
            support['fiveprimesupport'].value = int(float(fields[12]))
            support['threeprimesupport'].value = int(
                float(fields[13].replace("\n", "")))
            record.type = "non-reference"

            coordinates_ok = (record.end >= record.start
                              and record.end > 0
                              and record.start > -1)
            if not coordinates_ok:
                print(malformed_notice, row)
                continue

            if (support['junction1support'].value > 0
                    and support['junction2support'].value > 0):
                # Split-read call: the junction positions become the coordinates.
                record.start = support['junction1'].value
                record.end = support['junction2'].value
                record.name = record.name + "sr|"
            else:
                # Read-pair call.
                record.name = record.name + "rp|"
            parsed.append(record)

    return parsed