Esempio n. 1
0
def read_insertions(telocate_out, sample_name, chromosomes, rp_threshold=0):
    """Parse a TE-locate result table into insertion records.

    The first two lines of the file are skipped (presumably header lines);
    every later tab-separated line is turned into an ``output.Insertion``.

    Args:
        telocate_out: path to the TE-locate output file.
        sample_name: sample label embedded in each insertion name.
        chromosomes: container of chromosome names that may be reported.
        rp_threshold: minimum read-pair support (column index 6) to keep a row.

    Returns:
        List of insertions, in file order, whose read-pair support is at
        least ``rp_threshold`` and whose chromosome is in ``chromosomes``.
    """
    # Orientation labels (column index 12) mapped to strand symbols; any
    # other label falls back to "-".
    strand_for_orientation = {"parallel": "+", "uncertain": "."}

    kept = []
    with open(telocate_out, "r") as handle:
        for line_number, record in enumerate(handle):
            if line_number <= 1:
                continue

            insert = output.Insertion(output.Telocate())
            fields = record.split("\t")
            insert.chromosome = fields[0]
            insert.start = int(fields[1])

            # The family is the second "/"-separated token of column index 3.
            family = fields[3].split("/")[1]
            insert.family = family

            if "old" in fields[15]:
                # Rows tagged "old" are reference calls; the end is the start
                # plus the value in column index 2.
                insert.type = "reference"
                insert.end = insert.start + int(fields[2])
                insert.name = family+"|reference|NA|"+sample_name+"|telocate|rp|"
            else:
                # Everything else is a non-reference call with end == start.
                insert.type = "non-reference"
                insert.end = insert.start
                insert.name = family+"|non-reference|NA|"+sample_name+"|telocate|rp|"

            insert.strand = strand_for_orientation.get(fields[12], "-")

            read_pairs = insert.support_info.support['read_pair_support']
            read_pairs.value = int(fields[6])

            if read_pairs.value >= rp_threshold and insert.chromosome in chromosomes:
                kept.append(insert)

    return kept
Esempio n. 2
0
def get_ref_tes(gff, taxon, chroms):
    """Build reference TE insertion records from a GFF and a TE-to-family table.

    Args:
        gff: path to a tab-separated GFF file. Lines containing "#" and blank
            lines are skipped. Column index 2 is used as the TE id.
        taxon: path to a tab-separated file whose first column is a TE id and
            whose second column is the family of that TE.
        chroms: container of chromosome names that may be reported.

    Returns:
        List of ``output.Insertion`` objects of type "reference", in file
        order, restricted to chromosomes found in ``chroms``.

    Raises:
        KeyError: if a GFF row names a TE id that is absent from ``taxon``.
    """
    te_family = {}
    with open(taxon, "r") as t:
        for line in t:
            # Strip the line terminator so that a family stored in the last
            # column does not keep a trailing newline.
            line = line.rstrip("\r\n")
            if not line.strip():
                continue  # tolerate blank lines instead of raising IndexError
            split_line = line.split("\t")
            te_family[split_line[0]] = split_line[1]

    ref_inserts = []
    with open(gff, "r") as g:
        for line in g:
            if "#" in line or not line.strip():
                continue
            split_line = line.rstrip("\r\n").split("\t")
            insert = output.Insertion(output.Popoolationte2())
            insert.type = "reference"
            insert.chromosome = split_line[0]
            insert.start = int(split_line[3])
            insert.end = int(split_line[4])
            insert.strand = split_line[6]
            insert.family = te_family[split_line[2]]
            if insert.chromosome in chroms:
                ref_inserts.append(insert)

    return ref_inserts
Esempio n. 3
0
def read_insertions(bed,
                    te_to_family,
                    sample_name,
                    te_pos_to_family,
                    chromosomes,
                    reference=False):
    """Parse a TEPID BED file into insertion records.

    Args:
        bed: path to the tab-separated BED file.
        te_to_family: mapping from TE name to family, used when ``reference``
            is true.
        sample_name: sample label embedded in each insertion name.
        te_pos_to_family: mapping from "<chrom>_<start>_<end>" of the source
            TE to family, used when ``reference`` is false.
        chromosomes: container of chromosome names that may be reported.
        reference: when true, rows are treated as reference TE calls;
            otherwise as non-reference calls.

    Returns:
        List of insertions, in file order, on the requested chromosomes.
    """
    kept = []
    with open(bed, "r") as handle:
        for record in handle:
            insert = output.Insertion(output.Tepid())
            fields = record.split("\t")
            insert.chromosome = fields[0]
            if insert.chromosome not in chromosomes:
                continue

            insert.start = int(fields[1])
            insert.end = int(fields[2])

            if not reference:
                # The family is looked up by the position of the source TE
                # given in column indexes 3-5.
                position_key = "_".join((fields[3], fields[4], fields[5]))
                insert.family = te_pos_to_family[position_key]
                insert.type = "non-reference"
                insert.name = insert.family + "|non-reference|NA|" + sample_name + "|tepid|sr|"
            else:
                # Only the first comma-separated TE name in column index 4 is
                # used for the family lookup.
                first_te = fields[4].split(",")[0]
                insert.family = te_to_family[first_te]
                insert.strand = fields[3]
                insert.type = "reference"
                insert.name = insert.family + "|reference|NA|" + sample_name + "|tepid|nonab|"

            # The last column, without its newline, is kept as the support id.
            insert.support_info.id = fields[-1].replace("\n", "")
            kept.append(insert)

    return kept
Esempio n. 4
0
def get_non_absent_ref_tes(te_gff, absence_bed, sample, out, log):
    """Return reference TEs from ``te_gff`` that do not overlap ``absence_bed``.

    Runs ``bedtools subtract -A`` with ``te_gff`` as ``-a`` and ``absence_bed``
    as ``-b``, writes the result to a temporary GFF inside ``out``, parses it
    into ``output.Insertion`` objects and then removes the temporary file.

    Args:
        te_gff: path to the reference TE annotation (GFF).
        absence_bed: path to the BED file of absence calls.
        sample: sample label embedded in each insertion name.
        out: directory in which the temporary GFF is created.
        log: log destination forwarded to ``mccutils.run_command_stdout``.

    Returns:
        List of insertions of type "reference", in file order.
    """
    tmp_gff = out + "/tmp.ref_nonabs.gff"
    mccutils.run_command_stdout(
        ["bedtools", "subtract", "-A", "-a", te_gff, "-b", absence_bed],
        tmp_gff,
        log=log)

    kept = []
    with open(tmp_gff, "r") as handle:
        for record in handle:
            if "#" in record:
                continue

            # Turning ";" into tabs makes every GFF attribute its own column.
            fields = record.replace(";", "\t").split("\t")
            insert = output.Insertion(output.Temp())
            insert.chromosome = fields[0]
            insert.start = int(fields[3])
            insert.end = int(fields[4])
            # The value of the second attribute (column index 9) labels the TE.
            te_label = fields[9].split("=")[1]
            insert.name = te_label + "|reference|NA|" + sample + "|temp|nonab|"
            insert.strand = fields[6]
            insert.type = "reference"

            kept.append(insert)

    mccutils.remove(tmp_gff)

    return kept
Esempio n. 5
0
def read_insertions(insert_bed, sample_name, chromosomes, config):
    """Parse a TEMP2 insertion BED file into insertion records.

    The first line is skipped and only rows with exactly 15 tab-separated
    fields are parsed.

    Args:
        insert_bed: path to the TEMP2 insertion table.
        sample_name: sample label embedded in each insertion name.
        chromosomes: container of chromosome names that may be reported.
        config: object whose ``PARAMS`` mapping provides
            "frequency_threshold" and "acceptable_insertion_support_classes".

    Returns:
        List of non-reference insertions, in file order, that are on a
        requested chromosome, reach the frequency threshold and belong to an
        accepted support class.
    """
    kept = []
    with open(insert_bed, "r") as handle:
        for line_number, record in enumerate(handle):
            if line_number == 0:
                continue

            insert = output.Insertion(output.Temp2())
            fields = record.split("\t")
            if len(fields) != 15:
                continue

            support = insert.support_info.support

            insert.chromosome = fields[0]
            # presumably converts a 0-based BED start to 1-based -- TODO confirm
            insert.start = int(fields[1]) + 1
            insert.end = int(fields[2])
            insert.family = fields[3].split(":")[0]
            insert.type = "non-reference"
            support["frequency"].value = float(fields[4])
            insert.strand = fields[5]
            support["class"].value = fields[6]

            for key, column in (("supportreads", 7),
                                ("referencereads", 8),
                                ("fiveprimesupport", 9),
                                ("threeprimesupport", 10)):
                support[key].value = float(fields[column])

            # rare entries have a % sign for some reason
            support["reliability"].value = float(fields[12].replace("%", ""))

            for key, column in (("fiveprimejunctionsupport", 13),
                                ("threeprimejunctionsupport", 14)):
                support[key].value = float(fields[column])

            insert.name = insert.family + "|non-reference|" + str(
                support['frequency'].value) + "|" + sample_name + "|temp2|"

            # Junction support on both ends is labelled "sr", otherwise "rp".
            has_both_junctions = (
                support["fiveprimejunctionsupport"].value > 0
                and support["threeprimejunctionsupport"].value > 0)
            insert.name += "sr|" if has_both_junctions else "rp|"

            if insert.chromosome not in chromosomes:
                continue
            if support["frequency"].value < config.PARAMS["frequency_threshold"]:
                continue
            if support["class"].value in config.PARAMS[
                    "acceptable_insertion_support_classes"]:
                kept.append(insert)

    return kept
Esempio n. 6
0
def read_insertions(tebreak_out, sample_name, chromosomes, config):
    """Parse a TEBreak result table into insertion records.

    The first line is read as a header that maps column titles to positions;
    every later line is parsed using those titles.

    Args:
        tebreak_out: path to the tab-separated TEBreak table.
        sample_name: sample label embedded in each insertion name.
        chromosomes: container of chromosome names that may be reported.
        config: object exposing the ``MIN_*`` attributes used as minimum
            values for the support columns.

    Returns:
        List of non-reference insertions, in file order, that are on a
        requested chromosome and reach every configured minimum.
    """
    # (support key, column title in the table, config attribute with its minimum)
    support_columns = (
        ("five_p_elt_match", '5p_Elt_Match', "MIN_5P_ELT_MATCH"),
        ("three_p_elt_match", '3p_Elt_Match', "MIN_3P_ELT_MATCH"),
        ("five_p_genome_match", '5p_Genome_Match', "MIN_5P_GENOME_MATCH"),
        ("three_p_genome_match", '3p_Genome_Match', "MIN_3P_GENOME_MATCH"),
        ("split_reads_5prime", 'Split_reads_5prime', "MIN_SPLIT_READS_5P"),
        ("split_reads_3prime", 'Split_reads_3prime', "MIN_SPLIT_READS_3P"),
        ("remapped_discordant", 'Remapped_Discordant', "MIN_REMAPPED_DISCORDANT"),
        ("remap_disc_fraction", 'Remap_Disc_Fraction', "MIN_REMAP_DISC_FRACTION"),
        ("remapped_splitreads", 'Remapped_Splitreads', "MIN_REMAPPED_SPLITREADS"),
        ("remap_split_fraction", 'Remap_Split_Fraction', "MIN_REMAP_SPLIT_FRACTION"),
    )

    kept = []
    column_of = {}
    with open(tebreak_out, "r") as handle:
        for line_number, raw in enumerate(handle):
            fields = raw.replace("\n", "").split("\t")
            if line_number == 0:
                column_of.update(
                    (title, position) for position, title in enumerate(fields))
                continue

            insert = output.Insertion(output.Tebreak())
            insert.chromosome = fields[column_of['Chromosome']]
            insert.start = int(fields[column_of['3_Prime_End']]) + 1
            insert.end = int(fields[column_of['5_Prime_End']])
            insert.family = fields[column_of['Superfamily']]
            insert.type = "non-reference"

            # The strand is only reported when both ends agree.
            orient_5p = fields[column_of['Orient_5p']]
            orient_3p = fields[column_of['Orient_3p']]
            insert.strand = orient_5p if orient_5p == orient_3p else "."

            # Start and end are exchanged for minus-strand calls.
            if insert.strand == "-":
                insert.start, insert.end = insert.end, insert.start

            support = insert.support_info.support
            for key, title, _ in support_columns:
                support[key].value = float(fields[column_of[title]])

            insert.name = insert.family+"|non-reference|NA|"+sample_name+"|tebreak|sr|"

            if insert.chromosome in chromosomes and all(
                    support[key].value >= getattr(config, minimum)
                    for key, _, minimum in support_columns):
                kept.append(insert)

    return kept
Esempio n. 7
0
def read_insertions(predictions,
                    ref_tes,
                    chroms,
                    sample,
                    both_end_support_needed=True,
                    support_threshold=0.1):
    """Parse PoPoolationTE2 predictions into insertion records.

    A prediction is kept when its frequency is above ``support_threshold``
    and, if ``both_end_support_needed`` is true, its flank support is "FR".
    A kept prediction whose position falls inside a reference TE on the same
    chromosome is relabelled as that reference TE (family, coordinates and
    strand are taken from it). Each reference TE is reported at most once:
    the first prediction that hits it sets ``support_info.added`` on the
    reference TE, and later hits are dropped.

    Args:
        predictions: path to the tab-separated prediction file.
        ref_tes: list of reference TE insertions; their
            ``support_info.added`` flags are mutated by this function.
        chroms: chromosome names of interest.
            NOTE(review): this argument is accepted but not used to filter
            predictions here -- confirm whether filtering is intended.
        sample: sample label embedded in each insertion name.
        both_end_support_needed: require flank support "FR".
        support_threshold: frequency a prediction must exceed (strictly).

    Returns:
        List of reference and non-reference insertions, in file order.
    """
    insertions = []

    with open(predictions, "r") as tsv:
        for line in tsv:
            if not line.strip():
                continue  # tolerate blank lines instead of raising IndexError

            split_line = line.split("\t")
            insert = output.Insertion(output.Popoolationte2())
            insert.chromosome = split_line[1]
            insert.start = int(split_line[2])
            insert.end = int(split_line[2])
            insert.strand = split_line[3]
            insert.family = split_line[4]
            support = insert.support_info.support
            support['flanks_supported'].value = split_line[6]
            support['frequency'].value = float(split_line[8])

            flanks_ok = (support['flanks_supported'].value == "FR"
                         or not both_end_support_needed)
            if not (flanks_ok
                    and support['frequency'].value > support_threshold):
                continue

            # Determine whether the prediction is a reference insertion.
            # Coordinates only match within a single chromosome, so the
            # chromosome must agree before positions are compared.
            for ref_te in ref_tes:
                if (ref_te.chromosome == insert.chromosome
                        and ref_te.start <= insert.start <= ref_te.end):
                    insert.family = ref_te.family
                    insert.support_info.added = ref_te.support_info.added
                    if not ref_te.support_info.added:
                        ref_te.support_info.added = True

                    insert.type = "reference"
                    insert.start = ref_te.start
                    insert.end = ref_te.end
                    insert.strand = ref_te.strand

            if insert.type == "reference":
                insert.name = insert.family + "|reference|" + str(
                    support['frequency'].value
                ) + "|" + sample + "|popoolationte2|rp|"
            else:
                insert.type = "non-reference"
                insert.name = insert.family + "|non-reference|" + str(
                    support['frequency'].value
                ) + "|" + sample + "|popoolationte2|rp|"

            # Skip predictions that hit an already-reported reference TE.
            if not insert.support_info.added:
                insertions.append(insert)

    return insertions
Esempio n. 8
0
def read_insertions(ref_bed, nonref_bed, chromosomes, sample_name, out_dir):
    """Parse ngs_te_mapper2 reference and non-reference BED files.

    Args:
        ref_bed: path to the BED file of reference TE calls.
        nonref_bed: path to the BED file of non-reference TE calls; its
            column index 3 is a "|"-separated record holding the family,
            frequency and read counts.
        chromosomes: container of chromosome names that may be reported.
        sample_name: sample label embedded in each insertion name.
        out_dir: not used by this function.

    Returns:
        List with the reference insertions first, followed by the
        non-reference insertions, each group in file order.
    """
    kept = []

    with open(ref_bed, "r") as handle:
        for record in handle:
            fields = record.rstrip("\n").split("\t")
            insert = output.Insertion(output.Ngs_te_mapper2())
            insert.chromosome = fields[0]
            # presumably converts a 0-based BED start to 1-based -- TODO confirm
            insert.start = int(fields[1]) + 1
            insert.end = int(fields[2])
            insert.type = "reference"
            insert.strand = fields[5]
            insert.family = fields[3]
            insert.name = insert.family + "|" + insert.type + "|NA|" + sample_name + "|ngs_te_mapper2|sr|"
            if insert.chromosome in chromosomes:
                kept.append(insert)

    with open(nonref_bed, "r") as handle:
        for record in handle:
            fields = record.rstrip("\n").split("\t")
            insert = output.Insertion(output.Ngs_te_mapper2())
            insert.chromosome = fields[0]
            insert.start = int(fields[1]) + 1
            insert.end = int(fields[2])
            insert.type = "non-reference"
            insert.strand = fields[5]

            # Column index 3 packs several "|"-separated values.
            tags = fields[3].split("|")
            support = insert.support_info.support
            insert.family = tags[0]
            support['frequency'].value = float(tags[2])
            support['three_prime_support'].value = int(tags[3])
            support['five_prime_support'].value = int(tags[4])
            support['reference_reads'].value = int(tags[5])

            insert.name = insert.family + "|" + insert.type + "|" + str(
                support['frequency'].value
            ) + "|" + sample_name + "|ngs_te_mapper2|sr|"
            if insert.chromosome in chromosomes:
                kept.append(insert)

    return kept
Esempio n. 9
0
def get_insertions(gff, sample_name, chromosomes, ref_l_threshold=0, ref_r_threshold=0, nonref_l_threshold=0, nonref_r_threshold=0):
    """Parse a RelocaTE GFF file into insertion records.

    Lines containing "#" and blank lines are skipped. The insertion type is
    taken from the ``Note=`` attribute: "Shared" means reference,
    "Non-reference" means non-reference, and any other note is ignored.

    Args:
        gff: path to the tab-separated GFF file.
        sample_name: sample label embedded in each insertion name.
        chromosomes: container of chromosome names that may be reported.
        ref_l_threshold: minimum left flanking reads for reference calls.
        ref_r_threshold: minimum right flanking reads for reference calls.
        nonref_l_threshold: minimum left flanking reads for non-reference calls.
        nonref_r_threshold: minimum right flanking reads for non-reference calls.

    Returns:
        List of reference and non-reference insertions, in file order, that
        reach the thresholds for their type and are on a requested chromosome.
    """
    insertions = []
    with open(gff, "r") as ingff:
        for line in ingff:
            # Everything below runs only for data lines. Filtering outside
            # this guard would touch an undefined (or stale) record whenever
            # a comment line is read.
            if "#" in line or not line.strip():
                continue

            # Drop the line terminator so the last attribute value is clean.
            split_line = line.rstrip("\r\n").split("\t")
            feats = split_line[8].split(";")
            insert = output.Insertion(output.Relocate())
            insert.chromosome = split_line[0]
            insert.start = int(split_line[3])
            insert.end = int(split_line[4])
            insert.strand = split_line[6]
            support = insert.support_info.support

            feat_id = ""
            feat_te_name = ""
            for feat in feats:
                if "ID=" in feat:
                    feat_id = feat.split("=")[1]
                elif "TE_Name=" in feat:
                    feat_te_name = feat.split("=")[1]
                elif "Note=" in feat:
                    if "Shared" in feat:
                        insert.type = "reference"
                    elif "Non-reference" in feat:
                        insert.type = "non-reference"
                    else:
                        insert.type = "missing"
                elif "left_flanking_read_count=" in feat:
                    support['left_flanking_reads'].value = int(feat.split("=")[1])
                elif "right_flanking_read_count=" in feat:
                    support['right_flanking_reads'].value = int(feat.split("=")[1])

            if insert.type == "reference":
                insert.family = feat_te_name
                insert.name = feat_te_name+"|reference|NA|"+sample_name+"|relocate|sr|"
                l_threshold, r_threshold = ref_l_threshold, ref_r_threshold
            elif insert.type == "non-reference":
                # For non-reference calls the family is the part of the ID
                # before the first ".".
                feat_te_name = feat_id.split(".")[0]
                insert.family = feat_te_name
                insert.name = feat_te_name+"|non-reference|NA|"+sample_name+"|relocate|sr|"
                l_threshold, r_threshold = nonref_l_threshold, nonref_r_threshold
            else:
                continue

            if (support['left_flanking_reads'].value >= l_threshold
                    and support['right_flanking_reads'].value >= r_threshold
                    and insert.chromosome in chromosomes):
                insertions.append(insert)

    return insertions
Esempio n. 10
0
def read_insertions(retroseq_vcf,
                    sample_name,
                    chromosomes,
                    support_threshold=0,
                    breakpoint_threshold=6):
    """Parse a RetroSeq VCF file into insertion records.

    Lines containing "#" are skipped. Family and coordinates come from the
    ``MEINFO`` entry of the INFO column; support values come from the
    FORMAT column and the sample column that follows it.

    Args:
        retroseq_vcf: path to the tab-separated VCF file.
        sample_name: sample label embedded in each insertion name.
        chromosomes: container of chromosome names that may be reported.
        support_threshold: minimum value of the ``SP`` field.
        breakpoint_threshold: minimum value of the ``FL`` field.

    Returns:
        List of non-reference insertions, in file order, that reach both
        thresholds and are on a requested chromosome.
    """
    kept = []

    with open(retroseq_vcf, "r") as handle:
        for record in handle:
            if "#" in record:
                continue

            insert = output.Insertion(output.Retroseq())
            fields = record.replace("\n", "").split("\t")
            insert.chromosome = fields[0]

            # INFO column: keep the "key=value" entries, drop bare flags.
            info = {}
            for entry in fields[7].split(";"):
                if "=" in entry:
                    pieces = entry.split("=")
                    info[pieces[0]] = pieces[1]

            meinfo = info['MEINFO'].split(",")
            insert.family = meinfo[0].split("-")[0]
            insert.start = int(meinfo[1])
            insert.end = int(meinfo[2])

            # Pair each FORMAT key with the sample value at the same position.
            sample_values = fields[9].split(":")
            form = {
                key: sample_values[position]
                for position, key in enumerate(fields[8].split(":"))
            }

            support = insert.support_info.support
            support['read_pair_support'].value = int(form['SP'])
            support['clip3'].value = int(form['CLIP3'])
            support['clip5'].value = int(form['CLIP5'])
            support['call_status'].value = int(form['FL'])
            insert.type = "non-reference"
            insert.name = insert.family + "|non-reference|NA|" + sample_name + "|retroseq|rp|"

            if support['read_pair_support'].value < support_threshold:
                continue
            if support['call_status'].value < breakpoint_threshold:
                continue
            if insert.chromosome in chromosomes:
                kept.append(insert)

    return kept
Esempio n. 11
0
def get_non_absent_ref_tes(deletions, te_gff, te_to_family, sample_name):
    """Return reference TEs from ``te_gff`` that are not among ``deletions``.

    A reference TE is treated as absent when a deletion has the same
    chromosome, start, end, strand and family.

    Args:
        deletions: iterable of records with ``chromosome``, ``start``,
            ``end``, ``strand`` and ``family`` attributes.
        te_gff: path to the tab-separated reference TE GFF file. Comment
            lines (starting with "#") and blank lines are skipped.
        te_to_family: mapping from TE id (the ``ID=`` attribute) to family.
        sample_name: sample label embedded in each insertion name.

    Returns:
        List of ``output.Insertion`` objects of type "reference", in file
        order, for every TE that does not match a deletion.

    Raises:
        KeyError: if a TE id is missing from ``te_to_family``.
    """

    def _location_key(record):
        # Identity used to match a reference TE against a deletion call.
        return "_".join([
            record.chromosome,
            str(record.start),
            str(record.end), record.strand, record.family
        ])

    ref_tes = []
    with open(te_gff, "r") as gff:
        for line in gff:
            if line.startswith("#") or not line.strip():
                continue

            # Strip the line terminator so an ID stored as the last
            # attribute does not carry a trailing newline into the lookup.
            split_line = line.rstrip("\r\n").split("\t")
            ref_te = output.Insertion(output.Tepid())
            ref_te.chromosome = split_line[0]
            ref_te.start = int(split_line[3])
            ref_te.end = int(split_line[4])
            ref_te.strand = split_line[6]

            te_id = ""
            for f in split_line[8].split(";"):
                if "ID=" in f:
                    te_id = f.split("=")[1]

            ref_te.family = te_to_family[te_id]
            ref_te.type = "reference"
            ref_te.name = ref_te.family + "|reference|NA|" + sample_name + "|tepid|nonab|"
            ref_tes.append(ref_te)

    # A set gives constant-time membership tests instead of scanning a list
    # once per reference TE.
    absent = {_location_key(deletion) for deletion in deletions}

    return [te for te in ref_tes if _location_key(te) not in absent]
Esempio n. 12
0
def read_insertions(bed, chromosomes, sample_name, out_dir, min_read_cutoff=0):
    """Parse an ngs_te_mapper BED file into insertion records.

    Each line is split on both tabs and ";" before the fields are read.

    Args:
        bed: path to the ngs_te_mapper BED file.
        chromosomes: container of chromosome names that may be reported.
        sample_name: sample label embedded in each insertion name.
        out_dir: not used by this function.
        min_read_cutoff: supporting-read count a call must exceed (strictly).

    Returns:
        List of insertions, in file order, with more than
        ``min_read_cutoff`` supporting reads on a requested chromosome.
    """
    kept = []
    with open(bed, "r") as handle:
        for record in handle:
            insert = output.Insertion(output.Ngs_te_mapper())
            # Treat ";" as another column separator.
            fields = record.replace(";", "\t").split("\t")
            insert.chromosome = fields[0]
            # presumably converts a 0-based BED start to 1-based -- TODO confirm
            insert.start = int(fields[1]) + 1
            insert.end = int(fields[2])
            insert.type = fields[8].replace("\n", "")
            insert.strand = fields[4]
            insert.family = fields[5]
            insert.name = insert.family + "|" + insert.type + "|NA|" + sample_name + "|ngs_te_mapper|sr|"

            supporting = insert.support_info.support['supportingreads']
            supporting.value = int(fields[7])

            if supporting.value <= min_read_cutoff:
                continue
            if insert.chromosome in chromosomes:
                kept.append(insert)

    return kept
Esempio n. 13
0
def get_insertions(gff,
                   sample_name,
                   chromosomes,
                   l_support_threshold=0,
                   r_support_threshold=0,
                   l_junction_threshold=0,
                   r_junction_threshold=0,
                   insert_type="ref"):
    """Parse a RelocaTE2 GFF file into insertion records.

    Lines containing "#" are skipped. Each remaining line is split on both
    tabs and ";" so that every GFF attribute becomes its own column.

    Args:
        gff: path to the RelocaTE2 GFF file.
        sample_name: sample label embedded in non-reference insertion names.
        chromosomes: container of chromosome names that may be reported.
        l_support_threshold: minimum left support reads.
        r_support_threshold: minimum right support reads.
        l_junction_threshold: minimum left junction reads.
        r_junction_threshold: minimum right junction reads.
        insert_type: "ref" to read the file as reference calls; any other
            value reads it as non-reference calls.

    Returns:
        List of insertions, in file order, that reach all four thresholds,
        are on a requested chromosome and whose TE name is not
        "repeat_name".
    """
    # Support keys in the order their columns appear, each paired below with
    # the threshold it has to reach.
    support_keys = ('right_junction_reads', 'left_junction_reads',
                    'right_support_reads', 'left_support_reads')
    minimums = (r_junction_threshold, l_junction_threshold,
                r_support_threshold, l_support_threshold)

    kept = []
    with open(gff, "r") as handle:
        for record in handle:
            if "#" in record:
                continue

            fields = record.replace(";", "\t").split("\t")
            insert = output.Insertion(output.Relocate2())
            insert.chromosome = fields[0]
            insert.start = int(fields[3])
            insert.end = int(fields[4])
            insert.strand = fields[6]
            insert.type = insert_type

            insert.name = fields[8].split("=")[1]

            te_name = ""
            if insert_type == "ref":
                # Reference rows: counts start at column index 11 and are
                # written as "label:count".
                insert.type = "reference"
                first_column, separator = 11, ":"
            else:
                # Non-reference rows: counts start at column index 12 and
                # are written as "label=count".
                insert.type = "non-reference"
                te_name = fields[9].split("=")[1].split("/")[0]
                insert.family = te_name
                insert.name = te_name + "|non-reference|NA|" + sample_name + "|relocate2|sr|"
                first_column, separator = 12, "="

            support = insert.support_info.support
            for offset, key in enumerate(support_keys):
                support[key].value = int(
                    fields[first_column + offset].split(separator)[1])

            reaches_minimums = all(
                support[key].value >= minimum
                for key, minimum in zip(support_keys, minimums))
            if (reaches_minimums and insert.chromosome in chromosomes
                    and te_name != "repeat_name"):
                kept.append(insert)

    return kept
Esempio n. 14
0
def read_insertions(predictions,
                    chroms,
                    sample,
                    ref_tes,
                    min_presence=3,
                    max_absence=None,
                    min_presence_fraction=0.1,
                    require_tsd=False,
                    require_both_breakpoints=False):
    """Parse TEFLoN genotype predictions into insertion records.

    Only rows on a chromosome in ``chroms`` are parsed. Rows that name a
    reference TE take their chromosome and coordinates from ``ref_tes``.

    Args:
        predictions: path to the tab-separated TEFLoN prediction file.
        chroms: container of chromosome names that may be reported.
        sample: sample label embedded in each insertion name.
        ref_tes: mapping from reference TE id to a sequence whose first
            three items are chromosome, start and end.
        min_presence: minimum number of presence reads.
        max_absence: maximum number of absence reads, or None for no limit.
        min_presence_fraction: minimum frequency.
        require_tsd: keep only calls flagged as having a TSD. The flag is
            set when the 5' breakpoint is not smaller than the 3' breakpoint
            and for every reference call.
        require_both_breakpoints: keep only calls with both breakpoints
            estimated (always true for reference calls).

    Returns:
        List of reference and non-reference insertions, in file order, that
        pass all filters.

    Raises:
        SystemExit: if a row names reference TEs and none is in ``ref_tes``.
    """
    insertions = []

    with open(predictions, "r") as tsv:
        for line in tsv:
            split_line = line.split("\t")
            insert = output.Insertion(output.Teflon())

            insert.chromosome = split_line[0]
            if insert.chromosome not in chroms:
                continue

            # A breakpoint without an estimate is written as "-"; when only
            # one is available it is used for both ends.
            five_prime_bp, three_prime_bp = split_line[1], split_line[2]
            both_ends = five_prime_bp != "-" and three_prime_bp != "-"
            if both_ends:
                left, right = int(five_prime_bp), int(three_prime_bp)
            elif five_prime_bp == "-":
                left = right = int(three_prime_bp)
            else:
                left = right = int(five_prime_bp)

            tsd = left >= right
            if left > right:
                left, right = right, left
            elif left == right:
                right += 1

            insert.start = left - 1
            insert.end = right

            insert.family = split_line[3]

            insert.strand = split_line[5]

            # if reference prediction, uses ref TE coordinates
            if split_line[6] != "-":
                tsd = True
                both_ends = True
                insert.type = "reference"
                # Use the first listed TE id that is a known reference TE.
                te_name = next(
                    (name for name in split_line[6].split(",")
                     if name and name in ref_tes), "")

                if te_name == "":
                    sys.exit("TEFLON ERROR: can't find:" + split_line[6] +
                             " in reference TEs...\n")
                insert.chromosome = ref_tes[te_name][0]
                insert.start = ref_tes[te_name][1]
                insert.end = ref_tes[te_name][2]

            else:
                insert.type = "non-reference"

            support = insert.support_info.support
            # 5' and 3' soft-clip support are reported in consecutive
            # columns (indexes 7 and 8).
            support['five_prime_supported'].value = split_line[7]
            support['three_prime_supported'].value = split_line[8]

            support['presence_reads'].value = int(split_line[9])
            support['absence_reads'].value = int(split_line[10])
            support['ambiguous_reads'].value = int(split_line[11])
            support['frequency'].value = float(split_line[12])

            insert.name = insert.family + "|" + insert.type + "|" + str(
                support['frequency'].value
            ) + "|" + sample + "|teflon|rp|"

            if ((support['presence_reads'].value >= min_presence)
                    and (max_absence is None
                         or support['absence_reads'].value <= max_absence)
                    and (support['frequency'].value >= min_presence_fraction)
                    and (tsd or not require_tsd)
                    and (both_ends or not require_both_breakpoints)):
                insertions.append(insert)

    return insertions
Esempio n. 15
0
def read_insertions(popoolationte,
                    sample_name,
                    chromosomes,
                    require_both_end_support=True,
                    percent_read_support_threshold=0.1):
    """Parse a tab-separated PoPoolationTE result file into insertion records.

    Every line of the file is split on tabs and turned into an
    ``output.Insertion``. Columns are read by fixed index: 0 chromosome,
    1 reference position, 2 supported flanks, 3 family, 4 frequency,
    6 reference TE id, 8-13 forward-side values, 15-20 reverse-side values.

    Args:
        popoolationte: path of the file to read.
        sample_name: sample label embedded in each insertion name.
        chromosomes: container of chromosome names; insertions on other
            chromosomes are dropped.
        require_both_end_support: when truthy, only records whose flank
            field contains "FR" can be kept.
        percent_read_support_threshold: minimum frequency a record must
            reach (which frequency is tested depends on the flank field).

    Returns:
        List of the insertion records that passed the filters.
    """
    # (support key, column index, parse as float), listed in the order read.
    strand_columns = (
        ("forward_insert_start", 8, False),
        ("forward_insert_end", 9, False),
        ("forward_insert_freq", 10, True),
        ("forward_insert_cov", 11, False),
        ("forward_presence_reads", 12, False),
        ("forward_absence_reads", 13, False),
        ("reverse_insert_start", 15, False),
        ("reverse_insert_end", 16, False),
        ("reverse_insert_freq", 17, True),
        ("reverse_insert_cov", 18, False),
        ("reverse_presence_reads", 19, False),
        ("reverse_absence_reads", 20, False),
    )

    kept = []

    with open(popoolationte, "r") as tsv:
        for row in tsv:
            insert = output.Insertion(output.Popoolationte())
            support = insert.support_info.support
            fields = row.split("\t")

            insert.chromosome = fields[0]
            ref_pos = to_number(fields[1])
            support["flanks_supported"].value = fields[2]
            insert.family = fields[3]
            support["frequency"].value = to_number(fields[4], to_float=True)
            ref_te_id = fields[6]
            for key, column, as_float in strand_columns:
                if as_float:
                    support[key].value = to_number(fields[column],
                                                   to_float=True)
                else:
                    support[key].value = to_number(fields[column])

            # A side whose start is 0 falls back to the reference position
            # for that end of the interval.
            fwd_start = support["forward_insert_start"].value
            rev_start = support["reverse_insert_start"].value
            if fwd_start == 0:
                insert.start = ref_pos
                insert.end = support["reverse_insert_start"].value
            elif rev_start == 0:
                insert.start = support["forward_insert_end"].value
                insert.end = ref_pos
            else:
                insert.start = support["forward_insert_end"].value
                insert.end = support["reverse_insert_start"].value

            # A "-" in the reference TE id column marks a non-reference call.
            if "-" == ref_te_id:
                insert.type = "non-reference"
                type_tag = "|non-reference|"
            else:
                insert.type = "reference"
                type_tag = "|reference|"
            insert.name = insert.family + type_tag + str(
                support["frequency"].value
            ) + "|" + sample_name + "|popoolationte|rp|"

            flanks = support["flanks_supported"].value
            if require_both_end_support:
                # Both flanks are required; either side's frequency may
                # satisfy the threshold.
                keep = ("FR" in flanks
                        and (support["forward_insert_freq"].value
                             >= percent_read_support_threshold
                             or support["reverse_insert_freq"].value
                             >= percent_read_support_threshold)
                        and insert.chromosome in chromosomes)
            else:
                # Tried in order: overall frequency for "FR" records,
                # forward frequency for records with an "F" flank, then
                # reverse frequency for anything else.
                keep = (("FR" in flanks
                         and support["frequency"].value
                         >= percent_read_support_threshold
                         and insert.chromosome in chromosomes)
                        or ("F" in flanks
                            and support["forward_insert_freq"].value
                            >= percent_read_support_threshold
                            and insert.chromosome in chromosomes)
                        or (support["reverse_insert_freq"].value
                            >= percent_read_support_threshold
                            and insert.chromosome in chromosomes))

            if keep:
                kept.append(insert)

    return kept
Esempio n. 16
0
def read_insertions(jitterbug_gff,
                    taxonomy,
                    chroms,
                    sample_name,
                    min_fwd_read_support=0,
                    min_rev_read_support=0,
                    min_sr_support=0,
                    min_zygosity=0.0):
    """Parse a Jitterbug GFF file into a filtered list of insertions.

    Only lines with exactly nine tab-separated columns whose first column
    is in ``chroms`` are considered. Coordinates come from columns 4 and 5
    unless the attribute column carries a ``softclipped_pos=(a, b)`` entry
    with non-negative values, in which case ``a - 1`` and ``b`` are used
    and the call is labelled as split-read ("sr") supported; otherwise it
    is labelled read-pair ("rp").

    Args:
        jitterbug_gff: path of the Jitterbug GFF file.
        taxonomy: path of a two-column, tab-separated file mapping a TE
            identifier (column 1) to its family (column 2).
        chroms: container of chromosome names to keep.
        sample_name: sample label embedded in each insertion name.
        min_fwd_read_support: minimum ``supporting_fwd_reads`` value.
        min_rev_read_support: minimum ``supporting_rev_reads`` value.
        min_sr_support: minimum ``softclipped_support`` value.
        min_zygosity: minimum ``zygosity`` value.

    Returns:
        List of ``output.Insertion`` objects that passed every threshold.

    Raises:
        KeyError: if a ``predicted_superfam`` value is missing from the
            taxonomy file.
    """
    insertions = []

    # TE identifier -> family lookup built from the taxonomy file.
    te_family = {}
    with open(taxonomy, "r") as tsv:
        for line in tsv:
            split_line = line.replace("\n", "").split("\t")
            te_family[split_line[0]] = split_line[1]

    with open(jitterbug_gff, "r") as gff:
        for line in gff:
            split_line = line.replace("\n", "").split("\t")
            if len(split_line) != 9:
                continue

            insert = output.Insertion(output.Jitterbug())
            insert.chromosome = split_line[0]
            if insert.chromosome not in chroms:
                continue

            support = insert.support_info.support
            insert.start = int(split_line[3])
            insert.end = int(split_line[4])
            insert.type = "non-reference"

            sr = False
            family = "NONE"
            # Column 9 holds ";"-separated key=value attributes; spaces are
            # stripped before splitting.
            for feat in split_line[8].replace(" ", "").split(";"):
                if "softclipped_pos" in feat:
                    pos = feat.split("=")[1]
                    pos = pos.replace("(", "").replace(")", "").split(",")
                    start = int(pos[0]) - 1
                    end = int(pos[1])

                    # Negative values leave the column 4/5 coordinates in
                    # place and the call stays read-pair only.
                    if start > -1 and end > -1:
                        insert.start = start
                        insert.end = end
                        sr = True

                if "predicted_superfam" in feat:
                    family = te_family[feat.split("=")[1]]
                    insert.family = family

                if "supporting_fwd_reads" in feat:
                    support['supporting_fwd_reads'].value = int(
                        feat.split("=")[1])

                if "supporting_rev_reads" in feat:
                    support['supporting_rev_reads'].value = int(
                        feat.split("=")[1])

                if "softclipped_support" in feat:
                    support['softclipped_support'].value = int(
                        feat.split("=")[1])

                if "zygosity" in feat:
                    support['zygosity'].value = float(feat.split("=")[1])

            insert.name = family + "|non-reference|" + str(
                support['zygosity'].value
            ) + "|" + sample_name + "|jitterbug|"
            # Append the evidence suffix in both cases. The read-pair branch
            # previously assigned "rp|" and discarded the rest of the name.
            if sr:
                insert.name += "sr|"
            else:
                insert.name += "rp|"

            if ((support['supporting_fwd_reads'].value >= min_fwd_read_support)
                    and (support['supporting_rev_reads'].value
                         >= min_rev_read_support)
                    and (support['softclipped_support'].value
                         >= min_sr_support)
                    and (support['zygosity'].value >= min_zygosity)):
                insertions.append(insert)

    return insertions
Esempio n. 17
0
def read_insertion_summary(infile, sample):
    """Parse a TEMP insertion summary table into insertion records.

    The first line of the file is skipped. Every other line must split
    into exactly 14 tab-separated fields and describe a non-negative,
    non-inverted interval; lines that do not are reported on stdout and
    dropped.

    Args:
        infile: path of the summary file to read.
        sample: sample label embedded in each insertion name.

    Returns:
        List of ``output.Insertion`` objects, all typed "non-reference".
    """
    malformed_msg = "<TEMP POST> Omitting malformed line from insertion summary results:"
    insertions = []

    with open(infile, "r") as inf:
        for line_no, line in enumerate(inf):
            if line_no == 0:
                continue

            insert = output.Insertion(output.Temp())
            fields = line.split("\t")
            if len(fields) != 14:
                print(malformed_msg, line)
                continue

            insert.chromosome = fields[0]
            insert.start = int(fields[1]) - 1
            insert.end = int(fields[2])
            insert.family = fields[3]
            insert.name = (insert.family + "|non-reference|" + fields[7]
                           + "|" + sample + "|temp|")
            insert.strand = "-" if "antisense" in fields[4] else "+"

            support = insert.support_info.support
            support['class'].value = fields[5]
            # Some count columns are passed through float() before int().
            support['variantsupport'].value = int(float(fields[6]))
            support['frequency'].value = float(fields[7])
            support['junction1'].value = int(fields[8])
            support['junction1support'].value = int(fields[9])
            support['junction2'].value = int(fields[10])
            support['junction2support'].value = int(fields[11])
            support['fiveprimesupport'].value = int(float(fields[12]))
            support['threeprimesupport'].value = int(
                float(fields[13].replace("\n", "")))
            insert.type = "non-reference"

            coords_ok = (insert.end >= insert.start and insert.end > 0
                         and insert.start > -1)
            if not coords_ok:
                print(malformed_msg, line)
                continue

            if (support['junction1support'].value > 0
                    and support['junction2support'].value > 0):
                # Both junctions have support: use the junction positions
                # as the interval and label the call split-read.
                insert.start = support['junction1'].value
                insert.end = support['junction2'].value
                insert.name += "sr|"
            else:
                # Otherwise keep the summary interval and label it read-pair.
                insert.name += "rp|"

            insertions.append(insert)

    return insertions