def upstream_and_downstream_seq(args):
    """Write the 1 kb regions flanking ``args.coords`` to two FASTA files.

    The region from ``start - 1000`` to ``start`` is written, uppercased, to
    ``<args.directory>tmp/downstream.fa``. The region from ``end`` to
    ``end + 1000`` is uppercased, reverse-complemented and written to
    ``<args.directory>tmp/upstream.fa``.

    Args:
        args: Object providing ``coords`` (parsed with ``split_coords``),
            ``genome`` (reference passed to ``pysam.faidx``) and
            ``directory`` (output prefix; ``"tmp/"`` is appended to it by
            plain string concatenation, so it is expected to end with a
            path separator).

    Returns:
        None. The two FASTA files are the only output.
    """
    # Strip stray double quotes once so chromosome, start and end are all
    # parsed from the same cleaned string (previously only `end` was).
    coords = split_coords(args.coords.replace('"', ""))
    chromosome = coords[0]
    start = int(coords[1])
    end = int(coords[2])

    # faidx regions are 1-based; clamp so a circRNA closer than 1 kb to the
    # beginning of the chromosome does not produce a negative coordinate.
    flank_start = max(1, start - 1000)
    flank_end = end + 1000

    # Using the samtools faidx function to take the appropriate sequence
    # from the reference genome.
    downstream_region = "{}:{}-{}".format(chromosome, flank_start, start)
    upstream_region = "{}:{}-{}".format(chromosome, end, flank_end)
    downstream_fa = Seq(pysam.faidx(args.genome, downstream_region), generic_dna)
    upstream_fa = Seq(pysam.faidx(args.genome, upstream_region), generic_dna)

    # Skip the characters of the first line of the faidx output (presumably
    # the FASTA header) and the final character, then uppercase.
    downstream_header_len = len(downstream_fa.split('\n')[0])
    downstream_seq = downstream_fa[downstream_header_len:-1].upper()

    # Same trimming for the other flank, followed by reverse complement.
    upstream_header_len = len(upstream_fa.split('\n')[0])
    reverse_compliment_upstream_seq = (
        upstream_fa[upstream_header_len:-1].upper().reverse_complement()
    )

    # Making sequence records with ID header and sequence
    downstream_record = SeqRecord(downstream_seq, id="downstream_sequence")
    upstream_record = SeqRecord(reverse_compliment_upstream_seq, id="upstream_sequence")

    tmp_dir = args.directory + "tmp/"
    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir)

    # Writing sequences to fasta files. The context managers guarantee the
    # data is flushed and the handles are closed before returning.
    with open(os.path.join(tmp_dir, "downstream.fa"), "w") as downstream_outfile:
        downstream_outfile.write(
            ">" + str(downstream_record.id) + "\n" + str(downstream_record.seq)
        )

    with open(os.path.join(tmp_dir, "upstream.fa"), "w") as upstream_outfile:
        upstream_outfile.write(
            ">" + str(upstream_record.id) + "\n" + str(upstream_record.seq)
        )
Example #2
0
def location_of_binding(args, outlist, exon_list):
    """Annotate each binding-site line with its position relative to the circRNA.

    Field 10 (index 9) and field 11 (index 10) of every tab-separated line
    are compared with the start and end parsed from ``args.coords``. One of
    the labels ``Downstream``, ``Upstream``, ``circRNA`` or ``BSJ`` is
    appended as a new tab-separated field. For every exon that fully contains
    the site, an ``Exon Binding`` field is appended as well.

    Args:
        args: Object whose ``coords`` attribute is parsed by ``split_coords``.
        outlist: Iterable of tab-separated lines with at least 11 fields.
        exon_list: Iterable of exon coordinate strings understood by
            ``split_coords``.

    Returns:
        list[str]: The annotated lines. Each input line contributes one entry
        after the location label is added and one further entry per exon in
        ``exon_list``.
    """
    # Parse the circRNA and exon coordinates once instead of on every
    # comparison inside the loops.
    coords = split_coords(args.coords)
    circ_start, circ_end = coords[1], coords[2]
    exon_bounds = []
    for exon in exon_list:
        exon_coords_parsed = split_coords(exon)
        exon_bounds.append((exon_coords_parsed[1], exon_coords_parsed[2]))

    finaloutlist = []
    for line in outlist:
        col = line.split("\t")
        site_start = int(col[9])
        if site_start < circ_start:
            line = line + "\t" + "Downstream"
        elif site_start > circ_end:
            line = line + "\t" + "Upstream"
        elif site_start > circ_start and int(col[10]) < circ_end:
            line = line + "\t" + "circRNA"
        elif (site_start == circ_start or int(col[10]) == circ_end
              or int(col[10]) > circ_start or site_start < circ_end):
            # Site touches or spans a boundary of the circRNA.
            line = line + "\t" + "BSJ"
        finaloutlist.append(line)
        # NOTE(review): the line is appended again for every exon, whether or
        # not it matched, so each input line yields 1 + len(exon_list)
        # entries. This looks unintended but is kept as-is because callers
        # may rely on the current output; confirm before changing.
        for exon_start, exon_end in exon_bounds:
            if site_start >= exon_start and int(col[10]) <= exon_end:
                line = line + "\t" + "Exon Binding"
            finaloutlist.append(line)

    return finaloutlist
Example #3
0
def exon_coords(args):
    """Collect the exons lying between the circRNA's start and end exons.

    The annotation file ``args.exon`` is scanned for the first ``exon`` row
    on the circRNA's chromosome whose field 4 (index 3) equals the circRNA
    start, and the first such row whose field 5 (index 4) equals the circRNA
    end. Every ``exon`` row between those two lines (inclusive) is turned
    into a ``chrom:start-end`` string.

    Args:
        args: Object providing ``exon`` (path to a tab-separated annotation
            file) and ``coords`` (parsed with ``split_coords``).

    Returns:
        The exon coordinate strings after being passed through
        ``remove_duplicates``.

    Raises:
        IndexError: If no exon row matches the circRNA start or end.
    """
    with open(args.exon) as handle:
        exon_file = handle.read().splitlines()

    coords = split_coords(args.coords)
    chromosome = coords[0]
    start_field = str(coords[1])
    end_field = str(coords[2])

    # Only the first match of each boundary is needed, so track the line
    # numbers directly instead of calling list.index() for every match.
    first_line = None
    last_line = None
    for number, line in enumerate(exon_file):
        col = line.split("\t")
        if col[0] != chromosome or col[2] != "exon":
            continue
        if first_line is None and col[3] == start_field:
            first_line = number
        if last_line is None and col[4] == end_field:
            last_line = number
        if first_line is not None and last_line is not None:
            break

    if first_line is None or last_line is None:
        raise IndexError(
            "no exon in {} matches the start and end of {}".format(
                args.exon, args.coords))

    # Making list of exons composing circRNA
    exon_list = []
    for line in exon_file[first_line:last_line + 1]:
        col = line.split("\t")
        if col[2] == "exon":
            exon_list.append(col[0] + ":" + col[3] + "-" + col[4])

    return remove_duplicates(exon_list)
Example #4
0
def intron_binding(args, finaloutlist):
    """Count the lines whose site start lies outside the circRNA coordinates.

    Args:
        args: Object whose ``coords`` attribute is parsed by ``split_coords``.
        finaloutlist: Iterable of tab-separated lines; field 10 (index 9)
            must be an integer position.

    Returns:
        int: Number of lines whose field 10 is smaller than the circRNA start
        or larger than the circRNA end.
    """
    # The circRNA boundaries do not change per line; parse them once.
    coords = split_coords(args.coords)
    circ_start, circ_end = coords[1], coords[2]

    yes_count = 0
    for element in finaloutlist:
        site_start = int(element.split("\t")[9])
        if site_start < circ_start or site_start > circ_end:
            yes_count += 1

    return yes_count
Example #5
0
def check_circBase(args, circBase):
    """Return the lines of ``circBase`` that exactly match ``args.coords``.

    Lines containing ``#`` anywhere are ignored. A line matches when its
    first three tab-separated fields equal the chromosome, start and end
    parsed from ``args.coords``.

    Args:
        args: Object whose ``coords`` attribute is parsed by ``split_coords``.
        circBase: Iterable of tab-separated lines.

    Returns:
        list[str]: The matching lines, in input order.
    """
    # Parse the query coordinates once rather than three times per line.
    coords = split_coords(args.coords)
    chromosome, start, end = coords[0], coords[1], coords[2]

    match = []
    for line in circBase:
        if "#" in line:
            continue
        col = line.split("\t")
        if chromosome == col[0] and start == int(col[1]) and end == int(col[2]):
            match.append(line)
    return match
def index_gtf(args):
    """Locate the exon rows matching the start and end of ``args.ID``.

    Args:
        args: Object providing ``A`` (path to a tab-separated annotation
            file) and ``ID`` (coordinates parsed with ``split_coords``).

    Returns:
        tuple: ``(line_number1, line_number2, gtf_file)`` where
        ``line_number1`` lists the 0-based indices of ``exon`` rows on the
        matching chromosome whose field 4 (index 3) equals the start,
        ``line_number2`` lists the indices of the remaining ``exon`` rows
        whose field 5 (index 4) equals the end, and ``gtf_file`` is the list
        of all lines of the file.
    """
    with open(args.A) as handle:
        gtf_file = handle.read().splitlines()

    coords = split_coords(args.ID)
    chromosome = coords[0]
    start_field = str(coords[1])
    end_field = str(coords[2])

    line_number1 = []
    line_number2 = []
    # enumerate() yields the real position of each row; list.index() was
    # quadratic and reported the first occurrence for duplicated lines.
    for number, line in enumerate(gtf_file):
        col = line.split("\t")
        if col[0] != chromosome or col[2] != "exon":
            continue
        if col[3] == start_field:
            line_number1.append(number)
        elif col[4] == end_field:
            line_number2.append(number)

    return line_number1, line_number2, gtf_file
Example #7
0
def up_intron(args):
    """Look up the value paired with the circRNA end in the annotation file.

    The file ``args.annotation`` is read as tab-separated lines. For the
    first line whose second field (index 1) equals the end coordinate parsed
    from ``args.coords``, the third field (index 2) is returned as an int.

    Args:
        args: Object providing ``annotation`` (file path) and ``coords``
            (parsed with ``split_coords``).

    Returns:
        int or None: The third field of the first matching line, or ``None``
        when no line matches.
    """
    with open(args.annotation, "r") as handle:
        annotation = handle.read().splitlines()

    # The end coordinate is the same for every line; parse it once.
    circ_end = split_coords(args.coords)[2]
    for line in annotation:
        col = line.split("\t")
        if int(col[1]) == circ_end:
            return int(col[2])

    return None
Example #8
0
def ENCORI(args, downstream_intron, upstream_intron):
    """Select the lines of ``args.infile`` that fall between the two introns.

    Lines containing ``#`` anywhere are ignored. A line is kept when its
    field 9 (index 8) equals the chromosome parsed from ``args.coords``, its
    field 10 (index 9) is at least ``downstream_intron + 1`` and its field 11
    (index 10) is at most ``upstream_intron``.

    Args:
        args: Object providing ``infile`` (path to a tab-separated file) and
            ``coords`` (parsed with ``split_coords``).
        downstream_intron: Lower bound; sites must start after it.
        upstream_intron: Upper bound; sites must end at or before it.

    Returns:
        list[str]: The selected lines, unmodified and in file order.
    """
    with open(args.infile, "r") as handle:
        infile = handle.read().splitlines()

    # The chromosome is constant for the whole file; parse it once.
    chromosome = split_coords(args.coords)[0]

    outlist = []
    for line in infile:
        if "#" in line:
            continue
        col = line.split("\t")
        if col[8] == chromosome \
                and int(col[9]) >= downstream_intron + 1 \
                and int(col[10]) <= upstream_intron:
            outlist.append(line)

    return outlist