def get_conservation(transcript_list,
                     output_file,
                     max_dS_threshold=None,
                     max_omega_threshold=None):
    """
    Get the conservation for a list of sequences and keep only those that pass the thresholds

    Args:
        transcript_list (dict): dict containing transcript id, the cds and the ortholog seqs
        output_file (str): path to output file
        max_dS_threshold (float): if set, pass in the dS threshold you wish alignments to be below
        max_omega_threshold (float): if set, pass in the omega threshold you wish alignments to be below
    """

    print("Getting the most conserved ortholog for each transcript...")

    temp_dir = "temp_conservation_files"
    gen.create_output_directories(temp_dir)
    # get a list of the transcript ids
    transcript_ids = list(transcript_list.keys())
    # run this linearly because it doesn't like being parallelised
    outputs = gen.run_parallel_function(
        transcript_ids,
        [transcript_list, max_dS_threshold, max_omega_threshold, temp_dir],
        run_conservation_check,
        parallel=False)
    # remove the old output file if there is one
    gen.remove_file(output_file)
    # now concatenate the output files
    args = ["cat"]
    args.extend(outputs)
    gen.run_process(args, file_for_output=output_file)
    gen.remove_directory(temp_dir)
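
# A minimal usage sketch for get_conservation. The dict layout and paths below
# are illustrative assumptions based on the docstring; run_conservation_check
# and the gen helpers must be available as above.
example_transcripts = {
    "ENST00000000233": {"cds": "ATGGCT...", "orthologs": {"mmus": "ATGGCA..."}},
}
get_conservation(example_transcripts, "results/conservation_filtered.txt",
                 max_dS_threshold=2.0, max_omega_threshold=0.5)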
def bam_nm_filter(input_bam, output, nm_less_equal_to=None):
    '''
    Filters bam reads by NM value.
    nm_less_equal_to: the NM value you wish to filter by.
    '''
    if nm_less_equal_to is None:
        raise ValueError("Please provide an NM filter value.")

    #create output file
    if output.endswith(".bam"):
        output_file = "{0}.sam".format(output[:-4])
    else:
        output_file = output
    sam_output = gen.run_process(["samtools", "view", "-h", input_bam])
    #create grep args and include header fields if they exist
    grep_args = ["^@"]
    #for each nm less than equal to threshold, create grep arg
    for i in range(nm_less_equal_to + 1):
        grep_args.append("\\|\tNM:i:{0}\t".format(i))
    grep_args = "".join(grep_args)
    gen.run_process(["grep", grep_args],
                    input_to_pipe=sam_output,
                    file_for_output=output_file)

    #if wanting to create bam, create bam and delete sam
    if output != output_file:
        samtools_args = ["samtools", "view", "-bh", output_file]
        gen.run_process(samtools_args, file_for_output=output)
        gen.remove_file(output_file)
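
# Usage sketch for bam_nm_filter (hypothetical paths): keep header lines plus
# reads with at most 2 mismatches (NM:i:0, NM:i:1 or NM:i:2). Assumes samtools
# is on the PATH.
bam_nm_filter("mapped/sample.bam", "mapped/sample_nm_le2.bam", nm_less_equal_to=2)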
def blast_sequences(fasta_file, database_path, output_file, evalue=None):
    """
    Given a fasta file and a database, run a blast

    Args:
        fasta_file (str): path to fasta file to blast sequences for
        database_path (str): path to local blast database
        output_file (str): path to output file
        evalue (str): if set, the e-value cutoff to pass to BLAST (defaults to "1e-04")
    """

    print("BLASTing sequences...")

    if not evalue:
        evalue = "1e-04"
    elif not isinstance(evalue, str):
        raise TypeError("evalue must be a string!")
    # run blast
    args = [
        "blastn", "-task", "blastn", "-query", fasta_file, "-db",
        database_path, "-out", output_file, "-outfmt", "10", "-evalue", evalue,
        "-num_threads",
        str(max(1, os.cpu_count() - 3))
    ]
    gen.run_process(args)
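
# Usage sketch for blast_sequences (hypothetical paths): BLAST queries against
# a local nucleotide database (built e.g. with make_blast_database below) and
# write comma-separated hits.
blast_sequences("queries.fasta", "blast_db/my_db", "results/blast_hits.csv", evalue="1e-06")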
def retrieve_bams_core(all_files, local_directory, host, user, password,
                       ftp_directory, expect_string):
    '''
    Core function parallelized in retrieve_bams above.
    '''
    #connect to FTP server
    ftp = gen.ftp_connect(host, user, password, directory=ftp_directory)
    #loop over .bam files
    for pos, bam_file in enumerate(all_files):
        expect_file = "temp_data/expect_file{0}.txt".format(random.random())
        start_time = time.time()
        print("{0}/{1}".format(pos, len(all_files)))
        local_bam_file = "{0}/{1}".format(local_directory, bam_file)
        #retrieve current file
        if not os.path.isfile(local_bam_file):
            ftp = gen.ftp_retrieve(ftp,
                                   host,
                                   user,
                                   password,
                                   ftp_directory,
                                   bam_file,
                                   destination=local_directory)
        #transfer file to Watson
        current_expect_string = expect_string.replace("foo", bam_file)
        with open(expect_file, "w") as e_file:
            e_file.write(current_expect_string)
        gen.run_process(["expect", expect_file])
        print("Transferred to Watson.")
        gen.remove_file(expect_file)
        gen.remove_file(local_bam_file)
        print("Time spent: {0} minutes.\n".format(
            round((time.time() - start_time) / 60), 3))
    ftp = gen.ftp_check(ftp, host, user, password, ftp_directory)
    ftp.quit()
def fasta_from_intervals(bed_file, fasta_file, genome_fasta, force_strand = True, names = False):
    """
    Takes a bed file and creates a fasta file with the corresponding sequences.
    Credit: Rosina Savisaar

    Args:
        bed_file (str): the bed file path to create fasta from
        fasta_file (str): the output fasta file path
        genome_fasta (str): the file path to the genome fasta file
        force_strand (bool): if True (default), extract sequences strand-specifically (-s)
        names (bool): if False, the fasta record names will be generated from the sequence coordinates;
            if True, the fasta names will correspond to the 'name' field of the bed file
    """

    #if the index file exists, check whether the expected features are present
    genome_fasta_index = genome_fasta + '.fai'
    if os.path.exists(genome_fasta_index):
        bed_chrs = sorted(set(entry[0] for entry in gen.read_many_fields(bed_file, "\t")))
        index_chrs = sorted(set(entry[0] for entry in gen.read_many_fields(genome_fasta_index, "\t")))
        if not set(bed_chrs).issubset(set(index_chrs)):
            gen.remove_file(genome_fasta_index)

    bedtools_args = ["bedtools", "getfasta", "-s", "-fi", genome_fasta, "-bed", bed_file, "-fo", fasta_file]
    if not force_strand:
        del bedtools_args[2]
    if names:
        bedtools_args.append("-name")
    gen.run_process(bedtools_args)
    names, seqs = gen.read_fasta(fasta_file)
    seqs = [i.upper() for i in seqs]
    gen.write_to_fasta(names, seqs, fasta_file)
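
# Usage sketch for fasta_from_intervals (hypothetical paths): extract exon
# sequences strand-specifically, naming records after the bed 'name' field.
# Assumes bedtools is on the PATH.
fasta_from_intervals("exons.bed", "exons.fasta", "genome/GRCh38.fa", names=True)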
def convert_bed(input_bed, output_bed = None, to_hg38 = True):
    """
    Convert the chromosome naming in a bed file from hg37 style to hg38 style and
    vice versa (only the "chr" prefix is changed; no liftover is performed)

    Args:
        input_bed (str): path to bed file
        output_bed (str): if set, path to output_file
        to_hg38 (bool): if set, convert to hg38, else convert to hg37
    """

    # create temp file if no output file is given
    if not output_bed:
        file_to_write = "temp_files/{0}.bed".format(random.random())
    else:
        file_to_write = output_bed

    entries = gen.read_many_fields(input_bed, "\t")
    with open(file_to_write, "w") as outfile:
        for entry in entries:
            if to_hg38:
                # strip the "chr" prefix explicitly; str.strip("chr") would also
                # remove other leading/trailing c, h and r characters
                if entry[0].startswith("chr"):
                    entry[0] = entry[0][3:]
            else:
                entry[0] = "chr{0}".format(entry[0])
            outfile.write("{0}\n".format("\t".join(entry)))

    # if no output file was given, overwrite the input file with the converted file
    if not output_bed:
        gen.run_process(["mv", file_to_write, input_bed])
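
# Usage sketch for convert_bed (hypothetical path): with no output file given,
# the input bed is overwritten in place with "chr" prefixes stripped.
convert_bed("regions.bed", to_hg38=True)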
def bam_xt_filter(input_bam, output, xt_filter=None):
    '''
    Filter a bam/sam file by XT tag.
    xt_filter: the XT tag value to keep.
    '''
    if not xt_filter:
        raise ValueError("Please specify an XT filter.")
    #create output file
    if output.endswith(".bam"):
        output_file = "{0}.sam".format(output[:-4])
    else:
        output_file = output

    sam_output = gen.run_process(["samtools", "view", "-h", input_bam])
    grep_args = []
    #get header lines
    grep_args.append("^@")
    #get XT values matching xt_filter
    grep_args.append("\\|\tXT:A:{0}\t".format(xt_filter))
    grep_args = "".join(grep_args)
    gen.run_process(["grep", grep_args],
                    input_to_pipe=sam_output,
                    file_for_output=output_file)

    #if wanting to create bam, create bam and delete sam
    if output != output_file:
        samtools_args = ["samtools", "view", "-bh", output_file]
        gen.run_process(samtools_args, file_for_output=output)
        gen.remove_file(output_file)
def extract_exons(gtf, bed):
        '''
        Given a GTF file, extract exon coordinates and write them to .bed.
        EX.: extract_exons("../source_data/Homo_sapiens.GRCh37.87.gtf",
        "../source_data/Homo_sapiens.GRCh37.87_exons.bed")
        '''
        #extract exons from GTF
        exons = gen.run_process(["grep", "\texon\t", gtf])
        #filter down to only protein-coding ones
        exons = gen.run_process(["grep", "transcript_biotype \"protein_coding\""], input_to_pipe = exons)
        #split lines
        exons = [i.split("\t") for i in exons.split("\n")]
        #format as .bed. Switch to base 0.
        exons = [["chr{0}".format(i[0]), int(i[3]) - 1, i[4], i[8], ".", i[6]] for i in exons if len(i) >= 8]
        #pre-compile regex
        trans_regex = re.compile("(?<=transcript_id \")ENST[0-9]*")
        exon_no_regex = re.compile("(?<=exon_number \")[0-9]*")
        #extract transcript IDs and exon numbers
        for pos, exon in enumerate(exons):
                to_parse = exon[3]
                trans = re.search(trans_regex, to_parse).group(0)
                exon_no = re.search(exon_no_regex, to_parse).group(0)
                exons[pos][3] = "{0}.{1}".format(trans, exon_no)
        #write to bed
        with open(bed, "w") as file:
                for exon in exons:
                        file.write("{0}\n".format("\t".join([str(i) for i in exon])))
def sort_bed(input_file_name, output_file_name):
    '''
    Sort a bed file.
    '''
    #Sort via a temp file so that the same file can be specified as input and
    #output, overwriting the unsorted file with the sorted one.
    temp_file_name = "temp_data/temp_sorted_bed{0}.bed".format(random.random())
    gen.run_process(["sort-bed", input_file_name],
                    file_for_output=temp_file_name)
    gen.run_process(["mv", temp_file_name, output_file_name])
    gen.remove_file(temp_file_name)
 def test_phase_bams(self):
     snps = "test_data/bam_ops/test_phase_bams/snps.bed"
     sam = "test_data/bam_ops/test_phase_bams/reads.sam"
     bam = "test_data/bam_ops/test_phase_bams/reads.bam"
     gen.run_process(["samtools", "view", "-S", "-b", sam], file_for_output = bam)
     expected = "test_data/bam_ops/test_phase_bams/expected.sam"
     observed = "test_data/bam_ops/test_phase_bams/observed.sam"
     gen.remove_file(observed)
     phase_bams(snps, bam, "HG3", observed)
     expected = gen.read_many_fields(expected, "\t")
     observed = gen.read_many_fields(observed, "\t")
     self.assertEqual(expected, observed)
 def test_bam_xt_filter(self):
     input_bam = "test_data/bam_ops/test_bam_xt_filter/input_bam.bam"
     expected = "test_data/bam_ops/test_bam_xt_filter/expected_bam_xt_filter.sam"
     observed = "test_data/bam_ops/test_bam_xt_filter/observed_bam_xt_filter.bam"
     observed_sam_file = "test_data/bam_ops/test_bam_xt_filter/observed_bam_xt_filter.sam"
     bam_xt_filter(input_bam, observed, xt_filter="U")
     #convert bam to sam to check correct output
     samtools_args = ["samtools", "view", observed]
     gen.run_process(samtools_args, file_for_output=observed_sam_file)
     expected = gen.read_many_fields(expected, "\t")
     observed = gen.read_many_fields(observed_sam_file, "\t")
     self.assertEqual(expected, observed)
 def test_bam_flag_filter_unmapped_reads(self):
     input_bam = "test_data/bam_ops/test_bam_flag_filter_unmapped_reads/input_bam.bam"
     expected = "test_data/bam_ops/test_bam_flag_filter_unmapped_reads/expected_flag_filtered_unmapped_reads.sam"
     observed = "test_data/bam_ops/test_bam_flag_filter_unmapped_reads/observed_flag_filtered_unmapped_reads_bam.bam"
     observed_sam_output = "test_data/bam_ops/test_bam_flag_filter_unmapped_reads/observed_flag_filtered_unmapped_reads.sam"
     bam_flag_filter(input_bam, observed, get_unmapped_reads=True)
     #convert bam to sam to check correct output
     samtools_args = ["samtools", "view", observed]
     gen.run_process(samtools_args, file_for_output=observed_sam_output)
     expected = gen.read_many_fields(expected, "\t")
     observed = gen.read_many_fields(observed_sam_output, "\t")
     self.assertEqual(expected, observed)
def make_blast_database(fasta_file, database_path):
    """
    Make a BLAST database
    """

    print("Making BLAST database...")

    args = [
        "makeblastdb", "-in", fasta_file, "-out", database_path, "-dbtype",
        "nucl"
    ]
    gen.run_process(args)
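
# Usage sketch for make_blast_database (hypothetical paths): build the
# nucleotide database that blast_sequences above can query. Assumes the NCBI
# BLAST+ binaries are installed.
make_blast_database("reference_seqs.fasta", "blast_db/my_db")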
 def test_bam_quality_filter_less_than(self):
     input_bam = "test_data/bam_ops/test_bam_quality_filter_less_than/test_bam.bam"
     expected = "test_data/bam_ops/test_bam_quality_filter_less_than/expected_bam_quality_filter_less_than.sam"
     observed = "test_data/bam_ops/test_bam_quality_filter_less_than/observed_bam_quality_filter_less_than.bam"
     observed_sam_output = "test_data/bam_ops/test_bam_quality_filter_less_than/observed_bam_quality_filter_less_than.sam"
     expected = gen.read_many_fields(expected, "\t")
     bam_quality_filter(input_bam, observed, quality_less_than_equal_to=250)
     #convert bam to sam to check correct output
     #use samtools to extract in the same format as sam
     samtools_args = ["samtools", "view", observed]
     gen.run_process(samtools_args, file_for_output=observed_sam_output)
     observed = gen.read_many_fields(observed_sam_output, "\t")
     self.assertEqual(expected, observed)
def merge_bams(bam_list, output_file):
    '''
    Merge a list of bam files to defined output file.
    '''
    #setup args, add -r to attach filename rg tag
    args = ["samtools", "merge", "-r"]
    if os.path.exists(output_file):
        args.append("-f")
    args.append(output_file)
    #add each input file to the argument list
    args.extend(bam_list)
    gen.run_process(args)
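
# Usage sketch for merge_bams (hypothetical paths): merge two runs into one
# bam, with each read tagged by the file it came from (-r).
merge_bams(["runs/lane1.bam", "runs/lane2.bam"], "runs/combined.bam")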
 def test_intersect_bam(self):
     bam_file = "test_data/bam_ops/test_intersect_bam/test_input_bam.bam"
     bed_file = "test_data/bam_ops/test_intersect_bam/test_input_bed.bed"
     observed_bam_output = "test_data/bam_ops/test_intersect_bam/observed_bam_intersect.bam"
     observed_bed_output = "test_data/bam_ops/test_intersect_bam/observed_bam_intersect.bed"
     expected_bed_output = "test_data/bam_ops/test_intersect_bam/expected_intersect_bed.bed"
     intersect_bed(bam_file, bed_file, output_file=observed_bam_output, intersect_bam=True)
     expected = gen.read_many_fields(expected_bed_output, "\t")
     #convert bam to bed to check correct output
     #use samtools to extract in the same format as bed
     samtools_args = ["samtools", "view", observed_bam_output]
     gen.run_process(samtools_args, file_for_output=observed_bed_output)
     observed = gen.read_many_fields(observed_bed_output, "\t")
     self.assertEqual(observed, expected)
 def test_merge_bams(self):
     input_bam1 = "test_data/bam_ops/test_merge_bams/input1.bam"
     input_bam2 = "test_data/bam_ops/test_merge_bams/input2.bam"
     input_list = [input_bam1, input_bam2]
     expected = "test_data/bam_ops/test_merge_bams/expected_merge_bams.sam"
     observed = "test_data/bam_ops/test_merge_bams/observed_merge_bams.bam"
     observed_sam_output = "test_data/bam_ops/test_merge_bams/observed_merge_bams.sam"
     merge_bams(input_list, observed)
     #convert bam to sam to check correct output
     samtools_args = ["samtools", "view", observed]
     gen.run_process(samtools_args, file_for_output=observed_sam_output)
     expected = gen.read_many_fields(expected, "\t")
     observed = gen.read_many_fields(observed_sam_output, "\t")
     self.assertEqual(expected, observed)
def extract_gtf_features(gtf, features, bed):
    """
    Extract given features coordinates from a .gtf file and write to .bed

    Args:
        gtf (str): path to gtf file
        features (list): list of features to extract
        bed (str): path to output bed file
    """

    feature_list = []
    #iterate over the desired features
    for feature in features:
        #extract feature from GTF
        gtf_features = gen.run_process(
            ["grep", "\t{0}\t".format(feature), gtf])
        #filter down to only protein-coding ones
        gtf_features = gen.run_process(
            ["grep", "transcript_biotype \"protein_coding\""],
            input_to_pipe=gtf_features)
        #split lines
        gtf_features = [i.split("\t") for i in gtf_features.split("\n")]
        #tag each line with its feature type
        for line in gtf_features:
            line.append(feature)
        #append to list
        feature_list.extend(gtf_features)
    #format as .bed. Switch to base 0.
    gtf_features = [[
        "chr{0}".format(i[0]),
        int(i[3]) - 1, i[4], i[8], i[-1], i[6]
    ] for i in feature_list if len(i) >= 10]  # 9 gtf fields plus the appended feature

    #pre-compile regex
    trans_regex = re.compile("(?<=transcript_id \")ENST[0-9]*")
    exon_no_regex = re.compile("(?<=exon_number \")[0-9]*")
    gene_regex = re.compile("(?<=gene_id \")ENSG[0-9]*")
    #extract transcript IDs and feature numbers
    for pos, feature in enumerate(gtf_features):
        to_parse = feature[3]
        trans = re.search(trans_regex, to_parse).group(0)
        exon_no = re.search(exon_no_regex, to_parse).group(0)
        gene_id = re.search(gene_regex, to_parse).group(0)
        gtf_features[pos][3] = "{0}.{1}".format(trans, exon_no)
        gtf_features[pos].append(gene_id)
    #write to bed
    with open(bed, "w") as file:
        for feature in gtf_features:
            file.write("{0}\n".format("\t".join([str(i) for i in feature])))
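
# Usage sketch for extract_gtf_features (hypothetical paths): write exon and
# CDS coordinates of protein-coding transcripts to bed.
extract_gtf_features("Homo_sapiens.GRCh37.87.gtf", ["exon", "CDS"], "exons_and_cds.bed")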
def sort_bed(input_file, output_file):
    """
    Sort a bed file.

    Args:
        input_file (str): path to the input file
        output_file (str): path to the output file
    """

    # Do like this so we can sort a file and keep the same name
    gen.create_output_directories("temp_data")
    temp_file_name = "temp_data/temp_sorted_bed{0}.bed".format(random.random())
    gen.run_process(["sortBed", "-i", input_file],
                    file_for_output=temp_file_name)
    gen.run_process(["mv", temp_file_name, output_file])
    gen.remove_file(temp_file_name)
def remove_bed_overlaps(input_file, output_file):
    '''
    Given a bed file, only leave non-overlapping elements, regardless of the strand of the overlap.
    Adapted from function written by RS

    Args:
        input_file (str): path to the input file to remove overlaps
        output_file (str): path to the output file
    '''
    #check how many columns there are in the bedfile
    with open(input_file) as file:
        line = file.readline()
        column_number = line.count("\t") + 1
    # merge overlapping intervals and have it count how many of the elements from the original file contribute to each
    # interval in the new file
    # note that bedops takes the column numbers in base 1
    if column_number > 3:
        columns = ",".join([str(i)
                            for i in range(4, column_number + 1)] + ["1"])
        operations = ",".join(
            ["distinct" for i in range(4, column_number + 1)] + ["count"])
    else:
        columns = "1"
        operations = "count"
    merge_result = gen.run_process([
        "bedtools", "merge", "-i", input_file, "-c", columns, "-o", operations
    ])
    #only leave those intervals that do not result from a merge and delete counts column
    merge_result = merge_result.split("\n")
    with open(output_file, "w") as outfile:
        for line in merge_result:
            if line.endswith("\t1"):
                outfile.write("{0}\n".format(line[:-2]))
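
# Usage sketch for remove_bed_overlaps (hypothetical paths). Intervals that
# overlap nothing come out of bedtools merge with a contributing-element
# count of 1 and are kept; merged (overlapping) intervals are dropped.
remove_bed_overlaps("features.bed", "features_non_overlapping.bed")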
def get_genome_bed_from_fasta_index(features_bed, fasta_index, output_file):
    """
    Given a list of features, get the genome coordinates as a bed file.

    Args:
        features_bed (str): path to bed file containing features
        fasta_index (str): path to fasta index file
        output_file (str): path to output file
    """

    # get all the chromosomes required, stripping any "chr" prefix explicitly
    # (str.strip("chr") would also remove other leading/trailing c, h and r characters)
    chromosomes = set(gen.run_process(["awk", "{print $1}"],
                                      file_for_input=features_bed).split("\n"))
    first_column = [i[3:] if i.startswith("chr") else i for i in chromosomes if len(i)]
    # get index lines
    index = gen.read_many_fields(fasta_index, "\t")
    with open(output_file, "w") as outfile:
        for i in index:
            if i[0] in first_column:
                start = 0
                length = int(i[1])
                out_info = [i[0], start, start + length, ".", "."]
                outfile.write("{0}\t+\n{0}\t-\n".format("\t".join(
                    gen.stringify(out_info))))
def check_coding(exons_file, CDSs_file, outfile, remove_overlapping = False):
        '''
        Given a bed file of exon coordinates and a bed file of CDS coordinates,
        writes a new bed file that only contains those exon coordinates from the former file that
        1) are fully coding
        2) are internal
        NB! Assumes that all the coordinates are from non-overlapping transcripts.
        If this is not the case, set remove_overlapping to True and it'll remove overlapping
        intervals.
        '''
        if remove_overlapping:
                bmo.sort_bed(exons_file, exons_file)
                remove_overlaps(exons_file, exons_file)
        #filter out anything that isn't fully coding
        #you have to write_both because you want to make sure that they
        #haven't been kept because of an overlap to a transcript that doesn't appear in the exons file
        temp_file = "temp_data/temp{0}.txt".format(random.random())
        bmo.intersect_bed(exons_file, CDSs_file, overlap = 1, overlap_rec = True, output_file = temp_file, force_strand = True, write_both = True, no_dups = False, no_name_check = False)
        #filter out terminal exons
        #in theory, there shouldn't be any left after the previous step
        #in practice, there may be unannotated UTRs, so it looks like we have a fully coding terminal exon,
        #whereas in reality, the exon is only partially coding
        temp_file2 = "temp_data/temp{0}.txt".format(random.random())
        with open(temp_file2, "w") as o_file:
                #figure out the rank of the last exon for each transcript
                filt_exons = gen.read_many_fields(exons_file, "\t")
                filt_exons = [i for i in filt_exons if len(i) > 3]
                names = [i[3].split(".") for i in filt_exons]
                names = gen.list_to_dict(names, 0, 1, as_list = True)
                names = {i: max([int(j) for j in names[i]]) for i in names}
                coding_exons = gen.read_many_fields(temp_file, "\t")
                for exon in coding_exons:
                        overlap_name = exon[9].split(".")
                        if overlap_name[0] in names:
                                name = exon[3].split(".")
                                if name[-1] != "1":
                                        last_exon = names[name[0]]
                                        if int(name[-1]) != last_exon:
                                                exon = [str(i) for i in exon[:6]]
                                                o_file.write("\t".join(exon))
                                                o_file.write("\n")
        bmo.sort_bed(temp_file2, temp_file2)
        gen.run_process(["mergeBed", "-i", temp_file2, "-c", "4,5,6", "-o", "distinct,distinct,distinct"], file_for_output = outfile)
        gen.remove_file(temp_file)
        gen.remove_file(temp_file2)
 def test_tabix_samples2(self):
     bed_file = "test_data/snp_ops/test_tabix_samples2/test_tabix.bed"
     with open("test_data/snp_ops/test_tabix_samples2/expected_test_tabix_samples2.txt") as file:
         expected = "".join(file)
     observed = "test_data/snp_ops/test_tabix_samples2/observed_test_tabix_samples2.txt"
     gen.remove_file(observed)
     gen.remove_file(observed + ".gz")
     vcf_folder = "../source_data/per_sample_vcfs"
     panel_file = "../source_data/integrated_call_samples_v3.20130502.ALL.panel"
     tabix_samples(bed_file, observed + ".gz", panel_file, vcf_folder, samples = ["NA18917", "NA19024"])
     gen.run_process(["bgzip", "-d", observed + ".gz"])
     with open(observed) as file:
         observed = "".join(file)
     expected = re.sub("0\.[0-9]*\.vcf", "N.vcf", expected)
     observed = re.sub("0\.[0-9]*\.vcf", "N.vcf", observed)
     expected = re.sub("source_[0-9]*\.[0-9]*", "source_N", expected)
     observed = re.sub("source_[0-9]*\.[0-9]*", "source_N", observed)
     self.assertEqual(observed, expected)
def run_bedops(A_file,
               B_file,
               force_strand=False,
               write_both=False,
               chrom=None,
               overlap=None,
               sort=False,
               output_file=None,
               intersect=False,
               hit_number=None,
               no_dups=False,
               overlap_rec=None,
               intersect_bam=None):
    '''
    See intersect_bed for details.
    '''
    if intersect:
        command = "--intersect"
    else:
        command = "--element-of"
    if sort:
        sort_bed(A_file, A_file)
        sort_bed(B_file, B_file)
    bedops_args = ["bedops", "--chrom", "foo", command, "1", A_file, B_file]
    if overlap:
        bedops_args[4] = overlap
    if chrom:
        bedops_args[2] = chrom
        if intersect:
            del bedops_args[4]
    else:
        del bedops_args[1:3]
        if intersect:
            del bedops_args[2]
    if force_strand:
        print(
            "Bedops can't search by strand! Either use bedtools or separate input data by strand!"
        )
        raise Exception
    if write_both:
        print("Bedops can't write both features!")
        raise Exception
    if hit_number:
        print(
            "Bedops hasn't been set up to count the number of overlapping elements. Use bedtools!"
        )
        raise Exception
    if no_dups:
        print("Bedops doesn't print duplicates by default!")
    if overlap_rec:
        print("Bedops hasn't been set up to filter by overlap in second file!")
    if intersect_bam:
        print("Use bedtools to intersect bam and bed!")
        raise Exception
    bedops_output = gen.run_process(bedops_args, file_for_output=output_file)
    return bedops_output
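
# Usage sketch for run_bedops (hypothetical paths): report elements of the
# first file that overlap the second, restricted to one chromosome.
run_bedops("peaks.bed", "exons.bed", chrom="chr1", output_file="peaks_in_exons.bed")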
def extract_seqs(source_path, genome_fasta, output_bed, output_fasta, output_seq_fasta, mapping_file, codes_file, exclude_XY=None, hg38=None, NONCODE=None):
    """
    Generate a file containing the exon sequences for a given .bed file

    Args:
        source_path (str): the source path for the origin .gtf file
        genome_fasta (str): the source path for the genome fasta
        output_bed (str): output .bed file to contain the exon info
        output_fasta (str): output fasta containing sequences
        output_seq_fasta (str): output fasta containing the transcript sequences built from the exons
        mapping_file (str): path to the file mapping transcript ids to gene ids
        codes_file (str): used for NONCODE sequences to get the lincRNA
        exclude_XY (bool): if true, exclude cases on the X and Y chr
        hg38 (bool): if true, use hg38
        NONCODE (bool): if true, using NONCODE sequences
    """

    # create the exon bed file
    full_bed = "{0}/full_{1}".format("/".join(output_bed.split('/')[:-1]), output_bed.split("/")[-1])
    entries_to_bed(source_path, full_bed, exclude_XY, hg38=hg38, NONCODE=NONCODE)
    # generate the fasta from the file
    full_exon_fasta = "{0}/full_{1}".format("/".join(output_fasta.split('/')[:-1]), output_fasta.split("/")[-1])
    fasta_from_intervals(full_bed, full_exon_fasta, genome_fasta, names=True)
    # build the sequences from the exons
    full_seq_fasta = "{0}/full_{1}".format("/".join(output_seq_fasta.split('/')[:-1]), output_seq_fasta.split("/")[-1])
    build_seqs_from_exons_fasta(full_exon_fasta, full_seq_fasta)
    length_filter_fasta = "{0}/length_filtered_{1}".format("/".join(output_seq_fasta.split('/')[:-1]), output_seq_fasta.split("/")[-1])
    ops.filter_seq_lengths(full_seq_fasta, length_filter_fasta, 200)
    # filter to only keep one transcript per gene
    unique_transcripts_fasta = "{0}/unique_gene_filtered_{1}".format("/".join(output_seq_fasta.split('/')[:-1]), output_seq_fasta.split("/")[-1])
    ops.uniquify_lincRNA_transcripts(length_filter_fasta, mapping_file, unique_transcripts_fasta)

    if NONCODE:
        # get only those that are lincRNA
        ops.get_passed_NONCODE_codes(unique_transcripts_fasta, codes_file, mapping_file, output_seq_fasta, "0001")
    else:
        # otherwise the step above isn't needed, so just copy the file
        gen.run_process(["cp", unique_transcripts_fasta, output_seq_fasta])
    # filter bed file from fasta
    ops.filter_bed_from_fasta(full_bed, output_seq_fasta, output_bed)
    # now just get the exon seqs from these entries
    fasta_from_intervals(output_bed, output_fasta, genome_fasta, names=True)
def convert2bed(input_file_name, output_file_name, group_flags_col=None):
    '''
    Converts an input file (sam, bam, gtf, gff...) to a bed file using bedops.
    Set 'group_flags_col' to an integer if you want to group all the fields from a certain field onwards.
    For instance, if you set it to 5, all of the fields from the 5th onward will be turned
    into a comma-separated string and stored as one field in the .bed file.
    Note that you cannot group all the fields in a row (i.e. you can't set it to 0).
    The parameter is deliberately not named group_flags, which would shadow the
    group_flags function called below.
    '''
    extension = gen.get_extension(input_file_name, 3,
                                  ["sam", "bam", "gtf", "gff"])
    bed_data = gen.run_process(
        ["convert2bed", "--input={0}".format(extension)],
        file_for_input=input_file_name,
        file_for_output=output_file_name)
    if group_flags_col:
        temp_file_name = "temp_data/temp_bed_file{0}.bed".format(
            random.random())
        group_flags(output_file_name, temp_file_name, group_flags_col)
        gen.run_process(["mv", temp_file_name, output_file_name])
        print("Grouped flags.")
    print("Converted data from {0} to bed.".format(extension))
def get_descriptions(names, gtf, out_file):
        '''
        Given a set of Ensembl transcript identifiers and a GTF file,
        determine the corresponding "gene name" for each transcript identifier.
        '''
        name_regex = re.compile(r"(?<=gene_name \")[A-Za-z0-9\.\-\/\(\)]*(?=\")")
        trans_regex = re.compile(r"(?<=transcript_id \")[A-Za-z0-9]*(?=\")")
        transcript_lines = gen.run_process(["grep", "\ttranscript\t", gtf])
        transcript_lines = transcript_lines.split("\n")
        with open(out_file, "w") as file:
                for line in transcript_lines:
                        if len(line) > 1:
                                trans = re.search(trans_regex, line).group(0)
                                if trans in names:
                                        description = re.search(name_regex, line).group(0)
                                        file.write("{0}\t{1}\n".format(trans, description))
def bam_quality_filter(input_bam,
                       output_bam,
                       quality_greater_than_equal_to=None,
                       quality_less_than_equal_to=None):
    '''
    Filters bam reads by quality.
    quality_greater_than_equal_to: the lower quality threshold (keep reads with quality >= this)
    quality_less_than_equal_to: the upper quality threshold (keep reads with quality <= this)
    '''

    samtools_args = ["samtools", "view", "-h"]
    #if neither threshold is specified
    if quality_greater_than_equal_to is None and quality_less_than_equal_to is None:
        raise ValueError("You must specify at least one threshold to filter reads by.")
    #if both thresholds are specified
    if quality_greater_than_equal_to is not None and quality_less_than_equal_to is not None:
        #create temp file
        gen.create_directory("temp_data/")
        temp_file = "temp_data/{0}.{1}.bam".format(
            os.path.split(output_bam)[1][:-4], random.random())
        #first get everything below the upper threshold
        #need to account for the fact samtools removes everything below threshold
        #so when inversing need to add 1 to total
        args = samtools_args.copy()
        upper_limit = quality_less_than_equal_to + 1
        args.extend(["-q", str(upper_limit), input_bam, "-U", temp_file])
        gen.run_process(args)
        #second get everything above the lower threshold
        args = samtools_args.copy()
        args.extend(["-bq", str(quality_greater_than_equal_to), temp_file])
        gen.run_process(args, file_for_output=output_bam)
        #cleanup files
        gen.remove_file(temp_file)
    #if only the lower threshold is specified
    elif quality_greater_than_equal_to is not None:
        samtools_args.extend(["-bq", str(quality_greater_than_equal_to), input_bam])
        gen.run_process(samtools_args, file_for_output=output_bam)
    #if only the upper threshold is specified
    elif quality_less_than_equal_to is not None:
        #need to account for the fact samtools removes everything below threshold
        #so when inversing need to add 1 to total
        upper_limit = quality_less_than_equal_to + 1
        samtools_args.extend(["-q", str(upper_limit), input_bam, "-U", output_bam])
        gen.run_process(samtools_args)
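
# Usage sketch for bam_quality_filter (hypothetical paths): keep reads with
# mapping quality between 20 and 250 inclusive.
bam_quality_filter("aligned.bam", "aligned_q20_250.bam",
                   quality_greater_than_equal_to=20,
                   quality_less_than_equal_to=250)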
def filter_fasta_from_bed(bed_file, input_fasta, output_fasta, filter_column = 3):
    """
    Given a bed file, filter a fasta file to contain only entries with ids
    in the given column

    Args:
        bed_file (str): path to bed file containing entries
        input_fasta (str): path to fasta file containing the sequences to filter
        output_fasta (str): path to output fasta file
        filter_column (int): base 0 index of the column to use as filtering
    """

    # get the ids from the given column (a set gives constant-time lookups)
    ids = set(i for i in gen.run_process(["cut", "-f{0}".format(filter_column + 1), bed_file]).split("\n") if i)
    # read in the sequences
    names, seqs = gen.read_fasta(input_fasta)
    # filter and output to file
    with open(output_fasta, "w") as outfile:
        for i, name in enumerate(names):
            if name in ids:
                outfile.write(">{0}\n{1}\n".format(name, seqs[i]))
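
# Usage sketch for filter_fasta_from_bed (hypothetical paths): keep only the
# fasta records whose names appear in column 4 (index 3) of the bed file.
filter_fasta_from_bed("kept_exons.bed", "all_seqs.fasta", "kept_seqs.fasta", filter_column=3)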
def filter_fasta_intervals_from_fasta(intervals_fasta, fasta, output):
        '''
        Given a fasta file and a fasta intervals file, filter the intervals file to only leave records where the 'name' field appears
        among the names in the fasta file. Write to fasta.
        '''
        #add feature in here that enables overwrite of current file
        output_exists = False
        if Path(output).exists():
            output_exists = True
            temp_file_name = "{0}.{1}{2}".format(os.path.splitext(output)[0], random.random(), os.path.splitext(output)[1])
        else:
            temp_file_name = output

        #fish out the names in the fasta
        fasta_names = gen.run_process(["grep", ">", fasta])
        fasta_names = fasta_names.split("\n")
        #remove tag and newline from each name
        fasta_names = [i.lstrip(">").rstrip("\n") for i in fasta_names]
        #remove any blank entries and use a set for fast membership tests
        fasta_names = set(i for i in fasta_names if i)

        #read in the interval data
        fasta_interval_names, fasta_interval_seqs = gen.read_fasta(intervals_fasta)
        id_regex = re.compile(r"^(\w+).*")
        with open(temp_file_name, "w") as file:
            for i, interval in enumerate(fasta_interval_names):
                #search for the sample name
                match = re.search(id_regex, interval)
                if match:
                    trans_id = match.group(1)
                    #if the sample name is in the fasta names, output to file
                    if trans_id in fasta_names:
                        file.write(">{0}\n{1}\n".format(fasta_interval_names[i], fasta_interval_seqs[i]))
        #remove old file, replace with new
        if output_exists:
            os.remove(output)
            shutil.move(temp_file_name, output)
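
# Usage sketch for filter_fasta_intervals_from_fasta (hypothetical paths):
# keep interval records whose leading id word matches a name in the fasta.
filter_fasta_intervals_from_fasta("exon_intervals.fasta", "kept_seqs.fasta", "kept_intervals.fasta")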