def read_fasta(fasta_line, v=False):
    """Read fasta, return dict and type."""
    fasta_data = fasta_line.split(">")
    eprint(f"fasta_data[0] is:\n{fasta_data[0]}") if v else None
    eprint(f"fasta_data[1] is:\n{fasta_data[1]}") if v else None
    if fasta_data[0] != "":
        # a non-empty first element indicates a bug: corrupted CESAR output
        eprint("ERROR! Cesar output is corrupted")
        eprint(f"Issue detected in the following string:\n{fasta_line}")
        die("Abort")
    del fasta_data[0]  # remove the leading "" element, we don't need it
    sequences = {}  # accumulate data here
    order = []  # to have an ordered list
    # there is no guarantee that the dict will keep the elements in the
    # same order as they were added
    for elem in fasta_data:
        raw_lines = elem.split("\n")
        # the header must come first: ['capHir1', 'ATGCCGCGCCAATTCCCCAAGCTGA...']
        header = raw_lines[0]
        # separate nucleotide-containing lines
        lines = [x for x in raw_lines[1:] if x != "" and not x.startswith("!")]
        if len(lines) == 0:
            # an empty sequence is a mistake --> skip it
            continue
        fasta_content = "".join(lines)
        sequences[header] = fasta_content
        order.append(header)
    return sequences, order
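# Hedged usage sketch (hypothetical, not part of the original module): a tiny demo of
# read_fasta showing the expected input shape -- one string with ">"-prefixed records --
# and the (sequences, order) return value. The toy FASTA content below is invented.
def _demo_read_fasta():
    toy_fasta = ">capHir1\nATGCCGCGC\nCAATTG\n>mm10\nATGTTTAAA\n"
    seqs, order = read_fasta(toy_fasta)
    # headers keep their input order; sequence lines are concatenated per record
    assert order == ["capHir1", "mm10"]
    assert seqs["capHir1"] == "ATGCCGCGCCAATTG"
    assert seqs["mm10"] == "ATGTTTAAA"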
def save(template, batch):
    """Save the cluster jobs, create the jobs_file file."""
    filenames = {}  # collect filenames of cluster jobs
    for num, jobs in enumerate(batch):
        # define the path for the job
        job_path = os.path.join(WORK_DATA["jobs"], f"part_{num}")
        filenames[num] = job_path  # these paths are needed for the jobs_file file
        # put the \n-separated jobs into the template and save
        with open(job_path, "w") as f:
            f.write("\n".join(jobs) + "\n")
    # save the jobs_file file
    # add > for stdout and 2> for stderr (if required)
    f = open(WORK_DATA["jobs_file"], "w")
    for num, path in filenames.items():
        cmd = template.format(path)
        stdout_part = f"> {WORK_DATA['results_dir']}/{num}.txt"
        stderr_part = f"2> {WORK_DATA['errors_dir']}/{num}.txt" \
            if WORK_DATA["errors_dir"] else ""
        jobs_file_line = "{0} {1} {2}\n".format(cmd, stdout_part, stderr_part)
        f.write(jobs_file_line)
    f.close()
    # make executable
    rc = subprocess.call(f"chmod +x {WORK_DATA['jobs_file']}", shell=True)
    if rc != 0:  # just in case
        die(f"Error! chmod +x {WORK_DATA['jobs_file']} failed")
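# Hedged usage sketch (hypothetical, not part of the original module): demonstrates the
# jobs_file line format produced by save(), i.e. "<cmd> > <results_dir>/<num>.txt" plus
# an optional "2> <errors_dir>/<num>.txt". All paths and the "bash {0}" template are
# invented; WORK_DATA is assumed to be the module-level dict used above.
def _demo_save():
    import tempfile
    tmp = tempfile.mkdtemp()
    WORK_DATA["jobs"] = os.path.join(tmp, "jobs")
    WORK_DATA["results_dir"] = os.path.join(tmp, "results")
    WORK_DATA["errors_dir"] = None  # no stderr redirection in this demo
    WORK_DATA["jobs_file"] = os.path.join(tmp, "jobs_file")
    os.mkdir(WORK_DATA["jobs"])
    os.mkdir(WORK_DATA["results_dir"])
    batch = [["echo a", "echo b"], ["echo c"]]  # two cluster jobs, one command list each
    save("bash {0}", batch)
    # the jobs_file now holds one executable line per job, e.g.:
    # bash <tmp>/jobs/part_0 > <tmp>/results/0.txt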
def load_results(results_dir):
    """Load and sort the chain feature extractor results."""
    verbose("Loading the results...")
    results_files = os.listdir(results_dir)
    verbose(f"There are {len(results_files)} result files to combine")
    # to hold data from the "genes" fields:
    chain_genes_data = defaultdict(list)
    # to hold data from the "chain" fields:
    chain_raw_data = {}
    # read the files one-by-one, otherwise it takes too much memory
    genes_counter, chain_counter = 0, 0  # count gene and chain lines
    for results_file in results_files:
        # there are N files: read them one-by-one
        path = os.path.join(results_dir, results_file)
        f = open(path, "r")
        for line in f:
            # read the file line-by-line, all fields are tab-separated
            line_data = line.rstrip().split("\t")
            # define the class of this line:
            # a line is either gene- or chain-related
            if line_data[0] == "genes":
                # process as a gene line
                chain, genes = process_gene_line(line_data)
                chain_genes_data[chain].extend(genes)
                genes_counter += 1
            elif line_data[0] == "chain":
                # chain-related data
                the_chain_related = process_chain_line(line_data)
                # add this chain-related dict to the global one
                chain_raw_data.update(the_chain_related)
                chain_counter += 1
        # do not forget to close the file
        f.close()
    verbose(f"Got {len(chain_genes_data)} keys in chain_genes_data")
    verbose(f"Got {len(chain_raw_data)} keys in chain_raw_data")
    verbose(
        f"There were {genes_counter} genes lines and {chain_counter} chain lines"
    )
    # these values must be equal; just a sanity check
    if not genes_counter == chain_counter:
        eprint(f"WARNING! genes_counter and chain_counter hold different "
               f"values:\n{genes_counter} and {chain_counter} respectively")
        die("Some feature extraction jobs died!")
    return chain_genes_data, chain_raw_data
def bed12_to_ranges(bed):
    """Convert a bed-12 file to a set of sorted ranges."""
    ranges_unsort, chrom = [], None
    for line in bed.split("\n")[:-1]:
        # parse the line and extract the blocks
        line_info = line.split("\t")
        chrom = line_info[0]
        glob_start = int(line_info[1])
        blocks_num = int(line_info[9])
        block_sizes = [int(x) for x in line_info[10].split(",") if x != ""]
        block_starts = [glob_start + int(x) for x in line_info[11].split(",") if x != ""]
        block_ends = [block_starts[i] + block_sizes[i] for i in range(blocks_num)]
        for i in range(blocks_num):
            # save the range for each exon
            ranges_unsort.append((block_starts[i], block_ends[i]))
    # return sorted ranges
    die("(bed12_to_ranges) error, cannot read bed properly") if not chrom else None
    return chrom, sorted(ranges_unsort, key=lambda x: x[0])
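# Hedged usage sketch (hypothetical): a single invented bed-12 line fed to bed12_to_ranges.
# Two blocks of sizes 10 and 20 at offsets 0 and 100 from chromStart=1000 become the
# absolute exon ranges (1000, 1010) and (1100, 1120).
def _demo_bed12_to_ranges():
    bed_line = "\t".join([
        "chr1", "1000", "1120", "toy_tx", "0", "+",
        "1000", "1120", "0", "2", "10,20,", "0,100,"
    ])
    chrom, ranges = bed12_to_ranges(bed_line + "\n")
    assert chrom == "chr1"
    assert ranges == [(1000, 1010), (1100, 1120)]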
def read_bed_data(bed_file):
    """Get the necessary data from the bed file."""
    result = {}  # return this dictionary
    verbose(f"Reading {bed_file}")
    f = open(bed_file, "r")
    for line in f:
        # parse the tab-separated bed file
        all_bed_info = line.rstrip().split("\t")
        cds_track = make_cds_track(line)  # we need CDS only
        cds_bed_info = cds_track.rstrip().split("\t")
        if len(all_bed_info) != 12 or len(cds_bed_info) != 12:
            # if there are not 12 fields - no guarantee that we parse what we want
            die(f"Error! Bed12 file {bed_file} is corrupted!")
        # extract the fields that we need
        chromStart = int(all_bed_info[1])  # gene start
        chromEnd = int(all_bed_info[2])  # and end
        # blocks represent exons
        all_blockSizes = [int(x) for x in all_bed_info[10].split(',') if x != '']
        cds_blockSizes = [int(x) for x in cds_bed_info[10].split(',') if x != '']
        # data to save
        gene_len = abs(chromStart - chromEnd)
        # the decision tree needs the number of exons
        # and the number of bases in the exonic and intronic fractions
        exons_num = len(all_blockSizes)
        exon_fraction = sum(all_blockSizes)  # including UTR
        cds_fraction = sum(cds_blockSizes)  # CDS only
        intron_fraction = gene_len - exon_fraction
        gene_name = all_bed_info[3]
        # save the data
        result[gene_name] = {
            "gene_len": gene_len,
            "exon_fraction": cds_fraction,
            "intron_fraction": intron_fraction,
            "exons_num": exons_num
        }
    f.close()
    verbose(f"Got data for {len(result.keys())} genes")
    return result
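# Hedged worked example (hypothetical numbers) of the quantities computed above:
# gene_len is the chromStart..chromEnd span, the exonic sums are taken with and without
# UTRs, and intron_fraction is the remainder of the gene span.
def _demo_bed_fractions():
    chromStart, chromEnd = 1000, 2000
    all_blockSizes = [150, 250]    # two exons, UTR included
    cds_blockSizes = [100, 200]    # the same exons, CDS part only
    gene_len = abs(chromStart - chromEnd)          # 1000
    exon_fraction = sum(all_blockSizes)            # 400
    cds_fraction = sum(cds_blockSizes)             # 300
    intron_fraction = gene_len - exon_fraction     # 600
    assert (gene_len, exon_fraction, cds_fraction, intron_fraction) == (1000, 400, 300, 600)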
def read_input(input_file):
    """Read the input."""
    # it must be either a file with chain TAB genes lines
    # or a single "chain<space>genes" string
    if os.path.isfile(input_file):
        tasks = {}
        f = open(input_file)
        for line in f:
            line_info = line[:-1].split("\t")
            chain = line_info[0]
            genes = line_info[1]
            tasks[chain] = genes
        f.close()
        return tasks
    elif len(input_file.split()) == 2:
        # it is not a file but a "chain<space>[,-separated list of genes]" string
        chain = input_file.split()[0]
        genes = input_file.split()[1]
        return {chain: genes}
    else:
        err_msg = "Error! Wrong input. Please provide either a file containing a chain-to-genes\n" \
                  "list or a \"chain<space>[comma-separated list of genes]\" formatted string"
        die(err_msg)
        return
def merge_cesar_output(input_dir, output_bed, output_fasta,
                       meta_data_arg, skipped_arg, prot_arg,
                       output_trash):
    """Merge multiple CESAR output files."""
    # check that the input dir is correct
    die(f"Error! {input_dir} is not a dir!") \
        if not os.path.isdir(input_dir) else None
    # get the list of bdb files (output of the CESAR part)
    bdbs = [x for x in os.listdir(input_dir) if x.endswith(".bdb")]
    # initiate lists for the different types of output:
    bed_summary = []
    fasta_summary = []
    trash_summary = []
    meta_summary = []
    prot_summary = []
    skipped = []
    all_ok = True

    task_size = len(bdbs)
    # extract data from all the files
    for num, bdb_file in enumerate(bdbs):
        # parse bdb files one by one
        bdb_path = os.path.join(input_dir, bdb_file)
        try:  # try to parse the data
            parsed_data = parse_cesar_bdb(bdb_path)
        except AssertionError:
            # if this happened: some assertion was violated
            # probably the CESAR output data is corrupted
            sys.exit(f"Error! Failed reading file {bdb_file}")
        # unpack the parsed data tuple:
        bed_lines = parsed_data[0]
        trash_exons = parsed_data[1]
        fasta_lines = parsed_data[2]
        meta_data = parsed_data[3]
        prot_fasta = parsed_data[4]
        skip = parsed_data[5]

        if len(bed_lines) == 0:
            # actually should not happen, but can
            eprint(f"Warning! {bdb_file} is empty")
            all_ok = False
            continue  # it is empty
        # append the data to the lists
        bed_summary.append("\n".join(bed_lines) + "\n")
        fasta_summary.append(fasta_lines)
        trash_summary.append("".join(trash_exons))
        meta_summary.append(meta_data)
        skipped.append(skip)
        prot_summary.append(prot_fasta)
        eprint(f"Reading file {num + 1}/{task_size}", end="\r")

    # save the output
    eprint("Saving the output")
    if len(bed_summary) == 0:
        # if so, no need to continue
        eprint("! merge_cesar_output.py:")
        die("No projections found! Abort.")
    # save bed, fasta and the rest
    with open(output_bed, "w") as f:
        f.write("".join(bed_summary))
    with open(output_fasta, "w") as f:
        f.write("".join(fasta_summary))
    with open(meta_data_arg, "w") as f:
        f.write("\n".join(meta_summary))
    with open(skipped_arg, "w") as f:
        f.write("\n".join(skipped))
    with open(prot_arg, "w") as f:
        f.write("\n".join(prot_summary))

    if output_trash:
        # if requested: provide the trash annotation
        f = open(output_trash, "w")
        f.write("".join(trash_summary))
        f.close()
    return all_ok
def prepare_bed_file(bed_file, output, ouf=False,
                     save_rejected=None, only_chrom=None):
    """Filter the given bed file and save the updated version."""
    new_lines = []  # keep the updated lines
    rejected = []  # keep IDs of skipped transcripts + the reason why
    names = Counter()  # we need to make sure that all names are unique

    f = open(bed_file, "r")
    for num, line in enumerate(f, 1):
        # parse the bed file according to the specification
        line_data = line.rstrip().split("\t")

        if len(line_data) != 12:
            f.close()  # this is for sure an error
            # it is possible only if something other than a bed12 was provided
            die(f"Error! Bed 12 file is required! Got a file with {len(line_data)} fields instead")

        chrom = line_data[0]
        if only_chrom and chrom != only_chrom:
            # TOGA allows to perform the analysis on a specific chromosome only
            # if so, we can skip all transcripts located on other chromosomes
            continue
        chromStart = int(line_data[1])
        chromEnd = int(line_data[2])
        name = line_data[3]  # gene_name usually
        # bed_score = int(line_data[4])  # never used
        # strand = line_data[5]  # otherwise:
        # strand = True if line_data[5] == '+' else False
        thickStart = int(line_data[6])
        thickEnd = int(line_data[7])
        # itemRgb = line_data[8]  # never used
        blockCount = int(line_data[9])
        blockSizes = [int(x) for x in line_data[10].split(',') if x != '']
        blockStarts = [int(x) for x in line_data[11].split(',') if x != '']
        blockEnds = [blockStarts[i] + blockSizes[i] for i in range(blockCount)]
        blockAbsStarts = [blockStarts[i] + chromStart for i in range(blockCount)]
        blockAbsEnds = [blockEnds[i] + chromStart for i in range(blockCount)]
        blockNewStarts, blockNewEnds = [], []
        names[name] += 1

        if thickStart > thickEnd:
            f.close()  # according to the bed12 specification this should never happen
            sys.stderr.write(f"Problem occurred at line {num}, gene {name}\n")
            die("Error! Bed file is corrupted, thickEnd MUST be >= thickStart")
        elif thickStart == thickEnd:
            # this means that this is a non-coding transcript
            # TOGA cannot process them: we can skip it
            rejected.append((name, "No CDS"))
            continue

        if thickStart < chromStart or thickEnd > chromEnd:
            # a very strange (but still possible) case
            f.close()  # for sure an error with the input data
            sys.stderr.write(f"Problem occurred at line {num}, gene {name}\n")
            die("Error! Bed file is corrupted, thickRange is outside chromRange!")

        # now select CDS only
        # we keep UTRs in the filtered file
        # however, we need the CDS to check whether it is correct (% 3 == 0)
        for block_num in range(blockCount):
            blockStart = blockAbsStarts[block_num]
            blockEnd = blockAbsEnds[block_num]

            # skip the block if it is entirely UTR
            if blockEnd <= thickStart:
                continue
            elif blockStart >= thickEnd:
                continue

            # if we are here: this is not an entirely-UTR exon
            # it might intersect the CDS border or lie in the CDS entirely
            # remove UTRs: block start must be >= CDS_start (thickStart)
            # block end must be <= CDS_end (thickEnd)
            blockNewStart = blockStart if blockStart >= thickStart else thickStart
            blockNewEnd = blockEnd if blockEnd <= thickEnd else thickEnd
            blockNewStarts.append(blockNewStart - thickStart)
            blockNewEnds.append(blockNewEnd - thickStart)

        if len(blockNewStarts) == 0:
            # even if thickStart != thickEnd this transcript can still be non-coding
            # but if there are no blocks in the CDS -> we can catch this
            rejected.append((name, "No CDS"))
            continue

        block_new_count = len(blockNewStarts)
        blockNewSizes = [blockNewEnds[i] - blockNewStarts[i]
                         for i in range(block_new_count)]

        if sum(blockNewSizes) % 3 != 0 and not ouf:
            # this is an out-of-frame (or incomplete) transcript
            # ideally the CDS length should be divisible by 3
            # the ouf flag means that we want to keep such transcripts anyway
            rejected.append((name, "Out-of-frame gene"))
            continue

        # if there are non-unique transcript IDs: die
        # the check is done here, not when the name is first counted,
        # so that duplicates are reported together
        if any(v > 1 for v in names.values()):
            eprint("Error! There are non-uniq transcript IDs:")
            for k, v in names.items():
                eprint(k) if v > 1 else None
            die("Abort")

        # we keep this transcript: add it to the list
        new_line = "\t".join([str(x) for x in line_data])
        new_lines.append(new_line)
    f.close()

    if len(new_lines) == 0:
        # no transcripts passed the filter: probably a mistake in the input data
        sys.exit("Error! No reference annotation tracks left after the filtering procedure! Abort")
    # write the transcripts that passed the filter to the output file
    f = open(output, "w") if output != "stdout" else sys.stdout
    f.write("\n".join(new_lines) + "\n")
    f.close() if output != "stdout" else None

    if save_rejected:
        # save the transcripts that didn't pass the filter + the reason why
        f = open(save_rejected, "w")
        for elem in rejected:
            f.write(f"{elem[0]}\t{elem[1]}\n")
        f.close()
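# Hedged worked example (hypothetical coordinates) of the CDS clipping performed above:
# a block overlapping thickStart is trimmed to the CDS, and the stored coordinates are
# made relative to thickStart.
def _demo_cds_clipping():
    thickStart, thickEnd = 1050, 1500
    blockStart, blockEnd = 1000, 1100      # exon overlapping the UTR/CDS border
    blockNewStart = blockStart if blockStart >= thickStart else thickStart  # 1050
    blockNewEnd = blockEnd if blockEnd <= thickEnd else thickEnd            # 1100
    rel = (blockNewStart - thickStart, blockNewEnd - thickStart)
    assert rel == (0, 50)  # 50 CDS bases survive from this block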
def main():
    """Entry point."""
    t0 = dt.now()
    args = parse_args()
    os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"  # otherwise it could crash

    # by default we create CESAR jobs for chains with "orth" or "trans" class
    # but the user could select another set of chain classes
    fields = "ORTH,TRANS" if args.fields is None else args.fields

    # read U12 introns: to create a list of U12-containing genes
    # needed to make the subsequent commands
    u12_data = read_u12_data(args.u12)

    # get lists of orthologous chains per each gene
    # skipped_1 - no chains found -> log them
    batch, chain_gene_field, skipped_1 = read_orthologs(args.orthologs_file,
                                                        fields,
                                                        only_o2o=args.o2o_only)
    # split cesar jobs into different buckets (if the user requested so)
    # e.g. put all jobs that require < 5Gb into bucket 1,
    # jobs requiring 5 to 15Gb into bucket 2 and so on
    # CESAR might be very memory-consuming -> so we care about this
    mem_limit, buckets = define_buckets(args.mem_limit, args.buckets)

    # load reference bed file data; coordinates and exon sizes
    bed_data = read_bed(args.bed_file)
    # check that the cesar binary exists
    die(f"Error! Cannot find cesar executable at {args.cesar_binary}!") if \
        not os.path.isfile(args.cesar_binary) else None

    # pre-compute chain : gene : region data
    # collect the second list of skipped genes
    # skipped_2 -> too long corresponding regions in the query
    regions, skipped_2 = precompute_regions(batch,
                                            bed_data,
                                            args.bdb_chain_file,
                                            chain_gene_field,
                                            args.chains_limit)
    # start making the jobs
    all_jobs = {}
    skipped_3 = []

    for gene in batch.keys():
        u12_this_gene = u12_data.get(gene)
        block_sizes = bed_data[gene][3]

        # proceed to memory estimation
        # the same procedure as inside the CESAR2.0 code
        num_states, r_length = 0, 0

        # required memory depends on numerous params
        # first, we need the reference transcript-related parameters
        # query-related parameters come later
        for block_size in block_sizes:
            # num_states += 6 + 6 * reference->num_codons + 1 + 2 + 2 + 22 + 6;
            # /* 22 and 6 for acc and donor states */
            num_codons = block_size // 3
            num_states += 6 + 6 * num_codons + 1 + 2 + 2 + 22 + 6
            # r_length += 11 + 6 * fasta.references[i]->length
            # + donors[i]->length + acceptors[i]->length;
            r_length += block_size

        gene_chains_data = regions.get(gene)
        # check that there is something for this gene
        if not gene_chains_data:
            continue
        elif len(gene_chains_data) == 0:
            continue

        chains = gene_chains_data.keys()
        chains_arg = ",".join(chains)  # chain ids -> one of the cmd args

        # now compute the query sequence-related parameters
        query_lens = [v for v in gene_chains_data.values()]
        q_length_max = max(query_lens)
        # and now compute the amount of required memory
        memory = (num_states * 4 * 8) + \
                 (num_states * q_length_max * 4) + \
                 (num_states * 304) + \
                 (2 * q_length_max + r_length) * 8 + \
                 (q_length_max + r_length) * 2 * 1 + EXTRA_MEM
        # convert to gigs + 0.25 extra gig
        gig = math.ceil(memory / 1000000000) + 0.25
        if gig > mem_limit:
            # it is going to consume TOO much memory
            # skip this gene -> save to the log
            skipped_3.append((gene, ",".join(chains),
                              f"memory limit ({mem_limit} gig) exceeded (needs {gig})"))
            continue

        # template positional args:
        # 0 gene; 1 chains; 2 bdb bed_file; 3 bdb chain_file; 4 tDB; 5 qDB;
        # 6 memory; 7 cesar_bin; 8 uhq_flank
        job = WRAPPER_TEMPLATE.format(gene,
                                      chains_arg,
                                      os.path.abspath(args.bdb_bed_file),
                                      os.path.abspath(args.bdb_chain_file),
                                      os.path.abspath(args.tDB),
                                      os.path.abspath(args.qDB),
                                      gig,
                                      os.path.abspath(args.cesar_binary),
                                      args.uhq_flank)
        # add some flags if required
        job = job + " --mask_stops" if args.mask_stops else job
        job = job + " --check_loss" if args.check_loss else job
        job = job + " --no_fpi" if args.no_fpi else job

        # add U12 introns data if this gene has them:
        job = job + f" --u12 {os.path.abspath(args.u12)}" if u12_this_gene else job
        all_jobs[job] = gig

    eprint(f"\nThere are {len(all_jobs.keys())} jobs in total.")
    eprint("Splitting the jobs.")
    # split jobs into buckets | compute proportions
    filled_buckets = fill_buckets(buckets, all_jobs)
    prop_sum = sum([k * len(v) for k, v in filled_buckets.items()])
    # estimate the proportion of each bucket in the runtime
    buckets_prop = {k: (k * len(v)) / prop_sum for k, v in filled_buckets.items()} \
        if 0 not in filled_buckets.keys() else {0: 1.0}
    eprint("Bucket proportions are:")
    eprint("\n".join([f"{k} -> {v}" for k, v in buckets_prop.items()]))
    # get the number of jobs for each bucket
    bucket_jobs_num = {k: math.ceil(args.jobs_num * v) for k, v in buckets_prop.items()}
    # save jobs, get comb lines
    to_combine = save_jobs(filled_buckets, bucket_jobs_num, args.jobs_dir)
    # save combined jobs; combined is a file containing paths to separate jobs
    os.mkdir(args.results) if not os.path.isdir(args.results) else None
    os.mkdir(args.check_loss) if args.check_loss \
        and not os.path.isdir(args.check_loss) else None

    f = open(args.combined, "w")
    for num, comb in enumerate(to_combine, 1):
        basename = os.path.basename(comb).split(".")[0]
        results_path = os.path.abspath(os.path.join(args.results, basename + ".bdb"))
        combined_command = f"{CESAR_RUNNER} {comb} {results_path}"
        if args.check_loss:
            loss_data_path = os.path.join(args.check_loss,
                                          f"{basename}.inact_mut.txt")
            combined_command += f" --check_loss {loss_data_path}"
        if args.rejected_log:
            log_path = os.path.join(args.rejected_log, f"{num}.txt")
            combined_command += f" --rejected_log {log_path}"
        f.write(combined_command + "\n")
    f.close()

    # save skipped genes if required
    if args.skipped_genes:
        skipped = skipped_1 + skipped_2 + skipped_3
        f = open(args.skipped_genes, "w")
        # usually we have gene + the reason why it was skipped
        # we split them with a tab
        f.write("\n".join(["\t".join(x) for x in skipped]) + "\n")
        f.close()

    f = open(args.paralogs_log, "w")
    # save the IDs of paralogous projections
    for k, v in chain_gene_field.items():
        if v != "PARALOG":
            continue
        gene_ = f"{k[1]}.{k[0]}\n"
        f.write(gene_)
    f.close()

    eprint(f"Estimated time: {dt.now() - t0}")
    sys.exit(0)
def read_orthologs(orthologs_file, fields_raw, only_o2o=False):
    """Read the orthologs file."""
    # convert the fields param string to a list
    fields = [x.upper() for x in fields_raw.split(",") if x != ""]
    genes_chains = {}
    chain_gene_field = {}
    skipped = []  # genes skipped at this stage

    f = open(orthologs_file, "r")  # open the file
    f.__next__()  # skip the header
    # first column: transcript identifier
    # then: chain class fields (column 2 - orthologous chains, 3 - paralogous, etc.)
    for line in f:
        # parse the line
        line_info = line[:-1].split("\t")
        # "0" is a placeholder meaning "no chains there"
        gene = line_info[0]
        selected, chains = [], {}

        chains["ORTH"] = [x for x in line_info[1].split(",") if x != "0"]
        chains["PARA"] = [x for x in line_info[2].split(",") if x != "0"]
        chains["TRANS"] = [x for x in line_info[3].split(",") if x != "0"]
        # the processed pseudogenes column is ignored -> they are processed separately

        all_chains = chains["ORTH"] + chains["PARA"] + chains["TRANS"]
        if len(all_chains) == 0:
            # no way of running CESAR on this gene
            # because there are no chains we could use
            skipped.append((gene, "0", "No chains intersecting the gene"))
            continue

        # the user can ask to process only genes that have a single orthologous chain
        # here we check that this is the case
        not_one2one = len(chains["ORTH"]) == 0 or len(chains["ORTH"]) > 1
        if only_o2o and not_one2one:
            # we requested only a single orthologous chain
            skipped.append((gene, "0", "Only one2one requested, this gene didn't pass"))
            continue

        # get the chains chosen in FIELDS
        for field in fields:
            # field is most likely "ORTH" or "TRANS"
            field_chains = chains.get(field)
            if not field_chains:
                continue
            selected.extend(field_chains)
            for chain in field_chains:
                key = (chain, gene)
                chain_gene_field[key] = field

        # if a gene has no chains in the requested fields, fall back to
        # the remaining (e.g. paralogous) chains and, of course, log this
        if not selected:
            selected = all_chains.copy()
            keys = [(chain, gene) for chain in selected]
            for key in keys:
                chain_gene_field[key] = "PARALOG"

        # write to the dict: gene -> the chains we will use
        genes_chains[gene] = selected

    f.close()
    die("Error! No gene:chains pairs selected! Probably the --fields parameter is wrong!") \
        if len(genes_chains) == 0 else None
    return genes_chains, chain_gene_field, skipped
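# Hedged usage sketch (hypothetical): read_orthologs expects a tab-separated table with a
# header and, per transcript, comma-separated chain IDs in the ORTH, PARA and TRANS columns
# ("0" meaning "none"). The file content below is invented for illustration.
def _demo_read_orthologs():
    import tempfile
    content = ("transcript\torth\tpara\ttrans\tp_pgenes\n"
               "ENST0001\t101,102\t0\t0\t0\n"
               "ENST0002\t0\t201\t0\t0\n")
    with tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False) as tmp:
        tmp.write(content)
        path = tmp.name
    genes_chains, chain_gene_field, skipped = read_orthologs(path, "ORTH,TRANS")
    assert genes_chains["ENST0001"] == ["101", "102"]
    assert chain_gene_field[("101", "ENST0001")] == "ORTH"
    assert chain_gene_field[("201", "ENST0002")] == "PARALOG"  # fallback to paralogs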
def check_args(args):
    """Check if the args are correct, fill the global dict."""
    # check the directories
    global VERBOSE  # set the verbosity level
    VERBOSE = True if args.verbose else False
    WORK_DATA["vv"] = True if args.vv else False

    try:
        # check the directories, create them if necessary
        os.mkdir(args.jobs) if not os.path.isdir(args.jobs) else None
        os.mkdir(args.results_dir) if not os.path.isdir(args.results_dir) else None
        os.mkdir(args.errors_dir) \
            if args.errors_dir and not os.path.isdir(args.errors_dir) \
            else None

        WORK_DATA["jobs"] = args.jobs
        WORK_DATA["results_dir"] = args.results_dir
        WORK_DATA["errors_dir"] = args.errors_dir
        verbose(f"Directories in use: {args.jobs} {args.results_dir} {args.errors_dir}")

    except FileNotFoundError as grepexc:
        # one of those tasks failed
        eprint(f"Arguments are corrupted!\n{str(grepexc)}")
        die("Cannot create one of the directories requested.")

    # define the chain and bed files
    WORK_DATA["chain_file"] = args.chain_file if os.path.isfile(args.chain_file) \
        else die(f"Error! Chain file {args.chain_file} is wrong!")
    WORK_DATA["bed_file"] = args.bed_file if os.path.isfile(args.bed_file) \
        else die(f"Error! Bed file {args.bed_file} is wrong!")
    verbose(f"Using bed file {args.bed_file} and chain file {args.chain_file}")

    # look for the chain index file
    index_file = args.index_file if args.index_file \
        else args.chain_file.replace(".chain", ".chain_ID_position")

    if os.path.isfile(index_file):
        # check if the index file is here
        WORK_DATA["index_file"] = index_file
        verbose(f"And {index_file} as an index file")
    elif args.make_index:
        # create the index if it does not exist
        eprint("make_index in progress...")
        idbb_cmd = f"/modules/chain_bdb_index.py {args.chain_file} {index_file}"
        call_proc(idbb_cmd)
        WORK_DATA["index_file"] = index_file
    else:  # die
        die(f"Error! Cannot find index file at {index_file}\n"
            "Please define it manually")

    # define the number of jobs
    if args.job_size:  # easy:
        WORK_DATA["job_size"] = args.job_size
        WORK_DATA["jobs_num"] = None
    else:
        # we must compute how many jobs to put into one cluster job
        WORK_DATA["job_size"] = None
        WORK_DATA["jobs_num"] = args.jobs_num
    WORK_DATA["bed_index"] = args.bed_index

    # some defaults
    WORK_DATA["jobs_file"] = args.jobs_file
    WORK_DATA["ref"] = args.ref
    # check if we are on a cluster
    WORK_DATA["on_cluster"] = True

    verbose("Program-wide dictionary looks like:\n")
    for k, v in WORK_DATA.items():
        verbose(f"{k}: {v}")
def call_proc(cmd):
    """Call a subprocess and catch errors."""
    rc = subprocess.call(cmd, shell=True)
    if rc != 0:
        die(f"Error! Process {cmd} died! Abort.")