def check_args(chain, genes, chain_file, bed_file, verbose_level, work_data, result):
    """Check if arguments are correct, extract initial data if so.

    Nothing is returned: ``work_data`` and ``result`` are filled in place.

    Args:
        chain: chain identifier; stored as is under ``work_data["chain_id"]``
            and converted with ``int()`` for the chain lookup.
        genes: comma-separated string of gene identifiers (empty items
            are ignored).
        chain_file: chain source handed to ``chainExtractID``.
        bed_file: bed source handed to ``bedExtractID``.
        verbose_level: any truthy value switches the module-level
            ``VERBOSE`` flag on.
        work_data: dict that receives ``chain_id``, ``bed``, ``genes``,
            ``chain``, ``chain_QLen``, ``chain_Tstarts`` and ``chain_Tends``.
        result: dict that receives ``chain_global_score`` and ``chain_len``.
    """
    global VERBOSE
    # set verbosity level
    VERBOSE = bool(verbose_level)
    verbose(f"# unit.py called for chain {chain} and genes {genes}")
    verbose(f"Using {bed_file} and {chain_file}")
    work_data["chain_id"] = chain

    # extract bed records for the requested genes
    raw_genes = [x for x in genes.split(",") if x != ""]
    bed_lines = bedExtractID(bed_file, raw_genes)
    work_data["bed"] = bed_lines

    # the piece after the last "\n" is dropped, exactly as before;
    # split only once and reuse the result below
    bed_records = bed_lines.split("\n")[:-1]
    work_data["genes"] = [x.split("\t")[3] for x in bed_records]

    # warn (but do not fail) if some of the requested genes were not found
    if len(raw_genes) != len(bed_records):
        found_genes = set(work_data["genes"])  # O(1) membership tests
        missed_genes = [x for x in raw_genes if x not in found_genes]
        eprint("Warning. Not all the genes you set were found!\n")
        eprint(f"You set {len(raw_genes)} genes, {len(bed_records)} extracted")
        eprint("Genes missed:\n{0}".format(",".join(missed_genes)))

    work_data["chain"] = chainExtractID(chain_file, int(chain))

    # parse chain header: the first line of the chain, whitespace-separated
    chain_header = work_data["chain"].split("\n")[0].split()
    verbose(f"Chain header is:\n{chain_header}")
    q_start = int(chain_header[10])
    q_end = int(chain_header[11])
    work_data["chain_QLen"] = abs(q_end - q_start)
    work_data["chain_Tstarts"] = int(chain_header[5])
    work_data["chain_Tends"] = int(chain_header[6])
    result["chain_global_score"] = int(chain_header[1])
    result["chain_len"] = work_data["chain_Tends"] - work_data["chain_Tstarts"]
def get_corr_q_regions(gene_to_pp_chains, chain_bdb, bed_bdb):
    """Create projection: q region dict.

    For every gene and each of its chains the result holds an entry
    ``"<gene>.<chain_id>" -> (q_chrom, strand, q_start, q_end)`` built from
    the chain header and the strand column of the gene's bed record.
    """
    regions = {}
    for gene_name, chain_ids in gene_to_pp_chains.items():
        # only the strand (6th bed column) is needed from the gene track
        bed_fields = bedExtractID(bed_bdb, gene_name).rstrip().split("\t")
        strand_of_gene = bed_fields[5]

        for chain_id in chain_ids:
            proj_name = f"{gene_name}.{chain_id}"
            first_line = chainExtractID(chain_bdb, chain_id).split("\n")[0]
            header = first_line.split()

            chrom = header[7]
            size = int(header[8])
            strand = header[9]
            start = int(header[10])
            end = int(header[11])

            if strand == "-":
                # coordinates on the minus strand are flipped using the size
                start, end = size - end, size - start

            strand_out = "-" if strand != strand_of_gene else "+"
            regions[proj_name] = (chrom, strand_out, start, end)
    return regions
def get_corr_q_regions(gene_to_pp_chains, chain_bdb, bed_bdb):
    """Map each projection to its region in the query.

    We assume the following:
    1) processed pseudogene is a "single-exon" element
    2) ppgene chain covers the ppgene and nothing else.

    Returns a dict keyed by "<gene>.<chain_id>" whose values are
    (q_chrom, projection_strand, q_start, q_end) tuples.
    """
    regions_by_projection = {}
    # gene_to_pp_chains holds gene: [chain ids] pairs
    for gene_id, chain_ids in gene_to_pp_chains.items():
        # the gene track is fetched only to learn the gene strand
        bed_record = bedExtractID(bed_bdb, gene_id)
        bed_fields = bed_record.rstrip().split("\t")
        strand_of_gene = bed_fields[5]

        for chain_id in chain_ids:
            # projections are named the usual way: gene.chain
            projection_name = f"{gene_id}.{chain_id}"

            # the first line of the chain body is its header
            chain_text = chainExtractID(chain_bdb, chain_id)
            header_line = chain_text.split("\n")[0]
            header = header_line.split()

            # query chrom, size, strand, start and end
            query_chrom = header[7]
            query_size = int(header[8])
            query_strand = header[9]
            query_start = int(header[10])
            query_end = int(header[11])

            if query_strand == "-":
                # minus-strand coordinates must be inverted,
                # see chains documentation for details
                inverted_start = query_size - query_end
                inverted_end = query_size - query_start
                query_start, query_end = inverted_start, inverted_end

            # projection strand is "+" when chain and gene strands agree
            if query_strand == strand_of_gene:
                projection_strand = "+"
            else:
                projection_strand = "-"

            regions_by_projection[projection_name] = (
                query_chrom,
                projection_strand,
                query_start,
                query_end,
            )
    return regions_by_projection
def precompute_regions(batch, bed_data, bdb_chain_file, chain_gene_field, limit):
    """Precompute region for each chain: bed pair.

    Args:
        batch: dict gene -> collection of chain ids (strings convertible
            with ``int()``).
        bed_data: dict gene -> sequence whose first three items are used
            as chrom, start and end of the gene.
        bdb_chain_file: chain source handed to ``chainExtractID``.
        chain_gene_field: dict (chain_id, gene) -> field label, compared
            against ``LONG_LOCI_FIELDS``.
        limit: maximal number of chains kept per gene (those with the
            smallest numeric ids are kept).

    Returns:
        Tuple ``(gene_chain_grange, skipped)``: a defaultdict
        gene -> {chain_id: query region length} and a list of
        ``(gene, chains, reason)`` tuples describing what was left out.
    """
    eprint("Precompute regions for each gene:chain pair...")
    chain_to_genes, skipped = defaultdict(list), []

    # revert the dict, from gene2chain to chain2genes
    for gene, chains in batch.items():
        if len(chains) == 0:
            skipped.append((gene, ",".join(chains), "no orthologous chains"))
            continue

        ordered_chains = sorted(chains, key=int)
        kept_chains = ordered_chains[:limit]
        if len(chains) > limit:
            # report the chains that did not fit into the limit; they must
            # be sliced from the full sorted list, not from the truncated
            # one (which would always give an empty record)
            skipped.append(
                (
                    gene,
                    ",".join(ordered_chains[limit:]),
                    f"number of chains ({limit} chains) limit exceeded",
                )
            )

        for chain in kept_chains:
            chain_to_genes[chain].append(gene)

    # read regions themselves
    gene_chain_grange = defaultdict(dict)
    chains_num = len(chain_to_genes)

    for iter_num, (chain_id, genes) in enumerate(chain_to_genes.items(), 1):
        # extract chain itself
        chain_body = chainExtractID(bdb_chain_file, chain_id).encode()

        # genomic coordinates of each gene, formatted as chrom:start-end
        all_gene_ranges = []
        for gene in genes:
            gene_data = bed_data.get(gene)
            all_gene_ranges.append(f"{gene_data[0]}:{gene_data[1]}-{gene_data[2]}")

        # we need to get corresponding regions in the query:
        # use chain_coords_converter shared library to convert
        # target -> query coordinates via chain.
        # first need to convert to C-types
        c_chain = ctypes.c_char_p(chain_body)
        c_shift = ctypes.c_int(2)
        granges_bytes = [s.encode("utf-8") for s in all_gene_ranges]
        granges_num = len(all_gene_ranges)
        c_granges_num = ctypes.c_int(granges_num)
        # one extra slot so that the array ends with a NULL pointer
        granges_arr = (ctypes.c_char_p * (granges_num + 1))()
        granges_arr[:-1] = granges_bytes
        granges_arr[granges_num] = None

        # then call the function
        raw_ch_conv_out = ch_lib.chain_coords_converter(
            c_chain, c_shift, c_granges_num, granges_arr
        )
        # convert C output to python-readable type
        chain_coords_conv_out = [
            raw_ch_conv_out[i].decode("utf-8") for i in range(granges_num + 1)
        ]

        # the first output line is not a per-gene record and is skipped
        for line in chain_coords_conv_out[1:]:
            # one line per gene, in the same order:
            # region num and two regions formatted as chrom:start-end
            # NOTE(review): the original comment says "region in reference,
            # region in query" while the variables below are named the
            # other way round -- confirm against the converter's output.
            line_info = line.rstrip().split()
            num = int(line_info[0])
            q_grange = line_info[1].split(":")[1].split("-")
            q_start, q_end = int(q_grange[0]), int(q_grange[1])
            que_len = q_end - q_start
            t_grange = line_info[2].split(":")[1].split("-")
            t_start, t_end = int(t_grange[0]), int(t_grange[1])
            tar_len = t_end - t_start
            len_delta = abs(tar_len - que_len)
            # raises ZeroDivisionError if the second region has zero length
            delta_gene_times = len_delta / tar_len
            gene = genes[num]
            field = chain_gene_field.get((chain_id, gene))

            # check that corresponding region in the query is not too long
            # if so: skip this
            high_rel_len = delta_gene_times > REL_LENGTH_THR
            high_abs_len = len_delta > ABS_LENGTH_TRH
            long_loci_field = field in LONG_LOCI_FIELDS
            if (high_rel_len or high_abs_len) and long_loci_field:
                skipped.append((gene, chain_id, "too long query locus"))
                continue

            # for each chain-gene pair save query region length
            # need this for required memory estimation
            gene_chain_grange[gene][chain_id] = que_len

        del raw_ch_conv_out  # not sure if necessary but...
        # verbosity
        eprint(f"Chain {iter_num} / {chains_num}", end="\r")

    return gene_chain_grange, skipped
def precompute_regions(batch, bed_data, bdb_chain_file, chain_gene_field, limit):
    """Precompute region for each chain: bed pair.

    ``batch`` maps a gene to its chain ids, ``bed_data`` maps a gene to a
    record starting with chrom, start, end, and ``chain_gene_field`` maps
    ``(chain_id, gene)`` to a field label. At most ``limit`` chains (those
    with the smallest numeric ids) are processed per gene.

    Returns ``(gene_chain_grange, skipped)`` where ``gene_chain_grange`` is
    a defaultdict gene -> {chain_id: query region length} and ``skipped`` is
    a list of ``(gene, chains, reason)`` tuples.
    """
    eprint("Precompute regions for each gene:chain pair...")
    chain_to_genes, skipped = defaultdict(list), []

    # revert the dict: gene -> chains becomes chain -> genes
    for gene, chains in batch.items():
        if len(chains) == 0:
            skipped.append((gene, ",".join(chains), "no orthologous chains"))
            continue

        chains_sorted = sorted(chains, key=int)
        if len(chains) > limit:
            # slice the overflow from the complete sorted list: slicing the
            # already truncated list always produced an empty record
            overflow = chains_sorted[limit:]
            skipped.append(
                (
                    gene,
                    ",".join(overflow),
                    f"number of chains ({limit} chains) limit exceeded",
                )
            )

        for chain in chains_sorted[:limit]:
            chain_to_genes[chain].append(gene)

    # read regions themselves
    gene_chain_grange = defaultdict(dict)
    chains_num, iter_num = len(chain_to_genes), 0

    for chain_id, genes in chain_to_genes.items():
        # extract chain itself + get ranges for genes
        chain_body = chainExtractID(bdb_chain_file, chain_id).encode()
        all_gene_ranges = []
        for gene in genes:
            gene_data = bed_data.get(gene)
            grange = f"{gene_data[0]}:{gene_data[1]}-{gene_data[2]}"
            all_gene_ranges.append(grange)

        # using shared lib to get corresponding regions
        # we need to convert python datatypes to C types
        c_chain = ctypes.c_char_p(chain_body)
        c_shift = ctypes.c_int(2)
        granges_bytes = [s.encode("utf-8") for s in all_gene_ranges]
        granges_num = len(all_gene_ranges)
        c_granges_num = ctypes.c_int(granges_num)
        # the array gets one extra, NULL-valued, trailing element
        granges_arr = (ctypes.c_char_p * (granges_num + 1))()
        granges_arr[:-1] = granges_bytes
        granges_arr[granges_num] = None

        # then call the function
        raw_ch_conv_out = ch_lib.chain_coords_converter(
            c_chain, c_shift, c_granges_num, granges_arr
        )

        # convert C output to python-readable type
        chain_coords_conv_out = []
        for i in range(granges_num + 1):
            chain_coords_conv_out.append(raw_ch_conv_out[i].decode("utf-8"))

        # the first output line is skipped; the rest are parsed one by one
        for line in chain_coords_conv_out[1:]:
            # rstrip() instead of [:-1]: never cut a meaningful character
            # when the line has no trailing newline
            line_info = line.rstrip().split()
            num = int(line_info[0])  # index of the gene in `genes`
            # both regions are formatted as chrom:start-end
            q_grange = line_info[1].split(":")[1].split("-")
            q_start, q_end = int(q_grange[0]), int(q_grange[1])
            que_len = q_end - q_start
            t_grange = line_info[2].split(":")[1].split("-")
            t_start, t_end = int(t_grange[0]), int(t_grange[1])
            tar_len = t_end - t_start
            len_delta = abs(tar_len - que_len)
            # raises ZeroDivisionError if the second region has zero length
            delta_gene_times = len_delta / tar_len
            gene = genes[num]
            field = chain_gene_field.get((chain_id, gene))

            # skip pairs whose length difference exceeds a threshold,
            # but only for fields listed in LONG_LOCI_FIELDS
            high_rel_len = delta_gene_times > REL_LENGTH_THR
            high_abs_len = len_delta > ABS_LENGTH_TRH
            long_loci_field = field in LONG_LOCI_FIELDS
            if (high_rel_len or high_abs_len) and long_loci_field:
                skipped.append((gene, chain_id, "too long query locus"))
                continue

            gene_chain_grange[gene][chain_id] = que_len

        # not sure if necessary but...
        del raw_ch_conv_out
        iter_num += 1
        # progress line, overwritten in place thanks to "\r"
        eprint(f"Chain {iter_num} / {chains_num}", end="\r")

    return gene_chain_grange, skipped