Example #1
0
def check_args(chain, genes, chain_file, bed_file, verbose_level, work_data,
               result):
    """Validate the inputs and load the initial chain and gene data.

    ``work_data`` receives the chain id, the extracted bed text, the gene
    names found in that text, the chain text and several values parsed from
    the chain header. ``result`` receives the chain score and the chain span
    (header field 6 minus header field 5).

    Args:
        chain: chain identifier; converted with int() before extraction.
        genes: comma-separated string of gene identifiers.
        chain_file: handed to chainExtractID as the chain source.
        bed_file: handed to bedExtractID as the bed source.
        verbose_level: any truthy value turns the global VERBOSE flag on.
        work_data: dict, updated in place.
        result: dict, updated in place.
    """
    global VERBOSE  # module-wide verbosity switch
    VERBOSE = bool(verbose_level)
    verbose("# unit.py called for chain {} and genes {}".format(chain, genes))
    verbose(f"Using {bed_file} and {chain_file}")
    work_data["chain_id"] = chain

    # gene part: drop empty items produced by stray commas
    requested = [name for name in genes.split(",") if name]
    bed_text = bedExtractID(bed_file, requested)
    work_data["bed"] = bed_text
    # the piece after the last newline is dropped by the [:-1] slice
    bed_rows = bed_text.split("\n")[:-1]
    work_data["genes"] = [row.split("\t")[3] for row in bed_rows]

    # warn (but do not stop) if some requested genes were not extracted
    if len(requested) != len(bed_rows):
        eprint("Warning. Not all the genes you set were found!\n")
        eprint("You set {0} genes, {1} extracted".format(
            len(requested), len(bed_rows)))
        missed = [name for name in requested if name not in work_data["genes"]]
        eprint("Genes missed:\n{0}".format(",".join(missed)))

    # chain part
    work_data["chain"] = chainExtractID(chain_file, int(chain))

    # the first line of the chain text is the header, whitespace-separated
    header_fields = work_data["chain"].partition("\n")[0].split()
    verbose("Chain header is:\n{0}".format(header_fields))
    query_start = int(header_fields[10])
    query_end = int(header_fields[11])
    work_data["chain_QLen"] = abs(query_end - query_start)
    work_data["chain_Tstarts"] = int(header_fields[5])
    work_data["chain_Tends"] = int(header_fields[6])
    result["chain_global_score"] = int(header_fields[1])
    result["chain_len"] = work_data["chain_Tends"] - work_data["chain_Tstarts"]
Example #2
0
def get_corr_q_regions(gene_to_pp_chains, chain_bdb, bed_bdb):
    """Build a dict mapping each projection to its region in the query.

    Args:
        gene_to_pp_chains: dict gene -> iterable of chain ids.
        chain_bdb: handed to chainExtractID as the chain source.
        bed_bdb: handed to bedExtractID as the bed source.

    Returns:
        dict "{gene}.{chain_id}" -> (q_chrom, strand, q_start, q_end), where
        strand is "+" if the chain query strand equals the gene strand and
        "-" otherwise.
    """
    regions = {}
    for gene, chain_ids in gene_to_pp_chains.items():
        # only the strand (7th tab-separated field) of the bed record is used
        bed_fields = bedExtractID(bed_bdb, gene).rstrip().split("\t")
        gene_strand = bed_fields[5]
        for chain_id in chain_ids:
            proj_name = f"{gene}.{chain_id}"
            # the first line of the chain text is its header
            header = chainExtractID(chain_bdb, chain_id).split("\n")[0].split()
            q_chrom = header[7]
            q_size = int(header[8])
            q_strand = header[9]
            q_start = int(header[10])
            q_end = int(header[11])
            if q_strand == "-":
                # flip the interval relative to the query sequence size
                q_start, q_end = q_size - q_end, q_size - q_start
            strand = "+" if q_strand == gene_strand else "-"
            regions[proj_name] = (q_chrom, strand, q_start, q_end)
    return regions
def get_corr_q_regions(gene_to_pp_chains, chain_bdb, bed_bdb):
    """Create the projection -> query region dictionary.

    Assumptions made here:
    1) a processed pseudogene is a "single-exon" element;
    2) a ppgene chain covers the ppgene and nothing else.

    Args:
        gene_to_pp_chains: dict gene -> iterable of chain ids.
        chain_bdb: handed to chainExtractID as the chain source.
        bed_bdb: handed to bedExtractID as the bed source.

    Returns:
        dict keyed by "{gene}.{chain_id}" with values
        (q_chrom, projection_strand, q_start, q_end).
    """
    result = {}
    for gene, chain_ids in gene_to_pp_chains.items():
        # from the gene track only the strand is required
        track_fields = bedExtractID(bed_bdb, gene).rstrip().split("\t")
        gene_strand = track_fields[5]

        for chain_id in chain_ids:
            key = f"{gene}.{chain_id}"  # usual projection naming
            chain_text = chainExtractID(chain_bdb, chain_id)
            # header is the first line: take chrom, size, strand, start, end
            fields = chain_text.split("\n")[0].split()
            q_chrom, q_size, q_strand = fields[7], int(fields[8]), fields[9]
            q_start = int(fields[10])
            q_end = int(fields[11])

            if q_strand == "-":
                # minus-strand coordinates are inverted against q_size
                # (see the chain format documentation)
                q_start, q_end = q_size - q_end, q_size - q_start

            if q_strand == gene_strand:
                proj_strand = "+"
            else:
                proj_strand = "-"
            result[key] = (q_chrom, proj_strand, q_start, q_end)
    return result
def precompute_regions(batch, bed_data, bdb_chain_file, chain_gene_field, limit):
    """Precompute the query region length for each chain: gene pair.

    Args:
        batch: dict gene -> collection of chain ids. The ids are sorted with
            int() and joined with ",", so they are expected to be strings
            holding integers.
        bed_data: dict gene -> sequence whose items 0, 1, 2 are used as
            chrom, start and end of the gene.
        bdb_chain_file: handed to chainExtractID as the chain source.
        chain_gene_field: dict (chain_id, gene) -> field label, compared
            against LONG_LOCI_FIELDS.
        limit: maximal number of chains kept per gene (lowest ids first).

    Returns:
        tuple (gene_chain_grange, skipped):
        gene_chain_grange is a defaultdict gene -> {chain_id: query length};
        skipped is a list of (gene, chain id(s), reason) tuples.
    """
    eprint("Precompute regions for each gene:chain pair...")
    chain_to_genes, skipped = defaultdict(list), []
    # revert the dict, from gene2chain to chain2genes
    for gene, chains in batch.items():
        if len(chains) == 0:
            skipped.append((gene, ",".join(chains), "no orthologous chains"))
            continue
        sorted_chains = sorted(chains, key=int)
        kept_chains = sorted_chains[:limit]
        if len(chains) > limit:
            # the gene itself is kept; only the chains beyond the limit are
            # reported. They are sliced from the untruncated list: slicing
            # the already truncated one would always give an empty record.
            skipped.append((gene, ",".join(sorted_chains[limit:]),
                            f"number of chains ({limit} chains) limit exceeded"))
        for chain in kept_chains:
            chain_to_genes[chain].append(gene)

    # read regions themselves
    gene_chain_grange = defaultdict(dict)
    chains_num, iter_num = len(chain_to_genes), 0

    for chain_id, genes in chain_to_genes.items():
        # extract chain itself
        chain_body = chainExtractID(bdb_chain_file, chain_id).encode()
        # genomic coordinates of each gene, formatted as chrom:start-end
        all_gene_ranges = []
        for gene in genes:
            gene_data = bed_data.get(gene)
            all_gene_ranges.append(
                f"{gene_data[0]}:{gene_data[1]}-{gene_data[2]}")

        # the chain_coords_converter shared library maps the gene regions
        # through the chain; its arguments must be converted to C types
        c_chain = ctypes.c_char_p(chain_body)
        # NOTE(review): the meaning of the value 2 is defined by the library
        c_shift = ctypes.c_int(2)
        granges_bytes = [s.encode("utf-8") for s in all_gene_ranges]
        granges_num = len(all_gene_ranges)
        c_granges_num = ctypes.c_int(granges_num)
        # array of C strings with one extra, terminating None element
        granges_arr = (ctypes.c_char_p * (granges_num + 1))()
        granges_arr[:-1] = granges_bytes
        granges_arr[granges_num] = None

        # then call the function
        raw_ch_conv_out = ch_lib.chain_coords_converter(c_chain,
                                                        c_shift,
                                                        c_granges_num,
                                                        granges_arr)
        # convert C output to python strings: granges_num + 1 items are read
        chain_coords_conv_out = [raw_ch_conv_out[i].decode("utf-8")
                                 for i in range(granges_num + 1)]

        # the first item is not parsed (presumably it is not a gene line;
        # confirm against the library), the rest is one line per gene
        for line in chain_coords_conv_out[1:]:
            # fields: index of the gene in `genes`, then two regions in
            # chrom:start-end format; the first one is read as the query
            # region and the second one as the target region
            line_info = line.rstrip().split()
            num = int(line_info[0])
            q_grange = line_info[1].split(":")[1].split("-")
            q_start, q_end = int(q_grange[0]), int(q_grange[1])
            que_len = q_end - q_start
            t_grange = line_info[2].split(":")[1].split("-")
            t_start, t_end = int(t_grange[0]), int(t_grange[1])
            tar_len = t_end - t_start
            len_delta = abs(tar_len - que_len)
            delta_gene_times = len_delta / tar_len
            gene = genes[num]
            field = chain_gene_field.get((chain_id, gene))
            # skip the pair if the query locus length differs too much from
            # the target one (relatively or absolutely) and the pair belongs
            # to one of the LONG_LOCI_FIELDS classes
            high_rel_len = delta_gene_times > REL_LENGTH_THR
            high_abs_len = len_delta > ABS_LENGTH_TRH
            long_loci_field = field in LONG_LOCI_FIELDS
            if (high_rel_len or high_abs_len) and long_loci_field:
                skipped.append((gene, chain_id, "too long query locus"))
                continue
            # for each chain-gene pair save the query region length,
            # needed for the required memory estimation
            gene_chain_grange[gene][chain_id] = que_len

        # drop the reference to the C output; NOTE(review): whether this
        # releases the library-side memory is not visible from here
        del raw_ch_conv_out
        iter_num += 1  # verbosity
        eprint(f"Chain {iter_num} / {chains_num}", end="\r")
    return gene_chain_grange, skipped
def precompute_regions(batch, bed_data, bdb_chain_file, chain_gene_field, limit):
    """Precompute region for each chain: bed pair.

    Args:
        batch: dict gene -> collection of chain ids. The ids are sorted with
            int() and joined with ",", so they are expected to be strings
            holding integers.
        bed_data: dict gene -> sequence; items 0, 1, 2 are used as chrom,
            start and end of the gene.
        bdb_chain_file: handed to chainExtractID as the chain source.
        chain_gene_field: dict (chain_id, gene) -> field label, checked
            against LONG_LOCI_FIELDS.
        limit: maximal number of chains kept per gene (lowest ids first).

    Returns:
        tuple (gene_chain_grange, skipped):
        gene_chain_grange is a defaultdict gene -> {chain_id: query length};
        skipped is a list of (gene, chain id(s), reason) tuples.
    """
    eprint("Precompute regions for each gene:chain pair...")
    chain_to_genes, skipped = defaultdict(list), []
    # revert the dict: gene -> chains becomes chain -> genes
    for gene, chains in batch.items():
        if len(chains) == 0:
            skipped.append((gene, ",".join(chains), "no orthologous chains"))
            continue
        ordered = sorted(chains, key=int)
        kept, dropped = ordered[:limit], ordered[limit:]
        if len(chains) > limit:
            # report the chains that did not fit into the limit; they must
            # come from the untruncated list, otherwise the record is empty
            skipped.append((gene, ",".join(dropped),
                            f"number of chains ({limit} chains) limit exceeded"))
        for chain in kept:
            chain_to_genes[chain].append(gene)

    # read regions themselves
    gene_chain_grange = defaultdict(dict)
    chains_num, iter_num = len(chain_to_genes), 0

    for chain_id, genes in chain_to_genes.items():
        # extract chain itself + get ranges (chrom:start-end) for genes
        chain_body = chainExtractID(bdb_chain_file, chain_id).encode()
        all_gene_ranges = []
        for gene in genes:
            gene_data = bed_data.get(gene)
            all_gene_ranges.append(
                f"{gene_data[0]}:{gene_data[1]}-{gene_data[2]}")

        # using shared lib to get corresponding regions;
        # python datatypes have to be converted to C types first
        c_chain = ctypes.c_char_p(chain_body)
        # NOTE(review): the meaning of the value 2 is defined by the library
        c_shift = ctypes.c_int(2)
        granges_bytes = [s.encode("utf-8") for s in all_gene_ranges]
        granges_num = len(all_gene_ranges)
        c_granges_num = ctypes.c_int(granges_num)
        # array of C strings, the extra last element is a terminating None
        granges_arr = (ctypes.c_char_p * (granges_num + 1))()
        granges_arr[:-1] = granges_bytes
        granges_arr[granges_num] = None

        # then call the function
        raw_ch_conv_out = ch_lib.chain_coords_converter(c_chain,
                                                        c_shift,
                                                        c_granges_num,
                                                        granges_arr)
        # convert C output to python strings: granges_num + 1 items are read
        chain_coords_conv_out = [raw_ch_conv_out[i].decode("utf-8")
                                 for i in range(granges_num + 1)]

        # the first item is not parsed (presumably it is not a gene line;
        # confirm against the library), the rest is one line per gene
        for line in chain_coords_conv_out[1:]:
            # split() already ignores surrounding whitespace; cutting the
            # last character blindly would damage the final coordinate of a
            # line that has no trailing newline
            line_info = line.split()
            # fields: index of the gene in `genes`, then two chrom:start-end
            # regions, read as the query and the target region respectively
            num = int(line_info[0])
            q_grange = line_info[1].split(":")[1].split("-")
            q_start, q_end = int(q_grange[0]), int(q_grange[1])
            que_len = q_end - q_start
            t_grange = line_info[2].split(":")[1].split("-")
            t_start, t_end = int(t_grange[0]), int(t_grange[1])
            tar_len = t_end - t_start
            len_delta = abs(tar_len - que_len)
            delta_gene_times = len_delta / tar_len
            gene = genes[num]
            field = chain_gene_field.get((chain_id, gene))
            # a pair is skipped if the length difference is too high
            # (relatively or absolutely) and its field is in LONG_LOCI_FIELDS
            high_rel_len = delta_gene_times > REL_LENGTH_THR
            high_abs_len = len_delta > ABS_LENGTH_TRH
            long_loci_field = field in LONG_LOCI_FIELDS
            if (high_rel_len or high_abs_len) and long_loci_field:
                skipped.append((gene, chain_id, "too long query locus"))
                continue
            gene_chain_grange[gene][chain_id] = que_len

        # drop the reference to the C output; NOTE(review): whether this
        # releases the library-side memory is not visible from here
        del raw_ch_conv_out
        iter_num += 1
        eprint(f"Chain {iter_num} / {chains_num}", end="\r")
    return gene_chain_grange, skipped