Example #1
def read_fasta(fasta_line, v=False):
    """Read fasta, return dict and type."""
    fasta_data = fasta_line.split(">")
    eprint(f"fasta_data[0] is:\n{fasta_data[0]}") if v else None
    eprint(f"fasta_data[1] is:\n{fasta_data[1]}") if v else None
    if fasta_data[0] != "":
        # the string must start with ">": anything before it means corrupted CESAR output
        eprint("ERROR! Cesar output is corrupted")
        eprint(f"Issue detected in the following string:\n{fasta_line}")
        die("Abort")
    del fasta_data[0]  # remove the leading empty string; we don't need it
    sequences = {}  # accumulate data here
    order = []  # to have ordered list

    # keep an explicit order list: do not rely on the dict preserving
    # the order in which the elements were added
    for elem in fasta_data:
        raw_lines = elem.split("\n")
        # elem looks like ['capHir1', 'ATGCCGCGCCAATTCCCCAAGCTGA...', ...]: header first
        header = raw_lines[0]
        # separate nucleotide-containing lines
        lines = [x for x in raw_lines[1:] if x != "" and not x.startswith("!")]
        if len(lines) == 0:  # empty sequence: skip it
            continue
        fasta_content = "".join(lines)
        sequences[header] = fasta_content
        order.append(header)
    return sequences, order
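A minimal usage sketch (not part of the original module), assuming the read_fasta above is in scope; the headers are made-up stand-ins for real CESAR output:

sample = ">capHir1\nATGCCGCGCCAATTCCCCAAGCTGA\n>mm10\nATGCCACGCCAATTCCCCAAGCTGA\n"
sequences, order = read_fasta(sample)
# order == ['capHir1', 'mm10']
# sequences['capHir1'] == 'ATGCCGCGCCAATTCCCCAAGCTGA'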
Example #2
def save(template, batch):
    """Save the cluster jobs, create jobs_file file."""
    filenames = {}  # collect filenames of cluster jobs
    for num, jobs in enumerate(batch):
        # define the path for the job
        job_path = os.path.join(WORK_DATA["jobs"], f"part_{num}")
        filenames[num] = job_path  # these paths are needed for the jobs_file
        # write the newline-separated jobs of this batch part to the job file
        with open(job_path, "w") as f:
            f.write("\n".join(jobs) + "\n")

    # save the jobs_file
    # add > line for stdout and 2> for stderr (if required)
    f = open(WORK_DATA["jobs_file"], "w")
    for num, path in filenames.items():
        cmd = template.format(path)
        stdout_part = f"> {WORK_DATA['results_dir']}/{num}.txt"
        stderr_part = "2> {WORK_DATA['errors_dir']}/{num}.txt" \
            if WORK_DATA["errors_dir"] else ""
        jobs_file_line = "{0} {1} {2}\n".format(cmd, stdout_part, stderr_part)
        f.write(jobs_file_line)
    f.close()
    # make executable (only after the file is closed and flushed)
    rc = subprocess.call(f"chmod +x {WORK_DATA['jobs_file']}", shell=True)
    if rc != 0:  # just in case
        die(f"Error! chmod +x {WORK_DATA['jobs_file']} failed")
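For illustration (not from the original script), how one jobs_file line is assembled; the WORK_DATA values and the template are hypothetical:

WORK_DATA = {"results_dir": "results", "errors_dir": "errors"}  # hypothetical values
template = "./cesar_runner.py {0}"  # hypothetical job wrapper template
cmd = template.format("jobs/part_0")
line = f"{cmd} > {WORK_DATA['results_dir']}/0.txt 2> {WORK_DATA['errors_dir']}/0.txt"
# line == "./cesar_runner.py jobs/part_0 > results/0.txt 2> errors/0.txt"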
Example #3
def load_results(results_dir):
    """Load and sort the chain feature extractor results."""
    verbose("Loading the results...")
    results_files = os.listdir(results_dir)
    verbose(f"There are {len(results_files)} result files to combine")

    # to hold data from fields "genes":
    chain_genes_data = defaultdict(list)
    # to hold data from "chains" field:
    chain_raw_data = {}
    # read file-by-file, otherwise it takes too much memory
    genes_counter, chain_counter = 0, 0  # count chain and genes lines

    for results_file in results_files:
        # there are N files: read them one-by-one
        path = os.path.join(results_dir, results_file)
        f = open(path, "r")
        for line in f:
            # read file line-by-line, all fields are tab-separated
            line_data = line.rstrip().split("\t")
            # define the class of this line
            # a line could be either gene or chain-related
            if line_data[0] == "genes":
                # process as a gene line
                chain, genes = process_gene_line(line_data)
                chain_genes_data[chain].extend(genes)
                genes_counter += 1
            elif line_data[0] == "chain":
                # chain related data
                the_chain_related = process_chain_line(line_data)
                # add this chain-related dict to the global one
                chain_raw_data.update(the_chain_related)
                chain_counter += 1
        # do not forget to close the file
        f.close()

    verbose(f"Got {len(chain_genes_data)} keys in chain_genes_data")
    verbose(f"Got {len(chain_raw_data)} keys in chain_raw_data")
    verbose(
        f"There were {genes_counter} genes lines and {chain_counter} chain lines"
    )
    # these values must be equal: just a sanity check
    if genes_counter != chain_counter:
        eprint(f"WARNING! genes_counter and chain_counter hold different "
               f"values:\n{genes_counter} and {chain_counter} respectively")
        die("Some features extracting jobs died!")
    return chain_genes_data, chain_raw_data
Example #4
def bed12_to_ranges(bed):
    """Convert bed-12 file to set of sorted ranges."""
    ranges_unsort, chrom = [], None
    for line in bed.split("\n")[:-1]:
        # parse line and extract blocks
        line_info = line.split("\t")
        chrom = line_info[0]
        glob_start = int(line_info[1])
        blocks_num = int(line_info[9])
        block_sizes = [int(x) for x in line_info[10].split(",") if x != ""]
        block_starts = [glob_start + int(x) for x in line_info[11].split(",") if x != ""]
        block_ends = [block_starts[i] + block_sizes[i] for i in range(blocks_num)]
        for i in range(blocks_num):  # save the range for each exon
            ranges_unsort.append((block_starts[i], block_ends[i]))
    die("(bed12_to_ranges) error, cannot read bed properly") if not chrom else None
    # return chrom and the ranges sorted by start coordinate
    return chrom, sorted(ranges_unsort, key=lambda x: x[0])
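A small worked example (not from the original code) of the block arithmetic above; the BED12 line is invented: two blocks of sizes 50 and 60 at relative starts 0 and 200 on a transcript starting at position 100:

bed_line = "\t".join([
    "chr1", "100", "400", "geneA", "0", "+", "100", "400", "0",
    "2", "50,60,", "0,200,",
]) + "\n"
chrom, ranges = bed12_to_ranges(bed_line)
# chrom == "chr1"; ranges == [(100, 150), (300, 360)]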
Example #5
def read_bed_data(bed_file):
    """Get the necessary data from the bed file."""
    result = {}  # return this dictionary
    verbose(f"Reading {bed_file}")
    f = open(bed_file, "r")
    for line in f:
        # parse tab-separated bed file
        all_bed_info = line.rstrip().split("\t")
        cds_track = make_cds_track(line)  # we need CDS only
        cds_bed_info = cds_track.rstrip().split("\t")

        if len(all_bed_info) != 12 or len(cds_bed_info) != 12:
            # if there are not 12 fields - no guarantee that we parse what we want
            die(f"Error! Bed12 file {bed_file} is corrupted!")

        # extract fields that we need
        chromStart = int(all_bed_info[1])  # gene start
        chromEnd = int(all_bed_info[2])  # and end
        # blocks represent exons
        all_blockSizes = [
            int(x) for x in all_bed_info[10].split(',') if x != ''
        ]
        cds_blockSizes = [
            int(x) for x in cds_bed_info[10].split(',') if x != ''
        ]
        # data to save
        gene_len = abs(chromStart - chromEnd)

        # for decision tree I will need number of exons
        # and number of bases in exonic and intronic fractions
        exons_num = len(all_blockSizes)
        exon_fraction = sum(all_blockSizes)  # including UTR
        cds_fraction = sum(cds_blockSizes)  # CDS only
        intron_fraction = gene_len - exon_fraction
        gene_name = all_bed_info[3]

        # save the data
        result[gene_name] = {
            "gene_len": gene_len,
            "exon_fraction": cds_fraction,
            "intron_fraction": intron_fraction,
            "exons_num": exons_num
        }
    f.close()
    verbose(f"Got data for {len(result.keys())} genes")
    return result
Example #6
def read_input(input_file):
    """Read input."""
    # it must be chain TAB genes line
    if os.path.isfile(input_file):
        tasks = {}
        f = open(input_file)
        for line in f:
            line_info = line[:-1].split("\t")
            chain = line_info[0]
            genes = line_info[1]
            tasks[chain] = genes
        f.close()
        return tasks
    elif len(input_file.split()) == 2:
        # it is not a file but chain<space>[,-sep list of genes]
        chain = input_file.split()[0]
        genes = input_file.split()[1]
        return {chain: genes}
    else:
        err_msg = "Error! Wrong input. Please provide either a file containing chain to genes\n" \
                  "list or a \"chain<space>[comma-separated list of genes]\" formatted-file"
        die(err_msg)
        return
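A usage sketch (not from the original module) showing both accepted input forms; the chain id and transcript names are hypothetical:

# string form: "chain<space>comma-separated genes"
tasks = read_input("387 ENST0001,ENST0002")
# tasks == {'387': 'ENST0001,ENST0002'}

# file form: each line is chain TAB genes
# with open("chain_to_genes.tsv", "w") as f:
#     f.write("387\tENST0001,ENST0002\n")
# tasks = read_input("chain_to_genes.tsv")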
Example #7
def merge_cesar_output(input_dir, output_bed, output_fasta, meta_data_arg,
                       skipped_arg, prot_arg, output_trash):
    """Merge multiple CESAR output files."""
    # check that input dir is correct
    die(f"Error! {input_dir} is not a dir!") \
        if not os.path.isdir(input_dir) else None
    # get list of bdb files (output of CESAR part)
    bdbs = [x for x in os.listdir(input_dir) if x.endswith(".bdb")]

    # initiate lists for different types of output:
    bed_summary = []
    fasta_summary = []
    trash_summary = []
    meta_summary = []
    prot_summary = []
    skipped = []
    all_ok = True

    task_size = len(bdbs)
    # extract data for all the files
    for num, bdb_file in enumerate(bdbs):
        # parse bdb files one by one
        bdb_path = os.path.join(input_dir, bdb_file)
        try:  # try to parse data
            parsed_data = parse_cesar_bdb(bdb_path)
        except AssertionError:
            # if this happened: some assertion was violated
            # probably CESAR output data is corrupted
            sys.exit(f"Error! Failed reading file {bdb_file}")

        # unpack parsed data tuple:
        bed_lines = parsed_data[0]
        trash_exons = parsed_data[1]
        fasta_lines = parsed_data[2]
        meta_data = parsed_data[3]
        prot_fasta = parsed_data[4]
        skip = parsed_data[5]

        if len(bed_lines) == 0:
            # actually should not happen, but can
            eprint(f"Warning! {bdb_file} is empty")
            all_ok = False
            continue  # it is empty

        # append data to lists
        bed_summary.append("\n".join(bed_lines) + "\n")
        fasta_summary.append(fasta_lines)
        trash_summary.append("".join(trash_exons))
        meta_summary.append(meta_data)
        skipped.append(skip)
        prot_summary.append(prot_fasta)
        eprint(f"Reading file {num + 1}/{task_size}", end="\r")

    # save output
    eprint("Saving the output")

    if len(bed_summary) == 0:
        # if so, no need to continue
        eprint("! merge_cesar_output.py:")
        die("No projections found! Abort.")

    # save bed, fasta and the rest
    with open(output_bed, "w") as f:
        f.write("".join(bed_summary))
    with open(output_fasta, "w") as f:
        f.write("".join(fasta_summary))
    with open(meta_data_arg, "w") as f:
        f.write("\n".join(meta_summary))
    with open(skipped_arg, "w") as f:
        f.write("\n".join(skipped))
    with open(prot_arg, "w") as f:
        f.write("\n".join(prot_summary))

    if output_trash:
        # if requested: provide trash annotation
        f = open(output_trash, "w")
        f.write("".join(trash_summary))
        f.close()
    return all_ok
Example #8
def prepare_bed_file(bed_file,
                     output,
                     ouf=False,
                     save_rejected=None,
                     only_chrom=None):
    """Filter the bed file given and save the updated version."""
    new_lines = []  # keep updated lines
    rejected = []  # keep IDs of skipped transcripts + the reason why
    names = Counter()  # we need to make sure that all names are unique

    f = open(bed_file, "r")
    for num, line in enumerate(f, 1):
        # parse bed file according to specification
        line_data = line.rstrip().split("\t")

        if len(line_data) != 12:
            f.close()  # this is for sure an error
            # this can happen only if something other than a bed12 file was provided
            die(f"Error! Bed12 file is required! Got a file with {len(line_data)} fields instead")

        chrom = line_data[0]
        if only_chrom and chrom != only_chrom:
            # TOGA allows performing the analysis on a specific chromosome only
            # if so, we can skip all transcripts located on other chromosomes
            continue
        chromStart = int(line_data[1])
        chromEnd = int(line_data[2])
        name = line_data[3]  # gene_name usually
        # bed_score = int(line_data[4])  # never used
        # strand = line_data[5]  # otherwise:
        # strand = True if line_data[5] == '+' else False
        thickStart = int(line_data[6])
        thickEnd = int(line_data[7])
        # itemRgb = line_data[8]  # never used
        blockCount = int(line_data[9])
        blockSizes = [int(x) for x in line_data[10].split(',') if x != '']
        blockStarts = [int(x) for x in line_data[11].split(',') if x != '']
        blockEnds = [blockStarts[i] + blockSizes[i] for i in range(blockCount)]
        blockAbsStarts = [
            blockStarts[i] + chromStart for i in range(blockCount)
        ]
        blockAbsEnds = [blockEnds[i] + chromStart for i in range(blockCount)]
        blockNewStarts, blockNewEnds = [], []
        names[name] += 1

        if thickStart > thickEnd:
            f.close()  # according to the bed12 specification this should never happen
            sys.stderr.write(f"Problem occurred at line {num}, gene {name}\n")
            die("Error! Bed file is corrupted, thickEnd MUST be >= thickStart")
        elif thickStart == thickEnd:
            # this means that this is a non-coding transcript
            # TOGA cannot process them: we can skip it
            rejected.append((name, "No CDS"))
            continue

        if thickStart < chromStart or thickEnd > chromEnd:
            # a very strange (but still possible) case
            f.close()  # for sure an error with input data
            sys.stderr.write(f"Problem occurred at line {num}, gene {name}\n")
            die("Error! Bed file is corrupted, thickRange is outside chromRange!"
                )

        # now select CDS only
        # we keep UTRs in the filtered file
        # however, we need CDS to check whether it's correct (% 3 == 0)
        for block_num in range(blockCount):
            blockStart = blockAbsStarts[block_num]
            blockEnd = blockAbsEnds[block_num]

            # skip the block if it is entirely UTR
            if blockEnd <= thickStart:
                continue
            elif blockStart >= thickEnd:
                continue

            # if we are here: this is not an entirely UTR exon
            # it might intersect the CDS border or lie entirely within the CDS
            # remove UTRs: block start must be >= CDS_start (thickStart)
            # block end must be <= CDS_end (thickEnd)
            blockNewStart = blockStart if blockStart >= thickStart else thickStart
            blockNewEnd = blockEnd if blockEnd <= thickEnd else thickEnd
            blockNewStarts.append(blockNewStart - thickStart)
            blockNewEnds.append(blockNewEnd - thickStart)

        if len(blockNewStarts) == 0:
            # even if thickStart != thickEnd this transcript can still be non-coding
            # but if there are no blocks in the CDS -> we can catch this
            rejected.append((name, "No CDS"))
            continue

        block_new_count = len(blockNewStarts)
        blockNewSizes = [
            blockNewEnds[i] - blockNewStarts[i] for i in range(block_new_count)
        ]

        if sum(blockNewSizes) % 3 != 0 and not ouf:
            # this is an out-of-frame (or incomplete) transcript
            # ideally the CDS length should be divisible by 3
            # the ouf flag means the user wants to keep such transcripts anyway
            rejected.append((name, "Out-of-frame gene"))
            continue

        # if there are non-unique transcript IDs: die
        # we die here, not earlier, so that all duplicates can be reported at once
        if any(v > 1 for v in names.values()):
            eprint("Error! There are non-unique transcript IDs:")
            for k in (n for n, count in names.items() if count > 1):
                eprint(k)
            die("Abort")
        # we keep this transcript: add it to the list
        new_line = "\t".join([str(x) for x in line_data])
        new_lines.append(new_line)
    f.close()

    if len(new_lines) == 0:
        # no transcripts passed the filter: probably an input data mistake
        sys.exit(
            "Error! No reference annotation tracks left after filtering procedure! Abort"
        )

    # write transcripts that passed the filter to the output file
    f = open(output, "w") if output != "stdout" else sys.stdout
    f.write("\n".join(new_lines) + "\n")
    f.close() if output != "stdout" else None

    if save_rejected:
        # save transcripts that didn't pass the filter + reason why
        f = open(save_rejected, "w")
        for elem in rejected:
            f.write(f"{elem[0]}\t{elem[1]}\n")
        f.close()
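A standalone sketch (not from the original module) of the UTR-clipping step above: each exon block is intersected with the [thickStart, thickEnd) CDS interval and re-expressed relative to thickStart; the coordinates are invented:

# CDS interval plus three absolute exon blocks:
# one fully UTR, one spanning the CDS border, one fully coding
thickStart, thickEnd = 150, 400
blocks = [(100, 140), (120, 220), (250, 380)]

blockNewStarts, blockNewEnds = [], []
for blockStart, blockEnd in blocks:
    if blockEnd <= thickStart or blockStart >= thickEnd:
        continue  # entirely UTR: skip
    newStart = max(blockStart, thickStart)
    newEnd = min(blockEnd, thickEnd)
    blockNewStarts.append(newStart - thickStart)
    blockNewEnds.append(newEnd - thickStart)

# blockNewStarts == [0, 100], blockNewEnds == [70, 230]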
Example #9
def main():
    """Entry point."""
    t0 = dt.now()
    args = parse_args()
    os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"  # otherwise it could crash

    # by default we create CESAR jobs for chains of the "orth" or "trans" class
    # but the user could select another set of chain classes
    fields = "ORTH,TRANS" if args.fields is None else args.fields

    # read U12 introns: to create a list of U12-containing genes
    # need it to make subsequent commands
    u12_data = read_u12_data(args.u12)

    # get lists of orthologous chains per each gene
    # skipped_1 - no chains found -> log them
    batch, chain_gene_field, skipped_1 = read_orthologs(args.orthologs_file,
                                                        fields,
                                                        only_o2o=args.o2o_only)
    # split cesar jobs into different buckets (if the user requested so)
    # e.g. put all jobs that require < 5 GB into bucket 1,
    # jobs requiring 5 to 15 GB into bucket 2, and so on
    # CESAR can be very memory-consuming -> so we care about this
    mem_limit, buckets = define_buckets(args.mem_limit, args.buckets)

    # load reference bed file data; coordinates and exon sizes
    bed_data = read_bed(args.bed_file)
    # check if cesar binary exists
    die(f"Error! Cannot find cesar executable at {args.cesar_binary}!") if \
        not os.path.isfile(args.cesar_binary) else None

    # pre-compute chain : gene : region data
    # collect the second list of skipped genes
    # skipped_2 -> too long corresponding regions in query
    regions, skipped_2 = precompute_regions(batch,
                                            bed_data,
                                            args.bdb_chain_file,
                                            chain_gene_field,
                                            args.chains_limit)
    
    # start making the jobs
    all_jobs = {}
    skipped_3 = []

    for gene in batch.keys():
        u12_this_gene = u12_data.get(gene)
        block_sizes = bed_data[gene][3]

        # proceed to memory estimation
        # the same procedure as inside CESAR2.0 code
        num_states, r_length = 0, 0

        # required memory depends on numerous params
        # first, we need reference transcript-related parameters
    # query-related parameters are computed later
        for block_size in block_sizes:
            # num_states += 6 + 6 * reference->num_codons + 1 + 2 + 2 + 22 + 6;
            #  /* 22 and 6 for acc and donor states */
            num_codons = block_size // 3
            num_states += 6 + 6 * num_codons + 1 + 2 + 2 + 22 + 6
            # r_length += 11 + 6 * fasta.references[i]->length
            # + donors[i]->length + acceptors[i]->length;
            r_length += block_size

        gene_chains_data = regions.get(gene)
        # check that there is something for this gene
        if not gene_chains_data:  # covers both None and an empty dict
            continue

        chains = gene_chains_data.keys()
        chains_arg = ",".join(chains)  # chain ids -> one of the cmd args
        
        # now compute query sequence-related parameters
        query_lens = [v for v in gene_chains_data.values()]
        q_length_max = max(query_lens)
        # and now compute the amount of required memory
        memory = (num_states * 4 * 8) + \
                 (num_states * q_length_max * 4) + \
                 (num_states * 304) + \
                 (2 * q_length_max + r_length) * 8 + \
                 (q_length_max + r_length) * 2 * 1 + EXTRA_MEM

        # convert to gigs + 0.25 extra gig
        gig = math.ceil(memory / 1000000000) + 0.25 
        if gig > mem_limit:
            # it is going to consume TOO much memory
            # skip this gene -> save to log
            skipped_3.append((gene, ",".join(chains),
                             f"memory limit ({mem_limit} gig) exceeded (needs {gig})"))
            continue

        # 0 gene; 1 chains; 2 bdb bed_file; 3 bdb chain_file; 4 tDB; 5 qDB; 6 memory (gig); 7 cesar_bin; 8 uhq_flank
        job = WRAPPER_TEMPLATE.format(gene, chains_arg,
                                      os.path.abspath(args.bdb_bed_file),
                                      os.path.abspath(args.bdb_chain_file),
                                      os.path.abspath(args.tDB),
                                      os.path.abspath(args.qDB),
                                      gig,
                                      os.path.abspath(args.cesar_binary),
                                      args.uhq_flank)
        # add some flags if required
        job = job + " --mask_stops" if args.mask_stops else job
        job = job + " --check_loss" if args.check_loss else job
        job = job + " --no_fpi" if args.no_fpi else job

        # add U12 introns data if this gene has them:
        job = job + f" --u12 {os.path.abspath(args.u12)}" if u12_this_gene else job

        all_jobs[job] = gig

    eprint(f"\nThere are {len(all_jobs.keys())} jobs in total.")
    eprint("Splitting the jobs.")
    # split jobs in buckets | compute proportions
    filled_buckets = fill_buckets(buckets, all_jobs)
    prop_sum = sum([k * len(v) for k, v in filled_buckets.items()])
    # estimate proportion of a bucket in the runtime
    buckets_prop = {k: (k * len(v)) / prop_sum for k, v in filled_buckets.items()} \
        if 0 not in filled_buckets.keys() else {0: 1.0}
    eprint("Bucket proportions are:")
    eprint("\n".join([f"{k} -> {v}" for k, v in buckets_prop.items()]))
    # get number of jobs for each bucket
    bucket_jobs_num = {k: math.ceil(args.jobs_num * v) for k, v in buckets_prop.items()}
    # save jobs, get comb lines
    to_combine = save_jobs(filled_buckets, bucket_jobs_num, args.jobs_dir)
    # save combined jobs, combined is a file containing paths to separate jobs
    os.mkdir(args.results) if not os.path.isdir(args.results) else None
    os.mkdir(args.check_loss) if args.check_loss \
        and not os.path.isdir(args.check_loss) else None

    f = open(args.combined, "w")
    for num, comb in enumerate(to_combine, 1):
        basename = os.path.basename(comb).split(".")[0]
        results_path = os.path.abspath(os.path.join(args.results, basename + ".bdb"))
        combined_command = f"{CESAR_RUNNER} {comb} {results_path}"
        if args.check_loss:
            loss_data_path = os.path.join(args.check_loss,
                                          f"{basename}.inact_mut.txt")
            combined_command += f" --check_loss {loss_data_path}"
        if args.rejected_log:
            log_path = os.path.join(args.rejected_log, f"{num}.txt")
            combined_command += f" --rejected_log {log_path}"
        f.write(combined_command + "\n")
    f.close()

    # save skipped genes if required
    if args.skipped_genes:
        skipped = skipped_1 + skipped_2 + skipped_3
        f = open(args.skipped_genes, "w")
        # usually we have gene + reason why skipped
        # we split them with tab
        f.write("\n".join(["\t".join(x) for x in skipped]) + "\n")
        f.close()

    f = open(args.paralogs_log, "w")
    # save IDs of paralogous projections
    for k, v in chain_gene_field.items():
        if v != "PARALOG":
            continue
        gene_ = f"{k[1]}.{k[0]}\n"
        f.write(gene_)
    f.close()

    eprint(f"Estimated: {dt.now() - t0}")
    sys.exit(0)
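The memory estimate above follows CESAR 2.0's allocation arithmetic; here is a standalone sketch (not from the original module; the EXTRA_MEM value is a made-up placeholder) that reproduces the same formula:

import math

EXTRA_MEM = 100000000  # hypothetical safety margin in bytes

def estimate_cesar_mem_gig(block_sizes, q_length_max, extra_mem=EXTRA_MEM):
    """Estimate CESAR memory usage in GB for one gene and its longest query."""
    num_states, r_length = 0, 0
    for block_size in block_sizes:
        num_codons = block_size // 3
        num_states += 6 + 6 * num_codons + 1 + 2 + 2 + 22 + 6
        r_length += block_size
    memory = (num_states * 4 * 8) + \
             (num_states * q_length_max * 4) + \
             (num_states * 304) + \
             (2 * q_length_max + r_length) * 8 + \
             (q_length_max + r_length) * 2 * 1 + extra_mem
    return math.ceil(memory / 1000000000) + 0.25  # gigs + 0.25 extra gig

# e.g. three exons against a 2 Mb query region:
# estimate_cesar_mem_gig([300, 150, 90], 2000000)  # -> 10.25 with these made-up numbers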
Example #10
def read_orthologs(orthologs_file, fields_raw, only_o2o=False):
    """Read orthologs file."""
    # convert fields param string to list
    fields = [x.upper() for x in fields_raw.split(",") if x != ""]
    genes_chains = {}
    chain_gene_field = {}
    skipped = []  # genes skipped at this stage
    f = open(orthologs_file, "r")  # open the file
    next(f)  # skip header
    # first column: transcript identifier
    # then: chain class fields (like column 2 - orthologous chains, 3 - paralogous)
    for line in f:
        # parse line
        line_info = line[:-1].split("\t")
        # "0" is a placeholder meaning "no chains there"
        gene = line_info[0]
        selected, chains = [], {}

        chains["ORTH"] = [x for x in line_info[1].split(",") if x != "0"]
        chains["PARA"] = [x for x in line_info[2].split(",") if x != "0"]
        chains["TRANS"] = [x for x in line_info[3].split(",") if x != "0"]
        # Processed pseudogenes column ignored -> they are processed separately
        all_chains = chains["ORTH"] + chains["PARA"] + chains["TRANS"]

        if len(all_chains) == 0:
            # there is no way to run CESAR on this gene
            # because there are no chains we could use
            skipped.append((gene, "0", "No chains intersecting the gene"))
            continue

        # user can ask to process only the genes that have a single orthologous chain
        # here we check that this is the case
        not_one2one = len(chains["ORTH"]) != 1
        if only_o2o and not_one2one:  # we requested only a single orthologous chain
            skipped.append((gene, "0", "Only one2one requested, this gene didn't pass"))
            continue

        # keep the chains whose class was selected in fields
        for field in fields:
            # field is most likely "ORTH" or "TRANS"
            field_chains = chains.get(field)
            if not field_chains:
                continue
            selected.extend(field_chains)
            for chain in field_chains:
                key = (chain, gene)
                chain_gene_field[key] = field

        # if a gene has no orthologous chains, then use paralogous
        # if no paralogous -> log this gene
        if not selected:
            # no orthologous chains
            # we try to use paralogous chains
            # of course, log this data
            selected = all_chains.copy()
            keys = [(chain, gene) for chain in selected]
            for key in keys:
                chain_gene_field[key] = "PARALOG"
        # write to the dict, gene to chains we will use
        genes_chains[gene] = selected

    f.close()
    die("Error! No gene:chains pairs selected! Probably --fields parameter is wrong!") \
        if len(genes_chains) == 0 else None
    return genes_chains, chain_gene_field, skipped
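A usage sketch (not from the original module): the orthologs table below is invented and mirrors the format described above (header line, then transcript TAB orthologous chains TAB paralogous chains TAB trans chains, with "0" meaning "no chains"):

import tempfile

table = (
    "gene\torth\tpara\ttrans\n"
    "ENST0001\t11,12\t0\t0\n"
    "ENST0002\t0\t21\t0\n"
)
with tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False) as tmp:
    tmp.write(table)
genes_chains, chain_gene_field, skipped = read_orthologs(tmp.name, "ORTH,TRANS")
# genes_chains == {'ENST0001': ['11', '12'], 'ENST0002': ['21']}
# chain_gene_field[('11', 'ENST0001')] == 'ORTH'
# chain_gene_field[('21', 'ENST0002')] == 'PARALOG'  (fallback to paralogous chains)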
Example #11
def check_args(args):
    """Check if args are correct, fill global dict."""
    # check the directories
    global VERBOSE  # set verbosity level
    VERBOSE = True if args.verbose else False
    WORK_DATA["vv"] = True if args.vv else False

    try:  # check the directories, create if it is necessary
        os.mkdir(args.jobs) if not os.path.isdir(args.jobs) else None
        os.mkdir(
            args.results_dir) if not os.path.isdir(args.results_dir) else None
        os.mkdir(args.errors_dir) \
            if args.errors_dir and not os.path.isdir(args.errors_dir) \
            else None
        WORK_DATA["jobs"] = args.jobs
        WORK_DATA["results_dir"] = args.results_dir
        WORK_DATA["errors_dir"] = args.errors_dir
        verbose(
            f"Directories in usage: {args.jobs} {args.results_dir} {args.errors_dir}"
        )

    except FileNotFoundError as grepexc:  # one of those tasks failed
        eprint(f"Arguments are corrupted!\n{str(grepexc)}")
        die("Cannot create one of the directories requested.")

    # check the chain and bed files
    WORK_DATA["chain_file"] = args.chain_file if os.path.isfile(args.chain_file) \
        else die(f"Error! Chain file {args.chain_file} is wrong!")

    WORK_DATA["bed_file"] = args.bed_file if os.path.isfile(args.bed_file) \
        else die(f"Error! Bed file {args.bed_file} is wrong!")
    verbose(f"Use bed file {args.bed_file} and chain file {args.chain_file}")

    # look for .ID.bb file
    index_file = args.index_file if args.index_file else args.chain_file.replace(
        ".chain", ".chain_ID_position")

    if os.path.isfile(index_file):  # check if bb file is here
        WORK_DATA["index_file"] = index_file
        verbose(f"And {index_file} as an index file")
    elif args.make_index:  # create index if not exists
        eprint("make_indexed in progress...")
        idbb_cmd = f"/modules/chain_bdb_index.py {args.chain_file} {index_file}"
        call_proc(idbb_cmd)
        WORK_DATA["index_file"] = index_file
    else:  # die
        die(f"Error! Cannot find index file at {index_file}\n"
            "Please define it manually")

    # define the number of jobs
    if args.job_size:  # easy:
        WORK_DATA["job_size"] = args.job_size
        WORK_DATA["jobs_num"] = None
    else:  # we must compute how many jobs to put into one cluster job
        WORK_DATA["job_size"] = None
        WORK_DATA["jobs_num"] = args.jobs_num
    WORK_DATA["bed_index"] = args.bed_index

    # some defaults
    WORK_DATA["jobs_file"] = args.jobs_file
    WORK_DATA["ref"] = args.ref
    # assume we are on the cluster
    WORK_DATA["on_cluster"] = True
    verbose("Program-wide dictionary looks like:\n")
    for k, v in WORK_DATA.items():
        verbose(f"{k}: {v}")
Example #12
def call_proc(cmd):
    """Call a subprocess and catch errors."""
    rc = subprocess.call(cmd, shell=True)
    if rc != 0:
        die(f"Error! Process {cmd} died! Abort.")