Example no. 1
0
def main():
    """Aggregates the outputs of all the tools for a specific dataset.

    For command line help, run with the '-h' flag.
    """
    args = get_args()
    dataset, chrom, _ = common.process_args(args)
    if not dataset:
        return

    # Gathers every tool's scores into a nested mapping: first key is the
    # target spacer, second is the tool, third is the score's name.
    targets = dataset.get_targets(args.chr)
    scored_targets = {get_spacer(target): defaultdict(defaultdict)
                      for target in targets}
    for tool in TOOLS.values():
        add_scores(args, tool, scored_targets)

    # Flattens the mapping into one record per target, each holding all of
    # that target's scores, then dumps everything as CSV.
    records = aggregate(targets, scored_targets)
    out_path = join_path(dataset.get_out_path(),
                         consts.AGG_OUT_NAME % args.chr)
    with open(out_path, 'w') as out:
        out.write(common.to_csv_line(*get_headers()))
        for record in records:
            out.write(common.to_csv_line(*record))
Example no. 2
0
def add_rejected(chrom, out_path):
    """Adds the rejected targets to the output, with the rejection error codes.

    Args:
        chrom: A Chromosome instance.
        out_path: Path to the directory of the tool's output for the processed
            dataset.

    Writes:
        The complete output CSV file.
    """
    # Collects rejected spacers with their rejection reasons.
    rejections = []
    rejection_file_path = os.path.join(out_path, REJECT_FILE_NAME % chrom.num)
    with open(rejection_file_path, 'r') as fd:
        for line in fd:
            seq, reason = line.strip().split('\t')
            if reason not in REJECTIONS:
                # Stops if there is an unknown rejection reason.
                # Fixed: was a Python 2 print statement, which is a syntax
                # error under Python 3 and inconsistent with the rest of
                # the file's print() calls.
                print("Unknown rejection reason: %s" % reason)
                return
            # Strips the PAM suffix from the sequence before recording it.
            rejections.append((seq[:-consts.PAM_LEN], REJECTIONS[reason]))

    # Adds the rejected targets to the output file.
    out_file_path = os.path.join(out_path, consts.RAW_OUT_CSV % chrom.num)
    with open(out_file_path, 'a') as fd:
        for seq, reason in rejections:
            line = common.to_csv_line(seq, "", "", "", reason, chrom.name, -1,
                                      -1, "x", "", "", "", "")
            fd.write(line)
def main(raw_args=None):
    """Samples targets from the GenomeCRISPR DB into a CSV dataset file.

    For command line help, run with the '-h' flag.

    Args:
        raw_args: Optional list of command line arguments; when None the
            arguments are taken from sys.argv.

    Writes:
        A dataset CSV file with the sampled targets.
    """
    args = get_args(raw_args)
    dataset = DATASETS[args.dataset]
    dataset.set_work_dir(args.path)

    dataset_dir = os.path.join(dataset.get_data_path(), consts.RAW_DATA_DIR)
    input_file = os.path.join(dataset_dir, args.input)
    if args.output:
        out_name = args.output
    else:
        out_name = OUT_FILE % (args.chr, format_number(args.size))
    output_file = os.path.join(dataset_dir, out_name)

    print("Output will be in: %s" % output_file)

    # Reads the DB once: the first line holds the headers, the remaining
    # lines are candidate entries. 'with' guarantees both descriptors are
    # closed even on an exception (the originals were never closed on error,
    # and the output descriptor was never closed at all). The input handle
    # is renamed from 'input', which shadowed the builtin.
    entries = set()
    with open(output_file, 'w') as out:
        with open(input_file, 'r') as in_fd:
            is_first = True
            for line in in_fd:
                if is_first:
                    add_headers(line.strip(), out)
                    is_first = False
                else:
                    entries.add(line)
        print("Done reading")

        # Keeps track of spacers that were already included.
        spacers = set()
        # Keeps track of the number of samples already obtained.
        sampled = 0
        # Keeps track of progress for printing.
        chunk = 1

        # While we have not sampled enough...
        while sampled < args.size:
            remaining = args.size - sampled
            # Sample more entries.
            new_entries = sample(entries, spacers, remaining, dataset)
            sampled += len(new_entries)
            for entry in new_entries:
                out.write(common.to_csv_line(*entry))

            # If we have passed a progress checkpoint - prints progress.
            if (args.size / PRINT_NUM) * chunk < sampled:
                print("Sampled: %d" % sampled)
                chunk += 1

    print("Done")
    print(STATS)
def add_headers(headers, out):
    """Writes the DB headers along with an efficiency header to the output file.

    Args:
        headers: The existing headers in the GenomeCRISPR DB.
        out: A file descriptor for the output file.
    """
    header_line = common.to_csv_line(headers, "efficient")
    out.write(header_line)
Example no. 5
0
def parse_features(path, handlers, fd_out, label_getter):
    """Parses the input to produce the feature representations.

    Args:
        path: Path to the input CSV file; the first line is a header and is
            skipped.
        handlers: A list of encoding functions for the columns of the input;
            handlers[i] encodes column i+1 (column 0 is the target itself).
        fd_out: A file descriptor for the output file.
        label_getter: A function which gets the GenomeCRISPR effect label for a
            given target.
    """
    with open(path, 'r') as fd:
        # Streams the file line by line instead of materializing it with
        # readlines(); readline() skips the header (no-op on an empty file).
        fd.readline()
        for line in fd:
            values = common.from_csv_line(line)
            target = values[0]
            # Pairs each handler with its column value and flattens the
            # per-column encodings into one feature list.
            encoded = list(chain.from_iterable(
                handler(value)
                for handler, value in zip(handlers, values[1:])))
            entry = [target, label_getter(target)] + encoded
            fd_out.write(common.to_csv_line(*entry))
 def to_csv_line(self, *args):
     """Builds a CSV line whose first field is the tool's name.

     Args:
         *args: The remaining fields of the line.

     Returns:
         The CSV line produced by common.to_csv_line.
     """
     fields = (self.name,) + args
     return common.to_csv_line(*fields)