def main():
    """Aggregates the outputs of all the tools for a specific dataset.

    For command line help, run with the '-h' flag.
    """
    args = get_args()
    dataset, chrom, _ = common.process_args(args)
    if not dataset:
        return

    # Build a nested mapping: spacer -> tool -> score name -> score.
    # The per-spacer values are filled in by each tool via add_scores.
    targets = dataset.get_targets(args.chr)
    scored_targets = {
        get_spacer(target): defaultdict(defaultdict) for target in targets
    }
    for tool in TOOLS.values():
        add_scores(args, tool, scored_targets)

    # Flatten the mapping into one record per target, each carrying all of
    # that target's scores.
    records = aggregate(targets, scored_targets)

    # Write the header line followed by one CSV line per record.
    out_path = join_path(dataset.get_out_path(),
                         consts.AGG_OUT_NAME % args.chr)
    with open(out_path, 'w') as out:
        out.write(common.to_csv_line(*get_headers()))
        for record in records:
            out.write(common.to_csv_line(*record))
def add_rejected(chrom, out_path):
    """Adds the rejected targets to the output, with the rejection error codes.

    Args:
        chrom: A Chromosome instance.
        out_path: Path to the directory of the tool's output for the processed
            dataset.

    Writes:
        The complete output CSV file.
    """
    # Collects rejected spacers with their rejection reasons.
    rejections = []
    rejection_file_path = os.path.join(out_path, REJECT_FILE_NAME % chrom.num)
    with open(rejection_file_path, 'r') as fd:
        for line in fd:
            seq, reason = line.strip().split('\t')
            if reason not in REJECTIONS:
                # Stops if there is an unknown rejection reason.
                # NOTE: was a Python 2 print statement; converted to the
                # print() call used elsewhere in this codebase so the module
                # is valid Python 3.
                print("Unknown rejection reason: %s" % reason)
                return
            # Strip the PAM suffix so only the spacer sequence is recorded.
            rejections.append((seq[:-consts.PAM_LEN], REJECTIONS[reason]))
    # Adds the rejected targets to the output file.
    out_file_path = os.path.join(out_path, consts.RAW_OUT_CSV % chrom.num)
    with open(out_file_path, 'a') as fd:
        for seq, reason in rejections:
            line = common.to_csv_line(seq, "", "", "", reason, chrom.name,
                                      -1, -1, "x", "", "", "", "")
            fd.write(line)
def main(raw_args=None):
    """Samples targets from the GenomeCRISPR DB into a CSV dataset file.

    For command line help, run with the '-h' flag.

    Args:
        raw_args: Optional argument list forwarded to get_args (defaults to
            sys.argv when None).

    Writes:
        A dataset CSV file with the sampled targets.
    """
    args = get_args(raw_args)
    dataset = DATASETS[args.dataset]
    dataset.set_work_dir(args.path)
    dataset_dir = os.path.join(dataset.get_data_path(), consts.RAW_DATA_DIR)
    input_file = os.path.join(dataset_dir, args.input)
    if args.output:
        out_name = args.output
    else:
        out_name = OUT_FILE % (args.chr, format_number(args.size))
    output_file = os.path.join(dataset_dir, out_name)
    print("Output will be in: %s" % output_file)

    entries = set()
    # Context managers guarantee both files are closed; the original leaked
    # the output handle (never closed) and shadowed the builtin `input`.
    with open(output_file, 'w') as out:
        with open(input_file, 'r') as in_fd:
            # The first line of the DB holds the headers; the rest are entries.
            is_first = True
            for line in in_fd:
                if is_first:
                    add_headers(line.strip(), out)
                    is_first = False
                else:
                    entries.add(line)
        print("Done reading")

        # Keeps track of spacers that were already included.
        spacers = set()
        # Keeps track of the number of samples already obtained.
        sampled = 0
        # Keeps track of progress for printing.
        chunk = 1
        # While we have not sampled enough...
        while sampled < args.size:
            remaining = args.size - sampled
            # Sample more entries.
            new_entries = sample(entries, spacers, remaining, dataset)
            sampled += len(new_entries)
            for entry in new_entries:
                out.write(common.to_csv_line(*entry))
            # If we have passed a progress checkpoint - prints progress.
            if (args.size / PRINT_NUM) * chunk < sampled:
                print("Sampled: %d" % sampled)
                chunk += 1
    print("Done")
    print(STATS)
def add_headers(headers, out):
    """Writes the DB headers along with an efficiency header to the output file.

    Args:
        headers: The existing headers in the GenomeCRISPR DB.
        out: A file descriptor for the output file.
    """
    header_line = common.to_csv_line(headers, "efficient")
    out.write(header_line)
def parse_features(path, handlers, fd_out, label_getter):
    """Parses the input to produce the feature representations.

    Args:
        path: Path to the input.
        handlers: A list of encoding functions for the columns of the input.
        fd_out: A file descriptor for the output file.
        label_getter: A function which gets the GenomeCRISPR effect label for
            a given target.
    """
    with open(path, 'r') as src:
        rows = src.readlines()
    # Skip the header row; every remaining row is one target.
    for row in rows[1:]:
        fields = common.from_csv_line(row)
        target = fields[0]
        # Column i+1 is encoded by handler i; flatten all encodings into a
        # single flat feature list.
        pieces = (handlers[i](fields[i + 1]) for i in range(len(handlers)))
        encoded = list(chain.from_iterable(pieces))
        record = [target, label_getter(target)] + encoded
        fd_out.write(common.to_csv_line(*record))
def to_csv_line(self, *args):
    """Returns a CSV line with the args, prefixed by the tool's name."""
    fields = (self.name,) + args
    return common.to_csv_line(*fields)