def main():
    """Decode a file of DNA oligos with a ``Glass`` decoder and write the result.

    Reads the command-line options via ``read_args()``, feeds every input
    line to ``Glass.add_dna`` until the decoder reports it is done (or the
    input / ``args.max_line`` is exhausted), then writes the recovered bytes
    to ``args.out``, logs their MD5 digest and dumps the per-seed (or, in
    barcode-debug mode, per-oligo) hit counts to ``seen_barocdes.json``.

    Input lines are either a bare DNA string or ``coverage <whitespace> DNA``.
    With ``args.fasta`` set, lines starting with ``>`` are skipped.  Lines
    containing ``N`` are skipped.

    Exit status:
        1 if the input file cannot be opened, if decoding did not complete,
        or if aggressive mode failed; 0 after a successful aggressive-mode
        decode.  Returns normally (no explicit exit) after a regular decode.
    """
    # Local import so this function stays self-contained; hashlib.md5 yields
    # the same digest as the deprecated, Python 2-only ``md5.new``.
    import hashlib

    args = read_args()

    valid_barcodes = None
    if args.debug_barcodes:
        valid_barcodes = load_barcodes(args)

    truth = None
    if args.truth is not None:
        # read_file also returns a size, which is not needed here.
        truth, _ = read_file(args.truth, args.size)

    g = Glass(args.chunk_num,
              header_size=args.header_size,
              rs=args.rs,
              c_dist=args.c_dist,
              delta=args.delta,
              flag_correct=not args.no_correction,
              gc=args.gc,
              max_homopolymer=args.max_homopolymer,
              max_hamming=args.max_hamming,
              decode=not args.mock,
              exDNA=args.expand_nt,
              chunk_size=args.size,
              np=args.rand_numpy,
              truth=truth,
              out=args.out)

    line = 0
    errors = 0
    seen_seeds = defaultdict(int)

    if args.file_in == '-':
        f = sys.stdin
    else:
        try:
            f = open(args.file_in, 'r')
        except IOError as err:
            # Report the path that was actually opened and signal failure
            # to the caller with a non-zero status.
            logging.error("%s file not found (%s)", args.file_in, err)
            sys.exit(1)

    aggressive = None
    if args.aggressive:
        aggressive = Aggressive(g=g, file_in=f, times=args.aggressive)

    def log_progress():
        # Rejection rate is computed as a float; line is >= 1 whenever
        # this is called because it is incremented before any report.
        logging.info(
            "After reading %d lines, %d chunks are done. So far: %d rejections (%f) %d barcodes",
            line, g.chunksDone(), errors, errors / (line + 0.0),
            g.len_seen_seed())

    ######## Main loop
    while True:
        try:
            dna = f.readline().rstrip('\n')
        except Exception as err:
            # Best effort: a read error ends the input instead of crashing,
            # but it is no longer swallowed silently.
            logging.warning("Stopped reading input after an error: %s", err)
            logging.info("Finished reading input file!")
            break

        if len(dna) == 0:
            logging.info("Finished reading input file!")
            break

        if args.fasta and re.search(r"^>", dna):
            continue

        # When the file is in the format of coverage \t DNA
        fields = dna.split()
        if len(fields) == 2:
            coverage, dna = fields

            #### Aggressive mode
            if aggressive is not None and aggressive.turn_on(
                    int(coverage), seen_seeds):
                best_file, _ = aggressive.start()
                if best_file is None:
                    logging.error("Could not decode all file...")
                    sys.exit(1)
                copyfile(best_file, args.out)
                logging.info("Done!")
                # The aggressive run is terminal; a successful one must not
                # report a failure status.
                sys.exit(0)
            ### End of aggressive mode

        if 'N' in dna:
            continue

        line += 1
        seed, _ = g.add_dna(dna)

        if seed == -1:  # reed-solomon error!
            errors += 1
        elif args.debug_barcodes:
            if dna not in valid_barcodes:
                logging.error("Seed or data %d in line %d are not valid:%s",
                              seed, line, dna)
            else:
                seen_seeds[dna] += 1
        else:
            seen_seeds[seed] += 1

        if line % 1000 == 0:
            log_progress()

        if line == args.max_line:
            logging.info("Finished reading maximal number of lines")
            break

        if g.isDone():
            log_progress()
            logging.info("Done!")
            break

    # Nothing reads the input past this point.
    if f is not sys.stdin:
        f.close()

    if not g.isDone():
        logging.error("Could not decode all file...")
        sys.exit(1)

    outstring = g.getString()
    with open(args.out, 'wb') as out_file:
        out_file.write(outstring)

    logging.info("MD5 is %s", hashlib.md5(outstring).hexdigest())

    with open("seen_barocdes.json", 'w') as seeds_file:
        json.dump(seen_seeds, seeds_file, sort_keys=True, indent=4)
def main():
    """Decode a file of (optionally composite) DNA oligos and write the result.

    Reads the command-line options via ``read_args()``, optionally builds a
    composite-DNA configuration (alphabet, number of barcode bases, encoder)
    that is handed to ``Glass`` as ``comp``, then feeds every input line to
    ``Glass.add_dna`` until the decoder reports it is done (or the input /
    ``args.max_line`` is exhausted).  The recovered bytes are written to
    ``args.out``, their MD5 digest is logged and the per-seed (or, in
    barcode-debug mode, per-oligo) hit counts are dumped to
    ``seen_barocdes.json``.

    Input lines are either a bare DNA string or ``coverage <whitespace> DNA``.
    With ``args.fasta`` set, lines starting with ``>`` are skipped.  Lines
    containing ``N`` are skipped only when no composite configuration is in
    use.

    Exit status:
        1 if the input file cannot be opened, if decoding did not complete,
        or if aggressive mode failed; 0 after a successful aggressive-mode
        decode.  Returns normally (no explicit exit) after a regular decode.
    """
    # Local import so this function stays self-contained; hashlib.md5 yields
    # the same digest as the deprecated, Python 2-only ``md5.new``.
    import hashlib

    args = read_args()

    valid_barcodes = None
    if args.debug_barcodes:
        valid_barcodes = load_barcodes(args)

    comp = None
    if args.composite_DNA is not None:
        # alphabet is a dict of int->letter including the std
        # 0->A,1->C,2->G,3->T; the composite alphabet file only contains an
        # ordered list of the *additional* letters.
        alphabet = read_composite_alphabet(args.composite_DNA[0])
        BC_bases = int(args.composite_DNA[1])

        # TODO - set max binary block limit somehow, get oligo length from
        # somewhere
        if args.composite_encoder is not None:
            composite_encoder = create_composite_encoder(
                alphabet,
                int(args.composite_encoder[0]),
                int(args.composite_encoder[1]))
        else:
            composite_encoder = create_optimal_composite_encoder(
                alphabet, 10, 136)

        comp = {
            'alphabet': alphabet,
            'BC_bases': BC_bases,
            'encoder': composite_encoder
        }

    truth = None
    if args.truth is not None:
        # read_file also returns a size, which is not needed here.
        truth, _ = read_file(args.truth, args.size)

    g = Glass(args.chunk_num,
              header_size=args.header_size,
              rs=args.rs,
              c_dist=args.c_dist,
              delta=args.delta,
              flag_correct=not args.no_correction,
              gc=args.gc,
              max_homopolymer=args.max_homopolymer,
              max_hamming=args.max_hamming,
              decode=not args.mock,
              chunk_size=args.size,
              np=args.rand_numpy,
              truth=truth,
              out=args.out,
              comp=comp)

    line = 0
    errors = 0
    seen_seeds = defaultdict(int)

    if args.file_in == '-':
        f = sys.stdin
    else:
        try:
            f = open(args.file_in, 'r')
        except IOError as err:
            # Report the path that was actually opened and signal failure
            # to the caller with a non-zero status.
            logging.error("%s file not found (%s)", args.file_in, err)
            sys.exit(1)

    aggressive = None
    if args.aggressive:
        aggressive = Aggressive(g=g, file_in=f, times=args.aggressive)

    def log_progress():
        # Rejection rate is computed as a float; line is >= 1 whenever
        # this is called because it is incremented before any report.
        logging.info(
            "After reading %d lines, %d chunks are done. So far: %d rejections (%f) %d barcodes",
            line, g.chunksDone(), errors, errors / (line + 0.0),
            g.len_seen_seed())

    ######## Main loop
    while True:
        try:
            dna = f.readline().rstrip('\n')
        except Exception as err:
            # Best effort: a read error ends the input instead of crashing,
            # but it is no longer swallowed silently.
            logging.warning("Stopped reading input after an error: %s", err)
            logging.info("Finished reading input file!")
            break

        if len(dna) == 0:
            logging.info("Finished reading input file!")
            break

        if args.fasta and re.search(r"^>", dna):
            continue

        # When the file is in the format of coverage \t DNA
        fields = dna.split()
        if len(fields) == 2:
            coverage, dna = fields

            #### Aggressive mode
            if aggressive is not None and aggressive.turn_on(
                    int(coverage), seen_seeds):
                best_file, _ = aggressive.start()
                if best_file is None:
                    logging.error("Could not decode all file...")
                    sys.exit(1)
                copyfile(best_file, args.out)
                logging.info("Done!")
                # The aggressive run is terminal; a successful one must not
                # report a failure status.
                sys.exit(0)
            ### End of aggressive mode

        # 'N' is only treated as an unreadable base when no composite
        # configuration is active.
        if 'N' in dna and comp is None:
            continue

        line += 1
        seed, _ = g.add_dna(dna)

        if seed == -1:  # reed-solomon error!
            errors += 1
        elif args.debug_barcodes:
            if dna not in valid_barcodes:
                logging.error("Seed or data %d in line %d are not valid:%s",
                              seed, line, dna)
            else:
                seen_seeds[dna] += 1
        else:
            seen_seeds[seed] += 1

        if line % 10000 == 0:
            log_progress()

        if line == args.max_line:
            logging.info("Finished reading maximal number of lines")
            break

        if g.isDone():
            log_progress()
            logging.info("Done!")
            break

    # Nothing reads the input past this point.
    if f is not sys.stdin:
        f.close()

    if not g.isDone():
        logging.error("Could not decode all file...")
        sys.exit(1)

    outstring = g.getString()
    with open(args.out, 'wb') as out_file:
        out_file.write(outstring)

    logging.info("MD5 is %s", hashlib.md5(outstring).hexdigest())

    with open("seen_barocdes.json", 'w') as seeds_file:
        json.dump(seen_seeds, seeds_file, sort_keys=True, indent=4)