Esempio n. 1
0
def main():
    """Decode a file from a stream of DNA reads.

    Reads the command-line options via ``read_args()``, builds a ``Glass``
    decoder from them, and feeds it one read per input line (taken from
    ``args.file_in``, or from stdin when that is ``'-'``) until the decoder
    reports it is done, the input ends, or ``args.max_line`` reads have been
    processed.

    On success the decoded payload is written to ``args.out``, its MD5 digest
    is logged, and the per-seed (or, with ``args.debug_barcodes``, per-read)
    counts are dumped to ``seen_barocdes.json`` in the working directory.

    Exits the process with status 1 when the input file cannot be opened or
    the decoder is not done after reading.  When aggressive mode is triggered
    the process exits from inside the read loop: status 0 if
    ``Aggressive.start()`` returned a file (copied to ``args.out``), status 1
    otherwise.
    """
    # Local import: ``hashlib`` replaces the ``md5`` module, which only
    # exists on Python 2; ``hashlib.md5`` is available on both.
    import hashlib

    args = read_args()

    if args.debug_barcodes:
        valid_barcodes = load_barcodes(args)

    truth = None
    if args.truth is not None:
        # The second value returned by read_file is not needed here.
        truth, _ = read_file(args.truth, args.size)

    g = Glass(args.chunk_num,
              header_size=args.header_size,
              rs=args.rs,
              c_dist=args.c_dist,
              delta=args.delta,
              flag_correct=not (args.no_correction),
              gc=args.gc,
              max_homopolymer=args.max_homopolymer,
              max_hamming=args.max_hamming,
              decode=not (args.mock),
              exDNA=args.expand_nt,
              chunk_size=args.size,
              np=args.rand_numpy,
              truth=truth,
              out=args.out)

    line = 0
    errors = 0
    seen_seeds = defaultdict(int)

    progress_fmt = ("After reading %d lines, %d chunks are done. "
                    "So far: %d rejections (%f) %d barcodes")

    if args.file_in == '-':
        f = sys.stdin
    else:
        try:
            f = open(args.file_in, 'r')
        except (IOError, OSError):
            # Report the path that was actually opened and signal failure
            # to the caller with a non-zero status.
            logging.error("%s file not found", args.file_in)
            sys.exit(1)

    try:
        aggressive = None
        if args.aggressive:
            aggressive = Aggressive(g=g, file_in=f, times=args.aggressive)

        ######## Main loop
        while True:
            try:
                dna = f.readline().rstrip('\n')
            except (IOError, OSError, ValueError) as err:
                # Best effort: a read/decoding failure is treated as the end
                # of the input, but it is no longer hidden.
                logging.warning("Error while reading input: %s", err)
                logging.info("Finished reading input file!")
                break

            # NOTE: after stripping the newline an empty line cannot be told
            # apart from EOF, so the first blank line also ends reading.
            if not dna:
                logging.info("Finished reading input file!")
                break

            # Skip FASTA header lines.
            if args.fasta and dna.startswith('>'):
                continue

            # when the file is in the format of coverage \t DNA
            fields = dna.split()
            if len(fields) == 2:
                coverage, dna = fields
                #### Aggressive mode
                if aggressive is not None and aggressive.turn_on(
                        int(coverage), seen_seeds):
                    best_file, _ = aggressive.start()
                    if best_file is not None:
                        copyfile(best_file, args.out)
                        logging.info("Done!")
                        sys.exit(0)
                    logging.error("Could not decode all file...")
                    sys.exit(1)
                ### End of aggressive mode

            if 'N' in dna:
                continue

            line += 1
            seed, data = g.add_dna(dna)

            if seed == -1:  # reed-solomon error!
                errors += 1
            elif args.debug_barcodes:
                if dna not in valid_barcodes:
                    logging.error(
                        "Seed or data %d in line %d are not valid:%s", seed,
                        line, dna)
                else:
                    seen_seeds[dna] += 1
            else:
                seen_seeds[seed] += 1

            # ``line`` is at least 1 here, so the ratio is well defined.
            if line % 1000 == 0:
                logging.info(progress_fmt, line, g.chunksDone(), errors,
                             errors / float(line), g.len_seen_seed())

            if line == args.max_line:
                logging.info("Finished reading maximal number of lines")
                break

            if g.isDone():
                logging.info(progress_fmt, line, g.chunksDone(), errors,
                             errors / float(line), g.len_seen_seed())
                logging.info("Done!")
                break
    finally:
        # Release the input handle on every exit path (including sys.exit),
        # but never close the process-wide stdin.
        if f is not sys.stdin:
            f.close()

    if not g.isDone():
        logging.error("Could not decode all file...")
        sys.exit(1)

    outstring = g.getString()
    with open(args.out, 'wb') as out_file:
        out_file.write(outstring)

    logging.info("MD5 is %s", hashlib.md5(outstring).hexdigest())

    # The (misspelt) file name is kept as is so existing consumers of this
    # output keep working.
    with open("seen_barocdes.json", 'w') as counts_file:
        json.dump(seen_seeds, counts_file, sort_keys=True, indent=4)
Esempio n. 2
0
def main():
    """Decode a file from a stream of DNA reads, optionally composite DNA.

    Reads the command-line options via ``read_args()``.  When
    ``args.composite_DNA`` is given, a composite alphabet and encoder are
    built and handed to the ``Glass`` decoder through its ``comp`` argument.
    The decoder is then fed one read per input line (taken from
    ``args.file_in``, or from stdin when that is ``'-'``) until it reports it
    is done, the input ends, or ``args.max_line`` reads have been processed.

    On success the decoded payload is written to ``args.out``, its MD5 digest
    is logged, and the per-seed (or, with ``args.debug_barcodes``, per-read)
    counts are dumped to ``seen_barocdes.json`` in the working directory.

    Exits the process with status 1 when the input file cannot be opened or
    the decoder is not done after reading.  When aggressive mode is triggered
    the process exits from inside the read loop: status 0 if
    ``Aggressive.start()`` returned a file (copied to ``args.out``), status 1
    otherwise.
    """
    # Local import: ``hashlib`` replaces the ``md5`` module, which only
    # exists on Python 2; ``hashlib.md5`` is available on both.
    import hashlib

    args = read_args()

    if args.debug_barcodes:
        valid_barcodes = load_barcodes(args)

    comp = None
    if args.composite_DNA is not None:
        # alphabet is a dict of int->letter including the std 0->A,1->C,2->G,3->T
        # the composite alphabet file only contains an ordered list of the *additional* letters
        alphabet = read_composite_alphabet(args.composite_DNA[0])
        BC_bases = int(args.composite_DNA[1])
        # TODO - set max binary block limit somehow, get oligo length from somewhere
        if args.composite_encoder is not None:
            composite_encoder = create_composite_encoder(
                alphabet, int(args.composite_encoder[0]),
                int(args.composite_encoder[1]))
        else:
            # Hard-coded fallback parameters (see the TODO above).
            composite_encoder = create_optimal_composite_encoder(
                alphabet, 10, 136)
        comp = {
            'alphabet': alphabet,
            'BC_bases': BC_bases,
            'encoder': composite_encoder
        }

    truth = None
    if args.truth is not None:
        # The second value returned by read_file is not needed here.
        truth, _ = read_file(args.truth, args.size)

    g = Glass(args.chunk_num,
              header_size=args.header_size,
              rs=args.rs,
              c_dist=args.c_dist,
              delta=args.delta,
              flag_correct=not (args.no_correction),
              gc=args.gc,
              max_homopolymer=args.max_homopolymer,
              max_hamming=args.max_hamming,
              decode=not (args.mock),
              chunk_size=args.size,
              np=args.rand_numpy,
              truth=truth,
              out=args.out,
              comp=comp)

    line = 0
    errors = 0
    seen_seeds = defaultdict(int)

    progress_fmt = ("After reading %d lines, %d chunks are done. "
                    "So far: %d rejections (%f) %d barcodes")

    if args.file_in == '-':
        f = sys.stdin
    else:
        try:
            f = open(args.file_in, 'r')
        except (IOError, OSError):
            # Report the path that was actually opened and signal failure
            # to the caller with a non-zero status.
            logging.error("%s file not found", args.file_in)
            sys.exit(1)

    try:
        aggressive = None
        if args.aggressive:
            aggressive = Aggressive(g=g, file_in=f, times=args.aggressive)

        ######## Main loop
        while True:
            try:
                dna = f.readline().rstrip('\n')
            except (IOError, OSError, ValueError) as err:
                # Best effort: a read/decoding failure is treated as the end
                # of the input, but it is no longer hidden.
                logging.warning("Error while reading input: %s", err)
                logging.info("Finished reading input file!")
                break

            # NOTE: after stripping the newline an empty line cannot be told
            # apart from EOF, so the first blank line also ends reading.
            if not dna:
                logging.info("Finished reading input file!")
                break

            # Skip FASTA header lines.
            if args.fasta and dna.startswith('>'):
                continue

            # when the file is in the format of coverage \t DNA
            fields = dna.split()
            if len(fields) == 2:
                coverage, dna = fields
                #### Aggressive mode
                if aggressive is not None and aggressive.turn_on(
                        int(coverage), seen_seeds):
                    best_file, _ = aggressive.start()
                    if best_file is not None:
                        copyfile(best_file, args.out)
                        logging.info("Done!")
                        sys.exit(0)
                    logging.error("Could not decode all file...")
                    sys.exit(1)
                ### End of aggressive mode

            # Reads containing 'N' are dropped only when no composite
            # alphabet is in use.
            if comp is None and 'N' in dna:
                continue

            line += 1
            seed, data = g.add_dna(dna)

            if seed == -1:  # reed-solomon error!
                errors += 1
            elif args.debug_barcodes:
                if dna not in valid_barcodes:
                    logging.error(
                        "Seed or data %d in line %d are not valid:%s", seed,
                        line, dna)
                else:
                    seen_seeds[dna] += 1
            else:
                seen_seeds[seed] += 1

            # ``line`` is at least 1 here, so the ratio is well defined.
            if line % 10000 == 0:
                logging.info(progress_fmt, line, g.chunksDone(), errors,
                             errors / float(line), g.len_seen_seed())

            if line == args.max_line:
                logging.info("Finished reading maximal number of lines")
                break

            if g.isDone():
                logging.info(progress_fmt, line, g.chunksDone(), errors,
                             errors / float(line), g.len_seen_seed())
                logging.info("Done!")
                break
    finally:
        # Release the input handle on every exit path (including sys.exit),
        # but never close the process-wide stdin.
        if f is not sys.stdin:
            f.close()

    if not g.isDone():
        logging.error("Could not decode all file...")
        sys.exit(1)

    outstring = g.getString()
    with open(args.out, 'wb') as out_file:
        out_file.write(outstring)

    logging.info("MD5 is %s", hashlib.md5(outstring).hexdigest())

    # The (misspelt) file name is kept as is so existing consumers of this
    # output keep working.
    with open("seen_barocdes.json", 'w') as counts_file:
        json.dump(seen_seeds, counts_file, sort_keys=True, indent=4)