Example #1
0
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file,
       reading QVs from the input ccs.h5 or ccs FOFN.

    Parameters:
      in_fa -- path of the FASTA file to read.
      ccs_fofn -- a path ending in ".h5" (registered directly), any other
                  path (treated as a FOFN whose entries are registered one
                  by one), or a false value, in which case every base is
                  given a constant QV of 60.
      out_fq -- path of the FASTQ file to write.

    Raises:
      ValueError -- a read id (the text before the first space of the
                    record name) is not of the form movie/hn/s_e with an
                    integer hn, or the number of QVs differs from the
                    sequence length.
      IOError -- ccs_fofn was given but the read cannot be looked up in it.
    """

    qver = basQVcacher()
    if ccs_fofn:
        if ccs_fofn.endswith(".h5"):  # Input is a ccs.h5 file not a FOFN.
            qver.add_bash5(ccs_fofn)
        else:  # Input is a ccs FOFN containing multiple ccs.h5 files.
            for ccs_fn in get_files_from_fofn(ccs_fofn):
                qver.add_bash5(ccs_fn)

    # bas.h5 path --> open BasH5Reader, so each file is opened only once.
    bas_handlers = {}

    try:
        with FastaReader(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                seqid = r.name.split(' ')[0]
                try:
                    movie, hn, s_e = seqid.split('/')
                    hn = int(hn)
                except ValueError:
                    raise ValueError("{seqid} is not a valid CCS read".
                                     format(seqid=seqid))
                if ccs_fofn:
                    try:
                        bas_file = qver.bas_files[movie][seqid]
                        if bas_file not in bas_handlers:
                            bas_handlers[bas_file] = BasH5Reader(bas_file)
                    except KeyError:
                        raise IOError(
                            "Could not read {s} from input ccs fofn.".
                            format(s=seqid))
                    logging.debug(
                        "Getting QVs for {name} ...".format(name=r.name))
                    qvs = get_qv_from_bas_handler(
                        bas_handler=bas_handlers[bas_file],
                        hn=hn, s_e=s_e,
                        qv_name="QualityValue")
                else:
                    # No quality values provided: no information given,
                    # so have strong belief in the base calls.
                    qvs = [60] * len(r.sequence)
                if len(r.sequence) != len(qvs):
                    raise ValueError(
                        "Sequence and QVs of {r} should be the same!".
                        format(r=r.name))
                writer.writeRecord(r.name, r.sequence, qvs)
    finally:
        # Close every reader opened so far, even when the conversion
        # aborts part-way, so that no handle is leaked on error.
        for bas_file, bas_handler in bas_handlers.items():
            logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
            bas_handler.close()
Example #2
0
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file,
       reading QVs from the input ccs.h5 or ccs FOFN.

    Parameters:
      in_fa -- path of the FASTA file to read.
      ccs_fofn -- required; a path ending in ".h5" is registered directly,
                  any other path is treated as a FOFN whose entries are
                  registered one by one.
      out_fq -- path of the FASTQ file to write.

    Raises:
      ValueError -- a read id (the text before the first space of the
                    record name) is not of the form movie/hn/s_e with an
                    integer hn, or the number of QVs differs from the
                    sequence length.
      IOError -- the read cannot be looked up in the registered ccs files.
    """

    qver = basQVcacher()
    if ccs_fofn.endswith(".h5"):  # Input is a ccs.h5 file not a FOFN.
        qver.add_bash5(ccs_fofn)
    else:  # Input is a ccs FOFN containing multiple ccs.h5 files.
        for ccs_fn in get_files_from_fofn(ccs_fofn):
            qver.add_bash5(ccs_fn)

    # bas.h5 path --> open BasH5Reader, so each file is opened only once.
    bas_handlers = {}

    try:
        with FastaReader(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                seqid = r.name.split(' ')[0]
                try:
                    movie, hn, s_e = seqid.split('/')
                    hn = int(hn)
                except ValueError:
                    raise ValueError(
                        "{seqid} is not a valid CCS read".format(
                            seqid=seqid))
                try:
                    bas_file = qver.bas_files[movie][seqid]
                    if bas_file not in bas_handlers:
                        bas_handlers[bas_file] = BasH5Reader(bas_file)
                except KeyError:
                    raise IOError(
                        "Could not read {s} from input ccs fofn.".format(
                            s=seqid))
                logging.debug(
                    "Getting QVs for {name} ...".format(name=r.name))
                qvs = get_qv_from_bas_handler(
                    bas_handler=bas_handlers[bas_file],
                    hn=hn,
                    s_e=s_e,
                    qv_name="QualityValue")
                if len(r.sequence) != len(qvs):
                    raise ValueError(
                        "Sequence and QVs of {r} should be the same!".format(
                            r=r.name))
                writer.writeRecord(r.name, r.sequence, qvs)
    finally:
        # Runs on success and on failure alike, so readers opened before
        # an exception are not left open.
        for bas_file, bas_handler in bas_handlers.items():
            logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
            bas_handler.close()
Example #3
0
def convert_fofn_to_fasta(fofn_filename,
                          out_filename,
                          fasta_out_dir,
                          force_overwrite=False):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'

    Parameters:
      fofn_filename -- FOFN listing the input bax.h5/bas.h5 files.
      out_filename -- FOFN to write, listing the resulting fasta files.
      fasta_out_dir -- directory receiving the fasta files; created via
                       mkdir().
      force_overwrite -- when False, an already existing fasta file is
                         reused instead of being regenerated.

    Raises:
      ValueError -- the FOFN lists a file that is neither *.bax.h5 nor
                    *.bas.h5.
      RuntimeError -- the pls2fasta command returned a non-zero code.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_fofn(fofn_filename)
    out_fns = []
    mkdir(fasta_out_dir)
    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not in_fn.endswith(('.bax.h5', '.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        tmp_out_file = op.join(fasta_out_dir, in_basename + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        try:
            if op.exists(out_file) and not force_overwrite:
                logging.debug(
                    "File {0} already exists. skipping.".format(out_file))
            else:
                # NOTE(review): paths are interpolated unquoted; if
                # backticks() runs the command through a shell, names with
                # spaces or metacharacters will break -- confirm.
                cmd = "pls2fasta {0} {1} ".format(in_fn, tmp_out_file) + \
                      "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
                logging.debug("CMD: {cmd}".format(cmd=cmd))
                _out, _code, _msg = backticks(cmd)
                if _code != 0:
                    raise RuntimeError(
                        "CMD failed: {cmd}\n".format(cmd=cmd) + _msg)
                trim_subread_flanks(tmp_out_file, out_file)
        finally:
            # Remove the intermediate file whether or not the conversion
            # succeeded, so a failed run does not leave *.fasta.tmp behind.
            if op.exists(tmp_out_file):
                os.remove(tmp_out_file)
        out_fns.append(out_file)
    write_files_to_fofn(out_fns, out_filename)
Example #4
0
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'

    Parameters:
      fofn_filename -- FOFN listing the input bax.h5/bas.h5 files.
      out_filename -- FOFN to write, listing the resulting fasta files.
      fasta_out_dir -- directory receiving the fasta files; created via
                       mkdir().
      force_overwrite -- when False, an already existing fasta file is
                         reused instead of being regenerated.

    Raises:
      ValueError -- the FOFN lists a file that is neither *.bax.h5 nor
                    *.bas.h5.
      RuntimeError -- the pls2fasta command returned a non-zero code.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_fofn(fofn_filename)
    out_fns = []
    mkdir(fasta_out_dir)
    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not in_fn.endswith(('.bax.h5', '.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        tmp_out_file = op.join(fasta_out_dir, in_basename + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        try:
            if op.exists(out_file) and not force_overwrite:
                logging.debug("File {0} already exists. skipping.".format(out_file))
            else:
                cmd = "pls2fasta {0} {1} ".format(in_fn, tmp_out_file) + \
                      "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
                logging.debug("CMD: {cmd}".format(cmd=cmd))
                _out, _code, _msg = backticks(cmd)
                if _code != 0:
                    raise RuntimeError("CMD failed: {cmd}\n".format(cmd=cmd) + _msg)
                trim_subread_flanks(tmp_out_file, out_file)
        finally:
            # Clean up the intermediate file on every exit path; previously
            # a failed command or trim step left the *.fasta.tmp file behind.
            if op.exists(tmp_out_file):
                os.remove(tmp_out_file)
        out_fns.append(out_file)
    write_files_to_fofn(out_fns, out_filename)
Example #5
0
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False, cpus=1):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'

    Conversions are queued as (cmd, tmp_out_file, out_file) tuples and
    handed to up to `cpus` worker processes running
    convert_fofn_to_fasta_worker. Each worker is joined with a 1200 second
    timeout and terminated if still alive. Any output that is missing
    afterwards is regenerated in this process; if a worker timed out or
    exited abnormally, every queued conversion is redone locally.

    Parameters:
      fofn_filename -- FOFN listing the input bax.h5/bas.h5 files.
      out_filename -- FOFN to write, listing the resulting fasta files.
      fasta_out_dir -- directory receiving the fasta files; created via
                       mkdir().
      force_overwrite -- when False, an already existing fasta file is
                         reused instead of being regenerated.
      cpus -- maximum number of worker processes; capped at the number of
              queued conversions.

    Raises:
      ValueError -- the FOFN lists a file that is neither *.bax.h5 nor
                    *.bas.h5.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_fofn(fofn_filename)
    mkdir(fasta_out_dir)

    # multiprocessing worker stuff
    manager = Manager()
    in_queue = manager.Queue(len(in_fns))
    in_queue_count = 0
    outfile_track = {}  # expected out file --> (cmd, tmp)
    out_fns = []

    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not in_fn.endswith(('.bax.h5', '.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        tmp_out_file = op.join(fasta_out_dir, in_basename + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug("File {0} already exists. skipping.".format(out_file))
            out_fns.append(out_file)
            if op.exists(tmp_out_file):
                os.remove(tmp_out_file)
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  " {out} ".format(out=real_upath(tmp_out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            logging.debug("putting in queue: {0} {1} {2}".format(
                cmd, tmp_out_file, out_file))
            in_queue.put((cmd, tmp_out_file, out_file))
            in_queue_count += 1
            outfile_track[out_file] = (cmd, tmp_out_file)
            logging.debug("put in queue: {0} {1} {2}".format(
                cmd, tmp_out_file, out_file))

    if force_overwrite:
        # Drop stale outputs (only after all inputs were validated) so that
        # "file exists" below proves it was regenerated by this run.
        for out_file in outfile_track:
            if op.exists(out_file):
                os.remove(out_file)

    cpus = min(cpus, in_queue_count)  # cap max CPU if there's fewer files to convert
    pool = [Process(target=convert_fofn_to_fasta_worker, args=(in_queue,))
            for _ in range(cpus)]

    # starting & joining pool workers
    for p in pool:
        p.start()
    workers_ok = True
    for p in pool:
        p.join(timeout=1200)
        if p.is_alive():
            p.terminate()
            # A killed worker may have left a partially written output.
            workers_ok = False
        elif p.exitcode != 0:
            workers_ok = False

    # check that all files exist; if one does not (or the workers cannot
    # be trusted), force it to run locally.
    # A fresh unbounded queue is used because in_queue is bounded and may
    # still be full, in which case put() would block forever.
    retry_queue = manager.Queue()
    for out_file, (cmd, tmp_out_file) in outfile_track.items():
        if not (workers_ok and op.exists(out_file)):
            retry_queue.put((cmd, tmp_out_file, out_file))
            convert_fofn_to_fasta_worker(retry_queue)
        out_fns.append(out_file)

    write_files_to_fofn(out_fns, out_filename)
Example #6
0
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False, cpus=1):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'

    Conversions are queued as (cmd, tmp_out_file, out_file) tuples and
    handed to up to `cpus` worker processes running
    convert_fofn_to_fasta_worker. Each worker is joined with a 1200 second
    timeout and terminated if still alive. Any output that is missing
    afterwards is regenerated in this process; if a worker timed out or
    exited abnormally, every queued conversion is redone locally.

    Parameters:
      fofn_filename -- FOFN listing the input bax.h5/bas.h5 files.
      out_filename -- FOFN to write, listing the resulting fasta files.
      fasta_out_dir -- directory receiving the fasta files; created via
                       mkdir().
      force_overwrite -- when False, an already existing fasta file is
                         reused instead of being regenerated.
      cpus -- maximum number of worker processes; capped at the number of
              queued conversions.

    Raises:
      ValueError -- the FOFN lists a file that is neither *.bax.h5 nor
                    *.bas.h5.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_fofn(fofn_filename)
    mkdir(fasta_out_dir)

    # multiprocessing worker stuff
    manager = Manager()
    in_queue = manager.Queue(len(in_fns))
    in_queue_count = 0
    outfile_track = {}  # expected out file --> (cmd, tmp)
    out_fns = []

    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not in_fn.endswith(('.bax.h5', '.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        tmp_out_file = op.join(fasta_out_dir, in_basename + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug("File {0} already exists. skipping.".format(out_file))
            out_fns.append(out_file)
            if op.exists(tmp_out_file):
                os.remove(tmp_out_file)
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  " {out} ".format(out=real_upath(tmp_out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            logging.debug("putting in queue: {0} {1} {2}".format(
                cmd, tmp_out_file, out_file))
            in_queue.put((cmd, tmp_out_file, out_file))
            in_queue_count += 1
            outfile_track[out_file] = (cmd, tmp_out_file)
            logging.debug("put in queue: {0} {1} {2}".format(
                cmd, tmp_out_file, out_file))

    if force_overwrite:
        # Drop stale outputs (only after all inputs were validated) so that
        # "file exists" below proves it was regenerated by this run.
        for out_file in outfile_track:
            if op.exists(out_file):
                os.remove(out_file)

    cpus = min(cpus, in_queue_count)  # cap max CPU if there's fewer files to convert
    pool = [Process(target=convert_fofn_to_fasta_worker, args=(in_queue,))
            for _ in range(cpus)]

    # starting & joining pool workers
    for p in pool:
        p.start()
    workers_ok = True
    for p in pool:
        p.join(timeout=1200)
        if p.is_alive():
            p.terminate()
            # A killed worker may have left a partially written output.
            workers_ok = False
        elif p.exitcode != 0:
            workers_ok = False

    # check that all files exist; if one does not (or the workers cannot
    # be trusted), force it to run locally.
    # A fresh unbounded queue is used because in_queue is bounded and may
    # still be full, in which case put() would block forever.
    retry_queue = manager.Queue()
    for out_file, (cmd, tmp_out_file) in outfile_track.items():
        if not (workers_ok and op.exists(out_file)):
            retry_queue.put((cmd, tmp_out_file, out_file))
            convert_fofn_to_fasta_worker(retry_queue)
        out_fns.append(out_file)

    write_files_to_fofn(out_fns, out_filename)
Example #7
0
def convert_fofn_to_fasta(fofn_filename,
                          out_filename,
                          fasta_out_dir,
                          force_overwrite=False,
                          cpus=1):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'

    Conversions are queued as (cmd, tmp_out_file, out_file) tuples and
    consumed by `cpus` worker processes running
    convert_fofn_to_fasta_worker, which also receive the shared out_fns
    list (presumably to record each finished fasta file -- the worker is
    defined elsewhere).

    Parameters:
      fofn_filename -- FOFN listing the input bax.h5/bas.h5 files.
      out_filename -- FOFN to write, listing the resulting fasta files.
      fasta_out_dir -- directory receiving the fasta files; created via
                       mkdir().
      force_overwrite -- when False, an already existing fasta file is
                         reused instead of being regenerated.
      cpus -- number of worker processes to start.

    Raises:
      ValueError -- the FOFN lists a file that is neither *.bax.h5 nor
                    *.bas.h5.
      RuntimeError -- at least one worker process exited abnormally.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_fofn(fofn_filename)
    mkdir(fasta_out_dir)

    # multiprocessing worker stuff
    manager = Manager()
    out_fns = manager.list()
    # Unbounded: the queue is filled completely before any worker starts,
    # so a size limit could make put() block forever.
    in_queue = manager.Queue()
    pool = [Process(target=convert_fofn_to_fasta_worker,
                    args=(in_queue, out_fns))
            for _ in range(cpus)]

    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not in_fn.endswith(('.bax.h5', '.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        tmp_out_file = op.join(fasta_out_dir, in_basename + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug(
                "File {0} already exists. skipping.".format(out_file))
            out_fns.append(out_file)
            if op.exists(tmp_out_file):
                os.remove(tmp_out_file)
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  " {out} ".format(out=real_upath(tmp_out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            logging.debug("putting in queue: {0} {1} {2}".format(
                cmd, tmp_out_file, out_file))
            in_queue.put((cmd, tmp_out_file, out_file))
            logging.debug("put in queue: {0} {1} {2}".format(
                cmd, tmp_out_file, out_file))

    # starting & joining pool workers
    for p in pool:
        p.start()
    failed_workers = []
    for p in pool:
        p.join()
        # A non-zero exit code means the worker died (e.g. on an unhandled
        # exception); its conversions cannot be assumed complete.
        if p.exitcode != 0:
            failed_workers.append(p.name)

    if failed_workers:
        raise RuntimeError(
            "convert_fofn_to_fasta worker(s) failed: {names}".format(
                names=", ".join(failed_workers)))

    write_files_to_fofn(out_fns, out_filename)