Beispiel #1
0
def split_sam_by_chr(inSam, prefix):
    """Split SAM records into one output file per reference name.

    Lines starting with ``@`` are collected as header text. Whenever a new
    reference name (third tab-separated column) is met, a new output file is
    created and the header text collected so far is written to it first.

    Args:
        inSam: path of the SAM file, or an already opened text handle.
        prefix: output prefix; files are named ``<prefix>.<n>.sam`` where
            ``n`` numbers the reference names in order of first appearance,
            starting at 1.

    Returns:
        Tuple ``(n_records, n_files, records_per_file, outfiles)``.
        ``records_per_file`` is ``n_records / n_files``, or 0 when no record
        was read.
    """
    own_input = not isinstance(inSam, IOBase)
    if own_input:
        inSam = open(inSam)
    header = ''
    # Reference name -> open output handle. Keeping the handles in a dict
    # (instead of dynamically named variables) lets them be closed reliably.
    handles = {}
    outfiles = []
    n_records = 0
    try:
        for line in inSam:
            if line.startswith('@'):
                header += line
                continue
            n_records += 1
            chrom = line.split('\t')[2]
            out = handles.get(chrom)
            if out is None:
                outfile = '%s.%s.sam' % (prefix, len(handles) + 1)
                outfiles.append(outfile)
                out = open(outfile, 'w')
                handles[chrom] = out
                out.write(header)
            out.write(line)
    finally:
        for out in handles.values():
            out.close()
        # Only close the input when it was opened here, not a caller's handle.
        if own_input:
            inSam.close()
    n_files = len(handles)
    per_file = n_records / n_files if n_files else 0
    return (n_records, n_files, per_file, outfiles)
Beispiel #2
0
def split_sam_by_chunk_num(inSam, prefix, chunk_num):
    """Distribute SAM records round-robin over a fixed number of files.

    All ``chunk_num`` output files are created up front. Lines starting with
    ``@`` are collected as header text; the header collected before the first
    record is written to every output file when that first record is seen.
    Record ``k`` (0-based) goes to file number ``k % chunk_num + 1``.

    Args:
        inSam: path of the SAM file, or an already opened text handle.
        prefix: output prefix; files are named ``<prefix>.<n>.sam`` with
            ``n`` from 1 to ``chunk_num``.
        chunk_num: number of output files, at least 1.

    Returns:
        Tuple ``(n_records, chunk_num, n_records / chunk_num, outfiles)``.

    Raises:
        ValueError: if ``chunk_num`` is smaller than 1.
    """
    if chunk_num < 1:
        raise ValueError('chunk_num must be >= 1, got %r' % (chunk_num, ))
    own_input = not isinstance(inSam, IOBase)
    if own_input:
        inSam = open(inSam)
    outfiles = [
        '%s.%s.sam' % (prefix, chunk_id)
        for chunk_id in range(1, chunk_num + 1)
    ]
    handles = []
    header = ''
    n_records = 0
    try:
        # Opened inside the try block so a failure half-way through still
        # closes the handles that were already created.
        for outfile in outfiles:
            handles.append(open(outfile, 'w'))
        for line in inSam:
            if line.startswith('@'):
                header += line
                continue
            if n_records == 0:
                for out in handles:
                    out.write(header)
            handles[n_records % chunk_num].write(line)
            n_records += 1
    finally:
        for out in handles:
            out.close()
        if own_input:
            inSam.close()
    return (n_records, chunk_num, n_records / chunk_num, outfiles)
Beispiel #3
0
def split_fastx_by_size(inFastx,
                        prefix,
                        chunk_num,
                        seqfmt,
                        suffix,
                        out_random=True):
    """Split sequences into ``chunk_num`` files using bin packing on length.

    All records are read into memory, keyed by record id, and their sequence
    lengths are handed to ``binpacking.to_constant_bin_number``. Each
    resulting bin is written to its own file.

    Args:
        inFastx: input passed straight to ``SeqIO.parse``.
        prefix: output prefix; files are named
            ``<prefix>.<n>.<seqfmt><suffix>`` with ``n`` starting at 1.
        chunk_num: number of bins requested from ``binpacking``.
        seqfmt: format name used for both ``SeqIO.parse`` and ``SeqIO.write``.
        suffix: text appended to every output file name.
        out_random: when true, the order of the bins is shuffled before the
            files are written.

    Returns:
        Tuple ``(n_records, chunk_num, n_records / chunk_num, outfiles)``
        where ``n_records`` counts the records written.
    """
    import binpacking
    d_seq = {}
    d_len = {}
    for rc in SeqIO.parse(inFastx, seqfmt):
        d_seq[rc.id] = rc
        d_len[rc.id] = len(rc.seq)
    # NOTE(review): with a dict as input each bin is expected to be a dict
    # keyed by record id -- confirm against the binpacking documentation.
    bins = binpacking.to_constant_bin_number(d_len, chunk_num)
    if out_random:
        import random
        random.shuffle(bins)
    n_records = 0
    outfiles = []
    for chunk_id, d_bin in enumerate(bins, 1):
        out_file = '%s.%s.%s%s' % (prefix, chunk_id, seqfmt, suffix)
        outfiles.append(out_file)
        # The context manager closes the file even if writing a record fails.
        with open(out_file, 'w') as f:
            for seq_id in list(d_bin.keys()):
                n_records += 1
                SeqIO.write(d_seq[seq_id], f, seqfmt)
    return (n_records, chunk_num, n_records / chunk_num, outfiles)
Beispiel #4
0
def split_sam_by_chunk_size(inSam, prefix, chunk_size):
    """Split SAM records into consecutive files of ``chunk_size`` records.

    Lines starting with ``@`` are collected as header text; the header
    collected so far is written at the top of every output file when it is
    created. Only one output file is open at a time.

    Args:
        inSam: path of the SAM file, or an already opened text handle.
        prefix: output prefix; files are named ``<prefix>.<n>.sam`` with
            ``n`` starting at 1.
        chunk_size: maximum number of records per file, at least 1.

    Returns:
        Tuple ``(n_records, n_files, chunk_size, outfiles)``.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be >= 1, got %r' % (chunk_size, ))
    own_input = not isinstance(inSam, IOBase)
    if own_input:
        inSam = open(inSam)
    header = ''
    outfiles = []
    n_records = 0
    out = None
    try:
        for line in inSam:
            if line.startswith('@'):
                header += line
                continue
            if n_records % chunk_size == 0:
                # Chunk boundary: finish the previous file, start the next.
                if out is not None:
                    out.close()
                # Integer division keeps the chunk number an int ("1", not
                # "1.0") in the file name.
                chunk_id = n_records // chunk_size + 1
                outfile = '%s.%s.sam' % (prefix, chunk_id)
                outfiles.append(outfile)
                out = open(outfile, 'w')
                out.write(header)
            out.write(line)
            n_records += 1
    finally:
        if out is not None:
            out.close()
        if own_input:
            inSam.close()
    return (n_records, len(outfiles), chunk_size, outfiles)
Beispiel #5
0
def split_fastx_by_chunk_size(inFastx, prefix, chunk_size, seqfmt, suffix):
    """Split FASTA/FASTQ records into consecutive files of ``chunk_size``.

    Records come from ``parse_fastx`` and are written unchanged with
    ``write()``, so they are expected to be text. Only one output file is
    open at a time.

    Args:
        inFastx: path of the input file, or an already opened text handle.
        prefix: output prefix; files are named
            ``<prefix>.<n>.<seqfmt><suffix>`` with ``n`` starting at 1.
        chunk_size: maximum number of records per file, at least 1.
        seqfmt: format name, used only in the output file names.
        suffix: text appended to every output file name.

    Returns:
        Tuple ``(n_records, n_files, chunk_size, outfiles)``.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be >= 1, got %r' % (chunk_size, ))
    own_input = not isinstance(inFastx, IOBase)
    if own_input:
        inFastx = open(inFastx)
    outfiles = []
    n_records = 0
    out = None
    try:
        for rc in parse_fastx(inFastx):
            if n_records % chunk_size == 0:
                # Chunk boundary: finish the previous file, start the next.
                if out is not None:
                    out.close()
                # Integer division keeps the chunk number an int ("1", not
                # "1.0") in the file name.
                chunk_id = n_records // chunk_size + 1
                outfile = '%s.%s.%s%s' % (prefix, chunk_id, seqfmt, suffix)
                outfiles.append(outfile)
                out = open(outfile, 'w')
            out.write(rc)
            n_records += 1
    finally:
        if out is not None:
            out.close()
        if own_input:
            inFastx.close()
    return (n_records, len(outfiles), chunk_size, outfiles)
Beispiel #6
0
def split_paf_by_chr(inPaf, prefix, suffix=''):
    """Split PAF lines into one output file per value of the sixth column.

    The sixth tab-separated column (the target sequence name in PAF) selects
    the output file; a file is created the first time a name is seen.

    Args:
        inPaf: path of the PAF file, or an already opened text handle.
        prefix: output prefix; files are named
            ``<prefix>.<name>.paf<suffix>``.
        suffix: text appended to every output file name.

    Returns:
        Tuple ``(n_records, n_files, records_per_file, outfiles)``.
        ``records_per_file`` is ``n_records / n_files``, or 0 when the input
        holds no line.
    """
    own_input = not isinstance(inPaf, IOBase)
    if own_input:
        inPaf = open(inPaf)
    # Target name -> open output handle.
    handles = {}
    outfiles = []
    n_records = 0
    try:
        for line in inPaf:
            n_records += 1
            chrom = line.split('\t')[5]
            out = handles.get(chrom)
            if out is None:
                outfile = '%s.%s.paf%s' % (prefix, chrom, suffix)
                outfiles.append(outfile)
                out = open(outfile, 'w')
                handles[chrom] = out
            out.write(line)
    finally:
        for out in handles.values():
            out.close()
        # Only close the input when it was opened here, not a caller's handle.
        if own_input:
            inPaf.close()
    n_files = len(handles)
    per_file = n_records / n_files if n_files else 0
    return (n_records, n_files, per_file, outfiles)
Beispiel #7
0
def main():
    """Parse command-line options with ``getopt`` and run ``get_records``.

    Options:
        -i  input file                 -o  output file
        -a  accession file (default: standard input)
        -t  record type (default ``table``)
        -g  process, ``get`` or ``remove`` (default ``get``)
        -k  column number, converted to int (default 1)
        -f  header line number, converted to int (default 1)
        -x  separator for the accession file (default ``None``)
        -s  column separator (default tab)
        -d  enable de-duplication
        -h  print usage and exit

    Raises:
        TypeError: if the record type or the process is not a supported value.
    """
    opts, args = getopt.getopt(sys.argv[1:], 'hi:a:o:t:g:k:f:s:x:d')
    input_file = ''
    output_file = ''
    in_accnos = sys.stdin
    record_type = 'table'
    process = 'get'
    col = 1
    head = 1
    accnos_sep = None
    sep = "\t"
    dedup = False
    for op, value in opts:
        if op == '-i':
            input_file = value
        elif op == '-o':
            output_file = value
        elif op == '-a':
            in_accnos = open(value)
        elif op == '-t':
            record_type = value
        elif op == '-g':
            process = value
        elif op == '-k':
            col = int(value)
        elif op == '-f':
            head = int(value)
        elif op == '-x':
            accnos_sep = value
        elif op == '-s':
            sep = value
        elif op == '-d':
            dedup = True
        elif op == '-h':
            usage()
            sys.exit()

    # The message lists every accepted type so it matches the check itself.
    if record_type not in {'table', 'fasta', 'fastq', 'hmm', 'genbank'}:
        raise TypeError(
            "type must be one of ['table','fasta','fastq','hmm','genbank'], "
            "unexpected '%s'" % (record_type, ))
    if process not in ['get', 'remove']:
        raise TypeError(
            "process must be one of ['get','remove'], unexpected '%s'" %
            (process, ))
    get_records(input_file,
                output_file,
                in_accnos,
                type=record_type,
                process=process,
                col=col,
                head=head,
                accnos_sep=accnos_sep,
                sep=sep,
                dedup=dedup)
Beispiel #8
0
def bin_split_fastx_by_chunk_num(inFastx,
                                 prefix,
                                 chunk_num,
                                 seqfmt,
                                 suffix,
                                 window_size=1e6,
                                 window_ovl=1e5,
                                 tmpdir='/tmp'):
    """Cut the sequences into windows, then split the pieces by size.

    ``cut_seqs`` writes its output to ``<tmpdir>/cut.<seqfmt>``; that file is
    then passed to ``split_fastx_by_size``, whose result is returned as is.

    Args:
        inFastx: input handed to ``cut_seqs``.
        prefix, chunk_num, seqfmt, suffix: forwarded to
            ``split_fastx_by_size``.
        window_size: window size for ``cut_seqs``, converted to int.
        window_ovl: window overlap for ``cut_seqs``, converted to int.
        tmpdir: directory in which the intermediate file is created.
    """
    size = int(window_size)
    overlap = int(window_ovl)
    # The intermediate file name is fixed, so it is overwritten by every call
    # that uses the same tmpdir and seqfmt.
    cut_path = '%s/cut.%s' % (tmpdir, seqfmt)
    with open(cut_path, 'w') as cut_handle:
        cut_seqs(inFastx,
                 cut_handle,
                 window_size=size,
                 window_ovl=overlap,
                 seqfmt=seqfmt)
    result = split_fastx_by_size(cut_path, prefix, chunk_num, seqfmt, suffix)
    return result
Beispiel #9
0
def split_fastx_by_chr(inFastx, prefix, seqfmt, suffix=''):
    """Write every FASTA/FASTQ record to a file named after the record.

    Records come from ``parse_fastx`` and are written unchanged with
    ``write()``, so they are expected to be text. The name is the first
    whitespace-delimited token of the record with its first character removed
    (presumably the ``>`` or ``@`` marker). A later record with the same name
    overwrites the earlier file.

    Args:
        inFastx: path of the input file, or an already opened text handle.
        prefix: output prefix; files are named
            ``<prefix>.<name>.<seqfmt><suffix>``.
        seqfmt: format name, used only in the output file names.
        suffix: text appended to every output file name.

    Returns:
        Tuple ``(n_records, n_files, records_per_file, outfiles)``. One file
        is opened per record, so ``n_files`` equals ``n_records``;
        ``records_per_file`` is 0 when no record was read.
    """
    own_input = not isinstance(inFastx, IOBase)
    if own_input:
        inFastx = open(inFastx)
    outfiles = []
    n_records = 0
    try:
        for rc in parse_fastx(inFastx):
            chrom = rc.split()[0][1:]
            outfile = '%s.%s.%s%s' % (prefix, chrom, seqfmt, suffix)
            outfiles.append(outfile)
            # One file per record: open, write and close immediately.
            with open(outfile, 'w') as out:
                out.write(rc)
            n_records += 1
    finally:
        if own_input:
            inFastx.close()
    per_file = n_records / n_records if n_records else 0
    return (n_records, n_records, per_file, outfiles)
Beispiel #10
0
def split_fastx_by_chunk_num(inFastx, prefix, chunk_num, seqfmt, suffix):
    """Distribute FASTA/FASTQ records round-robin over ``chunk_num`` files.

    All output files are created up front. Records come from ``parse_fastx``
    and are written unchanged with ``write()``, so they are expected to be
    text. Record ``k`` (0-based) goes to file number ``k % chunk_num + 1``.

    Args:
        inFastx: path of the input file, or an already opened text handle.
        prefix: output prefix; files are named
            ``<prefix>.<n>.<seqfmt><suffix>`` with ``n`` from 1 to
            ``chunk_num``.
        chunk_num: number of output files, at least 1.
        seqfmt: format name, used only in the output file names.
        suffix: text appended to every output file name.

    Returns:
        Tuple ``(n_records, chunk_num, n_records / chunk_num, outfiles)``.

    Raises:
        ValueError: if ``chunk_num`` is smaller than 1.
    """
    if chunk_num < 1:
        raise ValueError('chunk_num must be >= 1, got %r' % (chunk_num, ))
    own_input = not isinstance(inFastx, IOBase)
    if own_input:
        inFastx = open(inFastx)
    outfiles = [
        '%s.%s.%s%s' % (prefix, chunk_id, seqfmt, suffix)
        for chunk_id in range(1, chunk_num + 1)
    ]
    handles = []
    n_records = 0
    try:
        # Opened inside the try block so a failure half-way through still
        # closes the handles that were already created.
        for outfile in outfiles:
            handles.append(open(outfile, 'w'))
        for rc in parse_fastx(inFastx):
            handles[n_records % chunk_num].write(rc)
            n_records += 1
    finally:
        for out in handles:
            out.close()
        if own_input:
            inFastx.close()
    return (n_records, chunk_num, n_records / chunk_num, outfiles)
Beispiel #11
0
def main():
    """Command-line entry point: parse the options and split the input.

    Exactly one of ``--chunk-number`` / ``--chunk-size`` may be given; at
    least one of them or ``--by-chrom`` is required. The record format picks
    the family of split functions, the split mode picks the function. A
    summary line is printed to standard error and, with
    ``--print-filenames``, the output file names are printed to standard
    output.
    """
    # Python 3's ArgumentParser has no ``version`` keyword; the version flag
    # is registered as a ``version`` action instead.
    parser = argparse.ArgumentParser()
    parser.add_argument("-v",
                        "--version",
                        action="version",
                        version=__version__)
    parser.add_argument("-i",
                        "--input",
                        action="store",
                        type=str,
                        dest="input",
                        default=sys.stdin,
                        help="input [default=%(default)s]")
    parser.add_argument("--prefix",
                        action="store",
                        dest="prefix",
                        default='chunk',
                        help="output prefix [default=%(default)s]")
    parser.add_argument("-n",
                        "--chunk-number",
                        action="store",
                        type=int,
                        dest="chunk_num",
                        default=None,
                        help="number of chunk [default=%(default)s]")
    parser.add_argument("-s",
                        "--chunk-size",
                        action="store",
                        type=int,
                        dest="chunk_size",
                        default=None,
                        help='size of chunk [default=%(default)s]')
    parser.add_argument("-f",
                        "--format",
                        action="store",
                        dest="rcfmt",
                        default='fasta',
                        choices=['fasta', 'fastq', 'fastx', 'sam', 'paf'],
                        help="record file format [default=%(default)s]")
    parser.add_argument("--gzip-output",
                        action="store_true",
                        dest="gzip_output",
                        default=False,
                        help="if gzip output [default=%(default)s]")
    parser.add_argument(
        "--by-size",
        action="store_true",
        dest="by_size",
        default=False,
        help=
        'split by size (binpacking, only for fastx format) [default=%(default)s]'
    )
    parser.add_argument("--by-chrom",
                        action="store_true",
                        dest="by_chr",
                        default=False,
                        help='split by mapped chromsome [default=%(default)s]')
    parser.add_argument("-pfn",
                        "--print-filenames",
                        action="store_true",
                        dest="print_filenames",
                        default=False,
                        help='print filenames [default=%(default)s]')

    options = parser.parse_args()
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit()
    if options.chunk_num and options.chunk_size:
        parser.print_help(sys.stderr)
        print('chunk-number and chunk-size is not compatible.',
              file=sys.stderr)
        sys.exit()
    elif not (options.chunk_num or options.chunk_size) and not options.by_chr:
        parser.print_help(sys.stderr)
        print('either chunk-number or chunk-size must be speicfied.',
              file=sys.stderr)
        sys.exit()
    if options.gzip_output:
        suffix = '.gz'
        print(
            'Warning: for large dataset, gzip output is very slow. You may want to diable it.',
            file=sys.stderr)
    else:
        suffix = ''
    if not isinstance(options.input, IOBase):
        options.input = open(options.input)

    # Dispatch on record format first, then on split mode. ``stats`` stays
    # None when the chosen format has no function for the chosen mode.
    stats = None
    if options.rcfmt == 'sam':
        if options.chunk_num:
            stats = split_sam_by_chunk_num(options.input, options.prefix,
                                           options.chunk_num)
        elif options.chunk_size:
            stats = split_sam_by_chunk_size(options.input, options.prefix,
                                            options.chunk_size)
        elif options.by_chr:
            stats = split_sam_by_chr(
                options.input,
                options.prefix,
            )
    elif options.rcfmt in ('fasta', 'fastq', 'fastx'):
        if options.chunk_num:
            if options.by_size:
                stats = split_fastx_by_size(options.input, options.prefix,
                                            options.chunk_num, options.rcfmt,
                                            suffix)
            else:
                stats = split_fastx_by_chunk_num(options.input, options.prefix,
                                                 options.chunk_num,
                                                 options.rcfmt, suffix)
        elif options.chunk_size:
            stats = split_fastx_by_chunk_size(options.input, options.prefix,
                                              options.chunk_size,
                                              options.rcfmt, suffix)
        elif options.by_chr:
            stats = split_fastx_by_chr(options.input, options.prefix,
                                       options.rcfmt, suffix)
    elif options.rcfmt == 'paf':
        if options.by_chr:
            stats = split_paf_by_chr(options.input, options.prefix, suffix)
    if stats is None:
        # E.g. "-f paf -n 4": report it instead of failing on an unbound name.
        parser.error('the requested split mode is not supported for format %s'
                     % (options.rcfmt, ))
    n_records, n_chunks, per_chunk, outfiles = stats
    print('total %s records, splited into %s chunks, %s per chunk' %
          (n_records, n_chunks, per_chunk),
          file=sys.stderr)
    if options.print_filenames:
        print('\n'.join(outfiles), file=sys.stdout)
Beispiel #12
0
def get_records(input_file,
                output_file,
                in_accnos,
                type='table',
                process='get',
                sep="\t",
                col=1,
                head=1,
                accnos_sep=None,
                dedup=False):
    """Copy records to ``output_file``, keeping or dropping the listed IDs.

    Requested IDs that were never met in the input are printed to standard
    output, one per line, after the output file has been written.

    Args:
        input_file: path of the file to filter.
        output_file: path of the file to write.
        in_accnos: requested IDs, either an open text handle (first
            ``accnos_sep``-separated field of every non-blank line) or an
            iterable of IDs.
        type: ``'table'``, ``'fasta'``, ``'fastq'``, ``'genbank'`` or
            ``'hmm'``. Any other value produces an empty output file.
        process: ``'get'`` keeps only the requested records, ``'remove'``
            keeps only the others.
        sep: column separator for ``'table'`` input.
        col: 1-based column holding the record ID in ``'table'`` input.
        head: 1-based number of the line that is always copied in ``'table'``
            input.
        accnos_sep: separator passed to ``str.split`` for accession lines.
        dedup: with ``'table'`` input and ``process='get'``, write only the
            first line for each ID.
    """
    if isinstance(in_accnos, IOBase):
        d_accnos = {
            line.strip().split(accnos_sep)[0]
            for line in in_accnos if line.strip()
        }
    else:  # any iterable of IDs
        d_accnos = set(in_accnos)

    # Requested IDs that were actually found in the input.
    seen = set()
    with open(output_file, 'w') as f:
        if type == 'table':
            with open(input_file, 'r') as table:
                for line_no, line in enumerate(table, 1):
                    if line_no == head:
                        f.write(line)
                        continue
                    record_id = line.strip().split(sep)[col - 1]
                    listed = record_id in d_accnos
                    if process == 'get':
                        if not listed or (dedup and record_id in seen):
                            continue
                        f.write(line)
                        seen.add(record_id)
                    elif process == 'remove':
                        if listed:
                            seen.add(record_id)
                        else:
                            f.write(line)
        elif type in {'fasta', 'fastq', 'genbank'}:
            with open(input_file) as seq_handle:
                for seq_record in SeqIO.parse(seq_handle, type):
                    record_id = seq_record.id
                    listed = record_id in d_accnos
                    if process == 'get':
                        if listed:
                            SeqIO.write(seq_record, f, type)
                            seen.add(record_id)
                    elif process == 'remove':
                        if listed:
                            seen.add(record_id)
                        else:
                            SeqIO.write(seq_record, f, type)
        elif type == 'hmm':
            from HMMER import HMMParser
            with open(input_file) as hmm_handle:
                for rc in HMMParser(hmm_handle):
                    values = {getattr(rc, key, None) for key in ['NAME', 'ACC']}
                    values = {v for v in values if v}
                    matched = bool(values & d_accnos)
                    if process == 'get':
                        if matched:
                            rc.write(f)
                            seen |= values
                    elif process == 'remove':
                        # As for the other types, the removed (matched)
                        # records are the ones marked as found.
                        if matched:
                            seen |= values
                        else:
                            rc.write(f)

    for not_get_id in d_accnos - seen:
        print(not_get_id)
Beispiel #13
0
def get_records(input_file,
                output_file,
                in_accnos,
                type='table',
                process='get',
                col=1,
                head=1,
                accnos_sep=None):
    """Copy records to ``output_file``, keeping or dropping the listed IDs.

    Requested IDs that were never met in the input are printed to standard
    output, one per line, after the output file has been written.

    Args:
        input_file: path of the file to filter.
        output_file: path of the file to write.
        in_accnos: requested IDs, either an open text handle (first
            ``accnos_sep``-separated field of every non-blank line) or an
            iterable of IDs.
        type: ``'table'`` (tab-separated), ``'fasta'``, ``'fastq'`` or
            ``'hmm'``. Any other value produces an empty output file.
        process: ``'get'`` keeps only the requested records, ``'remove'``
            keeps only the others.
        col: 1-based column holding the record ID in ``'table'`` input.
        head: 1-based number of the line that is always copied in ``'table'``
            input.
        accnos_sep: separator passed to ``str.split`` for accession lines.
    """
    if isinstance(in_accnos, IOBase):
        # Blank lines are skipped: splitting an empty string on whitespace
        # yields no field at all.
        d_accnos = {
            line.strip().split(accnos_sep)[0]
            for line in in_accnos if line.strip()
        }
    else:  # any iterable of IDs
        d_accnos = set(in_accnos)

    # Requested IDs that were actually found in the input.
    seen = set()
    with open(output_file, 'w') as f:
        if type == 'table':
            with open(input_file, 'r') as table:
                for line_no, line in enumerate(table, 1):
                    if line_no == head:
                        f.write(line)
                        continue
                    record_id = line.strip().split('\t')[col - 1]
                    listed = record_id in d_accnos
                    if process == 'get':
                        if listed:
                            f.write(line)
                            seen.add(record_id)
                    elif process == 'remove':
                        if listed:
                            seen.add(record_id)
                        else:
                            f.write(line)
        elif type == 'fasta' or type == 'fastq':
            with open(input_file) as seq_handle:
                for seq_record in SeqIO.parse(seq_handle, type):
                    record_id = seq_record.id
                    listed = record_id in d_accnos
                    if process == 'get':
                        if listed:
                            SeqIO.write(seq_record, f, type)
                            seen.add(record_id)
                    elif process == 'remove':
                        if listed:
                            seen.add(record_id)
                        else:
                            SeqIO.write(seq_record, f, type)
        elif type == 'hmm':
            from HMMER import HMMParser
            with open(input_file) as hmm_handle:
                for rc in HMMParser(hmm_handle):
                    matched = rc.NAME in d_accnos or rc.ACC in d_accnos
                    if process == 'get':
                        if matched:
                            rc.write(f)
                            seen.update([rc.NAME, rc.ACC])
                    elif process == 'remove':
                        # As for the other types, the removed (matched)
                        # records are the ones marked as found.
                        if matched:
                            seen.update([rc.NAME, rc.ACC])
                        else:
                            rc.write(f)

    for not_get_id in d_accnos - seen:
        print(not_get_id)