Ejemplo n.º 1
0
def main():
    try:
        invcf = sys.argv[1]

    except:
        print "<invcf>"
        print "EG.: csv_list_multicolumn.py.py 1001genomes_snp-short-indel_only_ACGTN.vcf.gz"
        sys.exit(1)

    print "input vcf              %s" % invcf

    checkfile(invcf)

    names = None
    with openfile(invcf, 'r') as fhdi:
        with open(invcf + '.list.csv', 'wb') as fhdo:
            writer = csv.writer(fhdo, delimiter='\t', quotechar='"')
            for line in fhdi:
                line = line.strip()

                if len(line) == 0:
                    continue

                if line.startswith("#"):  # header
                    print "HEADER", line

                    if line.startswith("##"):  # definition lines
                        print "HEADER :: DEF", line

                    else:  # column description
                        print "HEADER :: COL", line

                        cols = line.split("\t")
                        num_cols = len(cols)
                        shared = cols[:
                                      9]  #CHROM    POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT
                        names = cols[9:]

                        print "HEADER :: COL :: SHARED", shared
                        print "HEADER :: COL :: NAMES", names

                        for ln, name in enumerate(names):
                            cols = ["1", "%s|%d" % (invcf, ln + 1), name]

                            writer.writerow(cols)

                        break
def main():
    try:
        invcf  = sys.argv[1]

    except:
        print "<invcf>"
        print "EG.: csv_list_multicolumn.py.py 1001genomes_snp-short-indel_only_ACGTN.vcf.gz"
        sys.exit(1)

    print "input vcf              %s" % invcf

    checkfile(invcf)

    names = None
    with openfile(invcf, 'r') as fhdi:
        with open(invcf + '.list.csv', 'wb') as fhdo:
            writer = csv.writer(fhdo, delimiter='\t', quotechar='"')
            for line in fhdi:
                line = line.strip()

                if len(line) == 0:
                    continue

                if line.startswith("#"): # header
                    print "HEADER", line

                    if line.startswith("##"): # definition lines
                        print "HEADER :: DEF", line

                    else: # column description
                        print "HEADER :: COL", line

                        cols     = line.split("\t")
                        num_cols = len(cols)
                        shared   = cols[:9] #CHROM    POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT
                        names    = cols[9:]

                        print "HEADER :: COL :: SHARED", shared
                        print "HEADER :: COL :: NAMES" , names

                        for ln, name in enumerate(names):
                            cols = ["1", "%s|%d" % (invcf, ln+1), name]

                            writer.writerow(cols)

                        break
 def open(self):
     """
     Open the GFF input named by ``self.infile`` for reading and keep the
     resulting handle in ``self.fhd``.
     """
     handle = filemanager.openfile(self.infile, 'r')
     self.fhd = handle
Ejemplo n.º 4
0
def main(incsv, translation_str):
    outfile    = incsv + '.vcf.gz'


    if not os.path.exists( incsv ):
        print "input file does not exists. quitting like a whimp"
        sys.exit( 1 )

    print "reading %s" % incsv


    translation = {}
    if translation_str is not None:
        for pair in translation_str.split(';'):
            src, dst = pair.split(':')
            assert src not in translation
            translation[ src ] = dst

        print "Translation", translation


    if os.path.exists( outfile ):
        print "output file %s exists. quitting like a whimp" % outfile
        sys.exit( 1 )

    print "saving to %s" % outfile

    data       = vcfHeap(translation=translation)

    cfh = openfile(incsv, 'r')
    for line in cfh:
        if line[0] == "#": continue
        line = line.strip()
        cols = line.split('\t')

        assert len(cols) >= 2

        print cols, cols[:3]
        data.addFile(*cols[:3])





    mfh = openvcffile(outfile + '.tmp.vcf.gz', 'w', compresslevel=1)

    mfh.write( data.getVcfHeader() )

    mfh.flush()


    num_lines   = 0
    print_every = 1000
    lines       = []
    while not data.isempty():
        val = data.next()

        if val is not None: # if not empty
            lines.append( str( val ) )

            if len( lines ) % (print_every/100) == 0:
                sys.stdout.write('.'                   )
                #sys.stdout.write(' {:14,d}\n'.format(len(lines)))
                #break

                if len( lines ) % print_every == 0:
                    num_lines += len( lines )
                    sys.stdout.write(' {:14,d}\n'.format(num_lines))
                    #break

                    mfh.write( "".join( lines ) )
                    mfh.flush()
                    lines = []

                sys.stdout.flush()

        else:
            print "val is empty"
            break

    mfh.write( "".join( lines ) )
    mfh.flush()
    mfh.close()

    num_lines += len( lines )

    lines = []

    sys.stdout.write('\nTotal {:14,d}\n'.format(num_lines))

    os.rename(outfile + '.tmp.vcf.gz', outfile)

    print "Finished"

    return outfile
Ejemplo n.º 5
0
def main(args):
    parser = argparse.ArgumentParser(description='Simplify merged VCF file.')
    parser.add_argument('-i',
                        '--input',
                        dest='input',
                        required=True,
                        nargs='?',
                        type=str,
                        help='Input file')
    parser.add_argument('-o',
                        '--output',
                        dest='output',
                        default=None,
                        nargs='?',
                        type=str,
                        help='Output file')
    parser.add_argument('-t',
                        '--table',
                        dest='table',
                        default=None,
                        nargs='?',
                        type=str,
                        help='Input table')
    parser.add_argument('-k',
                        '--keys',
                        dest='keys',
                        default=None,
                        nargs='?',
                        type=str,
                        help='Input keys')
    parser.add_argument('-v',
                        '--table-values',
                        dest='table_vs',
                        default=None,
                        nargs='?',
                        type=str,
                        help='Input table values')
    parser.add_argument(
        '-c',
        '--chromosome-translation',
        dest='translation',
        default=None,
        nargs='?',
        type=str,
        help='Translation table to chromosome names [e.g.: 1:Chr1;2:Chr2')
    parser.add_argument('-s',
                        '--samples',
                        dest='samples',
                        default=None,
                        nargs='?',
                        type=str,
                        help='Samples (Columns) to keep [e.g.: Spp1;Spp3;Spp5')
    parser.add_argument('-n',
                        '--keep-no-coverage',
                        dest='keep_no_coverage',
                        action='store_true',
                        help='Keep rows containing no coverage')
    parser.add_argument('-e',
                        '--keep-heterozygous',
                        dest='keep_heterozygous',
                        action='store_true',
                        help='Keep rows hoterozygosity')

    options = parser.parse_args(args)

    print "Options", options

    invcf = options.input

    try:
        checkfile(invcf)
        print "input vcf:              %s" % invcf

    except:
        parser.print_usage()
        #print "%s --input <invcf>" % sys.argv[0]
        print "EG.: %s --input 1001genomes_snp-short-indel_only_ACGTN.vcf.gz" % sys.argv[
            0]
        sys.exit(1)

    outbn = invcf
    if options.output is not None:
        outbn = options.output

    outbn += (".nc" if options.keep_no_coverage else
              "") + (".het" if options.keep_heterozygous else "")
    listFile = outbn + '.list.csv'
    vcfFile = outbn + '.list.csv.vcf.gz'
    outFile = outbn + '.list.csv.vcf.gz.simplified.vcf.gz'
    outFileTmp = outbn + '.list.csv.vcf.gz.simplified.tmp.vcf.gz'

    if os.path.exists(outFile):
        print "Out File (%s) EXISTS. quitting" % outFile
        sys.exit(1)

    print "Out File:               %s" % outFile

    try:
        intbl = options.table
        checkfile(intbl)
        print "Input Table: %s" % intbl

    except:
        intbl = None

    tbl_k = None
    if options.keys is not None:
        tbl_k = options.keys
        print "Input Table keys: %s" % tbl_k

    tbl_vs = None
    if options.table_vs is not None:
        tbl_vs = options.table_vs.split(',')
        print "Table values: %s" % options.table_vs

    data, atad = (None, None)
    if intbl:
        data, atad = get_translation(intbl, tbl_k, tbl_vs)
        print 'DATA', data
        print 'ATAD', atad

    translation = {}
    if options.translation is not None:
        for pair in options.translation.split(';'):
            src, dst = pair.split(':')
            assert src not in translation
            translation[src] = dst

        print "Translation", translation
    else:
        translation = None

    columns = None
    if options.samples is not None:
        columns = options.samples.split(';')
        assert len(columns) > 0, "No Columns %s" % str(columns)

    vcf_holder = vcf()
    names = None
    with openfile(invcf, 'r') as fhdi:
        with openvcffile(outFileTmp, 'w', compresslevel=1) as fhdv:
            vcf_holder.setFhd(fhdv)
            for line in fhdi:
                line = line.strip()

                if len(line) == 0:
                    continue

                if line.startswith("#"):  # header
                    print "HEADER", line

                    if line.startswith("##"):  # definition lines
                        print "HEADER :: DEF", line

                    else:  # column description
                        print "HEADER :: COL", line

                        cols = line.split("\t")
                        num_cols = len(cols)
                        shared = cols[:
                                      9]  #CHROM    POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT
                        names = cols[9:]

                        print "HEADER :: COL :: SHARED", shared
                        #print "HEADER :: COL :: NAMES" , names

                        if columns is not None:
                            cdiff = list(set(columns) - set(names))
                            assert len(
                                cdiff) == 0, "Unknown column name: %s" % (
                                    str(cdiff))

                        with open(listFile, 'wb') as fhdl:
                            writer = csv.writer(fhdl,
                                                delimiter='\t',
                                                quotechar='"')
                            for ln, name in enumerate(names):
                                if columns is not None:
                                    if name not in columns:
                                        continue

                                cols = ["1", "%s|%d" % (invcf, ln + 1), name]

                                if data is not None:
                                    assert name in data, "name %s not in db %s" % (
                                        name, str(data))

                                    #print "converting %s to %s" % (name, data[name])
                                    cols[2] = data[name]
                                    names[ln] = data[name]

                                #print "COLS", cols
                                writer.writerow(cols)

                        print "HEADER :: COL :: NAMES", names

                        vcf_holder.printVcfHeader(names)

                else:
                    cols = line.split("\t")

                    assert len(cols) > 9

                    info = cols[8]
                    assert ':' in info, line
                    assert 'GT' in info, line

                    #print "has desc"
                    infoC = info.split(':')
                    assert len(infoC) > 1
                    #print "  info" , info
                    #print "  infoC", infoC

                    gtpos = info.index('GT')
                    #print "  GT pos", gtpos

                    register = {
                        'chrom': cols[0],
                        'pos': int(cols[1]),
                        'src': cols[3],
                        'dst': cols[4],
                        'desc': {},
                        'stats': {
                            'unphased': 0,
                            'phased': 0,
                            'gap': 0,
                            'ref': 0,
                            'h**o': 0,
                            'het': 0,
                            'x_mnp_ref': 0,
                            'x_mnp_alt': 0,
                            'x_gap': 0,
                            'x_het': 0
                        }
                    }

                    if len(cols[3]) > 1:
                        print "MNP ref", cols[3]
                        vcf_holder.add_stat(cols[0], 'x_mnp_ref', 1)
                        continue

                    if any([len(x) != 1 for x in cols[4].split(',')]):
                        print "MNP alt", cols[4]
                        vcf_holder.add_stat(cols[0], 'x_mnp_alt', 1)
                        continue

                    #descs             =     cols[9:]
                    has_gap = False
                    is_het = False

                    for colNum, desc in enumerate(cols[9:]):
                        colname = names[colNum]

                        if columns is not None:
                            if colname not in columns:
                                continue

                        if (desc == './.') or (desc == '.'):
                            if not options.keep_no_coverage:
                                vcf_holder.add_stat(cols[0], 'x_gap', 1)
                                has_gap = True
                                break

                            else:
                                register['stats']['gap'] += 1
                                continue

                        assert ':' in desc, desc + " " + str(cols[9:])

                        descC = desc.split(":")
                        assert len(descC) > 1
                        #print "  desc" , desc
                        #print "  descC", descC

                        #assert len(infoC) == len(descC), str(infoC) + " " + str(descC) + " " + str(cols[9:])
                        if len(infoC) != len(descC):
                            if not options.keep_no_coverage:
                                vcf_holder.add_stat(cols[0], 'x_gap', 1)
                                has_gap = True
                                break

                            else:
                                register['stats']['gap'] += 1
                                continue

                        #print "   len infoC == len descC", infoC, descC

                        gtDesc = descC[gtpos]
                        gt0, gt1 = (None, None)

                        if '/' in gtDesc:
                            gt0, gt1 = gtDesc.split('/')
                            register['stats']['unphased'] += 1

                        elif '|' in gtDesc:
                            gt0, gt1 = gtDesc.split('|')
                            register['stats']['phased'] += 1

                        else:
                            assert False, 'unknown info fomat: %s (%s, %s)' % (
                                gtDesc, info, desc)

                        if gt0 == '.' or gt1 == '.':  # skip no coverage
                            #sys.stdout.write('.')
                            if not options.keep_no_coverage:
                                register['stats']['uncalled'] += 1
                                has_gap = True
                                break
                            else:
                                vcf_holder.add_stat(cols[0], 'x_gap', 1)
                                continue

                        else:
                            if len(set([gt0, gt1])) == 1:
                                #sys.stdout.write('o')
                                register['stats']['h**o'] += 1

                                if (gt0 == '0'
                                    ):  # homozygous identical to reference
                                    register['stats']['ref'] += 1
                                    continue
                                    #register['desc' ].append( names[colNum] )

                            else:
                                #sys.stdout.write('e')
                                if not options.keep_heterozygous:
                                    vcf_holder.add_stat(cols[0], 'x_het', 1)
                                    is_het = True
                                    break
                                else:
                                    register['stats']['het'] += 1
                                    continue

                            dstC = register['dst'].split(',')
                            nuc0 = register['src'] if gt0 == '0' else dstC[
                                int(gt0) - 1]
                            nuc1 = register['src'] if gt1 == '0' else dstC[
                                int(gt1) - 1]
                            nucK = (nuc0, nuc1)

                            if nucK not in register['desc']:
                                register['desc'][nucK] = []

                            register['desc'][nucK].append(names[colNum])

                            #if gt0 == '0' or gt1 == '0': # if heretozygous and has reference, make it explicit
                            #    #sys.stdout.write('H')
                            #    alts = sorted(list(set(register['src'  ].split(",") + register['dst'  ].split(","))))
                            #    alts = [ a for a in alts if a != '.' ]
                            #    register['dst'  ] = ",".join(alts)
                            #    #print "   added  src to dst", self.register['dst'  ]

                            #register['desc' ].append( names[colNum] )

                    #sys.stdout.flush()

                    if has_gap:
                        continue

                    if is_het:
                        continue

                    if len(register['desc']) > 0:
                        #print '+\n'

                        if translation:
                            register['chrom'] = translation.get(
                                register['chrom'], register['chrom'])

                        descs = deepcopy(register['desc'])
                        for desc in descs:
                            register['desc'] = '|'.join(descs[desc])

                            if len(set(desc)) == 1:
                                desc = desc[0]

                            register['dst'] = ','.join(sorted(list(set(desc))))
                            vcf_holder.printRegister(register)

                    else:
                        #print '-'
                        pass

            fhdv.flush()

        print "\nGLOBAL STATS"
        print 'Global Stats    :', " ".join([
            "{:s}: {:10,d}".format(*i)
            for i in sorted(vcf_holder.stats.items())
        ])

        os.rename(outFileTmp, outFile)

    os.utime(listFile, None)

    if not os.path.exists(vcfFile):
        os.symlink(invcf, vcfFile)

    os.utime(vcfFile, None)

    os.utime(outFile, None)
Ejemplo n.º 6
0
def main():
    try:
        infile = os.sys.argv[1]
    except:
        print "no input file given"
        print sys.argv[0], "<INPUT MULTICOLUMN CSV>"
        sys.exit(1)

    checkfile(infile)

    print "splitting %s" % infile
    defs = []
    names = []
    outfiles = []
    valid = 0
    skipped = 0
    lastCol = ""
    num_cols = None
    line_count = 0
    with openfile(infile, 'r') as fhd:
        for line in fhd:
            line = line.strip()

            if len(line) == 0:
                continue

            if line.startswith("#"):  # header
                print "HEADER", line

                if line.startswith("##"):  # definition lines
                    print "HEADER :: DEF", line
                    defs.append(line)

                else:  # column description
                    print "HEADER :: COL", line

                    cols = line.split("\t")
                    num_cols = len(cols)
                    shared = cols[:
                                  9]  #CHROM    POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMA
                    names = cols[9:]

                    # Project specific method, used to deal with sample names that contain "_"
                    # May be changed later
                    """
                    By default, you should make sure that sample name does not contain "_" or ".", 
                    and len(sample_name) + len(chromosome_name) <= 31.
                    If that is not the case then sample name
                    and/or chromosome name may have to be modified 
                    (like this method)
                    """
                    newnames = []
                    for idx, name in enumerate(names):
                        newname = name
                        if "_" in newname:
                            newname = newname.rpartition("_")[0].replace(
                                "_", "")
                        if "." in newname:
                            newname = newname.partition(".")[0]
                        newnames.append(newname)
                    names = newnames

                    print "HEADER :: COL :: SHARED", shared
                    print "HEADER :: COL :: NAMES", names

                    outfiles = [None] * len(names)
                    outlist = open("%s.lst" % infile, 'w')
                    for np, name in enumerate(names):
                        nof = ("%s.%0" + str(len("%d" % len(names))) +
                               "d.%s.vcf.gz") % (infile, np + 1,
                                                 sanitize(name))
                        print("creating %" + str(len("%d" % len(names))) +
                              "d %-" + str(max([len(x) for x in names])) +
                              "s to %s") % (np + 1, name, nof)
                        nop = openfile(nof, 'w')

                        #                               skipped valid
                        outfiles[np] = [name, nof, nop, 0, 0]

                        outlist.write("1\t%s\t%s\n" %
                                      (os.path.abspath(nof), name))

                        nop.write("\n".join(defs) + "\n")
                        nop.write("##Split from: %s column %d\n" %
                                  (os.path.abspath(infile), np + 1))
                        nop.write("\t".join(shared))
                        nop.write("\t%s\n" % name)
                        nop.flush()

                continue

            line_count += 1

            if line_count % 1000 == 0:
                sys.stdout.write('.')
                if line_count % 100000 == 0:
                    sys.stdout.write(' lines %12d valid %12d skipped %12d\n' %
                                     (line_count, valid, skipped))
                    for nop, ndata in enumerate(outfiles):
                        ndata[2].flush()
                sys.stdout.flush()

            #print "DATA", line
            cols = line.split("\t")
            assert len(cols) == num_cols

            ref = cols[3]
            alts = cols[4].split(',')
            if len(ref) > 1 or any([len(x) > 1
                                    for x in alts]):  # exclude MNP or indel
                # print "Excluded line: " + line
                continue

            shared = cols[:
                          9]  #CHROM    POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMA
            shared_str = "\t".join(shared)
            data = cols[9:]  #used to be " + "\t""

            if cols[0] != lastCol:
                print '\nChromosome', cols[0]
                lastCol = cols[0]

            #print "shared", shared
            #print "data"  , data
            for pos, ndata in enumerate(data):
                #outfiles[np] = [name, nof, 0, 0, nop]
                res = [ndata.startswith(x) for x in targets]
                if any(res):
                    shared[4] = alts[res.index(True)]
                    shared_str = "\t".join(shared)
                    valid += 1
                    outfiles[pos][4] += 1  # valid
                else:
                    skipped += 1
                    outfiles[pos][3] += 1  # skipped
                    continue

                outfiles[pos][2].write(shared_str + "\t" + ndata + "\n")

    for nop, ndata in enumerate(outfiles):
        ndata[2].close()
        print("closing %" + str(len("%d" % len(outfiles))) + "d %-" +
              str(max([len(x[0]) for x in outfiles])) + "s :: %-" +
              str(max([len(x[1]) for x in outfiles])) +
              "s :: skipped %6d exported %6d total %7d") % (
                  nop + 1, ndata[0], ndata[1], ndata[3], ndata[4],
                  ndata[3] + ndata[4])
def main():
    try:
        infile = os.sys.argv[1]
    except:
        print "no input file given"
        print sys.argv[0], "<INPUT MULTICOLUMN CSV>"
        sys.exit(1)

    checkfile(infile)

    print "splitting %s" % infile
    defs = []
    names = []
    outfiles = []
    valid = 0
    skipped = 0
    lastCol = ""
    num_cols = None
    line_count = 0
    with openfile(infile, 'r') as fhd:
        for line in fhd:
            line = line.strip()

            if len(line) == 0:
                continue

            if line.startswith("#"):  # header
                print "HEADER", line

                if line.startswith("##"):  # definition lines
                    print "HEADER :: DEF", line
                    defs.append(line)

                else:  # column description
                    print "HEADER :: COL", line

                    cols = line.split("\t")
                    num_cols = len(cols)
                    shared = cols[:
                                  9]  #CHROM    POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMA
                    names = cols[9:]

                    print "HEADER :: COL :: SHARED", shared
                    print "HEADER :: COL :: NAMES", names

                    outfiles = [None] * len(names)
                    outlist = open("%s.lst" % infile, 'w')
                    for np, name in enumerate(names):
                        nof = ("%s_%0" + str(len("%d" % len(names))) +
                               "d_%s.vcf.gz") % (infile, np + 1,
                                                 sanitize(name))
                        print("creating %" + str(len("%d" % len(names))) +
                              "d %-" + str(max([len(x) for x in names])) +
                              "s to %s") % (np + 1, name, nof)
                        nop = openfile(nof, 'w')

                        #                               skipped valid
                        outfiles[np] = [name, nof, nop, 0, 0]

                        outlist.write("1\t%s\t%s\n" %
                                      (os.path.abspath(nof), name))

                        nop.write("\n".join(defs) + "\n")
                        nop.write("##Split from: %s column %d\n" %
                                  (os.path.abspath(infile), np + 1))
                        nop.write("\t".join(shared))
                        nop.write("\t%s\n" % name)
                        nop.flush()

                continue

            line_count += 1

            if line_count % 1000 == 0:
                sys.stdout.write('.')
                if line_count % 100000 == 0:
                    sys.stdout.write(' lines %12d valid %12d skipped %12d\n' %
                                     (line_count, valid, skipped))
                    for nop, ndata in enumerate(outfiles):
                        ndata[2].flush()
                sys.stdout.flush()

            #print "DATA", line
            cols = line.split("\t")
            assert len(cols) == num_cols
            shared = cols[:
                          9]  #CHROM    POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMA
            shared_str = "\t".join(shared) + "\t"
            data = cols[9:]

            if cols[0] != lastCol:
                print '\nChromosome', cols[0]
                lastCol = cols[0]

            #print "shared", shared
            #print "data"  , data
            for pos, ndata in enumerate(data):
                #outfiles[np] = [name, nof, 0, 0, nop]
                if any([ndata.startswith(x) for x in ignores]):
                    skipped += 1
                    outfiles[pos][3] += 1  # skipped
                    continue

                valid += 1
                outfiles[pos][4] += 1  # valid
                outfiles[pos][2].write(shared_str + "\t" + ndata + "\n")

    for nop, ndata in enumerate(outfiles):
        ndata[2].close()
        print("closing %" + str(len("%d" % len(outfiles))) + "d %-" +
              str(max([len(x[0]) for x in outfiles])) + "s :: %-" +
              str(max([len(x[1]) for x in outfiles])) +
              "s :: skipped %6d exported %6d total %7d") % (
                  nop + 1, ndata[0], ndata[1], ndata[3], ndata[4],
                  ndata[3] + ndata[4])
Ejemplo n.º 8
0
 def open(self):
     """
     Open the GFF input named by ``self.infile`` for reading and keep the
     resulting handle in ``self.fhd``.
     """
     handle = filemanager.openfile(self.infile, 'r')
     self.fhd = handle
def main():
    """
    Split a multi-sample VCF into one VCF file per sample column.

    The input path is taken from ``sys.argv[1]``.  ``##`` definition lines
    are collected and copied into every output file; the single-``#`` column
    line defines the nine shared columns plus one sample name per remaining
    column.  For every sample an output file named
    ``<infile>_<NN>_<sanitized name>.vcf.gz`` is created and registered in
    ``<infile>.lst`` as ``1<TAB><absolute path><TAB><name>``.

    A data row is written to a sample's file unless that sample's value
    starts with one of the module-level ``ignores`` prefixes.  Per-sample
    skipped/exported counters are printed when the files are closed.

    Exits with status 1 when no input path is given.  Raises AssertionError
    when a data row's column count differs from the column line's (or when
    a data row appears before any column line).
    """
    try:
        infile = sys.argv[1]
    except IndexError:
        print("no input file given")
        print("%s %s" % (sys.argv[0], "<INPUT MULTICOLUMN CSV>"))
        sys.exit(1)

    checkfile(infile)

    print("splitting %s" % infile)
    defs = []
    # One entry per sample column: [name, path, handle, skipped, valid]
    outfiles = []
    valid = 0
    skipped = 0
    lastCol = ""
    num_cols = None
    line_count = 0
    # str.startswith accepts a tuple, so build it once for the hot loop.
    ignore_prefixes = tuple(ignores)

    with openfile(infile, 'r') as fhd:
        for line in fhd:
            line = line.strip()

            if not line:
                continue

            if line.startswith("#"):  # header
                print("HEADER %s" % line)

                if line.startswith("##"):  # definition lines
                    print("HEADER :: DEF %s" % line)
                    defs.append(line)

                else:  # column description
                    print("HEADER :: COL %s" % line)

                    cols = line.split("\t")
                    num_cols = len(cols)
                    # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
                    shared = cols[:9]
                    names = cols[9:]

                    print("HEADER :: COL :: SHARED %s" % (shared,))
                    print("HEADER :: COL :: NAMES %s" % (names,))

                    # Loop invariants: field widths and the text that is
                    # identical in every output file.
                    idx_width = str(len("%d" % len(names)))
                    name_width = str(max([len(x) for x in names] or [0]))
                    nof_fmt = "%s_%0" + idx_width + "d_%s.vcf.gz"
                    log_fmt = ("creating %" + idx_width + "d %-" +
                               name_width + "s to %s")
                    defs_header = "\n".join(defs) + "\n"
                    shared_header = "\t".join(shared)
                    abs_infile = os.path.abspath(infile)

                    outfiles = [None] * len(names)
                    # The list file is only written here, so close it as
                    # soon as every sample has been registered.
                    with open("%s.lst" % infile, 'w') as outlist:
                        for col_idx, name in enumerate(names):
                            nof = nof_fmt % (infile, col_idx + 1,
                                             sanitize(name))
                            print(log_fmt % (col_idx + 1, name, nof))
                            nop = openfile(nof, 'w')

                            #                               skipped valid
                            outfiles[col_idx] = [name, nof, nop, 0, 0]

                            outlist.write("1\t%s\t%s\n" %
                                          (os.path.abspath(nof), name))

                            nop.write(defs_header)
                            nop.write("##Split from: %s column %d\n" %
                                      (abs_infile, col_idx + 1))
                            nop.write(shared_header)
                            nop.write("\t%s\n" % name)
                            nop.flush()

                continue

            line_count += 1

            # Progress: a dot every 1000 rows, a summary (and a flush of all
            # outputs) every 100000 rows.
            if line_count % 1000 == 0:
                sys.stdout.write('.')
                if line_count % 100000 == 0:
                    sys.stdout.write(
                        ' lines %12d valid %12d skipped %12d\n' %
                        (line_count, valid, skipped))
                    for entry in outfiles:
                        entry[2].flush()
                sys.stdout.flush()

            cols = line.split("\t")
            assert len(cols) == num_cols
            # NOTE(review): this puts TWO tabs between the FORMAT column and
            # the sample value, while the column line written above uses one.
            # Kept byte-identical because the readers of these files are not
            # visible here -- confirm whether the extra empty column is
            # intended.
            row_prefix = "\t".join(cols[:9]) + "\t\t"
            data = cols[9:]

            if cols[0] != lastCol:
                print("\nChromosome %s" % cols[0])
                lastCol = cols[0]

            for pos, ndata in enumerate(data):
                entry = outfiles[pos]
                if ndata.startswith(ignore_prefixes):
                    skipped += 1
                    entry[3] += 1  # skipped
                    continue

                valid += 1
                entry[4] += 1  # valid
                entry[2].write(row_prefix + ndata + "\n")

    closing_fmt = ("closing %" + str(len("%d" % len(outfiles))) + "d %-" +
                   str(max([len(x[0]) for x in outfiles] or [0])) +
                   "s :: %-" +
                   str(max([len(x[1]) for x in outfiles] or [0])) +
                   "s :: skipped %6d exported %6d total %7d")
    for file_idx, entry in enumerate(outfiles):
        name, nof, handle, n_skipped, n_valid = entry
        handle.close()
        print(closing_fmt % (file_idx + 1, name, nof, n_skipped, n_valid,
                             n_skipped + n_valid))
Ejemplo n.º 10
0
def main(incsv, translation_str):
    """
    Merge the VCF files listed in *incsv* into ``<incsv>.vcf.gz``.

    incsv           -- path of a tab-separated list file.  Lines whose first
                       character is ``#`` and blank lines are skipped; every
                       other line must have at least two columns, and its
                       first three columns are passed to ``vcfHeap.addFile``.
    translation_str -- ``None`` or a ``src:dst;src:dst`` string that is
                       turned into a dict and handed to ``vcfHeap``.  Empty
                       pairs (e.g. from a trailing ``;``) are ignored.

    Records are pulled from the heap, buffered, and written in batches to
    ``<outfile>.tmp.vcf.gz``, which is renamed to the final name only after
    it has been written and closed.

    Returns the output path.  Exits with status 1 when *incsv* does not
    exist or the output file already exists.  Raises AssertionError on a
    duplicated translation source or a list line with fewer than two
    columns.
    """
    outfile = incsv + '.vcf.gz'
    tmpfile = outfile + '.tmp.vcf.gz'

    if not os.path.exists(incsv):
        print("input file does not exists. quitting like a whimp")
        sys.exit(1)

    print("reading %s" % incsv)

    translation = {}
    if translation_str is not None:
        for pair in translation_str.split(';'):
            if not pair:
                # Tolerate "" and a trailing ';' instead of failing on unpack.
                continue
            src, dst = pair.split(':')
            assert src not in translation
            translation[src] = dst

        print("Translation %s" % (translation,))

    if os.path.exists(outfile):
        print("output file %s exists. quitting like a whimp" % outfile)
        sys.exit(1)

    print("saving to %s" % outfile)

    data = vcfHeap(translation=translation)

    cfh = openfile(incsv, 'r')
    try:
        for line in cfh:
            # Comment test is done on the raw line, before stripping.
            if line.startswith("#"):
                continue

            line = line.strip()
            if not line:
                continue

            cols = line.split('\t')

            assert len(cols) >= 2

            print("%s %s" % (cols, cols[:3]))
            data.addFile(*cols[:3])
    finally:
        cfh.close()

    num_lines = 0
    print_every = 1000
    # Integer division keeps the dot interval an int on any Python version.
    dot_every = print_every // 100
    lines = []

    mfh = openvcffile(tmpfile, 'w', compresslevel=1)
    try:
        mfh.write(data.getVcfHeader())
        mfh.flush()

        while not data.isempty():
            val = data.next()

            if val is None:
                print("val is empty")
                break

            lines.append(str(val))

            if len(lines) % dot_every == 0:
                sys.stdout.write('.')

                # Every print_every buffered records: report, write, reset.
                if len(lines) % print_every == 0:
                    num_lines += len(lines)
                    sys.stdout.write(' {:14,d}\n'.format(num_lines))

                    mfh.write("".join(lines))
                    mfh.flush()
                    lines = []

                sys.stdout.flush()

        # Write whatever is left in the buffer after the last full batch.
        mfh.write("".join(lines))
        mfh.flush()
    finally:
        mfh.close()

    num_lines += len(lines)

    sys.stdout.write('\nTotal {:14,d}\n'.format(num_lines))

    os.rename(tmpfile, outfile)

    print("Finished")

    return outfile