Example #1
def main():
    try:
        invcf = sys.argv[1]

    except:
        print "<invcf>"
        print "EG.: csv_list_multicolumn.py.py 1001genomes_snp-short-indel_only_ACGTN.vcf.gz"
        sys.exit(1)

    print "input vcf              %s" % invcf

    checkfile(invcf)

    names = None
    with openfile(invcf, 'r') as fhdi:
        with open(invcf + '.list.csv', 'wb') as fhdo:
            writer = csv.writer(fhdo, delimiter='\t', quotechar='"')
            for line in fhdi:
                line = line.strip()

                if len(line) == 0:
                    continue

                if line.startswith("#"):  # header
                    print "HEADER", line

                    if line.startswith("##"):  # definition lines
                        print "HEADER :: DEF", line

                    else:  # column description
                        print "HEADER :: COL", line

                        cols = line.split("\t")
                        num_cols = len(cols)
                        shared = cols[:9]  # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
                        names = cols[9:]

                        print "HEADER :: COL :: SHARED", shared
                        print "HEADER :: COL :: NAMES", names

                        for ln, name in enumerate(names):
                            cols = ["1", "%s|%d" % (invcf, ln + 1), name]

                            writer.writerow(cols)

                        break
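
The snippets in these examples lean on a few project helpers (checkfile, openfile, openvcffile) that are defined elsewhere and not shown. A minimal sketch of the first two, assuming openfile merely switches between gzip-compressed and plain-text handles (the real implementations may differ):

import gzip
import os
import sys


def checkfile(infile):
    # Hypothetical helper: abort if the path is missing, otherwise hand it back.
    if infile is None or not os.path.isfile(infile):
        sys.stderr.write("input file %s not found\n" % infile)
        sys.exit(1)
    return infile


def openfile(infile, mode, compresslevel=9):
    # Hypothetical helper: route *.gz through gzip, everything else through open().
    if infile.endswith('.gz'):
        return gzip.open(infile, mode + 'b', compresslevel)
    return open(infile, mode)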
Example #4
    def __init__(self, infile, filedesc, filecare, fileCol):
        self.infile = infile
        self.filedesc = filedesc
        self.filecare = filecare
        self.fileCol = fileCol

        checkfile(infile)

        #print "  opening %s" % infile
        self.names = []
        self.infhd = openvcffile(infile, 'r')
        self.state = FHDOPEN
        self.currLine = ""
        self.register = vcfRegister()
        self.register['filename'] = infile
        self.register['filedesc'] = filedesc
        self.register['filecare'] = filecare
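
vcfRegister and the FHDOPEN state constant also come from elsewhere in the project. For reading this constructor it is enough to treat vcfRegister as a dictionary of per-file metadata; a placeholder sketch (names kept, values purely illustrative):

FHDOPEN = 'open'  # hypothetical state flag


class vcfRegister(dict):
    # Hypothetical stand-in: a plain dict keyed by 'filename', 'filedesc', 'filecare', ...
    pass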
def main(args):
    parser = argparse.ArgumentParser(description='Merge VCF files.')
    parser.add_argument('-k', '--allowed-duplicated-keys' , dest='allowed_dup_keys', default=['NU'], action='store'      , nargs='+', metavar='key'       ,           help='Which info keys should be counted despite being duplicated at a given position [NU]')
    parser.add_argument('-a', '--add-all'                 , dest='add_all'         , default=False , action='store_true' ,                                            help='Add all instead of filtering (>1)')
    parser.add_argument('-d', '--debug'                   , dest='debug'           , default=False , action='store_true' ,                                            help='Run only first three chromosomes')
    parser.add_argument('-t', '--threads'                 , dest='threads'         , default=0     , action='store'      ,                                  type=int, help='Number of threads (default: number of chromosomes)')

    parser.add_argument('-i', '--input'                   , dest='iinput'          , default=None                        ,                                            help='Input file')
    parser.add_argument('input'                           ,                          default=None  , action='store'      , nargs='?', metavar='input file',           help='Input file')

    options = parser.parse_args(args)


    allowed_dup_keys     =     options.allowed_dup_keys
    add_only_significant = not options.add_all
    debug                =     options.debug
    threads              =     options.threads
    #debug                = True

    #TODO: ACCEPT ARGUMENT
    significancy_filter  =  [
                                #['NV', 'gt', 1],
                                #['NW', 'gt', 1],
                                ['NS', 'gt', 1],
                                #['NT', 'gt', 1]
                                ['NU', 'gt', 1]
                            ]

    print args

    infile    = getOptionInFile(options, parser)
    print "infile", infile
    infile    = checkfile(infile)
    indexFile = infile + ".idx"

    print "Allowed Duplicated Keys: %s" % str(allowed_dup_keys    )
    print "Add Only Significant   : %s" % str(add_only_significant)
    print "Debug                  : %s" % str(debug               )
    print "Input File             : %s" % infile
    print "Index File             : %s (exists: %s)" % (indexFile, os.path.exists(indexFile) )

    if not os.path.exists( indexFile ):
        makeIndexFile( indexFile, infile )

    idx        = readIndex(indexFile)

    reportName = infile + '.report.csv'

    titles     = getTitle(infile)

    nfostats   = infostats(reportName, titles)

    if threads == 0:
        threads = len(idx)

    pool    = multiprocessing.Pool(processes=threads)
    #pool    = multiprocessing.Pool(processes=1)
    results = []

    for chrom, pos in sorted(idx.items(), key=lambda item: item[1]):
        if len(results) > 3 and debug:
            print "debug. breaking"
            break
        results.append( pool.apply_async( readParsel, [reportName, idx, infile, chrom], {'allowed_dup_keys':allowed_dup_keys, 'add_only_significant':add_only_significant, 'debug':debug, 'significancy_filter':significancy_filter} ) )

    while len(results) > 0:
        for res in results:
            try:
                #print "getting result"
                nfostats += res.get( 5 )
                results.remove( res )
                print "getting result OK"

            except multiprocessing.TimeoutError:
                #print "getting result FAILED. waiting"
                pass

    nfostats.export()
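
main() here takes the argument list explicitly, so the script is presumably driven by a small entry-point block; a typical sketch (not shown in the original):

if __name__ == '__main__':
    # Assumed driver: pass everything after the script name to main().
    # readParsel has to be a module-level function so that multiprocessing can
    # pickle it when apply_async hands the call to a worker process.
    main(sys.argv[1:])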
Example #6
def main(args):
    parser = argparse.ArgumentParser(description='Simplify merged VCF file.')
    parser.add_argument('-i',
                        '--input',
                        dest='input',
                        required=True,
                        nargs='?',
                        type=str,
                        help='Input file')
    parser.add_argument('-o',
                        '--output',
                        dest='output',
                        default=None,
                        nargs='?',
                        type=str,
                        help='Output file')
    parser.add_argument('-t',
                        '--table',
                        dest='table',
                        default=None,
                        nargs='?',
                        type=str,
                        help='Input table')
    parser.add_argument('-k',
                        '--keys',
                        dest='keys',
                        default=None,
                        nargs='?',
                        type=str,
                        help='Input keys')
    parser.add_argument('-v',
                        '--table-values',
                        dest='table_vs',
                        default=None,
                        nargs='?',
                        type=str,
                        help='Input table values')
    parser.add_argument(
        '-c',
        '--chromosome-translation',
        dest='translation',
        default=None,
        nargs='?',
        type=str,
        help='Translation table to chromosome names [e.g.: 1:Chr1;2:Chr2]')
    parser.add_argument('-s',
                        '--samples',
                        dest='samples',
                        default=None,
                        nargs='?',
                        type=str,
                        help='Samples (Columns) to keep [e.g.: Spp1;Spp3;Spp5]')
    parser.add_argument('-n',
                        '--keep-no-coverage',
                        dest='keep_no_coverage',
                        action='store_true',
                        help='Keep rows containing no coverage')
    parser.add_argument('-e',
                        '--keep-heterozygous',
                        dest='keep_heterozygous',
                        action='store_true',
                        help='Keep heterozygous rows')

    options = parser.parse_args(args)

    print "Options", options

    invcf = options.input

    try:
        checkfile(invcf)
        print "input vcf:              %s" % invcf

    except:
        parser.print_usage()
        #print "%s --input <invcf>" % sys.argv[0]
        print "EG.: %s --input 1001genomes_snp-short-indel_only_ACGTN.vcf.gz" % sys.argv[
            0]
        sys.exit(1)

    outbn = invcf
    if options.output is not None:
        outbn = options.output

    outbn += (".nc" if options.keep_no_coverage else
              "") + (".het" if options.keep_heterozygous else "")
    listFile = outbn + '.list.csv'
    vcfFile = outbn + '.list.csv.vcf.gz'
    outFile = outbn + '.list.csv.vcf.gz.simplified.vcf.gz'
    outFileTmp = outbn + '.list.csv.vcf.gz.simplified.tmp.vcf.gz'

    if os.path.exists(outFile):
        print "Out File (%s) EXISTS. quitting" % outFile
        sys.exit(1)

    print "Out File:               %s" % outFile

    try:
        intbl = options.table
        checkfile(intbl)
        print "Input Table: %s" % intbl

    except:
        intbl = None

    tbl_k = None
    if options.keys is not None:
        tbl_k = options.keys
        print "Input Table keys: %s" % tbl_k

    tbl_vs = None
    if options.table_vs is not None:
        tbl_vs = options.table_vs.split(',')
        print "Table values: %s" % options.table_vs

    data, atad = (None, None)
    if intbl:
        data, atad = get_translation(intbl, tbl_k, tbl_vs)
        print 'DATA', data
        print 'ATAD', atad

    translation = {}
    if options.translation is not None:
        for pair in options.translation.split(';'):
            src, dst = pair.split(':')
            assert src not in translation
            translation[src] = dst

        print "Translation", translation
    else:
        translation = None

    columns = None
    if options.samples is not None:
        columns = options.samples.split(';')
        assert len(columns) > 0, "No Columns %s" % str(columns)

    vcf_holder = vcf()
    names = None
    with openfile(invcf, 'r') as fhdi:
        with openvcffile(outFileTmp, 'w', compresslevel=1) as fhdv:
            vcf_holder.setFhd(fhdv)
            for line in fhdi:
                line = line.strip()

                if len(line) == 0:
                    continue

                if line.startswith("#"):  # header
                    print "HEADER", line

                    if line.startswith("##"):  # definition lines
                        print "HEADER :: DEF", line

                    else:  # column description
                        print "HEADER :: COL", line

                        cols = line.split("\t")
                        num_cols = len(cols)
                        shared = cols[:9]  # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
                        names = cols[9:]

                        print "HEADER :: COL :: SHARED", shared
                        #print "HEADER :: COL :: NAMES" , names

                        if columns is not None:
                            cdiff = list(set(columns) - set(names))
                            assert len(cdiff) == 0, "Unknown column name: %s" % str(cdiff)

                        with open(listFile, 'wb') as fhdl:
                            writer = csv.writer(fhdl,
                                                delimiter='\t',
                                                quotechar='"')
                            for ln, name in enumerate(names):
                                if columns is not None:
                                    if name not in columns:
                                        continue

                                cols = ["1", "%s|%d" % (invcf, ln + 1), name]

                                if data is not None:
                                    assert name in data, "name %s not in db %s" % (name, str(data))

                                    #print "converting %s to %s" % (name, data[name])
                                    cols[2] = data[name]
                                    names[ln] = data[name]

                                #print "COLS", cols
                                writer.writerow(cols)

                        print "HEADER :: COL :: NAMES", names

                        vcf_holder.printVcfHeader(names)

                else:
                    cols = line.split("\t")

                    assert len(cols) > 9

                    info = cols[8]
                    assert ':' in info, line
                    assert 'GT' in info, line

                    #print "has desc"
                    infoC = info.split(':')
                    assert len(infoC) > 1
                    #print "  info" , info
                    #print "  infoC", infoC

                    gtpos = info.index('GT')
                    #print "  GT pos", gtpos

                    register = {
                        'chrom': cols[0],
                        'pos': int(cols[1]),
                        'src': cols[3],
                        'dst': cols[4],
                        'desc': {},
                        'stats': {
                            'unphased': 0,
                            'phased': 0,
                            'gap': 0,
                            'ref': 0,
                            'homo': 0,
                            'uncalled': 0,
                            'het': 0,
                            'x_mnp_ref': 0,
                            'x_mnp_alt': 0,
                            'x_gap': 0,
                            'x_het': 0
                        }
                    }

                    if len(cols[3]) > 1:
                        print "MNP ref", cols[3]
                        vcf_holder.add_stat(cols[0], 'x_mnp_ref', 1)
                        continue

                    if any([len(x) != 1 for x in cols[4].split(',')]):
                        print "MNP alt", cols[4]
                        vcf_holder.add_stat(cols[0], 'x_mnp_alt', 1)
                        continue

                    #descs             =     cols[9:]
                    has_gap = False
                    is_het = False

                    for colNum, desc in enumerate(cols[9:]):
                        colname = names[colNum]

                        if columns is not None:
                            if colname not in columns:
                                continue

                        if (desc == './.') or (desc == '.'):
                            if not options.keep_no_coverage:
                                vcf_holder.add_stat(cols[0], 'x_gap', 1)
                                has_gap = True
                                break

                            else:
                                register['stats']['gap'] += 1
                                continue

                        assert ':' in desc, desc + " " + str(cols[9:])

                        descC = desc.split(":")
                        assert len(descC) > 1
                        #print "  desc" , desc
                        #print "  descC", descC

                        #assert len(infoC) == len(descC), str(infoC) + " " + str(descC) + " " + str(cols[9:])
                        if len(infoC) != len(descC):
                            if not options.keep_no_coverage:
                                vcf_holder.add_stat(cols[0], 'x_gap', 1)
                                has_gap = True
                                break

                            else:
                                register['stats']['gap'] += 1
                                continue

                        #print "   len infoC == len descC", infoC, descC

                        gtDesc = descC[gtpos]
                        gt0, gt1 = (None, None)

                        if '/' in gtDesc:
                            gt0, gt1 = gtDesc.split('/')
                            register['stats']['unphased'] += 1

                        elif '|' in gtDesc:
                            gt0, gt1 = gtDesc.split('|')
                            register['stats']['phased'] += 1

                        else:
                            assert False, 'unknown info format: %s (%s, %s)' % (gtDesc, info, desc)

                        if gt0 == '.' or gt1 == '.':  # skip no coverage
                            #sys.stdout.write('.')
                            if not options.keep_no_coverage:
                                register['stats']['uncalled'] += 1
                                has_gap = True
                                break
                            else:
                                vcf_holder.add_stat(cols[0], 'x_gap', 1)
                                continue

                        else:
                            if len(set([gt0, gt1])) == 1:
                                #sys.stdout.write('o')
                                register['stats']['homo'] += 1

                                if gt0 == '0':  # homozygous identical to reference
                                    register['stats']['ref'] += 1
                                    continue
                                    #register['desc' ].append( names[colNum] )

                            else:
                                #sys.stdout.write('e')
                                if not options.keep_heterozygous:
                                    vcf_holder.add_stat(cols[0], 'x_het', 1)
                                    is_het = True
                                    break
                                else:
                                    register['stats']['het'] += 1
                                    continue

                            dstC = register['dst'].split(',')
                            nuc0 = register['src'] if gt0 == '0' else dstC[int(gt0) - 1]
                            nuc1 = register['src'] if gt1 == '0' else dstC[int(gt1) - 1]
                            nucK = (nuc0, nuc1)

                            if nucK not in register['desc']:
                                register['desc'][nucK] = []

                            register['desc'][nucK].append(names[colNum])

                            #if gt0 == '0' or gt1 == '0': # if heretozygous and has reference, make it explicit
                            #    #sys.stdout.write('H')
                            #    alts = sorted(list(set(register['src'  ].split(",") + register['dst'  ].split(","))))
                            #    alts = [ a for a in alts if a != '.' ]
                            #    register['dst'  ] = ",".join(alts)
                            #    #print "   added  src to dst", self.register['dst'  ]

                            #register['desc' ].append( names[colNum] )

                    #sys.stdout.flush()

                    if has_gap:
                        continue

                    if is_het:
                        continue

                    if len(register['desc']) > 0:
                        #print '+\n'

                        if translation:
                            register['chrom'] = translation.get(register['chrom'], register['chrom'])

                        descs = deepcopy(register['desc'])
                        for desc in descs:
                            register['desc'] = '|'.join(descs[desc])

                            if len(set(desc)) == 1:
                                desc = desc[0]

                            register['dst'] = ','.join(sorted(list(set(desc))))
                            vcf_holder.printRegister(register)

                    else:
                        #print '-'
                        pass

            fhdv.flush()

        print "\nGLOBAL STATS"
        print 'Global Stats    :', " ".join([
            "{:s}: {:10,d}".format(*i)
            for i in sorted(vcf_holder.stats.items())
        ])

        os.rename(outFileTmp, outFile)

    os.utime(listFile, None)

    if not os.path.exists(vcfFile):
        os.symlink(invcf, vcfFile)

    os.utime(vcfFile, None)

    os.utime(outFile, None)
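
The genotype handling above reduces to a small rule: split the GT field on '/' (unphased) or '|' (phased) and compare the two alleles. A standalone sketch of that rule, following the same conventions as the loop above (the name classify_gt is hypothetical):

def classify_gt(gt_desc):
    # Classify one VCF GT field ('./.', '0/0', '1|1', '0/1', ...) the way the
    # loop above does: no coverage, reference call, homozygous alt, or het.
    if gt_desc in ('.', './.', '.|.'):
        return 'gap'
    sep = '/' if '/' in gt_desc else '|'
    gt0, gt1 = gt_desc.split(sep)
    if gt0 == '.' or gt1 == '.':
        return 'gap'
    if gt0 == gt1:
        return 'ref' if gt0 == '0' else 'homo'
    return 'het'

For example, classify_gt('0/1') returns 'het' and classify_gt('1|1') returns 'homo'.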
Example #7
def main():
    try:
        infile = os.sys.argv[1]
    except:
        print "no input file given"
        print sys.argv[0], "<INPUT MULTICOLUMN CSV>"
        sys.exit(1)

    checkfile(infile)

    print "splitting %s" % infile
    defs = []
    names = []
    outfiles = []
    valid = 0
    skipped = 0
    lastCol = ""
    num_cols = None
    line_count = 0
    with openfile(infile, 'r') as fhd:
        for line in fhd:
            line = line.strip()

            if len(line) == 0:
                continue

            if line.startswith("#"):  # header
                print "HEADER", line

                if line.startswith("##"):  # definition lines
                    print "HEADER :: DEF", line
                    defs.append(line)

                else:  # column description
                    print "HEADER :: COL", line

                    cols = line.split("\t")
                    num_cols = len(cols)
                    shared = cols[:9]  # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
                    names = cols[9:]

                    # Project-specific step to deal with sample names that contain "_".
                    # May be changed later.
                    """
                    By default, make sure that the sample name does not contain "_" or "."
                    and that len(sample_name) + len(chromosome_name) <= 31.
                    If that is not the case, the sample name and/or chromosome name
                    may have to be modified (as is done here).
                    """
                    newnames = []
                    for idx, name in enumerate(names):
                        newname = name
                        if "_" in newname:
                            newname = newname.rpartition("_")[0].replace(
                                "_", "")
                        if "." in newname:
                            newname = newname.partition(".")[0]
                        newnames.append(newname)
                    names = newnames

                    print "HEADER :: COL :: SHARED", shared
                    print "HEADER :: COL :: NAMES", names

                    outfiles = [None] * len(names)
                    outlist = open("%s.lst" % infile, 'w')
                    for np, name in enumerate(names):
                        nof = ("%s.%0" + str(len("%d" % len(names))) +
                               "d.%s.vcf.gz") % (infile, np + 1,
                                                 sanitize(name))
                        print("creating %" + str(len("%d" % len(names))) +
                              "d %-" + str(max([len(x) for x in names])) +
                              "s to %s") % (np + 1, name, nof)
                        nop = openfile(nof, 'w')

                        #                               skipped valid
                        outfiles[np] = [name, nof, nop, 0, 0]

                        outlist.write("1\t%s\t%s\n" %
                                      (os.path.abspath(nof), name))

                        nop.write("\n".join(defs) + "\n")
                        nop.write("##Split from: %s column %d\n" %
                                  (os.path.abspath(infile), np + 1))
                        nop.write("\t".join(shared))
                        nop.write("\t%s\n" % name)
                        nop.flush()

                continue

            line_count += 1

            if line_count % 1000 == 0:
                sys.stdout.write('.')
                if line_count % 100000 == 0:
                    sys.stdout.write(' lines %12d valid %12d skipped %12d\n' %
                                     (line_count, valid, skipped))
                    for nop, ndata in enumerate(outfiles):
                        ndata[2].flush()
                sys.stdout.flush()

            #print "DATA", line
            cols = line.split("\t")
            assert len(cols) == num_cols

            ref = cols[3]
            alts = cols[4].split(',')
            if len(ref) > 1 or any([len(x) > 1
                                    for x in alts]):  # exclude MNP or indel
                # print "Excluded line: " + line
                continue

            shared = cols[:9]  # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
            shared_str = "\t".join(shared)
            data = cols[9:]  #used to be " + "\t""

            if cols[0] != lastCol:
                print '\nChromosome', cols[0]
                lastCol = cols[0]

            #print "shared", shared
            #print "data"  , data
            for pos, ndata in enumerate(data):
                #outfiles[np] = [name, nof, 0, 0, nop]
                res = [ndata.startswith(x) for x in targets]
                if any(res):
                    shared[4] = alts[res.index(True)]
                    shared_str = "\t".join(shared)
                    valid += 1
                    outfiles[pos][4] += 1  # valid
                else:
                    skipped += 1
                    outfiles[pos][3] += 1  # skipped
                    continue

                outfiles[pos][2].write(shared_str + "\t" + ndata + "\n")

    for nop, ndata in enumerate(outfiles):
        ndata[2].close()
        print("closing %" + str(len("%d" % len(outfiles))) + "d %-" +
              str(max([len(x[0]) for x in outfiles])) + "s :: %-" +
              str(max([len(x[1]) for x in outfiles])) +
              "s :: skipped %6d exported %6d total %7d") % (
                  nop + 1, ndata[0], ndata[1], ndata[3], ndata[4],
                  ndata[3] + ndata[4])
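
The project-specific renaming rule documented in the header block above can be read as a small helper; a sketch equivalent to the inline loop (the name normalize_sample_name is hypothetical):

def normalize_sample_name(name):
    # Drop the last "_"-delimited suffix, squeeze out the remaining underscores,
    # then keep only the part before the first ".", the same rule as the loop above.
    if "_" in name:
        name = name.rpartition("_")[0].replace("_", "")
    if "." in name:
        name = name.partition(".")[0]
    return name

For example, normalize_sample_name('Sample_A_rep1.bam') returns 'SampleA'.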
def main():
    try:
        infile = os.sys.argv[1]
    except:
        print "no input file given"
        print sys.argv[0], "<INPUT MULTICOLUMN CSV>"
        sys.exit(1)

    checkfile(infile)

    print "splitting %s" % infile
    defs = []
    names = []
    outfiles = []
    valid = 0
    skipped = 0
    lastCol = ""
    num_cols = None
    line_count = 0
    with openfile(infile, 'r') as fhd:
        for line in fhd:
            line = line.strip()

            if len(line) == 0:
                continue

            if line.startswith("#"):  # header
                print "HEADER", line

                if line.startswith("##"):  # definition lines
                    print "HEADER :: DEF", line
                    defs.append(line)

                else:  # column description
                    print "HEADER :: COL", line

                    cols = line.split("\t")
                    num_cols = len(cols)
                    shared = cols[:9]  # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
                    names = cols[9:]

                    print "HEADER :: COL :: SHARED", shared
                    print "HEADER :: COL :: NAMES", names

                    outfiles = [None] * len(names)
                    outlist = open("%s.lst" % infile, 'w')
                    for np, name in enumerate(names):
                        nof = ("%s_%0" + str(len("%d" % len(names))) +
                               "d_%s.vcf.gz") % (infile, np + 1,
                                                 sanitize(name))
                        print("creating %" + str(len("%d" % len(names))) +
                              "d %-" + str(max([len(x) for x in names])) +
                              "s to %s") % (np + 1, name, nof)
                        nop = openfile(nof, 'w')

                        #                               skipped valid
                        outfiles[np] = [name, nof, nop, 0, 0]

                        outlist.write("1\t%s\t%s\n" %
                                      (os.path.abspath(nof), name))

                        nop.write("\n".join(defs) + "\n")
                        nop.write("##Split from: %s column %d\n" %
                                  (os.path.abspath(infile), np + 1))
                        nop.write("\t".join(shared))
                        nop.write("\t%s\n" % name)
                        nop.flush()

                continue

            line_count += 1

            if line_count % 1000 == 0:
                sys.stdout.write('.')
                if line_count % 100000 == 0:
                    sys.stdout.write(' lines %12d valid %12d skipped %12d\n' %
                                     (line_count, valid, skipped))
                    for nop, ndata in enumerate(outfiles):
                        ndata[2].flush()
                sys.stdout.flush()

            #print "DATA", line
            cols = line.split("\t")
            assert len(cols) == num_cols
            shared = cols[:9]  # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
            shared_str = "\t".join(shared) + "\t"
            data = cols[9:]

            if cols[0] != lastCol:
                print '\nChromosome', cols[0]
                lastCol = cols[0]

            #print "shared", shared
            #print "data"  , data
            for pos, ndata in enumerate(data):
                #outfiles[np] = [name, nof, 0, 0, nop]
                if any([ndata.startswith(x) for x in ignores]):
                    skipped += 1
                    outfiles[pos][3] += 1  # skipped
                    continue

                valid += 1
                outfiles[pos][4] += 1  # valid
                outfiles[pos][2].write(shared_str + ndata + "\n")

    for nop, ndata in enumerate(outfiles):
        ndata[2].close()
        print("closing %" + str(len("%d" % len(outfiles))) + "d %-" +
              str(max([len(x[0]) for x in outfiles])) + "s :: %-" +
              str(max([len(x[1]) for x in outfiles])) +
              "s :: skipped %6d exported %6d total %7d") % (
                  nop + 1, ndata[0], ndata[1], ndata[3], ndata[4],
                  ndata[3] + ndata[4])
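
Both splitter variants also assume two module-level pieces that are not shown here: a targets/ignores list of genotype prefixes used to keep or skip individual sample fields, and a sanitize() helper used when building per-sample file names. A conservative sketch of the latter, assuming it only strips characters that are awkward in file names:

import re


def sanitize(name):
    # Hypothetical helper: keep letters, digits, dots and dashes; drop the rest.
    return re.sub(r'[^A-Za-z0-9.\-]+', '', name)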