Beispiel #1
0
def open_variant_file(var_type, var_file, burden_file, burden_regions,
                      uncompressed):
    """Open a variant file and return a handle ready for iteration

    Args:
        var_type (str)
            Type of variants file (kmers, vcf, Rtab). Any value other
            than "kmers" or "vcf" is handled as Rtab.
        var_file (str)
            Location of file
        burden_file (str)
            File containing regions to group burden tests. Only read
            when var_type is "vcf".
        burden_regions (list)
            List of burden regions to be filled in-place
        uncompressed (bool)
            True if kmer file is not gzipped

    Returns:
        infile
            Opened file object (plain file, gzip file or VariantFile)
        sample_order (list)
            Sample names taken from the Rtab header line (first column
            dropped); empty for kmers and vcf input
    """
    if var_type == "kmers":
        # Plain text when uncompressed, otherwise a gzip stream
        handle = open(var_file) if uncompressed else gzip.open(var_file, 'r')
        return handle, []

    if var_type == "vcf":
        handle = VariantFile(var_file)
        if burden_file:
            load_burden(burden_file, burden_regions)
        return handle, []

    # Rtab files carry the sample names in a header line rather than
    # making them accessible row by row, so consume that line here
    handle = open(var_file)
    header_fields = handle.readline().rstrip().split()
    return handle, header_fields[1:]
Beispiel #2
0
def main():
    """Write a sample-by-sample similarity matrix to stdout.

    Reads sample names (one per line) from ``options.samples``, opens the
    kmers, VCF or Rtab variant file named in the options, reads variants
    block by block through ``load_var_block`` and accumulates the product
    of the variant matrix with its own transpose. The result is written
    to stdout as a tab-separated table with sample names as both row and
    column labels. Progress messages go to stderr.
    """
    options = get_options()

    # Create dummy pheno object from sample list
    with open(options.samples, 'r') as sample_file:
        sample_list = [sample.rstrip() for sample in sample_file]
    p = pd.Series(np.zeros(len(sample_list)), index=sample_list)

    # Open variant file. Mostly copied from __main__
    sample_order = []
    all_strains = set(p.index)

    if options.kmers:
        var_type = "kmers"
        if options.uncompressed:
            infile = open(options.kmers)
        else:
            infile = gzip.open(options.kmers, 'r')
    elif options.vcf:
        var_type = "vcf"
        infile = VariantFile(options.vcf)
    else:
        # Rtab files have a header, rather than sample names accessible by row
        var_type = "Rtab"
        infile = open(options.pres)
        header = infile.readline().rstrip()
        sample_order = header.split()[1:]

    # Accumulate K = G G^T one block at a time. Summing the per-block
    # products equals the product of the column-wise concatenation of all
    # blocks, but never needs the full variant matrix in memory. The old
    # loop replaced G with each new block, so only the final block
    # contributed to the result.
    # NOTE(review): assumes every block is (samples x variants), as the
    # previously reserved (len(p), block_size) buffer implied - confirm
    # against load_var_block.
    K = np.zeros((len(p), len(p)))
    sys.stderr.write("Reading in variants\n")
    try:
        v_iter = load_var_block(var_type, p, None, None, infile, all_strains,
                                sample_order, options.min_af, options.max_af,
                                options.max_missing, options.uncompressed,
                                block_size)
        eof = 0
        while not eof:
            _, variant_mat, eof = next(v_iter)
            K += np.matmul(variant_mat, np.transpose(variant_mat))
    finally:
        # The input is fully consumed (or reading failed); release it
        infile.close()

    sys.stderr.write("Calculating sample similarity\n")
    K_out = pd.DataFrame(K, index=p.index, columns=p.index)
    K_out.to_csv(sys.stdout, sep='\t')