Beispiel #1
0
def groupinfo(args):
    """Print the number of samples assigned to each group, plus the total.

    The sample names are obtained according to ``args.fmt``:

    * ``'list'``: every whitespace-separated token of ``args.infile``;
    * ``'pickle'`` / ``'npy'``: the ``samples`` attribute of a
      ``NAltLineParser`` built without group or position parsing;
    * anything else: the whitespace-separated tokens of the first line of
      ``args.infile``.

    The samples are then handed to ``GroupParser.assign_groups`` and one line
    per group (in sorted key order) is written with ``cout``.
    """
    fmt = args.fmt

    if fmt == 'list':
        # the whole file is a whitespace-separated list of sample names
        with gzopen(args.infile) as handle:
            samples = handle.read().split()

    elif fmt in ('pickle', 'npy'):
        # let the nalt parser read the sample names for binary formats
        from seqpy.core.bioio import naltparser
        from types import SimpleNamespace

        parser_args = SimpleNamespace(infile=args.infile, fmt=fmt, n=-1)
        samples = naltparser.NAltLineParser(parser_args,
                                            with_group=False,
                                            with_position=False).samples

    else:
        # only the header line carries the sample names
        with gzopen(args.infile) as handle:
            samples = handle.readline().strip().split()

    groups = grpparser.GroupParser(args).assign_groups(samples)

    cout('Groups:')
    total = 0
    for name in sorted(groups.keys()):
        size = len(groups[name])
        cout('  %3d - %s' % (size, name))
        total += size
    cout('Total: %d samples' % total)
Beispiel #2
0
    def __init__(self,
                 args,
                 datatype='nalt',
                 with_group=True,
                 with_position=True):
        """Configure the parser from ``args`` and read the sample list.

        Args:
            args: namespace providing ``infile``, ``fmt`` and ``n`` (and
                whatever ``GroupParser`` / ``PositionParser`` read from it).
            datatype: ``'nalt'`` selects ``np.int8`` values; any other value
                selects 64-bit floats.
            with_group: build a ``GroupParser`` from ``args`` when true.
            with_position: build a ``PositionParser`` from ``args`` when true.
        """
        self.group_parser = grpparser.GroupParser(args) if with_group else None
        self.position_parser = PositionParser(args) if with_position else None

        self.infile = args.infile
        self.fmt = args.fmt
        self.n = args.n

        # np.float (a plain alias of the builtin float) was removed in
        # NumPy 1.24; np.float64 is the dtype it resolved to.
        self.dtype = np.int8 if datatype == 'nalt' else np.float64

        # Convert one tab-delimited text line into a 1-D array.  The previous
        # implementation referenced an undefined name ``dtype`` and passed a
        # StringIO to np.fromfile, which needs a real file; text-mode
        # np.fromstring parses the line directly.
        self.convert_data = lambda line: np.fromstring(
            line, dtype=self.dtype, sep='\t')

        # populated by the parsing methods
        self.df = None
        self.M = None
        self.samples = None

        self.parse_samples()
Beispiel #3
0
def pcoa(args):
    """Plot the first three PCoA axes of a distance matrix, pairwise.

    ``args.infile`` is read as a header line of whitespace-separated sample
    names followed by a tab-delimited distance matrix.  The samples are
    assigned to groups through ``GroupParser`` so that its colour list can be
    used for the points.  A 1x3 figure with the axis pairs (0, 1), (0, 2)
    and (1, 2) is saved to ``args.outfile``.
    """
    cerr('I: reading group info')
    group_parser = grpparser.GroupParser(args)
    group_parser.parse()

    with open(args.infile, 'rb') as infile:

        cerr('I: reading sample header...')
        header = next(infile).decode('UTF-8')
        # the return value is not needed here; the call is kept because the
        # colour list is requested from the same parser afterwards
        group_parser.assign_groups(header.strip().split())

        cerr('I: reading distance matrix')
        distm = np.loadtxt(infile, delimiter='\t')

    # first element of the allel.pcoa result holds the coordinates
    coords = allel.pcoa(distm)[0]

    fig = plt.figure(figsize=(27, 9), dpi=args.dpi)
    colour_list = group_parser.colour_list()

    axis_pairs = combinations([0, 1, 2], 2)
    for panel, (pcx, pcy) in enumerate(axis_pairs, start=1):
        ax = fig.add_subplot(1, 3, panel)
        make_plot(ax, coords[:, pcx], coords[:, pcy], colour_list,
                  args.dotsize)

    fig.tight_layout()
    fig.savefig(args.outfile)
Beispiel #4
0
def grp2anno(args):
    """Write per-sample and per-group colour annotation tables.

    The sample names come from the first line of ``args.infile``, split on
    ``args.delimiter`` (or on whitespace when it is ``None``).  Two files are
    produced:

    * ``<outfile>.indv.txt``: ``SAMPLE`` / ``COLOUR`` for every sample;
    * ``<outfile>.group.txt``: ``GROUP`` / ``COLOUR`` for every group, with
      the group size appended to the label when ``args.s`` is set.
    """
    group_parser = grpparser.GroupParser(args)

    # only the header line of the input is needed
    with gzopen(args.infile) as infile:
        header = next(infile)

    # split(None) behaves exactly like split(): split on runs of whitespace
    samples = header.strip().split(args.delimiter)

    groups = group_parser.assign_groups(samples)
    group_keys = sorted(groups.keys())
    colours = group_parser.colour_list()

    with open(args.outfile + '.indv.txt', 'w') as outfile:
        outfile.write('SAMPLE\tCOLOUR\n')
        outfile.writelines('%s\t%s\n' % pair
                           for pair in zip(samples, colours))

    with open(args.outfile + '.group.txt', 'w') as outfile:
        outfile.write('GROUP\tCOLOUR\n')
        group_colours = group_parser.group_colour_list(group_keys)
        for key, colour in zip(group_keys, group_colours):
            label = '%s (%d)' % (key, len(groups[key])) if args.s else key
            outfile.write('%s\t%s\n' % (label, colour))
Beispiel #5
0
def seq2fst(args):
    """Compute FST between groups of sequences and write it to a file.

    Sequences are loaded from ``args.infile`` and bucketed by the group that
    ``GroupParser.group_info`` gives for their (ASCII-decoded) label;
    sequences without a group are reported and skipped.  Group information
    (``args.groupfile`` or ``args.metafile``) is mandatory.

    When ``args.sitefile`` is set, the per-site result of ``calc_site_fst``
    is written there and the function returns.  Otherwise the matrix returned
    by ``calc_fst`` is written to ``args.outfile`` under a header of group
    names.
    """
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    if args.groupfile or args.metafile:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}

        for seq in seqs:
            label = seq.label.decode('ASCII')
            try:
                grp = group_parser.group_info[label]
            except KeyError:
                cerr('[W - sample %s is not assign to any group]' % label)
                continue
            # create the container the first time a group is seen
            if grp not in group_seqs:
                group_seqs[grp] = multisequence()
            group_seqs[grp].append(seq)
    else:
        cexit('[ERR - seq2fst.py requires group information!]')

    for grp, members in group_seqs.items():
        cerr('[I - group %s has %d sample(s)]' % (grp, len(members)))

    if args.sitefile:
        # site-wise FST: one line per site, label first, then the values
        FST_sites = calc_site_fst(group_seqs, args.nantozero)

        with open(args.sitefile, 'w') as fout:
            for label, mat in FST_sites:
                fout.write(label + '\t')
                # newline='\t' keeps all values of a site on a single line
                np.savetxt(fout,
                           mat,
                           fmt='%5.4f',
                           delimiter='\t',
                           newline='\t')
                fout.write('\n')

        cerr('[I - site FST written to %s]' % (args.sitefile))
        return

    FST_mat, groups = calc_fst(group_seqs)

    with open(args.outfile, 'w') as fout:
        fout.write('\t'.join(groups) + '\n')
        np.savetxt(fout, FST_mat, fmt='%5.4f', delimiter='\t')
Beispiel #6
0
def vcf2ped( args ):
    """ create a ped and map file based on vcf and metafile, suitable for isoRelate

    Writes ``<outprefix>.ped`` with one line per sample (group key, sample
    name, four fixed columns, then two allele codes per site) and
    ``<outprefix>.map`` with one line per site.

    Allele coding in the PED file: identical alleles equal to -1 -> ``0 0``,
    identical alleles equal to 1 -> ``2 2``, any other identical pair ->
    ``1 1``, differing alleles -> ``1 2``.

    The third MAP column is the distance to the previous site on the same
    chromosome (or to position 0 for the first site) multiplied by 1e-6.
    """

    # group assignment comes from the metadata given in args
    group_parser = grpparser.GroupParser( args )

    cerr('[I: reading VCF...]')
    start_time = time.monotonic()
    vcfset = allel.read_vcf(args.infile,
                fields = ['samples', 'variants/CHROM', 'variants/POS', 'calldata/GT'])
    chroms = vcfset['variants/CHROM']
    positions = vcfset['variants/POS']
    samples = vcfset['samples']
    cerr('[I: read %s site, %s samples in %d secs]' % (len(chroms),
         len(samples), time.monotonic() - start_time))

    group_parser.assign_groups(samples)
    groups = group_parser.group_keys
    genotypes = vcfset['calldata/GT']

    # PED: one row per sample
    with open(args.outprefix + '.ped', 'w') as outf:
        for idx, sample in enumerate(samples):
            outf.write('%s\t%s\t0\t0\t1\t0\t' % (groups[idx], sample))
            codes = []
            for allele_1, allele_2 in genotypes[:, idx]:
                if allele_1 != allele_2:
                    codes.append('1\t2')
                elif allele_1 == -1:
                    codes.append('0\t0')
                elif allele_1 == 1:
                    codes.append('2\t2')
                else:
                    # allele 0 and every other identical pair
                    codes.append('1\t1')
            outf.write('\t'.join(codes))
            outf.write('\n')

    # MAP: one row per site
    with open(args.outprefix + '.map', 'w') as outf:
        prev_chrom, prev_pos = None, 0
        for chrom, pos in zip(chroms, positions):
            if chrom != prev_chrom:
                # distances restart at every new chromosome
                prev_chrom, prev_pos = chrom, 0
            dist = (pos - prev_pos) * 1e-6
            outf.write('%s\t%s:%d\t%8.6f\t%d\n' % (chrom, chrom, pos, dist, pos))
            prev_pos = pos
Beispiel #7
0
def dist2popdist( args ):
    """Collapse a sample distance matrix into group-level distance matrices.

    ``args.infile`` is read as a tab-separated table whose header row holds
    the sample names.  For every pair of groups (including a group with
    itself) the mean of ``D[x, y]`` over the full product of their member
    indices is stored in ``M``; for a group with itself this includes the
    self pairs and both orderings of every pair.  ``P`` then holds, for each
    pair of distinct groups, ``M[i, j] - 0.5 * (M[i, i] + M[j, j])``.

    ``M`` is written to ``<outfile>.popdxy.txt`` and ``P`` to
    ``<outfile>.popdist.txt``, each under a header of sorted group keys.
    """

    group_parser = grpparser.GroupParser( args )

    table = pd.read_csv(args.infile, sep='\t')
    D = table.values
    groups = group_parser.assign_groups(table.columns)
    group_keys = sorted(groups.keys())
    n = len(groups)

    # mean distance between (and within) groups
    M = np.zeros( (n, n) )
    for i, j in combinations_with_replacement(range(n), 2):
        pairs = list(product(groups[group_keys[i]], groups[group_keys[j]]))
        total = sum(D[x, y] for x, y in pairs)
        M[i, j] = M[j, i] = total / len(pairs)

    # between-group mean minus the average of the two within-group means
    P = np.zeros( (n, n) )
    for i, j in combinations(range(n), 2):
        within = 0.5*(M[i, i] + M[j, j])
        P[i, j] = P[j, i] = M[i, j] - within

    # both matrices share the same layout; only suffix and content differ
    for suffix, matrix in (('.popdxy.txt', M), ('.popdist.txt', P)):
        with open(args.outfile + suffix, 'wt') as outfile:
            outfile.write( '%s\n' % '\t'.join( group_keys ) )
            np.savetxt(outfile, matrix, delimiter='\t', fmt='%4.3f')
Beispiel #8
0
def seq2pi(args):
    """Report the result of ``calc_pi`` for every group of sequences.

    Sequences are loaded from ``args.infile``.  When ``args.groupfile`` or
    ``args.metafile`` is given they are bucketed by the group that
    ``GroupParser.group_info`` gives for their (ASCII-decoded) label, and
    sequences without a group are reported and skipped; otherwise all
    sequences form the single group ``'ALL'``.

    For each group the two values returned by ``calc_pi`` (average and
    standard deviation) are printed, and also written as a tab-separated
    table to ``args.outfile`` when that is set.
    """

    # open and read sequence file
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    # open and read group/meta file using groupfile/metafile if available
    if args.groupfile or args.metafile:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}

        for seq in seqs:
            label = seq.label.decode('ASCII')
            try:
                grp = group_parser.group_info[label]
            except KeyError:
                cerr('[W - sample %s is not assign to any group]' % label)
                continue
            if grp in group_seqs:
                group_seqs[grp].append(seq)
            else:
                ms = multisequence()
                ms.append(seq)
                group_seqs[grp] = ms
    else:
        group_seqs = {'ALL': seqs}

    print('Groups:')
    outf = open(args.outfile, 'w') if args.outfile else None
    try:
        if outf:
            outf.write('GROUP\tN\tPI\tSTDDEV\n')
        for g, members in group_seqs.items():
            avg, stddev = calc_pi(members)
            cout('  %20s [%3d]: %f +- %f' % (g, len(members), avg, stddev))
            if outf:
                outf.write('%s\t%d\t%5.4f\t%5.4f\n' %
                           (g, len(members), avg, stddev))
    finally:
        # the handle used to be left open; close it even if calc_pi fails so
        # that everything written so far reaches the file
        if outf:
            outf.close()

    if outf:
        cerr('[I - result written to %s]' % args.outfile)
Beispiel #9
0
    def __init__(self, args):
        """Open the genotype input and prepare parser state.

        Builds a ``GroupParser`` from ``args``, opens ``args.infile`` in text
        mode through ``gzopen`` and records ``args.posfile``.  When
        ``args.includepos`` is given, that tab-separated file is read (its
        first line is skipped) and the first two columns of every remaining
        line become a key of ``self.include_positions``.  Finally the
        genotype header is read via ``self.parse_sample()``.
        """
        self.group_parser = grpparser.GroupParser(args)
        self.infile = gzopen(args.infile, 'rt')
        self.posfilename = args.posfile

        # position-file state, not set in the constructor
        self.position = self.posfile_header = self.posfile = None
        # sample state, not set directly in the constructor
        self.sample_header = self.samples = None

        # positions to include, keyed by the first two columns of the file
        self.include_positions = {}
        if args.includepos:
            with open(args.includepos) as posfile:
                next(posfile)  # skip the header line
                for row in posfile:
                    fields = row.strip().split('\t')
                    key = (fields[0], fields[1])
                    self.include_positions[key] = True

        # the genotype header has to be read up front
        self.parse_sample()
Beispiel #10
0
def consolidate_predictions(args):
    """Compare pickled model predictions with the known group of each sample.

    The predictions are unpickled from ``args.infile`` and iterated as
    ``predictions[model][k]``.  Each entry is turned into a dataframe by
    ``generate_dataframe``; per row, the column with the highest value is
    the predicted group.  A row is printed when that value is below
    ``args.threshold`` or the predicted group differs from the entry of
    ``group_parser.group_keys`` at the same index.

    When ``args.outreport`` is set, a score and a row-normalised confusion
    matrix are collected per ``'<model>|<k>'`` and pickled to that file.
    """

    outreport = None

    if args.samplefile:
        samples = read_samplefile(args.samplefile, args.fmt)
    else:
        samples = None

    group_parser = grpparser.GroupParser(args)
    group_parser.assign_groups(samples)
    #group_parser.group_keys contains [ 'grp1', 'grp2', etc]
    group_keys = group_parser.group_keys

    # NOTE(review): pickle.load can execute arbitrary code; only use this
    # on prediction files from a trusted source.
    with open(args.infile, 'rb') as f:
        predictions = pickle.load(f)

    if args.outreport:
        from sklearn.metrics import confusion_matrix
        outreport = open(args.outreport, 'wb')
    reports = {}

    normalize = True

    # the report handle used to be left open; the try/finally guarantees it
    # is closed (and therefore flushed) on success and on failure
    try:
        for model in predictions:
            model_pred = predictions[model]

            for k in model_pred:

                cerr('Preparing for model: {} k: {}'.format(model, k))
                df = generate_dataframe(model_pred[k])

                # column index of the highest value in every row
                group_indexes = np.argmax(df.values, axis=1)
                group_predictions = df.columns[group_indexes[:, None]]
                for i, group_index in enumerate(group_indexes):
                    predicted_group = df.columns[group_index]
                    prediction_confidence = df.values[i, group_index]
                    if (prediction_confidence < args.threshold
                            or predicted_group != group_keys[i]):
                        cout('{}: {} -> {} ({})'.format(
                            samples[i], group_keys[i], predicted_group,
                            prediction_confidence))

                if outreport:

                    score = lkmodels.calculate_scores(group_keys,
                                                      group_predictions)
                    confmat = confusion_matrix(group_keys, group_predictions)

                    if normalize:
                        # divide each row by its row sum
                        confmat = confmat.astype('float') / confmat.sum(
                            axis=1)[:, np.newaxis]
                        cerr("[I - Normalized confusion matrix]")
                    else:
                        cerr('[I - Confusion matrix, without normalization]')

                    reports['{}|{}'.format(model, k)] = {
                        'score': score,
                        'confmat': confmat
                    }

        if outreport:
            pickle.dump(reports, outreport)
            cerr('[I - writing pickled report to {}]'.format(args.outreport))
    finally:
        if outreport:
            outreport.close()