    def __init__(self, args):
        self.args = args
        self.germline_seqs = utils.read_germlines(self.args.datadir)  #, add_fp=True)

        with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
            self.cyst_positions = json.load(json_file)
        with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
            tryp_reader = csv.reader(csv_file)
            self.tryp_positions = {row[0]: row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

        self.precluster_info = {}

        if self.args.seqfile is not None:
            self.input_info, self.reco_info = get_seqfile_info(self.args.seqfile, self.args.is_data,
                                                               self.germline_seqs, self.cyst_positions, self.tryp_positions,
                                                               self.args.n_max_queries, self.args.queries, self.args.reco_ids)

        self.outfile = None
        if self.args.outfname is not None:
            if os.path.exists(self.args.outfname):
                os.remove(self.args.outfname)
            self.outfile = open(self.args.outfname, 'a')
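The tryptophan reader above warns that the header line of j_tryp.csv is not filtered out of tryp_positions. A minimal header-skipping variant (a sketch only, assuming the first row of j_tryp.csv is a header rather than a gene/position pair):

        with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:
            tryp_reader = csv.reader(csv_file)
            next(tryp_reader)  # discard the header row before building the lookup
            self.tryp_positions = {row[0]: row[1] for row in tryp_reader}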
Example #2
def peruse_forward_scores():
    _, reco_info = seqfileopener.get_seqfile_info(simfname, is_data=False)  #, n_max_queries=10000)
    logprobs, partialcorr_logprobs, corr_logprobs = OrderedDict(), OrderedDict(), OrderedDict()
    for n_set in n_set_list:
        print n_set
        # if n_set != 5:
        #     continue
        logprobs[n_set], partialcorr_logprobs[n_set], corr_logprobs[n_set] = OrderedDict(), OrderedDict(), OrderedDict()
        with open(outputdir + '/' + str(n_set) + '-forward.csv') as csvfile:
            reader = csv.DictReader(csvfile)
            for line in reader:
                uidlist = line['unique_ids'].split(':')
                assert utils.from_same_event(reco_info, uidlist)
                reco_id = reco_info[uidlist[0]]['reco_id']
                if reco_id in logprobs[n_set]:
                    raise Exception('already had %s' % reco_id)

                logprobs[n_set][reco_id] = float(line['logprob'])

                factor = 1. / n_set
                partialcorr_logprobs[n_set][reco_id] = factor * float(line['logprob'])

                factor = (1. - 0.24 / pow(float(n_set), 0.9)) / n_set
                # factor = 1. / (0.77547824*n_set + 0.20327936)
                corr_logprobs[n_set][reco_id] = factor * float(line['logprob'])


    i_baseline = -1
    deviations = get_deviations(logprobs, i_baseline)
    # fit_stuff(n_set_list, deviations)
    partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline)
    signed_partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline, signed=True)
    corr_deviations = get_deviations(corr_logprobs, i_baseline)
    signed_corr_deviations = get_deviations(corr_logprobs, i_baseline, signed=True)

    import plotting
    fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, deviations, marker='.')
    plotting.mpl_finish(ax, baseplotdir, 'forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))

    # fig, ax = plotting.mpl_init()
    # ax.plot(n_set_list, partialcorr_deviations, marker='.')
    # ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    # plotting.mpl_finish(ax, baseplotdir, 'partially-corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline])) #, ybounds=(-0.02, 0.02))

    fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, partialcorr_deviations, marker='.', label='1/n (abs)')
    ax.plot(n_set_list, signed_partialcorr_deviations, marker='.', label='1/n')
    ax.plot(n_set_list, corr_deviations, marker='.', label='1/crap (abs)')
    ax.plot(n_set_list, signed_corr_deviations, marker='.', label='1/crap')
    ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    plotting.mpl_finish(ax, baseplotdir, 'corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline])) #, ybounds=(-0.02, 0.02))

    fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, signed_corr_deviations, marker='.')
    ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    plotting.mpl_finish(ax, baseplotdir, 'signed-corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline])) #, ybounds=(-0.02, 0.02))
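get_deviations() itself does not appear in this listing. Purely as a hedged sketch (an assumption inferred from the plot labels, not the repository's implementation), it could compute, for each n_set, the mean relative deviation of the per-event log probs from the baseline n_set's values:

def get_deviations(logprob_dicts, i_baseline, signed=False):  # hypothetical stand-in, not the repo's version
    baseline = logprob_dicts[n_set_list[i_baseline]]  # per-event log probs for the baseline n_set
    deviations = []
    for n_set in n_set_list:
        diffs = [(logprob_dicts[n_set][rid] - baseline[rid]) / abs(baseline[rid]) for rid in baseline]
        if not signed:
            diffs = [abs(d) for d in diffs]  # unsigned version averages absolute deviations
        deviations.append(sum(diffs) / float(len(diffs)))
    return deviations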
    def __init__(self, args):
        self.args = args
        self.germline_seqs = utils.read_germlines(self.args.datadir)  #, add_fp=True)

        with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
            self.cyst_positions = json.load(json_file)
        with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
            tryp_reader = csv.reader(csv_file)
            self.tryp_positions = {row[0]:row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

        self.precluster_info = {}

        if self.args.seqfile is not None:
            self.input_info, self.reco_info = get_seqfile_info(self.args.seqfile, self.args.is_data,
                                                               self.germline_seqs, self.cyst_positions, self.tryp_positions,
                                                               self.args.n_max_queries, self.args.queries, self.args.reco_ids)

        self.outfile = None
        if self.args.outfname is not None:
            if os.path.exists(self.args.outfname):
                os.remove(self.args.outfname)
            self.outfile = open(self.args.outfname, 'a')
Example #4
def peruse_forward_scores():
    _, reco_info = seqfileopener.get_seqfile_info(simfname, is_data=False)  #, n_max_queries=10000)
    logprobs, partialcorr_logprobs, corr_logprobs = OrderedDict(), OrderedDict(), OrderedDict()
    for n_set in n_set_list:
        print n_set
        # if n_set != 5:
        #     continue
        logprobs[n_set], partialcorr_logprobs[n_set], corr_logprobs[n_set] = OrderedDict(), OrderedDict(), OrderedDict()
        with open(outputdir + '/' + str(n_set) + '-forward.csv') as csvfile:
            reader = csv.DictReader(csvfile)
            for line in reader:
                uidlist = line['unique_ids'].split(':')
                assert utils.from_same_event(reco_info, uidlist)
                reco_id = reco_info[uidlist[0]]['reco_id']
                if reco_id in logprobs[n_set]:
                    raise Exception('already had %s' % reco_id)

                logprobs[n_set][reco_id] = float(line['logprob'])

                factor = 1. / n_set
                partialcorr_logprobs[n_set][reco_id] = factor * float(line['logprob'])

                factor = (1. - 0.24 / pow(float(n_set), 0.9)) / n_set
                # factor = 1. / (0.77547824*n_set + 0.20327936)
                corr_logprobs[n_set][reco_id] = factor * float(line['logprob'])

    i_baseline = -1
    deviations = get_deviations(logprobs, i_baseline)
    # fit_stuff(n_set_list, deviations)
    partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline)
    signed_partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline, signed=True)
    corr_deviations = get_deviations(corr_logprobs, i_baseline)
    signed_corr_deviations = get_deviations(corr_logprobs, i_baseline, signed=True)

    import plotting
    fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, deviations, marker='.')
    plotting.mpl_finish(ax, baseplotdir, 'forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))

    # fig, ax = plotting.mpl_init()
    # ax.plot(n_set_list, partialcorr_deviations, marker='.')
    # ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    # plotting.mpl_finish(ax, baseplotdir, 'partially-corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))

    fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, partialcorr_deviations, marker='.', label='1/n (abs)')
    ax.plot(n_set_list, signed_partialcorr_deviations, marker='.', label='1/n')
    ax.plot(n_set_list, corr_deviations, marker='.', label='1/crap (abs)')
    ax.plot(n_set_list, signed_corr_deviations, marker='.', label='1/crap')
    ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    plotting.mpl_finish(ax, baseplotdir, 'corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))

    fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, signed_corr_deviations, marker='.')
    ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    plotting.mpl_finish(ax, baseplotdir, 'signed-corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))
Example #5
#!/usr/bin/env python
import sys
sys.path.insert(1, './python')
import csv
csv.field_size_limit(sys.maxsize)  # make sure we can read very large csv fields
import argparse

from clusterpath import ClusterPath
from seqfileopener import get_seqfile_info
import utils

parser = argparse.ArgumentParser()
parser.add_argument('--infname', required=True)
parser.add_argument('--dont-abbreviate', action='store_true', help='Print full seq IDs (otherwise just prints an \'o\')')
parser.add_argument('--n-to-print', type=int, help='How many partitions to print (centered on the best partition)')
parser.add_argument('--datadir', default='data/imgt')
parser.add_argument('--simfname')
parser.add_argument('--is-data', action='store_true')
args = parser.parse_args()

glfo = utils.read_germline_set(args.datadir)

reco_info = None
if args.simfname is not None:
    input_info, reco_info = get_seqfile_info(args.simfname, args.is_data, glfo=glfo)

cp = ClusterPath()
cp.readfile(args.infname)
cp.print_partitions(abbreviate=(not args.dont_abbreviate), n_to_print=args.n_to_print, reco_info=reco_info)
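A hedged usage sketch (the script path and file names below are hypothetical, not taken from the listing): saved as, say, bin/print_partitions.py, it might be invoked as

    ./bin/print_partitions.py --infname partitions.csv --simfname simu.csv --n-to-print 5

to print five partitions centered on the best one, annotated against the true clusters in simu.csv whenever --simfname is given.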
Example #6
import sys
sys.path.insert(1, './python')
import csv
import argparse

from clusterpath import ClusterPath
from seqfileopener import get_seqfile_info
import utils

parser = argparse.ArgumentParser()
parser.add_argument('--infname', required=True)
parser.add_argument('--dont-abbreviate', action='store_true', help='Print full seq IDs (otherwise just prints an \'o\')')
parser.add_argument('--n-to-print', type=int, help='How many partitions to print (centered on the best partition)')
parser.add_argument('--datadir', default='data/imgt')
parser.add_argument('--simfname')
parser.add_argument('--is-data', action='store_true')
args = parser.parse_args()

germline_seqs = utils.read_germlines(args.datadir)
cyst_positions = utils.read_cyst_positions(args.datadir)
with open(args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region
    tryp_reader = csv.reader(csv_file)
    tryp_positions = {row[0]:row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

reco_info = None
if args.simfname is not None:
    input_info, reco_info = get_seqfile_info(args.simfname, args.is_data, germline_seqs, cyst_positions, tryp_positions)

cp = ClusterPath()
cp.readfile(args.infname)
cp.print_partitions(abbreviate=(not args.dont_abbreviate), n_to_print=args.n_to_print, reco_info=reco_info)
Example #7
import os
import sys
import argparse
import collections
import random

import numpy

partis_dir = os.path.dirname(os.path.realpath(__file__)).replace('/bin', '')
if not os.path.exists(partis_dir):
    print 'WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % partis_dir
sys.path.insert(1, partis_dir + '/python')
import utils
import seqfileopener

parser = argparse.ArgumentParser()
parser.add_argument('infile')
parser.add_argument('outfile')
parser.add_argument('--debug', action='store_true')
parser.add_argument('--chimera-freq', default=1., type=float, help='fraction of sequences to make chimeric')
parser.add_argument('--min-chunk-len', default=15, type=int, help='require that each bit of the chimera is at least this long')
args = parser.parse_args()

input_info, _ = seqfileopener.get_seqfile_info(args.infile, is_data=False)
if len(input_info) < 50:
    print '%s making chimeras with only %d sequences, and since we choose from among the existing sequence for templates this won\'t be very effective' % (utils.color('yellow', 'warning'), len(input_info))

n_chimeric = 0
outfo = collections.OrderedDict()
for uid, seqfo in input_info.items():
    if args.debug:
        print uid

    if numpy.random.uniform(0, 1) > args.chimera_freq:  # no chimeras for this sequence
        if args.debug:
            print '        non-chimeric'
        continue

    break_point = random.randint(args.min_chunk_len, len(seqfo['seqs'][0]) - args.min_chunk_len)
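The snippet cuts off right after break_point is chosen. Purely as a sketch (an assumption based on the warning above that templates are drawn from the other input sequences, not the author's actual continuation), the splice itself could look like:

    # hypothetical continuation of the loop body above
    template_uid = random.choice([u for u in input_info if u != uid])  # pick a different sequence as the template
    template_seq = input_info[template_uid]['seqs'][0]
    chimeric_seq = seqfo['seqs'][0][:break_point] + template_seq[break_point:]  # head of <uid>, tail of the template
    n_chimeric += 1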
Example #8
import argparse
import csv

import utils
from seqfileopener import get_seqfile_info
from opener import opener

parser = argparse.ArgumentParser()
parser.add_argument('--infname', required=True)
parser.add_argument('--outdir', required=True)
parser.add_argument('--start-indices', required=True)  # colon-separated list of start indices. E.g. with '0:1:2' we will write three output files: the first seq line in <infname> goes to 0, the next to 1, the third to 2, then we skip the next 97 seqs, and the pattern repeats.
parser.add_argument('--modulo', type=int, default=100)
args = parser.parse_args()
args.start_indices = utils.get_arg_list(args.start_indices, intify=True)

print 'subsetting %s: every %d th sequence' % (args.infname, args.modulo)

infile = opener('r')(args.infname)
input_info, _ = get_seqfile_info(args.infname, is_data=True)  #, n_max_queries=1000)
for key, d in input_info.items():  # get field names (they should be the same for each row, this just grabs the first one)
    fieldnames = d.keys()
    break

utils.prep_dir(args.outdir)  #, '*.bz2')
outfiles, writers = {}, {}
for iout in args.start_indices:
    outfname = args.outdir + ('/every-' + str(args.modulo) + '-subset-%d.csv.bz2' % iout)
    outfiles[iout] = opener('w')(outfname)
    writers[iout] = csv.DictWriter(outfiles[iout], fieldnames, delimiter=',')
    writers[iout].writeheader()

iline = 0
n_written = 0
for line in input_info.values():
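The listing is truncated at the start of the write loop. A minimal sketch of the routing described in the --start-indices help text (an assumption, not the author's loop body): each sequence goes to the writer whose start index equals its line number modulo --modulo, and is skipped otherwise.

for line in input_info.values():
    iout = iline % args.modulo
    if iout in writers:  # this line index maps onto one of the requested subsets
        writers[iout].writerow(line)
        n_written += 1
    iline += 1

for outfile in outfiles.values():  # hypothetical cleanup
    outfile.close()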