Example #1
def read_linearham_output():
    lh_info = {}
    clusterdirs = glob.glob(
        '%s/linearham/%s/cluster*' % (args.basedir, args.locus)
    )  # e.g. .../mcmciter10000_mcmcthin10_tuneiter5000_tunethin100_numrates4_seed0/burninfrac0.1_subsampfrac0.05/aa_naive_seqs.dnamap
    for cdir in clusterdirs:
        input_seqfos = utils.read_fastx('%s/input_seqs.fasta' % cdir)
        input_uids = [
            sfo['name'] for sfo in input_seqfos if sfo['name'] != 'naive'
        ]
        outfnames = subprocess.check_output(
            ['find', cdir, '-name', 'aa_naive_seqs.dnamap']).strip().split()
        if len(outfnames) == 0:
            print '  no linearham output for %s in %s' % (
                os.path.basename(cdir), cdir)
            continue
        elif len(outfnames) != 1:
            raise Exception('too many outfnames %s' % outfnames)
        clusterfo = []
        with open(outfnames[0]) as outfile:
            for line in outfile:
                if line[0] == '>':  # just skip these for now, we're just printing nucleotide level stuff, not aa
                    continue
                prob, naive_seq = line.strip().split(',')
                clusterfo.append((naive_seq, float(prob)))
        clusterfo = sorted(clusterfo, key=operator.itemgetter(1), reverse=True)  # the file is sorted by aa naive seq (and within that, I think, by nuc naive seq), so re-sort by decreasing probability to be sure
        lh_info[':'.join(input_uids)] = clusterfo
    return lh_info
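
The dict returned above maps a ':'-joined uid string to (naive sequence, probability) tuples, sorted by decreasing probability. A minimal consumer sketch, assuming the same args/globals as the function itself (the reporting logic here is hypothetical, not part of the source):

# Hypothetical usage: report the most probable linearham naive sequence per cluster.
lh_info = read_linearham_output()
for joint_uids, clusterfo in lh_info.items():
    best_seq, best_prob = clusterfo[0]  # highest-probability entry comes first after the sort above
    print('%d seqs  p=%.3f  %s' % (len(joint_uids.split(':')), best_prob, best_seq))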
Example #2
def read_ramesh_file(fname, outdir, debug=False):
    seqfos = utils.read_fastx(fname)
    glseqs = {
        l: {r: {}
            for r in utils.loci[l]}
        for l in utils.loci if 'ig' in l
    }
    for sfo in seqfos:
        if os.path.basename(fname) == 'coding.fa':
            meta = [x.strip('[]').split('=') for x in sfo['infostrs']]
            mdict = {m[0]: m[1] for m in meta if len(m) == 2}
            if 'gene' not in mdict:
                print 'no gene for %s' % sfo['infostrs']
                continue
            gene = mdict['gene']
        else:
            mdict = {}
            gene = sfo['name']
        if debug:
            print gene
        if utils.is_constant_gene(gene):
            if debug:
                print '  constant'
            continue
        region = utils.get_region(gene)
        utils.split_gene(gene)
        # if 'partial' in mdict:
        #     gene += '_partial_%s' % mdict['partial'].replace('\'', '').replace(',', '')
        if sfo['seq'] in glseqs[utils.get_locus(gene)][region].values():
            if debug:
                print '  duplicate'
            continue
        glseqs[utils.get_locus(gene)][region][gene] = sfo['seq']

    return glseqs
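
The returned structure nests locus → region → gene → sequence (for the 'ig' loci in utils.loci). A quick sketch of walking it, with a hypothetical input file name:

# Hypothetical usage: count the germline genes read for each locus and region.
glseqs = read_ramesh_file('coding.fa', '/tmp/ramesh-out')
for locus in sorted(glseqs):
    for region in sorted(glseqs[locus]):
        print('%s %s: %d genes' % (locus, region, len(glseqs[locus][region])))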
Example #3
def run_igdiscover(infname, outfname, outdir):
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        sub_infname = outdir + '/' + os.path.basename(infname.replace(utils.getsuffix(infname), '-n-random-queries-%d%s' % (args.n_random_queries, utils.getsuffix(infname))))
        if os.path.exists(sub_infname):
            print '    --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries
        else:
            print '    --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    cmds = getpathcmd()
    cmds += ['conda activate %s' % args.env_label]
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True)

    template_gldir = args.glfo_dir  # if args.glfo_dir is not None else 'data/germlines/ XXX human'  # can probably delete this now that --glfo-dir is required (but leaving for now, to show how it used to be in case it comes up)
    glfo = glutils.create_glfo_from_fasta(igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)
    out_gldir = os.path.dirname(outfname)
    if out_gldir.endswith('/' + args.locus):  # note: rstrip('/' + args.locus) would treat its argument as a character set, so strip the suffix explicitly
        out_gldir = out_gldir[:-len('/' + args.locus)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
Example #4
def alternative_naives_with_probabilities(f):
    """
    Create seq, probability tuples by reading the ranked naive probabilities fasta
    """
    seqfos = partisutils.read_fastx(f)
    return [(sfo["seq"], float(sfo["name"].split("_probability_")[1]))
            for sfo in seqfos]
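
This helper assumes fasta headers of the form <name>_probability_<float>. A sketch of ranking its output (the file name is hypothetical):

# Hypothetical usage: pick the highest-probability alternative naive sequence.
naive_probs = alternative_naives_with_probabilities('ranked-naive-seqs.fa')
best_seq, best_prob = max(naive_probs, key=lambda sp: sp[1])
print('best naive (p=%.3f): %s' % (best_prob, best_seq))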
Example #5
    def vsearch_cluster_v_seqs(self, qr_seqs, threshold, debug=False):
        # then vsearch cluster the v-sequences in <qr_seqs> using a heuristic j-mutation-based threshold
        msa_fname = self.args.workdir + '/msa.fa'
        print '   vsearch clustering %d %s segments with threshold %.2f (*300 = %d)' % (len(qr_seqs), self.region, threshold, int(threshold * 300))
        assert self.region == 'v'  # would need to change the 300
        _ = utils.run_vsearch('cluster', qr_seqs, self.args.workdir + '/vsearch', threshold=threshold, msa_fname=msa_fname, vsearch_binary=self.args.vsearch_binary)
        msa_info = []
        msa_seqs = utils.read_fastx(msa_fname)
        for seqfo in msa_seqs:
            if seqfo['name'][0] == '*':  # start of new cluster (centroid is first, and is marked with a '*')
                centroid = seqfo['name'].lstrip('*')
                msa_info.append({'centroid' : centroid, 'seqfos' : [{'name' : centroid, 'seq' : seqfo['seq']}]})  # I don't seem to actually be using the identity of the centroid sequence for anything
            elif seqfo['name'] == 'consensus':
                msa_info[-1]['cons_seq'] = seqfo['seq'].replace('+', '')  # strip the '+'s (not sure what they mean)
            else:
                msa_info[-1]['seqfos'].append(seqfo)
        os.remove(msa_fname)
        n_initial_clusters = len(msa_info)
        print '     read %d vsearch clusters (%d sequences)' % (n_initial_clusters, sum([len(cfo['seqfos']) for cfo in msa_info]))

        # then throw out smaller clusters
        # n_seqs_min = max(self.absolute_n_seqs_min, self.min_cluster_fraction * len(msa_info))
        n_seqs_min = self.absolute_n_seqs_min
        clusterfos = [cfo for cfo in msa_info if len(cfo['seqfos']) >= n_seqs_min]
        print '     removed %d clusters with fewer than %d sequences' % (n_initial_clusters - len(clusterfos), n_seqs_min)
        clusterfos = sorted(clusterfos, key=lambda cfo: len(cfo['seqfos']), reverse=True)
        if len(clusterfos) > self.max_number_of_clusters:
            print '     taking the %d largest clusters (removing %d)' % (self.max_number_of_clusters, len(clusterfos) - self.max_number_of_clusters)
            clusterfos = clusterfos[:self.max_number_of_clusters]

        return clusterfos, msa_info
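
The parsing loop above assumes vsearch's msa output layout, where each cluster block starts with its centroid (name prefixed with '*') and ends with a consensus record. A toy illustration of that layout, with hypothetical uids:

# Toy msa.fa contents illustrating what the loop above expects:
#   >*uid-1      <- centroid: starts a new cluster ('*' is stripped off)
#   ACGTACGT
#   >uid-2       <- an ordinary member of the same cluster
#   ACGTACGA
#   >consensus   <- per-cluster consensus ('+' characters get removed)
#   ACGT+ACGT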
Example #6
def run_partis(infname, outfname):
    if utils.output_exists(args, outfname, offset=8):
        return

    aligned_gl_seqs = {}  # keyed by seq so it's easy to check for duplicates
    for r in utils.regions:  # deduplicate before passing to partis
        for seqfo in utils.read_fastx(get_glfname(r, aligned=True)):
            if seqfo['seq'] in aligned_gl_seqs:
                continue
            aligned_gl_seqs[seqfo['seq']] = '|'.join(seqfo['infostrs'])
    aligned_germline_fname = args.workdir + '/all-aligned-gl-seqs.fa'
    with open(aligned_germline_fname, 'w') as merged_file:
        for seq, gene in aligned_gl_seqs.items():
            merged_file.write('>%s\n%s\n' % (gene, seq))

    cmd = './bin/partis cache-parameters'
    cmd += ' --infname ' + infname
    cmd += ' --leave-default-germline'
    cmd += ' --presto-output --only-smith-waterman'
    cmd += ' --outfname ' + outfname
    if args.glfo_dir is not None:
        cmd += ' --initial-germline-dir ' + args.glfo_dir
    cmd += ' --aligned-germline-fname ' + aligned_germline_fname
    cmd += ' --n-procs ' + str(args.n_procs)

    utils.simplerun(cmd, print_time='partis annotation')

    os.remove(aligned_germline_fname)
Example #7
def run_igdiscover(infname, outfname, outdir):
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        sub_infname = outdir + '/' + os.path.basename(
            infname.replace(
                utils.getsuffix(infname), '-n-random-queries-%d%s' %
                (args.n_random_queries, utils.getsuffix(infname))))
        if os.path.exists(sub_infname):
            print '    --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries
        else:
            print '    --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries
            seqfos = utils.read_fastx(infname,
                                      n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' %
                                     (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    cmds = ['#!/bin/bash']
    cmds += ['export PATH=%s:$PATH' % args.condapath]
    cmds += ['export PYTHONNOUSERSITE=True']  # otherwise it finds the pip-installed packages in .local and breaks (see https://github.com/conda/conda/issues/448)
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n',
                    cmdfname=outdir + '/run.sh',
                    print_time='igdiscover',
                    debug=True)

    template_gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.create_glfo_from_fasta(
        igdiscover_outfname,
        args.locus,
        args.region,
        template_gldir,
        simulation_germline_dir=args.simulation_germline_dir)
    out_gldir = os.path.dirname(outfname)
    if out_gldir.endswith('/' + args.locus):  # note: rstrip('/' + args.locus) would treat its argument as a character set, so strip the suffix explicitly
        out_gldir = out_gldir[:-len('/' + args.locus)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
Example #8
 def read_bppseqgen_output(self, cmdfo, n_leaf_nodes):
     mutated_seqs = []
     for seqfo in utils.read_fastx(cmdfo['outfname']):  # get the leaf node sequences from the file that bppseqgen wrote
         mutated_seqs.append(seqfo['seq'])
         if n_leaf_nodes == 1:  # skip the extra leaf we added earlier
             break
     assert n_leaf_nodes == len(mutated_seqs)
     # self.check_tree_simulation(leaf_seq_fname, chosen_tree)
     os.remove(cmdfo['outfname'])
     for otherfname in cmdfo['other-files']:
         os.remove(otherfname)
     os.rmdir(cmdfo['workdir'])
     return mutated_seqs
Example #9
def run_tigger(infname, outfname, outdir):
    if utils.output_exists(args, outfname, offset=8):
        return

    rcmds = ['library(tigger)', 'library(dplyr)']
    # rcmds += ['data(sample_db, germline_ighv)']

    db_name = 'annotations'
    gls_name = 'gls'
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]

    tigger_outfname = outdir + '/tigger.fasta'
    rcmds += ['novel_df = findNovelAlleles(%s, %s, germline_min=2, nproc=%d)' % (db_name, gls_name, args.n_procs)]  #
    rcmds += ['geno = inferGenotype(%s, find_unmutated = FALSE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]
    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')
    cmdstr = 'R --slave -f ' + cmdfname
    utils.simplerun(cmdstr, shell=True, print_time='tigger')

    # post-process tigger .fa
    gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.read_glfo(gldir, args.locus)
    tigger_alleles = set()
    for seqfo in utils.read_fastx(tigger_outfname):
        seq = seqfo['seq'].replace(utils.gap_chars[0], '')  # it should be just dots...
        tigger_alleles.add(seqfo['name'])
        if seqfo['name'] not in glfo['seqs'][args.region]:
            newfo = {'gene' : seqfo['name'], 'seq' : seq}
            use_template_for_codon_info = False
            if '+' in newfo['gene']:
                newfo['template-gene'] = newfo['gene'].split('+')[0]
                use_template_for_codon_info = True
            glutils.add_new_allele(glfo, newfo, use_template_for_codon_info=use_template_for_codon_info, debug=True)
        elif glfo['seqs'][args.region][seqfo['name']] != seq:
            print '%s different sequences in glfo and tigger output for %s:\n    %s\n    %s' % (utils.color('red', 'error'), seqfo['name'], glfo['seqs'][args.region][seqfo['name']], seqfo['seq'])
    for gene in glfo['seqs'][args.region]:  # remove them afterwards so we can use existing ones to get codon info
        if gene not in tigger_alleles:
            glutils.remove_gene(glfo, gene)

    out_gldir = os.path.dirname(outfname)
    if out_gldir.endswith('/' + args.locus):  # note: rstrip('/' + args.locus) would treat its argument as a character set, so strip the suffix explicitly
        out_gldir = out_gldir[:-len('/' + args.locus)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
Example #10
 def read_bppseqgen_output(self, cmdfo, n_leaf_nodes):
     mutated_seqs = {}
     for seqfo in utils.read_fastx(cmdfo['outfname']):  # get the leaf node sequences from the file that bppseqgen wrote
         if seqfo['name'] == dummy_name_so_bppseqgen_doesnt_break:  # in the unlikely (impossible unless we change tree generators and don't tell them to use the same leaf names) event that we get a non-dummy leaf with this name, it'll fail at the assertion just below
             continue
         mutated_seqs[seqfo['name'].strip('\'')] = seqfo['seq']
     try:  # make sure names are all of form t<n>, and keep track of which sequences goes with which name (have to keep around the t<n> labels so we can translate the tree labels, in event.py)
         names_seqs = [('t' + str(iseq + 1), mutated_seqs['t' + str(iseq + 1)]) for iseq in range(len(mutated_seqs))]
     except KeyError as ke:
         raise Exception('leaf name %s not as expected in bppseqgen output %s' % (ke, cmdfo['outfname']))
     assert n_leaf_nodes == len(names_seqs)
     os.remove(cmdfo['outfname'])
     for otherfname in cmdfo['other-files']:
         os.remove(otherfname)
     os.rmdir(cmdfo['workdir'])
     return zip(*names_seqs)
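
Since this version returns zip(*names_seqs), a caller gets parallel leaf-name and sequence collections (a list of two tuples under Python 2). A hypothetical call site, where recombinator and cmdfo stand in for whatever instance and command info the surrounding code uses:

# Hypothetical caller: recover the t<n> leaf labels alongside their mutated sequences.
leaf_names, mutated_seqs = recombinator.read_bppseqgen_output(cmdfo, n_leaf_nodes)
assert list(leaf_names) == ['t%d' % (i + 1) for i in range(len(mutated_seqs))]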
Example #11
def read_fasta_file(seqs, fname, skip_pseudogenes, aligned=False):
    n_skipped_pseudogenes = 0
    seq_to_gene_map = {}
    for seqfo in utils.read_fastx(fname):
        # first get gene name
        if seqfo['name'][:2] != 'IG' and seqfo['name'][:2] != 'TR':  # if it's an imgt file, with a bunch of header info (and the accession number first)
            gene = seqfo['infostrs'][imgt_info_indices.index('gene')]
            functionality = seqfo['infostrs'][imgt_info_indices.index('functionality')]
            if functionality not in functionalities:
                raise Exception('unexpected functionality %s in %s' % (functionality, fname))
            if skip_pseudogenes and functionality in pseudogene_funcionalities:
                n_skipped_pseudogenes += 1
                continue
        else:  # plain fasta with just the gene name after the '>'
            gene = seqfo['name']
        utils.split_gene(gene)  # just to check if it's a valid gene name
        if not aligned and utils.get_region(gene) != utils.get_region(os.path.basename(fname)):  # if <aligned> is True, file name is expected to be whatever
            raise Exception('gene %s from %s has unexpected region %s' % (gene, os.path.basename(fname), utils.get_region(gene)))
        if gene in seqs[utils.get_region(gene)]:
            raise Exception('gene name %s appears twice in %s' % (gene, fname))

        # then the sequence
        seq = seqfo['seq']
        if not aligned:
            seq = utils.remove_gaps(seq)
        if 'Y' in seq:
            print '      replacing Y --> N (%d of \'em) in %s' % (seq.count('Y'), utils.color_gene(gene))
            seq = seq.replace('Y', 'N')
        if len(seq.strip(''.join(utils.expected_characters))) > 0:  # return the empty string if it only contains expected characters
            raise Exception('unexpected character %s in %s (expected %s)' % (seq.strip(''.join(utils.expected_characters)), seq, ' '.join(utils.expected_characters)))
        if seq not in seq_to_gene_map:
            seq_to_gene_map[seq] = []
        seq_to_gene_map[seq].append(gene)

        seqs[utils.get_region(gene)][gene] = seq

    tmpcounts = [len(gl) for gl in seq_to_gene_map.values()]  # number of names corresponding to each sequence (should all be ones)
    if tmpcounts.count(1) != len(tmpcounts):
        print '  multiple names in %s for the following sequences:' % fname
        for seq, genelist in seq_to_gene_map.items():
            if len(genelist) > 1:
                print '    %-50s   %s' % (' '.join(genelist), seq)
        raise Exception('please de-duplicate the fasta and re-run.')

    if n_skipped_pseudogenes > 0:
        print '    skipped %d %s pseudogenes (leaving %d)' % (n_skipped_pseudogenes, utils.get_region(os.path.basename(fname)), len(seqs[utils.get_region(os.path.basename(fname))]))
Example #12
def run_igblast(infname, outfname):
    if utils.output_exists(args, outfname, offset=8):
        return

    if args.glfo_dir is not None:
        print '%s --glfo-dir isn\'t getting plugged in to igblast/changeo (would need to rebuild igblast db)' % utils.color(
            'red', 'warning')

    if args.n_random_queries is not None:
        sub_infname = os.path.dirname(outfname) + '/' + os.path.basename(
            infname.replace(
                utils.getsuffix(infname), '-n-random-queries-%d%s' %
                (args.n_random_queries, utils.getsuffix(infname))))
        if os.path.exists(sub_infname):
            print '    --n-random-queries: leaving existing fasta for igblast (hopefully it has %d queries)' % args.n_random_queries
        else:
            print '    --n-random-queries: writing new fasta for igblast (%d queries)' % args.n_random_queries
            seqfos = utils.read_fastx(infname,
                                      n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' %
                                     (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    cmds = ['#!/bin/bash']
    cmds += ['cd %s/%s' % (args.igbdir, args.locus)]
    cmds += ['export PATH=%s:$PATH' % args.condapath]
    cmds += ['igblastn']
    for tmpreg in utils.regions:
        cmds[-1] += ' -germline_db_%s %s%s-unaligned.fasta' % (
            tmpreg.upper(), args.locus, tmpreg)
    cmds[-1] += ' -auxiliary_data optional_file/%s_gl.aux' % args.species
    cmds[-1] += ' -domain_system imgt -ig_seqtype Ig -organism %s -outfmt \'7 std qseq sseq btop\'' % args.species
    cmds[-1] += ' -num_threads %d' % utils.auto_n_procs()
    cmds[-1] += ' -query ' + infname + ' -out ' + outfname
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=args.workdir + '/run.sh')
Example #13
 def read_bppseqgen_output(self, cmdfo, n_leaf_nodes):
     mutated_seqs = {}
     for seqfo in utils.read_fastx(cmdfo['outfname']):  # get the leaf node sequences from the file that bppseqgen wrote
         if seqfo['name'] == dummy_name_so_bppseqgen_doesnt_break:  # in the unlikely (impossible unless we change tree generators and don't tell them to use the same leaf names) event that we get a non-dummy leaf with this name, it'll fail at the assertion just below
             continue
         mutated_seqs[seqfo['name'].strip('\'')] = seqfo['seq']
     try:
         mutated_seqs = [
             mutated_seqs['t' + str(iseq + 1)]
             for iseq in range(len(mutated_seqs))
         ]
     except KeyError as ke:
         raise Exception(
             'leaf name %s not as expected in bppseqgen output %s' %
             (ke, cmdfo['outfname']))
     assert n_leaf_nodes == len(mutated_seqs)
     os.remove(cmdfo['outfname'])
     for otherfname in cmdfo['other-files']:
         os.remove(otherfname)
     os.rmdir(cmdfo['workdir'])
     return mutated_seqs
Example #14
import argparse
import json
import operator
import os
import sys

# if you move this script, you'll need to change this method of getting the imports
partis_dir = os.path.dirname(os.path.realpath(__file__)).replace('/bin', '')
sys.path.insert(1, partis_dir + '/python')

import utils

parser = argparse.ArgumentParser()
parser.add_argument('infname')  #default='/fh/fast/matsen_e/data/goo-dengue-10x/test/filtered_contig.fasta')
parser.add_argument('outfname')
parser.add_argument('--droplet-id-separator', default='_', help='everything in the sequence id before this character is treated as the droplet id, e.g. for the default, the uid AAACGGGCAAGCGAGT-1_contig_2 has a droplet id of AAACGGGCAAGCGAGT-1')
args = parser.parse_args()

seqfos = utils.read_fastx(args.infname)
droplet_ids = {}
for sfo in seqfos:
    did = utils.get_droplet_id(sfo['name'])
    if did not in droplet_ids:
        droplet_ids[did] = []
    droplet_ids[did].append(sfo['name'])

print '  read %d sequences with %d droplet ids' % (len(seqfos), len(droplet_ids))
count_info = {}
for dlist in droplet_ids.values():
    if len(dlist) not in count_info:
        count_info[len(dlist)] = 0
    count_info[len(dlist)] += 1
print '    contigs per'
print '      droplet     count   fraction'
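
The excerpt cuts off right after printing the table header; a plausible completion that fills in the table, although the exact column formatting is an assumption:

for count in sorted(count_info):
    print('      %-11d %-7d %.3f' % (count, count_info[count],
                                     count_info[count] / float(len(droplet_ids))))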
Example #15
def get_seqfile_info(infname,
                     is_data,
                     n_max_queries=-1,
                     args=None,
                     simglfo=None,
                     quiet=False):
    """ return list of sequence info from files of several types """

    suffix = utils.getsuffix(infname)
    if len(re.findall(r'\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
        else:
            assert False
        seqfile = open(infname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        reader = utils.read_fastx(
            infname,
            name_key='unique_ids',
            seq_key='input_seqs',
            add_info=False,
            sanitize=True,
            n_max_queries=n_max_queries,  # NOTE don't use istartstop kwarg here, since it interferes with the istartstop treatment in the loop below
            queries=(args.queries if (args is not None and not args.abbreviate) else None)
        )  # NOTE also can't filter on args.queries here if we're also translating

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if args is not None and args.abbreviate:
        potential_names = list(string.ascii_lowercase)
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # weird backwards-compatibility edge case
                    del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print '  %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (
                utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception(
                'couldn\'t find a sequence column in %s (you can set this with --seq-column)'
                % infname)
        utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        if uid in input_info:
            new_uid = uid
            iid = 2
            while new_uid in input_info:
                new_uid = uid + '-' + str(iid)
                iid += 1
            print '  %s uid %s already read from input file %s, so replacing with new uid %s' % (
                utils.color('yellow', 'warning'), uid, infname, new_uid)
            uid = new_uid
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))
        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid = abbreviate(used_names, potential_names, uid)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' %
                            (uid, infname))

        if len(inseq.translate(None, ''.join(utils.alphabet))) > 0:
            unexpected_chars = set(
                [ch for ch in inseq if ch not in utils.alphabet])
            raise Exception(
                'unexpected character%s %s (not among %s) in input sequence with id %s:\n  %s'
                % (utils.plural(len(unexpected_chars)), ', '.join([
                    ('\'%s\'' % ch) for ch in unexpected_chars
                ]), utils.nukes + utils.ambiguous_bases, uid, inseq))

        # da business
        input_info[uid] = {
            'unique_ids': [
                uid,
            ],
            'seqs': [
                inseq,
            ]
        }

        if n_queries_added == 0 and is_data and 'reco_id' in line:
            print '  note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            if not quiet:  # just adding <quiet>, and too lazy to decide what other print statements it should affect, this is the only one I care about right now
                print '  --n-max-queries: stopped after reading %d queries from input file' % len(
                    input_info)
            break

    post_process(input_info, reco_info, args, infname, found_seed, is_data,
                 iline)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info
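
A hypothetical call, showing the two return values (input_info keyed by uid; reco_info is None on real data). The input file name is made up:

# Hypothetical usage on real (non-simulation) data.
input_info, reco_info = get_seqfile_info('seqs.fa', is_data=True, n_max_queries=100)
assert reco_info is None
for uid, info in input_info.items():
    print('%s  %d bases' % (uid, len(info['seqs'][0])))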
Example #16
def read_sequence_file(infname,
                       is_data,
                       n_max_queries=-1,
                       args=None,
                       simglfo=None,
                       quiet=False,
                       more_input_info=None):
    # NOTE renamed this from get_seqfile_info() since I'm changing the return values, but I don't want to update the calls everywhere (e.g. in compareutils)
    yaml_glfo = None
    suffix = utils.getsuffix(infname)
    if suffix in delimit_info:
        seqfile = open(
            infname
        )  # closes on function exit. no, this isn't the best way to do this
        reader = csv.DictReader(seqfile, delimiter=delimit_info[suffix])
    elif suffix in ['.fa', '.fasta', '.fastx']:
        reader = utils.read_fastx(
            infname,
            name_key='unique_ids',
            seq_key='input_seqs',
            add_info=False,
            sanitize=True,
            n_max_queries=n_max_queries,  # NOTE don't use istartstop kwarg here, since it interferes with the istartstop treatment in the loop below
            queries=(args.queries if (args is not None and not args.abbreviate) else None)
        )  # NOTE also can't filter on args.queries here if we're also translating
    elif suffix == '.yaml':
        yaml_glfo, reader, _ = utils.read_yaml_output(
            infname,
            n_max_queries=n_max_queries,
            synth_single_seqs=True,
            dont_add_implicit_info=True
        )  # not really sure that long term I want to synthesize single seq lines, but for backwards compatibility it's nice a.t.m.
        if not is_data:
            simglfo = yaml_glfo  # doesn't replace the contents, of course, which is why we return it
    else:
        raise Exception('unhandled file extension %s' % suffix)

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    potential_names, used_names = None, None  # for abbreviating
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # weird backwards-compatibility edge case
                    del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print '  %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (
                utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception(
                'couldn\'t find a sequence column in %s (you can set this with --seq-column)'
                % infname)
        if suffix != '.yaml':
            utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        if uid in input_info:
            new_uid = uid
            iid = 2
            while new_uid in input_info:
                new_uid = uid + '-' + str(iid)
                iid += 1
            print '  %s uid %s already read from input file %s, so replacing with new uid %s' % (
                utils.color('yellow', 'warning'), uid, infname, new_uid)
            uid = new_uid
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))
        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid, potential_names, used_names = utils.choose_new_uid(
                    potential_names, used_names)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' %
                            (uid, infname))

        if any(c not in utils.alphabet for c in inseq):
            unexpected_chars = set(
                [ch for ch in inseq if ch not in utils.alphabet])
            raise Exception(
                'unexpected character%s %s (not among %s) in input sequence with id %s:\n  %s'
                % (utils.plural(len(unexpected_chars)), ', '.join([
                    ('\'%s\'' % ch) for ch in unexpected_chars
                ]), utils.nukes + utils.ambiguous_bases, uid, inseq))

        # da business
        input_info[uid] = {
            'unique_ids': [
                uid,
            ],
            'seqs': [
                inseq,
            ]
        }

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])
            for line_key in utils.input_metafile_keys.values():
                if line_key in reco_info[uid]:  # it's kind of weird to copy from sim info to input info, but it makes sense because affinity is really meta info (the only other place affinity could come from is --input-metafname below), where I'm defining meta info more or less as any input info besides name and sequence (the distinction mostly matters because we want to support fastas, which can't [shouldn't!] handle anything else)
                    input_info[uid][line_key] = copy.deepcopy(reco_info[uid][line_key])  # note that the args.input_metafname stuff below should print a warning if you've also specified that (which you shouldn't, if it's simulation)

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            if not quiet:  # just adding <quiet>, and too lazy to decide what other print statements it should affect, this is the only one I care about right now
                print '  --n-max-queries: stopped after reading %d queries from input file' % len(
                    input_info)
            break

    if more_input_info is not None:  # if you use this on simulation, the extra queries that aren't in <reco_info> may end up breaking something down the line (but I don't imagine this really getting used on simulation)
        if len(set(more_input_info) & set(input_info)) > 0:
            print '  %s found %d queries in both --infname and --queries-to-include-fname (note that we don\'t check here that they correspond to the same sequence): %s' % (
                utils.color('red', 'note:'),
                len(set(more_input_info) & set(input_info)),
                ' '.join(set(more_input_info) & set(input_info))
            )  # not necessarily a problem, but you probably *shouldn't* have sequences floating around in two different files
        if args is not None and args.seed_unique_id is not None and args.seed_unique_id in more_input_info:
            found_seed = True
        input_info.update(more_input_info)
    if args is not None and args.input_metafname is not None:
        read_input_metafo(args.input_metafname,
                          input_info.values(),
                          debug=True)
    post_process(input_info, reco_info, args, infname, found_seed, is_data,
                 iline)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info, yaml_glfo
Example #17
def parse_bcr_phylo_output(glfos, naive_events, outdir, ievent):
    # ----------------------------------------------------------------------------------------
    def split_seqfos(seqfos):
        hline, lline = naive_events[ievent]
        hseqfos, lseqfos = [], []
        for sfo in seqfos:
            padseq = utils.pad_nuc_seq(hline['naive_seq'])
            assert len(sfo['seq']) == len(padseq) + len(lline['naive_seq'])
            hseqfos.append({
                'name': sfo['name'],
                'seq': sfo['seq'][:len(hline['naive_seq'])]
            })
            lseqfos.append({
                'name': sfo['name'],
                'seq': sfo['seq'][len(padseq):]
            })
        return hseqfos, lseqfos

    # ----------------------------------------------------------------------------------------
    def read_kdvals(kdfname):
        nodefo = {}
        with open(kdfname) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd': float(line['kd']),
                    'relative_kd': float(line['relative_kd']),
                    'lambda': line.get('lambda', None),
                    'target_index': int(line['target_index']),
                }
        return nodefo

    # ----------------------------------------------------------------------------------------
    def get_mature_line(sfos,
                        naive_line,
                        glfo,
                        nodefo,
                        dtree,
                        target_sfos,
                        locus=None):
        assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
        assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
        if args.debug:
            utils.print_reco_event(naive_line)
        reco_info = collections.OrderedDict()
        for sfo in sfos:
            mline = utils.get_non_implicit_copy(naive_line)
            del mline['tree']
            mline['unique_ids'] = [sfo['name']]
            mline['seqs'] = [sfo['seq']]
            mline['input_seqs'] = [
                sfo['seq']
            ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
            mline['duplicates'] = [[]]
            reco_info[sfo['name']] = mline
            try:
                utils.add_implicit_info(glfo, mline)
            except Exception:  # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
                print 'implicit info adding failed for ievent %d in %s' % (
                    ievent, outdir)
                lines = traceback.format_exception(*sys.exc_info())
                print utils.pad_lines(
                    ''.join(lines)
                )  # NOTE this will still crash on the next line if implicit info adding failed
        final_line = utils.synthesize_multi_seq_line_from_reco_info(
            [sfo['name'] for sfo in sfos], reco_info)

        ftree = copy.deepcopy(dtree)
        if locus is not None:

            def ltr(u):
                return u + '-' + locus

            new_nodefo = {}
            for u_old in nodefo:
                new_nodefo[ltr(u_old)] = nodefo[u_old]
            nodefo = new_nodefo
            treeutils.translate_labels(ftree,
                                       [(u, ltr(u))
                                        for u in final_line['unique_ids']])
            final_line['unique_ids'] = [
                ltr(u) for u in final_line['unique_ids']
            ]
            assert len(sfos) == len(final_line['unique_ids'])
            for iseq, sfo in enumerate(sfos):
                naive_id = naive_line['unique_ids'][0]
                assert naive_id.count('-') == 1
                bstr = naive_id.replace('-' + locus, '')
                pids = final_line['paired-uids'][iseq]
                assert len(pids) == 1 and pids[0].startswith(bstr) and pids[0].count('-') == 1 and pids[0].split('-')[1] in utils.loci  # if uid is xxx-igh, paired id should be e.g. xxx-igk
                final_line['paired-uids'][iseq] = [
                    p.replace(bstr, sfo['name']) for p in pids
                ]

        if args.debug:
            utils.print_reco_event(final_line)

        # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
        if len(
                set(nodefo) - set(final_line['unique_ids'])
        ) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (
                set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(
                set(final_line['unique_ids']) - set(nodefo))
        final_line['affinities'] = [
            1. / nodefo[u]['kd'] for u in final_line['unique_ids']
        ]
        final_line['relative_affinities'] = [
            1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']
        ]
        final_line['lambdas'] = [
            nodefo[u]['lambda'] for u in final_line['unique_ids']
        ]
        final_line['nearest_target_indices'] = [
            nodefo[u]['target_index'] for u in final_line['unique_ids']
        ]
        ftree.scale_edges(1. / numpy.mean([len(s)
                                           for s in final_line['seqs']]))
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=ftree),
                                  padwidth=12)
        final_line['tree'] = ftree.as_string(schema='newick')

        tmp_event = RecombinationEvent(
            glfo
        )  # I don't want to move the function out of event.py right now
        tmp_event.set_reco_id(
            final_line, irandom=ievent
        )  # not sure that setting <irandom> here actually does anything
        final_line['target_seqs'] = [tfo['seq'] for tfo in target_sfos]
        return final_line

    # ----------------------------------------------------------------------------------------
    assert args.stype == 'selection'  # i don't know that non-'selection' is possible or has any point at this point (can just set selection strength to zero)
    kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
    if not utils.output_exists(
            args, kdfname, outlabel='kd/nwk conversion', offset=4
    ):  # eh, don't really need to check for both kd and nwk file, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (
            outdir, args.extrastr, kdfname, nwkfname)
        utils.run_ete_script(cmd, ete_path, debug=args.n_procs == 1)
    nodefo = read_kdvals(kdfname)
    dtree = treeutils.get_dendro_tree(treefname=nwkfname)
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(
        outdir))  # output mutated sequences from bcr-phylo
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' %
                                     (outdir, args.extrastr))
    if args.paired_loci:
        mevents = []
        for tline, sfos, tsfos in zip(naive_events[ievent],
                                      split_seqfos(seqfos),
                                      split_seqfos(target_seqfos)):
            mevents.append(
                get_mature_line(sfos,
                                tline,
                                glfos[tline['loci'][0]],
                                nodefo,
                                dtree,
                                target_seqfos,
                                locus=tline['loci'][0]))
        return mevents
    else:
        return get_mature_line(seqfos, naive_events[ievent], glfos[0], nodefo,
                               dtree, target_seqfos)
Example #18
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(
        outdir))  # output mutated sequences from bcr-phylo

    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [
            sfo['seq']
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [
            sfo['seq']
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        utils.add_implicit_info(glfo, mline)
    final_line = utils.synthesize_multi_seq_line_from_reco_info(
        [sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (
            outdir, args.extrastr, outdir, outdir)
        utils.run_ete_script(cmd, ete_path)
        nodefo = {}
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd': float(line['kd']),
                    'relative_kd': float(line['relative_kd']),
                    'lambda': line.get('lambda', None),
                    'target_index': int(line['target_index']),
                }
        if len(
                set(nodefo) - set(final_line['unique_ids'])
        ) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (
                set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(
                set(final_line['unique_ids']) - set(nodefo))
        final_line['affinities'] = [
            1. / nodefo[u]['kd'] for u in final_line['unique_ids']
        ]
        final_line['relative_affinities'] = [
            1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']
        ]
        final_line['lambdas'] = [
            nodefo[u]['lambda'] for u in final_line['unique_ids']
        ]
        final_line['nearest_target_indices'] = [
            nodefo[u]['target_index'] for u in final_line['unique_ids']
        ]
        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree),
                                  padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(
        glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(
        final_line, irandom=ievent
    )  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' %
                                     (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
Example #19
args = parser.parse_args()
if os.path.dirname(args.fname) == '':
    args.fname = '%s/%s' % (os.getcwd(), args.fname)
if args.outdir is None:
    args.outdir = utils.getprefix(args.fname)

if any(
        os.path.exists(ofn)
        for ofn in paircluster.paired_dir_fnames(args.outdir)):
    if args.overwrite:
        paircluster.clean_paired_dir(args.outdir)
    else:
        print '  split-loci.py output exists and --overwrite was not set, so not doing anything: %s' % args.outdir
        sys.exit(0)

seqfos = utils.read_fastx(args.fname, n_max_queries=args.n_max_queries)
if args.fasta_info_index is not None:
    for sfo in seqfos:
        sfo['name'] = sfo['infostrs'][args.fasta_info_index]
if args.reverse_negative_strands:
    revfos = [
        {
            'name': s['name'],
            'seq': utils.revcomp(s['seq'])
        } for s in seqfos
    ]  # NOTE this is not on an equal footing with <seqfos>, since we add all the vsearch info to <seqfos>, then use it do decide on locus, and then to write output

if os.path.exists(args.germline_dir + '/' + args.species):  # ick, that is hacky
    args.germline_dir += '/' + args.species
Example #20
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    seqfos = utils.read_fastx(
        '%s/%s.fasta' %
        (outdir, args.extrastr))  # output mutated sequences from bcr-phylo

    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [
            sfo['seq']
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [
            sfo['seq']
        ]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        reco_info[sfo['name']] = mline
        utils.add_implicit_info(glfo, mline)
    final_line = utils.synthesize_multi_seq_line_from_reco_info(
        [sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = 'export PATH=%s:$PATH && xvfb-run -a python ./bin/view-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (
            ete_path, outdir, args.extrastr, outdir, outdir)
        utils.simplerun(cmd, shell=True)
        kdvals = {}
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                kdvals[line['uid']] = float(line['kd'])
        if len(
                set(kdvals) - set(final_line['unique_ids'])
        ) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (
                set(kdvals) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(kdvals)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(
                set(final_line['unique_ids']) - set(kdvals))
        final_line['affinities'] = [
            1. / kdvals[u] for u in final_line['unique_ids']
        ]
        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree),
                                  padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(
        glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(
        final_line, irandom=ievent
    )  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' %
                                     (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]
    from Bio.Seq import Seq
    final_line['nearest_target_indices'] = []
    aa_targets = [Seq(seq).translate() for seq in final_line['target_seqs']]
    for mseq in final_line['input_seqs']:
        aa_mseq = Seq(mseq).translate()
        aa_hdists = [
            utils.hamming_distance(aa_t, aa_mseq, amino_acid=True)
            for aa_t in aa_targets
        ]
        imin = aa_hdists.index(
            min(aa_hdists)
        )  # NOTE doesn't do anything differently if there's more than one min
        final_line['nearest_target_indices'].append(imin)

    return final_line
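
The nearest-target loop above picks the target with the minimum amino-acid Hamming distance, keeping the first index on ties. A self-contained toy check of that choice, where hdist is just a stand-in for utils.hamming_distance:

def hdist(a, b):  # stand-in for utils.hamming_distance on equal-length strings
    assert len(a) == len(b)
    return sum(c1 != c2 for c1, c2 in zip(a, b))

aa_targets = ['MKVW', 'MQVW', 'MQVY']
aa_mseq = 'MQVW'
aa_hdists = [hdist(t, aa_mseq) for t in aa_targets]
print(aa_hdists.index(min(aa_hdists)))  # --> 1 (first minimum wins, as in the loop above)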
Example #21
import argparse
import random
import sys

from sklearn.metrics import euclidean_distances
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

sys.path.insert(0, 'python')
import utils
import mds

# ----------------------------------------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument('--n-clusters', type=int, required=True)
parser.add_argument('--n-components', type=int, default=2)
parser.add_argument('--plotdir')
parser.add_argument('--workdir',
                    default='/tmp/dralph/mds/' +
                    str(random.randint(0, 999999)))
parser.add_argument('--seed', type=int, default=1)
args = parser.parse_args()

seqfos = utils.read_fastx('v-qr.fa', n_max_queries=500)
for iseq in range(len(seqfos)):
    seqfos[iseq]['name'] = str(iseq)

# mds.run_sklearn_mds(args.n_components, args.n_clusters, seqfos, args.seed, plotdir=args.plotdir)
mds.bios2mds_kmeans_cluster(args.n_components,
                            args.n_clusters,
                            seqfos,
                            args.workdir,
                            args.seed,
                            plotdir=args.plotdir)
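
The partis mds module isn't shown here, but the commented-out sklearn path can be approximated with sklearn's public API alone. A rough sketch (not the actual run_sklearn_mds implementation): build a pairwise Hamming-distance matrix, embed it with metric MDS, then k-means the embedded coordinates:

import numpy as np
from sklearn.manifold import MDS
from sklearn.cluster import KMeans

seqs = ['ACGT', 'ACGA', 'TCGA', 'TCGT']  # stand-ins for [sfo['seq'] for sfo in seqfos]
dmat = np.array([[sum(a != b for a, b in zip(s1, s2)) for s2 in seqs] for s1 in seqs])
coords = MDS(n_components=2, dissimilarity='precomputed', random_state=1).fit_transform(dmat)
labels = KMeans(n_clusters=2, random_state=1).fit_predict(coords)  # one cluster label per sequence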
Example No. 22
def get_seqfile_info(infname,
                     is_data,
                     n_max_queries=-1,
                     args=None,
                     glfo=None,
                     simglfo=None):
    """ return list of sequence info from files of several types """

    if not is_data and glfo is None:
        print '  WARNING glfo is None, so not adding implicit info'

    suffix = os.path.splitext(infname)[1]
    if len(re.findall(r'\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
        else:
            assert False
        seqfile = open(infname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        reader = utils.read_fastx(
            infname,
            name_key='unique_ids',
            seq_key='input_seqs',
            add_info=False,
            sanitize=True,
            queries=(args.queries if args is not None else None),
            n_max_queries=n_max_queries)

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if args is not None and args.abbreviate:
        potential_names = list(string.ascii_lowercase)
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print '  %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (
                utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception(
                'couldn\'t find a sequence column in %s (you can set this with --seq-column)'
                % infname)
        utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))
        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid = abbreviate(used_names, potential_names, uid)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' %
                            (uid, infname))

        if len(inseq.translate(None, ''.join(utils.alphabet))) > 0:
            raise Exception(
                'unexpected character (not among %s) in input sequence with id %s:\n  %s'
                % (utils.nukes + utils.ambiguous_bases, uid, inseq))

        input_info[uid] = {'unique_ids' : [uid], 'seqs' : [inseq]}

        if n_queries_added == 0 and is_data and 'v_gene' in line:
            print '  note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            break

    post_process(input_info, reco_info, args, infname, found_seed, is_data)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info
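
A hedged usage sketch (the path here is hypothetical; for a csv/tsv the file just needs name and sequence columns, configurable via --name-column and --seq-column):

input_info, reco_info = get_seqfile_info('/path/to/seqs.fa', is_data=True, n_max_queries=100)
for uid, info in input_info.items():
    print uid, info['seqs'][0]  # reco_info is None when is_data is True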
Example No. 23
parser.add_argument(
    '--glfo-dir',
    default=partis_dir + '/data/germlines/human',
    help='germline info directory. Only used if --partis-output-file is an old-style .csv; this default dir may work if your output file doesn\'t have novel inferred genes. Otherwise, set it to the germline info dir from the partis inferred parameter directory corresponding to --partis-output-file.')
parser.add_argument('--locus', default='igh')
parser.add_argument('--outfile', required=True, help='output partis yaml file')
parser.add_argument('--debug', action='store_true')
parser.add_argument('--n-test-subset-seqs', type=int, help='take only the first N seqs from both the fasta file and the annotation in the partis output file (e.g. for testing when the family is huge)')
args = parser.parse_args()

new_seqfos = utils.read_fastx(args.new_seq_file, sanitize_seqs=True)
print '    read %d seqs from %s' % (len(new_seqfos), args.new_seq_file)

glfo = None
if utils.getsuffix(args.partis_output_file) == '.csv':
    print '    reading deprecated csv format, so need to read germline info from somewhere else, using --glfo-dir %s, hopefully it works' % args.glfo_dir
    glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus)

glfo, annotation_list, cpath = utils.read_output(args.partis_output_file,
                                                 glfo=glfo,
                                                 locus=args.locus)
if args.partition_index is not None:
    print '  using non-best partition index %d (best is %d)' % (
        args.partition_index, cpath.i_best)
partition = cpath.partitions[cpath.i_best if args.partition_index is None else args.partition_index]
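
If you then need the cluster containing a particular sequence, it's a plain list scan over the partition (a list of uid lists); <seed_uid> here is hypothetical:

seed_uid = 'some-uid'  # hypothetical query id
matches = [c for c in partition if seed_uid in c]
if len(matches) != 1:
    raise Exception('expected exactly one cluster containing %s, found %d' % (seed_uid, len(matches)))
cluster = matches[0]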
Example No. 24
def get_new_alignments(glfo, region, debug=False):
    aligned_seqs = {}

    genes_with_alignments = set(aligned_seqs)  # used to already have some sequences aligned, and may as well keep around the code to handle that case
    genes_without_alignments = set(glfo['seqs'][region]) - set(aligned_seqs)
    if len(genes_without_alignments) == 0:
        if debug:
            print '  no missing %s alignments' % region
        return aligned_seqs  # empty at this point, but keeps the return type consistent for callers

    if debug:
        print '        missing alignments for %d %s genes' % (len(genes_without_alignments), region)
        if len(aligned_seqs) > 0:
            print '      existing alignments:'
            for g, seq in aligned_seqs.items():
                print '    %s   %s' % (seq, utils.color_gene(g))

    # find the longest aligned sequence, so we can pad everybody else with dots on the right out to that length
    biggest_length = None
    for gene in genes_with_alignments:
        if biggest_length is None or len(aligned_seqs[gene]) > biggest_length:
            biggest_length = len(aligned_seqs[gene])

    tmpdir = tempfile.mkdtemp()
    already_aligned_fname = tmpdir + '/already-aligned.fasta'
    not_aligned_fname = tmpdir + '/not-aligned.fasta'
    msa_table_fname = tmpdir + '/msa-table.txt'
    aligned_and_not_fname = tmpdir + '/aligned-and-not.fasta'
    mafft_outfname = tmpdir + '/everybody-aligned.fasta'
    with open(already_aligned_fname, 'w') as tmpfile, open(msa_table_fname, 'w') as msafile:
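        # mafft's --merge table presumably wants the 1-based position of each already-aligned sequence in the combined fasta; that's what this index tracks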
        mysterious_index = 1
        msa_str = ''
        for gene in genes_with_alignments:
            dotstr = '.' * (biggest_length - len(aligned_seqs[gene]))
            alistr = aligned_seqs[gene] + dotstr
            tmpfile.write('>%s\n%s\n' % (gene, alistr.replace('.', '-')))
            msa_str += ' ' + str(mysterious_index)
            mysterious_index += 1
        msafile.write('%s # %s\n' % (msa_str, already_aligned_fname))
    with open(not_aligned_fname, 'w') as tmpfile:
        for gene in genes_without_alignments:
            tmpfile.write('>%s\n%s\n' % (gene, glfo['seqs'][region][gene]))

    check_call('cat ' + already_aligned_fname + ' ' + not_aligned_fname + ' >' + aligned_and_not_fname, shell=True)

    # actually run mafft
    cmd = 'mafft --merge ' + msa_table_fname + ' ' + aligned_and_not_fname + ' >' + mafft_outfname  # other potentially useful options: "--localpair --maxiterate 1000"
    if debug:
        print '          RUN %s' % cmd
    proc = Popen(cmd, shell=True, stderr=PIPE)
    _, err = proc.communicate()  # debug info goes to err

    if debug and False:  # aw, screw it, I don't even know what any of mafft's output means
        # deal with debug info (for err -- out gets redirected to a file)
        err = err.replace('\r', '\n')
        printstrs = []
        for errstr in err.split('\n'):  # remove the stupid progress bar things
            matches = re.findall('[0-9][0-9]* / [0-9][0-9]*', errstr)
            if len(matches) == 1 and errstr.strip() == matches[0]:
                continue
            if len(errstr) == 0:
                continue
            printstrs.append(errstr)
        print '        ' + '\n        '.join(printstrs)

    # deal with fasta output
    for seqfo in utils.read_fastx(mafft_outfname):
        gene = seqfo['name']
        seq = seqfo['seq']
        if gene not in glfo['seqs'][region]:  # only really possible if there's a bug in the preceding fifty lines, but oh well, you can't be too careful
            raise Exception('unexpected gene %s in mafft output' % gene)
        aligned_seqs[gene] = seq  # overwrite the old alignment with the new one
    if debug > 1:
        print '  new alignments:'
        for g, seq in aligned_seqs.items():
            print '            %s   %s  %s' % (seq, utils.color_gene(g, width=12 if region == 'v' else 8), '<--- new' if g in genes_without_alignments else '')

    os.remove(already_aligned_fname)
    os.remove(not_aligned_fname)
    os.remove(msa_table_fname)
    os.remove(aligned_and_not_fname)
    os.remove(mafft_outfname)
    os.rmdir(tmpdir)

    return aligned_seqs
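
One fragility worth noting: the chain of os.remove calls plus os.rmdir only runs if everything above succeeded, so a mafft failure leaves the temp dir behind. A sketch of the same cleanup with try/finally and shutil (a drop-in idea, not what the original does):

import shutil
import tempfile

tmpdir = tempfile.mkdtemp()
try:
    pass  # ... write the fasta and msa-table files, run mafft, read the output, as above ...
finally:
    shutil.rmtree(tmpdir)  # removes the directory and anything left inside it, even on failure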
Example No. 25
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo

    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]  # it's really important to set both 'seqs' and 'input_seqs', since both are already in there from the naive line
        mline['input_seqs'] = [sfo['seq']]
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        try:
            utils.add_implicit_info(glfo, mline)
        except Exception:  # TODO maybe remove eventually, though it shouldn't hurt anything: added to track down crashes on unequal naive/mature sequence lengths, which turned out to just be a job dying partway through writing a .fa file
            print 'implicit info adding failed for ievent %d in %s' % (ievent, outdir)
            lines = traceback.format_exception(*sys.exc_info())
            print utils.pad_lines(''.join(lines))  # NOTE this will still crash on the next line if implicit info adding failed
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
        if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4):  # eh, don't really need to check for both the kd and nwk files, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
            cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
            utils.run_ete_script(cmd, ete_path, debug=args.n_procs==1)
        nodefo = {}
        with open(kdfname) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd' : float(line['kd']),
                    'relative_kd' : float(line['relative_kd']),
                    'lambda' : line.get('lambda', None),
                    'target_index' : int(line['target_index']),
                }
        if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print '        in final_line, but missing from kd file: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo))
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
        tree = treeutils.get_dendro_tree(treefname=nwkfname)
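        # presumably converts branch lengths from numbers of mutations to per-site units (by dividing by the mean sequence length)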
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line