Example no. 1
def get_character_str(character, charval):
    if character == 'subject':
        return charval
    elif character == 'timepoint':
        if charval == 'merged':
            return ''
        elif charval[0] == 'W':
            return 'week %d' % int(charval[1:])
        elif charval[0] == 'M':
            return 'month %d' % int(charval[1:])
        elif 'dpi' in charval:
            return charval.replace('dpi', ' dpi')
        elif charval[0] in ['p', 'm'] and charval[-1] in ['h', 'd']:
            plusminusstr = charval[0].replace('p', '+').replace('m', '-')
            number = int(charval[1:-1])
            unitstr = charval[-1].replace('h', 'hour').replace('d', 'day')
            return '%s%d %s%s' % (plusminusstr, number, unitstr, utils.plural(number))
        elif 'week' in charval:
            return 'week %d' % int(charval.replace('week', ''))
        else:
            raise Exception('not sure what to do with %s' % charval)
    elif character == 'isotype':
        return 'Ig%s' % charval.upper()
    else:
        assert False
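
Note: this example and most of the later partis-style ones (e.g. nos. 4, 5, 7, 8, 11, 12, 14-17) call utils.plural(n) with a bare count and splice the result onto a noun, as in '%d base%s' % (n, utils.plural(n)). None of these excerpts show the helper itself; a minimal sketch of what it presumably does (an assumption, not the actual partis implementation):

def plural(count, suffix='s'):
    # hypothetical stand-in for utils.plural(): return the plural suffix for a count
    return suffix if count != 1 else ''

# '%d indel%s' % (3, plural(3)) --> '3 indels';  '%d indel%s' % (1, plural(1)) --> '1 indel'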
Example no. 2
    async def star_who(self, ctx, message_id: int):
        query = """
                SELECT ARRAY(
                    SELECT starrers.author_id
                    FROM starboard_entries
                    JOIN starrers
                    ON starboard_entries.message_id=starrers.message_id
                    WHERE starboard_entries.guild_id=$1
                    AND (starboard_entries.message_id=$2 OR starboard_entries.bot_message_id=$2)
                );
                """
        member_ids = await ctx.pool.fetchval(query, ctx.guild.id, message_id)
        if member_ids is None:
            return await ctx.send('This message has not been starred.')

        members = list(
            map(str, filter(None, map(ctx.guild.get_member, member_ids))))

        base = utils.plural('star', len(member_ids))
        if len(member_ids) > len(members):
            base += f' ({len(member_ids) - len(members)} left server)'

        try:
            paginator = utils.EmbedPaginator(ctx, entries=members, per_page=20)
            paginator.embed.title = base
            await paginator.paginate()
        except Exception as e:
            await ctx.send(e)
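
This snippet and example no. 10 use a different signature, utils.plural(word, count) with an optional ending kwarg, and expect back a ready-made phrase such as '3 stars' or '2 prefixes'. A plausible sketch under that assumption (the bot's real helper may differ):

def plural(word, count, ending='s'):
    # hypothetical: format a count together with a correctly pluralized noun
    return f'{count} {word}' if count == 1 else f'{count} {word}{ending}'

# plural('star', 3) --> '3 stars';  plural('prefix', 2, ending='es') --> '2 prefixes'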
Example no. 3
def member_weight(member: BinaryAgent,
                  target_value: int,
                  owned_goods: set,
                  remaining_goods: set,
                  num_of_families: int = 2) -> float:
    """
    Calculate the voting-weight of the given member with the given owned goods and remaining goods.

    >>> Alice = BinaryAgent({"w","x"})
    >>> Bob   = BinaryAgent({"w","x","y","z"})
    >>> member_weight(Alice, 1, set(), {"x","y","z"})
    0.5
    >>> member_weight(Bob, 2, set(), {"x","y","z"})
    0.375
    """
    member_remaining_value = member.value(
        remaining_goods)  # the "r" of the member
    member_current_value = member.value(owned_goods)
    member_should_get_value = target_value - member_current_value  # the "s" of the member
    the_member_weight = weight(member_remaining_value, member_should_get_value,
                               num_of_families)
    members_string = "{} member{}".format(member.cardinality,
                                          plural(member.cardinality))
    desired_goods_string = ",".join(sorted(member.desired_goods))
    member_weight.logger.info(
        AGENT_WEIGHT_FORMAT.format(members_string, desired_goods_string,
                                   member_remaining_value,
                                   member_should_get_value, the_member_weight))
    return the_member_weight
Example no. 4
def get_dbg_str(indelfo):
    if len(indelfo['qr_gap_seq']) != len(indelfo['gl_gap_seq']):
        print indelfo['qr_gap_seq']
        print indelfo['gl_gap_seq']
        raise Exception('different length qr and gl gap seqs (see previous lines)')
    qrprintstr, glprintstr = [], []
    for ich in range(len(indelfo['qr_gap_seq'])):
        qrb, glb = indelfo['qr_gap_seq'][ich], indelfo['gl_gap_seq'][ich]
        qrcolor, glcolor = None, None
        if qrb in utils.gap_chars or glb in utils.gap_chars:
            qrcolor = 'light_blue'
            glcolor = 'light_blue'
        elif qrb in utils.ambiguous_bases:
            qrcolor = 'light_blue'
        elif glb in utils.ambiguous_bases:
            glcolor = 'light_blue'
        elif qrb != glb:
            qrcolor = 'red'
        qrprintstr.append(utils.color(qrcolor, qrb if qrb not in utils.gap_chars else '*'))  # change it to a star just 'cause that's what it originally was... at some point we should switch to just leaving it as whatever gap char it was
        glprintstr.append(utils.color(glcolor, glb if glb not in utils.gap_chars else '*'))
    qrprintstr = ''.join(qrprintstr)
    glprintstr = ''.join(glprintstr)

    gene_str = ''
    gwidth = str(len('query'))
    if 'v' in indelfo['genes']:
        gene_str = utils.color_gene(indelfo['genes']['v'], width=int(gwidth), leftpad=True)
        gwidth = str(utils.len_excluding_colors(gene_str))
    dj_gene_str = ' '.join([utils.color_gene(indelfo['genes'][r]) for r in 'dj' if r in indelfo['genes']])
    dbg_str_list = [('  %' + gwidth + 's  %s  %s') % (gene_str, glprintstr, dj_gene_str),
                    ('  %' + gwidth + 's  %s') % ('query', qrprintstr)]
    for idl in indelfo['indels']:
        dbg_str_list.append('%10s: %d base%s at %d (%s)' % (idl['type'], idl['len'], utils.plural(idl['len']), idl['pos'], idl['seqstr']))
    return '\n'.join(dbg_str_list)
Example no. 5
    def too_close_to_already_added_gene(self, new_seq, new_alleles, debug=False):
        for added_name, added_info in new_alleles.items():
            n_snps = utils.hamming_distance(added_info['seq'], new_seq, align=True)
            if n_snps < self.min_n_snps or n_snps < self.args.n_max_snps:
                if debug:
                    print 'too close (%d snp%s) to gene we just added %s' % (n_snps, utils.plural(n_snps), utils.color_gene(added_name))
                return True
        return False
Example no. 6
def check_unread():
    dialogs = api.messages.getDialogs(count=200, unread=True)
    if dialogs['count'] > 0:
        msg_ids = []
        d = utils.plural(dialogs['count'])
        response = '<b>' + str(
            dialogs['count']
        ) + ' ' + d + ' с непрочитанными сообщениями:</b>\n'
        for i in dialogs['items']:
            sleep(2)
            user_id = i['message']['user_id']
            msg_ids.append(i['message']['id'])
            try:
                user = api.users.get(user_ids=user_id)[0]
                full_name = utils.escapize(user['first_name'] + ' ' +
                                           user['last_name'])
                response += full_name + ' ' + str(user_id) + '\n'
            except exceptions.VkException:
                pass
            # User notification
            if user_id > 0:
                targets = utils.dbget(user_id)
                if targets is not None:
                    for t in targets:
                        u = api.users.get(user_ids=user_id, name_case='gen')[0]
                        u_nom = api.users.get(user_ids=user_id)[0]
                        full_name = utils.escapize(u['first_name'] + ' ' +
                                                   u['last_name'])
                        full_name_nom = utils.escapize(u_nom['first_name'] +
                                                       ' ' +
                                                       u_nom['last_name'])
                        sud = str(u_nom['id'])
                        utils.dbadd('activity',
                                    '✉️ ' + full_name_nom + ' - ' + sud)
                        text = 'Есть новые сообщения от <b>' + full_name + '.</b> Id: ' + sud + '\nВведите <code>/d ' + sud + '</code> чтобы получить историю сообщений'
                        try:
                            tg.sendMessage(chat_id=t,
                                           text=text,
                                           parse_mode='HTML',
                                           disable_web_page_preview=True)
                        except Exception as e:
                            et = 'Exception:\n' + str(
                                e
                            ) + '\nIn check_unread while sending to target - ' + str(
                                t)
                            # noinspection PyTypeChecker
                            tg.send_message(admin, et)
                        sleep(2)
        tg.send_message(log_channel, response, 'HTML', True)
        try:
            api.messages.markAsRead(message_ids=msg_ids)
        except exceptions.VkException as e:
            ete = 'Exception:\n' + str(
                e) + '\nIn check_unread while markAsRead'
            # noinspection PyTypeChecker
            tg.send_message(admin, ete)
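
Here utils.plural(dialogs['count']) evidently returns the correctly declined Russian noun form for the count that precedes it in the response string. A minimal sketch of such a selector, assuming the noun is hard-coded as 'диалог' (the bot's actual helper isn't shown in this excerpt):

def plural(count):
    # hypothetical: Russian plural form for <count> (1 -> 'диалог', 2-4 -> 'диалога', otherwise 'диалогов')
    if count % 100 in (11, 12, 13, 14):
        return 'диалогов'
    if count % 10 == 1:
        return 'диалог'
    if count % 10 in (2, 3, 4):
        return 'диалога'
    return 'диалогов'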
Example no. 7
    def choose_seqs_to_remove(
            chain_ids,
            max_hdist=4,
            tdbg=False):  # choose one of <chain_ids> to eliminate
        # first mark any ids with missing annotations for removal, then look for same-locus pairs whose sequences are equivalent (within <max_hdist>) and remove one of each
        ids_to_remove = set(u for u in chain_ids if getloc(u) == '?')
        if tdbg and len(
                ids_to_remove
        ) > 0:  # i think this actually can't happen a.t.m. TODO maybe remove it
            print '      removed %d with missing annotations' % len(
                ids_to_remove)

        dbgstr = []
        n_equivalent = 0
        for tpair in itertools.combinations(chain_ids, 2):
            if len(set(getloc(u) for u in tpair)) > 1:
                continue
            if len(set(len(gval(u, 'seqs')) for u in tpair)) > 1:
                continue
            hdist = utils.hamming_distance(*[gval(u, 'seqs') for u in tpair])
            if tdbg:
                dbgstr.append(
                    utils.color('blue' if hdist == 0 else 'yellow',
                                '%d' % hdist))
            if hdist <= max_hdist:  # TODO would be nice to be able to combine their sequences, but I think propagating the resulting annotation modifications would be hard
                # print '      identical sequence overlap, choosing longer one'
                better_id, worse_id = sorted(
                    tpair, key=lambda q: utils.ambig_frac(gval(q, 'seqs'))
                )  # TODO if we're tossing one with hdist > 0, maybe should take the lower-shm one if they're the same length?
                ids_to_remove.add(worse_id)
                n_equivalent += 1
        if tdbg and len(dbgstr) > 0:
            print '        %d pair%s equivalent with hdists %s' % (
                n_equivalent, utils.plural(n_equivalent), ' '.join(dbgstr))

        # remove unproductive
        dbgstr = []
        unproductive_ids = []
        for uid in chain_ids:
            if not utils.is_functional(
                    all_antns[uid], all_antns[uid]['unique_ids'].index(uid)):
                unproductive_ids.append(uid)
                if tdbg:
                    dbgstr.append(
                        utils.is_functional_dbg_str(
                            all_antns[uid],
                            all_antns[uid]['unique_ids'].index(uid),
                            sep='+'))
        # unproductive_ids = [u for u in chain_ids if not utils.is_functional(all_antns[u], all_antns[u]['unique_ids'].index(u))]  # this way is only one line, which may or may not be nicer
        if tdbg and len(unproductive_ids) > 0:
            print '        %d unproductive  %s' % (len(unproductive_ids),
                                                   ',  '.join(dbgstr))
        ids_to_remove |= set(unproductive_ids)

        return ids_to_remove
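
The pair loop above treats two same-locus, same-length sequences as equivalent when their Hamming distance is at most max_hdist, then marks the one with more ambiguous bases for removal. A self-contained sketch of that pairwise-equivalence step, with a local stand-in for utils.hamming_distance (the partis version also supports align=True, which this ignores):

import itertools

def hamming_distance(s1, s2):
    # stand-in: equal-length, unaligned hamming distance
    assert len(s1) == len(s2)
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))

def equivalent_pairs(seqs, max_hdist=4):
    # index pairs whose sequences are the same length and within <max_hdist> of each other
    return [(i, j) for (i, s1), (j, s2) in itertools.combinations(enumerate(seqs), 2)
            if len(s1) == len(s2) and hamming_distance(s1, s2) <= max_hdist]

# equivalent_pairs(['ACGT', 'ACGA', 'TTTT'], max_hdist=1) --> [(0, 1)]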
Example no. 8
def add_indels(n_indels, qrseq, glseq, mean_length, codon_positions, indel_location=None, indel_positions=None, keep_in_frame=False, dbg_pad=0, debug=False):
    def getpos():  # if <pos> is specified we use that, otherwise we use <indel_location> to decide the region of the sequence from which to choose a position
        if indel_location is None:  # uniform over entire sequence
            return random.randint(5, len(qrseq) - 6)  # this will actually exclude either before the first index or after the last index. No, I don't care.
        elif indel_location == 'v':  # within the meat of the v
            return random.randint(5, codon_positions['v'])  # NOTE this isn't actually right, since the codon positions get modified as we add each indel... but it won't usually make a difference
        elif indel_location == 'cdr3':  # inside cdr3
            return random.randint(codon_positions['v'], codon_positions['j'])
        else:
            assert False
    def getlen():
        length = numpy.random.geometric(1. / mean_length)
        if keep_in_frame:
            itry = 0
            while length % 3 != 0:
                length = numpy.random.geometric(1. / mean_length)
                itry += 1
                if itry > 9999:
                    raise Exception('tried too many times to get in-frame indel length')
        return length
    def overlaps(pos, length):  # see if there are any existing indels close to where we're thinking of putting this one NOTE in practice this _really_ shouldn't happen much -- there should be only a couple of indels per sequence at most -- this just keeps other things (e.g. indelfo consistency checks) from getting confused and crashing
        for gapseq in (indelfo['qr_gap_seq'], indelfo['gl_gap_seq']):
            if len(gapseq) < pos + length + 1:
                return True
            if utils.gap_len(gapseq[pos - length : pos + length]) > 0:  # this leaves a pretty large, albeit inexact, buffer
                return True
        return False

    # choose positions and lengths
    if indel_positions is None:
        indel_positions = [None for _ in range(n_indels)]
    if debug:
        print '%sadding %d indel%s' % (dbg_pad * ' ', n_indels, utils.plural(n_indels))

    # then build the indelfo
    indelfo = get_empty_indel()
    indelfo['genes'] = {}  # it's kind of awkward to have the match info here, but I need some way to pass it between the aligner that's calling the indel (typically vsearch) and the aligner that's using it (typically sw)
    indelfo['qr_gap_seq'], indelfo['gl_gap_seq'] = qrseq, glseq
    indelfo['reversed_seq'] = qrseq
    for pos in indel_positions:
        length = getlen()
        while pos is None or overlaps(pos, length):
            pos = getpos()
        add_single_indel(indelfo, pos, length, codon_positions, keep_in_frame=keep_in_frame, debug=debug)

    # make the "input seq", i.e. without gaps, and account for this in the codon positions
    input_seq = filter(utils.alphabet.__contains__, indelfo['qr_gap_seq'])
    for region in codon_positions:
        codon_positions[region] -= utils.count_gap_chars(indelfo['qr_gap_seq'], aligned_pos=codon_positions[region])

    if debug:
        print utils.pad_lines(get_dbg_str(indelfo), dbg_pad + 4)

    return input_seq, indelfo
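
getlen() above relies on numpy.random.geometric(1. / mean_length) having mean roughly mean_length, and on the fact that re-drawing until the value is a multiple of 3 (the keep_in_frame case) only nudges that mean upward. A quick standalone check of both claims:

import numpy

lengths = numpy.random.geometric(1. / 5, size=100000)
print(lengths.mean())                    # ~5, i.e. roughly <mean_length>
print(lengths[lengths % 3 == 0].mean())  # a bit above 5: keeping only in-frame (multiple-of-3) lengths raises the mean slightly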
Example no. 9
def new_endpoint(table_name, base_name, id):
    table_name_py = u.jar_to_py(table_name)
    table_name_py_plur = u.plural(u.jar_to_py(table_name))

    text = "ns" + str(
        id
    ) + ".add_resource(resources_" + base_name + "." + table_name + "List, '/" + table_name_py_plur + "')\n"
    text += "ns" + str(
        id
    ) + ".add_resource(resources_" + base_name + "." + table_name + "Resource, '/" + table_name_py_plur + "/<string:id_" + table_name_py + ">')\n"

    return text
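
In this example (and in example no. 13 below) plural takes a word rather than a count and returns its plural form, here to build the '/<table_name_py_plur>' endpoint path. A naive sketch covering only the regular English cases (the real u.plural / utils.plural helpers may well use a proper inflection library):

def plural(word):
    # hypothetical: naive English pluralization ('user' -> 'users', 'city' -> 'cities', 'box' -> 'boxes')
    if word.endswith('y') and word[-2:-1] not in 'aeiou':
        return word[:-1] + 'ies'
    if word.endswith(('s', 'x', 'z', 'ch', 'sh')):
        return word + 'es'
    return word + 's'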
Example no. 10
    async def prefix(self, ctx):
        """Manages the server's custom prefixes.

        If called without a subcommand, this will list the currently set
        prefixes.
        """

        prefixes = [ctx.me.mention]
        prefixes.extend(
            ctx.bot.command_prefix(ctx.bot, ctx.message, mentions=False))

        embed = discord.Embed(title='Prefixes')
        embed.description = '\n'.join(
            f'{index}. {value}' for index, value in enumerate(prefixes, 1))
        embed.set_footer(
            text=utils.plural('prefix', len(prefixes), ending='es'))
        await ctx.send(embed=embed)
Example no. 11
def read_sequence_file(infname,
                       is_data,
                       n_max_queries=-1,
                       args=None,
                       simglfo=None,
                       quiet=False,
                       more_input_info=None):
    # NOTE renamed this from get_seqfile_info() since I'm changing the return values, but I don't want to update the calls everywhere (e.g. in compareutils)
    yaml_glfo = None
    suffix = utils.getsuffix(infname)
    if suffix in delimit_info:
        seqfile = open(
            infname
        )  # closes on function exit. no, this isn't the best way to do this
        reader = csv.DictReader(seqfile, delimiter=delimit_info[suffix])
    elif suffix in ['.fa', '.fasta', '.fastx']:
        reader = utils.read_fastx(
            infname,
            name_key='unique_ids',
            seq_key='input_seqs',
            add_info=False,
            sanitize=True,
            n_max_queries=
            n_max_queries,  # NOTE don't use istartstop kw arg here, 'cause it f***s with the istartstop treatment in the loop below
            queries=(args.queries if
                     (args is not None and not args.abbreviate) else None)
        )  # NOTE also can't filter on args.queries here if we're also translating
    elif suffix == '.yaml':
        yaml_glfo, reader, _ = utils.read_yaml_output(
            infname,
            n_max_queries=n_max_queries,
            synth_single_seqs=True,
            dont_add_implicit_info=True
        )  # not really sure that long term I want to synthesize single seq lines, but for backwards compatibility it's nice a.t.m.
        if not is_data:
            simglfo = yaml_glfo  # doesn't replace the contents, of course, which is why we return it
    else:
        raise Exception('unhandled file extension %s' % suffix)

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    potential_names, used_names = None, None  # for abbreviating
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print '  %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (
                utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception(
                'couldn\'t find a sequence column in %s (you can set this with --seq-column)'
                % infname)
        if suffix != '.yaml':
            utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        if uid in input_info:
            new_uid = uid
            iid = 2
            while new_uid in input_info:
                new_uid = uid + '-' + str(iid)
                iid += 1
            print '  %s uid %s already read from input file %s, so replacing with new uid %s' % (
                utils.color('yellow', 'warning'), uid, infname, new_uid)
            uid = new_uid
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))
        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid, potential_names, used_names = utils.choose_new_uid(
                    potential_names, used_names)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line[
                    'reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' %
                            (uid, infname))

        if any(c not in utils.alphabet for c in inseq):
            unexpected_chars = set(
                [ch for ch in inseq if ch not in utils.alphabet])
            raise Exception(
                'unexpected character%s %s (not among %s) in input sequence with id %s:\n  %s'
                % (utils.plural(len(unexpected_chars)), ', '.join([
                    ('\'%s\'' % ch) for ch in unexpected_chars
                ]), utils.nukes + utils.ambiguous_bases, uid, inseq))

        # da business
        input_info[uid] = {
            'unique_ids': [
                uid,
            ],
            'seqs': [
                inseq,
            ]
        }

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])
            for line_key in utils.input_metafile_keys.values():
                if line_key in reco_info[
                        uid]:  # this is kind of weird to copy from sim info to input info, but it makes sense because affinity is really meta info (the only other place affinity could come from is --input-metafname below). Here i'm defining meta info more or less as any input info besides name and sequence (i think the distinction is only really important because we want to support fastas, which can't [shouldn't!] handle anything else)
                    input_info[uid][line_key] = copy.deepcopy(
                        reco_info[uid][line_key]
                    )  # note that the args.input_metafname stuff below should print a warning if you've also specified that (which you shouldn't, if it's simulation)

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            if not quiet:  # just adding <quiet>, and too lazy to decide what other print statements it should affect -- this is the only one I care about right now
                print '  --n-max-queries: stopped after reading %d queries from input file' % len(
                    input_info)
            break

    if more_input_info is not None:  # if you use this on simulation, the extra queries that aren't in <reco_info> may end up breaking something down the line (but I don't imagine this really getting used on simulation)
        if len(set(more_input_info) & set(input_info)) > 0:
            print '  %s found %d queries in both --infname and --queries-to-include-fname (note that we don\'t check here that they correspond to the same sequence): %s' % (
                utils.color('red', 'note:'),
                len(set(more_input_info) & set(input_info)),
                ' '.join(set(more_input_info) & set(input_info))
            )  # not necessarily a problem, but you probably *shouldn't* have sequences floating around in two different files
        if args is not None and args.seed_unique_id is not None and args.seed_unique_id in more_input_info:
            found_seed = True
        input_info.update(more_input_info)
    if args is not None and args.input_metafname is not None:
        read_input_metafo(args.input_metafname,
                          input_info.values(),
                          debug=True)
    post_process(input_info, reco_info, args, infname, found_seed, is_data,
                 iline)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info, yaml_glfo
Example no. 12
def read_input_metafo(input_metafname, annotation_list, required_keys=None, n_warn_print=10, debug=False):  # read input metafo from <input_metafname> and put in <annotation_list> (when we call this below, <annotation_list> is <input_info>)
    # NOTE <annotation_list> doesn't need to be real annotations, it only uses the 'unique_ids' key
    metafo = utils.read_json_yaml(input_metafname)
    if any(isinstance(tkey, int) for tkey in metafo):  # would be better to check for not being a string, but that's harder, and this probably only happens for my simulation hash ids
        raise Exception('meta info keys need to be string (maybe just need to add \'\' around sequence ids in yaml file), but got: %s' % ' '.join(str(type(tk)) for tk in metafo if isinstance(tk, int)))
    metafile_keys = set(k for mfo in metafo.values() for k in mfo)
    if len(metafile_keys & set(utils.input_metafile_keys)) == 0:
        raise Exception('no overlap between %d metafile keys and %d allowed keys:\n    %s\n    %s' % (len(metafile_keys), len(utils.input_metafile_keys), ' '.join(metafile_keys), ' '.join(utils.input_metafile_keys)))
    if required_keys is not None and len(set(required_keys) - metafile_keys) > 0:
        raise Exception('required metafile key(s) (%s) not found in %s' % (', '.join(set(required_keys) - metafile_keys), input_metafname))
    added_uids, added_keys = set(), set()
    n_modified, modified_keys = 0, set()
    for line in annotation_list:
        for input_key, line_key in utils.input_metafile_keys.items():
            if line_key not in utils.linekeys['per_seq']:
                raise Exception('doesn\'t make sense to have per-seq meta info that isn\'t per-sequence')
            mvals = [None for _ in line['unique_ids']]
            for iseq, uid in enumerate(line['unique_ids']):
                if uid not in metafo or input_key not in metafo[uid]:
                    continue
                mval = metafo[uid][input_key]
                if line_key in line and mval != line[line_key][iseq]:  # the meta info shouldn't generally already be in the input file if you're also specifying a separate meta file
                    if n_modified < n_warn_print:
                        print '  %s replacing \'%s\'/\'%s\' value for \'%s\' with value from %s: %s --> %s' % (utils.color('yellow', 'warning'), input_key, line_key, uid, input_metafname, line[line_key][iseq], mval)
                    n_modified += 1
                    modified_keys.add(line_key)
                if input_key == 'multiplicity' and mval == 0:
                    raise Exception('input meta info value for \'multiplicity\' must be at least 1 (since it includes this sequence), but got %d for \'%s\'' % (mval, uid))
                added_uids.add(uid)
                added_keys.add(line_key)
                mvals[iseq] = mval
            if mvals.count(None) < len(mvals):  # we used to add it even if they were all empty, but that means that you always get all the possible input meta keys, which is super messy (the downside of skipping them is some seqs can have them while others don't)
                line[line_key] = mvals
    if n_modified > 0:  # should really add this for the next function as well
        print '%s replaced input metafo for %d instances of key%s %s (see above, only printed the first %d)' % (utils.color('yellow', 'warning'), n_modified, utils.plural(len(modified_keys)), ', '.join(modified_keys), n_warn_print)
    if debug:
        print '  --input-metafname: added meta info (%s) for %d sequences from %s' % (', '.join('\'%s\'' % k for k in added_keys), len(added_uids), input_metafname)
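
Judging from the lookups above (metafo[uid][input_key], with 'multiplicity' checked explicitly and 'affinity' mentioned in example no. 11), --input-metafname points at a JSON/YAML file mapping each sequence id to a dict of per-sequence meta values. A sketch of writing such a file, with hypothetical ids and keys (the full set of allowed keys lives in utils.input_metafile_keys, which isn't shown in these excerpts):

import json

example_metafo = {
    'seq-1': {'multiplicity': 3, 'affinity': 0.02},  # ids must be strings (see the isinstance check at the top of the function)
    'seq-2': {'multiplicity': 1},
}
with open('input-meta.json', 'w') as mfile:
    json.dump(example_metafo, mfile, indent=2)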
Example no. 13
    def filter_products(x):
        words = x["title"].lower().split()
        words.extend([utils.plural(word)
                      for word in words])  # Add plurals
        return " ".join(words)
Example no. 14
    def too_close_to_existing_glfo_gene(self, clusterfo, new_seq, template_seq, template_cpos, template_gene, debug=False):
        if len(new_seq[:template_cpos]) != len(template_seq[:template_cpos]):  # TODO update this to use the new n_snps from the aligned template/new seqs
            return False

        mean_j_mutations = numpy.mean([self.all_j_mutations[seqfo['name']] for seqfo in clusterfo['seqfos']])  # TODO <self.all_j_mutations> uses everybody in the cluster, rather than just the representative. It'd be nice to synchronize this with other things
        # TODO should probably update this to do the same thing (with min([])) as up in decide_whether_to_remove_template_genes(), or just use the new <align> option to utils.hamming_distance (although that would be slower, and this seems to work ok)
        pre_cpos_snps = utils.hamming_distance(new_seq[:template_cpos], template_seq[:template_cpos])
        factor = 1.75
        if pre_cpos_snps < self.min_n_snps or pre_cpos_snps < factor * mean_j_mutations:  # i.e. we keep if it's *further* than factor * <number of j mutations> from the closest existing allele (should presumably rescale by some factor to go from j --> v, but it seems like the factor's near to 1.)
            if debug:
                print 'too close to existing glfo gene %s (%d snp%s < %.2f = %.2f * %.1f mean j mutation%s)' % (utils.color_gene(template_gene), pre_cpos_snps, utils.plural(pre_cpos_snps), factor * mean_j_mutations, factor, mean_j_mutations, utils.plural(mean_j_mutations))
            return True

        return False
Example no. 15
    def run_treesim(self, seed, outfname, workdir):
        if self.args.debug or utils.getsuffix(outfname) == '.nwk':
            print '  generating %d tree%s,' % (
                self.args.n_trees, utils.plural(self.args.n_trees)),
            if self.args.constant_number_of_leaves:
                print 'all with %s leaves' % str(self.args.n_leaves)
            else:
                print 'n-leaves from %s' % (
                    'hist in parameter dir' if self.final_nldist == 'hist' else
                    '%s distribution with parameter %s' %
                    (self.final_nldist, str(self.args.n_leaves)))
            if self.args.debug:
                print '        mean branch lengths from %s' % (
                    self.parameter_dir
                    if self.parameter_dir is not None else 'scratch')
                for mtype in [
                        'all',
                ] + utils.regions:
                    print '         %4s %7.3f (ratio %7.3f)' % (
                        mtype, self.branch_lengths[mtype]['mean'],
                        self.branch_lengths[mtype]['mean'] /
                        self.branch_lengths['all']['mean'])

        ages, treestrs = [], []

        cmd_lines = []
        pkgname = 'TreeSim'  # TreeSimGM when root_mrca_weibull_parameter is set, otherwise TreeSim
        if self.args.root_mrca_weibull_parameter is not None:
            pkgname += 'GM'
        cmd_lines += ['require(%s, quietly=TRUE)' % pkgname]
        cmd_lines += ['set.seed(' + str(seed) + ')']
        for itree in range(self.args.n_trees):
            n_leaves = self.choose_n_leaves()
            age = self.choose_full_sequence_branch_length()
            ages.append(age)
            if n_leaves == 1:  # add singleton trees by hand
                treestrs.append('t1:%f;' % age)
                continue
            treestrs.append(None)

            # NOTE these simulation functions seem to assume that we want all the extant leaves to have the same height. Which is kind of weird. Maybe makes more sense at some point to change this.
            params = {'n': n_leaves, 'numbsim': self.n_trees_each_run}
            if self.args.root_mrca_weibull_parameter is None:
                fcn = 'sim.bd.taxa.age'
                params['lambda'] = 1  # speciation_rate
                params['mu'] = 0.5  # extinction_rate
                params['age'] = age
            else:
                fcn = 'sim.taxa'
                params['distributionspname'] = '"rweibull"'
                params[
                    'distributionspparameters'] = 'c(%f, 1)' % self.args.root_mrca_weibull_parameter
                params[
                    'labellivingsp'] = '"t"'  # TreeSim doesn't let you do this, but a.t.m. this is their default
            cmd_lines += [
                'trees <- %s(%s)' % (fcn, ', '.join(
                    ['%s=%s' % (k, str(v)) for k, v in params.items()]))
            ]
            cmd_lines += [
                'write.tree(trees[[1]], \"' + outfname + '\", append=TRUE)'
            ]

        if None not in treestrs:  # if every tree has one leaf, we don't need to run R
            open(outfname, 'w').close()
        else:
            if os.path.exists(outfname):
                os.remove(outfname)
            utils.run_r(
                cmd_lines,
                workdir,
                print_time='tree generation' if self.args.debug else None)

        with open(outfname) as treefile:
            for itree, tstr in enumerate(treestrs):
                if tstr is None:
                    treestrs[itree] = treefile.readline().strip()
            if None in treestrs:
                raise Exception(
                    'didn\'t read enough trees from %s: still %d empty places in treestrs'
                    % (outfname, treestrs.count(None)))

        # rescale branch lengths (TreeSim lets you specify the number of leaves and the height at the same time, but TreeSimGM doesn't, and TreeSim's numbers are usually a little off anyway... so we rescale everybody)
        for itree in range(len(ages)):
            treestrs[itree] = '(%s):0.0;' % treestrs[itree].rstrip(
                ';'
            )  # the trees it spits out have non-zero branch length above root (or at least that's what the newick strings turn into when dendropy reads them), which is f****d up and annoying, so here we add a new/real root at the top of the original root's branch
            treestrs[itree] = treeutils.rescale_tree(ages[itree],
                                                     treestr=treestrs[itree])

        return ages, treestrs
Example no. 16
def merge_chains(
    ploci,
    cpaths,
    antn_lists,
    iparts=None,
    check_partitions=False,
    true_partitions=None,
    debug=False
):  # NOTE the clusters in the resulting partition generally have their uids in a totally different order than in either of the original partitions
    # ----------------------------------------------------------------------------------------
    def akey(klist):
        return ':'.join(klist)

    # ----------------------------------------------------------------------------------------
    def any_in_common(
        l1, l2
    ):  # true if any uids in any cluster in l1 are found in any clusters in l2
        for tclust in l1:
            tset = set(tclust)
            if any(len(tset & set(tc)) > 0 for tc in l2):
                return True
        return False

    # ----------------------------------------------------------------------------------------
    def common_clusters(
        tclust,
        tlist,
        return_indices=False
    ):  # return all clusters in tlist that have uids in common with tclust
        tset = set(tclust)
        return [(i if return_indices else c) for i, c in enumerate(tlist)
                if len(set(c) & tset) > 0]

    # ----------------------------------------------------------------------------------------
    def is_clean_partition(
        putative_partition
    ):  # make sure the list of clusters is actually disjoint
        return not any(
            len(set(c1) & set(c2)) > 0
            for c1, c2 in itertools.combinations(putative_partition, 2))

    # ----------------------------------------------------------------------------------------
    # Starting with <single_cluster> (from one chain) and <cluster_list> (all clusters in the other chain that overlap with <single_cluster>), decide which of the "splits" (i.e. cluster boundaries) in <cluster_list> should be applied to <single_cluster>.
    # Reapportions all uids from <single_cluster> and <cluster_list> into <return_clusts>, splitting definitely/first by cdr3, and then (if over some threshold) by naive hamming distance.
    def resolve_discordant_clusters(single_cluster,
                                    single_annotation,
                                    cluster_list,
                                    annotation_list,
                                    tdbg=False):
        # NOTE single_cluster and cluster_list in general have quite different sets of uids, and that's fine. All that matters here is we're trying to find all the clusters that should be split from one another (without doing some all against all horror)
        if len(cluster_list) == 1:  # nothing to do
            return [single_cluster
                    ]  # NOTE <single_cluster> doesn't get used after here
        adict = utils.get_annotation_dict(annotation_list)
        cdr3_groups = utils.group_seqs_by_value(
            cluster_list, lambda c: adict[akey(c)]['cdr3_length']
        )  # group together the clusters in <cluster_list> that have the same cdr3 length (there's already utils.split_clusters_by_cdr3(), but it uses different inputs (e.g. sw_info) so i think it makes sense to not use it here)
        if tdbg:
            print '   %s one cluster vs %d clusters' % (utils.color(
                'blue', 'syncing'), len(cluster_list))
            print '     split into %d cdr3 groups' % len(cdr3_groups)
        lo_hbound, hi_hbound = utils.get_naive_hamming_bounds(
            'likelihood',
            overall_mute_freq=numpy.mean(
                [f for l in annotation_list for f in l['mut_freqs']])
        )  # these are the wider bounds, so < lo is almost certainly clonal, > hi is almost certainly not
        return_clusts = []
        for icdr, cdrgroup in enumerate(
                cdr3_groups
        ):  # within each cdr3 group, split (i.e. use the cluster boundaries from cluster_list rather than single_cluster) if naive hfrac is > hi_hbound (but then there's shenanigans to adjudicate between different possibilities)
            if tdbg:
                print '      %s hfrac bound %.2f' % (utils.color(
                    'purple', 'icdr %d' % icdr), hi_hbound)

            # first figure out who needs to be split from whom
            clusters_to_split = {
                akey(c): []
                for c in cdrgroup
            }  # map from each cluster ('s key) to a list of clusters from which it should be split
            for c1, c2 in itertools.combinations(
                    cdrgroup, 2
            ):  # we could take account of the hfrac of both chains at this point, but looking at only the "split" one rather than the "merged" one, as we do here, is i think equivalent to assuming the merged one has zero hfrac, which is probably fine, since we only split if the split chain is very strongly suggesting we split
                hfrac = utils.hamming_fraction(
                    adict[akey(c1)]['naive_seq'], adict[akey(c2)]['naive_seq']
                )  # all clusters with the same cdr3 len have been padded in waterer so their naive seqs are the same length
                if hfrac > hi_hbound:
                    clusters_to_split[akey(c1)].append(c2)
                    clusters_to_split[akey(c2)].append(c1)

            # then do the splitting, which is accomplished by merging each cluster in <cdrgroup> with every other cluster in <cdrgroup> from which we aren't supposed to split it (i.e. that aren't in its <clusters_to_split>)
            if tdbg:
                print '                  N to     new'
                print '          size    split   cluster?'
            tmpclusts_for_return = [
            ]  # final (return) clusters for this cdr3 class
            for cclust in cdrgroup:
                split_clusts = clusters_to_split[akey(cclust)]
                if tdbg:
                    print '         %4d    %3d' % (len(cclust),
                                                   len(split_clusts)),
                found_one = False
                for rclust in tmpclusts_for_return:  # look for an existing return cluster to which we can merge cclust, i.e. that doesn't have any uids from which we want to split
                    if any_in_common(
                        [rclust], split_clusts
                    ):  # if any uid in rclust is in a cluster from which we want to be split, skip it, i.e. don't merge with that cluster (note that we have to do it by uid because the rclusts are already merged so don't necessarily correspond to any existing cluster)
                        continue
                    # if found_one: print 'it happened!'  # can't happen any more since I switched to 'break' (although see note below)
                    if tdbg: print '     merging with size %d' % len(rclust)
                    rclust += cclust
                    found_one = True
                    break  # i.e. we just merge with the first one we find and stop looking; if there's more than one, it means we could merge all three together if we wanted (triangle inequality-ish, see diagram linked at top of fcn), but i doubt it'll matter either way, and this is easier
                if not found_one:
                    if tdbg: print '      y'
                    tmpclusts_for_return.append(
                        cclust
                    )  # if we didn't find an existing cluster that we can add it to, add it as a new cluster

            return_clusts += tmpclusts_for_return

        if debug:
            print '      returning: %s' % ' '.join(
                [str(len(c)) for c in return_clusts])
            # ptnprint(return_clusts)
        return return_clusts

    # ----------------------------------------------------------------------------------------
    init_partitions = {}
    for tch in utils.chains:
        if iparts is None or ploci[tch] not in iparts:
            init_partitions[tch] = cpaths[ploci[tch]].best()
        else:
            init_partitions[tch] = cpaths[ploci[tch]].partitions[iparts[
                ploci[tch]]]
            print '  %s using non-best partition index %d for %s (best is %d)' % (
                utils.color('red', 'note'), iparts[ploci[tch]], tch,
                cpaths[ploci[tch]].i_best)
# ----------------------------------------------------------------------------------------
# TODO
# return {ploci[ch] : ip for ch, ip in init_partitions.items()}
# ----------------------------------------------------------------------------------------
    l_translations = translate_paired_uids(ploci, init_partitions, antn_lists)
    if debug:
        for tstr, tpart in [('heavy', init_partitions['h']),
                            ('light', init_partitions['l'])]:
            ptnprint(tpart,
                     extrastr=utils.color('blue', '%s  ' % tstr),
                     print_partition_indices=True,
                     n_to_print=1,
                     sort_by_size=False,
                     print_header=tstr == 'heavy')

    common_uids, _, _ = utils.check_intersection_and_complement(
        init_partitions['h'],
        init_partitions['l'],
        only_warn=True,
        a_label='heavy',
        b_label='light'
    )  # check that h and l partitions have the same uids (they're expected to be somewhat different because of either failed queries or duplicates [note that this is why i just turned off default duplicate removal])
    if len(common_uids) == 0:
        raise Exception('no uids in common between heavy and light')

    antn_dict = {
        ch: utils.get_annotation_dict(antn_lists[ploci[ch]])
        for ch in ploci
    }

    final_partition = []
    if debug:
        print '    N        N       hclusts     lclusts       h/l'
        print '  hclusts  lclusts    sizes       sizes      overlaps'
    # For each single cluster in each partition, get a list of the clusters in the other partition that have common uids
    # Pass this cluster + list to a fcn to resolve discrepancies by splitting on the cluster boundaries in <cluster_list> that we're sure of (i.e. that have different cdr3, or very different naive hamming fraction)
    for h_initclust, l_initclust in [
        (c, None) for c in init_partitions['h']
    ] + [
        (None, c) for c in init_partitions['l']
    ]:  # just loops over each single cluster in h and l partitions, but in a way that we know whether the single cluster is from h or l
        single_chain, list_chain = 'h' if l_initclust is None else 'l', 'l' if l_initclust is None else 'h'
        single_cluster = h_initclust if single_chain == 'h' else l_initclust
        cluster_list = common_clusters(single_cluster,
                                       init_partitions[list_chain])
        single_annotation = antn_dict[single_chain][akey(single_cluster)]
        annotation_list = [
            antn_dict[list_chain][akey(c)] for c in cluster_list
        ]

        if debug:
            hclusts, lclusts = ([single_cluster],
                                cluster_list) if single_chain == 'h' else (
                                    cluster_list, [single_cluster])
            overlaps = [[len(set(hc) & set(lc)) for lc in lclusts]
                        for hc in hclusts]
            overlapstr = '   '.join(
                [' '.join(str(ov) for ov in ovlist) for ovlist in overlaps])

            def getcstr(clist):
                return ' '.join(str(len(c)) for c in clist)

            hcstr, lcstr = getcstr(hclusts), getcstr(lclusts)
            cw = 10
            if len(hcstr) < cw and len(lcstr) < cw:  # fits on a single line
                print('    %2d      %2d         %-' + str(cw) + 's  %-' +
                      str(cw) + 's  %s') % (len(hclusts), len(lclusts), hcstr,
                                            lcstr, overlapstr)
            else:  # split the last few columns over multiple lines
                print('    %2d      %2d         %-s') % (len(hclusts),
                                                         len(lclusts), hcstr)
                print('    %2s      %2s         %-' + str(cw) +
                      's%-s') % ('', '', '', lcstr)
                print('    %2s      %2s         %-' + str(cw) + 's%-' +
                      str(cw) + 's   %s') % ('', '', '', '', overlapstr)

        resolved_clusters = resolve_discordant_clusters(
            copy.deepcopy(single_cluster), single_annotation,
            copy.deepcopy(cluster_list), annotation_list)
        if check_partitions:
            assert is_clean_partition(resolved_clusters)
        if debug:
            print '    adding %d resolved cluster%s to %d clusters in final partition' % (
                len(resolved_clusters), utils.plural(
                    len(resolved_clusters)), len(final_partition))
            print '      ifclust N rclusts'
        n_clean = 0
        # for each cluster that's already in <final_partition> that has uids in common with a cluster in <resolved_clusters>, decide how to apportion the common uids (basically we remove them from the larger of the two clusters)
        for ifclust in range(
                len(final_partition)
        ):  # iteration/<ifclust> won't get as far as any clusters that we're just adding (to the end of <final_partition>), which is what we want
            fclust = final_partition[ifclust]
            if not any_in_common(
                [fclust], resolved_clusters
            ):  # this is probably faster than combining it with getting the common cluster indices below, but maybe not
                n_clean += 1
                continue
            irclusts = common_clusters(
                fclust, resolved_clusters, return_indices=True
            )  # indices of any resolved_clusters that overlap with this fclust
            if debug: dbgstr = []
            new_fset = set(
                fclust
            )  # we'll remove uids from this, and then replace fclust with its remains
            for irclust in irclusts:  # resolve any discrepancies between these newly-resolved clusters and fclust
                rset = set(resolved_clusters[irclust])
                common_uids = new_fset & rset
                if len(new_fset) > len(
                        rset
                ):  # remove the common ids from the larger one (effectively splitting according to the splittier one)
                    new_fset -= common_uids
                    if debug:
                        dbgstr.append(
                            '  fclust %d --> %d' %
                            (len(new_fset) + len(common_uids), len(new_fset)))
                else:
                    rset -= common_uids
                    if debug:
                        dbgstr.append(
                            '  rclust %d --> %d' %
                            (len(rset) + len(common_uids), len(rset)))
                resolved_clusters[irclust] = list(rset)
            if debug:
                print '       %4d  %4d  %s' % (ifclust, len(irclusts),
                                               ''.join(dbgstr))
            final_partition[ifclust] = list(new_fset)
        if debug:
            print '       %d fclusts clean' % n_clean
        assert is_clean_partition(resolved_clusters)
        final_partition += resolved_clusters

    if debug:
        print '    removing %d/%d empty clusters' % (final_partition.count(
            []), len(final_partition))
    final_partition = [c for c in final_partition if len(c) > 0]

    # if debug:
    #     print '    final: %s' % ' '.join([str(len(c)) for c in final_partition])
    def chstr(n_before, n_after):
        if n_before == n_after: return ''
        else: return ' ' + utils.color('red', '%+d' % (n_after - n_before))

    print '   N clusters:\n        h %4d --> %-4d%s\n        l %4d --> %-4d%s' % (
        len(init_partitions['h']), len(final_partition),
        chstr(len(init_partitions['h']), len(final_partition)),
        len(init_partitions['l']), len(final_partition),
        chstr(len(init_partitions['l']), len(final_partition)))

    if check_partitions:
        assert is_clean_partition(final_partition)
        for tch, initpart in init_partitions.items():
            _, _, _ = utils.check_intersection_and_complement(
                initpart,
                final_partition,
                only_warn=True,
                a_label=tch,
                b_label='joint'
            )  # check that this initial partition and the joint partition have the same uids (they're expected to be somewhat different because of either failed queries or duplicates [note that this is why i just turned off default duplicate removal])
            assert len(
                set([u for c in initpart for u in c]) -
                set([u for c in final_partition for u in c])
            ) == 0  # everybody from both initial partitions is in final_partition
        assert len(
            set([u for c in final_partition for u in c]) -
            set([u for c in init_partitions['h'] for u in c]) -
            set([u for c in init_partitions['l'] for u in c])
        ) == 0  # nobody extra got added (i don't see how this could happen, but maybe it's just checking that I didn't modify the initial partitions)

    joint_partitions = {
        ch: copy.deepcopy(final_partition)
        for ch in utils.chains
    }
    if len(l_translations) > 0:
        untranslate_pids(ploci, init_partitions, antn_lists, l_translations)
        joint_partitions['l'] = [[l_translations.get(u, u) for u in c]
                                 for c in joint_partitions['l']]
    if true_partitions is not None:
        evaluate_joint_partitions(ploci, true_partitions, init_partitions,
                                  joint_partitions, antn_lists)

    return {ploci[ch]: jp for ch, jp in joint_partitions.items()}
Example no. 17
    def finalize_region(self,
                        region,
                        sorted_gene_counts,
                        annotations=None,
                        debug=False):
        easycounts = {gene: counts for gene, counts in sorted_gene_counts}
        total_counts = sum([counts for counts in easycounts.values()])
        class_counts = self.separate_into_classes(region, sorted_gene_counts,
                                                  easycounts)

        genes_to_keep = set()

        if debug:
            print '   %s groups separated by %d snps  (-: same group as previous kept gene)' % (
                utils.color('blue', region), self.n_max_snps[region])
            print '     %-20s       %5s %s        removed genes (snps counts%s)%s%s' % (
                'genes to keep',
                'counts',
                '' if self.simglfo is None else utils.color('blue', 'sim'),
                '' if self.simglfo is None else utils.color(
                    'blue', ' sim counts'),
                '' if self.simglfo is None else
                ('  ' + utils.color('red', 'x:') + ' not in simulation'),
                '' if (annotations is None or self.reco_info is None) else
                ('               %s sim counts/genes for the queries assigned to this kept gene %s'
                 % (utils.color('blue', '['), utils.color('blue', ']'))),
            ),

            def count_str(cnt):
                if cnt < 10.:
                    return '%.1f' % cnt
                else:
                    return '%.0f' % cnt

            def simcountstr(
                gene, ws
            ):  # counts in simulation for <gene> (note that this is _not_ the same as sim_gene_count_str(), since this takes no account of _which_ queries these counts occur in [plus it's coming from the opposite point of view])
                if self.simglfo is None:
                    rstr = ''
                elif gene in self.simglfo['seqs'][utils.get_region(gene)]:
                    rstr = utils.color(
                        'blue', (' %' + ws + 'd') %
                        self.simcounts[utils.get_region(gene)][gene])
                else:
                    rstr = utils.color('red', (' %' + ws + 's') % 'x')
                return rstr

            def sim_gene_count_str(
                kgene
            ):  # figure out simulation genes and counts for the uids assigned to <kgene>
                if annotations is None or self.reco_info is None:
                    return ''
                uids_this_gene = [
                    uid for uid, line in annotations.items()
                    if line[region + '_gene'] == kgene
                ]
                sim_genes = {
                }  # simulation genes for the uids that we assigned to <kgene> (note that self.simcounts doesn't have this per-uid information)
                for uid in uids_this_gene:
                    sgene = self.reco_info[uid][region + '_gene']
                    if sgene not in sim_genes:
                        sim_genes[sgene] = 0
                    sim_genes[sgene] += 1
                sorted_sim_gene_counts = sorted(sim_genes.items(),
                                                key=operator.itemgetter(1),
                                                reverse=True)
                count_str = ' '.join([
                    utils.color('blue' if sg == kgene else 'red', str(c))
                    for sg, c in sorted_sim_gene_counts
                ])
                sgene_str = ' '.join(
                    [utils.color_gene(sg) for sg, _ in sorted_sim_gene_counts])
                return '%s   %s' % (count_str, sgene_str)

        for iclass in range(len(class_counts)):
            gclass = class_counts[iclass]
            kept_this_class = []
            for ig in range(len(gclass)):
                gfo = gclass[ig]

                if float(
                        gfo['counts']
                ) / total_counts < self.args.min_allele_prevalence_fraction:  # always skip everybody that's super uncommon
                    pass  # don't keep it
                elif ig == 0:  # keep the first one from this class
                    genes_to_keep.add(gfo['gene'])
                    kept_this_class.append(gfo['gene'])
                elif utils.hamming_distance(
                        gclass[0]['seq'], gclass[ig]['seq']
                ) == 0:  # don't keep it if it's indistinguishable from the most common one (the matches are probably mostly really the best one)
                    pass  # don't keep it
                elif len(
                        kept_this_class
                ) < self.args.n_alleles_per_gene:  # always keep the most common <self.args.n_alleles_per_gene> in each class [note: defaults to 1 if looking for new alleles, otherwise 2]
                    genes_to_keep.add(gfo['gene'])
                    kept_this_class.append(gfo['gene'])
                else:
                    pass  # don't keep it

                if debug and gfo['gene'] in genes_to_keep:
                    snpstr = ' ' if ig == 0 else '(%d)' % utils.hamming_distance(
                        gclass[0]['seq'], gfo['seq']
                    )  # only happens if we keep more than one from this class
                    print '\n      %s%-s  %7s%s  %-3s' % (
                        '- ' if ig > 0 else '  ',
                        utils.color_gene(gfo['gene'], width=20),
                        count_str(gfo['counts']), simcountstr(
                            gfo['gene'], '4'), snpstr),
            if debug:
                if len(kept_this_class) == 0:
                    print '\n      %s%-s  %7s%4s  %-3s' % (
                        '  ',
                        utils.color('blue', 'none', width=20,
                                    padside='right'), '-', '', ''),
                removedfo = [
                    gfo for gfo in gclass if gfo['gene'] not in genes_to_keep
                ]
                removed_str = ''
                if len(removedfo) > 0:
                    number_strs = [
                        '(%d %3s%s)' % (gfo['hdist'], count_str(
                            gfo['counts']), simcountstr(gfo['gene'], '1'))
                        for gfo in removedfo
                    ]
                    name_strs = [
                        '%s' % (utils.color_gene(gfo['gene']))
                        for gfo in removedfo
                    ]
                    removed_str = '%s  %s' % (' '.join(number_strs),
                                              ' '.join(name_strs))
                annotation_str = ''
                if (annotations is not None and self.reco_info
                        is not None) and len(kept_this_class) > 0:
                    annotation_str = '%s %s %s' % (utils.color(
                        'blue', '['), sim_gene_count_str(
                            kept_this_class[-1]), utils.color('blue', ']'))
                print '     %s  %s  %s' % (
                    removed_str,
                    (70 - utils.len_excluding_colors(removed_str)) * ' ',
                    annotation_str),
        if debug:
            print ''

        genes_to_remove = set(self.glfo['seqs'][region]) - genes_to_keep

        print '    keeping %d / %d %s gene%s' % (
            len(genes_to_keep), len(self.glfo['seqs'][region]), region,
            utils.plural(len(genes_to_keep)))
        if len(genes_to_keep) == 0:
            print '   would\'ve kept zero genes, instead keeping all of them'
            genes_to_keep = copy.deepcopy(genes_to_remove)
            genes_to_remove.clear()

        if self.simglfo is not None:
            missing_genes = set(self.simglfo['seqs'][region]) - genes_to_keep
            if len(missing_genes) > 0:
                print '    %s %d simulation genes (counts): %s' % (utils.color(
                    'red', 'missing'), len(missing_genes), '  '.join(
                        [('%s %d' %
                          (utils.color_gene(g), self.simcounts[region][g]))
                         for g in sorted(missing_genes)]))
            completely_absent_genes = missing_genes - genes_to_remove
            if len(completely_absent_genes) > 0:
                print '%s %d simulation genes completely absent: %s' % (
                    utils.color('red', 'warning'),
                    len(completely_absent_genes), '  '.join(
                        [('%s %d' %
                          (utils.color_gene(g), self.simcounts[region][g]))
                         for g in sorted(completely_absent_genes)]))

        self.genes_to_keep |= genes_to_keep  # add the ones from _this_ region (rhs) to the ones from all regions (lhs)
        self.genes_to_remove |= genes_to_remove

        self.finalized = True
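
Stripped of the debug printing, the keep/skip decision above applies three rules within each similarity class: drop anything whose count fraction is below args.min_allele_prevalence_fraction, always keep the most common gene, drop genes identical to it, and keep at most args.n_alleles_per_gene others. A standalone sketch of just that rule, with made-up gene names and counts rather than this class's actual data structures:

def keep_genes(gclass, total_counts, min_prevalence=0.05, n_alleles_per_gene=2):
    # <gclass> is one similarity class: dicts with 'gene', 'counts', 'seq', sorted by decreasing counts
    kept = []
    for ig, gfo in enumerate(gclass):
        if float(gfo['counts']) / total_counts < min_prevalence:
            continue  # always skip anything that's super uncommon
        if ig == 0:
            kept.append(gfo['gene'])  # always keep the most common gene in the class
        elif gfo['seq'] == gclass[0]['seq']:
            continue  # indistinguishable from the most common one
        elif len(kept) < n_alleles_per_gene:
            kept.append(gfo['gene'])  # keep up to <n_alleles_per_gene> per class
    return kept

example_class = [{'gene': 'IGHV1-2*02', 'counts': 90., 'seq': 'ACGT'},
                 {'gene': 'IGHV1-2*04', 'counts': 30., 'seq': 'ACGA'},
                 {'gene': 'IGHV1-2*06', 'counts': 2.,  'seq': 'ACCA'}]
assert keep_genes(example_class, total_counts=122.) == ['IGHV1-2*02', 'IGHV1-2*04']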
Example n. 18
0
def test_plural():
    assert plural('apple') == 'apples'
    assert plural('dish') == 'dishes'
    assert plural('dish', count=1) == 'dish'
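
These asserts pin down only a sliver of the behaviour, and note that this word-based plural(word, count=None) is a different helper from the count-based utils.plural(n) used in several other examples on this page. A minimal sketch consistent with the three asserts, offered as an illustrative guess rather than the project's implementation:

def plural(word, count=None):
    # illustrative sketch only: return <word> unchanged when count is exactly 1,
    # otherwise append 'es' after sibilant endings and 's' everywhere else
    if count == 1:
        return word
    if word.endswith(('s', 'sh', 'ch', 'x', 'z')):
        return word + 'es'
    return word + 's'

assert plural('apple') == 'apples'
assert plural('dish') == 'dishes'
assert plural('dish', count=1) == 'dish'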
Example n. 19
0
def header(type_model, data_model, entities_directory_path):

    header = ''

    header += '#pragma once\n'
    header += '\n'
    header += '#ifndef __DATABASE__H__\n'
    header += '#define __DATABASE__H__\n'
    header += '\n'
    header += '#include <string>\n'
    header += '#include <map>\n'
    header += '#include <vector>\n'
    header += '\n'
    header += '#include <pqxx/pqxx>\n'
    header += '\n'

    for class_model in data_model['classes']:

        # Hot fix. Entity path has ../ in it.
        header += \
            '#include "model/' +\
            entities_directory_path[3:] +\
            '/' +\
            class_model['class'] +\
            '.h"\n'

    header += '\n'
    header += 'namespace chronos {\n'
    header += '\n'
    header += 'class DatabasePartial {\n'
    header += '\n'
    header += 'public:\n'
    header += '\n'
    header += '    DatabasePartial(\n'
    header += '        std::string p_user,\n'
    header += '        std::string p_pass,\n'
    header += '        std::string p_host,\n'
    header += '        std::string p_database\n'
    header += '    );\n'
    header += '\n'

    for class_model in data_model['classes']:

        header += \
            '    std::vector<' +\
            utils.cpp_class_name(class_model) +\
            '> get_' +\
            utils.plural(class_model['class']) +\
            '();\n'

    for class_model in data_model['classes']:

        header += \
            '    std::map<' +\
            type_model['id']['cpp'] +\
            ', ' +\
            utils.cpp_class_name(class_model) +\
            '> get_' +\
            utils.plural(class_model['class']) +\
            '_map();\n'

    header += '\n'
    header += '    bool test();\n'
    header += '    void init();\n'
    header += '    void fill();\n'
    header += '    void destroy();\n'
    header += '\n'
    header += 'protected:\n'
    header += '\n'
    header += '    std::string _user;\n'
    header += '    pqxx::connection _db;\n'
    header += '\n'
    header += '};\n'
    header += '\n'
    header += '}\n'
    header += '\n'
    header += '#endif //__DATABASE__H__\n'
    header += '\n'

    return header
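
header() (and source() below) only touch a few keys of their inputs; a hypothetical minimal shape for those inputs, inferred from the lookups in the code rather than taken from the project's real models:

# illustrative shapes only -- the real models come from the project's configuration
type_model = {'id':   {'cpp': 'int'},
              'text': {'cpp': 'std::string'}}
data_model = {'classes': [{'class': 'user'},
                          {'class': 'event'}]}

# header(type_model, data_model, '../entities') would then emit one include per class,
# plus get_users()/get_events() and get_users_map()/get_events_map() declarations
# (assuming utils.plural('user') == 'users' and utils.cpp_class_name maps 'user' to 'User').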
Example n. 20
0
def source(type_model, data_model, data_sql):

    source = ''

    source += '#include "database_partial.h"\n'
    source += '\n'
    source += '#include <iostream>\n'
    source += '\n'
    source += 'using namespace chronos;\n'
    source += '\n'
    source += 'DatabasePartial::DatabasePartial(\n'
    source += '    std::string p_user,\n'
    source += '    std::string p_password,\n'
    source += '    std::string p_host,\n'
    source += '    std::string p_database\n'
    source += '):\n'
    source += '    _user(p_user),\n'
    source += '    _db(\n'
    source += '        "host = " + p_host + " "\n'
    source += '        "dbname = " + p_database + " "\n'
    source += '        "user = "******" "\n'
    source += '        "password = "******"SELECT * FROM ' +\
            utils.plural(class_model['class']) +\
            ';"));\n'

        source += '\n'
        source += '        for(auto i = r.begin(); i != r.end(); ++i) {\n'
        source += '\n'
        source += \
            '            ' +\
            utils.cpp_class_name(class_model) +\
            ' p(\n'

        for i, member in enumerate(
                utils.cpp_class_members(type_model, data_model, class_model)):

            source += \
                '                i[' +\
                str(i) +\
                '].as<' +\
                type_model[member['type']]['cpp'] +\
                '>()' +\
                (',' if i != len(utils.cpp_class_members(type_model, data_model, class_model))-1 else '') +\
                ' // ' +\
                member['name'] +\
                '\n'

        source += '            );\n'
        source += '\n'
        source += '            ret.push_back(p);\n'
        source += '        }\n'
        source += '\n'
        source += '    } catch (std::exception& e) {\n'
        source += '\n'
        source += '        std::cerr << e.what() << std::endl;\n'
        source += '    }\n'
        source += '\n'
        source += '    return ret;\n'
        source += '}\n'
        source += '\n'

    for class_model in data_model['classes']:

        source += \
            'std::map<' +\
            type_model['id']['cpp'] +\
            ', ' +\
            utils.cpp_class_name(class_model) +\
            '> DatabasePartial::get_' +\
            utils.plural(class_model['class']) +\
            '_map() {\n'

        source += '\n'

        source += \
            '    std::map<' +\
            type_model['id']['cpp'] +\
            ', ' +\
            utils.cpp_class_name(class_model) +\
            '> ret;\n'

        source += '\n'
        source += '    try {\n'
        source += '\n'
        source += '        pqxx::nontransaction n(_db);\n'
        source += '\n'

        source += \
            '        pqxx::result r(n.exec("SELECT * FROM ' +\
            utils.plural(class_model['class']) +\
            ';"));\n'

        source += '\n'
        source += '        for(auto i = r.begin(); i != r.end(); ++i) {\n'
        source += '\n'
        source += \
            '            ' +\
            utils.cpp_class_name(class_model) +\
            ' p(\n'

        for i, member in enumerate(
                utils.cpp_class_members(type_model, data_model, class_model)):

            source += \
                '                i[' +\
                str(i) +\
                '].as<' +\
                type_model[member['type']]['cpp'] +\
                '>()' +\
                (',' if i != len(utils.cpp_class_members(type_model, data_model, class_model))-1 else '') +\
                ' // ' +\
                member['name'] +\
                '\n'

        source += '            );\n'
        source += '\n'
        source += '            ret[p.id()] = p;\n'
        source += '        }\n'
        source += '\n'
        source += '    } catch (std::exception& e) {\n'
        source += '\n'
        source += '        std::cerr << e.what() << std::endl;\n'
        source += '    }\n'
        source += '\n'
        source += '    return ret;\n'
        source += '}\n'
        source += '\n'

    source += 'bool DatabasePartial::test() {\n'
    source += '\n'
    source += '    if(_db.is_open()) {\n'
    source += '\n'
    source += '        std::cerr << "Connection successful!" << std::endl;\n'
    source += '\n'
    source += '    } else {\n'
    source += '\n'
    source += '        std::cerr << "Connection failed." << std::endl;\n'
    source += '    }\n'
    source += '\n'
    source += '    return _db.is_open();'
    source += '\n'
    source += '}\n'
    source += '\n'
    source += 'void DatabasePartial::init() {\n'
    source += '\n'
    source += '    std::vector<std::string> creates;\n'
    source += '\n'

    for class_model in data_model['classes']:

        source += '    creates.push_back(\n'
        source += \
            '        "CREATE TABLE ' +\
            utils.plural(class_model['class']) +\
            ' ( "\n'

        for i, member in enumerate(
                utils.sql_class_members(type_model, data_model, class_model)):

            source += \
                '            "' +\
                member +\
                (', ' if i != len(utils.sql_class_members(type_model, data_model, class_model))-1 else ');') +\
                '"\n'

        source += '    );\n'
        source += '\n'

    source += '    try {\n'
    source += '\n'
    source += '        pqxx::work w(_db);\n'
    source += '\n'
    source += '        for(auto create : creates) {\n'
    source += '\n'
    source += '            w.exec(create);\n'
    source += '        }\n'
    source += '\n'
    source += '        w.commit();\n'
    source += '\n'
    source += '    } catch (std::exception& e) {\n'
    source += '\n'
    source += '        std::cerr << e.what();\n'
    source += '    }\n'
    source += '}\n'
    source += '\n'
    source += 'void DatabasePartial::fill() {\n'
    source += '\n'
    source += '    std::vector<std::string> inserts;\n'
    source += '\n'

    for sql in data_sql:

        source += '    inserts.push_back("' + sql + '");\n'

    source += '    try {\n'
    source += '\n'
    source += '        pqxx::work w(_db);\n'
    source += '\n'
    source += '        for(auto insert : inserts) {\n'
    source += '\n'
    source += '            w.exec(insert);\n'
    source += '        }\n'
    source += '\n'
    source += '        w.commit();\n'
    source += '\n'
    source += '    } catch (std::exception& e) {\n'
    source += '\n'
    source += '        std::cerr << e.what();\n'
    source += '    }\n'
    source += '}\n'
    source += '\n'

    source += 'void DatabasePartial::destroy() {\n'
    source += '\n'
    source += '    try {\n'
    source += '\n'
    source += '        pqxx::work w(_db);\n'
    source += '        w.exec("DROP OWNED BY " + _user + ";");\n'
    source += '        w.commit();\n'
    source += '\n'
    source += '    } catch (std::exception& e) {\n'
    source += '\n'
    source += '        std::cerr << e.what();\n'
    source += '    }\n'
    source += '}\n'
    source += '\n'

    return source
Example n. 21
0
def combine_indels(regional_indelfos,
                   full_qrseq,
                   qrbounds,
                   uid=None,
                   debug=False):
    # debug = 2
    joint_indelfo = get_empty_indel()

    # make sure the regional qrbounds consist of a nice orderly progression
    tmpqrblist = [b for r in utils.regions for b in qrbounds[r]]
    if tmpqrblist != sorted(tmpqrblist):
        raise Exception(
            'messed up qrbounds %s for qr sequence with length %d:\n  %s' %
            ('   '.join([('%s %s' % (r, qrbounds[r])) for r in utils.regions
                         ]), len(full_qrseq), full_qrseq))
    if qrbounds['j'][1] > len(
            full_qrseq
    ):  # qrbounds['v'][1] > len(full_qrseq) or qrbounds['d'][1] > len(full_qrseq) or qrbounds['j'][1] > len(full_qrseq):
        raise Exception(
            'qrbounds %s extend beyond sequence with len %d:\n  %s' %
            (qrbounds, len(full_qrseq), full_qrseq))

    if debug > 1:
        print 'combining %d indelfo%s from %s' % (
            len(regional_indelfos), utils.plural(len(regional_indelfos)),
            ' '.join([r for r in utils.regions if r in regional_indelfos]))
        print '  qrbounds:   %s' % '   '.join([('%s %s' % (r, qrbounds[r]))
                                               for r in utils.regions])
        print '     full qr %s' % full_qrseq
    qr_gap_seq, gl_gap_seq = [], []
    for region in utils.regions:
        ireg = utils.regions.index(region)
        if debug > 1:
            print '  %s' % region
        if region in regional_indelfos:
            rfo = regional_indelfos[region]
            assert has_indels(
                rfo
            )  # calling fcn needs to not add it if it doesn't have indels
            joint_indelfo['genes'][region] = rfo['genes'][region]
            if utils.non_gap_len(
                    rfo['qr_gap_seq']
            ) != qrbounds[region][1] - qrbounds[region][
                    0]:  # should be fixed by overlapping boundary shifter
                return None  # UPDATE eh screw it this managed to happen *again* (see issue #310)
                # raise Exception('%sqr_gap_seq non-gap length %d not the same as qrbound length %d in %s region indelfo' % ('%s: ' % uid if uid is not None else '', utils.non_gap_len(rfo['qr_gap_seq']), qrbounds[region][1] - qrbounds[region][0], region))
            qr_gap_seq += [rfo['qr_gap_seq']]
            gl_gap_seq += [rfo['gl_gap_seq']]

            reg_indel_list = copy.deepcopy(rfo['indels'])
            for i_prev_reg in range(0, ireg):  # loop over previous regions
                prev_reg = utils.regions[i_prev_reg]
                if prev_reg not in regional_indelfos:  # don't need to do anything if the previous region didn't have indels
                    continue
                prev_reg_gaps = utils.gap_len(
                    regional_indelfos[prev_reg]['qr_gap_seq']
                )  # number of gaps in the previous region's qr gap seq
                for ifo in reg_indel_list:
                    ifo['pos'] += prev_reg_gaps
                    if debug > 1:
                        print '    add %d to pos for gaps in %s' % (
                            prev_reg_gaps, prev_reg)
            joint_indelfo['indels'] += reg_indel_list
        else:
            qr_gap_seq += [full_qrseq[qrbounds[region][0]:qrbounds[region][1]]]
            gl_gap_seq += [
                utils.ambig_base * (qrbounds[region][1] - qrbounds[region][0])
            ]
        if debug > 1:
            print '    %s\n    %s' % (qr_gap_seq[-1].replace(
                utils.gap_chars[0], utils.color(
                    'red', utils.gap_chars[0])), gl_gap_seq[-1].replace(
                        utils.gap_chars[0],
                        utils.color('red', utils.gap_chars[0])))

        if ireg < len(utils.regions) - 1:
            next_reg = utils.regions[ireg + 1]
            assert region + next_reg in utils.boundaries
            qr_gap_seq += [
                full_qrseq[qrbounds[region][1]:qrbounds[next_reg][0]]
            ]
            gl_gap_seq += [
                utils.ambig_base *
                (qrbounds[next_reg][0] - qrbounds[region][1])
            ]
            if debug > 1:
                print '  %s%s' % (region, next_reg)
                print '    %s\n    %s' % (
                    full_qrseq[qrbounds[region][1]:qrbounds[next_reg][0]],
                    utils.ambig_base *
                    (qrbounds[next_reg][0] - qrbounds[region][1]))

    if debug > 1:
        print 'combined gap seqs:'
        print '  qr %s' % '  '.join(qr_gap_seq)
        print '  gl %s' % '  '.join(gl_gap_seq)

    joint_indelfo['qr_gap_seq'] = ''.join(qr_gap_seq)
    joint_indelfo['gl_gap_seq'] = ''.join(gl_gap_seq)
    assert len(joint_indelfo['qr_gap_seq']) == len(joint_indelfo['gl_gap_seq'])
    joint_indelfo['reversed_seq'] = get_reversed_seq(
        joint_indelfo['qr_gap_seq'], joint_indelfo['gl_gap_seq'],
        full_qrseq[:qrbounds['v'][0]], full_qrseq[qrbounds['j'][1]:])
    # assert 'N' not in joint_indelfo['reversed_seq']  # this happens if there's Ns in the initial sequence

    joint_indelfo[
        'qr_gap_seq'] = full_qrseq[:qrbounds['v'][0]] + joint_indelfo[
            'qr_gap_seq'] + full_qrseq[qrbounds['j'][1]:]
    joint_indelfo['gl_gap_seq'] = utils.ambig_base * qrbounds['v'][
        0] + joint_indelfo['gl_gap_seq'] + utils.ambig_base * (
            len(full_qrseq) - qrbounds['j'][1])

    if debug:
        print 'combined'
        print get_dbg_str(joint_indelfo)

    return joint_indelfo
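
The qrbounds sanity check at the top of combine_indels() relies on the fact that flattening the per-region bounds in region order gives a monotonically non-decreasing list exactly when the regions neither overlap nor run backwards. A small standalone illustration, assuming the usual 'v', 'd', 'j' ordering of utils.regions:

regions = ['v', 'd', 'j']

def bounds_ok(qrbounds):
    flat = [b for r in regions for b in qrbounds[r]]
    return flat == sorted(flat)

assert bounds_ok({'v': (0, 100), 'd': (110, 130), 'j': (135, 180)})      # orderly progression
assert not bounds_ok({'v': (0, 100), 'd': (90, 130), 'j': (135, 180)})   # d overlaps the end of v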
Example n. 22
0
 def __repr__(self):
     return "{} binary agent{} who want {}".format(self.cardinality, plural(self.cardinality), sorted(self.desired_goods))
Example n. 23
0
 def filter_products(x):
     words = x["title"].lower().split()
     words.extend([utils.plural(word) for word in words])  # Add plurals
     return " ".join(words)
Example n. 24
0
    def finalize(self, sorted_gene_counts, debug=False):
        # NOTE a.t.m. this is using vsearch V alignments only, so we can't collapse clones beforehand. It would be more accurate, though, if we could collapse clones first
        # NOTE <sorted_gene_counts> is usually/always floats instead of integers
        assert not self.finalized
        easycounts = {gene: counts for gene, counts in sorted_gene_counts}
        total_counts = sum([counts for counts in easycounts.values()])

        self.genes_to_keep = set()

        if debug:
            print '  removing least likely genes (%.1f total counts)' % total_counts
            print '     %-20s    %5s (%s)      removed genes (snps counts)  names' % (
                'genes to keep', 'counts', 'snps'),

            def count_str(cnt):
                if cnt < 10.:
                    return '%.1f' % cnt
                else:
                    return '%.0f' % cnt

        class_counts = self.separate_into_classes(sorted_gene_counts,
                                                  easycounts)
        for iclass in range(len(class_counts)):
            gclass = class_counts[iclass]
            n_from_this_class = 0
            for ig in range(len(gclass)):
                gfo = gclass[ig]
                if self.args.n_max_total_alleles is not None and len(
                        self.genes_to_keep
                ) >= self.args.n_max_total_alleles:  # command line can specify the total number of alleles
                    break

                if float(
                        gfo['counts']
                ) / total_counts < self.args.min_allele_prevalence_fraction:  # always skip everybody that's super uncommon
                    pass
                elif ig == 0:  # keep the first one from this class
                    self.genes_to_keep.add(gfo['gene'])
                    n_from_this_class += 1
                elif utils.hamming_distance(
                        gclass[0]['seq'], gclass[ig]['seq']
                ) == 0:  # don't keep it if it's indistinguishable from the most common one (the matches are probably mostly really the best one)
                    pass  # don't keep it
                elif n_from_this_class < self.args.n_alleles_per_gene:  # always keep the most common <self.args.n_alleles_per_gene> in each class
                    self.genes_to_keep.add(gfo['gene'])
                    n_from_this_class += 1
                else:
                    pass  # don't keep it

                if debug and gfo['gene'] in self.genes_to_keep:
                    snpstr = ' ' if ig == 0 else '(%d)' % utils.hamming_distance(
                        gclass[0]['seq'], gfo['seq'])
                    print '\n       %-s  %7s  %-3s' % (utils.color_gene(
                        gfo['gene'], width=20), count_str(
                            gfo['counts']), snpstr),
            if debug:
                if n_from_this_class == 0:
                    print '\n       %-s  %7s  %-3s' % (utils.color(
                        'blue', 'none', width=20, padside='right'), '-', ''),
                removedfo = [
                    gfo for gfo in gclass
                    if gfo['gene'] not in self.genes_to_keep
                ]
                if len(removedfo) > 0:
                    number_strs = [
                        '(%d %s)' % (gfo['hdist'], count_str(gfo['counts']))
                        for gfo in removedfo
                    ]
                    name_strs = [
                        '%s' % utils.color_gene(gfo['gene'])
                        for gfo in removedfo
                    ]
                    print '        %s  %s' % (' '.join(number_strs),
                                              ' '.join(name_strs)),
        if debug:
            print ''

        self.genes_to_remove = set(
            self.glfo['seqs'][self.region]) - self.genes_to_keep

        print '    keeping %d / %d %s gene%s' % (
            len(self.genes_to_keep), len(self.glfo['seqs'][self.region]),
            self.region, utils.plural(len(self.genes_to_keep)))
        # print '    removing %d %s genes: %d with no matches, %d with unconvincing matches' % (len(self.genes_to_remove), self.region, len(set(self.glfo['seqs'][self.region]) - set(easycounts)), len(set(easycounts) - self.genes_to_keep))

        self.finalized = True
Example n. 25
0
 def __repr__(self):
     return "{} agent{} with monotone valuations. Desired goods: {}".format(self.cardinality, plural(self.cardinality), sorted(self.desired_goods))
Example n. 26
0
def read_mute_freqs_with_weights(
    indir,
    approved_genes,
    debug=False
):  # it would be nice to eventually align the genes before combining
    # returns:
    #  - mute_freqs: inverse error-weighted average mute freq over all genes for each position
    #     - also includes weighted and unweighted means over positions

    if len(approved_genes) == 0:
        raise Exception('no approved genes')

    if approved_genes[0] == glutils.dummy_d_genes[utils.get_locus(
            approved_genes[0])]:
        return {'overall_mean': 0.5, 'unweighted_overall_mean': 0.5}

    if debug:
        print '    reading mute freqs from %s for %d gene%s: %s' % (
            indir, len(approved_genes), utils.plural(
                len(approved_genes)), utils.color_genes(approved_genes))

    # add an observation for each position, for each gene where we observed that position NOTE this would be more sensible if they were aligned first
    observed_freqs = {}
    for gene in approved_genes:
        mutefname = indir + '/mute-freqs/' + utils.sanitize_name(gene) + '.csv'
        if not os.path.exists(mutefname):
            continue
        with open(mutefname, 'r') as mutefile:
            reader = csv.DictReader(mutefile)
            for line in reader:
                pos = int(line['position'])
                freq = float(line['mute_freq'])
                lo_err = float(
                    line['lo_err']
                )  # NOTE lo_err in the file is really the lower *bound*
                hi_err = float(line['hi_err'])  #   same deal
                assert freq >= 0.0 and lo_err >= 0.0 and hi_err >= 0.0  # you just can't be too careful

                if freq < utils.eps or abs(
                        1.0 - freq
                ) < utils.eps:  # if <freq> too close to 0 or 1, replace it with the midpoint of its uncertainty band
                    freq = 0.5 * (lo_err + hi_err)

                if pos not in observed_freqs:
                    observed_freqs[pos] = []

                observed_freqs[pos].append({
                    'freq':
                    freq,
                    'err':
                    max(abs(freq - lo_err), abs(freq - hi_err))
                })  # append one for each gene

    # set final mute_freqs[pos] to the (inverse error-weighted) average over all the observations [i.e. genes] for each position
    mute_freqs = {}
    for pos in observed_freqs:
        total, sum_of_weights = 0.0, 0.0
        for obs in observed_freqs[pos]:  # loop over genes
            assert obs['err'] > 0.0
            weight = 1.0 / obs['err']
            total += weight * obs['freq']
            sum_of_weights += weight
        assert sum_of_weights > 0.0
        mean_freq = total / sum_of_weights
        mute_freqs[pos] = mean_freq

    # NOTE I'm sure that this weighting scheme makes sense for comparing differing genes at the same position, but I'm less sure it makes sense for the overall mean. But I don't want to track down all the places that changing it might affect right now
    mute_freqs['overall_mean'] = 0.
    weighted_denom = sum([
        1. / obs['err'] for pos in observed_freqs
        for obs in observed_freqs[pos]
    ])
    if weighted_denom > 0.:
        mute_freqs['overall_mean'] = sum([
            obs['freq'] / obs['err'] for pos in observed_freqs
            for obs in observed_freqs[pos]
        ]) / weighted_denom

    # I need the inverse-error-weighted numbers to sensibly combine genes, but then I also need unweighted values that I can easily write to the yaml files for other people to use
    mute_freqs['unweighted_overall_mean'] = 0.
    unweighted_denom = sum(
        [len(observed_freqs[pos]) for pos in observed_freqs])
    if unweighted_denom > 0.:
        mute_freqs['unweighted_overall_mean'] = sum([
            obs['freq'] for pos in observed_freqs
            for obs in observed_freqs[pos]
        ]) / unweighted_denom

    if debug:
        iskipstart = 35  # i.e. for v genes skip the middle positions
        positions = sorted(observed_freqs)
        if len(positions) > 2 * iskipstart:
            print '      %s%s%s' % (' '.join([
                ('%4d' % p) for p in positions[:iskipstart]
            ]), utils.color('blue', ' [...] '), ' '.join([
                ('%4d' % p) for p in positions[len(positions) - iskipstart:]
            ]))
            print '      %s%s%s' % (' '.join([
                ('%4.2f' % mute_freqs[p]) for p in positions[:iskipstart]
            ]), utils.color('blue', ' [...] '), ' '.join(
                [('%4.2f' % mute_freqs[p])
                 for p in positions[len(positions) - iskipstart:]]))
        else:
            print '      %s' % ' '.join([('%4d' % p) for p in positions])
            print '      %s' % ' '.join([('%4.2f' % mute_freqs[p])
                                         for p in positions])
        print '        overall mean: %5.3f (unweighted %5.3f)' % (
            mute_freqs['overall_mean'], mute_freqs['unweighted_overall_mean'])

    return mute_freqs
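
The per-position averaging above is a standard inverse-error-weighted mean; with made-up numbers for a single position it works out like this (toy check, not data from any real mute-freq file):

observations = [{'freq': 0.10, 'err': 0.02},   # hypothetical per-gene observations at one position
                {'freq': 0.30, 'err': 0.10}]

weights = [1. / obs['err'] for obs in observations]            # 50 and 10
weighted_mean = sum(w * obs['freq'] for w, obs in zip(weights, observations)) / sum(weights)
unweighted_mean = sum(obs['freq'] for obs in observations) / len(observations)

assert abs(weighted_mean - (50 * 0.10 + 10 * 0.30) / 60.) < 1e-9   # ~0.133, pulled toward the better-measured gene
assert abs(unweighted_mean - 0.20) < 1e-9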
Example n. 27
0
 def __repr__(self):
     vals = " ".join(["{}={}".format(k,v) for k,v in sorted(self.map_good_to_value.items())])
     return "{} agent{} with additive valuations: {}".format(self.cardinality, plural(self.cardinality), vals)
Example n. 28
0
def get_indelfo_from_cigar(cigarstr,
                           qrseq,
                           qrbounds,
                           glseq,
                           glbounds,
                           gene,
                           vsearch_conventions=False,
                           debug=False):
    if debug:
        print '  initial:'
        print '    %s' % color_cigar(cigarstr)
        print '    qr %3d %3d %s' % (qrbounds[0], qrbounds[1], qrseq)
        print '    gl %3d %3d %s' % (glbounds[0], glbounds[1], glseq)

    cigars = [
        split_cigarstr(cstr) for cstr in re.findall('[0-9]*[A-Z]', cigarstr)
    ]  # split cigar string into its parts, then split each part into the code and the length
    if vsearch_conventions:
        assert utils.get_region(gene) == 'v'  # would need to be generalized
        cigars = [
            (code.translate(string.maketrans('ID', 'DI')), length)
            for code, length in cigars
        ]  # vsearch reverses what's the query and what's the target/gene/whathaveyou compared to what ig-sw does
        for iend in [0, -1]:
            if cigars[iend][
                    0] == 'I':  # qr extends beyond gl: ig-sw calls these soft-clips, vsearch calls them insertions
                cigars[iend] = ('S', cigars[iend][1])
            elif cigars[iend][
                    0] == 'D':  # gl goes past qr: ig-sw just calls them not part of the alignment, vsearch calls them deletions
                cigars.pop(iend)
    cigars = [(code, length) for code, length in cigars
              if code != 'S']  # remove soft-clipping
    cigarstr = ''.join(['%d%s' % (l, c) for c, l in cigars])
    qrseq = qrseq[qrbounds[0]:qrbounds[1]]  # ...and trim qrseq and glseq
    glseq = glseq[glbounds[0]:glbounds[1]]

    if debug:
        print '  parsed:'
        print '    %s' % color_cigar(cigarstr)
        print '    %s' % '   '.join(['%s %d' % (c, l) for c, l in cigars])
        print '    qr %s' % qrseq
        print '    gl %s' % glseq

    # check consistency between cigar and qr/gl seqs
    for seqtype, tmpseq, tmpcode in (('qr', qrseq, 'D'), ('gl', glseq, 'I')):
        cigar_len = sum([length for code, length in cigars if code != tmpcode])
        if cigar_len != len(tmpseq):
            raise Exception('cigar length %d doesn\'t match %s seq length %d' %
                            (cigar_len, seqtype, len(tmpseq)))

    indelfo = get_empty_indel(
    )  # replacement_seq: query seq with insertions removed and germline bases inserted at the position of deletions
    # TODO should probably also ignore indels on either end (I think only relevant for vsearch)
    if 'I' not in cigarstr and 'D' not in cigarstr:  # has to happen after we've changed from vsearch conventions
        if debug:
            print '  no indels'
        return indelfo

    # add each indel to <indelfo['indels']>, and build <codestr> and <tmp_indices> to keep track of what's going on at each position
    codestr = ''.join(
        [length * code for code, length in cigars]
    )  # each position is cigar code corresponding to that position in the alignment
    qpos = 0  # position within query sequence
    tmp_indices = [
    ]  # integer for each position in the alignment, giving the index of the indel that we're within (None if we're not in an indel)
    if debug:
        print '      code  length'
    for code, length in cigars:
        if debug:
            print '        %s     %3d' % (code, length)
        if code == 'I':  # advance qr seq but not gl seq
            indelfo['indels'].append(
                {
                    'type': 'insertion',
                    'pos': qpos,
                    'len': length,
                    'seqstr': []
                }
            )  # insertion begins at <pos> (note that 'seqstr' later on gets converted from a list to a string)
            tmp_indices += [
                len(indelfo['indels']) - 1 for _ in range(length)
            ]  # indel index corresponding to this position in the alignment
        elif code == 'D':  # advance gl seq but not qr seq
            indelfo['indels'].append(
                {
                    'type': 'deletion',
                    'pos': qpos,
                    'len': length,
                    'seqstr': []
                }
            )  # first deleted base is <pos> (well, first base which is in the position of the first deleted base)
            tmp_indices += [
                len(indelfo['indels']) - 1 for _ in range(length)
            ]  # indel index corresponding to this position in the alignment
        else:
            tmp_indices += [
                None for _ in range(length)
            ]  # indel index corresponding to this position in the alignment
        qpos += length

    if debug:
        print '      %s  codestr' % ''.join(
            [c if c not in 'ID' else utils.color('blue', c) for c in codestr])
        print '      %s  indel index' % ''.join(
            [str(ti if ti is not None else ' ') for ti in tmp_indices])

    # then construct the dbg strings, indel-reversed input sequence, and 'seqstr' entries in indelfo
    qrprintstr, glprintstr, reversed_seq = [], [], []
    iqr, igl = 0, 0
    for icode in range(len(codestr)):
        code = codestr[icode]
        if code == 'M':
            qrbase = qrseq[iqr]
            if qrbase != glseq[igl]:
                qrbase = utils.color('red', qrbase)
            qrprintstr.append(qrbase)
            glprintstr.append(glseq[igl])
            reversed_seq.append(
                qrseq[iqr]
            )  # add the base to the overall sequence with all indels reversed
        elif code == 'S':
            continue
        elif code == 'I':
            qrprintstr.append(utils.color('light_blue', qrseq[iqr]))
            glprintstr.append(utils.color('light_blue', '*'))
            indelfo['indels'][tmp_indices[icode]]['seqstr'].append(
                qrseq[iqr])  # and to the sequence of just this indel
            igl -= 1
        elif code == 'D':
            qrprintstr.append(utils.color('light_blue', '*'))
            glprintstr.append(utils.color('light_blue', glseq[igl]))
            reversed_seq.append(
                glseq[igl]
            )  # add the base to the overall sequence with all indels reversed
            indelfo['indels'][tmp_indices[icode]]['seqstr'].append(
                glseq[igl])  # and to the sequence of just this indel
            iqr -= 1
        else:
            raise Exception('unhandled cigar code %s' % code)

        iqr += 1
        igl += 1

    # convert character lists to strings (indels are rare enough that this probably isn't that much faster, but it just feels wrong not to)
    qrprintstr = ''.join(qrprintstr)
    glprintstr = ''.join(glprintstr)
    indelfo['reversed_seq'] = ''.join(reversed_seq)
    for ifo in indelfo['indels']:
        ifo['seqstr'] = ''.join(ifo['seqstr'])

    # make the dbg str for indelfo
    gwidth = str(
        len(gene))  # doesn't account for color abbreviation, but oh well
    dbg_str_list = [
        ('%' + gwidth + 's  %s') %
        (utils.color_gene(gene, width=int(gwidth), leftpad=True), glprintstr),
        ('%' + gwidth + 's  %s') % ('query', qrprintstr)
    ]
    for idl in indelfo['indels']:
        dbg_str_list.append('%10s: %d base%s at %d (%s)' %
                            (idl['type'], idl['len'], utils.plural(
                                idl['len']), idl['pos'], idl['seqstr']))
    indelfo['dbg_str'] = '\n'.join(dbg_str_list)

    if debug:
        print utils.pad_lines(indelfo['dbg_str'], 0)

    return indelfo
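
The cigar parsing above first expands each (code, length) pair into one character per alignment position (<codestr>) and, in parallel, records which indel each position belongs to (<tmp_indices>). A self-contained sketch of just that expansion step, using a simplified splitter in place of split_cigarstr():

import re

def expand_cigar(cigarstr):
    # split '3M1I2M2D1M' into [('M', 3), ('I', 1), ...], treating a missing length as 1
    cigars = [(m[-1], int(m[:-1]) if m[:-1] else 1) for m in re.findall('[0-9]*[A-Z]', cigarstr)]
    codestr = ''.join(length * code for code, length in cigars)
    indel_indices, n_indels = [], 0
    for code, length in cigars:
        if code in 'ID':
            indel_indices += [n_indels] * length  # every position of this indel points to its index
            n_indels += 1
        else:
            indel_indices += [None] * length
    return codestr, indel_indices

codestr, indices = expand_cigar('3M1I2M2D1M')
assert codestr == 'MMMIMMDDM'
assert indices == [None, None, None, 0, None, None, 1, 1, None]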
Example n. 29
0
def get_seqfile_info(infname,
                     is_data,
                     n_max_queries=-1,
                     args=None,
                     simglfo=None,
                     quiet=False):
    """ return list of sequence info from files of several types """

    suffix = utils.getsuffix(infname)
    if len(re.findall('\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
        else:
            assert False
        seqfile = open(infname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        reader = utils.read_fastx(
            infname,
            name_key='unique_ids',
            seq_key='input_seqs',
            add_info=False,
            sanitize=True,
            n_max_queries=
            n_max_queries,  # NOTE don't use istartstop kw arg here, since it interferes with the istartstop treatment in the loop below
            queries=(args.queries if
                     (args is not None and not args.abbreviate) else None)
        )  # NOTE also can't filter on args.queries here if we're also translating

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if args is not None and args.abbreviate:
        potential_names = list(string.ascii_lowercase)
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print '  %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (
                utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception(
                'couldn\'t find a sequence column in %s (you can set this with --seq-column)'
                % infname)
        utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        if uid in input_info:
            new_uid = uid
            iid = 2
            while new_uid in input_info:
                new_uid = uid + '-' + str(iid)
                iid += 1
            print '  %s uid %s already read from input file %s, so replacing with new uid %s' % (
                utils.color('yellow', 'warning'), uid, infname, new_uid)
            uid = new_uid
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))
        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid = abbreviate(used_names, potential_names, uid)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line[
                    'reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' %
                            (uid, infname))

        if len(inseq.translate(None, ''.join(utils.alphabet))) > 0:
            unexpected_chars = set(
                [ch for ch in inseq if ch not in utils.alphabet])
            raise Exception(
                'unexpected character%s %s (not among %s) in input sequence with id %s:\n  %s'
                % (utils.plural(len(unexpected_chars)), ', '.join([
                    ('\'%s\'' % ch) for ch in unexpected_chars
                ]), utils.nukes + utils.ambiguous_bases, uid, inseq))

        # da business
        input_info[uid] = {
            'unique_ids': [
                uid,
            ],
            'seqs': [
                inseq,
            ]
        }

        if n_queries_added == 0 and is_data and 'reco_id' in line:
            print '  note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            if not quiet:  # just adding <quiet>, and too lazy to decide what other print statements it should affect, this is the only one I care about right now
                print '  --n-max-queries: stopped after reading %d queries from input file' % len(
                    input_info)
            break

    post_process(input_info, reco_info, args, infname, found_seed, is_data,
                 iline)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info
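
When an input uid has already been seen, the loop above renames the new copy by appending '-2', '-3', ... until the name no longer collides. The same logic in isolation, with hypothetical sequence ids:

def deduplicate(uid, existing):
    # append '-2', '-3', ... until the name doesn't collide with anything already read
    new_uid, iid = uid, 2
    while new_uid in existing:
        new_uid = '%s-%d' % (uid, iid)
        iid += 1
    return new_uid

assert deduplicate('seq1', {'seq1'}) == 'seq1-2'
assert deduplicate('seq1', {'seq1', 'seq1-2'}) == 'seq1-3'
assert deduplicate('seq2', {'seq1'}) == 'seq2'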
Example n. 30
0
    def make_single_tree(self, partitions, annotations, uid_set, get_fasttrees=False, n_max_cons_seqs=10, debug=False):
        # NOTE don't call this externally -- if you want a single tree, call make_trees() with <i_only_cluster> set
        def getline(uidstr, uid_set=None):
            if uidstr in annotations:  # if we have this exact annotation
                return annotations[uidstr]
            else:
                if uid_set is None:
                    uid_set = set(uidstr.split(':'))  # should only get called if it's a singleton
                # note that for internal nodes in a fasttree-derived subtree, the uids will be out of order compared to the annotation keys
                for line in annotations.values():  # we may actually have the annotation for every subcluster (e.g. if --calculate-alternative-annotations was set), but in case we don't, this is fine
                    if len(uid_set & set(line['unique_ids'])) > 0:  # just take the first one with any overlap. Yeah, it's not necessarily the best, but its naive sequence probably isn't that different, and for just getting the fasttree it reeeeeeaaaallly doesn't matter
                        return line
            raise Exception('couldn\'t find uid %s in annotations' % uidstr)
        def getseq(uid):
            line = getline(uid)
            return line['seqs'][line['unique_ids'].index(uid)]
        def lget(uid_list):
            return ':'.join(uid_list)

        # check for repeated uids (was only from seed uid, which shouldn't happen any more, but the code below throws an infinite loop if we do, so may as well be careful)
        for partition in partitions:
            if sum(len(c) for c in partition) > len(set(u for c in partition for u in c)):
                repeated_uids = [u for u, count in collections.Counter([u for c in partition for u in c]).items() if count > 1]
                raise Exception('found %d uid%s in more than one cluster (%s)' % (len(repeated_uids), utils.plural(len(repeated_uids)), ', '.join(repeated_uids)))

        default_edge_length = 999999  # it's nice to have the edges all set to something that's numeric (so the trees print), but also obviously wrong, if we forget to set somebody
        assert len(partitions[-1]) == 1
        root_label = lget(partitions[-1][0])  # we want the order of the uids in the label to correspond to the order in self.partitions
        tns = dendropy.TaxonNamespace([root_label])
        root_node = dendropy.Node(taxon=tns.get_taxon(root_label))
        root_node.uids = uid_set  # each node keeps track of the uids of its children
        dtree = dendropy.Tree(taxon_namespace=tns, seed_node=root_node)
        if debug:
            print '    starting tree with %d leaves' % len(uid_set)
        for ipart in reversed(range(len(partitions) - 1)):  # dendropy seems to only have fcns to build a tree from the root downward, so we loop starting with the last partition (- 1 is because the last partition is guaranteed to be just one cluster)
            for lnode in dtree.leaf_node_iter():  # look for leaf nodes that contain uids from two clusters in this partition, and add those as children
                tclusts = [c for c in partitions[ipart] if len(set(c) & lnode.uids) > 0]
                if len(tclusts) < 2:
                    continue
                for tclust in tclusts:
                    ttaxon = dendropy.Taxon(lget(tclust))
                    tns.add_taxon(ttaxon)
                    child = lnode.new_child(taxon=ttaxon, edge_length=default_edge_length)
                    child.uids = set(tclust)
                if debug:
                    print '      ipart %d' % ipart
                    print '        split node: %d --> %s      %s --> %s' % (len(lnode.uids), ' '.join([str(len(tc)) for tc in tclusts]), lnode.taxon.label, ' '.join([c.taxon.label for c in lnode.child_node_iter()]))

        # split existing leaves, which are probably not singletons (they're probably from the initial naive sequence collapse step) into subtrees such that each leaf is a singleton
        for lnode in dtree.leaf_node_iter():
            if len(lnode.uids) == 1:
                continue
            if get_fasttrees and len(lnode.uids) > 2:
                seqfos = [{'name' : uid, 'seq' : getseq(uid)} for uid in lnode.taxon.label.split(':')]  # may as well add them in the right order, although I don't think it matters
                subtree = treeutils.get_fasttree_tree(seqfos, getline(lnode.taxon.label, uid_set=lnode.uids)['naive_seq'], suppress_internal_node_taxa=True)  # note that the fasttree distances get ignored below (no idea if they'd be better than what we set down there, but they probably wouldn't be consistent, so I'd rather ignore them)
                for tmpnode in subtree.postorder_node_iter():
                    if tmpnode.is_leaf():
                        tmpnode.uids = set([tmpnode.taxon.label])
                    else:
                        tmpnode.uids = set([uid for c in tmpnode.child_node_iter() for uid in c.uids])
                        ttaxon = dendropy.Taxon(lget(tmpnode.uids))
                        subtree.taxon_namespace.add_taxon(ttaxon)
                        tmpnode.taxon = ttaxon  # ...and use the string of leaf nodes, even though they'll be in the wrong order (I think these get ignored when I call label_nodes() below, but it's still tidier to have them right in the meantime, and anyway since I'm suppressing internal taxa I think I need to set them to something)

                if debug:
                    print '   adding subtree with %d leaves from fastree at leaf node %s' % (len(seqfos), lnode.taxon.label)
                    print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=subtree))
                dtree.taxon_namespace.add_taxa(subtree.taxon_namespace)
                lnode.add_child(subtree.seed_node)
                assert len(lnode.child_edges()) == 1  # we're iterating over leaves, so this should always be true
                lnode.child_edges()[0].collapse()
            else:  # just add a star subtree
                for uid in lnode.taxon.label.split(':'):  # may as well add them in the right order, although I don't think it matters
                    ttaxon = dendropy.Taxon(uid)
                    tns.add_taxon(ttaxon)
                    child = lnode.new_child(taxon=ttaxon, edge_length=default_edge_length)
                    child.uids = set([uid])
                if debug:
                    print '      added %d singleton children for %s' % (len(lnode.uids), lnode.taxon.label)

        # in order to set edge lengths, we need node sequences, so first set leaf node seqs
        for lnode in dtree.leaf_node_iter():
            assert len(lnode.uids) == 1
            lnode.seq = getseq(lnode.taxon.label)
            lnode.n_descendent_leaves = 1  # keep track of how many leaf nodes contributed to each node's consensus sequence (these are leaves, so it's trivially 1). This is less accurate than keeping track of all the sequences, but also faster

        # then set internal node seqs as the consensus of their children, and set the distance as hamming distance to child seqs
        if debug:
            print '    adding edge lengths either from fasttree %s or cons seq %s' % (utils.color('blue', 'x'), utils.color('red', 'x'))
        min_edge_length = None  # setting this is nice for better debug viewing
        for node in dtree.postorder_internal_node_iter():  # includes root node
            child_cons_seq_counts = [c.n_descendent_leaves for c in node.child_node_iter()]
            total_descendent_leaves = sum(child_cons_seq_counts)
            if total_descendent_leaves > n_max_cons_seqs:  # if there's tons of descendent leaves, we don't want to pass them all to the consensus fcn since it's slow, so we choose them in proportion to their actual proportions, but scaled down to <n_max_cons_seqs>
                child_cons_seq_counts = [int(n_max_cons_seqs * csc / float(total_descendent_leaves)) for csc in child_cons_seq_counts]
                child_cons_seq_counts = [max(1, csc) for csc in child_cons_seq_counts]  # don't eliminate any sequences entirely (this makes the proportions less accurate (in some cases), but is the easy way to handle the case where there's a ton of singleton children)
            if debug:
                print '  %s' % utils.color('green', node.taxon.label)
                csc_str = '  (reduced: %s)' % ' '.join([str(csc) for csc in child_cons_seq_counts]) if total_descendent_leaves > n_max_cons_seqs else ''
                print '      desc leaves per child: %s%s' % (' '.join(str(c.n_descendent_leaves) for c in node.child_node_iter()), csc_str)
            child_seqfos = [{'name' : cn.taxon.label + '-leaf-' + str(il), 'seq' : cn.seq} for cn, count in zip(node.child_node_iter(), child_cons_seq_counts) for il in range(count)]
            node.seq = utils.cons_seq(0.01, aligned_seqfos=child_seqfos, tie_resolver_seq=getline(root_label)['naive_seq'])  #, debug=debug)  # the consensus has an N at every position where the constituent sequences gave a tie. But Ns screw up the distances (especially because once we *get* an N, we can't get rid of it and it's propagated all the way up the tree), and in almost all cases the correct choice should be the naive base, so we use that
            node.n_descendent_leaves = total_descendent_leaves
            for edge in node.child_edge_iter():
                from_fasttree = False
                if edge.length == default_edge_length:  # otherwise it was set by fasttree, and it's probably better than what we'd get from this (it'd be nice to skip the cons seq stuff for the whole fasttree subtree, but then we don't have the cons seqs we need for later)
                    edge.length = utils.hamming_distance(edge.head_node.seq, node.seq) / float(len(node.seq))
                else:
                    from_fasttree = True
                if min_edge_length is not None:
                    edge.length = max(min_edge_length, edge.length)
                if debug:
                    print '       %6.3f   %s  %s' % (edge.length, utils.color('blue' if from_fasttree else 'red', 'x'), edge.head_node.taxon.label)

        if debug:
            print '        naive seq %s' % getline(root_label)['naive_seq'] # NOTE might be worthwhile to add an edge connecting seed node and the actual naive sequence (i.e. for cases where our approximate naive is off)
            print '    root cons seq %s' % utils.color_mutants(getline(root_label)['naive_seq'], dtree.seed_node.seq)

        for node in dtree.preorder_node_iter():
            del node.uids
            del node.seq
            del node.n_descendent_leaves

        treeutils.label_nodes(dtree, ignore_existing_internal_node_labels=True, ignore_existing_internal_taxon_labels=True, debug=debug)
        dtree.update_bipartitions()  # probably don't really need this
        if debug:
            print treeutils.utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=dtree, width=250))

        return dtree
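
For edges not set by FastTree, the length assigned above is just the hamming distance between the child's sequence and the parent's consensus, normalized by sequence length. A toy computation with made-up sequences and a stand-in hamming_distance (the real one lives in utils):

def hamming_distance(s1, s2):
    # number of positions at which two equal-length sequences differ
    assert len(s1) == len(s2)
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))

parent_cons_seq = 'ACGTACGT'
child_seq       = 'ACGTACCA'
edge_length = hamming_distance(child_seq, parent_cons_seq) / float(len(parent_cons_seq))
assert abs(edge_length - 0.25) < 1e-9  # 2 differing positions out of 8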