Example No. 1
 def run(self, T=10, T_min=0.04, n_sweeps=1000):
     temperatures = self.generate_temperatures(T, T_min) if isinstance(
         T, int) else T
     self.cost = self.cost(self.solution)
     if self.plot:
         energies = [self.cost]
         hammings = [hamming_distance(self.solution, self.config)]
     # swaps = np.arange(0, len(self.solution))
     swaps = [i for i, val in enumerate(self.solution)]
     for T in temperatures:
         start = time.time()
         accept_probs = self.get_accept_probs(T)
         for sweep in xrange(n_sweeps):
             self.shuffle(swaps)
             for swap in swaps:
                 new_sol, new_cost = self.update_solution(swap)
                 diff = int(self.cost - new_cost)
                 if diff >= 0 or self.random() < accept_probs[diff]:
                     self.solution = new_sol
                     self.cost = new_cost
             if self.plot:
                 energies.append(self.cost)
                 hammings.append(
                     hamming_distance(self.solution, self.config))
         print T, ': ', 'Current best: ', self.cost
         print 'timing', time.time() - start
     if self.plot:
         self.save_plot(energies, T, hammings)
     return (self.solution, self.cost)
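Nearly every example on this page calls a project-local hamming_distance helper rather than a library function. As a reference point, a minimal sketch of the usual definition for equal-length sequences (an assumption, not taken from any of the projects quoted here) is:

def hamming_distance(seq1, seq2):
    # count the positions at which two equal-length sequences differ
    if len(seq1) != len(seq2):
        raise ValueError('hamming_distance needs equal-length inputs')
    return sum(c1 != c2 for c1, c2 in zip(seq1, seq2))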
Example No. 2
    def reassign_template_counts(self, msa_info, new_alleles, debug=False):
        # XXX need to update family_groups here
        if len(new_alleles) == 0:
            return

        if debug:
            print '              template  new'
            print '      size      snps    snps    assigned',
            if self.reco_info is not None:
                print '         true',
            print ''

        dbg_print = debug  # don't print all the tiny clusters
        templates = {newfo['template-gene'] : newfo['gene'] for newfo in new_alleles.values()}
        self.adjusted_glcounts = {}
        for clusterfo in sorted(msa_info, key=lambda cfo: len(cfo['seqfos']), reverse=True):
            sorted_glcounts, true_sorted_glcounts = self.get_glcounts(clusterfo)  # it would be nice to not re-call this for the clusters we already called it on above
            for gene, counts in sorted_glcounts:  # <gene> is the one assigned by sw before allele clustering
                if debug and len(clusterfo['seqfos']) < 5:
                    if dbg_print:
                        print '     not printing clusters smaller than 5'
                    dbg_print = False

                if gene not in self.adjusted_glcounts:  # add it before we decide whether to switch it, so a template gene with zero counts will be in there with zero counts
                    self.adjusted_glcounts[gene] = 0
                if gene in templates:  # if this was a template for a new allele, we have to decide whether to apportion some or all of the sequences in this cluster to that new allele
                    template_gene = gene
                    template_cpos = utils.cdn_pos(self.glfo, self.region, template_gene)
                    cons_seq = clusterfo['cons_seq']
                    template_seq = self.glfo['seqs'][self.region][template_gene]
                    new_allele_seq = new_alleles[templates[template_gene]]['seq']

                    compare_len = min([template_cpos, len(cons_seq), len(template_seq), len(new_allele_seq)])  # NOTE this doesn't account for indels, i.e. the template and consensus sequences are in general different lengths, but that's ok, it'll just inflate the hamming distance for sequences that differ from consensus by indels, and all we care is finding the one that doesn't have any indels
                    n_template_snps = utils.hamming_distance(cons_seq[:compare_len], template_seq[:compare_len])
                    n_new_snps = utils.hamming_distance(cons_seq[:compare_len], new_allele_seq[:compare_len])

                    if debug and dbg_print:
                        print '    %5d      %3d     %3d' % (len(clusterfo['seqfos']), n_template_snps, n_new_snps),

                    if n_new_snps < n_template_snps:  # reassign to the new allele
                        gene = templates[template_gene]
                        if gene not in self.adjusted_glcounts:  # add it before we decide whether to switch it, so a template gene with zero counts will be in there with zero counts
                            self.adjusted_glcounts[gene] = 0

                    if debug and dbg_print:
                        print '    %s' % utils.color_gene(gene, width=15),
                        if self.reco_info is not None:
                            true_gene = true_sorted_glcounts[0][0]  # NOTE this is the most *common* simulated gene in the cluster, not necessarily the one corresponding to these particular sequences... but clusters with new alleles should generally be dominated by one gene, so oh, well
                            if true_gene == gene:
                                print '    %s' % utils.color('green', 'ok'),
                            else:
                                print '    %s' % utils.color_gene(true_gene, width=15),
                        print ''

                self.adjusted_glcounts[gene] += counts

        if debug:
            print '  final counts:'
            for gene, counts in sorted(self.adjusted_glcounts.items(), key=operator.itemgetter(1), reverse=True):
                print '    %4d  %s' % (counts, utils.color_gene(gene))
Example No. 3
    def finalize(self, sorted_gene_counts, debug=False):
        # NOTE <sorted_gene_counts> is usually/always floats instead of integers
        assert not self.finalized
        easycounts = {gene : counts for gene, counts in sorted_gene_counts}
        total_counts = sum([counts for counts in easycounts.values()])

        self.genes_to_keep = set()

        if debug:
            print '  removing least likely genes (%.1f total counts)' % total_counts
            print '     %-20s    %5s (%s)      removed genes (counts)' % ('genes to keep', 'counts', 'snps'),
            def count_str(cnt):
                if cnt < 10.:
                    return '%.1f' % cnt
                else:
                    return '%.0f' % cnt

        class_counts = self.separate_into_classes(sorted_gene_counts, easycounts)
        for iclass in range(len(class_counts)):
            gclass = class_counts[iclass]
            n_from_this_class = 0
            for ig in range(len(gclass)):
                gfo = gclass[ig]
                if self.args.n_max_total_alleles is not None and len(self.genes_to_keep) >= self.args.n_max_total_alleles:  # command line can specify the total number of alleles
                    break

                if float(gfo['counts']) / total_counts < self.args.min_allele_prevalence_fraction:  # always skip everybody that's super uncommon
                    pass
                elif ig == 0:  # keep the first one from this class
                    self.genes_to_keep.add(gfo['gene'])
                    n_from_this_class += 1
                elif utils.hamming_distance(gclass[0]['seq'], gclass[ig]['seq']) == 0:  # don't keep it if it's indistinguishable from the most common one (the matches are probably mostly really the best one)
                    pass  # don't keep it
                elif n_from_this_class < self.args.n_alleles_per_gene:  # always keep the most common <self.args.n_alleles_per_gene> in each class
                    self.genes_to_keep.add(gfo['gene'])
                    n_from_this_class += 1
                else:
                    pass  # don't keep it

                if debug and gfo['gene'] in self.genes_to_keep:
                    snpstr = ' ' if ig == 0 else '(%d)' % utils.hamming_distance(gclass[0]['seq'], gfo['seq'])
                    print '\n       %-s  %7s  %-3s' % (utils.color_gene(gfo['gene'], width=20), count_str(gfo['counts']), snpstr),
            if debug:
                if n_from_this_class == 0:
                    print '\n       %-s  %7s  %-3s' % (utils.color('blue', 'none', width=20, padside='right'), '-', ''),
                removedfo = [gfo for gfo in gclass if gfo['gene'] not in self.genes_to_keep]
                if len(removedfo) > 0:
                    removal_strs = ['%s (%s)' % (utils.color_gene(gfo['gene']), count_str(gfo['counts'])) for gfo in removedfo]
                    print '        %s' % '  '.join(removal_strs),
        if debug:
            print ''

        self.genes_to_remove = set(self.glfo['seqs'][self.region]) - self.genes_to_keep

        print '    keeping %d / %d %s gene%s' % (len(self.genes_to_keep), len(self.glfo['seqs'][self.region]), self.region, utils.plural(len(self.genes_to_keep)))
        # print '    removing %d %s genes: %d with no matches, %d with unconvincing matches' % (len(self.genes_to_remove), self.region, len(set(self.glfo['seqs'][self.region]) - set(easycounts)), len(set(easycounts) - self.genes_to_keep))

        self.finalized = True
Example No. 4
def nao_valid(queue, model):
    pa = utils.AvgrageMeter()
    hs = utils.AvgrageMeter()
    with torch.no_grad():
        model.eval()
        for step, sample in enumerate(queue):
            encoder_input = sample['encoder_input']
            encoder_target = sample['encoder_target']
            decoder_target = sample['decoder_target']

            encoder_input = encoder_input.cuda()
            encoder_target = encoder_target.cuda()
            decoder_target = decoder_target.cuda()

            predict_value, logits, arch = model(encoder_input)
            n = encoder_input.size(0)
            pairwise_acc = utils.pairwise_accuracy(
                encoder_target.data.squeeze().tolist(),
                predict_value.data.squeeze().tolist())
            hamming_dis = utils.hamming_distance(
                decoder_target.data.squeeze().tolist(),
                arch.data.squeeze().tolist())
            pa.update(pairwise_acc, n)
            hs.update(hamming_dis, n)
    return pa.avg, hs.avg
Example No. 5
def naive_hdist_or_none(line1, line2):
    if line1['cdr3_length'] != line2['cdr3_length']:
        return None
    hdist = utils.hamming_distance(naive_cdr3(line1), naive_cdr3(line2))
    if hdist > args.max_cdr3_distance:
        return None
    return hdist
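This snippet assumes a naive_cdr3 helper and an args.max_cdr3_distance setting from the surrounding script. A hypothetical sketch of naive_cdr3, modeled on the CDR3 bounds used in Examples No. 22-24 (codon_positions['v'] through codon_positions['j'] + 3), might look like:

def naive_cdr3(line):
    # restrict the naive sequence to the CDR3 region, using the same bounds
    # convention as the hamming_to_true_naive examples further down this list
    istart = line['codon_positions']['v']
    istop = line['codon_positions']['j'] + 3
    return line['naive_seq'][istart:istop]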
Example No. 6
    def keep_this_gene(self, this_gene, pcounter, easycounts, debug=False):
        assert self.region == 'v'  # conserved codon stuff below will have to be changed for j
        glseqs = self.glfo['seqs'][self.region]
        this_seq = glseqs[this_gene][:self.codon_positions[this_gene] + 3]  # only compare up to the conserved cysteine

        # don't keep it if it's pretty close to a gene we already have
        n_close_genes = 0
        nearest_gene, nearest_hdist = None, None
        for kgene in self.genes_to_keep:
            kseq = glseqs[kgene][:self.codon_positions[kgene] + 3]
            if len(kseq) != len(this_seq):
                continue
            hdist = utils.hamming_distance(kseq, this_seq)
            if nearest_hdist is None or hdist < nearest_hdist:
                nearest_hdist = hdist
                nearest_gene = kgene
            if hdist < self.args.n_max_snps - 1:
                n_close_genes += 1

        if easycounts[this_gene] < self.alfinder.n_total_min:  # if we hardly ever saw it, there's no good reason to believe it wasn't the result of just mutational wandering
            self.dbg_strings[this_gene] = 'not enough counts (%d < %d)' % (easycounts[this_gene], self.alfinder.n_total_min)
            return False

        self.dbg_strings[this_gene] = 'nearest gene %s %s' % (nearest_gene, nearest_hdist)
        return True
Example No. 7
def nao_valid(queue, model):
    inputs = []
    targets = []
    predictions = []
    archs = []
    with torch.no_grad():
        model.eval()
        for step, sample in enumerate(queue):
            encoder_input = sample['encoder_input']
            encoder_target = sample['encoder_target']
            decoder_target = sample['decoder_target']

            encoder_input = encoder_input.cuda()
            encoder_target = encoder_target.cuda()
            decoder_target = decoder_target.cuda()

            predict_value, logits, arch = model(encoder_input)
            n = encoder_input.size(0)
            inputs += encoder_input.data.squeeze().tolist()
            targets += encoder_target.data.squeeze().tolist()
            predictions += predict_value.data.squeeze().tolist()
            archs += arch.data.squeeze().tolist()
    pa = utils.pairwise_accuracy(targets, predictions)
    hd = utils.hamming_distance(inputs, archs)
    return pa, hd
Example No. 8
    def test_hamming_distance_between_base_case_strings_as_specified_in_problem_spec(
            self):
        from utils import hamming_distance
        from data_utils import convert_string_to_binary_string

        self.assertEqual(37,hamming_distance(convert_string_to_binary_string('this is a test'),\
             convert_string_to_binary_string('wokka wokka!!!')))
Example No. 9
def rate_key_size(key_size, ciphertext):
	dist = 0
	for block_1, block_2 in zip(chunks(ciphertext, key_size), chunks(ciphertext, key_size)[1:]):
		dist += hamming_distance(block_1, block_2)
	dist /=  len(ciphertext) / key_size
	normalized = dist / key_size
	return normalized
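Both copies of rate_key_size (above, and reformatted in Example No. 12) rely on a chunks helper that splits the ciphertext into consecutive key_size-sized blocks and must return a list, since the result is sliced with [1:]. A minimal sketch of such a helper (assumed, not taken from the original repository):

def chunks(data, size):
    # consecutive blocks of the given size (the last block may be shorter)
    return [data[i:i + size] for i in range(0, len(data), size)]

The normalized score is then compared across candidate key sizes; the smallest average distance points to the most likely repeating-key length.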
Example No. 10
    def keep_this_gene(self, this_gene, pcounter, easycounts, debug=False):
        assert self.region == 'v'  # conserved codon stuff below will have to be changed for j
        glseqs = self.glfo['seqs'][self.region]
        this_seq = glseqs[this_gene][:self.codon_positions[this_gene] + 3]  # only compare up to the conserved cysteine

        # don't keep it if it's pretty close to a gene we already have
        n_close_genes = 0
        nearest_gene, nearest_hdist = None, None
        for kgene in self.genes_to_keep:
            kseq = glseqs[kgene][:self.codon_positions[kgene] + 3]
            if len(kseq) != len(this_seq):
                continue
            hdist = utils.hamming_distance(kseq, this_seq)
            if nearest_hdist is None or hdist < nearest_hdist:
                nearest_hdist = hdist
                nearest_gene = kgene
            if hdist < self.args.n_max_snps - 1:
                n_close_genes += 1

        if easycounts[this_gene] < self.alfinder.n_total_min:  # if we hardly ever saw it, there's no good reason to believe it wasn't the result of just mutational wandering
            self.dbg_strings[this_gene] = 'not enough counts (%d < %d)' % (easycounts[this_gene], self.alfinder.n_total_min)
            return False

        self.dbg_strings[this_gene] = 'nearest gene %s %s' % (nearest_gene, nearest_hdist)
        return True
Example No. 11
    def find(self, NAP):
        crude_hash = utils.d_hash(NAP, hash_size=8)
        fine_hash = utils.d_hash(NAP, hash_size=16)
        clean_key, overlap_key = self._digitize(NAP)

        unsorted = ((self.NAP_intervals[clean_key] if clean_key in self.NAP_intervals else [])
                    + (self.NAP_intervals[overlap_key] if overlap_key in self.NAP_intervals else []))  # parenthesized so candidates from both the clean and the overlap bucket are combined

        return sorted(unsorted, key = lambda x: utils.hamming_distance(fine_hash, x.fine_hash))[0] if len(unsorted) else [], crude_hash, fine_hash, clean_key, overlap_key
Example No. 12
def rate_key_size(key_size, ciphertext):
    dist = 0
    for block_1, block_2 in zip(chunks(ciphertext, key_size),
                                chunks(ciphertext, key_size)[1:]):
        dist += hamming_distance(block_1, block_2)
    dist /= len(ciphertext) / key_size
    normalized = dist / key_size
    return normalized
Example No. 13
 def too_close_to_already_added_gene(self, new_seq, new_alleles, debug=False):
     for added_name, added_info in new_alleles.items():
         n_snps = utils.hamming_distance(added_info['seq'], new_seq, align=True)
         if n_snps < self.min_n_snps or n_snps < self.args.n_max_snps:
             if debug:
                 print 'too close (%d snp%s) to gene we just added %s' % (n_snps, utils.plural(n_snps), utils.color_gene(added_name))
             return True
     return False
Example No. 14
    def choose_seqs_to_remove(
            chain_ids,
            max_hdist=4,
            tdbg=False):  # choose one of <chain_ids> to eliminate
        # look for pairs with the same locus and equal-length sequences, and mark the worse one of any pair within <max_hdist> for removal
        ids_to_remove = set(u for u in chain_ids if getloc(u) == '?')
        if tdbg and len(
                ids_to_remove
        ) > 0:  # i think this actually can't happen a.t.m. TODO maybe remove it
            print '      removed %d with missing annotations' % len(
                ids_to_remove)

        dbgstr = []
        n_equivalent = 0
        for tpair in itertools.combinations(chain_ids, 2):
            if len(set(getloc(u) for u in tpair)) > 1:
                continue
            if len(set(len(gval(u, 'seqs')) for u in tpair)) > 1:
                continue
            hdist = utils.hamming_distance(*[gval(u, 'seqs') for u in tpair])
            if tdbg:
                dbgstr.append(
                    utils.color('blue' if hdist == 0 else 'yellow',
                                '%d' % hdist))
            if hdist <= max_hdist:  # TODO would be nice to be able to combine their sequences, but I think propagating the resulting annotation modifications would be hard
                # print '      identical sequence overlap, choosing longer one'
                better_id, worse_id = sorted(
                    tpair, key=lambda q: utils.ambig_frac(gval(q, 'seqs'))
                )  # TODO if we're tossing one with hdist > 0, maybe should take the lower-shm one if they're the same length?
                ids_to_remove.add(worse_id)
                n_equivalent += 1
        if tdbg and len(dbgstr) > 0:
            print '        %d pair%s equivalent with hdists %s' % (
                n_equivalent, utils.plural(n_equivalent), ' '.join(dbgstr))

        # remove unproductive
        dbgstr = []
        unproductive_ids = []
        for uid in chain_ids:
            if not utils.is_functional(
                    all_antns[uid], all_antns[uid]['unique_ids'].index(uid)):
                unproductive_ids.append(uid)
                if tdbg:
                    dbgstr.append(
                        utils.is_functional_dbg_str(
                            all_antns[uid],
                            all_antns[uid]['unique_ids'].index(uid),
                            sep='+'))
        # unproductive_ids = [u for u in chain_ids if not utils.is_functional(all_antns[u], all_antns[u]['unique_ids'].index(u))]  # this way is only one line, which may or may not be nicer
        if tdbg and len(unproductive_ids) > 0:
            print '        %d unproductive  %s' % (len(unproductive_ids),
                                                   ',  '.join(dbgstr))
        ids_to_remove |= set(unproductive_ids)  # remove them whether or not we're printing debug info

        return ids_to_remove
Example No. 15
def build_v_gene_set(glfo, introns):
    total_d_counts = {}
    refseqs = {}
    for d_gene, counts in introns.items():
        total_d_counts[d_gene] = sum(counts.values())
    for d_gene, _ in sorted(total_d_counts.items(), key=operator.itemgetter(1), reverse=True):
        counts = introns[d_gene]

        # first decide on the reference sequences
        refseq, column_counts = None, None
        for seq in sorted(counts, key=len, reverse=True):
            if refseq is None:  # first one, i.e. the longest
                refseq = seq
                column_counts = [{n : 0 for n in utils.nukes} for i in range(len(refseq))]
            ioffset = len(refseq) - len(seq)
            partial_refseq = refseq[ioffset:]
            assert len(partial_refseq) == len(seq)
            for ibase in range(ioffset, len(refseq)):
                column_counts[ibase][seq[ibase - ioffset]] += counts[seq]

        refseqs[d_gene] = []
        for basecounts in column_counts:
            most_common_base = sorted(basecounts.items(), key=operator.itemgetter(1), reverse=True)[0][0]
            refseqs[d_gene].append(most_common_base)
        refseqs[d_gene] = ''.join(refseqs[d_gene])

        n_ok = 0
        mutecounts = {}
        for seq in sorted(counts, key=len, reverse=True):
            # print '    %3d   %150s' % (count, seq)
            partial_refseq = refseqs[d_gene][len(refseqs[d_gene]) - len(seq):]
            if seq == partial_refseq:
                n_ok += counts[seq]
            else:
                # utils.color_mutants(partial_refseq, seq, print_result=True, extra_str='                ')
                n_mutes = utils.hamming_distance(partial_refseq, seq)
                if n_mutes not in mutecounts:
                    mutecounts[n_mutes] = 0
                mutecounts[n_mutes] += counts[seq]
        print '  %s   %4d / %-4d ok' % (utils.color_gene(d_gene, width=10), n_ok, n_ok + sum(mutecounts.values())),
        if len(mutecounts) > 0:
            print '(mean of %.1f mutations among the other %d)' % (numpy.average(mutecounts.keys(), weights=mutecounts.values()), sum(mutecounts.values())),
        print ''

    # add the intronic v genes to glfo
    for d_gene, refseq in refseqs.items():
        glfo['seqs']['v'][utils.generate_dummy_v(d_gene)] = refseq
        glfo['cyst-positions'][utils.generate_dummy_v(d_gene)] = len(refseq) - 3

    # write a glfo dir with everything
    glutils.write_glfo(outdir + '/germlines/imgt-and-intronic', glfo, debug=True)

    # remove the original v genes, and write a glfo dir with just the intronic ones
    glutils.remove_genes(glfo, [g for g in glfo['seqs']['v'] if 'xDx' not in g], debug=True)
    glutils.write_glfo(outdir + '/germlines/intronic', glfo, debug=True)
Example No. 16
 def test_barcodes(self, barcode_set):
     '''
     validate stored barcodes to check that they are at least ${min_distance} nucleotides apart
     '''
     for bc1 in barcode_set:
         for bc2 in barcode_set:
             if bc1 != bc2:
                 assert hamming_distance(bc1, bc2) >= self.min_distance, \
                         'Bad barcodes %s and %s' %(bc1,bc2)
     print('All barcodes are at least %i nucleotides apart' %
           self.min_distance,
           file=sys.stderr)
Example No. 17
def c6():
	str1 = "this is a test"
	str2 = "wokka wokka!!!"
	result = utils.hamming_distance(str1, str2)
	expect(37.0, result)

	filename = "data/6.txt"
	ciphertext = utils.get_ciphertext(filename)
	result = cryptopals.break_repeating_key_xor(ciphertext)
	print
	print result[1]
	print result[0]
Example No. 18
    def guess_inflection(self, lemma, best_node):
        '''
        :lemma: the lemma to inflect
        :best_node: the deepest node logically compatible with the lemma.

        :return: an inflected form using the nearest-neighbor at the :best_node: (using Hamming distance).
        '''
        options = sorted(best_node.switch_statement.vocab,
                         key=lambda it: hamming_distance(lemma, it[0]))
        closest_lemma, closest_inflected = options[0][:-1]
        suffix_of_closest = closest_inflected[len(closest_lemma):]
        return f'{lemma}{suffix_of_closest}'
Example No. 19
def approx_pattern_matching(pattern: str, genome: str, d: int) -> List[int]:
    """Find locations of k-mer (pattern') in the genome with
    hamming_distance(pattern, pattern') <= d

    >>> approx_pattern_matching("ATTCTGGA", "CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAAT", 3)
    [6, 7, 26, 27]
    """
    intervals = sliding_window(genome, len(pattern))
    return [
        i for i, interval in enumerate(intervals)
        if hamming_distance(pattern, interval) <= d
    ]
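approx_pattern_matching assumes a sliding_window helper that returns every len(pattern)-mer of the genome in order of starting position; a minimal sketch of such a helper (an assumption, not the original module's code) is:

def sliding_window(seq, k):
    # every length-k substring of seq, in order of starting position
    return [seq[i:i + k] for i in range(len(seq) - k + 1)]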
Example No. 20
    def too_close_to_existing_glfo_gene(self, clusterfo, new_seq, template_seq, template_cpos, template_gene, debug=False):
        if len(new_seq[:template_cpos]) != len(template_seq[:template_cpos]):  # TODO update this to use the new n_snps from the aligned template/new seqs
            return False

        mean_j_mutations = numpy.mean([self.all_j_mutations[seqfo['name']] for seqfo in clusterfo['seqfos']])  # TODO <self.all_j_mutations> uses everybody in the cluster, rather than just the representative. It'd be nice to synchronize this with other things
        # TODO should probably update this to do the same thing (with min([])) as up in decide_whether_to_remove_template_genes(), or just use the new <align> option to utils.hamming_distance (although that would be slower, and this seems to work ok)
        pre_cpos_snps = utils.hamming_distance(new_seq[:template_cpos], template_seq[:template_cpos])
        factor = 1.75
        if pre_cpos_snps < self.min_n_snps or pre_cpos_snps < factor * mean_j_mutations:  # i.e. we keep if it's *further* than factor * <number of j mutations> from the closest existing allele (should presumably rescale by some factor to go from j --> v, but it seems like the factor's near to 1.)
            if debug:
                print 'too close to existing glfo gene %s (%d snp%s < %.2f = %.2f * %.1f mean j mutation%s)' % (utils.color_gene(template_gene), pre_cpos_snps, utils.plural(pre_cpos_snps), factor * mean_j_mutations, factor, mean_j_mutations, utils.plural(mean_j_mutations))
            return True

        return False
Example No. 21
    def learn(self, NAP, wav_file, segment_idxs):
        best_match, crude_hash, fine_hash, clean_key, overlap_key = self.find(NAP)
        
        if not len(best_match):
            audio_id = self.audio_id_counter
            print 'New audio_id {}, never before heard length {}'.format(audio_id, NAP.shape[0])
            self.audio_id_counter += 1
        else:
            if utils.hamming_distance(crude_hash, best_match.crude_hash) < AUDIO_HAMMERTIME:
                audio_id = best_match.audio_id
                print 'Similar to audio_id {}, hamming distance {}'.format(audio_id, utils.hamming_distance(crude_hash, best_match.crude_hash))
            else:
                audio_id = self.audio_id_counter
                self.audio_id_counter += 1                
                print 'New audio_id {}, hamming distance {} from audio_id {}'.format(audio_id, utils.hamming_distance(crude_hash, best_match.crude_hash), best_match.audio_id)
                
        audio_segment = AudioSegment(audio_id, crude_hash, fine_hash, wav_file, segment_idxs)

        self._insert(self.NAP_intervals, clean_key, audio_segment)
        self._insert(self.NAP_intervals, overlap_key, audio_segment)
        self._insert(self.audio_ids, audio_id, audio_segment)
        
        return audio_id
Example No. 22
 def hamming_to_true_naive(self, true_line, line, restrict_to_region=''):
     true_naive_seq = true_line['naive_seq']
     inferred_naive_seq = line['naive_seq']
     if len(true_naive_seq) != len(inferred_naive_seq):
         raise Exception('different length true and inferred naive seqs for %s\n  %s\n  %s' % (' '.join(line['unique_ids']), true_line['naive_seq'], line['naive_seq']))
     if restrict_to_region != '':  # NOTE very similar to utils.get_n_muted(), except, we want to use the true bounds for both true and naive sequences
         if restrict_to_region in utils.regions:
             bounds = true_line['regional_bounds'][restrict_to_region]
         elif restrict_to_region == 'cdr3':
             bounds = (true_line['codon_positions']['v'], true_line['codon_positions']['j'] + 3)
         else:
             assert False
         true_naive_seq = true_naive_seq[bounds[0] : bounds[1]]
         inferred_naive_seq = inferred_naive_seq[bounds[0] : bounds[1]]
     return utils.hamming_distance(true_naive_seq, inferred_naive_seq)
Example No. 23
 def hamming_to_true_naive(self, true_line, line, restrict_to_region=''):
     true_naive_seq = true_line['naive_seq']
     inferred_naive_seq = line['naive_seq']
     if len(true_naive_seq) != len(inferred_naive_seq):
         raise Exception('different length true and inferred naive seqs for %s\n  %s\n  %s' % (' '.join(line['unique_ids']), true_line['naive_seq'], line['naive_seq']))
     if restrict_to_region != '':  # NOTE very similar to utils.get_n_muted(), except, we want to use the true bounds for both true and naive sequences
         if restrict_to_region in utils.regions:
             bounds = true_line['regional_bounds'][restrict_to_region]
         elif restrict_to_region == 'cdr3':
             bounds = (true_line['codon_positions']['v'], true_line['codon_positions']['j'] + 3)
         else:
             print 'invalid regional restriction %s' % restrict_to_region
         true_naive_seq = true_naive_seq[bounds[0] : bounds[1]]
         inferred_naive_seq = inferred_naive_seq[bounds[0] : bounds[1]]
     return utils.hamming_distance(true_naive_seq, inferred_naive_seq)
Example No. 24
 def hamming_to_true_naive(self, true_line, line, restrict_to_region=''):
     true_naive_seq, inferred_naive_seq = self.harmonize_naive_seq_lengths(
         true_line, line)
     if restrict_to_region != '':  # NOTE very similar to utils.get_n_muted(), except, we want to use the true bounds for both true and naive sequences
         if restrict_to_region in utils.regions:
             bounds = true_line['regional_bounds'][restrict_to_region]
         elif restrict_to_region == 'cdr3':
             bounds = (true_line['codon_positions']['v'],
                       true_line['codon_positions']['j'] + 3)
         else:
             print 'invalid regional restriction %s' % restrict_to_region
         if restrict_to_region == 'v':  # NOTE this is kind of hackey, especially treating v differently to d and j, but it kind of makes sense -- v is fundamentally different in that germline v is a real source of diversity, so it makes sense to isolate the v germline accuracy from the boundary-call accuracy like this
             bounds = (
                 bounds[0], max(bounds[0], bounds[1] - self.v_3p_exclusion)
             )  # most of the boundary uncertainty is in the last three bases
         true_naive_seq = true_naive_seq[bounds[0]:bounds[1]]
         inferred_naive_seq = inferred_naive_seq[bounds[0]:bounds[1]]
     return utils.hamming_distance(true_naive_seq, inferred_naive_seq)
Example No. 25
def nao_valid(queue, model):
    pa = utils.AvgMeter()
    hs = utils.AvgMeter()
    mse = utils.AvgMeter()

    for step, sample in enumerate(queue):
        encoder_input = sample['encoder_input']
        encoder_target = sample['encoder_target']
        decoder_target = sample['decoder_target']

        predict_value, logits, arch = model(encoder_input)
        n = encoder_input.shape[0]
        pairwise_acc = utils.pairwise_accuracy(encoder_target.data.squeeze().tolist(),
                                               predict_value.data.squeeze().tolist())
        hamming_dis = utils.hamming_distance(decoder_target.data.squeeze().tolist(), arch.data.squeeze().tolist())
        mse.update(keras.losses.MSE(encoder_target.data.squeeze(), predict_value.data.squeeze()), n)
        pa.update(pairwise_acc, n)
        hs.update(hamming_dis, n)

    return mse.avg, pa.avg, hs.avg
Example No. 26
    def solve_(self, problem: CSProblem) -> Tuple[str, dict]:
        m, n, strings, alphabet = problem.m, problem.n, problem.strings, problem.alphabet

        i, j = max(combinations(range(n), 2),
                   key=lambda coords: utils.hamming_distance(
                       strings[coords[0]], strings[coords[1]]))

        si = strings[i]
        sj = strings[j]
        P = utils.P(si, sj)
        Q = utils.Q(si, sj)

        k = len(P)  # Number of positions that they disagree on
        epsilon = self.config['epsilon']
        sigma = self.config['sigma']

        solve_func = solve_by_lp_relaxation
        lp_used = True

        decision_measure = (6 * math.log(sigma * m)) // (epsilon**2)

        if k <= decision_measure:
            solve_func = solve_by_force
            lp_used = False

        # print(f'Decision: |P| = {k}', file=sys.stderr)
        # print(f'Measure = {decision_measure}', file=sys.stderr)
        # print(f'Meaning {"FORCE" if not lp_used else "LP"}', file=sys.stderr)

        ss = solve_func(P, Q, alphabet, m, n, strings, si)
        if ss is None:
            # print('not stonks?')
            return si, {'lp_used': lp_used, 'orig': True}

        new_sol, new_sol_metric = ss

        # print(f'Found sol: {new_sol}')

        if new_sol_metric < k:
            return new_sol, {'lp_used': lp_used, 'orig': False}
        return si, {'lp_used': lp_used, 'orig': True}
Example No. 27
    def separate_into_classes(self, sorted_gene_counts, easycounts):
        class_counts = []
        for gene, counts in sorted_gene_counts:
            seq = self.glfo['seqs'][self.region][gene][:self.codon_positions[gene] + 3]
            add_new_class = True
            for gclass in class_counts:
                for gfo in gclass:
                    if len(gfo['seq']) != len(seq):
                        continue
                    hdist = utils.hamming_distance(gfo['seq'], seq)
                    if hdist < self.args.n_max_snps - 1:  # if this gene is close to any gene in the class, add it to this class
                        add_new_class = False
                        class_counts[class_counts.index(gclass)].append({'gene' : gene, 'counts' : counts, 'seq' : seq})
                        break
                if not add_new_class:
                    break

            if add_new_class:
                class_counts.append([{'gene' : gene, 'counts' : counts, 'seq' : seq}, ])

        return class_counts
Example No. 28
    def separate_into_classes(self, sorted_gene_counts, easycounts):
        class_counts = []
        for gene, counts in sorted_gene_counts:
            seq = self.glfo['seqs'][self.region][gene][:self.codon_positions[gene] + 3]
            add_new_class = True
            for gclass in class_counts:
                for gfo in gclass:
                    if len(gfo['seq']) != len(seq):
                        continue
                    hdist = utils.hamming_distance(gfo['seq'], seq)
                    if hdist < self.args.n_max_snps - 1:  # if this gene is close to any gene in the class, add it to this class
                        add_new_class = False
                        class_counts[class_counts.index(gclass)].append({'gene' : gene, 'counts' : counts, 'seq' : seq})
                        break
                if not add_new_class:
                    break

            if add_new_class:
                class_counts.append([{'gene' : gene, 'counts' : counts, 'seq' : seq}, ])

        return class_counts
Example No. 29
#!/usr/bin/env python

import utils

utils.test(utils.hamming_distance(utils.str_to_bin('this is a test'),
                                  utils.str_to_bin('wokka wokka!!!')), 37)

with open('files/6.txt') as f:
    text = f.read().decode('base64')

bin_text = utils.str_to_bin(text)
bin_text_arr = bin_text.split()
min_distance = 0
sizes = []

keysizes = []
for keysize in range(2, 41):
    # first KEYSIZE worth of bytes, and the second KEYSIZE worth of bytes
    info = [bin_text_arr[i:i+keysize]
            for i in range(0, len(bin_text_arr), keysize)]

    total_normalized_distance = 0

    # average the hamming distance between sets of bytes
    for a, b in zip(info[::2], info[1::2]):
        edit_distance = utils.hamming_distance(''.join(a), ''.join(b))
        # Normalize this result by dividing by KEYSIZE.
        total_normalized_distance += float(edit_distance) / float(keysize)

    avg_distance = float(total_normalized_distance) / float(len(info))
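The script above (and the tests in Examples No. 8 and 17) relies on the well-known result that the bitwise Hamming distance between 'this is a test' and 'wokka wokka!!!' is 37. utils.str_to_bin is assumed to expand each character into its 8-bit binary form; the later .split() call suggests the bytes are space-separated, so a hypothetical sketch would be:

def str_to_bin(s):
    # space-separated 8-bit binary representation of each character
    # (assumed layout, inferred from the .split() call in the script above)
    return ' '.join(format(ord(c), '08b') for c in s)

For the two equal-length test strings the separators line up, so a character-wise hamming_distance over these strings still counts exactly the 37 differing bits.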
Example No. 30
    def finalize(self, pcounter, swfo, debug=False):
        assert not self.finalized
        sorted_gene_counts = [(deps[0], counts) for deps, counts in sorted(pcounter.counts[self.region + '_gene'].items(), key=operator.itemgetter(1), reverse=True)]
        easycounts = {gene : counts for gene, counts in sorted_gene_counts}
        total_counts = sum([counts for counts in easycounts.values()])

        self.genes_to_keep = set()

        if debug:
            print '  removing least likely alleles (%d total counts)' % total_counts
            print '     %-20s    %5s (%s)      removed counts     removed genes' % ('genes to keep', 'counts', 'snps'),

        class_counts = self.separate_into_classes(sorted_gene_counts, easycounts)
        for iclass in range(len(class_counts)):
            gclass = class_counts[iclass]
            n_from_this_class = 0
            for ig in range(len(gclass)):
                gfo = gclass[ig]
                if self.args.n_max_total_alleles is not None and len(self.genes_to_keep) >= self.args.n_max_total_alleles:  # command line can specify the total number of alleles
                    break

                if float(gfo['counts']) / total_counts < self.args.min_allele_prevalence_fraction:  # always skip everybody that's super uncommon
                    pass
                elif ig == 0:  # keep the first one from this class
                    self.genes_to_keep.add(gfo['gene'])
                    n_from_this_class += 1
                elif utils.hamming_distance(gclass[0]['seq'], gclass[ig]['seq']) == 0:  # don't keep it if it's indistinguishable from the most common one (the matches are probably mostly really the best one)
                    pass  # don't keep it
                elif n_from_this_class < self.args.n_alleles_per_gene:  # always keep the most common <self.args.n_alleles_per_gene> in each class
                    self.genes_to_keep.add(gfo['gene'])
                    n_from_this_class += 1
                else:
                    pass  # don't keep it

                if debug and gfo['gene'] in self.genes_to_keep:
                    snpstr = ' ' if ig == 0 else '(%d)' % utils.hamming_distance(gclass[0]['seq'], gfo['seq'])
                    print '\n       %-s  %7d  %-3s' % (utils.color_gene(gfo['gene'], width=20), gfo['counts'], snpstr),
            if debug:
                if n_from_this_class == 0:
                    print '\n       %-s  %7s  %-3s' % (utils.color('blue', 'none', width=20, padside='right'), '-', ''),
                removedfo = [gfo for gfo in gclass if gfo['gene'] not in self.genes_to_keep]
                if len(removedfo) > 0:
                    print '           %5d            %s' % (sum([gfo['counts'] for gfo in removedfo]), ' '.join([utils.color_gene(gfo['gene']) for gfo in removedfo])),
        if debug:
            print ''

        # for igene in range(len(sorted_gene_counts)):
        #     gene, counts = sorted_gene_counts[igene]
        #     if igene == 0:  # always keep the first one
        #         self.dbg_strings[gene] = 'first gene'
        #         self.genes_to_keep.add(gene)
        #         continue
        #     if self.keep_this_gene(gene, pcounter, easycounts, debug=debug):
        #         self.genes_to_keep.add(gene)

        # print '  keeping:'
        # for gene in [g for g, _ in sorted_gene_counts if g in self.genes_to_keep]:
        #     print '    %5d  %s  %s' % (easycounts[gene], utils.color_gene(gene, width=15), self.dbg_strings[gene])

        self.genes_to_remove = set(self.glfo['seqs'][self.region]) - self.genes_to_keep

        # print '  removing:'
        # for gene in [g for g, _ in sorted_gene_counts if g in self.genes_to_remove]:
        #     print '    %5d  %s  %s' % (easycounts[gene], utils.color_gene(gene, width=15), self.dbg_strings[gene])

        n_queries_with_removed_genes = 0
        for query in swfo['queries']:
            line = swfo[query]
            if line[self.region + '_gene'] in self.genes_to_remove:
                n_queries_with_removed_genes += 1
                # unpadded_line = copy.deepcopy(line)
                # unpadded_line['seqs'][0] = unpadded_line['seqs'][0][unpadded_line['padlefts'][0] : ]
                # if unpadded_line['padrights'][0] > 0:
                #     unpadded_line['seqs'][0] = unpadded_line['seqs'][0][ : -unpadded_line['padrights'][0]]
                # utils.print_reco_event(self.glfo['seqs'], unpadded_line)

        print '    keeping %d %s genes' % (len(self.genes_to_keep), self.region)
        print '    removing %d %s genes: %d with no matches, %d with unconvincing matches (%d / %d queries had their best match removed)' % (len(self.genes_to_remove), self.region, len(set(self.glfo['seqs'][self.region]) - set(easycounts)), len(set(easycounts) - self.genes_to_keep), n_queries_with_removed_genes, len(swfo['queries']))

        self.finalized = True
Example No. 31
def new_respond(control_host, learn_host, debug=False):
    context = zmq.Context()
    
    eventQ = context.socket(zmq.SUB)
    eventQ.connect('tcp://{}:{}'.format(control_host, IO.EVENT))
    eventQ.setsockopt(zmq.SUBSCRIBE, b'') 

    projector = context.socket(zmq.PUSH)
    projector.connect('tcp://{}:{}'.format(control_host, IO.PROJECTOR)) 

    sender = context.socket(zmq.PUSH)
    sender.connect('tcp://{}:{}'.format(control_host, IO.EXTERNAL))

    brainQ = context.socket(zmq.PULL)
    brainQ.bind('tcp://*:{}'.format(IO.BRAIN))

    counterQ = context.socket(zmq.REQ)
    counterQ.connect('tcp://{}:{}'.format(control_host, IO.COUNTER))
    
    cognitionQ = context.socket(zmq.PUSH)
    cognitionQ.connect('tcp://{}:{}'.format(control_host, IO.COGNITION))

    association = context.socket(zmq.REQ)
    association.connect('tcp://{}:{}'.format(learn_host, IO.ASSOCIATION))

    snapshot = context.socket(zmq.REQ)
    snapshot.connect('tcp://{}:{}'.format(control_host, IO.SNAPSHOT))

    scheduler = context.socket(zmq.PUSH)
    scheduler.connect('tcp://{}:{}'.format(control_host, IO.SCHEDULER))

    dreamQ = context.socket(zmq.PULL)
    dreamQ.bind('tcp://*:{}'.format(IO.DREAM))

    snapshot.send_json('Give me state!')
    state = snapshot.recv_json()

    poller = zmq.Poller()
    poller.register(eventQ, zmq.POLLIN)
    poller.register(brainQ, zmq.POLLIN)
    poller.register(dreamQ, zmq.POLLIN)

    sound_to_face = []
    wordFace = {}
    face_to_sound = []
    faceWord = {}
    register = {}
    video_producer = {}
    voiceType1 = 1
    voiceType2 = 6
    wordSpace1 = 0.3
    wordSpaceDev1 = 0.3
    wordSpace2 = 0.1
    wordSpaceDev2 = 0.3

    audio_ids = []
    wavs = []
    wav_audio_ids = []
    NAP_hashes = {}
    most_significant_audio_id = []
    
    if debug:
        import matplotlib.pyplot as plt
        plt.ion()
    
    while True:
        events = dict(poller.poll())

        if brainQ in events:
            cells = brainQ.recv_pyobj()

            mode = cells[0]
            wav_file = cells[1]

            if wav_file not in register:
                register[wav_file] = [False, False, False]

            if mode == 'audio_learn':
                register[wav_file][0] = cells
                            
            if mode == 'video_learn':
                register[wav_file][1] = cells

            if mode == 'face_learn':
                register[wav_file][2] = cells

            if all(register[wav_file]):
                _, _, audio_ids, audio_memory, most_significant_audio_id, wavs, wav_audio_ids = register[wav_file][0]
                _, _, tarantino = register[wav_file][1]
                _, _, face_id, face_recognizer = register[wav_file][2]          
                print 'Audio - video - face recognizers related to {} arrived at responder, total processing time {} seconds'.format(wav_file, time.time() - utils.filetime(wav_file))

                for audio_id in audio_ids: # If audio_ids is empty, none of this will happen
                    video_producer[(audio_id, face_id)] = tarantino 
                    if audio_id < len(sound_to_face) and not face_id in sound_to_face[audio_id]: # sound heard before, but not said by this face 
                        sound_to_face[audio_id].append(face_id)
                    if audio_id == len(sound_to_face):
                        sound_to_face.append([face_id])

                    wordFace.setdefault(audio_id, [[face_id,0]])
                    found = 0
                    for item in wordFace[audio_id]:
                        if item[0] == face_id:
                            item[1] += 1
                            found = 1
                    if found == 0:
                        wordFace[audio_id].append([face_id,1])

                    # We can't go from a not known face to any of the sounds, that's just the way it is.
                    print 'face_id for audio segment learned', face_id
                    if face_id != -1:
                        if face_id < len(face_to_sound) and not audio_id in face_to_sound[face_id]: #face seen before, but the sound is new
                            face_to_sound[face_id].append(audio_id)
                        if face_id == len(face_to_sound):
                            face_to_sound.append([audio_id])
                        faceWord.setdefault(face_id, [[audio_id,0]])
                        found = 0
                        for item in faceWord[face_id]:
                            if item[0] == audio_id:
                                item[1] += 1
                                found = 1
                        if found == 0:
                            faceWord[face_id].append([audio_id,1])
                            
                del register[wav_file]
                
                similar_ids = []
                for audio_id in audio_ids:
                    # I SUSPECT THIS IS WRONG, SINCE THERE IS NO SORTING OF THESE HAMMING DISTANCES IN ASSOCIATION.PY
                    new_audio_hash = audio_memory.audio_ids[audio_id][-1].crude_hash
                    similar_ids_for_this_audio_id = [ utils.hamming_distance(new_audio_hash, random.choice(h).crude_hash) for h in audio_memory.audio_ids.itervalues() ]
                    similar_ids.append(similar_ids_for_this_audio_id)

                if len(audio_ids):
                    association.send_pyobj(['analyze',wav_file,wav_audio_ids,audio_ids,wavs,similar_ids,wordFace,faceWord])
                    association.recv_pyobj()
                    sender.send_json('last_most_significant_audio_id {}'.format(most_significant_audio_id))

                cognitionQ.send_pyobj(face_recognizer) # A possibility of recognizing a face that is not connected to any soundfiles

                                
        if eventQ in events:
            pushbutton = eventQ.recv_json()
            if 'respond_single' in pushbutton:
                try:
                    filename = pushbutton['filename']
                    audio_segments = utils.get_segments(filename)
                    print 'Single response to {} duration {} seconds with {} segments'.format(filename, audio_segments[-1], len(audio_segments)-1)
                    new_sentence = utils.csv_to_array(filename + 'cochlear')
                    norm_segments = np.rint(new_sentence.shape[0]*audio_segments/audio_segments[-1]).astype('int')

                    segment_id = utils.get_most_significant_word(filename)

                    NAP = utils.trim_right(new_sentence[norm_segments[segment_id]:norm_segments[segment_id+1]])
           
                    if debug:            
                        plt.imshow(NAP.T, aspect='auto')
                        plt.draw()

                    best_match,_,_,_,_ = audio_memory.find(NAP)
                    soundfile = best_match.wav_file
                    segstart, segend = best_match.segment_idxs

                    voiceChannel = 1
                    speed = 1
                    amp = -3 # voice amplitude in dB
                    _,dur,maxamp,_ = utils.getSoundInfo(soundfile)
                    
                    start = 0
                    voice1 = 'playfile {} {} {} {} {} {} {} {} {}'.format(1, voiceType1, start, soundfile, speed, segstart, segend, amp, maxamp)
                    voice2 = ''

                    print 'Recognized as sound {}'.format(best_match.audio_id)

                    # sound_to_face, video_producer
                    projection = _project(best_match.audio_id, sound_to_face, NAP, video_producer)

                    scheduler.send_pyobj([[ dur, voice1, voice2, projection, FRAME_SIZE ]])
                    print 'Respond time from creation of wav file was {} seconds'.format(time.time() - utils.filetime(filename))
                except:
                    utils.print_exception('Single response aborted.')


            if 'play_sentence' in pushbutton:
                try:
                    sentence = pushbutton['sentence']
                    sentence = eval(sentence)
                    print '*** (play) Play sentence', sentence
                    start = 0 
                    nextTime1 = 0
                    play_events = []
                    for i in range(len(sentence)):
                        word_id = sentence[i]
                        soundfile = np.random.choice(wavs[word_id])
                        speed = 1

                        segstart, segend = wav_audio_ids[(soundfile, word_id)]
                        NAP = _extract_NAP(segstart, segend, soundfile)

                        amp = -3 # voice amplitude in dB
                        _,totaldur,maxamp,_ = utils.getSoundInfo(soundfile)
                        dur = segend-segstart
                        if dur <= 0: dur = totaldur
                        # play in both voices
                        voice1 = 'playfile {} {} {} {} {} {} {} {} {}'.format(1, voiceType1, start, soundfile, speed, segstart, segend, amp, maxamp)
                        voice2 = 'playfile {} {} {} {} {} {} {} {} {}'.format(2, voiceType1, start, soundfile, speed, segstart, segend, amp, maxamp)
                        wordSpacing1 = wordSpace1 + np.random.random()*wordSpaceDev1
                        print 'PLAY RESPOND SPACING', wordSpacing1
                        nextTime1 += (dur/speed)+wordSpacing1

                        projection = _project(audio_id, sound_to_face, NAP, video_producer)

                        play_events.append([ dur+wordSpacing1, voice1, voice2, projection, FRAME_SIZE ])                        
                    scheduler.send_pyobj(play_events)
                except:
                    utils.print_exception('Sentence play aborted.')

            if 'respond_sentence' in pushbutton:
                print 'SENTENCE Respond to', pushbutton['filename'][-12:]
                    
                try:
                    filename = pushbutton['filename']
                    audio_segments = utils.get_segments(filename)
                    print 'Sentence response to {} duration {} seconds with {} segments'.format(filename, audio_segments[-1], len(audio_segments)-1)
                    new_sentence = utils.csv_to_array(filename + 'cochlear')
                    norm_segments = np.rint(new_sentence.shape[0]*audio_segments/audio_segments[-1]).astype('int')

                    segment_id = utils.get_most_significant_word(filename)
                    print '**Sentence selected to respond to segment {}'.format(segment_id)

                    NAP = utils.trim_right(new_sentence[norm_segments[segment_id]:norm_segments[segment_id+1]])

                    best_match,_,_,_,_ = audio_memory.find(NAP)
                    audio_id = best_match.audio_id
                    soundfile = best_match.wav_file
        
                    numWords = len(audio_segments)-1
                    print numWords
                    association.send_pyobj(['setParam', 'numWords', numWords ])
                    association.recv_pyobj()
                    
                    association.send_pyobj(['makeSentence', audio_id])
                    print 'respond_sentence waiting for association output...', 
                    sentence, secondaryStream = association.recv_pyobj()

                    print '*** (respond) Play sentence', sentence, secondaryStream
                    start = 0 
                    nextTime1 = 0
                    nextTime2 = 0
                    enableVoice2 = 1

                    play_events = []

                    for i in range(len(sentence)):
                        word_id = sentence[i]
                        soundfile = np.random.choice(wavs[word_id])
                        voiceChannel = 1
                        speed = 1
                        
                        # segment start and end within sound file, if zero, play whole file
                        segstart, segend = wav_audio_ids[(soundfile, word_id)]
                        NAP = _extract_NAP(segstart, segend, soundfile)
                        
                        amp = -3 # voice amplitude in dB
                        #totaldur, maxamp = utils.getSoundParmFromFile(soundfile)
                        _,totaldur,maxamp,_ = utils.getSoundInfo(soundfile)
                        dur = segend-segstart
                        if dur <= 0: dur = totaldur
                        voice1 = 'playfile {} {} {} {} {} {} {} {} {}'.format(voiceChannel, voiceType1, start, soundfile, speed, segstart, segend, amp, maxamp)
                        #start += dur # if we want to create a 'score section' for Csound, update start time to make segments into a contiguous sentence
                        wordSpacing1 = wordSpace1 + np.random.random()*wordSpaceDev1
                        nextTime1 += (dur/speed)+wordSpacing1
                        #print 'voice 2 ready to play', secondaryStream[i], i
                        if enableVoice2:
                            word_id2 = secondaryStream[i]
                            #print 'voice 2 playing', secondaryStream[i]
                            soundfile2 = np.random.choice(wavs[word_id2])
                            voiceChannel2 = 2
                            start2 = 0.7 #  set delay between voice 1 and 2
                            speed2 = 0.7
                            amp2 = -10 # voice amplitude in dB
                            try:
                                segstart2, segend2 = wav_audio_ids[(soundfile2, word_id2)]
                                dur2 = segend2-segstart2
                                #totalDur2, maxamp2 = utils.getSoundParmFromFile(soundfile2)
                                _,totalDur2,maxamp2,_ = utils.getSoundInfo(soundfile2)
                                if dur2 <= 0: dur2 = totalDur2
                                voice2 = 'playfile {} {} {} {} {} {} {} {} {}'.format(voiceChannel2, voiceType2, start2, soundfile2, speed2, segstart2, segend2, amp2, maxamp2)
                                wordSpacing2 = wordSpace2 + np.random.random()*wordSpaceDev2
                                nextTime2 += (dur2/speed2)+wordSpacing2
                            except:
                                voice2 = ''
                                utils.print_exception('VOICE 2 tried to access an illegal soundfile/audio_id combination.')
                            #enableVoice2 = 0
                        # trig another word in voice 2 only if word 2 has finished playing (and sync to start of voice 1)
                        if nextTime1 > nextTime2: enableVoice2 = 1 

                        projection = _project(audio_id, sound_to_face, NAP, video_producer)
                        print 'SENTENCE RESPOND SPACING', wordSpacing1
                        play_events.append([ dur+wordSpacing1, voice1, voice2, projection, FRAME_SIZE ])

                    scheduler.send_pyobj(play_events)
                    print 'Sentence respond time from creation of wav file was {} seconds'.format(time.time() - utils.filetime(filename))
                except:
                    utils.print_exception('Sentence response aborted.')
                    
            if 'testSentence' in pushbutton:
                print 'testSentence', pushbutton
                association.send_pyobj(['makeSentence',int(pushbutton['testSentence'])])
                print 'testSentence waiting for association output...'
                sentence, secondaryStream = association.recv_pyobj()
                print '*** Test sentence', sentence, secondaryStream
            
            if 'assoc_setParam' in pushbutton:
                try:
                    parm, value = pushbutton['assoc_setParam'].split()
                    association.send_pyobj(['setParam', parm, value ])
                    association.recv_pyobj()
                except:
                    utils.print_exception('Assoc set param aborted.')

            if 'respond_setParam' in pushbutton:
                items = pushbutton['respond_setParam'].split()
                if items[0] == 'voiceType':
                    chan = items[1]
                    if chan == '1': voiceType1 = int(items[2])
                    if chan == '2': voiceType2 = int(items[2])
                if items[0] == 'wordSpace':
                    chan = items[1]
                    print 'wordSpace chan', chan, items
                    if chan == '1': wordSpace1 = float(items[2])
                    if chan == '2': wordSpace2 = float(items[2])
                if items[0] == 'wordSpaceDev':
                    chan = items[1]
                    print 'wordSpaceDev chan', chan, items
                    if chan == '1': wordSpaceDev1 = float(items[2])
                    if chan == '2': wordSpaceDev2 = float(items[2])

            if 'play_id' in pushbutton:
                try:
                    items = pushbutton['play_id'].split(' ')
                    if len(items) < 3: print 'PARAMETER ERROR: play_id audio_id voiceChannel voiceType'
                    play_audio_id = int(items[0])
                    voiceChannel = int(items[1])
                    voiceType = int(items[2])
                    print 'play_audio_id', play_audio_id, 'voice', voiceChannel
                    print 'wavs[play_audio_id]', wavs[play_audio_id]
                    #print wavs
                    soundfile = np.random.choice(wavs[play_audio_id])
                    
                    speed = 1
                    #print 'wav_audio_ids', wav_audio_ids
                    segstart, segend = wav_audio_ids[(soundfile, play_audio_id)]
                    #segstart = 0 # segment start and end within sound file
                    #segend = 0 # if zero, play whole file
                    amp = -3 # voice amplitude in dB
                    #dur, maxamp = utils.getSoundParmFromFile(soundfile)
                    _,dur,maxamp,_ = utils.getSoundInfo(soundfile)
                    start = 0
                    sender.send_json('playfile {} {} {} {} {} {} {} {} {}'.format(voiceChannel, voiceType, start, soundfile, speed, segstart, segend, amp, maxamp))
                except:
                    utils.print_exception('play_id aborted.')

            if 'print_me' in pushbutton:
                # just for inspecting the contents of objects while running 
                print 'printing '+pushbutton['print_me']
                if 'brain ' in pushbutton['print_me']: 
                    print_variable = pushbutton['print_me'].split('brain ')[-1]
                    try:
                        print eval(print_variable)
                    except Exception, e:
                        print e, 'print_me in brain failed.'
                elif 'association ' in pushbutton['print_me']: 
                    print_variable = pushbutton['print_me'].split('association ')[-1]
                    association.send_pyobj(['print_me',print_variable])

            if 'dream' in pushbutton:
                play_events = []
                for audio_segment in audio_memory.all_segments():
                    segstart, segend = audio_segment.segment_idxs
                    dur = segend - segstart
                    NAP = _extract_NAP(segstart, segend, audio_segment.wav_file)
                    speed = 1
                    amp = -3
                    maxamp = 1
                    start = 0
                    voice1 = 'playfile {} {} {} {} {} {} {} {} {}'.format(1, 6, np.random.rand()/3, audio_segment.wav_file, speed, segstart, segend, amp, maxamp)
                    projection = _project(audio_segment.audio_id, sound_to_face, NAP, video_producer)
                    voice2 = 'playfile {} {} {} {} {} {} {} {} {}'.format(2, 6, np.random.randint(3,6), audio_segment.wav_file, speed, segstart, segend, amp, maxamp)
                    play_events.append([ dur, voice1, voice2, projection, FRAME_SIZE ])
                print 'Dream mode playing back {} memories'.format(len(play_events))
                scheduler.send_pyobj(play_events)

            if 'save' in pushbutton:
                utils.save('{}.{}'.format(pushbutton['save'], mp.current_process().name), [ sound_to_face, wordFace, face_to_sound, faceWord, video_producer, wavs, wav_audio_ids, audio_classifier, maxlen, NAP_hashes, face_id, face_recognizer, audio_memory ])

            if 'load' in pushbutton:
                sound_to_face, wordFace, face_to_sound, faceWord, video_producer, wavs, wav_audio_ids, audio_classifier, maxlen, NAP_hashes, face_id, face_recognizer, audio_memory = utils.load('{}.{}'.format(pushbutton['load'], mp.current_process().name))
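
The handlers above assemble the same space-separated 'playfile' message in four places (voice 1, voice 2, play_id, dream). A hypothetical helper, not present in the original code (which formats the string inline), just to document the nine fields and their order:

def playfile_msg(channel, voice_type, start, soundfile, speed, segstart, segend, amp_db, maxamp):
    # channel/voice_type pick the synth voice, start delays onset (seconds),
    # segstart/segend select the segment inside <soundfile>, amp_db is in dB
    return 'playfile {} {} {} {} {} {} {} {} {}'.format(
        channel, voice_type, start, soundfile, speed, segstart, segend, amp_db, maxamp)

# e.g. sender.send_json(playfile_msg(voiceChannel, voiceType, 0, soundfile, 1, segstart, segend, -3, maxamp))
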
Example No. 32
0
#!/usr/bin/env python

import utils

utils.test(
    utils.hamming_distance(utils.str_to_bin('this is a test'),
                           utils.str_to_bin('wokka wokka!!!')), 37)

with open('files/6.txt') as f:
    text = f.read().decode('base64')

bin_text = utils.str_to_bin(text)
bin_text_arr = bin_text.split()
min_distance = 0
sizes = []

keysizes = []
for keysize in range(2, 41):
    # first KEYSIZE worth of bytes, and the second KEYSIZE worth of bytes
    info = [
        bin_text_arr[i:i + keysize]
        for i in range(0, len(bin_text_arr), keysize)
    ]

    total_normalized_distance = 0

    # average the hamming distance between sets of bytes
    for a, b in zip(info[::2], info[1::2]):
        edit_distance = utils.hamming_distance(''.join(a), ''.join(b))
        # Normalize this result by dividing by KEYSIZE.
        total_normalized_distance += float(edit_distance) / float(keysize)
Example No. 33
0
# NOTE: this fragment starts mid-script: raw_list (the lines of the input file read
# earlier), sample_size (the number of chunks compared per key size) and the
# hamming_distance / operator.itemgetter imports all come from the part that was cut off.
raw_text = ''.join(raw_list)
# convert input text from base64 to plain
text = raw_text.decode('base64')

hamm_scores = {}

for b in range(2,41):
    # Split text into chunks of length b
    text_chunks = [text[x:x+b] for x in range(0,len(text),b)]
    # First chunk
    s1 = text_chunks[0]
    hamming_sum = 0 
    for block in range(1,sample_size):
        # Subsequent chunks
        s2 = text_chunks[block]
        d = hamming_distance(s1, s2)
        hamming_sum += d
    average_hamm = hamming_sum / float(sample_size)

    normalised_hamming_distance = average_hamm / float(b)
    hamm_scores[b] = normalised_hamming_distance

best = min(hamm_scores.items(), key=itemgetter(1))[0]
#print best

rows = ['' for x in range(best)]
for i, t in enumerate(text):
    rows[i % best] += t

output_list = []
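
Both fragments above score candidate key sizes by a block-pair Hamming distance normalized by the key size. A minimal self-contained sketch of that ranking step (illustrative names only; the bit-level distance is computed directly on bytes rather than via the snippets' helpers):

def hamming_bytes(a, b):
    # number of differing *bits* between two equal-length byte strings
    return sum(bin(x ^ y).count('1') for x, y in zip(bytearray(a), bytearray(b)))

def rank_keysizes(ciphertext, lo=2, hi=41, n_pairs=4):
    # average the normalized distance over <n_pairs> adjacent block pairs per key size
    scores = {}
    for keysize in range(lo, hi):
        blocks = [ciphertext[i * keysize:(i + 1) * keysize] for i in range(n_pairs + 1)]
        dists = [hamming_bytes(blocks[i], blocks[i + 1]) / float(keysize) for i in range(n_pairs)]
        scores[keysize] = sum(dists) / len(dists)
    return sorted(scores, key=scores.get)  # most likely key sizes first

# e.g. rank_keysizes(ciphertext_bytes)[:3] gives the three best candidates
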
Example No. 34
0
 def hamming_distance_penalty(set1, set2):
     longest_hypo = len(max(set1 + set2, key=len))
     hypos = utils.as_ndarray(set1, min_length=longest_hypo)
     other_hypos = utils.as_ndarray(set2, min_length=longest_hypo)
     return np.apply_along_axis(
         lambda x: utils.hamming_distance(x, other_hypos), 1, hypos)
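
The exact behaviour of utils.as_ndarray and utils.hamming_distance isn't shown here; one plausible reading of the function above, padding both hypothesis sets to a common length and letting numpy broadcasting produce the full pairwise mismatch matrix, would look like:

import numpy as np

def pairwise_hamming(set1, set2, pad=''):
    # pad every hypothesis to the length of the longest, then use broadcasting to
    # get the (len(set1) x len(set2)) matrix of per-position mismatch counts
    longest = len(max(set1 + set2, key=len))
    a = np.array([list(h) + [pad] * (longest - len(h)) for h in set1])
    b = np.array([list(h) + [pad] * (longest - len(h)) for h in set2])
    return (a[:, None, :] != b[None, :, :]).sum(axis=-1)

# pairwise_hamming(['abc', 'abcd'], ['abd', 'xbcd'])  ->  [[1, 2], [2, 1]]
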
Example No. 35
0
    def finalize_region(self,
                        region,
                        sorted_gene_counts,
                        annotations=None,
                        debug=False):
        easycounts = {gene: counts for gene, counts in sorted_gene_counts}
        total_counts = sum([counts for counts in easycounts.values()])
        class_counts = self.separate_into_classes(region, sorted_gene_counts,
                                                  easycounts)

        genes_to_keep = set()

        if debug:
            print '   %s groups separated by %d snps  (-: same group as previous kept gene)' % (
                utils.color('blue', region), self.n_max_snps[region])
            print '     %-20s       %5s %s        removed genes (snps counts%s)%s%s' % (
                'genes to keep',
                'counts',
                '' if self.simglfo is None else utils.color('blue', 'sim'),
                '' if self.simglfo is None else utils.color(
                    'blue', ' sim counts'),
                '' if self.simglfo is None else
                ('  ' + utils.color('red', 'x:') + ' not in simulation'),
                '' if (annotations is None or self.reco_info is None) else
                ('               %s sim counts/genes for the queries assigned to this kept gene %s'
                 % (utils.color('blue', '['), utils.color('blue', ']'))),
            ),

            def count_str(cnt):
                if cnt < 10.:
                    return '%.1f' % cnt
                else:
                    return '%.0f' % cnt

            def simcountstr(gene, ws):
                # counts in simulation for <gene> (note that this is _not_ the same as sim_gene_count_str(), since this takes no account of _which_ queries these counts occur in [plus it's coming from the opposite point of view])
                if self.simglfo is None:
                    rstr = ''
                elif gene in self.simglfo['seqs'][utils.get_region(gene)]:
                    rstr = utils.color(
                        'blue', (' %' + ws + 'd') %
                        self.simcounts[utils.get_region(gene)][gene])
                else:
                    rstr = utils.color('red', (' %' + ws + 's') % 'x')
                return rstr

            def sim_gene_count_str(kgene):
                # figure out simulation genes and counts for the uids assigned to <kgene>
                if annotations is None or self.reco_info is None:
                    return ''
                uids_this_gene = [
                    uid for uid, line in annotations.items()
                    if line[region + '_gene'] == kgene
                ]
                sim_genes = {}  # simulation genes for the uids that we assigned to <kgene> (note that self.simcounts doesn't have this per-uid information)
                for uid in uids_this_gene:
                    sgene = self.reco_info[uid][region + '_gene']
                    if sgene not in sim_genes:
                        sim_genes[sgene] = 0
                    sim_genes[sgene] += 1
                sorted_sim_gene_counts = sorted(sim_genes.items(),
                                                key=operator.itemgetter(1),
                                                reverse=True)
                count_str = ' '.join([
                    utils.color('blue' if sg == kgene else 'red', str(c))
                    for sg, c in sorted_sim_gene_counts
                ])
                sgene_str = ' '.join(
                    [utils.color_gene(sg) for sg, _ in sorted_sim_gene_counts])
                return '%s   %s' % (count_str, sgene_str)

        for iclass in range(len(class_counts)):
            gclass = class_counts[iclass]
            kept_this_class = []
            for ig in range(len(gclass)):
                gfo = gclass[ig]

                if float(gfo['counts']) / total_counts < self.args.min_allele_prevalence_fraction:  # always skip everybody that's super uncommon
                    pass  # don't keep it
                elif ig == 0:  # keep the first one from this class
                    genes_to_keep.add(gfo['gene'])
                    kept_this_class.append(gfo['gene'])
                elif utils.hamming_distance(gclass[0]['seq'], gclass[ig]['seq']) == 0:  # don't keep it if it's indistinguishable from the most common one (the matches are probably mostly really the best one)
                    pass  # don't keep it
                elif len(kept_this_class) < self.args.n_alleles_per_gene:  # always keep the most common <self.args.n_alleles_per_gene> in each class [note: defaults to 1 if looking for new alleles, otherwise 2]
                    genes_to_keep.add(gfo['gene'])
                    kept_this_class.append(gfo['gene'])
                else:
                    pass  # don't keep it

                if debug and gfo['gene'] in genes_to_keep:
                    snpstr = ' ' if ig == 0 else '(%d)' % utils.hamming_distance(gclass[0]['seq'], gfo['seq'])  # only happens if we keep more than one from this class
                    print '\n      %s%-s  %7s%s  %-3s' % (
                        '- ' if ig > 0 else '  ',
                        utils.color_gene(gfo['gene'], width=20),
                        count_str(gfo['counts']), simcountstr(
                            gfo['gene'], '4'), snpstr),
            if debug:
                if len(kept_this_class) == 0:
                    print '\n      %s%-s  %7s%4s  %-3s' % (
                        '  ',
                        utils.color('blue', 'none', width=20,
                                    padside='right'), '-', '', ''),
                removedfo = [
                    gfo for gfo in gclass if gfo['gene'] not in genes_to_keep
                ]
                removed_str = ''
                if len(removedfo) > 0:
                    number_strs = [
                        '(%d %3s%s)' % (gfo['hdist'], count_str(
                            gfo['counts']), simcountstr(gfo['gene'], '1'))
                        for gfo in removedfo
                    ]
                    name_strs = [
                        '%s' % (utils.color_gene(gfo['gene']))
                        for gfo in removedfo
                    ]
                    removed_str = '%s  %s' % (' '.join(number_strs),
                                              ' '.join(name_strs))
                annotation_str = ''
                if (annotations is not None and self.reco_info
                        is not None) and len(kept_this_class) > 0:
                    annotation_str = '%s %s %s' % (utils.color(
                        'blue', '['), sim_gene_count_str(
                            kept_this_class[-1]), utils.color('blue', ']'))
                print '     %s  %s  %s' % (
                    removed_str,
                    (70 - utils.len_excluding_colors(removed_str)) * ' ',
                    annotation_str),
        if debug:
            print ''

        genes_to_remove = set(self.glfo['seqs'][region]) - genes_to_keep

        print '    keeping %d / %d %s gene%s' % (
            len(genes_to_keep), len(self.glfo['seqs'][region]), region,
            utils.plural(len(genes_to_keep)))
        if len(genes_to_keep) == 0:
            print '   would\'ve kept zero genes, instead keeping all of them'
            genes_to_keep = copy.deepcopy(genes_to_remove)
            genes_to_remove.clear()

        if self.simglfo is not None:
            missing_genes = set(self.simglfo['seqs'][region]) - genes_to_keep
            if len(missing_genes) > 0:
                print '    %s %d simulation genes (counts): %s' % (utils.color(
                    'red', 'missing'), len(missing_genes), '  '.join(
                        [('%s %d' %
                          (utils.color_gene(g), self.simcounts[region][g]))
                         for g in sorted(missing_genes)]))
            completely_absent_genes = missing_genes - genes_to_remove
            if len(completely_absent_genes) > 0:
                print '%s %d simulation genes completely absent: %s' % (
                    utils.color('red', 'warning'),
                    len(completely_absent_genes), '  '.join(
                        [('%s %d' %
                          (utils.color_gene(g), self.simcounts[region][g]))
                         for g in sorted(completely_absent_genes)]))

        self.genes_to_keep |= genes_to_keep  # add the ones from _this_ region (rhs) to the ones from all regions (lhs)
        self.genes_to_remove |= genes_to_remove

        self.finalized = True
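
Stripped of the debug printing, the keep/discard loop above applies three rules per SNP class: drop genes below a prevalence cutoff, always keep the most common gene, drop anything identical to it, and otherwise keep up to a fixed number of alleles. A standalone sketch of just that decision (not partis's API; the thresholds are passed in rather than read from self.args):

def choose_genes(class_counts, total_counts, min_prevalence, n_per_class, hamming=None):
    # <class_counts>: list of SNP classes, each a list of {'gene', 'counts', 'seq'} dicts
    # sorted by decreasing counts; returns the set of genes to keep
    if hamming is None:
        hamming = lambda a, b: sum(x != y for x, y in zip(a, b))
    keep = set()
    for gclass in class_counts:
        kept_this_class = []
        for ig, gfo in enumerate(gclass):
            if float(gfo['counts']) / total_counts < min_prevalence:
                continue  # too rare to trust
            if ig == 0 or (hamming(gclass[0]['seq'], gfo['seq']) > 0
                           and len(kept_this_class) < n_per_class):
                keep.add(gfo['gene'])
                kept_this_class.append(gfo['gene'])
    return keep
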
Example No. 36
0
 def print_cluster(self, iclust, clusterfo, sorted_glcounts, new_seq, true_sorted_glcounts, mean_cluster_mfreqs, has_indels):
     if iclust > 0:
         print ''
     print '    %-3d  %4d   %6.3f' % (iclust, len(clusterfo['seqfos']), mean_cluster_mfreqs['v'] / mean_cluster_mfreqs['j']),
     for igene in range(len(sorted_glcounts)):
         if igene > 0:
             print '%22s' % '',
         gene, counts = sorted_glcounts[igene]
         print '   %-s %4d      %2d%s' % (utils.color_gene(gene, width=20), counts, utils.hamming_distance(new_seq, self.glfo['seqs'][self.region][gene], align=True), ' (%s)' % utils.color('blue', 'x') if has_indels else '   '),
         if igene < len(sorted_glcounts) - 1 or self.reco_info is not None:
             print ''
     if self.reco_info is not None:
         for igene in range(len(true_sorted_glcounts)):
             gene, counts = true_sorted_glcounts[igene]
             print '%17s       %s %-s %4d %s    %2d   ' % ('', utils.color('green', '['), utils.color_gene(gene[:23], width=20), counts, utils.color('green', ']'), utils.hamming_distance(new_seq, self.simglfo['seqs'][self.region][gene], align=True)),
             if igene < len(true_sorted_glcounts) - 1:
                 print ''
Example No. 37
0
def main():
    from utils import base64_to_binary, hamming_distance, single_char_key_search
    import numpy as np
    from encryption import repeating_key_xor_base64
    from conversion import base64_to_text
    from utils import get_english_score
    import codecs
    from data_utils import generate_binary_to_hex_mapping

    with open('data/problem6.txt') as f:
        content = f.read()
    content = content.replace('\n', '')

    binary_data = base64_to_binary(content)

    min_info = []
    for keysize in range(2, 41):
        num_blocks = 4
        distance = 0
        for i in range(num_blocks):
            distance += hamming_distance(
                binary_data[keysize * i * 8:keysize * (i + 1) * 8],
                binary_data[keysize * (i + 1) * 8:keysize * (i + 2) * 8])
        for i in range(num_blocks):
            distance += hamming_distance(
                binary_data[keysize * i * 8:keysize * (i + 1) * 8],
                binary_data[keysize * (i + 2) * 8:keysize * (i + 3) * 8])
        distance = distance / keysize
        min_info.append((keysize, distance))

    best_key_sizes = sorted(min_info, key=lambda x: x[1])[:3]
    potential_keys = []

    for keysize_data in best_key_sizes:
        potential_key = ''
        key_size = keysize_data[0]
        keysize_num_bits = key_size * 8

        if len(binary_data) % (keysize_num_bits) != 0:
            binary_data_divisible = binary_data[:-(len(binary_data) %
                                                   (keysize_num_bits))]
        else:
            binary_data_divisible = binary_data

        splatted_binary_data = np.asarray([*binary_data_divisible])
        numpy_binary_data = np.reshape(splatted_binary_data, (int(
            len(splatted_binary_data) / keysize_num_bits), keysize_num_bits))

        for block_start in range(0, keysize_num_bits - 7, 8):
            binary_string_builder = ''
            block = numpy_binary_data[:, block_start:block_start + 8]

            splatted_bin = list(block)

            for list_of_bytes in splatted_bin:
                binary_string_builder += ''.join(list_of_bytes)

            bin_to_hex = generate_binary_to_hex_mapping()

            hex_string = ''
            for i in range(0, len(binary_string_builder), 4):
                hex_string += bin_to_hex[binary_string_builder[i:i + 4]]

            potential_key += single_char_key_search(hex_string)[2]
        potential_keys.append(potential_key)

    potential_answers = []
    for key in potential_keys:
        decrypted_output = base64_to_text(
            repeating_key_xor_base64(key, content))
        english_score = get_english_score(bytes(decrypted_output, 'utf-8'))
        potential_answers.append((english_score, decrypted_output))

    final_answer = sorted(potential_answers, key=lambda x: x[0],
                          reverse=True)[0]
    print(final_answer[1])
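
Once a key size is chosen, this example (and No. 39 below) transposes the ciphertext into one column per key byte and solves each column as single-byte XOR. A compact Python 3 sketch of that step; the English scoring here is deliberately naive, since single_char_key_search and get_english_score aren't shown:

def naive_english_score(plain_bytes):
    # crude plaintext score: how many characters are spaces or common letters
    return sum(b in b' etaoinshrdluETAOINSHRDLU' for b in plain_bytes)

def solve_repeating_xor(ciphertext, keysize):
    # transpose the ciphertext into <keysize> columns, then pick the single-byte
    # key that makes each column look most like English
    key = bytearray()
    for i in range(keysize):
        column = ciphertext[i::keysize]
        best = max(range(256), key=lambda k: naive_english_score(bytes(b ^ k for b in column)))
        key.append(best)
    return bytes(key)

# usage sketch: key = solve_repeating_xor(raw_ciphertext_bytes, best_keysize)
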
Example No. 38
0
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    seqfos = utils.read_fastx('%s/%s.fasta' % (outdir, args.extrastr))  # output mutated sequences from bcr-phylo

    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]  # it's really important to set both <seqs> and <input_seqs> (since they're both already in there from the naive line)
        mline['input_seqs'] = [sfo['seq']]
        reco_info[sfo['name']] = mline
        utils.add_implicit_info(glfo, mline)
    final_line = utils.synthesize_multi_seq_line_from_reco_info(
        [sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = 'export PATH=%s:$PATH && xvfb-run -a python ./bin/view-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (
            ete_path, outdir, args.extrastr, outdir, outdir)
        utils.simplerun(cmd, shell=True)
        kdvals = {}
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                kdvals[line['uid']] = float(line['kd'])
        if len(set(kdvals) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (
                set(kdvals) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(kdvals)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(
                set(final_line['unique_ids']) - set(kdvals))
        final_line['affinities'] = [
            1. / kdvals[u] for u in final_line['unique_ids']
        ]
        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree),
                                  padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' %
                                     (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]
    from Bio.Seq import Seq
    final_line['nearest_target_indices'] = []
    aa_targets = [Seq(seq).translate() for seq in final_line['target_seqs']]
    for mseq in final_line['input_seqs']:
        aa_mseq = Seq(mseq).translate()
        aa_hdists = [
            utils.hamming_distance(aa_t, aa_mseq, amino_acid=True)
            for aa_t in aa_targets
        ]
        imin = aa_hdists.index(min(aa_hdists))  # NOTE doesn't do anything differently if there's more than one min
        final_line['nearest_target_indices'].append(imin)

    return final_line
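
The nearest-target loop at the end translates each sequence to amino acids and keeps the index of the closest target by Hamming distance. A self-contained version of just that piece (assumes Biopython is available; a plain per-position comparison stands in for utils.hamming_distance's amino-acid mode):

from Bio.Seq import Seq

def nearest_target_index(nuc_seq, target_nuc_seqs):
    # translate to amino acids and return the index of the closest target
    # (first one wins on ties, matching the .index(min(...)) behaviour above;
    # zip() silently truncates if the translated lengths differ)
    aa_seq = str(Seq(nuc_seq).translate())
    aa_targets = [str(Seq(t).translate()) for t in target_nuc_seqs]
    aa_hdists = [sum(a != b for a, b in zip(aa_seq, aa_t)) for aa_t in aa_targets]
    return aa_hdists.index(min(aa_hdists))
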
Example No. 39
0
import utils

with open("6.txt") as f:
    contents = utils.ByteArray.fromBase64(f.read())
    # print contents.asBase64()

    normalized_edit_dists = {}
    min_edit_dist = 40 * 8
    for keysize in range(2, 41):
        edit_dist = utils.hamming_distance(
            contents[:keysize],
            contents[2 * keysize:3 * keysize]) / float(keysize)
        edit_dist2 = utils.hamming_distance(
            contents[keysize:2 * keysize],
            contents[3 * keysize:4 * keysize]) / float(keysize)
        normalized_edit_dists[keysize] = (edit_dist + edit_dist2) / 2

    keysizes = sorted(normalized_edit_dists.iteritems(), key=lambda v: v[1])
    possible_keysizes = keysizes[:3]
    print "key sizes", possible_keysizes

    for possible_keysize, _ in possible_keysizes:
        transposed_blocks = [
            utils.ByteArray() for _ in range(possible_keysize)
        ]

        for i, e in enumerate(contents):
            transposed_blocks[i % possible_keysize].append(e)

        possible_key = [
            utils.freq_analysis(block) for block in transposed_blocks
        ]  # (the rest of this snippet is cut off in the source)
Example No. 40
0
    def make_single_tree(self, partitions, annotations, uid_set, get_fasttrees=False, n_max_cons_seqs=10, debug=False):
        # NOTE don't call this externally -- if you want a single tree, call make_trees() with <i_only_cluster> set
        def getline(uidstr, uid_set=None):
            if uidstr in annotations:  # if we have this exact annotation
                return annotations[uidstr]
            else:
                if uid_set is None:
                    uid_set = set(uidstr.split(':'))  # should only get called if it's a singleton
                # note that for internal nodes in a fasttree-derived subtree, the uids will be out of order compared to the annotation keys
                for line in annotations.values():  # we may actually have the annotation for every subcluster (e.g. if --calculate-alternative-annotations was set), but in case we don't, this is fine
                    if len(uid_set & set(line['unique_ids'])) > 0:  # just take the first one with any overlap. Yeah, it's not necessarily the best, but its naive sequence probably isn't that different, and for just getting the fasttree it reeeeeeaaaallly doesn't matter
                        return line
            raise Exception('couldn\'t find uid %s in annotations' % uidstr)
        def getseq(uid):
            line = getline(uid)
            return line['seqs'][line['unique_ids'].index(uid)]
        def lget(uid_list):
            return ':'.join(uid_list)

        # check for repeated uids (was only from seed uid, which shouldn't happen any more, but the code below throws an infinite loop if we do, so may as well be careful)
        for partition in partitions:
            if sum(len(c) for c in partition) > len(set(u for c in partition for u in c)):
                repeated_uids = [u for u, count in collections.Counter([u for c in partition for u in c]).items() if count > 1]
                raise Exception('found %d uid%s in more than one cluster (%s)' % (len(repeated_uids), utils.plural(len(repeated_uids)), ', '.join(repeated_uids)))

        default_edge_length = 999999  # it's nice to have the edges all set to something that's numeric (so the trees print), but also obvious wrong, if we forget to set somebody
        assert len(partitions[-1]) == 1
        root_label = lget(partitions[-1][0])  # we want the order of the uids in the label to correspond to the order in self.partitions
        tns = dendropy.TaxonNamespace([root_label])
        root_node = dendropy.Node(taxon=tns.get_taxon(root_label))
        root_node.uids = uid_set  # each node keeps track of the uids of its children
        dtree = dendropy.Tree(taxon_namespace=tns, seed_node=root_node)
        if debug:
            print '    starting tree with %d leaves' % len(uid_set)
        for ipart in reversed(range(len(partitions) - 1)):  # dendropy seems to only have fcns to build a tree from the root downward, so we loop starting with the last partition (- 1 is because the last partition is guaranteed to be just one cluster)
            for lnode in dtree.leaf_node_iter():  # look for leaf nodes that contain uids from two clusters in this partition, and add those as children
                tclusts = [c for c in partitions[ipart] if len(set(c) & lnode.uids) > 0]
                if len(tclusts) < 2:
                    continue
                for tclust in tclusts:
                    ttaxon = dendropy.Taxon(lget(tclust))
                    tns.add_taxon(ttaxon)
                    child = lnode.new_child(taxon=ttaxon, edge_length=default_edge_length)
                    child.uids = set(tclust)
                if debug:
                    print '      ipart %d' % ipart
                    print '        split node: %d --> %s      %s --> %s' % (len(lnode.uids), ' '.join([str(len(tc)) for tc in tclusts]), lnode.taxon.label, ' '.join([c.taxon.label for c in lnode.child_node_iter()]))

        # split existing leaves, which are probably not singletons (they're probably from the initial naive sequence collapse step) into subtrees such that each leaf is a singleton
        for lnode in dtree.leaf_node_iter():
            if len(lnode.uids) == 1:
                continue
            if get_fasttrees and len(lnode.uids) > 2:
                seqfos = [{'name' : uid, 'seq' : getseq(uid)} for uid in lnode.taxon.label.split(':')]  # may as well add them in the right order, although I don't think it matters
                subtree = treeutils.get_fasttree_tree(seqfos, getline(lnode.taxon.label, uid_set=lnode.uids)['naive_seq'], suppress_internal_node_taxa=True)  # note that the fasttree distances get ignored below (no idea if they'd be better than what we set down there, but they probably wouldn't be consistent, so I'd rather ignore them)
                for tmpnode in subtree.postorder_node_iter():
                    if tmpnode.is_leaf():
                        tmpnode.uids = set([tmpnode.taxon.label])
                    else:
                        tmpnode.uids = set([uid for c in tmpnode.child_node_iter() for uid in c.uids])
                        ttaxon = dendropy.Taxon(lget(tmpnode.uids))
                        subtree.taxon_namespace.add_taxon(ttaxon)
                        tmpnode.taxon = ttaxon  # ...and use the string of leaf nodes, even though they'll be in the wrong order (I think these get ignored when I call label_nodes() below, but it's still tidier to have them right in the meantime, and anyway since I'm suppressing internal taxa I think I need to set them to something)

                if debug:
                    print '   adding subtree with %d leaves from fasttree at leaf node %s' % (len(seqfos), lnode.taxon.label)
                    print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=subtree))
                dtree.taxon_namespace.add_taxa(subtree.taxon_namespace)
                lnode.add_child(subtree.seed_node)
                assert len(lnode.child_edges()) == 1  # we're iterating over leaves, so this should always be true
                lnode.child_edges()[0].collapse()
            else:  # just add a star subtree
                for uid in lnode.taxon.label.split(':'):  # may as well add them in the right order, although I don't think it matters
                    ttaxon = dendropy.Taxon(uid)
                    tns.add_taxon(ttaxon)
                    child = lnode.new_child(taxon=ttaxon, edge_length=default_edge_length)
                    child.uids = set([uid])
                if debug:
                    print '      added %d singleton children for %s' % (len(lnode.uids), lnode.taxon.label)

        # in order to set edge lengths, we need node sequences, so first set leaf node seqs
        for lnode in dtree.leaf_node_iter():
            assert len(lnode.uids) == 1
            lnode.seq = getseq(lnode.taxon.label)
            lnode.n_descendent_leaves = 1  # keep track of how many leaf nodes contributed to each node's consensus sequence (these are leaves, so it's trivally 1). This is less accurate than keeping track of all the sequences, but also faster

        # then set internal node seqs as the consensus of their children, and set the distance as hamming distance to child seqs
        if debug:
            print '    adding edge lengths either from fasttree %s or cons seq %s' % (utils.color('blue', 'x'), utils.color('red', 'x'))
        min_edge_length = None  # setting this is nice for better debug viewing
        for node in dtree.postorder_internal_node_iter():  # includes root node
            child_cons_seq_counts = [c.n_descendent_leaves for c in node.child_node_iter()]
            total_descendent_leaves = sum(child_cons_seq_counts)
            if total_descendent_leaves > n_max_cons_seqs:  # if there's tons of descendent leaves, we don't want to pass them all to the consensus fcn since it's slow, so we choose them in proportion to their actual proportions, but scaled down to <n_max_cons_seqs>
                child_cons_seq_counts = [int(n_max_cons_seqs * csc / float(total_descendent_leaves)) for csc in child_cons_seq_counts]
                child_cons_seq_counts = [max(1, csc) for csc in child_cons_seq_counts]  # don't eliminate any sequences entirely (this makes the proportions less accurate (in some cases), but is the easy way to handle the case where there's a ton of singleton children)
            if debug:
                print '  %s' % utils.color('green', node.taxon.label)
                csc_str = '  (reduced: %s)' % ' '.join([str(csc) for csc in child_cons_seq_counts]) if total_descendent_leaves > n_max_cons_seqs else ''
                print '      desc leaves per child: %s%s' % (' '.join(str(c.n_descendent_leaves) for c in node.child_node_iter()), csc_str)
            child_seqfos = [{'name' : cn.taxon.label + '-leaf-' + str(il), 'seq' : cn.seq} for cn, count in zip(node.child_node_iter(), child_cons_seq_counts) for il in range(count)]
            node.seq = utils.cons_seq(0.01, aligned_seqfos=child_seqfos, tie_resolver_seq=getline(root_label)['naive_seq'])  #, debug=debug)  # the consensus has an N at every position where the constituent sequences gave a tie. But Ns screw up the distances (especially because once we *get* an N, we can't get rid of it and it's propagated all the way up the tree), and in almost all cases the correct choice should be the naive base, so we use that
            node.n_descendent_leaves = total_descendent_leaves
            for edge in node.child_edge_iter():
                from_fasttree = False
                if edge.length == default_edge_length:  # otherwise it was set by fasttree, and it's probably better than what we'd get from this (it'd be nice to skip the cons seq stuff for the whole fasttree subtree, but then we don't have the cons seqs we need for later)
                    edge.length = utils.hamming_distance(edge.head_node.seq, node.seq) / float(len(node.seq))
                else:
                    from_fasttree = True
                if min_edge_length is not None:
                    edge.length = max(min_edge_length, edge.length)
                if debug:
                    print '       %6.3f   %s  %s' % (edge.length, utils.color('blue' if from_fasttree else 'red', 'x'), edge.head_node.taxon.label)

        if debug:
            print '        naive seq %s' % getline(root_label)['naive_seq'] # NOTE might be worthwhile to add an edge connecting seed node and the actual naive sequence (i.e. for cases where our approximate naive is off)
            print '    root cons seq %s' % utils.color_mutants(getline(root_label)['naive_seq'], dtree.seed_node.seq)

        for node in dtree.preorder_node_iter():
            del node.uids
            del node.seq
            del node.n_descendent_leaves

        treeutils.label_nodes(dtree, ignore_existing_internal_node_labels=True, ignore_existing_internal_taxon_labels=True, debug=debug)
        dtree.update_bipartitions()  # probably don't really need this
        if debug:
            print treeutils.utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=dtree, width=250))

        return dtree
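
The internal-node bookkeeping above reduces to two calculations: a column-majority consensus with ties broken toward the naive sequence, and a branch length equal to the per-site mismatch fraction between parent and child. A standalone sketch of both (utils.cons_seq and the dendropy plumbing are omitted; it assumes aligned, equal-length sequences, as with the aligned_seqfos above):

from collections import Counter

def consensus_with_tiebreak(seqs, tiebreak_seq):
    # column-wise majority consensus; ties go to the corresponding base in <tiebreak_seq>
    cons = []
    for ipos, column in enumerate(zip(*seqs)):
        counts = Counter(column).most_common()
        tied = [base for base, n in counts if n == counts[0][1]]
        cons.append(tiebreak_seq[ipos] if len(tied) > 1 else tied[0])
    return ''.join(cons)

def edge_length(parent_seq, child_seq):
    # normalized Hamming distance, as used for the branch lengths above
    return sum(a != b for a, b in zip(parent_seq, child_seq)) / float(len(parent_seq))

# e.g. consensus_with_tiebreak(['ACG', 'ATG'], tiebreak_seq='AAG') gives 'AAG'
# (position 1 is a tie, so the naive base wins), and edge_length('AAG', 'ACG') is 1/3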