def predict(self, input_F):
    """Classify input_F on valence and arousal by nearest prototype (Hamming distance)."""
    distance_v_plus = utils.hamming(input_F, self.prototype_v_plus)
    distance_v_min = utils.hamming(input_F, self.prototype_v_min)
    distance_a_high = utils.hamming(input_F, self.prototype_a_high)
    distance_a_low = utils.hamming(input_F, self.prototype_a_low)
    self.distance_v_history.append([distance_v_plus, distance_v_min])
    self.distance_a_history.append([distance_a_high, distance_a_low])
    # 0 = closer to the "plus"/"high" prototype, 1 = closer to the "min"/"low" one
    return (0 if distance_v_plus < distance_v_min else 1,
            0 if distance_a_high < distance_a_low else 1)
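# Every snippet in this section calls a `hamming` helper that is defined
# elsewhere. A minimal sketch of the element-wise variant the classifiers here
# appear to assume (mismatch count over equal-length sequences); the name is
# hypothetical, not the actual utils API:
def hamming_sketch(a, b):
    """Number of positions at which equal-length sequences a and b differ."""
    assert len(a) == len(b)
    return sum(x != y for x, y in zip(a, b))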
def rate_matrix(q, koffs, verbose=False):
    """Generate the stochastic rate matrix for the given system."""
    # Chromosome states can be represented by binary numerals; order the
    # states this way.
    G = len(koffs)
    states = enumerate_states(G, q)
    num_states = len(states)
    assert len(states) == sum(choose(G, i) for i in range(q + 1))
    R = np.zeros((num_states, num_states))
    for i, state_i in enumerate(states):
        for j, state_j in enumerate(states):
            if verbose:
                print "considering:", i, state_i, "->", j, state_j
            dist = hamming(state_i, state_j)
            if dist != 1:  # deal with diagonal elements later...
                if verbose:
                    print "distance is:", dist, "continuing..."
                continue
            if sum(state_j) == sum(state_i) + 1:
                R[i][j] = q - sum(state_i)
                if verbose:
                    print i, state_i, "->", j, state_j, "is an on-reaction, rate:", R[i][j]
            elif sum(state_j) == sum(state_i) - 1:
                # locate the single site at which the two states differ
                diff_idx, diff_site = find(lambda (idx, (si, sj)): si != sj,
                                           enumerate(zip(state_i, state_j)))
                R[i][j] = koffs[diff_idx]
                if verbose:
                    print i, state_i, "->", j, state_j, "is an off-reaction (at site", diff_idx, ") rate:", R[i][j]
    # deal with diagonal elements: each row of a rate matrix must sum to zero
    for i in range(num_states):
        R[i][i] = -sum(R[i])
    print "finished rate matrix"
    return R
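# rate_matrix assumes enumerate_states and choose helpers that are not shown.
# A plausible sketch consistent with the assertion above (states are all
# binary G-tuples with at most q ones); names are hypothetical:
from itertools import combinations
from math import factorial

def choose_sketch(n, k):
    """Binomial coefficient n choose k."""
    return factorial(n) // (factorial(k) * factorial(n - k))

def enumerate_states_sketch(G, q):
    """All binary G-tuples with at most q ones, grouped by occupancy."""
    states = []
    for n_on in range(q + 1):
        for on_sites in combinations(range(G), n_on):
            states.append(tuple(1 if g in on_sites else 0 for g in range(G)))
    return states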
def ddpl_dHdt_singular(ps, l):
    K = len(ps)
    L = num_cols_from_vector(ps)
    kmers = list(make_kmers(L))
    s = kmers[l]
    term1 = -one_point_avg_inner(ps, l) / (ps[l])
    term2 = -sum(log(ps[k]) for k in range(K) if hamming(kmers[k], s) == 1) / (3 * L)
    # NOTE: the original read log(ps[k]) here, but k is out of scope once the
    # generator expression above finishes; ps[l] appears to be what was intended.
    term3 = log(ps[l]) + 1
    return term1 + term2 + term3
def compute_hamming(self, sample, train):
    """
    Compute the Hamming distance from one sample to every example in the train set.
    :param sample: sample from the test set
    :param train: the train set
    :return: list of distances between every example in the train set and the sample
    """
    distance_sample = []
    for t in train:
        distance_sample.append((hamming(t[0], sample), t[1], t[2]))
    return distance_sample
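# The (distance, label, id) tuples above feed naturally into a k-NN vote.
# A minimal sketch, assuming the tuple layout inferred from compute_hamming
# (names hypothetical):
from collections import Counter

def knn_predict_sketch(distances, k):
    """Majority vote over the k nearest (distance, label, id) tuples."""
    k_nearest = sorted(distances)[:k]  # smallest Hamming distances first
    return Counter(label for _, label, _ in k_nearest).most_common(1)[0][0]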
def hamming_distance_to_true_naive(self, true_line, line, query_name, restrict_to_region='', normalize=False, debug=False):
    """
    Hamming distance between the inferred naive sequence and the true naive sequence.
    <restrict_to_region> if set, restrict the comparison to the section of the *true*
        sequence assigned to the given region. NOTE this will not in general correspond
        to the similarly-assigned region in the inferred naive sequence.
    <normalize> if set, divide by sequence length.
    """
    true_naive_seq = utils.get_full_naive_seq(self.germlines, true_line)
    inferred_naive_seq = utils.get_full_naive_seq(self.germlines, line)
    left_hack_add_on = ''
    right_hack_add_on = ''
    if len(true_line['seq']) > len(line['seq']):  # ihhhmmm doesn't report the bits of the sequence it erodes off the ends, so we have to add them back on
    # if len(true_naive_seq) > len(inferred_naive_seq):  # hm, now why did I use line['seq'] stuff before?
        start = true_line['seq'].find(line['seq'])
        assert start >= 0
        end = len(line['seq']) + start
        left_hack_add_on = true_line['seq'][: start]
        right_hack_add_on = true_line['seq'][end :]
        # extra_penalty = len(left_hack_add_on) + len(right_hack_add_on)
        inferred_naive_seq = 'x' * len(left_hack_add_on) + inferred_naive_seq + 'x' * len(right_hack_add_on)
        if debug:
            print '  adding to inferred naive seq'
    bounds = None
    if restrict_to_region != '':
        bounds = utils.get_regional_naive_seq_bounds(restrict_to_region, self.germlines, true_line)  # get the bounds of this *true* region
        true_naive_seq = true_naive_seq[bounds[0] : bounds[1]]
        inferred_naive_seq = inferred_naive_seq[bounds[0] : bounds[1]]
    # if len(true_naive_seq) > len(inferred_naive_seq):
    if debug:
        print restrict_to_region, 'region, bounds', bounds
        print '  true ', true_naive_seq
        print '  infer', inferred_naive_seq
    if len(true_naive_seq) != len(inferred_naive_seq):
        print 'ERROR still not the same lengths for %s' % query_name
        print '  true ', true_naive_seq
        print '  infer', inferred_naive_seq
        sys.exit()
    total_distance = utils.hamming(true_naive_seq, inferred_naive_seq)
    if len(true_naive_seq) == 0:
        print 'WARNING zero length sequence in hamming_distance_to_true_naive'
        return 0
    if normalize:
        return int(100 * (float(total_distance) / len(true_naive_seq)))
    else:
        return total_distance
def one_point_avg_inner(ps, k):
    """Average probability over the 3L one-point mutants of the kth kmer."""
    L = int(log(len(ps), 4))
    kmers = list(make_kmers(L))
    s = kmers[k]
    acc = 0
    hits = 0
    for i, kmer in enumerate(kmers):
        if hamming(kmer, s) == 1:
            acc += ps[i]
            hits += 1
    assert hits == 3 * L  # each of the L positions has 3 alternative letters
    return acc / (3 * L)
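# make_kmers is assumed but not shown; a plausible sketch over the DNA
# alphabet (hypothetical -- any fixed 4-letter alphabet in a fixed order
# satisfies the len(ps) == 4**L indexing used above):
from itertools import product

def make_kmers_sketch(L):
    """Yield all length-L strings over ACGT in lexicographic order."""
    for letters in product("ACGT", repeat=L):
        yield "".join(letters)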
def get_hamming_distances(self, pairs):  #, return_info):  # NOTE duplicates a function in utils
    return_info = []
    for query_a, query_b in pairs:
        seq_a = self.input_info[query_a]['seq']
        seq_b = self.input_info[query_b]['seq']
        if self.args.truncate_pairs:  # chop off the left side of the longer one if they're not the same length
            min_length = min(len(seq_a), len(seq_b))
            seq_a = seq_a[-min_length :]
            seq_b = seq_b[-min_length :]
            chopped_off_left_sides = True
        mutation_frac = utils.hamming(seq_a, seq_b) / float(len(seq_a))
        return_info.append({'id_a': query_a, 'id_b': query_b, 'score': mutation_frac})
    return return_info
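# The truncate-then-normalize logic above, distilled into a standalone sketch
# (hypothetical helper, not part of the class): right-align both sequences to
# the shorter length, then return the fraction of mismatching positions.
def mutation_frac_sketch(seq_a, seq_b):
    min_length = min(len(seq_a), len(seq_b))
    seq_a, seq_b = seq_a[-min_length:], seq_b[-min_length:]
    return sum(a != b for a, b in zip(seq_a, seq_b)) / float(min_length)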
def calcsimilarity(known, table, id1, id2, comparison):
    tokens = re.split("[/,;]", comparison[1])
    ret = False
    for j in xrange(0, len(known)):
        # NOTE: the original compared an undefined `i` against j here; skipping
        # the row being compared against itself (same id) appears to be the intent.
        if known[j][0] == comparison[0]:
            continue
        if comparison[2] == known[j][2]:
            similarity = 1.0
        else:
            compared_genre = re.split("[/,;]", known[j][1])
            distance = {}
            sametags = 0
            for a in tokens:
                if not a:
                    continue
                for b in compared_genre:
                    if not b or b in distance:
                        continue
                    if len(a) == len(b):
                        h = hamming(a, b) / float(len(a))
                        if h:
                            distance[b] = h
                        else:
                            sametags = sametags + 1
                    else:
                        distance[b] = levenshtein(a, b) / \
                            float(max(len(a), len(b)))
            if distance:
                # geometric mean + weighted equal tags
                # (float() added: the original integer division made the
                # equal-tags weight vanish under Python 2)
                similarity = 1.0 - (
                    reduce(lambda x, y: x * y, distance.values())) ** \
                    (1.0 / len(distance)) + \
                    (float(sametags) / (sametags + len(distance)))
            else:
                similarity = 0.0
        if similarity > 0.33:
            if not db.execute(
                    "select * from %s where %s = ? and %s = ?" %
                    (table, id1, id2),
                    (comparison[0], known[j][0])).fetchall():
                db.execute(
                    "insert or ignore into %s "
                    "(%s, %s, similarity) values (?, ?, ?)" %
                    (table, id1, id2),
                    (comparison[0], known[j][0], similarity))
                ret = True
    return ret
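# The similarity formula above, isolated as a sketch (hypothetical helper):
# one minus the geometric mean of the per-tag distances, plus the weight of
# exactly-matching tags.
def tag_similarity_sketch(distance, sametags):
    geo_mean = reduce(lambda x, y: x * y, distance.values()) ** (1.0 / len(distance))
    return 1.0 - geo_mean + float(sametags) / (sametags + len(distance))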
def checkGolomg(seq):
    """
    Check Golomb's three randomness postulates:
    1. 0s and 1s in the sequence are as near as possible to n/2.
    2. The number of runs of a given length should halve when the length is
       increased by one (as long as possible), and where possible equally many
       runs of a given length should consist of 0s as of 1s.
    3. The out-of-phase autocorrelation should be constant (independent of the shift).
    """
    postulates = [True, True, True]
    ## Check postulate 1
    zeros = seq.count("0")
    ones = seq.count("1")
    if abs(zeros - ones) > 1:
        postulates[0] = False
    ## Postulate 2
    r = Runs(seq)
    keys = sorted(r.keys())  # the pairwise check below assumes increasing run lengths
    if r:
        for i in xrange(len(keys) - 1):
            if keys[i] - keys[i + 1] != -1:
                postulates[1] = False
                break
            if r[keys[i]] != 2 * r[keys[i + 1]]:
                if r[keys[i]] != 1 and r[keys[i + 1]] != 1:
                    postulates[1] = False
                    break
    else:
        postulates[1] = False
    ## Postulate 3: hamming() here tests the constant out-of-phase autocorrelation
    postulates[2] = hamming(seq)
    return all(postulates)
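# Runs is assumed but not shown; a plausible sketch (hypothetical) mapping
# each run length to the number of runs of that length in the sequence:
from itertools import groupby

def runs_sketch(seq):
    counts = {}
    for _, group in groupby(seq):
        length = len(list(group))
        counts[length] = counts.get(length, 0) + 1
    return counts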
def run(self):
    if not self.fpcalc:
        return
    logging.debug("fpcalc: %s" % self.fpcalc)
    self.db = dbapi.connect(self.dbpath)
    # lastrelease = ""
    lastdata = []
    lastquery = ""
    laststatus = 0
    starttime = time()
    stoptime = starttime + 1
    requests = 0
    while self.running:
        try:
            path, title, artist, album = self.queue.get()
        except Empty as e:
            logging.warning(e)
            continue
        except Exception as e:
            logging.error(e)
            continue
        if not path or not album:
            logging.warning("No path/album name provided")
            continue
        if requests / (stoptime - starttime) > 3:  # throttle to ~3 requests/s
            sleep(1)
            starttime = stoptime
        logging.info("Getting infos for %s %s" % (artist, album))
        fingerprint = ''
        duration = 0
        try:
            logging.info("Analyzing %s file" % path)
            if self.fpcalc:
                logging.debug("fingerprint for %s" % path)
                fpcalc_process = subprocess.Popen(
                    ["/usr/bin/fpcalc", path],
                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                fpcalc_output = fpcalc_process.communicate()[0].split('\n')
                duration = fpcalc_output[1][9:]       # strip the "DURATION=" prefix
                fingerprint = fpcalc_output[2][12:]   # strip the "FINGERPRINT=" prefix
        except Exception as e:
            logging.error(e)
        if fingerprint:
            query = u"/v2/lookup?" \
                "client=8XaBELgH" \
                "&meta=recording+releasegroups" \
                "+tracks+puids+usermeta+compress" \
                "&duration=%s&format=json&fingerprint=%s" % \
                (duration, fingerprint)
            if query == lastquery and laststatus == 200:
                logging.info("Same request already occurred - skipping")
                continue  # NOTE: the original logged "skipping" but fell through; skipping appears intended
            try:
                conn = HTTPConnection("api.acoustid.org", 80)
                conn.request("GET", query)
                response = conn.getresponse()
            except:
                continue
            puid = ""
            mb_title = ""
            mb_artists = ""
            if response.status != 200:
                continue
            try:
                lastquery = query
                laststatus = 200
                results = json.loads(response.read())
                lastdata = results["results"][0]
                logging.debug(lastdata)
                release = "releasegroups" in lastdata \
                    and len(lastdata) and lastdata["releasegroups"][0]
                recording = "recordings" in lastdata \
                    and len(lastdata) and lastdata["recordings"][0]
                score = "score" in lastdata and lastdata['score']
                logging.debug(release)
                logging.debug(recording)
                if len(lastdata):
                    logging.debug("%s results found" % len(lastdata))
                puid = 'puids' in lastdata and lastdata["puids"][0]
                mbid = release and release['id'] \
                    or recording \
                    and recording[0]['releasegroups'][0]['id']
                mb_title = release and release["title"] \
                    or recording \
                    and recording[0]['title']
                mb_artists = " ".join([
                    i['name'] for i in
                    (release and release["artists"]
                     or recording and recording[0]['artists'])])
                logging.debug("Response status: %d %s" %
                              (response.status, response.read()))
            except Exception as e:
                logging.error(e)  # NOTE: the original `continue`d before logging, leaving this line dead
                continue
            stoptime = time()
            requests = (requests + 1) % 3
            if score < 0.7:
                continue
            # Hamming distance is only defined for equal-length strings;
            # fall back to Levenshtein otherwise.
            if len(title) == len(mb_title):
                title_distance = hamming(title, mb_title) / float(len(title))
            else:
                title_distance = levenshtein(title, mb_title) / \
                    float(max(len(title), len(mb_title)))
            if len(artist) == len(mb_artists):
                author_distance = hamming(artist, mb_artists) / float(len(artist))
            else:
                author_distance = levenshtein(artist, mb_artists) / \
                    float(max(len(artist), len(mb_artists)))
            # if title_distance > 0.33 and author_distance > 0.5:
            logging.debug("distances: %s %s %s" %
                          (score, title_distance, author_distance))
            #     continue
            logging.debug("puid: %s, mbid %s" % (puid, mbid))
            with self.condition:
                try:
                    song_id, album_id = self.db.execute(
                        "select id, album_id from song "
                        "where path = ?;", (path,)).fetchone()
                    self.db.execute(
                        "update song set puid = ?, mbid = ? "
                        "where id = ?", (puid, mbid, song_id))
                    if title_distance > 0:
                        self.db.execute(
                            "update song set title = ? "
                            "where id = ?", (mb_title, song_id))
                    self.db.commit()
                except Exception as e:
                    logging.error(e)
    self.db.close()
final_states = []
final_unclamped_states = []
desired_states = []
desired_unclamped_states = []
diffs = []
unclamped_diffs = []
discrepancies = []
for i in trange(trials):
    rstate = random_state(self.V)
    init_state = clamp(rstate, init_obs)
    final_state = self.sample_from_clamped_equilibrium(init_state, treatment)
    final_unclamped_state = self.sample_from_equilibrium(init_state)
    final_states.append(final_state)
    final_unclamped_states.append(final_unclamped_state)
    desired_state = clamp(final_state, final_obs)
    desired_states.append(desired_state)
    desired_unclamped_state = clamp(final_unclamped_state, final_obs)
    desired_unclamped_states.append(desired_unclamped_state)
    discrepancy = hamming(final_state, desired_state)
    unclamped_discrepancy = hamming(final_unclamped_state, desired_state)
    diff = final_state - desired_state
    unclamped_diff = final_unclamped_state - desired_unclamped_state
    diffs.append(diff)
    unclamped_diffs.append(unclamped_diff)
    discrepancies.append(discrepancy)
print "distinct final states:", len(set(map(tuple, final_states)))
print "distinct desired states:", len(set(map(tuple, desired_states)))
print "distinct final unclamped states:", len(set(map(tuple, final_unclamped_states)))
print "distinct desired unclamped states:", len(set(map(tuple, desired_unclamped_states)))
print "distinct diffs:", len(set(map(tuple, diffs)))
print "distinct unclamped diffs:", len(set(map(tuple, unclamped_diffs)))
print [i for i, d in enumerate(diffs[0]) if d != 0]
return discrepancies
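# clamp is assumed but not shown; a plausible sketch (hypothetical): overwrite
# the observed coordinates of a state vector with their observed values, where
# obs maps variable index -> value. The diff arithmetic above suggests the
# states are numpy arrays.
import numpy as np

def clamp_sketch(state, obs):
    clamped = np.array(state, copy=True)
    for idx, value in obs.items():
        clamped[idx] = value
    return clamped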
# block. Put them together and you have the key.
#
import sys
sys.path.insert(1, "../common")  # Want to locate modules in our 'common' directory

import string
import binascii
import utils
import ltrfreq

s1 = "this is a test"
s2 = "wokka wokka!!!"
hamster = utils.hamming(s1, s2)
print hamster
assert hamster == 37

# That's all for now - below here doesn't work
decoded = []
bytes1 = []
bytes2 = []
minkeylength = 2
maxkeylength = 40
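# Note that utils.hamming here must count differing *bits*, not characters:
# "this is a test" vs "wokka wokka!!!" is 37 at the bit level. A minimal
# sketch of that variant (hypothetical; not the actual utils module):
def bitwise_hamming_sketch(s1, s2):
    """Number of differing bits between two equal-length byte strings."""
    assert len(s1) == len(s2)
    return sum(bin(ord(a) ^ ord(b)).count("1") for a, b in zip(s1, s2))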
def test_hamming():
    m = 10
    assert np.allclose(hamming(m), np.hamming(m))
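# Unlike the distance functions above, the hamming under test here is the
# Hamming *window*: np.hamming(M) returns w(n) = 0.54 - 0.46*cos(2*pi*n/(M-1))
# for n = 0..M-1. A sketch of an equivalent implementation:
import numpy as np

def hamming_window_sketch(M):
    n = np.arange(M)
    return 0.54 - 0.46 * np.cos(2 * np.pi * n / (M - 1))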