def rearrange():
    """
    Rearrange the TABLE hash_dup_temp into a more formal format and store it
    in TABLE hash_dup_songs.

    Every (original, duplicate) pair is read from hash_dup_temp, the pairs are
    rewritten so that chained duplicates are rooted at a single track, repeated
    pairs are dropped, and the remaining pairs are inserted into
    hash_dup_songs.
    """
    store = get_rivendell_store()
    pairs = np.array(store.execute("SELECT * FROM hash_dup_temp"))

    # Arrange the duplicated songs with the same root.  Rows are numpy views,
    # so the rewrites happen in place and are visible to later iterations;
    # `former`/`latter` are scalar copies and stay fixed for one outer pass.
    for pair in pairs:
        former = pair[0]
        latter = pair[1]
        for other in pairs:
            if other[0] == latter:
                other[0] = former
            if other[1] == former:
                other[1] = latter

    # Delete the duplicated entries
    unique_pairs = set((row[0], row[1]) for row in pairs)

    # Input the result into the database ("INSERT IGNORE": the original
    # statement misspelled the keyword, which is a SQL syntax error)
    for original, duplicate in unique_pairs:
        store.execute("INSERT IGNORE INTO hash_dup_songs (original, duplicate) VALUES (%d, %d)" % (original, duplicate))
    # Persist the inserts, as saveData and saveDuplicated do after writing
    store.commit()
def saveData(filename, landmarks):
    """
    Save the generated fingerprint of a specific audio file into the database.

    filename:  the name of the song in INTEGER format; filename % 10 selects
               the parallel table hash_track_<n> that receives the rows
    landmarks: the 2D array with one fingerprint per row, formatted as
               (time started, frequency started, freq-delta, time-delta)

    Raises ValueError when a landmark does not fit the hash layout used below
    (frequency < 256, freq-delta < 64, time-delta < 256).
    """
    store = get_rivendell_store()

    # Select proper # parallel table
    mod = 10
    table = "hash_track_%d" % (filename % mod)

    # Number of hashed fields per landmark (everything after the start time).
    # The 8/6/8-bit split of the hash id is, per the original note, the hash
    # table definition taken from retrieval.py.
    length = 3
    for item in landmarks:
        checkItem(item[1:], length)
        if item[1] < 256 and item[2] < 64 and item[3] < 256:
            # Pack the three fields into one integer: 8 bits | 6 bits | 8 bits
            hid = int((item[1] << 14) + (item[2] << 8) + item[3])
        else:
            raise ValueError("The scope of find_landmarks and hashing length are not corresponding.")
        # Save one landmark of a track into the database; int() turns a
        # numpy.int64 start time into a plain int for the %d placeholder
        store.execute("INSERT IGNORE INTO %s (song_id, time, hid) VALUES (%d, %d, %d)" % (table, filename, int(item[0]), hid))
    store.commit()
def saveDuplicated(filename, hits):
    """
    Record potential duplicates of a song in TABLE hash_dup_temp.

    One row is inserted per entry of hits, with `filename` in the column
    `original` and the entry in the column `duplicate`; the inserts are
    committed once at the end.
    """
    insert_sql = "INSERT IGNORE INTO hash_dup_temp (original, duplicate) VALUES (%d, %d)"
    store = get_rivendell_store()
    for duplicate_id in hits:
        statement = insert_sql % (filename, duplicate_id)
        store.execute(statement)
    store.commit()
def fetchTrack(filename):
    """
    Return the (time, hid) rows stored for one track as a numpy array.

    Note that filename should be the unique int id of that track; the rows
    are read from the parallel table hash_track_<filename % 10>.
    """
    store = get_rivendell_store()

    # Pick the parallel table that holds this track
    table_count = 10
    table_name = "hash_track_%d" % (filename % table_count)

    # Fetch the entire song
    query = "SELECT time, hid FROM %s WHERE song_id=%s" % (table_name, filename)
    rows = store.execute(query)
    return np.array(rows)
def iter_database(self, filename, landmarks):
    """
    Check one track against the hash_track_<n> tables and record duplicates.

    filename:  the integer id of the track being checked
    landmarks: the landmarks of that track; only the part selected by
               self._findLargestCoverage is validated against the tables

    For every table, the hits returned by self.validatelm (other than the
    track itself) are stored through saveDuplicated.  In mode 2 only the
    first table is checked.  In mode 0, a track that produced any hit has its
    own rows deleted from its hash_track table.
    """
    dup_flag = False
    partial_lm, start, end = self._findLargestCoverage(landmarks)
    for i in range(self.numoftable):
        table = "hash_track_%d" % i
        hits = self.validatelm(table, filename, start, end, partial_lm)
        # The track itself may be among its own hits; drop every such entry
        # (not only the first) so it is never stored as its own duplicate.
        hits = [hit for hit in hits if hit != filename]
        if hits:
            print("Hitting duplicated songs for  %s" % (filename,))
            saveDuplicated(filename, hits)
            dup_flag = True
        if self.mode == 2:
            # For mode 2, the database is preloaded into database @ superbatch
            break
    if dup_flag and self.mode == 0:
        store = get_rivendell_store()
        table = "hash_track_%d" % (filename % self.numoftable)
        store.execute("delete from %s where song_id=%d" % (table, filename))
        # Persist the delete, as saveData and saveDuplicated do after writing
        store.commit()
    print("Complete file  %s" % (filename,))
from DoubanAlg import get_rivendell_store
import numpy as np

# Dump every hid of TABLE hash_track_temp to hid_in_hash_track_temp.npy and
# print `step` evenly spaced values of the sorted hids ("breaking points").
store = get_rivendell_store()
data = store.execute("select hid from hash_track_temp")
data = np.array(data).flatten()
np.save("hid_in_hash_track_temp", data)

# ndarray.sort() sorts in place and returns None, so list(data.sort()) raised
# a TypeError; np.sort returns the sorted copy that the loop below needs.
data = np.sort(data)
length = len(data)
step = 10
if length > 0:
    for i in range(step):
        # Floor division keeps the index an integer; the clamp keeps the last
        # breaking point inside the array when length is a multiple of step.
        index = min(length // step * (i + 1), length - 1)
        print("The breaking point is  %s" % (data[index],))
print("Hoorry, done:)")