Exemple #1
0
def rearrange():
    """
    Rearrange the TABLE hash_dup_temp into a more formal format and store
    the result in the TABLE hash_dup_songs.

    Each row read from hash_dup_temp is treated as an (original, duplicate)
    pair. The pairs are re-rooted so that a chain such as A->B, B->C becomes
    A->B, A->C, repeated pairs are dropped, and every remaining pair is
    inserted into hash_dup_songs.

    Takes no arguments and returns None. All effects go through the store
    returned by get_rivendell_store(); the inserts are committed at the end,
    as saveData() and saveDuplicated() do.
    """
    store = get_rivendell_store()
    # NOTE(review): assumes the first two columns of every row are the
    # integer ids (original, duplicate) -- confirm against the table schema.
    data = np.array(store.execute("SELECT * FROM hash_dup_temp"))

    # Arrange the duplicated songs with the same root
    for item in data:
        # Read both ids up front: the inner loop edits rows of `data` in place
        former = item[0]
        latter = item[1]
        for itr in data:
            if itr[0] == latter:
                # This row hangs off `latter`; hang it off `former` instead
                itr[0] = former
                if itr[1] == former:
                    # A reversed pair (latter, former) would now be the
                    # self-pair (former, former); store it as (former, latter)
                    itr[1] = latter

    # delete the duplicated entries
    output = {(row[0], row[1]) for row in data}

    # input the result into database
    for original, duplicate in output:
        store.execute(
            "INSERT IGNORE INTO hash_dup_songs (original, duplicate) VALUES (%d, %d)"
            % (original, duplicate)
        )
    store.commit()
Exemple #2
0
def saveData(filename, landmarks):
    """
    Save the generated fingerprint of a specific audio file into the database.

    filename: the name of the song in INTEGER format. It selects the
        parallel table hash_track_<filename % 10> and is stored as song_id.
    landmarks: the 2D array with one fingerprint per row, formatted as
        (time started, frequency started, freq-delta, time-delta).

    Returns None. Raises ValueError when a landmark does not fit the hash
    layout below; store.commit() is not reached in that case.
    """
    store = get_rivendell_store()

    # Select proper # parallel table
    mod = 10
    table = "hash_track_%d" % (filename % mod)

    # The hash packs the three non-time fields into 8 + 6 + 8 bits; the
    # original note says this layout comes from the hash table definition in
    # retrieval.py.
    length = 3

    for item in landmarks:
        # checkItem is defined elsewhere -- presumably it validates the
        # `length` hash components; verify against its definition.
        checkItem(item[1:], length)
        if not (item[1] < 256 and item[2] < 64 and item[3] < 256):
            raise ValueError("The scope of find_landmarks and hashing length are not corresponding.")
        # frequency in bits 14 and up, freq-delta in bits 8-13, time-delta in bits 0-7
        hid = int((item[1] << 14) + (item[2] << 8) + item[3])

        # save one landmark of a track into database
        start_time = int(item[0])      # change numpy.int64 to int
        store.execute(
            "INSERT IGNORE INTO %s (song_id, time, hid) VALUES (%d, %d, %d)"
            % (table, filename, start_time, hid)
        )
    store.commit()
Exemple #3
0
def saveDuplicated(filename, hits):
    """
    Record the potential duplicates of one song in TABLE hash_dup_temp.

    The table has two columns: original and duplicate. One row is inserted
    per entry of hits, with filename as the original, and the whole batch is
    committed once at the end.
    """
    store = get_rivendell_store()
    insert_sql = "INSERT IGNORE INTO hash_dup_temp (original, duplicate) VALUES (%d, %d)"
    for duplicate in hits:
        statement = insert_sql % (filename, duplicate)
        store.execute(statement)
    store.commit()
Exemple #4
0
def fetchTrack(filename):
    """
    Return the stored (time, hid) rows of a specific track as a numpy array.
    Note that filename should be the unique int id of that track.
    """
    store = get_rivendell_store()

    # Tracks are spread over parallel tables keyed by id modulo 10
    table = "hash_track_%d" % (filename % 10)

    # Fetch the entire song
    query = "SELECT time, hid FROM %s WHERE song_id=%s" % (table, filename)
    rows = store.execute(query)
    return np.array(rows)
Exemple #5
0
    def iter_database(self, filename, landmarks):
        """
        Look for duplicates of one track across the hash_track_<i> tables.

        filename: the integer id of the track being checked.
        landmarks: the landmarks of that track; only the part selected by
            self._findLargestCoverage() is matched.

        Every table hash_track_0 .. hash_track_<numoftable - 1> is queried
        through self.validatelm(). Hits other than the track itself are
        stored with saveDuplicated(). In mode 2 only the first table is
        queried. In mode 0, when at least one duplicate was found, the rows
        of this track are deleted from its own table and the deletion is
        committed. Returns None.
        """
        dup_flag = False
        partial_lm, start, end = self._findLargestCoverage(landmarks)

        for i in range(self.numoftable):
            table = "hash_track_%d" % i
            hits = self.validatelm(table, filename, start, end, partial_lm)

            # The track itself may be among the hits; drop every occurrence
            # so it is never recorded as its own duplicate.
            hits = [hit for hit in hits if hit != filename]
            if hits:
                # Single-argument print keeps the output identical on
                # Python 2 and also parses on Python 3.
                print("Hitting duplicated songs for  %s" % (filename,))
                saveDuplicated(filename, hits)
                dup_flag = True
            if self.mode == 2:
                # For mode 2, the database is preloaded into database @ superbatch
                break

        if dup_flag and self.mode == 0:
            store = get_rivendell_store()
            table = "hash_track_%d" % (filename % self.numoftable)
            store.execute("delete from %s where song_id=%d" % (table, filename))
            # Commit so the deletion persists, as the other writers do
            store.commit()
        print("Complete file  %s" % (filename,))
Exemple #6
0
from DoubanAlg import get_rivendell_store
import numpy as np

# Dump every hid stored in hash_track_temp to disk, then print the values
# found at each tenth of the sorted list (the "breaking points").
store = get_rivendell_store()
data = store.execute("select hid from hash_track_temp")
data = np.array(data).flatten()

# The array is saved in the order it was fetched, before any sorting
np.save("hid_in_hash_track_temp", data)
# ndarray.sort() sorts in place and returns None, so build the sorted list
# explicitly instead of wrapping its return value.
data = sorted(data.tolist())

length = len(data)
step = 10

# Nothing to report for an empty table
if length:
    for i in range(step):
        # Floor division keeps the index an int; the last step is clamped to
        # the final element because length // step * step can equal length.
        index = min(length // step * (i + 1), length - 1)
        print("The breaking point is  %s" % (data[index],))


print("Hoorry, done:)")