def process_filelist_test(filelist=None,
                          model=None,
                          tmpfilename=None,
                          npicks=None,
                          winsize=None,
                          finaldim=None,
                          K=1,
                          typecompress='picks'):
    """
    Main test function: predict a year for every usable song in the list
    and save prediction and ground truth side by side in a temp file.

    No artist filtering happens here; every file in filelist is treated
    as a test song. A song is skipped when its year is missing (<= 0),
    when no features could be computed for it, or when do_prediction
    returns None.

    INPUT
       filelist     - a list of song files
       model        - h5 file containing feats and year for all train songs
       tmpfilename  - where to save our results (must not exist yet)
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values do we keep, must match the model
       K            - param of KNN (default 1)
       typecompress - feature type, 'picks', 'corrcoeff', 'cov' or
                      'avgcov', must be the same as in training
    RETURN
       nothing; results go to tmpfilename, under /data, in the arrays
       'year_real' and 'year_pred'. If tmpfilename already exists or
       model is missing, an error is printed and nothing is done.
    RAISES
       AssertionError if an argument is None (npicks and winsize are
       required even though only 'picks' uses them), if typecompress is
       unknown, or if the model features are not finaldim wide
    """
    # sanity check
    for arg in locals().values():
        assert arg is not None, 'process_filelist_test, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print('ERROR: file %s already exists.' % (tmpfilename,))
        return
    if not os.path.isfile(model):
        print('ERROR: model %s does not exist.' % (model,))
        return
    # reject an unknown feature type before any file is opened or created,
    # otherwise a stale output file would block the next run
    assert typecompress in ('picks', 'corrcoeff', 'cov', 'avgcov'), \
        'Unknown type of compression: ' + str(typecompress)
    h5model = tables.openFile(model, mode='r')
    kd = None
    output = None
    try:
        # create kdtree
        assert h5model.root.data.feats.shape[1] == finaldim, 'inconsistency in final dim'
        kd = ANN.kdtree(h5model.root.data.feats)
        # create outputfile
        output = tables.openFile(tmpfilename, mode='a')
        group = output.createGroup("/", 'data', 'TMP FILE FOR YEAR RECOGNITION')
        output.createEArray(group,
                            'year_real',
                            tables.IntAtom(shape=()), (0, ),
                            '',
                            expectedrows=len(filelist))
        output.createEArray(group,
                            'year_pred',
                            tables.Float64Atom(shape=()), (0, ),
                            '',
                            expectedrows=len(filelist))
        # random projection, plus the compressor used on the raw timbre
        # matrix (None for 'picks', which works from get_bttimbre instead)
        ndim = 12  # fixed in this dataset
        if typecompress == 'picks':
            randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
            compress = None
        elif typecompress == 'corrcoeff':
            randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
            compress = CBTF.corr_and_compress
        elif typecompress == 'cov':
            randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
            compress = CBTF.cov_and_compress
        else:  # 'avgcov'
            # NOTE(review): 90 is presumably the length of the avgcov
            # feature vector; confirm against CBTF.avgcov_and_compress
            randproj = RANDPROJ.proj_point5(90, finaldim)
            compress = CBTF.avgcov_and_compress
        # go through files
        cnt_f = 0
        for f in filelist:
            cnt_f += 1
            if cnt_f % 5000 == 0:
                print('TESTING FILE #' + str(cnt_f))
            # read what we need from the song file in one pass
            timbres = None
            h5 = GETTERS.open_h5_file_read(f)
            try:
                year = GETTERS.get_year(h5)
                skip = year <= 0  # probably useless but...
                if not skip and compress is not None:
                    timbres = GETTERS.get_segments_timbre(h5).T
            finally:
                h5.close()
            if skip:
                continue
            if compress is None:
                bttimbre = get_bttimbre(f)
                if bttimbre is None:
                    continue
                # we even have normal features, awesome!
                processed_feats = CBTF.extract_and_compress(bttimbre,
                                                            npicks,
                                                            winsize,
                                                            finaldim,
                                                            randproj=randproj)
            else:
                processed_feats = compress(timbres,
                                           finaldim,
                                           randproj=randproj)
            if processed_feats is None or processed_feats.shape[0] == 0:
                continue
            # do prediction
            year_pred = do_prediction(processed_feats, kd, h5model, K)
            # add pred and ground truth to output
            if year_pred is not None:
                output.root.data.year_real.append([year])
                output.root.data.year_pred.append([year_pred])
    finally:
        # close output and model, even if something failed on the way;
        # the kd-tree is dropped first since it was built from the model
        del kd
        h5model.close()
        if output is not None:
            output.close()
def process_filelist_train(filelist=None,
                           testartists=None,
                           tmpfilename=None,
                           npicks=None,
                           winsize=None,
                           finaldim=None,
                           typecompress='picks'):
    """
    Main training function: compute compressed features for every usable
    song in the list and save them, with year and track ID, in a temp file.

    A song is skipped when its year is missing (<= 0), when its artist
    is in testartists, or when no features could be computed for it.

    INPUT
       filelist     - a list of song files
       testartists  - set of artist ID that we should not use
       tmpfilename  - where to save our processed features (must not
                      exist yet)
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values do we keep
       typecompress - feature type, one of 'picks', 'corrcoeff', 'cov'
                      or 'avgcov'
    RETURN
       nothing; results go to tmpfilename, under /data, in the arrays
       'feats', 'year' and 'track_id' (one row per feature vector).
       If tmpfilename already exists, an error is printed and nothing
       is done.
    RAISES
       AssertionError if an argument is None (npicks and winsize are
       required even though only 'picks' uses them) or if typecompress
       is unknown
    """
    # sanity check
    for arg in locals().values():
        assert arg is not None, 'process_filelist_train, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print('ERROR: file %s already exists.' % (tmpfilename,))
        return
    # reject an unknown feature type before the output file is created,
    # otherwise a stale file would block the next run
    assert typecompress in ('picks', 'corrcoeff', 'cov', 'avgcov'), \
        'Unknown type of compression: ' + str(typecompress)
    # create outputfile
    output = tables.openFile(tmpfilename, mode='a')
    try:
        group = output.createGroup("/", 'data', 'TMP FILE FOR YEAR RECOGNITION')
        output.createEArray(group,
                            'feats',
                            tables.Float64Atom(shape=()), (0, finaldim),
                            '',
                            expectedrows=len(filelist))
        output.createEArray(group,
                            'year',
                            tables.IntAtom(shape=()), (0, ),
                            '',
                            expectedrows=len(filelist))
        output.createEArray(group,
                            'track_id',
                            tables.StringAtom(18, shape=()), (0, ),
                            '',
                            expectedrows=len(filelist))
        # random projection, plus the compressor used on the raw timbre
        # matrix (None for 'picks', which works from get_bttimbre instead)
        ndim = 12  # fixed in this dataset
        if typecompress == 'picks':
            randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
            compress = None
        elif typecompress == 'corrcoeff':
            randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
            compress = CBTF.corr_and_compress
        elif typecompress == 'cov':
            randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
            compress = CBTF.cov_and_compress
        else:  # 'avgcov'
            # NOTE(review): 90 is presumably the length of the avgcov
            # feature vector; confirm against CBTF.avgcov_and_compress
            randproj = RANDPROJ.proj_point5(90, finaldim)
            compress = CBTF.avgcov_and_compress
        # iterate over files
        cnt_f = 0
        for f in filelist:
            cnt_f += 1
            # verbose
            if cnt_f % 50000 == 0:
                print('training... checking file # %s' % (cnt_f,))
            # read what we need from the song file in one pass
            timbres = None
            h5 = GETTERS.open_h5_file_read(f)
            try:
                artist_id = GETTERS.get_artist_id(h5)
                year = GETTERS.get_year(h5)
                track_id = GETTERS.get_track_id(h5)
                skip = year <= 0 or artist_id in testartists
                if not skip and compress is not None:
                    timbres = GETTERS.get_segments_timbre(h5).T
            finally:
                h5.close()
            if skip:
                continue
            # we have a train artist with a song year, we're good
            if compress is None:
                # beat-aligned features are only needed for 'picks'
                bttimbre = get_bttimbre(f)
                if bttimbre is None:
                    continue
                # we even have normal features, awesome!
                processed_feats = CBTF.extract_and_compress(bttimbre,
                                                            npicks,
                                                            winsize,
                                                            finaldim,
                                                            randproj=randproj)
            else:
                processed_feats = compress(timbres,
                                           finaldim,
                                           randproj=randproj)
            # same guard as on the test side: nothing to store for this song
            if processed_feats is None or processed_feats.shape[0] == 0:
                continue
            # save them to tmp file
            n_p_feats = processed_feats.shape[0]
            output.root.data.year.append(np.array([year] * n_p_feats))
            output.root.data.track_id.append(np.array([track_id] * n_p_feats))
            output.root.data.feats.append(processed_feats)
    finally:
        # we're done (or something failed), close output either way
        output.close()
def process_filelist_train(filelist=None,testartists=None,tmpfilename=None,
                           npicks=None,winsize=None,finaldim=None,typecompress='picks'):
    """
    Main training function: compute compressed features for every usable
    song in the list and save them, with year and track ID, in a temp file.

    A song is skipped when its year is missing (<= 0), when its artist
    is in testartists, or when no features could be computed for it.

    INPUT
       filelist     - a list of song files
       testartists  - set of artist ID that we should not use
       tmpfilename  - where to save our processed features (must not
                      exist yet)
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values do we keep
       typecompress - feature type, one of 'picks', 'corrcoeff', 'cov'
                      or 'avgcov'
    RETURN
       nothing; results go to tmpfilename, under /data, in the arrays
       'feats', 'year' and 'track_id' (one row per feature vector).
       If tmpfilename already exists, an error is printed and nothing
       is done.
    RAISES
       AssertionError if an argument is None (npicks and winsize are
       required even though only 'picks' uses them) or if typecompress
       is unknown
    """
    # sanity check
    for arg in locals().values():
        assert arg is not None, 'process_filelist_train, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print('ERROR: file %s already exists.' % (tmpfilename,))
        return
    # reject an unknown feature type before the output file is created,
    # otherwise a stale file would block the next run
    assert typecompress in ('picks', 'corrcoeff', 'cov', 'avgcov'), \
        'Unknown type of compression: ' + str(typecompress)
    # create outputfile
    output = tables.openFile(tmpfilename, mode='a')
    try:
        group = output.createGroup("/", 'data', 'TMP FILE FOR YEAR RECOGNITION')
        output.createEArray(group, 'feats', tables.Float64Atom(shape=()),
                            (0, finaldim), '', expectedrows=len(filelist))
        output.createEArray(group, 'year', tables.IntAtom(shape=()),
                            (0, ), '', expectedrows=len(filelist))
        output.createEArray(group, 'track_id', tables.StringAtom(18, shape=()),
                            (0, ), '', expectedrows=len(filelist))
        # random projection, plus the compressor used on the raw timbre
        # matrix (None for 'picks', which works from get_bttimbre instead)
        ndim = 12  # fixed in this dataset
        if typecompress == 'picks':
            randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
            compress = None
        elif typecompress == 'corrcoeff':
            randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
            compress = CBTF.corr_and_compress
        elif typecompress == 'cov':
            randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
            compress = CBTF.cov_and_compress
        else:  # 'avgcov'
            # NOTE(review): 90 is presumably the length of the avgcov
            # feature vector; confirm against CBTF.avgcov_and_compress
            randproj = RANDPROJ.proj_point5(90, finaldim)
            compress = CBTF.avgcov_and_compress
        # iterate over files
        cnt_f = 0
        for f in filelist:
            cnt_f += 1
            # verbose
            if cnt_f % 50000 == 0:
                print('training... checking file # %s' % (cnt_f,))
            # read what we need from the song file in one pass
            timbres = None
            h5 = GETTERS.open_h5_file_read(f)
            try:
                artist_id = GETTERS.get_artist_id(h5)
                year = GETTERS.get_year(h5)
                track_id = GETTERS.get_track_id(h5)
                skip = year <= 0 or artist_id in testartists
                if not skip and compress is not None:
                    timbres = GETTERS.get_segments_timbre(h5).T
            finally:
                h5.close()
            if skip:
                continue
            # we have a train artist with a song year, we're good
            if compress is None:
                # beat-aligned features are only needed for 'picks'
                bttimbre = get_bttimbre(f)
                if bttimbre is None:
                    continue
                # we even have normal features, awesome!
                processed_feats = CBTF.extract_and_compress(bttimbre, npicks, winsize, finaldim,
                                                            randproj=randproj)
            else:
                processed_feats = compress(timbres, finaldim, randproj=randproj)
            # same guard as on the test side: nothing to store for this song
            if processed_feats is None or processed_feats.shape[0] == 0:
                continue
            # save them to tmp file
            n_p_feats = processed_feats.shape[0]
            output.root.data.year.append(np.array([year] * n_p_feats))
            output.root.data.track_id.append(np.array([track_id] * n_p_feats))
            output.root.data.feats.append(processed_feats)
    finally:
        # we're done (or something failed), close output either way
        output.close()
def process_filelist_test(filelist=None,model=None,tmpfilename=None,
                           npicks=None,winsize=None,finaldim=None,K=1,
                          typecompress='picks'):
    """
    Main test function: predict a year for every usable song in the list
    and save prediction and ground truth side by side in a temp file.

    No artist filtering happens here; every file in filelist is treated
    as a test song. A song is skipped when its year is missing (<= 0),
    when no features could be computed for it, or when do_prediction
    returns None.

    INPUT
       filelist     - a list of song files
       model        - h5 file containing feats and year for all train songs
       tmpfilename  - where to save our results (must not exist yet)
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values do we keep, must match the model
       K            - param of KNN (default 1)
       typecompress - feature type, 'picks', 'corrcoeff', 'cov' or
                      'avgcov', must be the same as in training
    RETURN
       nothing; results go to tmpfilename, under /data, in the arrays
       'year_real' and 'year_pred'. If tmpfilename already exists or
       model is missing, an error is printed and nothing is done.
    RAISES
       AssertionError if an argument is None (npicks and winsize are
       required even though only 'picks' uses them), if typecompress is
       unknown, or if the model features are not finaldim wide
    """
    # sanity check
    for arg in locals().values():
        assert arg is not None, 'process_filelist_test, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print('ERROR: file %s already exists.' % (tmpfilename,))
        return
    if not os.path.isfile(model):
        print('ERROR: model %s does not exist.' % (model,))
        return
    # reject an unknown feature type before any file is opened or created,
    # otherwise a stale output file would block the next run
    assert typecompress in ('picks', 'corrcoeff', 'cov', 'avgcov'), \
        'Unknown type of compression: ' + str(typecompress)
    h5model = tables.openFile(model, mode='r')
    kd = None
    output = None
    try:
        # create kdtree
        assert h5model.root.data.feats.shape[1] == finaldim, 'inconsistency in final dim'
        kd = ANN.kdtree(h5model.root.data.feats)
        # create outputfile
        output = tables.openFile(tmpfilename, mode='a')
        group = output.createGroup("/", 'data', 'TMP FILE FOR YEAR RECOGNITION')
        output.createEArray(group, 'year_real', tables.IntAtom(shape=()),
                            (0, ), '', expectedrows=len(filelist))
        output.createEArray(group, 'year_pred', tables.Float64Atom(shape=()),
                            (0, ), '', expectedrows=len(filelist))
        # random projection, plus the compressor used on the raw timbre
        # matrix (None for 'picks', which works from get_bttimbre instead)
        ndim = 12  # fixed in this dataset
        if typecompress == 'picks':
            randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
            compress = None
        elif typecompress == 'corrcoeff':
            randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
            compress = CBTF.corr_and_compress
        elif typecompress == 'cov':
            randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
            compress = CBTF.cov_and_compress
        else:  # 'avgcov'
            # NOTE(review): 90 is presumably the length of the avgcov
            # feature vector; confirm against CBTF.avgcov_and_compress
            randproj = RANDPROJ.proj_point5(90, finaldim)
            compress = CBTF.avgcov_and_compress
        # go through files
        cnt_f = 0
        for f in filelist:
            cnt_f += 1
            if cnt_f % 5000 == 0:
                print('TESTING FILE #' + str(cnt_f))
            # read what we need from the song file in one pass
            timbres = None
            h5 = GETTERS.open_h5_file_read(f)
            try:
                year = GETTERS.get_year(h5)
                skip = year <= 0  # probably useless but...
                if not skip and compress is not None:
                    timbres = GETTERS.get_segments_timbre(h5).T
            finally:
                h5.close()
            if skip:
                continue
            if compress is None:
                bttimbre = get_bttimbre(f)
                if bttimbre is None:
                    continue
                # we even have normal features, awesome!
                processed_feats = CBTF.extract_and_compress(bttimbre, npicks, winsize, finaldim,
                                                            randproj=randproj)
            else:
                processed_feats = compress(timbres, finaldim, randproj=randproj)
            if processed_feats is None or processed_feats.shape[0] == 0:
                continue
            # do prediction
            year_pred = do_prediction(processed_feats, kd, h5model, K)
            # add pred and ground truth to output
            if year_pred is not None:
                output.root.data.year_real.append([year])
                output.root.data.year_pred.append([year_pred])
    finally:
        # close output and model, even if something failed on the way;
        # the kd-tree is dropped first since it was built from the model
        del kd
        h5model.close()
        if output is not None:
            output.close()