Example no. 1
def extract_features(in_dir, out_dir, path, verbose=False):
    in_dir = utils.abs_path_dir(in_dir)
    path = utils.abs_path_dir(path)
    cur_dir = os.getcwd()
    os.chdir(in_dir)
    # The jar must run from the folder containing the .wav files,
    # so copy it there temporarily
    script = "harmony-analyser-script-jar-with-dependencies.jar"
    src = path + script
    dst = in_dir + script
    shutil.copy(src, dst)
    options = [
        "nnls-chroma:nnls-chroma", "nnls-chroma:chordino-tones",
        "nnls-chroma:chordino-labels", "qm-vamp-plugins:qm-keydetector",
        "chord_analyser:tps_distance"
        # "chord_analyser:chord_complexity_distance",
        # "chroma_analyser:complexity_difference",
        # "chord_analyser:average_chord_complexity_distance"
    ]
    for opt in options:
        cmd = "java -jar " + script + " -a " + opt + " -s .wav -t 0.07"
        utils.run_cmd(cmd, verbose)
    os.remove(dst)
    cp_cmd = "cp *.txt " + out_dir
    utils.run_cmd(cp_cmd)
    # utils.run_cmd("rm *.txt")
    os.chdir(cur_dir)
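Each loop iteration above shells out to the harmony-analyser jar once per Vamp plugin option. A minimal standalone sketch of a single invocation with subprocess (it assumes the jar has been copied into the current working directory, as the function does):

import subprocess

# Hypothetical standalone equivalent of one loop iteration above
subprocess.run(
    ["java", "-jar", "harmony-analyser-script-jar-with-dependencies.jar",
     "-a", "nnls-chroma:nnls-chroma", "-s", ".wav", "-t", "0.07"],
    check=True)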
def test_models_parallel(models_dir, out_dir, test_dir=None, test_file=None):
    """Description of test_models_parallel

    17h16m12s DecisionTree done in 16135373ms
    17h25m08s GradientBoosting done in 16671109ms
    18h59m05s RandomForest done in 22307811ms
    18h59m07s AdaBoost done in 22310633ms
    19h18m12s ExtraTrees done in 23455779ms

    """
    models_dir = utils.abs_path_dir(models_dir) + "/"
    models = os.listdir(models_dir)
    utils.create_dir(out_dir)

    if test_dir is not None:
        test_dir = utils.abs_path_dir(test_dir) + "/"
        test_files = os.listdir(test_dir)
        test_file = None
    elif test_file is not None:
        test_files = None
    else:
        utils.print_warning(
            "Error in args for test_models_parallel(): "
            "either test_dir or test_file must be provided")
        return

    partial_test_model = partial(test_model,
                                 models_dir=models_dir,
                                 test_dir=test_dir,
                                 out_dir=out_dir,
                                 test_files=test_files,
                                 test_file=test_file)
    pool = multiprocessing.Pool(len(models))
    pool.map(partial_test_model, models)  # one worker per model
    pool.close()  # no more tasks will be submitted
    pool.join()  # wait for all workers to finish before going on
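The parallelization idiom above (functools.partial freezes every keyword argument, then Pool.map varies only the model name) can be exercised in isolation; a minimal self-contained sketch with a stand-in worker:

import multiprocessing
from functools import partial

def fake_test_model(model, out_dir):
    # Stand-in for test_model(); just returns the path it would write to
    return out_dir + model

if __name__ == "__main__":
    worker = partial(fake_test_model, out_dir="res/")
    with multiprocessing.Pool(2) as pool:
        print(pool.map(worker, ["RandomForest", "AdaBoost"]))
    # ['res/RandomForest', 'res/AdaBoost']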
Example no. 3
def match_feat_with_song_gt(dir_feat, dir_gts):
    """Description of match_feat_gt

    Use groundtruth created by 
    http://www.mathieuramona.com/wp/data/jamendo/ 

    associate to local features
    csv 7041 lines yaafe
    lab 326.973 sec ramona
    Definition of YAAFE from 
    http://yaafe.sourceforge.net/features.html
    """
    utils.print_success("Matching local feat to song/instru groundtruths")
    dir_feat = utils.abs_path_dir(dir_feat)
    dir_gts = utils.abs_path_dir(dir_gts)
    block_size = 1024.
    step_size = 512.  # YAAFE stepSize, unused in the computation below
    fech = 22050.  # sampling rate in Hz
    frame_duration = block_size / fech  # duration of one frame, in seconds
    filenames = [fn for fn in os.listdir(dir_gts)]
    for index, filename in enumerate(filenames):
        utils.print_progress_start(str(index) + "/" + str(len(filenames)) + " " + filename)
        # gather groundtruths
        groundtruths = []
        with open(dir_gts + filename, "r") as filep:
            for row in filep:
                line = row.split(" ")
                end = float(line[1])
                if "no" in line[2]:
                    tag = ",i\n"
                else:
                    tag = ",s\n"
                groundtruths.append([end, tag])
        gt_len = len(groundtruths)
        gt_index = 0
        # Write features & groundtruths to file
        str_to_write = ""
        feat_fn = filename.split(".")[0]
        feat_fn += ".wav.mfcc.csv"
        with open(dir_feat + feat_fn, "r") as filep:
            for line_index, line in enumerate(filep):
                if gt_index < gt_len:
                    # Move to the next segment once the current frame
                    # starts after the segment's end time
                    if frame_duration * line_index > groundtruths[gt_index][0]:
                        gt_index += 1
                    if gt_index < gt_len:
                        str_to_write += line[:-1] + groundtruths[gt_index][1]
        with open(dir_feat + feat_fn, "w") as filep:
            filep.write(str_to_write)
    utils.print_progress_end()
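The alignment above hinges on one piece of arithmetic: with blockSize=1024 at 22050 Hz, one feature line covers 1024 / 22050 ≈ 0.0464 s, and a frame is pushed to the next groundtruth segment once its start time passes the segment's end. The same logic on toy data (tags simplified to plain strings):

# Toy run of the frame/segment matching used above
frame_duration = 1024 / 22050.  # ~0.0464 s per feature line
groundtruths = [[0.10, "s"], [0.20, "i"]]  # [segment end (s), tag]
gt_index = 0
for line_index in range(5):  # pretend the csv has 5 feature lines
    if gt_index < len(groundtruths):
        if frame_duration * line_index > groundtruths[gt_index][0]:
            gt_index += 1
        if gt_index < len(groundtruths):
            print(line_index, groundtruths[gt_index][1])
# frames 0-2 get tag "s", frames 3-4 get tag "i"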
Example no. 4
def match_feat_with_instru_gt(indir, outdir):
    """Description of match_feat_gt

    Apply instru groundtruth to CCmixter and MedleyDB
    """
    utils.print_success("Matching local features to instrumental groundtruths")
    indir = utils.abs_path_dir(indir) + "/"
    outdir = utils.abs_path_dir(outdir) + "/"
    filenames = os.listdir(indir)
    for filename in filenames:
        with open(indir + filename, "r") as filep, \
             open(outdir + filename, "w") as outfile:
            for line in filep:
                # Every frame of these datasets is instrumental ("i")
                outfile.write(line[:-1] + " i\n")
Example no. 5
def new_algo_final(indir, file_gts_track):
    utils.print_success("Approx. time ~6 hours.")
    # Preprocess arg
    indir = utils.abs_path_dir(indir)
    file_gts_track = utils.abs_path_file(file_gts_track)
    dir_tmp = utils.create_dir(utils.create_dir("src/tmp") + "bayle")
    feat_frame_train = utils.create_dir(dir_tmp + "feat_frame_train")
    feat_frame_test = utils.create_dir(dir_tmp + "feat_frame_test")
    outdir_global = utils.create_dir(dir_tmp + "feat_track")
    feat_train = outdir_global + "train.csv"
    feat_test = outdir_global + "test.csv"
    models_dir = utils.create_dir(dir_tmp + "models")
    loc_feat_testset_dirpath = "features/database2/"
    filelist_train = "groundtruths/database1.csv"
    filelist_test = "groundtruths/database2.csv"
    models_global = utils.create_dir(dir_tmp + "models_track")

    process_local_feat(indir, file_gts_track, outdir_local=feat_frame_train, out_feat_global=feat_train, train=False)
    classify.create_models(outdir=models_dir, train_dir=feat_frame_train, separator=",", classifiers="RandomForest")

    """
    Create features at track scale for the train set
    Features: MFCC + Delta + Double Delta + ngrams + hist
    """
    model_file = "src/tmp/bayle/models/RandomForest/RandomForest.pkl"
    model_file = "/media/sf_DATA/ReproducibleResearchIEEE2017/src/tmp/bayle/models/RandomForest/RandomForest.pkl"
    create_track_feat_testset(indir, filelist_train, feat_train, model_file, train=True)

    # # 15h28m44s to 19h08m28s Done in 13184117ms
    create_track_feat_testset(loc_feat_testset_dirpath, filelist_test, feat_test, model_file)  

    classify.create_models(outdir=models_global, train_file=feat_train, classifiers="RandomForest")
    process_results(feat_train, feat_test)
def plot_isrc_year_distribution(isrc_filename="ISRC_valid.txt", img_outdir=""):
    """Description of plot_isrc_year_distribution

    Create a png image of the distribution of ISRCs over the years
    """
    img_outdir = utils.abs_path_dir(img_outdir)
    years = []
    with open(isrc_filename, 'r') as csvfile:
        isrcs = csv.reader(csvfile)
        for isrc in isrcs:
            # Characters 5-6 of an ISRC hold the 2-digit registration year
            year = int(isrc[0][5:7]) + 2000
            if year > date.today().year:
                year -= 100  # a "future" year actually belongs to the 1900s

    axe = plt.subplot(111)
    hist_bins_range = range(min(years), max(years) + 1, 1)
    plt.hist(years, bins=hist_bins_range, color="#BBBBBB")
    plt.xlabel("Registration years")
    plt.ylabel("ISRC number")
    plt.xlim(min(years) - 2, max(years) + 2)
    axe.spines['top'].set_visible(False)
    axe.spines['right'].set_visible(False)
    axe.get_xaxis().tick_bottom()
    axe.get_yaxis().tick_left()
    plt.savefig(img_outdir + "Figure_1_ISRC_year_distribution.png")
    utils.print_success("ISRC year distribution image saved")
def yaafe_feat_extraction(dir_tracks):
    """Description of yaafe_feat_extraction
    yaafe.py -r 22050 -f "mfcc: MFCC blockSize=2048 stepSize=1024" audio_fn.txt
    """
    utils.print_success("YAAFE features extraction (approx. 8 minutes)")
    
    # Assert Python version
    if sys.version_info.major != 2:
        utils.print_error("Yaafe needs Python 2 environment")
    
    # Assert folder exists
    dir_tracks = utils.abs_path_dir(dir_tracks)    
    
    filelist = os.listdir(dir_tracks)
    dir_feat = utils.create_dir(utils.create_dir("features") + "database1")
    # dir_tmp = utils.create_dir("tmp")
    # dir_yaafe = utils.create_dir(dir_tmp + "yaafe")
    # fn_filelist = dir_yaafe + "filelist.txt"
    dir_current = os.getcwd()
    os.chdir(dir_tracks)
    yaafe_cmd = 'yaafe -r 22050 -f "mfcc: MFCC blockSize=2048 stepSize=1024" '
    yaafe_cmd += "--resample -b " + dir_feat + " "
    for index, filen in enumerate(filelist):
        utils.print_progress_start(str(index+1) + "/" + str(len(filelist)) + " " + filen)
        os.system(yaafe_cmd + filen + " > /dev/null 2>&1")
    utils.print_progress_end()
    os.chdir(dir_current)
Example no. 8
def add_feat_yaafe(dir_feat, data, ids=None):
    """
    @brief      Reads features files.
    
    @param      dir_feat  The folder containing the songs' features
    
    @return     dict (key=ids and values=features)
    """
    dir_feat = utils.abs_path_dir(dir_feat + "yaafe/")
    for filen in os.listdir(dir_feat):
        new_id = re.search(r"\d{3,9}", filen).group()
        if ids is None or new_id in ids:
            MFCCs = [0.] * 13  # accumulators for the 13 MFCC coefficients
            with open(dir_feat + filen, "r") as filep:
                # Skip the 5 header lines generated by YAAFE
                for _ in range(5):
                    next(filep)
                for line_num, line in enumerate(filep):
                    row = line.split(",")
                    for index, val in enumerate(row):
                        MFCCs[index] += float(val)
            for index, mfcc in enumerate(MFCCs):
                MFCCs[index] = mfcc / (line_num + 1)
            if new_id in data:
                # Extend rather than append, so values stay a flat list
                data[new_id].extend(MFCCs)
            else:
                data[new_id] = MFCCs
    return data
Example no. 9
def add_feat_essentia(dir_feat, data, ids=None):
    """
    @brief      Reads essentia features files.
    
    @param      dir_feat  The folder containing the songs' features
    
    @return     dict (key=ids and values=features)
    """
    dir_feat = utils.abs_path_dir(dir_feat + "essentia/")
    for filen in os.listdir(dir_feat):
        new_id = re.search(r"\d{3,9}", filen).group()
        if ids is None or new_id in ids:
            # Reset per file: a single shared list would otherwise
            # accumulate features across all previous songs
            features = []
            with open(dir_feat + filen) as filep:
                essentia_feat = json.load(filep)

            features.append(essentia_feat["tonal"]["chords_changes_rate"])
            features.append(essentia_feat["tonal"]["chords_number_rate"])
            features.append(essentia_feat["tonal"]["tuning_frequency"])
            # features.append(essentia_feat["tonal"]["chords_key"])
            # features.append(essentia_feat["tonal"]["chords_scale"])
            # features.append(essentia_feat["tonal"]["key_key"])
            # features.append(essentia_feat["tonal"]["key_scale"])

            features.append(essentia_feat["rhythm"]["beats_count"])
            features.append(essentia_feat["rhythm"]["bpm"])
            features.append(essentia_feat["rhythm"]["danceability"])
            features.append(essentia_feat["rhythm"]["onset_rate"])

            # utils.print_error(features)
            if new_id in data:
                # Extend rather than append, so values stay a flat list
                data[new_id].extend(features)
            else:
                data[new_id] = features
    return data
def read_train_files(indir, separator=" "):
    """Description of read_train_files

    Gather local features and GT from every individual train songs
    """
    utils.print_success("Reading multiple train files")
    indir = utils.abs_path_dir(indir) + "/"
    groundtruths = []
    features = []
    included_extensions = ["csv"]
    filenames = [
        fn for fn in os.listdir(indir) if any(
            fn.endswith(ext) for ext in included_extensions)
    ]
    for index, filename in enumerate(filenames):
        print(str(index + 1) + "/" + str(len(filenames)) + " " + filename)
        sys.stdout.write("\033[F")  # Cursor up one line
        sys.stdout.write("\033[K")  # Clear line
        with open(indir + filename, "r") as filep:
            for row in filep:
                line = row.split(separator)
                features.append([float(i) for i in line[:-1]])
                groundtruths.append(line[-1][:-1])
    sys.stdout.write("\033[K")  # Clear line
    return features, groundtruths
Example no. 11
def experiments_2_3(vqmm_cmd, codebook_file):
    utils.print_success("Experiment 2 & 3 (approx. 6h")
    dir_tmp = utils.create_dir(utils.create_dir("src/tmp") + "vqmm")

    # train
    dir_models = utils.create_dir(dir_tmp + "models_expe2_3")
    train(vqmm_cmd, codebook_file, dir_models, dir_tmp + "filelist.txt")

    # Models file
    # Need to explicitly create models_file here for VQMM
    models_list = os.listdir(dir_models)
    models_file = dir_tmp + "models_file_expe2_3.txt"
    with open(models_file, "w") as filep:
        for model_path in models_list:
            if not "NOT" in model_path:
                filep.write(dir_models + model_path + "\n")

    # test
    test_dir = utils.abs_path_dir("features/database2/")
    groundtruths = utils.read_groundtruths("groundtruths/database2.csv")
    test_file_list = os.listdir(test_dir)
    with open(dir_tmp + "test_file_list.txt", "w") as filep:
        for test_filen in test_file_list:
            filep.write(test_dir + test_filen + "\t" +
                        groundtruths[test_filen.split("_")[0]] + "\n")
    dir_res = utils.create_dir(dir_tmp + "results_expe2_3")
    test(vqmm_cmd,
         codebook_file,
         outputdir=dir_res,
         models_file=models_file,
         testfile=dir_tmp + "test_file_list.txt")

    # disp results
    utils.print_success("Experiment 2 & 3 Done processing")
Example no. 12
def preprocess_yaafe_features(dir_features="features/database1/"):
    utils.print_success("Preprocessing YAAFE's features  (approx. 2 minutes)")
    groundtruths = utils.read_groundtruths("groundtruths/database1.csv")
    dir_features = utils.abs_path_dir(dir_features)
    filenames = os.listdir(dir_features)
    dir_tmp = utils.create_dir(utils.create_dir("src/tmp") + "ghosal")
    res_file_name = dir_tmp + "database1.csv"
    res_file = open(res_file_name, "w")
    res_file.write(
        "filename,MFCC_01,MFCC_02,MFCC_03,MFCC_04,MFCC_05,MFCC_06,MFCC_07,MFCC_08,MFCC_09,MFCC_10,MFCC_11,MFCC_12,MFCC_13,tag\n"
    )
    nb_header_lines = 4
    for index, filename in enumerate(filenames):
        utils.print_progress_start(
            str(index + 1) + "/" + str(len(filenames)) + " " + filename)
        with open(dir_features + filename, "r+") as filep:
            tmp_mfcc = np.zeros(shape=(13, 1))
            for line_index, line in enumerate(filep):
                # Skip 5 first header lines generated by YAAFE
                if line_index > nb_header_lines:
                    index = 0
                    mfccs = line[:-1].split(",")
                    for mfcc in mfccs:
                        tmp_mfcc[index] += float(mfcc)
                        index += 1
            tmp_mfcc /= (line_index - nb_header_lines)
            mfcc_str = ["%.15f" % number for number in tmp_mfcc]
            filen = filename.split(".")[0]
            if filen in groundtruths:
                res_file.write(filen + "," + ",".join(mfcc_str) + "," +
                               groundtruths[filen] + "\n")
    res_file.close()
    return res_file_name
def plot_precision_recall(indir, gts_file, outdir):
    groundtruths = read_item_tag(gts_file)
    plt.figure(1)

    indir = utils.abs_path_dir(indir)
    for item in os.listdir(indir):
        if ".csv" in item:
            isrcs = read_preds(indir + "/" + item)
            test_groundtruths = []
            predictions = []
            for isrc in isrcs:
                if isrc in groundtruths:
                    test_groundtruths.append(groundtruths[isrc])
                    predictions.append(isrcs[isrc])
            test_groundtruths = [tag == "s" for tag in test_groundtruths]
            precision, recall, _ = precision_recall_curve(
                test_groundtruths, predictions)
            plt.plot(recall,
                     precision,
                     label=item[:-4] + " (" + str(
                         round(
                             average_precision_score(test_groundtruths,
                                                     predictions), 3)) + ")")

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([-0.05, 1.05])
    plt.title('Precision-Recall curve for Algo (AUC)')
    plt.legend(loc='best')
    plt.savefig(outdir + "precision_recall.png", dpi=200, bbox_inches="tight")
    # plt.show()
    plt.close()
    utils.print_success("Precision-Recall curve created in " + outdir)
Example no. 14
def run_kea_on_folds(folds_dir):
    """Description of run_kea_on_folds

    Wrapper for kea on folds
    """
    folds_dir = utils.abs_path_dir(folds_dir)
    out_file = folds_dir + "/results.txt"
    if os.path.exists(folds_dir + "/train_test.arff"):
        train_file = folds_dir + "/train_test.arff"
        test_file = train_file
        run_kea(train_file, test_file, out_file)
    else:
        nb_folds = len([
            name for name in os.listdir(folds_dir)
            if os.path.isfile(os.path.join(folds_dir, name))
        ])
        # Run on multiple train/test
        for index in range(1, int(nb_folds / 2) + 1):
            utils.print_success("Train/Test on fold " + str(index))
            train_file = folds_dir + "/train_" + str(index).zfill(2) + ".arff"
            test_file = folds_dir + "/test_" + str(index).zfill(2) + ".arff"
            out_file = folds_dir + "/results_" + str(index).zfill(2) + ".arff"
            run_kea(train_file, test_file, out_file)

        utils.print_warning("TODO multiprocessing")
Example no. 15
def extract_features(dir_audio, dir_feat):
    dir_audio = utils.abs_path_dir(dir_audio)
    dir_feat = utils.abs_path_dir(dir_feat)
    filelist = []
    for elem in os.listdir(dir_audio):
        if os.path.isfile(dir_audio + elem):
            filelist.append(dir_audio + elem)
        else:
            for filename in os.listdir(dir_audio + elem):
                if "ld.wav" in filename:
                    filelist.append(dir_audio + elem + "/" + filename)
    # marsyas(dir_feat, filelist)
    for index, filen in enumerate(filelist):
        utils.print_progress_start(str(index+1) + "/" + str(len(filelist)) + " " + filen.split(os.sep)[-1])
        utils.yaafe(filen)
        essentia(dir_feat, filen)
    utils.print_progress_end()
Example no. 16
def validate_isrcs(infile="isrc.txt", outfile="ISRC_invalid.txt", indir=None):
    """Description of validate_isrcs

    Validate a list of ISRCs contained into a file
    All line must only contain the ISRC and the \n
    """
    rm_infile = False
    if indir:
        indir = utils.abs_path_dir(indir)
        print("Directory to analyse: " + indir)
        infile = "tmpISRCs.txt"
        os.system("ls " + indir + " > " + infile)
        rm_infile = True
    else:
        if os.path.isfile(infile):
            infile = os.path.abspath(infile)
        else:
            print("Invalid input file")
            sys.exit()
    outfile = os.path.abspath(outfile)
    if os.path.isfile(outfile):
        print("Already existing output file will be overwritten")

    valid_isrcs = ""
    invalid_isrcs = ""
    cpt_invalid = 0
    isrc_file = open(infile, "r")

    for index, line in enumerate(isrc_file):
        isrc = line[0:12]
        print("\t" + str(index) + "\t" + isrc)
        sys.stdout.write("\033[F")  # Cursor up one line
        # sys.stdout.write("\033[K") # Clear line
        # if len(line) == 13 and validate_isrc(line[0:12]):
        if validate_isrc(isrc):
            valid_isrcs = valid_isrcs + line
        else:
            invalid_isrcs = invalid_isrcs + line
            cpt_invalid += 1
    sys.stdout.write("\033[K")  # Clear line

    isrc_file.close()

    if rm_infile:
        os.remove(infile)

    file_valid = open("ISRC_valid.txt", "w")
    file_valid.write(valid_isrcs)
    file_valid.close()

    if len(invalid_isrcs) != 0:
        print(str(cpt_invalid) + " invalid ISRCs stored in: " + outfile)
        file_invalid = open(outfile, "w")
        file_invalid.write(invalid_isrcs)
        file_invalid.close()
    else:
        print("All ISRCs are valid")
Example no. 17
def preprocess_features(folder):
    utils.print_success("Preprocessing train set")
    folder = utils.abs_path_dir(folder)
    filelist = os.listdir(folder)
    nb_file = str(len(filelist))
    for index, filename in enumerate(filelist):
        utils.print_progress_start(str(index) + "/" + nb_file + " " + filename)
        convert_feats_files(folder + filename)
    utils.print_progress_end()
Example no. 18
def figures1bd(indir, file_gts_track):
    """Description of figures1bd

    infile is formatted like:
    /media/sf_github/yann/train/01 - 01 Les Jardins Japonais.wav.mfcc.csv
    feat1 feat2 ... featn tag1
    feat1 feat2 ... featn tag2
    ...
    feat1 feat2 ... featn tag2

    0 Input the local features extracted by YAAFE
        13 MFCCs per frame
        186 musical pieces as train set
    1 Compute delta and double delta (39 features per frame)
    2 Gather global means (39 features per musical piece)
    3 Train on MFCCs & deltas (39 feat/frame) to output global predictions
    4 Use global predictions to compute song and instru n-grams and
        histograms, which add 70 feat/track,
        leading to a total of 109 feat/track
    5 Fit on 109x186
    6 Predict (or predict_proba) on 41491 tracks
    """

    # Preprocess arg
    indir = utils.abs_path_dir(indir)
    file_gts_track = utils.abs_path_file(file_gts_track)
    feat_frame_train = "feat_frame_train/"
    utils.create_dir(feat_frame_train)
    feat_frame_test = "feat_frame_test/"
    utils.create_dir(feat_frame_test)
    outdir_global = "feat_track/"
    utils.create_dir(outdir_global)
    feat_train = outdir_global + "train.csv"
    feat_test = outdir_global + "test.csv"
    models_dir = "models/"
    utils.create_dir(models_dir)
    loc_feat_testset_dirpath = "/media/sf_DATA/Datasets/Simbals/yaafe/results/processed/"
    filelist_test = "filelist_test.tsv"
    filelist_train = "filelist_train.tsv"
    models_global = "models_track/"
    utils.create_dir(models_global)

    # process_local_feat(indir, file_gts_track, feat_frame_train, feat_train, train=True)    
    # classify.create_models(outdir=models_dir, train_dir=feat_frame_train, separator=",")
    # create_track_feat_testset(indir, filelist_train, feat_train, train=True)

    # 15h28m44s to 19h08m28s Done in 13184117ms
    # create_track_feat_testset(loc_feat_testset_dirpath, filelist_test, feat_test)  

    # classify.create_models(outdir=models_global, train_file=feat_train)
    # classify.test_models_parallel(
        # models_dir=models_global,
        # out_dir="results/",
        # test_file=feat_test)
    
    # Display results
    reproduce.plot_results("results/")
Example no. 19
def plot_roc(indir, gts_file, outdir):
    groundtruths = read_item_tag(gts_file)
    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--', label="Random (0.5)")
    
    indir = utils.abs_path_dir(indir)
    for item in os.listdir(indir):
        if ".csv" in item:
            isrcs = read_preds(indir + "/" + item)
            test_groundtruths = []
            predictions = []
            for isrc in isrcs:
                if isrc in groundtruths:
                    test_groundtruths.append(groundtruths[isrc])
                    predictions.append(isrcs[isrc])
            test_groundtruths = [tag == "s" for tag in test_groundtruths]
            fpr_rf, tpr_rf, _ = roc_curve(test_groundtruths, predictions)
            label = item[:-4] + " (" + str(
                round(roc_auc_score(test_groundtruths, predictions), 3)) + ")"
            color = ""
            if "VQMM" in item:
                color = "ro"
            elif "SVMBFF" in item:
                color = "g-"
            elif "GA" in item:
                color = "b:"
            
            plt.plot(fpr_rf, tpr_rf, color, label=label)

    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    # plt.title('ROC curve for Algo (AUC)')
    plt.legend(loc='best')
    outdir = utils.abs_path_dir(outdir)
    roc_fn = outdir + "Figure_3_ROC.png"
    plt.savefig(roc_fn, dpi=200, bbox_inches="tight")
    plt.savefig(outdir + "Figure_3_ROC.eps")
    # plt.show()
    plt.close()
    utils.print_success("ROC curve successfully created in " + roc_fn)
Example no. 20
def merge_files(folder, name):
    utils.print_success("Merging files")
    subfolder = utils.abs_path_dir(folder + name)
    data = ""
    for filen in os.listdir(subfolder):
        with open(subfolder + filen, "r") as filep:
            for line in filep:
                data += line
    with open(folder + name + ".csv", "w") as filep:
        filep.write(data)
def test_models(models_dir, test_dir, out_dir):
    models_dir = utils.abs_path_dir(models_dir) + "/"
    test_dir = utils.abs_path_dir(test_dir) + "/"
    utils.create_dir(out_dir)
    test_files = os.listdir(test_dir)
    models = os.listdir(models_dir)
    for model in models:
        utils.print_success(model)
        pred_dir = out_dir + model + "/"
        utils.create_dir(pred_dir)
        clf = joblib.load(models_dir + model + "/" + model + ".pkl")
        for index, test_file in enumerate(test_files):
            print(str(index) + "\t" + test_file)
            sys.stdout.write("\033[F")
            sys.stdout.write("\033[K")
            test_features = read_test_file(test_dir + test_file)
            predictions = clf.predict_proba(test_features)
            with open(pred_dir + test_file, "w") as filep:
                for pred in predictions:
                    filep.write(str(pred[0]) + "\n")
        sys.stdout.write("\033[K")
def plot_clf(indir="res/"):
    indir = utils.abs_path_dir(indir) + "/"
    algos = []
    measure = []
    with open(indir + "global.csv", "r") as filep:
        for line in filep:
            line = line.split(",")
            algos.append(line[0])
            measure.append(tuple(map(float, line[1:4])))

    n_groups = 3
    fig, ax = plt.subplots(figsize=(10, 6))

    index = np.arange(n_groups)
    bar_width = 0.2

    opacity = 0.4
    error_config = {'ecolor': '0.3'}

    color = utils.rand_color(len(algos))
    rects = {}
    offset = 0.15
    for ind, algo in enumerate(algos):
        print(ind)
        print(tuple(measure[ind]))
        rects[ind] = plt.bar(index + bar_width * ind + offset,
                             tuple(measure[ind]),
                             bar_width,
                             alpha=opacity,
                             color=color[ind],
                             label=algo)

    plt.ylabel('Scores (in %)')
    plt.xticks(index + bar_width * ind + offset,
               ('Precision', 'Recall', 'F-Measure'))
    plt.legend()
    plt.ylim(0, 1)

    # spines & axis
    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')

    art = []
    lgd = ax.legend(loc=9, bbox_to_anchor=(1.1, 1.), frameon=False)
    # lgd = pylab.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol=2)
    art.append(lgd)
    # ax.legend()
    plt.tight_layout()
    img_name = "global.png"
    plt.savefig(img_name, dpi=200, additional_artists=art, bbox_inches="tight")
Example no. 23
def main():
    """
    1 
    2 Make cbk on train set
    3 Train 200
    4 Test 50k
    """
    # utils.print_success("VQMM (approx. 6h)")

    # 1
    # preprocess features
    # YAAFE produces files which contain an unusable float format
    # Need to transform those into a valid format
    preprocess_features("features/database1/")

    # 2
    # Read filenames & groundtruths
    groundtruths = {}
    with open("groundtruths/database1.csv", "r") as filep:
        for line in filep:
            row = line[:-1].split(",")
            groundtruths[row[0]] = row[1]

    # 3
    # VQMM needs a special file containing path & filename along ground truth.
    dir_feats = utils.abs_path_dir("features/database1_vqmm/")
    files_list = os.listdir(dir_feats)
    dir_tmp = utils.create_dir(utils.create_dir("src/tmp") + "vqmm")
    filenames_gts = dir_tmp + "filelist.txt"
    with open(filenames_gts, "w") as filep:
        for filename in files_list:
            fn = filename.split(".")[0]
            filep.write(dir_feats + filename + "\t" + groundtruths[fn] + "\n")

    # 4
    # Need to compile VQMM and check that everything is ok
    utils.print_success("Compiling VQMM")
    vqmm_cmd = "src/vqmm/vqmm"
    os.system("make -C src/vqmm/src")

    # 5
    # Create codebook needed for VQMM
    file_cbk = dir_tmp + "codebook.txt"
    create_cbk(vqmm_cmd, filenames_gts, file_cbk)

    # 6
    # launch expe1
    experiment_1(vqmm_cmd, file_cbk)

    experiments_2_3(vqmm_cmd, file_cbk)
    process_results()
Example no. 24
def merge_arff(indir, outfilename):
    """Description of merge_arff

    bextract programm from Marsyas generate one output file per audio file
    This function merge them all in one unique file
    Check if analysed file are valid i.e. not empty
    """
    utils.print_success("Preprocessing ARFFs")
    indir = utils.abs_path_dir(indir)
    tmpfilename = "tmp_arff.txt"
    os.system("ls " + indir + " > " + tmpfilename)
    with open(tmpfilename, 'r') as filenames:
        outfn = open(outfilename, 'w')
        cpt_invalid_fn = 0
        # Write first lines of ARFF template file
        for filename in filenames:
            filename = validate_arff(indir + "/" + filename[:-1])
            if filename:
                with open(filename, 'r') as template:
                    nb_line = 77
                    for line in template:
                        if not nb_line:
                            break
                        nb_line -= 1
                        outfn.write(line)
                    break
            else:
                cpt_invalid_fn += 1
        # Append all arff file to the output file
        cur_file_num = 1
        for filename in filenames:
            filename = validate_arff(indir + "/" + filename[:-1])
            if filename:
                cur_file_num = cur_file_num + 1
                sys.stdout.write("\r\tAnalysing file\t" + str(cur_file_num))
                sys.stdout.flush()
                fname = open(filename, 'r')
                outfn.write("".join(fname.readlines()[74:77]))
                fname.close()
            else:
                cpt_invalid_fn += 1
        sys.stdout.write('\n')
        sys.stdout.flush()
        outfn.close()
    os.remove(tmpfilename)
    if cpt_invalid_fn:
        utils.print_warning(str(cpt_invalid_fn) + " ARFF with errors found")
    utils.print_success("Preprocessing done")
    return outfilename
Example no. 25
def main(args):
    """
    @brief      Main entry point
    """
    path = utils.abs_path_dir(args.path)
    in_dir = utils.abs_path_dir(args.in_dir)
    out_dir = utils.abs_path_dir(args.out_dir)
    id_songs_feat_done = []
    for filen in os.listdir(out_dir):
        if os.path.isfile(out_dir + filen):
            m = re.search(r"\d{3,9}", filen)
            id_songs_feat_done.append(m.group())
    id_songs_feat_done = list(set(id_songs_feat_done))
    index = 0
    with open("../data/filelist.csv", "r") as filep:
        for line in filep:
            row = line[:-1].split(",")
            # Check if features have been extracted by YAAFE, Marsyas & Essentia
            if "1" in row[6] and "1" in row[7] and "1" in row[8]:
                if not row[0] in id_songs_feat_done:
                    folder = in_dir + row[1] + "_" + row[2] + "_" + row[0]
                    index += 1
                    print(str(index) + " " + folder)
                    extract_features(folder, out_dir, path)
Example no. 26
def plot_isrc_country_repartition(isrc_filename="ISRC_valid.txt",
                                  img_outdir=""):
    """Description of plot_isrc_country_repartition
    """
    img_outdir = utils.abs_path_dir(img_outdir)
    # Gather countries' name along ISO-2 codes
    countries = {}
    with open('src/wikipedia-iso-country-codes.csv', 'r') as csvfile:
        codes = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in codes:
            countries[row[0]] = row[1]
    # Map nb of ISRCs with a color for each country
    colors = {}
    with open(isrc_filename, "r") as filep:
        for row in filep:
            country_code = row[0:2]
            if country_code in colors:
                colors[country_code] += 1
            else:
                colors[country_code] = 1
    countries_shp = shpreader.natural_earth(resolution='110m',
                                            category='cultural',
                                            name='admin_0_countries')
    fig, axe = plt.subplots(figsize=(12, 6),
                            subplot_kw={'projection': ccrs.PlateCarree()})

    norm = mpl.colors.Normalize(vmin=0, vmax=float(max(list(colors.values()))))
    cmap = plt.cm.YlOrBr  # or YlGnBu

    for country in shpreader.Reader(countries_shp).records():
        country_name = country.attributes['name_long']
        if country_name in countries:
            country_iso2 = countries[country_name]
            if country_iso2 in colors:
                color = colors[country_iso2]
            else:
                color = -1
        else:
            color = -1
        axe.add_geometries(country.geometry,
                           ccrs.PlateCarree(),
                           facecolor=cmap(norm(color)),
                           label=country_name)
    axe.outline_patch.set_edgecolor('white')
    cax = fig.add_axes([0.91, 0.2, 0.02, 0.6])
    mpl.colorbar.ColorbarBase(cax, cmap=cmap, norm=norm)
    plt.savefig(img_outdir + "Figure_2_ISRC_country_repartition.png")
    utils.print_success("ISRC country repartition image saved")
Example no. 27
def merge_arff(indir, outfilename):
    """Description of merge_arff

    bextract program from Marsyas generate one output file per audio file
    This function merge them all in one unique file
    Check if analysed file are valid i.e. not empty
    """
    utils.print_success("Preprocessing ARFFs")
    indir = utils.abs_path_dir(indir)
    filenames = os.listdir(indir)
    outfn = open(outfilename, 'w')
    cpt_invalid_fn = 0
    # Write first lines of ARFF template file
    for filename in filenames:
        if os.path.isfile(indir + filename):
            new_fn = validate_arff(indir + filename)
            if new_fn:
                with open(new_fn, 'r') as template:
                    nb_line = 74
                    for line in template:
                        if not nb_line:
                            break
                        nb_line -= 1
                        outfn.write(line)
                    break
            else:
                cpt_invalid_fn += 1
    # Append all arff file to the output file
    cur_file_num = 1
    for filename in filenames:
        if os.path.isfile(indir + filename):
            new_fn = validate_arff(indir + filename)
            if new_fn:
                cur_file_num = cur_file_num + 1
                utils.print_progress_start("Analysing file\t" +
                                           str(cur_file_num))
                fname = open(new_fn, 'r')
                outfn.write("".join(fname.readlines()[74:77]))
                fname.close()
            else:
                cpt_invalid_fn += 1
    utils.print_progress_end()
    outfn.close()
    # os.system("rm " + indir + "*.arff")
    if cpt_invalid_fn:
        utils.print_warning(
            str(cpt_invalid_fn) + " ARFF files with errors found")
    return outfilename
Example no. 28
def classify(train=None,
             test=None,
             data=None,
             res_dir="res/",
             disp=True,
             outfilename=None):
    """Description of compare
    compare multiple classifier and display the best one
    """
    utils.print_success("Comparison of differents classifiers")
    if data is not None:
        train_features = data["train_features"]
        train_groundtruths = data["train_groundtruths"]
        test_features = data["test_features"]
        test_groundtruths = data["test_groundtruths"]
    else:
        train = utils.abs_path_file(train)
        test = utils.abs_path_file(test)
        train_features, train_groundtruths = read_file(train)
        test_features, test_groundtruths = read_file(test)
    if not utils.create_dir(res_dir):
        res_dir = utils.abs_path_dir(res_dir)
    classifiers = {
        "RandomForest": RandomForestClassifier(n_jobs=-1)
        # "RandomForest": RandomForestClassifier(n_estimators=5),
        # "KNeighbors":KNeighborsClassifier(3),
        # "GaussianProcess":GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
        # "DecisionTree":DecisionTreeClassifier(max_depth=5),
        # "MLP":MLPClassifier(),
        # "AdaBoost":AdaBoostClassifier(),
        # "GaussianNB":GaussianNB(),
        # "QDA":QuadraticDiscriminantAnalysis(),
        # "SVM":SVC(kernel="linear", C=0.025),
        # "GradientBoosting":GradientBoostingClassifier(),
        # "ExtraTrees":ExtraTreesClassifier(),
        # "LogisticRegression":LogisticRegression(),
        # "LinearDiscriminantAnalysis":LinearDiscriminantAnalysis()
    }
    for key in classifiers:
        utils.print_success(key)
        clf = classifiers[key]
        utils.print_info("\tFit")
        clf.fit(train_features, train_groundtruths)
        utils.print_info("\tPredict")
        predictions = clf.predict(test_features)
    return predictions
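Stripped of the file handling and the commented-out alternatives, the fit/predict cycle above is a plain scikit-learn loop; a minimal runnable version on toy data:

from sklearn.ensemble import RandomForestClassifier

train_features = [[0.0, 1.0], [1.0, 0.0], [0.1, 0.9], [0.9, 0.1]]
train_groundtruths = ["s", "i", "s", "i"]
clf = RandomForestClassifier(n_jobs=-1)
clf.fit(train_features, train_groundtruths)
print(clf.predict([[0.2, 0.8]]))  # most likely ['s']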
Example no. 29
def read_data_1(stats_dir, filen):
    stats_dir = utils.abs_path_dir(stats_dir)
    # filen is read relative to stats_dir, so it must not be made absolute
    data = []
    names = []
    with open(stats_dir + filen, "r") as filep:
        for line in filep:
            # Read file with lines like this:
            # GA,0.578947368421,0.631578947368,0.710526315789,0.722222222222
            # SVMBFF,0.631578947368,0.684210526316,0.815789473684,0.66666666
            # VQMM,0.736842105263,0.842105263158,0.842105263158,0.75,0.61111
            row = line[:-1].split(",")
            names.append(row[0])  # one name per line, not one per value
            tmp = [float(val) for val in row[1:]]
            data.append(tmp)
            print(filen.split(".")[0].split("_")[1].title() + " for " +
                  row[0] + " \t= " + "{0:.3f}".format(sum(tmp) / len(tmp)) +
                  " ± " + "{0:.3f}".format(stdev(tmp)))
    return names, data
Example no. 30
def read_files(dir_features):
    utils.print_success("Preprocessing YAAFE's features  (approx. 20 minutes)")
    tmp_gts = utils.read_groundtruths("groundtruths/database2.csv")
    dir_features = utils.abs_path_dir(dir_features)
    filenames = os.listdir(dir_features)
    dir_tmp = utils.create_dir(utils.create_dir("tmp") + "ghosal")
    features = []
    groundtruths = []
    to_print = "/" + str(len(filenames))
    for index, filename in enumerate(filenames):
        utils.print_progress_start(str(index + 1) + to_print)
        # pandas is used here because it is the fastest way to read csv files
        data = pandas.read_csv(dir_features + filename, sep=" ").values
        filen = filename.split("_")[0]
        if filen in tmp_gts:
            groundtruths.append(tmp_gts[filen])
            features.append([sum(x) / len(data) for x in zip(*data)])
    return filenames, features, groundtruths
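The comprehension [sum(x) / len(data) for x in zip(*data)] used above is a per-column mean; since data already comes out of pandas as a numpy array, it is equivalent to a single mean call (shown on toy values):

import numpy as np

data = np.array([[1.0, 2.0], [3.0, 4.0]])
print([sum(x) / len(data) for x in zip(*data)])  # [2.0, 3.0]
print(data.mean(axis=0).tolist())                # [2.0, 3.0]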