Example #1
def diarization(wavname,ubmname,out_dir):
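    """Run the full speaker-diarization pipeline on one wav file.

    Steps: speech activity detection (SAD), MFCC extraction, BIC segmentation,
    clustering, UBM adaptation for the top clusters, and Viterbi resegmentation.
    Returns the merged segment labels with their start and end times.
    """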
    out = out_dir.rstrip('/')  # avoid a double slash in the paths below
    basename = tools.gen_uid(wavname)
    sadname = '%s/%s_sad.txt' % (out, basename)
    featname = '%s/%s_feat.mfc' % (out, basename)
    bicname = '%s/%s_bic.txt' % (out, basename)
    clustname = '%s/%s_cluster.txt' % (out, basename)
    viterbiname = '%s/%s_viterbi.txt' % (out, basename)
    attr = {'audio': wavname,
            'mfcc': featname,
            'sad': sadname,
            'bic': bicname,
            'cluster': clustname,
            'viterbi': viterbiname}

    # SAD
    sad.run_sad(attr)
    
    # MFCC
    feat.run_mfcc(attr)
    
    # BIC
    bic.run_bic(attr, 'audioseg')
    
    # CLUSTERING
    cluster.run_clustering(attr)
    
    
    # Pick top clusters
    labels, segment_starts, segment_ends = tools.read_segs(attr['cluster'])
    top_n = tools.top_n_clusters(labels, segment_starts, segment_ends, n=2)
    
    
    # Adapt UBM for each cluster.
    cluster_gmms = {}
    for i in top_n:
        label = 'C%s' % i  # renamed to avoid shadowing the `cluster` module above
        gmmname = gmm.adapt(attr, label, ubmname)
        cluster_gmms[label] = gmmname
    
    # Resegmentation
    hmmname = '%s/%s_hmm.txt' % (out, basename)
    resegment.viterbi(attr, cluster_gmms, hmmname)
    labs, starts, ends = tools.merge_segs(attr['viterbi'], attr['sad'])
    return labs, starts, ends
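
# A minimal usage sketch; the file names below are hypothetical, and the
# project modules (tools, sad, feat, bic, cluster, gmm, resegment) must be importable:
if __name__ == '__main__':
    labs, starts, ends = diarization('meeting.wav', 'ubm.gmm', 'diar_out')
    for lab, s, e in zip(labs, starts, ends):
        print('%s\t%s\t%s' % (lab, s, e))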
Example #2
def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,
          n_expand, level, caseolap=True, local_embedding=True):
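    """Recursively build a topic taxonomy: cluster the current node, filter the
    cluster keywords with CaseOLAP, prepare child embeddings (shared via symlink
    or retrained locally), then recurse into each child until MAX_LEVEL.
    """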

    if level > MAX_LEVEL:
        return
    #print('============================= Running level ', level, ' and node ', parent, '=============================')
    start = time.time()
    df = DataFiles(input_dir, node_dir)

    ## TODO: every call re-reads the whole corpus, which can be slow.

    full_data = DataSet(df.embedding_file, df.doc_file)
    end = time.time()
    #print('[Main] Done reading the full data using time %s seconds' % (end-start))

    # filter the keywords
    if caseolap is False:
        try:
            children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,
                                      df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file)
        except Exception as err:
            # `children` is unbound when run_clustering raises, so report the error instead
            print('[Error]: Clustering not finished.')
            print('[Error]: ' + str(err))
            return
        copyfile(df.seed_keyword_file, df.filtered_keyword_file)
    else:
        ## Adaptive clustering, at most n_cluster_iter iterations

        n_cluster = clusterInfo.pop(0)  ## Changed by Mili: this node's cluster count comes from the module-level clusterInfo list

        if n_cluster > 0:
            for it in range(n_cluster_iter):  # `it` avoids shadowing the builtin iter()
                if it > 0:
                    df.seed_keyword_file = df.filtered_keyword_file
                try:
                    children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,
                                              df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file)
                except Exception as err:
                    print('[Error]: Clustering not finished.')
                    print('[Error]: ' + str(err))
                    return

                start = time.time()
                main_caseolap(df.link_file, df.doc_membership_file,
                              df.cluster_keyword_file,
                              df.caseolap_keyword_file)
                main_rank_phrase(df.caseolap_keyword_file,
                                 df.filtered_keyword_file, filter_thre)

                end = time.time()
                #print("[Main] Finish running CaseOALP using %s (seconds)" % (end - start))
        else:
            print('[Info]: Leaf node reached')

    # prepare the embedding for child level

    if n_cluster > 0:  # changed by Mili: skip child prep when this is a leaf node

        if level < MAX_LEVEL:

            if local_embedding is False:

                src_file = node_dir + 'embeddings.txt'

                for child in children:
                    tgt_file = node_dir + child + '/embeddings.txt'
                    # copyfile(src_file, tgt_file)
                    symlink(src_file, tgt_file)
            else:
                start = time.time()
                main_local_embedding(node_dir, df.doc_file, df.index_file,
                                     parent, n_expand, level, MAX_LEVEL, SIZE,
                                     SAMPLE, WINDOW, MIN_COUNT, ITER)
                end = time.time()
                #print("[Main] Finish running local embedding training using %s (seconds)" % (end - start))

        for child in children:
            recur(input_dir, node_dir + child + '/', n_cluster, child, n_cluster_iter,
                  filter_thre, n_expand, level + 1, caseolap, local_embedding)
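
# A minimal driver sketch; every value below (paths, root name '*', cluster
# counts, embedding hyperparameters) is an assumption, not from the original snippet:
if __name__ == '__main__':
    MAX_LEVEL = 2
    SIZE, SAMPLE, WINDOW, MIN_COUNT, ITER = 100, 1e-4, 5, 5, 5  # word2vec-style settings used by main_local_embedding
    clusterInfo = [5, 3, 0]  # cluster counts, consumed one per recur() call
    recur(input_dir='data/', node_dir='data/root/', n_cluster=5, parent='*',
          n_cluster_iter=2, filter_thre=0.15, n_expand=100, level=0)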
Example #3
def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,
          n_expand, level, caseolap=True, local_embedding=True):
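    """Variant of the recursive taxonomy builder above, with verbose progress logging."""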
    if level > MAX_LEVEL:
        return
    print('============================= Running level ', level, ' and node ',
          parent, '=============================')
    start = time.time()
    df = DataFiles(input_dir, node_dir)
    ## TODO: every call re-reads the whole corpus, which can be slow.
    print('mylog df.doc_file: ' + df.doc_file)
    full_data = DataSet(df.embedding_file, df.doc_file)
    # print('mylog type(full_data): ' + str(type(full_data)))
    end = time.time()
    print('[Main] Done reading the full data in %s seconds' % (end - start))

    # filter the keywords
    if caseolap is False:
        try:
            children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,
                                      df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file)
        except Exception as err:
            print('Clustering not finished: ' + str(err))
            return
        copyfile(df.seed_keyword_file, df.filtered_keyword_file)
    else:
        ## Adaptive clustering, at most n_cluster_iter iterations
        for it in range(n_cluster_iter):  # `it` avoids shadowing the builtin iter()
            if it > 0:
                df.seed_keyword_file = df.filtered_keyword_file
            try:
                children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,
                                          df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file)
            except Exception as err:
                print('Clustering not finished: ' + str(err))
                return

            start = time.time()
            main_caseolap(df.link_file, df.doc_membership_file,
                          df.cluster_keyword_file, df.caseolap_keyword_file)
            main_rank_phrase(df.caseolap_keyword_file,
                             df.filtered_keyword_file, filter_thre)
            end = time.time()
            print("[Main] Finish running CaseOALP using %s (seconds)" %
                  (end - start))

    # prepare the embedding for child level
    if level < MAX_LEVEL:
        if local_embedding is False:
            src_file = node_dir + 'embeddings.txt'
            for child in children:
                tgt_file = node_dir + child + '/embeddings.txt'
                # copyfile(src_file, tgt_file)
                symlink(src_file, tgt_file)
        else:
            start = time.time()
            main_local_embedding(node_dir, df.doc_file, df.index_file, parent,
                                 n_expand)
            end = time.time()
            print("[Main] Finished running local embedding training in %s seconds"
                  % (end - start))

    for child in children:
        recur(input_dir, node_dir + child + '/', n_cluster, child, n_cluster_iter,
              filter_thre, n_expand, level + 1, caseolap, local_embedding)
Example #4
    wavname, ubmname = tools.read_input()
    basename = tools.gen_uid(wavname)
    attr = tools.gen_attr(out, basename, wavname)  # `out` is presumably defined earlier in the original source
    
    
    # SAD
    sad.run_sad(attr)

    # MFCC
    feat.run_mfcc(attr)

    # BIC
    bic.run_bic(attr, 'uniform')

    # CLUSTERING
    cluster.run_clustering(attr)

    # Pick top clusters
    labels, segment_starts, segment_ends = tools.read_segs(attr['cluster'])
    n_spkrs = 4  # assume a fixed number of speakers
    top_n = tools.top_n_clusters(labels, segment_starts, segment_ends, n_spkrs)


    # Adapt UBM for each cluster.
    cluster_gmms = {}
    for i in top_n:
        label = 'C%s' % i  # renamed to avoid shadowing the `cluster` module above
        gmmname = gmm.adapt(attr, label, ubmname)
        cluster_gmms[label] = gmmname
    
    # Resegmentation
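    # The snippet ends here; a plausible completion, mirroring Example #1
    # (the hmm file name is an assumption):
    # hmmname = '%s/%s_hmm.txt' % (out, basename)
    # resegment.viterbi(attr, cluster_gmms, hmmname)
    # labs, starts, ends = tools.merge_segs(attr['viterbi'], attr['sad'])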
Example #5
def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,
          n_expand, level, caseolap=True, local_embedding=True):
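    """Variant of recur() that releases the in-memory corpus (del full_data)
    before preparing child embeddings and recursing."""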
    if level > MAX_LEVEL:
        return
    print('============================= Running level ', level, ' and node ', parent, '=============================')
    start = time.time()
    df = DataFiles(input_dir, node_dir)
    # df.embedding_file: node_dir/embeddings.txt
    # df.doc_file: input_dir/papers.txt
    full_data = DataSet(df.embedding_file, df.doc_file)
    end = time.time()
    print('[Main] Done reading the full data in %s seconds' % (end - start))

    # filter the keywords
    if caseolap is False:
        try:
            children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,
                                      df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file)
        except Exception as err:
            print('Clustering not finished, please check (caseolap is False): ' + str(err))
            return
        copyfile(df.seed_keyword_file, df.filtered_keyword_file)

    else:
        # Adaptive clustering, at most n_cluster_iter iterations
        for it in range(n_cluster_iter):  # `it` avoids shadowing the builtin iter()
            if it > 0:
                df.seed_keyword_file = df.filtered_keyword_file

            children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,
                                      df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file)
            start = time.time()
            main_caseolap(df.link_file, df.doc_membership_file, df.cluster_keyword_file, df.caseolap_keyword_file)
            main_rank_phrase(df.caseolap_keyword_file, df.filtered_keyword_file, filter_thre)
            end = time.time()
            print("[Main] Finish running CaseOALP using %s (seconds)" % (end - start))
            
    # Free the in-memory corpus before recursing; each recursive call re-reads it.
    del full_data
    gc.collect()

    # prepare the embedding for child level
    if level < MAX_LEVEL:
        if local_embedding is False:
            src_file = node_dir + 'embeddings.txt'
            for child in children:
                tgt_file = node_dir + child + '/embeddings.txt'
                # copyfile(src_file, tgt_file)
                symlink(src_file, tgt_file)
        else:
            start = time.time()
            main_local_embedding(node_dir, df.doc_file, df.index_file, parent, n_expand)
            end = time.time()
            print("[Main] Finish running local embedding training using %s (seconds)" % (end - start))

    for child in children:
        recur(input_dir, node_dir + child + '/', n_cluster, child, n_cluster_iter,
              filter_thre, n_expand, level + 1, caseolap, local_embedding)