def diarization(wavname, ubmname, out_dir):
    """Run the full speaker-diarization pipeline on one audio file.

    Stages: speech activity detection (SAD) -> MFCC feature extraction ->
    BIC segmentation -> clustering -> UBM adaptation for the top clusters ->
    Viterbi resegmentation -> merge with the SAD segmentation.

    Args:
        wavname: path to the input audio file.
        ubmname: path to the universal background model used for
            per-cluster GMM adaptation.
        out_dir: directory that receives all intermediate files.

    Returns:
        (labs, starts, ends) — merged segment labels with start/end times,
        as produced by tools.merge_segs.
    """
    # NOTE(review): `out` already ends with '/', so the '%s/%s' patterns
    # below produce a double slash in every path — harmless on POSIX, but
    # worth normalizing if paths are ever compared as strings.
    out = out_dir + '/'
    basename = tools.gen_uid(wavname)
    sadname = '%s/%s_sad.txt' % (out, basename)
    featname = '%s/%s_feat.mfc' % (out, basename)
    bicname = '%s/%s_bic.txt' % (out, basename)
    clustname = '%s/%s_cluster.txt' % (out, basename)
    viterbiname = '%s/%s_viterbi.txt' % (out, basename)
    attr = {'audio': wavname,
            'mfcc': featname,
            'sad': sadname,
            'bic': bicname,
            'cluster': clustname,
            'viterbi': viterbiname}
    # SAD
    sad.run_sad(attr)
    # MFCC
    feat.run_mfcc(attr)
    # BIC
    bic.run_bic(attr, 'audioseg')
    # CLUSTERING
    cluster.run_clustering(attr)
    # Pick top clusters
    labels, segment_starts, segment_ends = tools.read_segs(attr['cluster'])
    top_n = tools.top_n_clusters(labels, segment_starts, segment_ends, n=2)
    # Adapt the UBM for each of the top clusters.
    # BUG FIX: the loop variable was previously named `cluster`, shadowing
    # the `cluster` module called above; renamed to avoid the latent clash.
    cluster_gmms = {}
    for i in top_n:
        cluster_label = 'C%s' % (str(i))
        gmmname = gmm.adapt(attr, cluster_label, ubmname)
        cluster_gmms[cluster_label] = gmmname
    # Resegmentation
    hmmname = '%s/%s_hmm.txt' % (out, basename)
    resegment.viterbi(attr, cluster_gmms, hmmname)
    labs, starts, ends = tools.merge_segs(attr['viterbi'], attr['sad'])
    return labs, starts, ends
def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,
          n_expand, level, caseolap=True, local_embedding=True):
    """Recursively expand the taxonomy node `parent` located in `node_dir`.

    Clusters the node's documents into child topics, optionally refines the
    cluster keywords with CaseOLAP ranking, prepares embeddings for the
    child level, and recurses into each child until MAX_LEVEL is exceeded.

    Args:
        input_dir: directory holding the global corpus files.
        node_dir: working directory of the current node (trailing '/').
        n_cluster: number of child clusters; when `caseolap` is True it is
            overridden by the next value popped from the global clusterInfo.
        parent: name of the current node.
        n_cluster_iter: max iterations of adaptive clustering.
        filter_thre: keyword-filtering threshold for rank-phrase.
        n_expand: expansion size for local embedding training.
        level: current depth; recursion stops once level > MAX_LEVEL.
        caseolap: if False, skip the CaseOLAP refinement loop.
        local_embedding: if False, children reuse this node's embeddings
            via symlink instead of training local ones.
    """
    if level > MAX_LEVEL:
        return
    # print('============================= Running level ', level, ' and node ', parent, '=============================')
    start = time.time()
    df = DataFiles(input_dir, node_dir)
    ## TODO: Everytime we need to read-in the whole corpus, which can be slow.
    full_data = DataSet(df.embedding_file, df.doc_file)
    end = time.time()
    # print('[Main] Done reading the full data using time %s seconds' % (end-start))

    # filter the keywords
    if caseolap is False:
        try:
            children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,
                                      df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file)
        except Exception as err:
            # BUG FIX: the old handler did `print(children)`, but `children`
            # is unbound when run_clustering raises, so the real error was
            # replaced by a NameError. Report the exception instead; also
            # narrowed from a bare `except:`.
            print('Clustering not finished.')
            print('[Error]: ' + str(err))
            return
        copyfile(df.seed_keyword_file, df.filtered_keyword_file)
    else:
        ## Adaptive Clustering, maximal n_cluster_iter iterations
        n_cluster = clusterInfo.pop(0)  ## Changed by Mili
        if n_cluster > 0:
            # renamed `iter` -> `iteration`: it shadowed the builtin
            for iteration in range(n_cluster_iter):
                if iteration > 0:
                    # feed the previous iteration's filtered keywords back in
                    df.seed_keyword_file = df.filtered_keyword_file
                try:
                    children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,
                                              df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file)
                except Exception as err:
                    print('[Error]: Clustering not finished.')
                    print('[Error]: ' + str(err))
                    return
                start = time.time()
                main_caseolap(df.link_file, df.doc_membership_file, df.cluster_keyword_file, df.caseolap_keyword_file)
                main_rank_phrase(df.caseolap_keyword_file, df.filtered_keyword_file, filter_thre)
                end = time.time()
                # print("[Main] Finish running CaseOALP using %s (seconds)" % (end - start))
        else:
            print('[Info]: Leaf node is reached')

    # prepare the embedding for child level
    if n_cluster > 0:  # changed by mili
        if level < MAX_LEVEL:
            if local_embedding is False:
                src_file = node_dir + 'embeddings.txt'
                for child in children:
                    tgt_file = node_dir + child + '/embeddings.txt'
                    # copyfile(src_file, tgt_file)
                    symlink(src_file, tgt_file)
            else:
                start = time.time()
                main_local_embedding(node_dir, df.doc_file, df.index_file, parent, n_expand,
                                     level, MAX_LEVEL, SIZE, SAMPLE, WINDOW, MIN_COUNT, ITER)
                end = time.time()
                # print("[Main] Finish running local embedding training using %s (seconds)" % (end - start))
        for child in children:
            recur(input_dir, node_dir + child + '/', n_cluster, child, n_cluster_iter,
                  filter_thre, n_expand, level + 1, caseolap, local_embedding)
def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,
          n_expand, level, caseolap=True, local_embedding=True):
    """Recursively expand the taxonomy node `parent` located in `node_dir`.

    Clusters the node's documents into `n_cluster` child topics, optionally
    refines the cluster keywords with CaseOLAP ranking, prepares embeddings
    for the child level, and recurses into each child until MAX_LEVEL.

    Args:
        input_dir: directory holding the global corpus files.
        node_dir: working directory of the current node (trailing '/').
        n_cluster: number of child clusters.
        parent: name of the current node.
        n_cluster_iter: max iterations of adaptive clustering.
        filter_thre: keyword-filtering threshold for rank-phrase.
        n_expand: expansion size for local embedding training.
        level: current depth; recursion stops once level > MAX_LEVEL.
        caseolap: if False, skip the CaseOLAP refinement loop.
        local_embedding: if False, children reuse this node's embeddings
            via symlink instead of training local ones.
    """
    if level > MAX_LEVEL:
        return
    print('============================= Running level ', level, ' and node ', parent, '=============================')
    start = time.time()
    df = DataFiles(input_dir, node_dir)
    ## TODO: Everytime we need to read-in the whole corpus, which can be slow.
    print('mylog df.doc_file: ' + df.doc_file)
    full_data = DataSet(df.embedding_file, df.doc_file)
    # print('mylog type(full_data): ' + str(type(full_data)))
    end = time.time()
    print('[Main] Done reading the full data using time %s seconds' % (end - start))

    # filter the keywords
    if caseolap is False:
        try:
            children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,
                                      df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file)
        except Exception as err:
            # BUG FIX: was a bare `except:` that also swallowed SystemExit /
            # KeyboardInterrupt and hid the actual failure reason.
            print('Clustering not finished.')
            print('[Error]: ' + str(err))
            return
        copyfile(df.seed_keyword_file, df.filtered_keyword_file)
    else:
        ## Adaptive Clustering, maximal n_cluster_iter iterations
        # renamed `iter` -> `iteration`: it shadowed the builtin
        for iteration in range(n_cluster_iter):
            if iteration > 0:
                # feed the previous iteration's filtered keywords back in
                df.seed_keyword_file = df.filtered_keyword_file
            try:
                children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,
                                          df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file)
            except Exception as err:
                print('Clustering not finished.')
                print('[Error]: ' + str(err))
                return
            start = time.time()
            main_caseolap(df.link_file, df.doc_membership_file, df.cluster_keyword_file, df.caseolap_keyword_file)
            main_rank_phrase(df.caseolap_keyword_file, df.filtered_keyword_file, filter_thre)
            end = time.time()
            print("[Main] Finish running CaseOALP using %s (seconds)" % (end - start))

    # prepare the embedding for child level
    # NOTE(review): if n_cluster_iter == 0 in the caseolap branch, `children`
    # is never assigned and the loops below raise NameError — confirm callers
    # always pass n_cluster_iter >= 1.
    if level < MAX_LEVEL:
        if local_embedding is False:
            src_file = node_dir + 'embeddings.txt'
            for child in children:
                tgt_file = node_dir + child + '/embeddings.txt'
                # copyfile(src_file, tgt_file)
                symlink(src_file, tgt_file)
        else:
            start = time.time()
            main_local_embedding(node_dir, df.doc_file, df.index_file, parent, n_expand)
            end = time.time()
            print("[Main] Finish running local embedding training using %s (seconds)" % (end - start))
    for child in children:
        recur(input_dir, node_dir + child + '/', n_cluster, child, n_cluster_iter,
              filter_thre, n_expand, level + 1, caseolap, local_embedding)
# Script-style diarization driver: SAD -> MFCC -> BIC -> clustering ->
# per-cluster UBM adaptation. Mirrors the stages of diarization() above.
# NOTE(review): `out` is not defined in this chunk — presumably the output
# directory set earlier in the file; confirm before running.
wavname,ubmname = tools.read_input()
basename = tools.gen_uid(wavname)
attr = tools.gen_attr(out,basename,wavname)
# SAD
sad.run_sad(attr)
# MFCC
feat.run_mfcc(attr)
# BIC
bic.run_bic(attr,'uniform')
# CLUSTERING
cluster.run_clustering(attr)
# Pick top clusters
labels, segment_starts,segment_ends = tools.read_segs(attr['cluster'])
# Hard-coded number of clusters to keep (speakers expected in the audio).
n_spkrs = 4
top_n = tools.top_n_clusters(labels, segment_starts,segment_ends,n_spkrs)
# Adapt UBM for each cluster.
cluster_gmms = {}
for i in top_n:
    # NOTE(review): this rebinds `cluster`, shadowing the `cluster` module
    # called above — any later use of the module in this script would break.
    cluster = 'C%s'%(str(i))
    gmmname = gmm.adapt(attr,cluster,ubmname)
    cluster_gmms[cluster] = gmmname
# Resegmentation
# NOTE(review): the chunk appears truncated here — the resegmentation step
# that should follow this comment is not visible in this view.
def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,
          n_expand, level, caseolap=True, local_embedding=True):
    """Recursively expand the taxonomy node `parent` located in `node_dir`.

    Clusters the node's documents into `n_cluster` child topics, optionally
    refines the cluster keywords with CaseOLAP ranking, frees the in-memory
    corpus, prepares embeddings for the child level, and recurses into each
    child until MAX_LEVEL is exceeded.

    Args:
        input_dir: directory holding the global corpus files.
        node_dir: working directory of the current node (trailing '/').
        n_cluster: number of child clusters.
        parent: name of the current node.
        n_cluster_iter: max iterations of adaptive clustering.
        filter_thre: keyword-filtering threshold for rank-phrase.
        n_expand: expansion size for local embedding training.
        level: current depth; recursion stops once level > MAX_LEVEL.
        caseolap: if False, skip the CaseOLAP refinement loop.
        local_embedding: if False, children reuse this node's embeddings
            via symlink instead of training local ones.
    """
    if level > MAX_LEVEL:
        return
    print('============================= Running level ', level, ' and node ', parent, '=============================')
    start = time.time()
    df = DataFiles(input_dir, node_dir)
    # df.embedding_file: node_dir/embeddings.txt
    # df.doc_file: input_dir/papers.txt
    full_data = DataSet(df.embedding_file, df.doc_file)
    end = time.time()
    print('[Main] Done reading the full data using time %s seconds' % (end - start))

    # filter the keywords
    if caseolap is False:
        try:
            children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,
                                      df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file)
        except Exception as err:
            # BUG FIX: was a bare `except:` that also swallowed SystemExit /
            # KeyboardInterrupt and hid the actual failure reason.
            print('Clustering not finished, please check: if caseolap is False.')
            print('[Error]: ' + str(err))
            return
        copyfile(df.seed_keyword_file, df.filtered_keyword_file)
    else:
        # Adaptive Clustering, maximal n_cluster_iter iterations
        # renamed `iter` -> `iteration`: it shadowed the builtin
        for iteration in range(n_cluster_iter):
            if iteration > 0:
                # feed the previous iteration's filtered keywords back in
                df.seed_keyword_file = df.filtered_keyword_file
            # The author deliberately removed the try/except here so that
            # clustering failures propagate with a full traceback:
            # try:
            #     children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,
            #                               df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file)
            # except:
            #     print('Clustering not finished, please check: if caseolap is True.')
            #     return
            children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,
                                      df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file)
            start = time.time()
            main_caseolap(df.link_file, df.doc_membership_file, df.cluster_keyword_file, df.caseolap_keyword_file)
            main_rank_phrase(df.caseolap_keyword_file, df.filtered_keyword_file, filter_thre)
            end = time.time()
            print("[Main] Finish running CaseOALP using %s (seconds)" % (end - start))

    # release the corpus before recursing to cap peak memory across levels
    del full_data
    gc.collect()

    # prepare the embedding for child level
    if level < MAX_LEVEL:
        if local_embedding is False:
            src_file = node_dir + 'embeddings.txt'
            for child in children:
                tgt_file = node_dir + child + '/embeddings.txt'
                # copyfile(src_file, tgt_file)
                symlink(src_file, tgt_file)
        else:
            start = time.time()
            main_local_embedding(node_dir, df.doc_file, df.index_file, parent, n_expand)
            end = time.time()
            print("[Main] Finish running local embedding training using %s (seconds)" % (end - start))
    for child in children:
        recur(input_dir, node_dir + child + '/', n_cluster, child, n_cluster_iter,
              filter_thre, n_expand, level + 1, caseolap, local_embedding)