def run(sentence_id, publication_id, fname_prefix, fname, fval):
    """Serialize the features of one sentence as a JSON object string.

    ``fname`` and ``fval`` are parallel sequences: ``fval[i]`` is the value
    of feature ``fname[i]``.  Every feature name is prefixed with
    ``fname_prefix + '_'``; if a name occurs more than once the last value
    wins.

    Yields a single ``(sentence_id, publication_id, feature_json)`` tuple.
    If the features cannot be encoded as JSON, a warning is logged through
    ``plpy.info`` and nothing is yielded.
    """
    features = {}
    for i in range(len(fname)):
        features[fname_prefix + '_' + fname[i]] = fval[i]

    # Only the encoding step is guarded, so that exceptions raised by the
    # consumer of this generator are not mistaken for JSON failures.
    try:
        feature_str = json.dumps(features)
    except Exception as e:
        plpy.info("WARNING: CANNOT CONVERT JSON:", features)
        # str(e) works on every Python version, unlike the deprecated
        # ``e.message`` attribute.
        plpy.info(str(e))
        return

    yield (sentence_id, publication_id, feature_str)
def DFS(lattice_id, edges, nowid, res_array, index_cid_sub, skipping=False):
    """Depth-first generator of N-gram / skip-gram features.

    ``res_array`` holds the candidate ids collected so far and is mutated in
    place (push before recursing, pop afterwards).  ``edges`` maps a
    candidate id to the list of ids that may follow it and ``index_cid_sub``
    maps a candidate id to its position in ``arr_feature``.

    The names ``gram_len``, ``arr_feature``, ``skipset`` and ``plpy`` are not
    parameters; they are read from the enclosing / global scope.

    Yields ``(lattice_id, candidate_id, feature, skipping)`` once for every
    member of each completed N-gram, where ``feature`` is the space-joined
    ``arr_feature`` entries of the N-gram and ``skipping`` tells whether one
    word from ``skipset`` was jumped over while building it.
    """
    # A full N-gram has been collected: emit it for each of its members.
    if len(res_array) == gram_len:
        words = []
        for member in res_array:
            position = index_cid_sub[member]
            if position < len(arr_feature):
                words.append(arr_feature[position])
            else:
                # Deal with range problem
                plpy.info('OUT OF RANGE:' + str(arr_feature[:10]))
        feature = ' '.join(words)
        for member in res_array:
            yield lattice_id, member, feature, skipping
        return

    # Dead end: the current word has no successor.
    if nowid not in edges:
        return

    for jid in edges[nowid]:
        is_skippable = arr_feature[index_cid_sub[jid]] in skipset
        if is_skippable:
            if skipping:
                # rule: can only skip once per N-gram
                continue
            # Skip-gram branch: walk past jid without recording it.
            for item in DFS(lattice_id, edges, jid, res_array,
                            index_cid_sub, skipping=True):
                yield item
        # Regular branch: record jid and keep the current skip state.
        res_array.append(jid)
        for item in DFS(lattice_id, edges, jid, res_array,
                        index_cid_sub, skipping=skipping):
            yield item
        res_array.pop()
def run(publication_id, fname_prefix, fname, fval):
    """Aggregate the features of one publication into a JSON object string.

    ``fname`` and ``fval`` are parallel sequences.  Every feature name is
    prefixed with ``fname_prefix + '_'`` and values that share a name are
    summed (all features are addable: doc stats / N-grams), starting from
    ``0.0``.

    Yields a single ``(publication_id, feature_json)`` tuple.  If the
    features cannot be encoded as JSON, a warning is logged through
    ``plpy.info`` and nothing is yielded.
    """
    features = {}
    for i in range(len(fname)):
        name = fname_prefix + '_' + fname[i]
        features[name] = features.get(name, 0.0) + fval[i]

    # Only the encoding step is guarded, so that exceptions raised by the
    # consumer of this generator are not mistaken for JSON failures.
    try:
        feature_str = json.dumps(features)
    except Exception as e:
        plpy.info("WARNING: CANNOT CONVERT JSON:", features)
        # str(e) works on every Python version, unlike the deprecated
        # ``e.message`` attribute.
        plpy.info(str(e))
        return

    yield (publication_id, feature_str)
def run(lattice_id, starts, ends, arr_feature, candidate_ids, gram_len):
    """Yield N-gram and skip-gram features for the words of a lattice.

    ``starts``, ``ends``, ``arr_feature`` and ``candidate_ids`` are parallel
    sequences describing the lattice words.  Word ``i`` is followed by word
    ``j`` whenever ``starts[j] == ends[i] + 1``.

    Every chain of ``gram_len`` connected words produces one feature string
    (the words' ``arr_feature`` entries joined by single spaces).  The
    feature is yielded once per word of the chain as
    ``(lattice_id, candidate_id, feature, is_skipgram)``.  While building a
    chain, at most one word whose feature is in ``skipset`` may be jumped
    over; such chains are reported with ``is_skipgram=True``.

    Chains are started from every candidate id that has at least one
    successor, in ascending id order.  An empty ``candidate_ids`` only logs
    a message through ``plpy.info``.
    """
    skipset = set(['~SIL', '<s>', '</s>'])

    N = len(candidate_ids)  # number of words
    if N == 0:
        plpy.info('Empty data:' + str(lattice_id))
        return

    # candidate id -> position in the parallel input sequences
    index_cid_sub = {}
    for sub in range(N):
        index_cid_sub[candidate_ids[sub]] = sub

    # start offset -> positions of the words beginning at that offset
    index_start_sub = {}
    for sub in range(N):
        index_start_sub.setdefault(starts[sub], []).append(sub)

    # 1. Build the directed graph: candidate id -> list of successor ids
    #    (duplicate edges are allowed).
    edges = {}
    for i in range(N):
        for j in index_start_sub.get(ends[i] + 1, ()):
            edges.setdefault(candidate_ids[i], []).append(candidate_ids[j])

    # The search helper is created on every call on purpose.  Caching it in
    # the PL/Python ``SD`` dictionary would keep it bound to the
    # ``arr_feature`` / ``gram_len`` / graph of the call that created it.
    def DFS(nowid, res_array, skipping=False):
        # Reached N words: emit the N-gram for each of its members.
        if len(res_array) == gram_len:
            feature_tmp = []
            for cwid in res_array:
                sub = index_cid_sub[cwid]
                if sub >= len(arr_feature):
                    # Deal with range problem
                    plpy.info('OUT OF RANGE:' + str(arr_feature[:10]))
                else:
                    feature_tmp.append(arr_feature[sub])
            feature = ' '.join(feature_tmp)
            for cwid in res_array:
                yield lattice_id, cwid, feature, skipping
            return

        for jid in edges.get(nowid, ()):
            if arr_feature[index_cid_sub[jid]] in skipset:
                if skipping:
                    # rule: can only skip once per N-gram
                    continue
                # Skip-gram branch: walk past jid without recording it.
                for item in DFS(jid, res_array, True):
                    yield item
            # Regular branch: record jid, keep the current skip state.
            res_array.append(jid)
            for item in DFS(jid, res_array, skipping):
                yield item
            res_array.pop()

    # 2. DFS output candidates
    res_array = []
    for startid in sorted(edges):
        res_array.append(startid)
        for item in DFS(startid, res_array):
            yield item
        res_array.pop()
def run(lattice_id, starts, ends, arr_feature, candidate_ids, gram_len):
    """Yield N-gram and skip-gram features for the words of a lattice.

    ``starts``, ``ends``, ``arr_feature`` and ``candidate_ids`` are parallel
    sequences describing the lattice words.  Word ``i`` is followed by word
    ``j`` whenever ``starts[j] == ends[i] + 1``.

    Every chain of ``gram_len`` connected words produces one feature string
    (the words' ``arr_feature`` entries joined by single spaces).  The
    feature is yielded once per word of the chain as
    ``(lattice_id, candidate_id, feature, is_skipgram)``.  While building a
    chain, at most one word whose feature is in ``skipset`` may be jumped
    over; such chains are reported with ``is_skipgram=True``.

    Chains are started from every candidate id that has at least one
    successor, in ascending id order.  An empty ``candidate_ids`` only logs
    a message through ``plpy.info``.
    """
    skipset = set(['~SIL', '<s>', '</s>'])

    N = len(candidate_ids)  # number of words
    if N == 0:
        plpy.info('Empty data:' + str(lattice_id))
        return

    # candidate id -> position in the parallel input sequences
    index_cid_sub = {}
    for sub in range(N):
        index_cid_sub[candidate_ids[sub]] = sub

    # start offset -> positions of the words beginning at that offset
    index_start_sub = {}
    for sub in range(N):
        index_start_sub.setdefault(starts[sub], []).append(sub)

    # 1. Build the directed graph: candidate id -> list of successor ids
    #    (duplicate edges are allowed).
    edges = {}
    for i in range(N):
        for j in index_start_sub.get(ends[i] + 1, ()):
            edges.setdefault(candidate_ids[i], []).append(candidate_ids[j])

    # The search helper is created on every call on purpose.  Caching it in
    # the PL/Python ``SD`` dictionary would keep it bound to the
    # ``arr_feature`` / ``gram_len`` / graph of the call that created it.
    def DFS(nowid, res_array, skipping=False):
        # Reached N words: emit the N-gram for each of its members.
        if len(res_array) == gram_len:
            feature_tmp = []
            for cwid in res_array:
                sub = index_cid_sub[cwid]
                if sub >= len(arr_feature):
                    # Deal with range problem
                    plpy.info('OUT OF RANGE:' + str(arr_feature[:10]))
                else:
                    feature_tmp.append(arr_feature[sub])
            feature = ' '.join(feature_tmp)
            for cwid in res_array:
                yield lattice_id, cwid, feature, skipping
            return

        for jid in edges.get(nowid, ()):
            if arr_feature[index_cid_sub[jid]] in skipset:
                if skipping:
                    # rule: can only skip once per N-gram
                    continue
                # Skip-gram branch: walk past jid without recording it.
                for item in DFS(jid, res_array, True):
                    yield item
            # Regular branch: record jid, keep the current skip state.
            res_array.append(jid)
            for item in DFS(jid, res_array, skipping):
                yield item
            res_array.pop()

    # 2. DFS output candidates
    res_array = []
    for startid in sorted(edges):
        res_array.append(startid)
        for item in DFS(startid, res_array):
            yield item
        res_array.pop()
def run(lattice_id, starts, ends, candidates, candidate_ids, transcript):
    """Label each lattice word by whether an optimal alignment uses it.

    ``starts``, ``ends``, ``candidates`` (the words) and ``candidate_ids``
    are parallel sequences describing the lattice.  Word ``i`` is followed
    by word ``j`` whenever ``starts[j] == ends[i] + 1``.  ``transcript`` is
    the sequence of reference words.

    A dynamic program over the lattice (processed in topological order)
    computes the longest common subsequence between any lattice path and
    the transcript.  Every word that takes part in at least one optimal
    alignment is labeled ``True``; all other words are labeled ``False``.

    Yields ``(lattice_id, candidate_id, label)`` once per distinct candidate
    id, then logs a one-line summary through ``plpy.info``.  An empty
    ``candidate_ids`` only logs a message.
    """

    def orderedLattice(cids, edges):
        """Return the candidate ids in a topological order of ``edges``."""
        indegree = {}
        for cid in cids:
            indegree[cid] = 0
        for src in edges:
            for dst in edges[src]:
                indegree[dst] = indegree.get(dst, 0) + 1
        ready = set()
        for cid in indegree:
            if indegree[cid] == 0:
                ready.add(cid)
        ordered = []
        while ready:
            cid = ready.pop()
            ordered.append(cid)
            for dst in edges.get(cid, ()):
                indegree[dst] -= 1
                if indegree[dst] == 0:
                    ready.add(dst)
        return ordered

    def MatchTranscriptWithLattice(lattice_words, trans_words, edges,
                                   candidate_ids, index_cid_sub):
        """Return ``(best_score, matched_positions)``.

        ``f[i][j]`` is the best number of matches of a lattice path ending
        at word ``i`` against ``trans_words[:j + 1]``; ``path[i][j]`` lists
        every predecessor cell that achieves that score.
        """
        n1 = len(lattice_words)
        n2 = len(trans_words)
        if n1 == 0 or n2 == 0:
            return 0, set()

        # One independent row per lattice word (no shared inner lists).
        f = [[0] * n2 for _ in range(n1)]
        path = [[[] for _ in range(n2)] for _ in range(n1)]

        def relax(ti, tj, score, origin):
            # Keep every origin that reaches the best score of (ti, tj).
            if f[ti][tj] < score:
                f[ti][tj] = score
                path[ti][tj] = [origin]
            elif f[ti][tj] == score:
                path[ti][tj].append(origin)

        # Seed: an alignment may start at any lattice word, not only at the
        # lattice entry points (the first matched word can be preceded by
        # unmatched words).  (-1, -1) marks "match with no predecessor".
        for i in range(n1):
            for j in range(n2):
                if lattice_words[i] == trans_words[j]:
                    f[i][j] = 1
                    path[i][j].append((-1, -1))

        # DP; topological order guarantees that every predecessor of a word
        # is final before the word itself is expanded.
        for i_cid in orderedLattice(candidate_ids, edges):
            i = index_cid_sub[i_cid]
            successors = [index_cid_sub[s] for s in edges[i_cid]]
            for j in range(n2):
                for s in successors:
                    # diagonal: successor matches the next transcript word
                    if j + 1 < n2 and lattice_words[s] == trans_words[j + 1]:
                        relax(s, j + 1, f[i][j] + 1, (i, j))
                    # shift down (to successor)
                    relax(s, j, f[i][j], (i, j))
                # shift right (to next word in transcript)
                if j + 1 < n2:
                    relax(i, j + 1, f[i][j], (i, j))

        # Best score over the lattice end nodes (words without successor).
        endnodes = [index_cid_sub[c] for c in edges if len(edges[c]) == 0]
        maxscore = 0
        besti = []
        for i in endnodes:
            if maxscore < f[i][n2 - 1]:
                maxscore = f[i][n2 - 1]
                besti = [i]
            elif maxscore == f[i][n2 - 1]:
                besti.append(i)

        # Walk back through all optimal paths.  Done with an explicit stack
        # so that long lattices cannot hit the recursion limit.
        matched = set()
        visited = set()
        stack = [(i, n2 - 1) for i in besti]
        while stack:
            cell = stack.pop()
            if cell in visited:
                continue
            visited.add(cell)
            i, j = cell
            if i < 0 or j < 0:  # terminate at the "-1" markers
                continue
            for (pi, pj) in path[i][j]:
                # Only "diagonal" moves (both coordinates change) are matches.
                if pi != i and pj != j:
                    matched.add(i)
                stack.append((pi, pj))
        return maxscore, matched

    N = len(candidate_ids)  # number of words
    if N == 0:
        plpy.info('Empty data:' + str(lattice_id))
        return

    # candidate id -> position in the parallel input sequences
    index_cid_sub = {}
    for sub in range(N):
        index_cid_sub[candidate_ids[sub]] = sub

    # start offset -> positions of the words beginning at that offset
    index_start_sub = {}
    for sub in range(N):
        index_start_sub.setdefault(starts[sub], []).append(sub)

    # 1. Build the directed graph; every candidate id gets an entry so that
    #    end nodes can be recognised by their empty successor list.
    edges = {}
    for cid in index_cid_sub:
        edges[cid] = []
    for i in range(N):
        for j in index_start_sub.get(ends[i] + 1, ()):
            edges[candidate_ids[i]].append(candidate_ids[j])

    # 2. Run the DP and translate matched positions back to candidate ids.
    score, match_subs = MatchTranscriptWithLattice(
        candidates, transcript, edges, candidate_ids, index_cid_sub)
    true_cids = set([candidate_ids[i] for i in match_subs])
    for cid in true_cids:
        yield lattice_id, cid, True
    false_cids = set(candidate_ids).difference(true_cids)
    for cid in false_cids:
        yield lattice_id, cid, False

    plpy.info('[%s] SCORE: %d, matches: %d / %d'
              % (lattice_id, score, len(true_cids), len(candidate_ids)))
def run(lattice_id, starts, ends, candidates, candidate_ids, transcript):
    """Yield the words of the single highest-scoring path of a lattice.

    ``starts``, ``ends``, ``candidates`` (the words) and ``candidate_ids``
    are parallel sequences describing the lattice.  Word ``i`` is followed
    by word ``j`` whenever ``starts[j] == ends[i] + 1``.

    Each word is weighted by ``expectation - 0.5``; when the expectation
    sequence is empty every word weighs ``0.5``.  The score of a path is the
    sum of the weights of its words.

    NOTE(review): ``expectations`` is not a parameter of this function and
    ``transcript`` is never read.  Presumably ``expectations`` is provided
    by the runtime as a global -- confirm against the caller.

    Yields one ``(lattice_id, words)`` tuple, or only logs through
    ``plpy.info`` when ``candidate_ids`` is empty.
    """

    def orderedLattice(cids, edges):
        """Return the candidate ids in a topological order of ``edges``."""
        indegree = {}
        for cid in cids:
            indegree[cid] = 0
        for src in edges:
            for dst in edges[src]:
                if dst not in indegree:
                    indegree[dst] = 0
                indegree[dst] += 1
        ready = set()
        for cid in indegree:
            if indegree[cid] == 0:
                ready.add(cid)
        ordered = []
        while ready:
            cid = ready.pop()
            ordered.append(cid)
            for dst in edges.get(cid, ()):
                indegree[dst] -= 1
                if indegree[dst] == 0:
                    ready.add(dst)
        return ordered

    def FindBestPath(lattice_words, edges, candidate_ids, index_cid_sub,
                     expectations, start_subs):
        """Return ``(score, words)`` of one best path (ties: first found)."""
        total = len(lattice_words)
        if total == 0:
            return 0, []

        # score[k]: best sum of weights of a path ending at position k;
        # the large negative value marks "not reached yet".
        score = [-10000000.0] * total
        previous = [-1] * total  # back pointer, -1 = path starts here
        for k in start_subs:
            score[k] = 0.0

        for cid in orderedLattice(candidate_ids, edges):
            here = index_cid_sub[cid]
            score[here] += expectations[here]  # add the word's own weight
            for next_cid in edges[cid]:
                there = index_cid_sub[next_cid]
                if score[there] < score[here]:
                    score[there] = score[here]
                    previous[there] = here

        # Pick one best end node (a word without successor).  Only end
        # nodes scoring above 0 are considered.
        # NOTE(review): if every path scores <= 0 no path is returned --
        # confirm that this is intended.
        maxscore = 0
        besti = -1
        for cid in edges:
            if len(edges[cid]) != 0:
                continue
            k = index_cid_sub[cid]
            if maxscore < score[k]:
                maxscore = score[k]
                besti = k

        # Follow the back pointers, then restore lattice order (positions
        # are sorted by "start, end").
        bestpath = []
        k = besti
        while k != -1:
            bestpath.append(k)
            k = previous[k]
        bestpath.sort()
        return maxscore, [lattice_words[k] for k in bestpath]

    N = len(candidate_ids)  # number of words
    if N == 0:
        plpy.info('Empty data:' + str(lattice_id))
        return

    # candidate id -> position in the parallel input sequences
    index_cid_sub = {}
    for sub, cid in enumerate(candidate_ids):
        index_cid_sub[cid] = sub

    # start offset -> positions of the words beginning at that offset
    index_start_sub = {}
    for sub in range(N):
        index_start_sub.setdefault(starts[sub], []).append(sub)

    # 1. Build the directed graph; every candidate id gets an entry so that
    #    end nodes can be recognised by their empty successor list.
    edges = {}
    for cid in index_cid_sub:
        edges[cid] = []
    indegree = [0] * N
    for i in range(N):
        following = ends[i] + 1
        if following not in index_start_sub:
            continue
        for j in index_start_sub[following]:
            edges.setdefault(candidate_ids[i], []).append(candidate_ids[j])
            indegree[j] += 1
    start_subs = [i for i, degree in enumerate(indegree) if degree == 0]

    # 2. Return DP result
    if len(expectations) == 0:
        exp = [0.5] * len(candidates)
    else:
        exp = [e - 0.5 for e in expectations]
    score, words = FindBestPath(candidates, edges, candidate_ids,
                                index_cid_sub, exp, start_subs)
    yield lattice_id, words
def run(lattice_id, starts, ends, candidates, candidate_ids, transcript):
    """Label lattice words by matching every lattice path to the transcript.

    ``starts``, ``ends``, ``candidates`` (the words) and ``candidate_ids``
    are parallel sequences describing the lattice.  Word ``i`` is followed
    by word ``j`` whenever ``starts[j] == ends[i] + 1``.

    Every path from a word without predecessor to a word without successor
    is enumerated and aligned with ``transcript`` by a longest-common-
    subsequence DP.  Words matched on any path that reaches the best score
    are labeled ``True``; all other words are labeled ``False``.

    Yields ``(lattice_id, candidate_id, label)`` once per distinct candidate
    id, then logs a summary through ``plpy.info``.  An empty
    ``candidate_ids`` only logs a message.
    """

    def Match(arr1, arr2):
        """Return ``(score, match_pairs, path)`` for two word sequences.

        ``match_pairs`` lists ``(i, j)`` index pairs that are matched on
        some optimal alignment; it may contain duplicates.
        """
        rows = len(arr1)
        cols = len(arr2)
        if rows == 0 or cols == 0:
            return 0, [], []

        # One independent row per element of arr1 (no shared inner lists).
        f = [[0] * cols for _ in range(rows)]
        path = [[[] for _c in range(cols)] for _r in range(rows)]

        def relax(ti, tj, score, origin):
            # Keep every origin that reaches the best score of (ti, tj).
            if f[ti][tj] < score:
                f[ti][tj] = score
                path[ti][tj] = [origin]
            elif f[ti][tj] == score:
                path[ti][tj].append(origin)

        # Init: corner cell, first column, first row.
        if arr1[0] == arr2[0]:
            f[0][0] = 1
            path[0][0].append((-1, -1))  # match
        for i in range(1, rows):
            if arr1[i] == arr2[0]:
                f[i][0] = 1
                path[i][0].append((i - 1, -1))
            elif f[i - 1][0] == 1:
                f[i][0] = 1
                path[i][0].append((i - 1, 0))
        for j in range(1, cols):
            if arr1[0] == arr2[j]:
                f[0][j] = 1
                path[0][j].append((-1, j - 1))
            elif f[0][j - 1] == 1:
                f[0][j] = 1
                path[0][j].append((0, j - 1))

        # DP: push the score of every cell to its three neighbours.
        for i in range(rows):
            for j in range(cols):
                here = (i, j)
                has_row = i + 1 < rows
                has_col = j + 1 < cols
                if has_row and has_col and arr1[i + 1] == arr2[j + 1]:
                    relax(i + 1, j + 1, f[i][j] + 1, here)
                if has_row:
                    relax(i + 1, j, f[i][j], here)
                if has_col:
                    relax(i, j + 1, f[i][j], here)

        # Walk back from the last cell; only "diagonal" steps are matches.
        pairs = []
        seen = set()

        def backtrack(i, j):
            if (i, j) in seen:  # prevent multiple adds
                return
            seen.add((i, j))
            if i < 0 or j < 0:
                return
            for (pi, pj) in path[i][j]:
                if pi == i - 1 and pj == j - 1:
                    pairs.append((i, j))
                backtrack(pi, pj)

        backtrack(rows - 1, cols - 1)
        pairs.reverse()
        return f[rows - 1][cols - 1], pairs, path

    def DFS(nowid, trail, target_ids):
        """Yield a copy of ``trail`` for every path reaching a target id."""
        if nowid in target_ids:
            yield list(trail)
            return
        if nowid in edges:
            for following in edges[nowid]:
                trail.append(following)
                for found in DFS(following, trail, target_ids):
                    yield found
                trail.pop()

    N = len(candidate_ids)  # number of words
    if N == 0:
        plpy.info('Empty data:' + str(lattice_id))
        return

    # candidate id -> position in the parallel input sequences
    index_cid_sub = {}
    for sub, cid in enumerate(candidate_ids):
        index_cid_sub[cid] = sub

    # start offset -> positions of the words beginning at that offset
    index_start_sub = {}
    for sub in range(N):
        index_start_sub.setdefault(starts[sub], []).append(sub)

    # 1. Build the directed graph: candidate id -> list of successor ids.
    edges = {}
    indegree = [0] * N
    for i in range(N):
        following = ends[i] + 1
        if following not in index_start_sub:
            continue
        for j in index_start_sub[following]:
            edges.setdefault(candidate_ids[i], []).append(candidate_ids[j])
            indegree[j] += 1

    # 2. Enumerate all start-to-end paths and keep the best matches.
    end_ids = set([cid for cid in candidate_ids if cid not in edges])
    start_ids = set([candidate_ids[i] for i in range(N) if indegree[i] == 0])

    pathnum = 0
    maxscore = 0
    bestpath_cids = set()
    res_array = []
    for startid in start_ids:
        res_array.append(startid)
        for path in DFS(startid, res_array, end_ids):
            pathnum += 1
            # "path" holds candidate ids; positions 0, 1, 2... index both
            # "path" and "path_words".
            path_words = [candidates[index_cid_sub[cid]] for cid in path]
            score, match_pairs, _unused = Match(path_words, transcript)
            match_cids = set([path[pair[0]] for pair in match_pairs])
            if score > maxscore:
                # strictly better: restart from this path's matches
                maxscore = score
                bestpath_cids = match_cids
            elif score == maxscore:
                # equally good: merge all possible candidate ids
                bestpath_cids.update(match_cids)
        res_array.pop()  # search for different starts

    true_cids = bestpath_cids
    for cid in true_cids:
        yield lattice_id, cid, True
    false_cids = set(candidate_ids).difference(true_cids)
    for cid in false_cids:
        yield lattice_id, cid, False

    plpy.info('%d paths, true labels: %d / %d'
              % (pathnum, len(true_cids), len(true_cids) + len(false_cids)))
def run(lattice_id, starts, ends, candidates, candidate_ids, transcript):
    """Label lattice words by matching every lattice path to the transcript.

    ``starts``, ``ends``, ``candidates`` (the words) and ``candidate_ids``
    are parallel sequences describing the lattice.  Word ``i`` is followed
    by word ``j`` whenever ``starts[j] == ends[i] + 1``.

    Every path from a word without predecessor to a word without successor
    is enumerated and aligned with ``transcript`` by a longest-common-
    subsequence DP.  Words matched on any path that reaches the best score
    are labeled ``True``; all other words are labeled ``False``.

    Yields ``(lattice_id, candidate_id, label)`` once per distinct candidate
    id, then logs a summary through ``plpy.info``.  An empty
    ``candidate_ids`` only logs a message.
    """

    def Match(arr1, arr2):
        """Return ``(score, match_pairs, path)`` for two word sequences.

        ``match_pairs`` lists ``(i, j)`` index pairs that are matched on
        some optimal alignment; it may contain duplicates.
        """
        rows = len(arr1)
        cols = len(arr2)
        if rows == 0 or cols == 0:
            return 0, [], []

        # One independent row per element of arr1 (no shared inner lists).
        f = [[0] * cols for _ in range(rows)]
        path = [[[] for _c in range(cols)] for _r in range(rows)]

        def relax(ti, tj, score, origin):
            # Keep every origin that reaches the best score of (ti, tj).
            if f[ti][tj] < score:
                f[ti][tj] = score
                path[ti][tj] = [origin]
            elif f[ti][tj] == score:
                path[ti][tj].append(origin)

        # Init: corner cell, first column, first row.
        if arr1[0] == arr2[0]:
            f[0][0] = 1
            path[0][0].append((-1, -1))  # match
        for i in range(1, rows):
            if arr1[i] == arr2[0]:
                f[i][0] = 1
                path[i][0].append((i - 1, -1))
            elif f[i - 1][0] == 1:
                f[i][0] = 1
                path[i][0].append((i - 1, 0))
        for j in range(1, cols):
            if arr1[0] == arr2[j]:
                f[0][j] = 1
                path[0][j].append((-1, j - 1))
            elif f[0][j - 1] == 1:
                f[0][j] = 1
                path[0][j].append((0, j - 1))

        # DP: push the score of every cell to its three neighbours.
        for i in range(rows):
            for j in range(cols):
                here = (i, j)
                has_row = i + 1 < rows
                has_col = j + 1 < cols
                if has_row and has_col and arr1[i + 1] == arr2[j + 1]:
                    relax(i + 1, j + 1, f[i][j] + 1, here)
                if has_row:
                    relax(i + 1, j, f[i][j], here)
                if has_col:
                    relax(i, j + 1, f[i][j], here)

        # Walk back from the last cell; only "diagonal" steps are matches.
        pairs = []
        seen = set()

        def backtrack(i, j):
            if (i, j) in seen:  # prevent multiple adds
                return
            seen.add((i, j))
            if i < 0 or j < 0:
                return
            for (pi, pj) in path[i][j]:
                if pi == i - 1 and pj == j - 1:
                    pairs.append((i, j))
                backtrack(pi, pj)

        backtrack(rows - 1, cols - 1)
        pairs.reverse()
        return f[rows - 1][cols - 1], pairs, path

    def DFS(nowid, trail, target_ids):
        """Yield a copy of ``trail`` for every path reaching a target id."""
        if nowid in target_ids:
            yield list(trail)
            return
        if nowid in edges:
            for following in edges[nowid]:
                trail.append(following)
                for found in DFS(following, trail, target_ids):
                    yield found
                trail.pop()

    N = len(candidate_ids)  # number of words
    if N == 0:
        plpy.info('Empty data:' + str(lattice_id))
        return

    # candidate id -> position in the parallel input sequences
    index_cid_sub = {}
    for sub, cid in enumerate(candidate_ids):
        index_cid_sub[cid] = sub

    # start offset -> positions of the words beginning at that offset
    index_start_sub = {}
    for sub in range(N):
        index_start_sub.setdefault(starts[sub], []).append(sub)

    # 1. Build the directed graph: candidate id -> list of successor ids.
    edges = {}
    indegree = [0] * N
    for i in range(N):
        following = ends[i] + 1
        if following not in index_start_sub:
            continue
        for j in index_start_sub[following]:
            edges.setdefault(candidate_ids[i], []).append(candidate_ids[j])
            indegree[j] += 1

    # 2. Enumerate all start-to-end paths and keep the best matches.
    end_ids = set([cid for cid in candidate_ids if cid not in edges])
    start_ids = set([candidate_ids[i] for i in range(N) if indegree[i] == 0])

    pathnum = 0
    maxscore = 0
    bestpath_cids = set()
    res_array = []
    for startid in start_ids:
        res_array.append(startid)
        for path in DFS(startid, res_array, end_ids):
            pathnum += 1
            # "path" holds candidate ids; positions 0, 1, 2... index both
            # "path" and "path_words".
            path_words = [candidates[index_cid_sub[cid]] for cid in path]
            score, match_pairs, _unused = Match(path_words, transcript)
            match_cids = set([path[pair[0]] for pair in match_pairs])
            if score > maxscore:
                # strictly better: restart from this path's matches
                maxscore = score
                bestpath_cids = match_cids
            elif score == maxscore:
                # equally good: merge all possible candidate ids
                bestpath_cids.update(match_cids)
        res_array.pop()  # search for different starts

    true_cids = bestpath_cids
    for cid in true_cids:
        yield lattice_id, cid, True
    false_cids = set(candidate_ids).difference(true_cids)
    for cid in false_cids:
        yield lattice_id, cid, False

    plpy.info('%d paths, true labels: %d / %d'
              % (pathnum, len(true_cids), len(true_cids) + len(false_cids)))
def run(lattice_id, starts, ends, candidates, candidate_ids, transcript):
    """Yield the words of the single highest-scoring path of a lattice.

    ``starts``, ``ends``, ``candidates`` (the words) and ``candidate_ids``
    are parallel sequences describing the lattice.  Word ``i`` is followed
    by word ``j`` whenever ``starts[j] == ends[i] + 1``.

    Each word is weighted by ``expectation - 0.5``; when the expectation
    sequence is empty every word weighs ``0.5``.  The score of a path is the
    sum of the weights of its words.

    NOTE(review): ``expectations`` is not a parameter of this function and
    ``transcript`` is never read.  Presumably ``expectations`` is provided
    by the runtime as a global -- confirm against the caller.

    Yields one ``(lattice_id, words)`` tuple, or only logs through
    ``plpy.info`` when ``candidate_ids`` is empty.
    """

    def orderedLattice(cids, edges):
        """Return the candidate ids in a topological order of ``edges``."""
        indegree = {}
        for cid in cids:
            indegree[cid] = 0
        for src in edges:
            for dst in edges[src]:
                if dst not in indegree:
                    indegree[dst] = 0
                indegree[dst] += 1
        ready = set()
        for cid in indegree:
            if indegree[cid] == 0:
                ready.add(cid)
        ordered = []
        while ready:
            cid = ready.pop()
            ordered.append(cid)
            for dst in edges.get(cid, ()):
                indegree[dst] -= 1
                if indegree[dst] == 0:
                    ready.add(dst)
        return ordered

    def FindBestPath(lattice_words, edges, candidate_ids, index_cid_sub,
                     expectations, start_subs):
        """Return ``(score, words)`` of one best path (ties: first found)."""
        total = len(lattice_words)
        if total == 0:
            return 0, []

        # score[k]: best sum of weights of a path ending at position k;
        # the large negative value marks "not reached yet".
        score = [-10000000.0] * total
        previous = [-1] * total  # back pointer, -1 = path starts here
        for k in start_subs:
            score[k] = 0.0

        for cid in orderedLattice(candidate_ids, edges):
            here = index_cid_sub[cid]
            score[here] += expectations[here]  # add the word's own weight
            for next_cid in edges[cid]:
                there = index_cid_sub[next_cid]
                if score[there] < score[here]:
                    score[there] = score[here]
                    previous[there] = here

        # Pick one best end node (a word without successor).  Only end
        # nodes scoring above 0 are considered.
        # NOTE(review): if every path scores <= 0 no path is returned --
        # confirm that this is intended.
        maxscore = 0
        besti = -1
        for cid in edges:
            if len(edges[cid]) != 0:
                continue
            k = index_cid_sub[cid]
            if maxscore < score[k]:
                maxscore = score[k]
                besti = k

        # Follow the back pointers, then restore lattice order (positions
        # are sorted by "start, end").
        bestpath = []
        k = besti
        while k != -1:
            bestpath.append(k)
            k = previous[k]
        bestpath.sort()
        return maxscore, [lattice_words[k] for k in bestpath]

    N = len(candidate_ids)  # number of words
    if N == 0:
        plpy.info('Empty data:' + str(lattice_id))
        return

    # candidate id -> position in the parallel input sequences
    index_cid_sub = {}
    for sub, cid in enumerate(candidate_ids):
        index_cid_sub[cid] = sub

    # start offset -> positions of the words beginning at that offset
    index_start_sub = {}
    for sub in range(N):
        index_start_sub.setdefault(starts[sub], []).append(sub)

    # 1. Build the directed graph; every candidate id gets an entry so that
    #    end nodes can be recognised by their empty successor list.
    edges = {}
    for cid in index_cid_sub:
        edges[cid] = []
    indegree = [0] * N
    for i in range(N):
        following = ends[i] + 1
        if following not in index_start_sub:
            continue
        for j in index_start_sub[following]:
            edges.setdefault(candidate_ids[i], []).append(candidate_ids[j])
            indegree[j] += 1
    start_subs = [i for i, degree in enumerate(indegree) if degree == 0]

    # 2. Return DP result
    if len(expectations) == 0:
        exp = [0.5] * len(candidates)
    else:
        exp = [e - 0.5 for e in expectations]
    score, words = FindBestPath(candidates, edges, candidate_ids,
                                index_cid_sub, exp, start_subs)
    yield lattice_id, words