Beispiel #1
0
def run(sentence_id, publication_id, fname_prefix, fname, fval):
  """Yield one (sentence_id, publication_id, features_json) row.

  Each feature is stored under the key fname_prefix + '_' + fname[i] with
  the value fval[i]; when a key occurs more than once the last value wins.
  The resulting dict is serialized with json.dumps.

  If serialization fails, the failure is reported through plpy.info and
  nothing is yielded.
  """
  features = {}
  for i, short_name in enumerate(fname):
    features[fname_prefix + '_' + short_name] = fval[i]

  # Only the serialization is guarded.  The yield lives in the else branch
  # so that exceptions raised at the yield point (e.g. GeneratorExit when
  # the consumer closes the generator) are not mistaken for JSON errors.
  try:
    feature_str = json.dumps(features)
  except Exception as e:
    plpy.info("WARNING: CANNOT CONVERT JSON:", features)
    plpy.info(str(e))
  else:
    yield (sentence_id, publication_id, feature_str)
Beispiel #2
0
def run(sentence_id, publication_id, fname_prefix, fname, fval):
    """Yield one (sentence_id, publication_id, features_json) row.

    The feature dict maps fname_prefix + '_' + fname[i] to fval[i] (a
    repeated name keeps its last value) and is serialized with json.dumps.

    When json.dumps raises, the problem is reported through plpy.info and
    the generator finishes without yielding a row.
    """
    features = {}
    for i, short_name in enumerate(fname):
        features[fname_prefix + '_' + short_name] = fval[i]

    try:
        feature_str = json.dumps(features)
    except Exception as e:
        # The exception must be bound to a name to be reported; str(e) is
        # used because it is available on every exception type.
        plpy.info("WARNING: CANNOT CONVERT JSON:", features)
        plpy.info(str(e))
    else:
        # Yield outside the guarded region so consumer-side exceptions
        # (including GeneratorExit) are never swallowed as JSON errors.
        yield (sentence_id, publication_id, feature_str)
    def DFS(lattice_id, edges, nowid, res_array, index_cid_sub, skipping=False):
      """Recursively extend res_array along edges and yield n-gram rows.

      gram_len, arr_feature, skipset and plpy are free variables taken from
      an enclosing scope that is not part of this fragment.

      res_array holds the candidate ids collected so far and is mutated in
      place (append before recursing, pop afterwards).  Once it contains
      gram_len ids, one (lattice_id, candidate_id, ngram, skipping) tuple
      is yielded per collected id, where ngram is the space-joined
      arr_feature entries of those ids.
      """
      # Reaches N: a complete N-gram has been generated.
      if len(res_array) == gram_len:
        words = []
        for member in res_array:
          pos = index_cid_sub[member]
          if pos < len(arr_feature):
            words.append(arr_feature[pos])
          else:
            # Deal with range problem: report it and leave the word out.
            plpy.info('OUT OF RANGE:' + str(arr_feature[:10]))
        ngram = ' '.join(words)
        for member in res_array:
          yield lattice_id, member, ngram, skipping
        return

      if nowid not in edges:
        return

      for succ in edges[nowid]:
        if arr_feature[index_cid_sub[succ]] in skipset:
          if skipping:
            # rule: can only skip once, so this branch is abandoned.
            continue
          # Skipgram branch: move past succ without recording it.
          for row in DFS(lattice_id, edges, succ, res_array, index_cid_sub, skipping=True):
            yield row
        # Branch that records succ; it keeps the current skipping state.
        res_array.append(succ)
        for row in DFS(lattice_id, edges, succ, res_array, index_cid_sub, skipping=skipping):
          yield row
        res_array.pop()
def run(publication_id, fname_prefix, fname, fval):
  """Yield one (publication_id, features_json) row with summed features.

  All features are additive (doc stats / Ngram): values that share the key
  fname_prefix + '_' + fname[i] are accumulated, starting from 0.0, and
  the resulting dict is serialized with json.dumps.

  If serialization fails, the failure is reported through plpy.info and
  nothing is yielded.
  """
  features = {}
  for i, short_name in enumerate(fname):
    name = fname_prefix + '_' + short_name
    features[name] = features.get(name, 0.0) + fval[i]

  try:
    feature_str = json.dumps(features)
  except Exception as e:
    plpy.info("WARNING: CANNOT CONVERT JSON:", features)
    # str(e) instead of e.message: the attribute is deprecated on Python 2
    # and does not exist on Python 3.
    plpy.info(str(e))
  else:
    # Kept out of the try body so only json.dumps errors are handled here.
    yield (publication_id, feature_str)
def run(publication_id, fname_prefix, fname, fval):
    """Yield one (publication_id, features_json) row with summed features.

    Values whose key fname_prefix + '_' + fname[i] coincide are added
    together (the running total starts at 0.0); the dict is then
    serialized with json.dumps.

    A serialization error is reported through plpy.info and results in no
    row being yielded.
    """
    # All features are addable: doc stats / Ngram
    features = {}
    for i, short_name in enumerate(fname):
        name = fname_prefix + '_' + short_name
        features[name] = features.get(name, 0.0) + fval[i]

    try:
        feature_str = json.dumps(features)
    except Exception as e:
        plpy.info("WARNING: CANNOT CONVERT JSON:", features)
        # e.message is deprecated on Python 2 and missing on Python 3.
        plpy.info(str(e))
    else:
        # Only json.dumps is guarded; the yield stays outside the try.
        yield (publication_id, feature_str)
def run(lattice_id, starts, ends, arr_feature, candidate_ids, gram_len):
  """Yield (lattice_id, candidate_id, ngram, is_skipgram) rows for a lattice.

  Candidate i is linked to candidate j whenever starts[j] == ends[i] + 1.
  Paths are started from every candidate that has at least one outgoing
  edge (in sorted id order) and extended until they hold gram_len
  candidates.  For each such path one row is yielded per candidate on it;
  ngram is the space-joined arr_feature entries of the path.

  A word in the skip set ('~SIL', '<s>', '</s>') is both kept (regular
  n-gram) and stepped over (skip-gram, is_skipgram=True).  Only one word
  may be stepped over per path: after a skip, a branch that reaches
  another skip-set word is abandoned.

  Args:
    lattice_id: value copied into every output row.
    starts, ends: per-candidate positions, parallel to candidate_ids.
    arr_feature: per-candidate word/feature, parallel to candidate_ids.
    candidate_ids: ids of the lattice candidates.
    gram_len: number of candidates per emitted n-gram.
  """
  # Helpers that do not depend on the arguments of a particular call are
  # cached in SD (presumably PL/Python's per-function static dictionary).
  if 'AddEdge' in SD:
    AddEdge = SD['AddEdge']
  else:
    # allow multiple same edges
    def AddEdge(edges, f, t):
      if f not in edges:
        edges[f] = []
      edges[f].append(t)
    SD['AddEdge'] = AddEdge

  if 'skipset' in SD:
    skipset = SD['skipset']
  else:
    skipset = set(['~SIL', '<s>', '</s>'])
    SD['skipset'] = skipset

  # DFS closes over this call's arr_feature and gram_len, so it is rebuilt
  # on every call.  Caching it in SD would make later calls silently reuse
  # the data of the first call.
  def DFS(lattice_id, edges, nowid, res_array, index_cid_sub, skipping=False):
    # Reaches N: a complete N-gram has been generated.
    if len(res_array) == gram_len:
      feature_tmp = []
      for cwid in res_array:
        sub = index_cid_sub[cwid]
        # Deal with range problem
        if sub >= len(arr_feature):
          plpy.info('OUT OF RANGE:' + str(arr_feature[:10]))
        else:
          feature_tmp.append(arr_feature[sub])
      feature = ' '.join(feature_tmp)
      for cwid in res_array:
        yield lattice_id, cwid, feature, skipping
      return

    if nowid not in edges:
      return

    for jid in edges[nowid]:
      if arr_feature[index_cid_sub[jid]] in skipset:
        if skipping:
          continue  # rule: can only skip once...
        # Skipgram: step over jid without adding it to the path.
        for item in DFS(lattice_id, edges, jid, res_array, index_cid_sub, skipping=True):
          yield item
      # Keep jid on the path; the skipping state is inherited unchanged.
      res_array.append(jid)
      for item in DFS(lattice_id, edges, jid, res_array, index_cid_sub, skipping=skipping):
        yield item
      res_array.pop()

  ################## MAIN FUNCTION ####################

  N = len(candidate_ids)  # number of words
  if N == 0:
    plpy.info('Empty data:'+str(lattice_id))
    return

  # candidate id -> position in the parallel input arrays
  index_cid_sub = {}
  for sub in range(N):
    index_cid_sub[candidate_ids[sub]] = sub

  # start position -> positions of all candidates starting there
  index_start_sub = {}
  for sub in range(N):
    index_start_sub.setdefault(starts[sub], []).append(sub)

  ######### 1. build directed graph

  edges = {}  # cand_word_id1 : [cand_word_id2]
  for i in range(N):
    for j in index_start_sub.get(ends[i] + 1, ()):
      AddEdge(edges, candidate_ids[i], candidate_ids[j])

  # NOTE(review): debug output kept from the original implementation,
  # written in a form that parses on both Python 2 and Python 3.
  print(edges)

  ######## 2. DFS output candidates
  res_array = []
  for startid in sorted(edges.keys()):
    res_array.append(startid)
    for item in DFS(lattice_id, edges, startid, res_array, index_cid_sub):
      yield item
    res_array.pop()
Beispiel #7
0
        def DFS(lattice_id, edges, nowid, res_array, index_cid_sub, skipping=False):
            """Depth-first n-gram generator over the candidate graph.

            Relies on gram_len, arr_feature, skipset and plpy from an
            enclosing scope that is outside this fragment.

            res_array is the shared, in-place path of candidate ids.  When
            it reaches gram_len entries, a tuple
            (lattice_id, candidate_id, ngram, skipping) is yielded for each
            id on the path; ngram joins the arr_feature entries of the path
            with single spaces.
            """
            # Reaches N, finished generating an N-gram.
            if len(res_array) == gram_len:
                path_words = []
                for cid in res_array:
                    position = index_cid_sub[cid]
                    if position < len(arr_feature):
                        path_words.append(arr_feature[position])
                    else:
                        # Deal with range problem: report and omit the word.
                        plpy.info('OUT OF RANGE:' + str(arr_feature[:10]))
                ngram = ' '.join(path_words)
                for cid in res_array:
                    yield lattice_id, cid, ngram, skipping
                return

            if nowid not in edges:
                return

            for child in edges[nowid]:
                child_is_skippable = arr_feature[index_cid_sub[child]] in skipset

                if child_is_skippable and skipping:
                    # rule: can only skip once...
                    continue

                if child_is_skippable:
                    # Skipgram: pass over child without putting it on the path.
                    for row in DFS(lattice_id, edges, child, res_array,
                                   index_cid_sub, skipping=True):
                        yield row

                # Put child on the path; the skipping flag carries over as-is.
                res_array.append(child)
                for row in DFS(lattice_id, edges, child, res_array,
                               index_cid_sub, skipping=skipping):
                    yield row
                res_array.pop()
Beispiel #8
0
def run(lattice_id, starts, ends, arr_feature, candidate_ids, gram_len):
    """Yield (lattice_id, candidate_id, ngram, is_skipgram) rows for a lattice.

    An edge i -> j exists whenever starts[j] == ends[i] + 1.  Starting from
    every candidate that has an outgoing edge (sorted by id), paths of
    gram_len candidates are enumerated depth-first.  Each complete path
    produces one row per candidate on it, with ngram being the
    space-joined arr_feature entries of the path.

    Words in the skip set ('~SIL', '<s>', '</s>') are both included
    (regular n-gram) and stepped over (skip-gram, is_skipgram=True).  A
    path may step over a single word only; once it has skipped, branches
    that hit another skip-set word are dropped.

    Args:
        lattice_id: value copied into every output row.
        starts, ends: per-candidate positions, parallel to candidate_ids.
        arr_feature: per-candidate word/feature, parallel to candidate_ids.
        candidate_ids: ids of the lattice candidates.
        gram_len: number of candidates per emitted n-gram.
    """
    # Only call-independent helpers are cached in SD (presumably
    # PL/Python's per-function static dictionary).
    if 'AddEdge' in SD:
        AddEdge = SD['AddEdge']
    else:
        # allow multiple same edges
        def AddEdge(edges, f, t):
            if f not in edges:
                edges[f] = []
            edges[f].append(t)

        SD['AddEdge'] = AddEdge

    if 'skipset' in SD:
        skipset = SD['skipset']
    else:
        skipset = set(['~SIL', '<s>', '</s>'])
        SD['skipset'] = skipset

    # DFS is a closure over arr_feature and gram_len of *this* call, so it
    # must not be cached in SD: a cached copy would keep using the
    # arguments of whichever call created it.
    def DFS(lattice_id,
            edges,
            nowid,
            res_array,
            index_cid_sub,
            skipping=False):
        # Reaches N, finished generating an N-gram.
        if len(res_array) == gram_len:
            feature_tmp = []
            for cwid in res_array:
                sub = index_cid_sub[cwid]
                # Deal with range problem
                if sub >= len(arr_feature):
                    plpy.info('OUT OF RANGE:' + str(arr_feature[:10]))
                else:
                    feature_tmp.append(arr_feature[sub])
            feature = ' '.join(feature_tmp)
            for cwid in res_array:
                yield lattice_id, cwid, feature, skipping
            return

        if nowid not in edges:
            return

        for jid in edges[nowid]:
            if arr_feature[index_cid_sub[jid]] in skipset:
                if skipping:
                    continue  # rule: can only skip once...
                # Skipgram: step over jid without adding it to the path.
                for item in DFS(lattice_id, edges, jid, res_array,
                                index_cid_sub, skipping=True):
                    yield item
            # Keep jid on the path; the skipping state is inherited.
            res_array.append(jid)
            for item in DFS(lattice_id, edges, jid, res_array,
                            index_cid_sub, skipping=skipping):
                yield item
            res_array.pop()

    ################## MAIN FUNCTION ####################

    N = len(candidate_ids)  # number of words
    if N == 0:
        plpy.info('Empty data:' + str(lattice_id))
        return

    # candidate id -> position in the parallel input arrays
    index_cid_sub = {}
    for sub in range(N):
        index_cid_sub[candidate_ids[sub]] = sub

    # start position -> positions of all candidates starting there
    index_start_sub = {}
    for sub in range(N):
        index_start_sub.setdefault(starts[sub], []).append(sub)

    ######### 1. build directed graph

    edges = {}  # cand_word_id1 : [cand_word_id2]
    for i in range(N):
        for j in index_start_sub.get(ends[i] + 1, ()):
            AddEdge(edges, candidate_ids[i], candidate_ids[j])

    # NOTE(review): debug output kept from the original implementation,
    # written in a form that parses on both Python 2 and Python 3.
    print(edges)

    ######## 2. DFS output candidates
    res_array = []
    for startid in sorted(edges.keys()):
        res_array.append(startid)
        for item in DFS(lattice_id, edges, startid, res_array, index_cid_sub):
            yield item
        res_array.pop()
def run(lattice_id, starts, ends, candidates, candidate_ids, transcript):
  """Label each lattice candidate by whether it is on a best transcript match.

  Candidate i is linked to candidate j whenever starts[j] == ends[i] + 1.
  A dynamic program over that graph (in topological order) and the
  transcript computes the longest in-order match between a lattice path
  and the transcript; every candidate that is a matched element of at
  least one optimal alignment is labelled True.

  Args:
    lattice_id: value copied into every output row.
    starts, ends: per-candidate positions, parallel to candidate_ids.
    candidates: per-candidate words, parallel to candidate_ids.
    candidate_ids: ids of the lattice candidates.
    transcript: sequence of transcript words.

  Yields:
    (lattice_id, candidate_id, True) for the matched candidates, followed
    by (lattice_id, candidate_id, False) for all remaining ones.  A
    summary line is reported through plpy.info at the end.
  """

  def ElemMatch(e1, e2):
    return e1 == e2

  def orderedLattice(cids, edges):
    """Return the candidate ids in a topological order of the graph."""
    lattice_ordered = []
    visited = set()
    indegree = {}
    for i in cids:
      indegree[i] = 0
    for k in edges:
      for e in edges[k]:
        if e not in indegree:
          indegree[e] = 0
        indegree[e] += 1
    for e in indegree:
      if indegree[e] == 0:
        visited.add(e)
    # Repeatedly emit a node without pending predecessors.
    while len(visited) > 0:
      n = visited.pop()
      lattice_ordered.append(n)
      if n in edges:
        for e in edges[n]:
          indegree[e] -= 1
          if indegree[e] == 0:
            visited.add(e)
    return lattice_ordered

  def PrintStatus(f, path, message=''):
    """Debug hook for the DP tables; intentionally a no-op."""
    pass

  def MatchTranscriptWithLattice(lattice_words, trans_words, edges, candidate_ids, index_cid_sub):
    """Run the matching DP.

    Returns (score, match_pairs, path, f) where f[i][j] is the best match
    count ending at lattice index i and transcript index j, path[i][j]
    lists the predecessor cells achieving it, and match_pairs are the
    (lattice index, transcript index) cells entered by a match on some
    optimal path.
    """
    n1 = len(lattice_words)
    n2 = len(trans_words)
    if n1 == 0 or n2 == 0:
      # Same arity as the regular return: the caller unpacks four values.
      return 0, [], [], []
    # Build every row separately; [[0] * n2] * n1 would alias the rows.
    f = [[0] * n2 for _ in range(n1)]
    # A list of predecessor cells for each grid cell.
    path = [[[] for _2 in range(n2)] for _ in range(n1)]

    # indegree is keyed by candidate id
    indegree = {}
    for i in candidate_ids:
      indegree[i] = 0
    for k in edges:
      for e in edges[k]:
        if e not in indegree:
          indegree[e] = 0
        indegree[e] += 1

    zero_indegree_index = [index_cid_sub[i] for i in indegree if indegree[i] == 0]

    # Initialization: a start node matching any transcript word scores 1.
    for i_zero in zero_indegree_index:
      for j in range(n2):
        if ElemMatch(lattice_words[i_zero], trans_words[j]):
          f[i_zero][j] = 1
          path[i_zero][j].append((-1, -1)) # match

    PrintStatus(f, path, 'Initialization results:')

    # DP; for every edge i -> succ propagate
    #   f[i][j] + 1 to f[succ][j+1] when succ matches transcript[j+1],
    #   f[i][j]     to f[succ][j]   (move to the successor),
    #   f[i][j]     to f[i][j+1]    (move to the next transcript word).
    ordered_cids = orderedLattice(candidate_ids, edges)

    for i_cid in ordered_cids:     # Must have topological order for DP
      i_index = index_cid_sub[i_cid]
      for j in range(0, n2):
        # edges stores candidate ids; f and path are addressed by index.
        for i_succ_cid in edges[i_cid]:
          i_succ_index = index_cid_sub[i_succ_cid]
          word_succ = lattice_words[i_succ_index]
          if j + 1 < n2 and ElemMatch(word_succ, trans_words[j+1]):
            if f[i_succ_index][j+1] < f[i_index][j] + 1:
              f[i_succ_index][j+1] = f[i_index][j] + 1
              path[i_succ_index][j+1] = [(i_index, j)]
            elif f[i_succ_index][j+1] == f[i_index][j] + 1:
              path[i_succ_index][j+1].append((i_index, j))

          # shift down (to successor)
          if f[i_succ_index][j] < f[i_index][j]:
            f[i_succ_index][j] = f[i_index][j]
            path[i_succ_index][j] = [(i_index, j)]
          elif f[i_succ_index][j] == f[i_index][j]: # another best path
            path[i_succ_index][j].append((i_index, j))

        if j + 1 < n2: # shift right (to next word in transcript)
          if f[i_index][j+1] < f[i_index][j]:
            f[i_index][j+1] = f[i_index][j]
            path[i_index][j+1] = [(i_index, j)]
          elif f[i_index][j+1] == f[i_index][j]: # another best path
            path[i_index][j+1].append((i_index, j))

    # Only "diagonal" steps are matches.  Several cells may share a
    # transcript index as long as each lies on an optimal path.
    possible_opt_match_pairs = []
    visited = set()

    # Search from end to head for all viable best paths
    def LableMatchDFS(i, j):
      if (i, j) in visited:  # prevent multiple adds
        return
      visited.add((i, j))
      if i >= 0 and j >= 0:  # Terminate at "-1"s
        for (pi, pj) in path[i][j]:
          # (i, j) is a match when both coordinates changed on the way in.
          if pi != i and pj != j:
            possible_opt_match_pairs.append((i, j))
          LableMatchDFS(pi, pj)

    # The best score can sit on any node without successors, so all of
    # them are inspected in the last transcript column.
    endnodes = [index_cid_sub[i] for i in edges if len(edges[i]) == 0]
    maxscore = 0
    besti = []
    for i in endnodes:
      if maxscore < f[i][n2 - 1]:
        maxscore = f[i][n2 - 1]
        besti = [i]
      elif maxscore == f[i][n2 - 1]:
        besti.append(i)

    for i in besti:
      LableMatchDFS(i, n2 - 1)  # find all best paths

    return maxscore, list(reversed(possible_opt_match_pairs)), path, f

  # Build graph
  def AddEdge(edges, f, t):
    if f not in edges:
      edges[f] = []
    edges[f].append(t)

  ################## MAIN FUNCTION ####################

  N = len(candidate_ids)  # number of words
  if N == 0:
    plpy.info('Empty data:'+str(lattice_id))
    return

  # candidate id -> position in the parallel input arrays
  index_cid_sub = {}
  for sub in range(N):
    index_cid_sub[candidate_ids[sub]] = sub

  # start position -> positions of all candidates starting there
  index_start_sub = {}
  for sub in range(N):
    index_start_sub.setdefault(starts[sub], []).append(sub)

  ######### 1. build directed graph

  # Every candidate gets an entry so nodes without successors are visible.
  edges = {}  # cand_word_id1 : [cand_word_id2]
  for cid in index_cid_sub:
    edges[cid] = []

  for i in range(N):
    for j in index_start_sub.get(ends[i] + 1, ()):
      AddEdge(edges, candidate_ids[i], candidate_ids[j])

  ############# 2. Return DP result

  score, match_pairs, path_mat, f = MatchTranscriptWithLattice(
    candidates, transcript, edges, candidate_ids, index_cid_sub)

  PrintStatus(f, path_mat)

  # Deduplication: match_pairs holds (latticeIndex, transcriptIndex)
  match_subs = set([p[0] for p in match_pairs])

  # a set of CIDs from the match
  true_cids = set([candidate_ids[i] for i in match_subs])
  for cid in true_cids:
    yield lattice_id, cid, True

  false_cids = set(candidate_ids).difference(true_cids)
  for cid in false_cids:
    yield lattice_id, cid, False

  plpy.info('[%s]  SCORE: %d, matches: %d / %d' % (lattice_id, score, len(true_cids), len(candidate_ids)))
Beispiel #10
0
def run(lattice_id, starts, ends, candidates, candidate_ids, transcript,
        expectations=None):
    """Yield (lattice_id, words) for the single best-scoring lattice path.

    Candidate i is linked to candidate j whenever starts[j] == ends[i] + 1.
    Every candidate contributes expectations[i] - 0.5 to the score of a
    path through it; the path with the highest total that ends in a
    candidate without successors is selected and its words are returned
    ordered by input position.  If no such path has a positive score the
    returned word list is empty.

    Args:
        lattice_id: value copied into the output row.
        starts, ends: per-candidate positions, parallel to candidate_ids.
        candidates: per-candidate words, parallel to candidate_ids.
        candidate_ids: ids of the lattice candidates.
        transcript: not used by this function.
        expectations: optional per-candidate values, parallel to
            candidate_ids.  When omitted or empty, every candidate
            contributes 0.5 (for oracle).
    """
    def orderedLattice(cids, edges):
        """Return the candidate ids in a topological order of the graph."""
        lattice_ordered = []
        visited = set()
        indegree = {}
        for i in cids:
            indegree[i] = 0
        for k in edges:
            for e in edges[k]:
                if e not in indegree:
                    indegree[e] = 0
                indegree[e] += 1
        for e in indegree:
            if indegree[e] == 0:
                visited.add(e)
        # Repeatedly emit a node without pending predecessors.
        while len(visited) > 0:
            n = visited.pop()
            lattice_ordered.append(n)
            if n in edges:
                for e in edges[n]:
                    indegree[e] -= 1
                    if indegree[e] == 0:
                        visited.add(e)
        return lattice_ordered

    # Only return 1 path that has highest score.
    def FindBestPath(lattice_words, edges, candidate_ids, index_cid_sub,
                     expectations, start_subs):
        """Return (best score, words on the best path)."""
        n1 = len(lattice_words)
        if n1 == 0:
            return 0, []

        # f[sub] -> best score of a path ending at sub; the large negative
        # initial value keeps nodes unreachable from a start node out.
        f = [-10000000.0 for _ in range(n1)]
        # path[sub] -> previous index on that best path (-1: none)
        path = [-1 for _ in range(n1)]

        # init f of startnodes to be 0
        for i in start_subs:
            f[i] = 0.0

        # Topological order guarantees f[i] is final before it is pushed
        # to the successors of i.
        for i_cid in orderedLattice(candidate_ids, edges):
            i = index_cid_sub[i_cid]
            f[i] += expectations[i]  # increase itself
            for j_cid in edges[i_cid]:  # i -> j
                j = index_cid_sub[j_cid]
                if f[j] < f[i]:
                    f[j] = f[i]
                    path[j] = i

        # Pick one best end node; on ties the first one encountered wins.
        endnodes = [index_cid_sub[i] for i in edges if len(edges[i]) == 0]
        maxscore = 0
        besti = -1
        for i in endnodes:
            if maxscore < f[i]:
                maxscore = f[i]
                besti = i

        # Walk the predecessor links back to a start node.
        i = besti
        bestpath = []
        while i != -1:
            bestpath.append(i)
            i = path[i]

        # indexes are sorted by "start,end ".
        return maxscore, [lattice_words[i] for i in sorted(bestpath)]

    # Build graph
    def AddEdge(edges, f, t):
        if f not in edges:
            edges[f] = []
        edges[f].append(t)

    ################## MAIN FUNCTION ####################

    N = len(candidate_ids)  # number of words
    if N == 0:
        plpy.info('Empty data:' + str(lattice_id))
        return

    # candidate id -> position in the parallel input arrays
    index_cid_sub = {}
    for sub in range(N):
        index_cid_sub[candidate_ids[sub]] = sub

    # start position -> positions of all candidates starting there
    index_start_sub = {}
    for sub in range(N):
        index_start_sub.setdefault(starts[sub], []).append(sub)

    ######### 1. build directed graph

    # Every candidate gets an entry so nodes without successors are visible.
    edges = {}  # cand_word_id1 : [cand_word_id2]
    for cid in index_cid_sub:
        edges[cid] = []

    indegree = [0 for _ in range(N)]
    for i in range(N):
        for j in index_start_sub.get(ends[i] + 1, ()):
            AddEdge(edges, candidate_ids[i], candidate_ids[j])
            indegree[j] += 1

    start_subs = [i for i in range(len(indegree)) if indegree[i] == 0]

    ############# 2. Return DP result

    # Without expectations every candidate gets the same positive weight.
    if expectations is None or len(expectations) == 0:
        exp = [0.5 for _ in range(len(candidates))]
    else:
        exp = [e - 0.5 for e in expectations]
    score, words = FindBestPath(candidates, edges, candidate_ids,
                                index_cid_sub, exp, start_subs)

    yield lattice_id, words
def run(lattice_id, starts, ends, candidates, candidate_ids, transcript):
  """Label lattice candidates by whether they align with the transcript.

  Candidate ``i`` covers positions ``starts[i]..ends[i]`` and may be followed
  by any candidate ``j`` with ``starts[j] == ends[i] + 1``.  Every path from
  a candidate without predecessors to a candidate without successors is
  aligned against ``transcript`` with a longest-common-subsequence DP.  Among
  the paths reaching the highest score, every candidate that takes part in
  an optimal alignment is labelled True; all remaining candidates are
  labelled False.

  Args:
    lattice_id: identifier echoed back in every yielded row.
    starts, ends: per-candidate start and end positions.
    candidates: per-candidate word, compared to transcript items with ``==``.
    candidate_ids: per-candidate id; used as graph node and in the output.
    transcript: sequence of reference words.

  Yields:
    ``(lattice_id, candidate_id, label)`` tuples, True labels first.  Nothing
    is yielded when ``candidate_ids`` is empty.

  Both graph walks are iterative, so long transcripts or long lattice paths
  do not hit the interpreter's recursion limit.
  """
  exhausted = object()  # sentinel: the iterator has no more items

  def Match(arr1, arr2):
    """Return (score, pairs) for the LCS alignment of arr1 against arr2.

    ``pairs`` lists every (i, j) with arr1[i] matched to arr2[j] that is
    reachable when back-tracking from the last cell.  The same i may appear
    with several j, so callers de-duplicate on the first component.
    """
    n1 = len(arr1)
    n2 = len(arr2)
    if n1 == 0 or n2 == 0:
      return 0, []

    # f[i][j]: best number of matches using arr1[:i+1] and arr2[:j+1].
    # Built row by row so that rows are independent lists.
    f = [[0] * n2 for _ in range(n1)]
    # path[i][j]: predecessor cells recorded for the score in f[i][j].
    path = [[[] for _ in range(n2)] for _ in range(n1)]

    # First cell, first column and first row.  A predecessor of the form
    # (i-1, j-1) -- possibly with a -1 component -- marks (i, j) as a match.
    if arr1[0] == arr2[0]:
      f[0][0] = 1
      path[0][0].append((-1, -1))

    for i in range(1, n1):
      if arr1[i] == arr2[0]:
        f[i][0] = 1
        path[i][0].append((i - 1, -1))
      elif f[i - 1][0] == 1:
        f[i][0] = 1
        path[i][0].append((i - 1, 0))

    for j in range(1, n2):
      if arr1[0] == arr2[j]:
        f[0][j] = 1
        path[0][j].append((-1, j - 1))
      elif f[0][j - 1] == 1:
        f[0][j] = 1
        path[0][j].append((0, j - 1))

    # Forward DP: push the score of (i, j) to its three neighbours, keeping
    # every predecessor that ties for the best score.
    for i in range(n1):
      for j in range(n2):
        here = f[i][j]  # never modified while (i, j) itself is processed
        src = (i, j)

        if i + 1 < n1 and j + 1 < n2 and arr1[i + 1] == arr2[j + 1]:
          if f[i + 1][j + 1] < here + 1:
            f[i + 1][j + 1] = here + 1
            path[i + 1][j + 1] = [src]  # strictly better: drop older ties
          elif f[i + 1][j + 1] == here + 1:
            path[i + 1][j + 1].append(src)

        if i + 1 < n1:  # advance in arr1 only
          if f[i + 1][j] < here:
            f[i + 1][j] = here
            path[i + 1][j] = [src]
          elif f[i + 1][j] == here:
            path[i + 1][j].append(src)

        if j + 1 < n2:  # advance in arr2 only
          if f[i][j + 1] < here:
            f[i][j + 1] = here
            path[i][j + 1] = [src]
          elif f[i][j + 1] == here:
            path[i][j + 1].append(src)

    # Back-track from the last cell with an explicit stack of
    # (cell, iterator over its predecessors).  The visiting order is the
    # same as a recursive pre-order walk, without the recursion depth.
    last = (n1 - 1, n2 - 1)
    matched = []
    visited = {last}
    stack = [(last, iter(path[n1 - 1][n2 - 1]))]
    while stack:
      (i, j), preds = stack[-1]
      pred = next(preds, exhausted)
      if pred is exhausted:
        stack.pop()
        continue
      pi, pj = pred
      if pi == i - 1 and pj == j - 1:  # diagonal step: (i, j) is a match
        matched.append((i, j))
      if pred in visited:  # expand every cell only once
        continue
      visited.add(pred)
      if pi >= 0 and pj >= 0:
        stack.append((pred, iter(path[pi][pj])))

    return f[n1 - 1][n2 - 1], matched[::-1]

  def paths_from(start_id):
    """Yield a copy of every path from start_id to a node in end_ids."""
    trail = [start_id]
    if start_id in end_ids:
      yield list(trail)
      return
    # One successor iterator per node on the trail.
    stack = [iter(edges.get(start_id, ()))]
    while stack:
      nxt = next(stack[-1], exhausted)
      if nxt is exhausted:
        stack.pop()
        trail.pop()
        continue
      trail.append(nxt)
      if nxt in end_ids:
        yield list(trail)
        trail.pop()
      else:
        stack.append(iter(edges.get(nxt, ())))

  ################## MAIN FUNCTION ####################

  N = len(candidate_ids)  # number of words
  if N == 0:
    plpy.info('Empty data:'+str(lattice_id))
    return

  # candidate id -> position in the input arrays
  index_cid_sub = {cid: sub for sub, cid in enumerate(candidate_ids)}

  # start position -> positions of the candidates starting there
  index_start_sub = {}
  for sub in range(N):
    index_start_sub.setdefault(starts[sub], []).append(sub)

  ######### 1. build directed graph

  edges = {}  # cand_word_id1 : [cand_word_id2]
  indegree = [0] * N
  for i in range(N):
    for j in index_start_sub.get(ends[i] + 1, ()):
      edges.setdefault(candidate_ids[i], []).append(candidate_ids[j])
      indegree[j] += 1

  ######## 2. enumerate paths and score them

  # Nodes without outgoing edges end a path; nodes nobody points to start one.
  end_ids = {cid for cid in candidate_ids if cid not in edges}
  start_ids = {candidate_ids[i] for i in range(N) if indegree[i] == 0}

  pathnum = 0
  maxscore = 0
  bestpath_cids = set()
  for startid in start_ids:
    for path in paths_from(startid):
      pathnum += 1
      # "path" holds candidate ids; positions in it index path_words too.
      path_words = [candidates[index_cid_sub[cid]] for cid in path]

      score, match_pairs = Match(path_words, transcript)
      # De-duplicate on the path position, then map back to candidate ids.
      match_subs = {pair[0] for pair in match_pairs}
      match_cids = {path[i] for i in match_subs}

      if score > maxscore:  # strictly better: forget earlier matches
        maxscore = score
        bestpath_cids = match_cids
      elif score == maxscore:  # tie: merge all possible cids
        bestpath_cids.update(match_cids)

  true_cids = bestpath_cids
  for cid in true_cids:
    yield lattice_id, cid, True

  false_cids = set(candidate_ids).difference(true_cids)
  for cid in false_cids:
    yield lattice_id, cid, False

  plpy.info('%d paths, true labels: %d / %d' % (pathnum, len(true_cids), len(true_cids) + len(false_cids) ))
def run(lattice_id, starts, ends, candidates, candidate_ids, transcript):
    """Mark which lattice candidates agree with the reference transcript.

    Candidates form a directed graph: candidate ``i`` (spanning
    ``starts[i]..ends[i]``) is followed by every candidate ``j`` whose
    ``starts[j]`` equals ``ends[i] + 1``.  Each path from a candidate with no
    predecessor to a candidate with no successor is scored by the length of
    its longest common subsequence with ``transcript``.  Candidates matched
    in an optimal alignment of a top-scoring path get the label True; every
    other candidate gets False.

    Args:
        lattice_id: identifier echoed back in every yielded row.
        starts, ends: per-candidate start and end positions.
        candidates: per-candidate word, compared to transcript items
            with ``==``.
        candidate_ids: per-candidate id; graph node and output key.
        transcript: sequence of reference words.

    Yields:
        ``(lattice_id, candidate_id, label)`` tuples, True labels first.
        Nothing is yielded when ``candidate_ids`` is empty.

    The back-tracking and the path enumeration use explicit stacks instead
    of recursion, so inputs longer than the interpreter's recursion limit
    are handled.
    """
    exhausted = object()  # sentinel: the iterator has no more items

    def Match(arr1, arr2):
        """Return (score, pairs) for the LCS alignment of arr1 and arr2.

        ``pairs`` lists every (i, j) with arr1[i] matched to arr2[j] that is
        reachable when back-tracking from the last cell.  One i can occur
        with several j, so callers de-duplicate on the first component.
        """
        n1 = len(arr1)
        n2 = len(arr2)
        if n1 == 0 or n2 == 0:
            return 0, []

        # f[i][j]: best number of matches using arr1[:i+1] and arr2[:j+1].
        # Built row by row so that rows are independent lists.
        f = [[0] * n2 for _ in range(n1)]
        # path[i][j]: predecessor cells recorded for the score in f[i][j].
        path = [[[] for _ in range(n2)] for _ in range(n1)]

        # First cell, first column and first row.  A predecessor of the
        # form (i-1, j-1) -- possibly with a -1 component -- marks (i, j)
        # as a match.
        if arr1[0] == arr2[0]:
            f[0][0] = 1
            path[0][0].append((-1, -1))

        for i in range(1, n1):
            if arr1[i] == arr2[0]:
                f[i][0] = 1
                path[i][0].append((i - 1, -1))
            elif f[i - 1][0] == 1:
                f[i][0] = 1
                path[i][0].append((i - 1, 0))

        for j in range(1, n2):
            if arr1[0] == arr2[j]:
                f[0][j] = 1
                path[0][j].append((-1, j - 1))
            elif f[0][j - 1] == 1:
                f[0][j] = 1
                path[0][j].append((0, j - 1))

        # Forward DP: push the score of (i, j) to its three neighbours,
        # keeping every predecessor that ties for the best score.
        for i in range(n1):
            for j in range(n2):
                here = f[i][j]  # not modified while (i, j) is processed
                src = (i, j)

                if (i + 1 < n1 and j + 1 < n2
                        and arr1[i + 1] == arr2[j + 1]):
                    if f[i + 1][j + 1] < here + 1:
                        f[i + 1][j + 1] = here + 1
                        path[i + 1][j + 1] = [src]  # better: drop old ties
                    elif f[i + 1][j + 1] == here + 1:
                        path[i + 1][j + 1].append(src)

                if i + 1 < n1:  # advance in arr1 only
                    if f[i + 1][j] < here:
                        f[i + 1][j] = here
                        path[i + 1][j] = [src]
                    elif f[i + 1][j] == here:
                        path[i + 1][j].append(src)

                if j + 1 < n2:  # advance in arr2 only
                    if f[i][j + 1] < here:
                        f[i][j + 1] = here
                        path[i][j + 1] = [src]
                    elif f[i][j + 1] == here:
                        path[i][j + 1].append(src)

        # Back-track from the last cell with an explicit stack of
        # (cell, iterator over its predecessors).  The visiting order equals
        # a recursive pre-order walk, without the recursion depth.
        last = (n1 - 1, n2 - 1)
        matched = []
        visited = {last}
        stack = [(last, iter(path[n1 - 1][n2 - 1]))]
        while stack:
            (i, j), preds = stack[-1]
            pred = next(preds, exhausted)
            if pred is exhausted:
                stack.pop()
                continue
            pi, pj = pred
            if pi == i - 1 and pj == j - 1:  # diagonal: (i, j) is a match
                matched.append((i, j))
            if pred in visited:  # expand every cell only once
                continue
            visited.add(pred)
            if pi >= 0 and pj >= 0:
                stack.append((pred, iter(path[pi][pj])))

        return f[n1 - 1][n2 - 1], matched[::-1]

    def paths_from(start_id):
        """Yield a copy of every path from start_id to a node in end_ids."""
        trail = [start_id]
        if start_id in end_ids:
            yield list(trail)
            return
        # One successor iterator per node on the trail.
        stack = [iter(edges.get(start_id, ()))]
        while stack:
            nxt = next(stack[-1], exhausted)
            if nxt is exhausted:
                stack.pop()
                trail.pop()
                continue
            trail.append(nxt)
            if nxt in end_ids:
                yield list(trail)
                trail.pop()
            else:
                stack.append(iter(edges.get(nxt, ())))

    ################## MAIN FUNCTION ####################

    N = len(candidate_ids)  # number of words
    if N == 0:
        plpy.info('Empty data:' + str(lattice_id))
        return

    # candidate id -> position in the input arrays
    index_cid_sub = {cid: sub for sub, cid in enumerate(candidate_ids)}

    # start position -> positions of the candidates starting there
    index_start_sub = {}
    for sub in range(N):
        index_start_sub.setdefault(starts[sub], []).append(sub)

    ######### 1. build directed graph

    edges = {}  # cand_word_id1 : [cand_word_id2]
    indegree = [0] * N
    for i in range(N):
        for j in index_start_sub.get(ends[i] + 1, ()):
            edges.setdefault(candidate_ids[i], []).append(candidate_ids[j])
            indegree[j] += 1

    ######## 2. enumerate paths and score them

    # Nodes without outgoing edges end a path; nodes nobody points to
    # start one.
    end_ids = {cid for cid in candidate_ids if cid not in edges}
    start_ids = {candidate_ids[i] for i in range(N) if indegree[i] == 0}

    pathnum = 0
    maxscore = 0
    bestpath_cids = set()
    for startid in start_ids:
        for path in paths_from(startid):
            pathnum += 1
            # "path" holds candidate ids; its positions index path_words too.
            path_words = [candidates[index_cid_sub[cid]] for cid in path]

            score, match_pairs = Match(path_words, transcript)
            # De-duplicate on the path position, then map back to ids.
            match_subs = {pair[0] for pair in match_pairs}
            match_cids = {path[i] for i in match_subs}

            if score > maxscore:  # strictly better: forget earlier matches
                maxscore = score
                bestpath_cids = match_cids
            elif score == maxscore:  # tie: merge all possible cids
                bestpath_cids.update(match_cids)

    true_cids = bestpath_cids
    for cid in true_cids:
        yield lattice_id, cid, True

    false_cids = set(candidate_ids).difference(true_cids)
    for cid in false_cids:
        yield lattice_id, cid, False

    plpy.info('%d paths, true labels: %d / %d' %
              (pathnum, len(true_cids), len(true_cids) + len(false_cids)))
def run(lattice_id, starts, ends, candidates, candidate_ids, transcript):
  """Yield the words of the single best-scoring path through a lattice.

  Candidate ``i`` (spanning ``starts[i]..ends[i]``) is linked to every
  candidate ``j`` with ``starts[j] == ends[i] + 1``.  Each candidate
  contributes ``expectation - 0.5`` to the score of a path (or ``0.5`` when
  no expectations are given), and the path with the largest total that ends
  in a candidate without successors is selected.  A path is only selected
  when its total is strictly greater than 0; otherwise the word list is
  empty.

  Yields:
    One ``(lattice_id, words)`` tuple, where ``words`` follows input order.
    Nothing is yielded when ``candidate_ids`` is empty.

  NOTE(review): ``expectations`` is read as a free name -- it is not one of
  this function's parameters, and ``transcript`` is never used.  Presumably
  the surrounding runtime supplies ``expectations``; confirm with the caller.
  """

  def topo_order(cids, succ):
    # Topological order by in-degree counting.  The worklist is a set, so
    # the order among simultaneously ready nodes is whatever set.pop() gives.
    pending = dict.fromkeys(cids, 0)
    for targets in succ.values():
      for t in targets:
        pending[t] = pending.get(t, 0) + 1

    ready = {c for c in pending if pending[c] == 0}
    order = []
    while ready:
      node = ready.pop()
      order.append(node)
      for t in succ.get(node, ()):
        pending[t] -= 1
        if pending[t] == 0:
          ready.add(t)
    return order

  def best_path(words, succ, cids, sub_of, gains, roots):
    # Longest-path DP over the topological order; returns (score, words).
    total = len(words)
    if total == 0:
      return 0, []

    score = [-10000000.0] * total  # score[sub]: best total ending at sub
    back = [-1] * total            # back[sub]: previous sub on that path
    for r in roots:
      score[r] = 0.0

    for cid in topo_order(cids, succ):
      here = sub_of[cid]
      score[here] += gains[here]  # add the node's own contribution
      for nxt_cid in succ[cid]:
        nxt = sub_of[nxt_cid]
        if score[nxt] < score[here]:  # strict: first best predecessor wins
          score[nxt] = score[here]
          back[nxt] = here

    # Pick the best node without successors; only totals above 0 qualify.
    top = 0
    tail = -1
    for cid, targets in succ.items():
      if len(targets) != 0:
        continue
      sub = sub_of[cid]
      if top < score[sub]:
        top = score[sub]
        tail = sub

    # Walk the back pointers, then restore input order.
    chosen = []
    while tail != -1:
      chosen.append(tail)
      tail = back[tail]
    chosen.sort()
    return top, [words[s] for s in chosen]

  ################## MAIN FUNCTION ####################

  N = len(candidate_ids)  # number of words
  if N == 0:
    plpy.info('Empty data:'+str(lattice_id))
    return

  # candidate id -> position in the input arrays
  index_cid_sub = {cid: sub for sub, cid in enumerate(candidate_ids)}

  # start position -> positions of the candidates starting there
  index_start_sub = {}
  for sub in range(N):
    index_start_sub.setdefault(starts[sub], []).append(sub)

  ######### 1. build directed graph

  # Every candidate gets an entry, so nodes without successors map to [].
  edges = {cid: [] for cid in index_cid_sub}
  indegree = [0] * N
  for i in range(N):
    for j in index_start_sub.get(ends[i] + 1, ()):
      edges.setdefault(candidate_ids[i], []).append(candidate_ids[j])
      indegree[j] += 1

  start_subs = [sub for sub, deg in enumerate(indegree) if deg == 0]

  ############# 2. Return DP result

  # With no expectations every candidate contributes 0.5; otherwise each
  # expectation is shifted by -0.5.
  if len(expectations) == 0:
    exp = [0.5] * len(candidates)
  else:
    exp = [e - 0.5 for e in expectations]

  score, words = best_path(candidates, edges, candidate_ids, index_cid_sub,
                           exp, start_subs)

  yield lattice_id, words