Example #1
0
def process_files(align_file, src_file, tgt_file, ref_file, dict_file, out_file, src_sgm, tgt_sgm, lang, is_reverse_alignment):
  """
  """
  tgt_inf = codecs.open(tgt_file, 'r', 'utf-8')

  is_src = 0
  if src_file != '':
    is_src = 1
    src_inf = codecs.open(src_file, 'r', 'utf-8')

  is_align = 0
  if align_file != '':
    is_align = 1
    align_inf = codecs.open(align_file, 'r', 'utf-8')

  is_ref = 0
  if ref_file != '':
    ref_inf = codecs.open(ref_file, 'r', 'utf-8')
    is_ref = 1

  # load dict
  is_dict = 0
  if dict_file != '' and os.path.exists(dict_file):
    dict_map = load_dict(dict_file)
    is_dict = 1

  # out_file
  if out_file == '':
    out_file = tgt_file + '.post'
  ouf = codecs.open(out_file, 'w', 'utf-8')

  new_tgt_file = tgt_file + '.new'
  new_tgt_ouf = codecs.open(new_tgt_file, 'w', 'utf-8')

  # post process
  unk = '<unk>'
  line_id = 0
  debug = 1
  unk_count = 0
  dictionary_count = 0
  identity_count = 0
  for tgt_line in tgt_inf:
    tgt_line = tgt_line.strip()
    debug_count = 0
    debug_str = ''

    if is_src:
      src_line = src_inf.readline().strip()
    if is_ref:
      ref_line = ref_inf.readline().strip()
    if is_align:
      src_tokens = re.split(r'\s+', src_line)
      tgt_tokens = re.split(r'\s+', tgt_line)

      # get alignment
      align_line = align_inf.readline().strip()
      if is_reverse_alignment: # reversed alignment tgtId-srcId
        (t2s, s2t) = text.aggregate_alignments(align_line)
      else: # normal alignment srcId-tgtId
        (s2t, t2s) = text.aggregate_alignments(align_line)
       
      new_tgt_tokens = []
      for tgt_pos in xrange(len(tgt_tokens)):
        tgt_token = tgt_tokens[tgt_pos]
        if tgt_tokens[tgt_pos] == unk and is_dict:
          unk_count = unk_count + 1
          if tgt_pos in t2s: # aligned unk
            debug_count = debug_count + 1
            src_token = src_tokens[t2s[tgt_pos][0]]
            if src_token in dict_map: # there's a word-word translation
              tgt_token = dict_map[src_token]
              dictionary_count = dictionary_count + 1
              if debug:
                debug_str = debug_str + "dict: " + src_token + " -> " + tgt_token + '\n'
            else: # identity copy
              tgt_token = src_token
              identity_count = identity_count + 1

              if debug:
                debug_str = debug_str + "iden: " + src_token + " -> " + tgt_token + '\n'

        #if tgt_token != '##AT##-##AT##':
        new_tgt_tokens.append(tgt_token)

      out_line = ' '.join(new_tgt_tokens)
    else:
      out_line = tgt_line

    # rejoin tokens split around ##AT##-##AT##
    if re.search('##AT##-##AT##', out_line):
      out_line = re.sub(' ##AT##-##AT## ', '-', out_line)
      tgt_line = re.sub(' ##AT##-##AT## ', '-', tgt_line)
      if is_align == 0:
        debug_count = 1
    ouf.write('%s\n' % out_line)
    new_tgt_ouf.write('%s\n' % tgt_line)

    # debug info
    if debug == 1 and debug_count>0:
      sys.stderr.write('# example %d\n' % line_id)
      if is_src:
        sys.stderr.write('src: %s\n' % (src_line))
      sys.stderr.write('tgt: %s\n' % (tgt_line))
      sys.stderr.write('%s' % (debug_str))
      sys.stderr.write('out: %s\n' % (out_line))
      if is_ref:
        sys.stderr.write('ref: %s\n' % ref_line)
      debug = 0

    line_id += 1

  if is_src:
    src_inf.close()
  if is_align:
    align_inf.close()
  if is_ref:
    ref_inf.close()
  tgt_inf.close()
  ouf.close()
  new_tgt_ouf.close()
  sys.stderr.write('# num sents = %d, unk count=%d, dictionary_count=%d, identity_count=%d\n' % (line_id, unk_count, dictionary_count, identity_count))

  # evaluating 
  if is_ref:
    script_dir = os.path.dirname(sys.argv[0])
    bleu(script_dir, new_tgt_file, ref_file)
   
    if is_align:
      chr_f(script_dir, out_file, ref_file)
      bleu(script_dir, out_file, ref_file)
      if src_sgm != '' and tgt_sgm != '' and lang != '': # compute NIST BLEU score
        nist_bleu(script_dir, out_file, src_sgm, tgt_sgm, lang)
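
The function above relies on helpers defined elsewhere in the repository: module-level imports (codecs, os, re, sys), a text module, and the scoring wrappers bleu, chr_f, and nist_bleu. The two helpers that determine its core behavior are load_dict and text.aggregate_alignments; since their implementations are not shown here, the sketches below are hypothetical reconstructions inferred from the call sites alone, and the dictionary file format in particular is an assumption.

import codecs

def load_dict(dict_file):
  # ASSUMED format (not confirmed by this section): one 'src_word tgt_word
  # [scores...]' entry per line, keeping the first translation seen per
  # source word so that dict_map[src_token] yields a single target word.
  dict_map = {}
  inf = codecs.open(dict_file, 'r', 'utf-8')
  for line in inf:
    tokens = line.strip().split()
    if len(tokens) >= 2 and tokens[0] not in dict_map:
      dict_map[tokens[0]] = tokens[1]
  inf.close()
  return dict_map

def aggregate_alignments(align_line):
  # Inferred contract: parse whitespace-separated 'i-j' pairs and return
  # (first->seconds, second->firsts) maps of position lists, matching the
  # (s2t, t2s) / (t2s, s2t) unpacking and the t2s[tgt_pos][0] lookup above.
  a2b = {}
  b2a = {}
  for pair in align_line.strip().split():
    (a, b) = pair.split('-')
    a = int(a)
    b = int(b)
    a2b.setdefault(a, []).append(b)
    b2a.setdefault(b, []).append(a)
  return (a2b, b2a)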
Example #2
0
def post_process(align_file, src_file, tgt_file, ref_file, dict_file, is_reverse_alignment):
  sys.stderr.write('# ref_file = %s\n' % ref_file)
  is_src = 0
  if src_file != '':
    is_src = 1
    src_inf = codecs.open(src_file, 'r', 'utf-8')
  tgt_inf = codecs.open(tgt_file, 'r', 'utf-8')

  is_align = 0
  if align_file != '':
    is_align = 1
    align_inf = codecs.open(align_file, 'r', 'utf-8')
    # post_file
    post_file = tgt_file + '.post'
    post_ouf = codecs.open(post_file, 'w', 'utf-8')
  else:
    post_file = ''

  is_ref = 0
  if ref_file != '':
    ref_inf = codecs.open(ref_file, 'r', 'utf-8')
    is_ref = 1

  # load dict
  is_dict = 0
  if dict_file != '':
    dict_map = load_dict(dict_file)
    is_dict = 1

  # pre_file
  pre_file = tgt_file + '.pre'
  pre_ouf = codecs.open(pre_file, 'w', 'utf-8')

  # post process
  unk = '<unk>'
  line_id = 0
  debug = 1
  unk_count = 0
  dictionary_count = 0
  identity_count = 0
  for tgt_line in tgt_inf:
    tgt_line = tgt_line.strip()
    debug_count = 0
    debug_str = ''

    if is_src:
      src_line = src_inf.readline().strip()
    if is_ref:
      ref_line = ref_inf.readline().strip()
    if is_align:
      src_tokens = re.split(r'\s+', src_line)
      tgt_tokens = re.split(r'\s+', tgt_line)

      # get alignment
      align_line = align_inf.readline().strip()
      if is_reverse_alignment: # reversed alignment tgtId-srcId
        (t2s, s2t) = text.aggregate_alignments(align_line)
      else: # normal alignment srcId-tgtId
        (s2t, t2s) = text.aggregate_alignments(align_line)
       
      new_tgt_tokens = []
      for tgt_pos in xrange(len(tgt_tokens)):
        tgt_token = tgt_tokens[tgt_pos]
        if tgt_tokens[tgt_pos] == unk and is_dict:
          unk_count = unk_count + 1
          if tgt_pos in t2s: # aligned unk
            debug_count = debug_count + 1
            src_token = src_tokens[t2s[tgt_pos][0]]
            if src_token in dict_map: # there's a word-word translation
              tgt_token = dict_map[src_token]
              dictionary_count = dictionary_count + 1
              if debug:
                debug_str = debug_str + "dict: " + src_token + " -> " + tgt_token + '\n'
            else: # identity copy
              tgt_token = src_token
              identity_count = identity_count + 1

              if debug:
                debug_str = debug_str + "iden: " + src_token + " -> " + tgt_token + '\n'

        new_tgt_tokens.append(tgt_token)

      post_line = ' '.join(new_tgt_tokens)

    # rejoin tokens split around ##AT##-##AT## (en-de pre-processing, kept for historical reasons)
    #if re.search('##AT##-##AT##', tgt_line):
    tgt_line = re.sub(' ##AT##-##AT## ', '-', tgt_line)
    if is_align == 0:
      debug_count = 1
    pre_ouf.write('%s\n' % tgt_line)
    
    if is_align:
      post_line = re.sub(' ##AT##-##AT## ', '-', post_line)
      post_ouf.write('%s\n' % post_line)

    # debug info
    if debug == 1 and debug_count>0:
      sys.stderr.write('# example %d\n' % line_id)
      if is_src:
        sys.stderr.write('src: %s\n' % (src_line))
      sys.stderr.write('tgt: %s\n' % (tgt_line))
      sys.stderr.write('%s' % (debug_str))

      if is_align:
        sys.stderr.write('out: %s\n' % (post_line))
      if is_ref:
        sys.stderr.write('ref: %s\n' % ref_line)
      debug = 0

    line_id += 1

  if is_src:
    src_inf.close()
  if is_align:
    align_inf.close()
  if is_ref:
    ref_inf.close()
  tgt_inf.close()
  post_ouf.close()
  pre_ouf.close()
  sys.stderr.write('# num sents = %d, unk count=%d, dictionary_count=%d, identity_count=%d\n' % (line_id, unk_count, dictionary_count, identity_count))
  return (pre_file, post_file)
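
For reference, a hypothetical invocation of post_process (all file names below are placeholders, not from the source). With an alignment file it writes both a .pre file (##AT##-##AT## markers rejoined only) and a .post file (<unk> tokens also replaced), and returns both paths; the final argument False says the alignments are in the normal srcId-tgtId order.

# Hypothetical call: 'hyp.de' is system output, 'test.en' its source,
# 'test.de' the reference, 'hyp.align' word alignments, 'dict.en-de' the
# word-word dictionary produced by the extraction script further below.
(pre_file, post_file) = post_process('hyp.align', 'test.en', 'hyp.de',
                                     'test.de', 'dict.en-de', False)
# pre_file == 'hyp.de.pre', post_file == 'hyp.de.post'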
Example #3
0
def process_files(in_prefix, src_lang, tgt_lang, out_prefix, freq, opt, src_vocab_size, tgt_vocab_size, unk_symbol='<unk>'):
  """
  Extract a scored word-word dictionary from a word-aligned parallel corpus:
  count alignment links per (src, tgt) pair, then write conditional
  probabilities, PMI, and normalized PMI for pairs seen at least 10 times.
  """
  
  # input
  sys.stderr.write('# Input from %s.*\n' % (in_prefix))
  src_file = in_prefix + '.' + src_lang
  src_inf = codecs.open(src_file, 'r', 'utf-8')
  tgt_file = in_prefix + '.' + tgt_lang
  tgt_inf = codecs.open(tgt_file, 'r', 'utf-8')
  align_inf = codecs.open(in_prefix + '.align', 'r', 'utf-8')

  # one of src_vocab_size / freq must be positive, else no vocab file name is set
  if src_vocab_size>0:
    src_vocab_file = in_prefix + '.' + src_lang + '.vocab.' + str(src_vocab_size)
  elif freq>0:
    src_vocab_file = in_prefix + '.' + src_lang + '.vocab.f' + str(freq)
  (src_words, src_vocab_map, src_vocab_size) = text.get_vocab(src_file, src_vocab_file, freq, src_vocab_size, unk_symbol)
  
  if tgt_vocab_size>0:
    tgt_vocab_file = in_prefix + '.' + tgt_lang + '.vocab.' + str(tgt_vocab_size)
  elif freq>0:
    tgt_vocab_file = in_prefix + '.' + tgt_lang + '.vocab.f' + str(freq)  
  (tgt_words, tgt_vocab_map, tgt_vocab_size) = text.get_vocab(tgt_file, tgt_vocab_file, freq, tgt_vocab_size, unk_symbol)
  
  # process corpus
  line_id = 0
  debug = True
  bi_counts = {} # bi_counts[src_id][tgt_id]
  src_counts = {}
  tgt_counts = {}
  total_count = 0 # total alignment links
  for src_line in src_inf:
    src_line = src_line.strip()
    tgt_line = tgt_inf.readline().strip()
    src_tokens = re.split(r'\s+', src_line)
    tgt_tokens = re.split(r'\s+', tgt_line)
    if opt == 1: # reversed alignment tgtId-srcId
      (t2s, s2t) = text.aggregate_alignments(align_inf.readline())
    else: # normal alignment srcId-tgtId
      (s2t, t2s) = text.aggregate_alignments(align_inf.readline())

    # process alignments
    for tgt_pos in t2s.keys():
      for src_pos in t2s[tgt_pos]:
        # same word
        src_token = src_tokens[src_pos]
        tgt_token = tgt_tokens[tgt_pos]
        if src_token in src_vocab_map and tgt_token in tgt_vocab_map: # both known
          src_id = src_vocab_map[src_token]
          tgt_id = tgt_vocab_map[tgt_token]
          if src_id not in bi_counts:
            bi_counts[src_id] = {}
            src_counts[src_id] = 0
          if tgt_id not in tgt_counts:
            tgt_counts[tgt_id] = 0
          if tgt_id not in bi_counts[src_id]:
            bi_counts[src_id][tgt_id] = 0
          
          # update
          bi_counts[src_id][tgt_id] += 1
          src_counts[src_id] += 1
          tgt_counts[tgt_id] += 1
          total_count += 1

    line_id = line_id + 1
    if (line_id % 100000 == 0):
      sys.stderr.write(' (%d) ' % line_id)
  sys.stderr.write('  num lines=%d, total links=%d\n' % (line_id, total_count))

  # output
  check_dir(out_prefix)
  dict_file = out_prefix + '.' + src_lang + '-' + tgt_lang + '.dict'
  dict_ouf = codecs.open(dict_file, 'w', 'utf-8')
  sys.stderr.write('# Output to %s*\n' % dict_file)

  # compute src_probs
  src_probs = {}
  for src_id in src_counts.keys():
    src_probs[src_id] = float(src_counts[src_id])/float(total_count)

  # compute tgt_probs
  tgt_probs = {}
  for tgt_id in tgt_counts.keys():
    tgt_probs[tgt_id] = float(tgt_counts[tgt_id])/float(total_count)

  # compute joint prob
  for src_id in bi_counts.keys():
    for tgt_id in bi_counts[src_id].keys():
      bi_count = bi_counts[src_id][tgt_id]
      if bi_count<10: continue # skip rare pairs
      p_src_given_tgt = float(bi_count)/float(tgt_counts[tgt_id])
      p_tgt_given_src = float(bi_count)/float(src_counts[src_id])
      
      # normalized pmi
      p_src_tgt = float(bi_count)/float(total_count) # joint
      p_src = src_probs[src_id]
      p_tgt = tgt_probs[tgt_id]
      pmi = math.log(p_src_tgt/(p_src*p_tgt))
      npmi = - pmi / math.log(p_src_tgt) 
  
      # print
      src_token = src_words[src_id]
      tgt_token = tgt_words[tgt_id]
      dict_ouf.write('%s %s %g %g %g %g %g\n' % (src_token, tgt_token, p_tgt_given_src, p_src_given_tgt, (p_src_given_tgt+p_tgt_given_src)/2, pmi, npmi))
      #dict_ouf.write('%s %s %g\n' % (src_token, tgt_token, (p_src_given_tgt+p_tgt_given_src)/2))

  #text.write_vocab(out_prefix + '.vocab.' + src_lang, src_words)
  #text.write_vocab(out_prefix + '.vocab.' + tgt_lang, tgt_words)

  src_inf.close()
  tgt_inf.close()
  align_inf.close()

  dict_ouf.close()
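
The scoring loop above implements pointwise mutual information and its normalized variant. As a quick sanity check of the two formulas (the numbers are illustrative, not from any corpus), the snippet below reproduces them directly; NPMI falls in [-1, 1], where 1 means the two words only ever occur aligned to each other.

import math

# Made-up counts: 1000 alignment links in total, a pair seen together 50
# times whose words participate in 60 and 80 links respectively.
p_xy = 50.0 / 1000  # joint probability, p_src_tgt above
p_x = 60.0 / 1000   # src_probs[src_id]
p_y = 80.0 / 1000   # tgt_probs[tgt_id]
pmi = math.log(p_xy / (p_x * p_y))  # ~2.34
npmi = -pmi / math.log(p_xy)        # ~0.78, scaled into [-1, 1]
print('pmi=%g npmi=%g' % (pmi, npmi))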
Example #4
0
def process_files(in_prefix,
                  src_lang,
                  tgt_lang,
                  out_prefix,
                  freq,
                  opt,
                  src_vocab_size,
                  tgt_vocab_size,
                  unk_symbol='<unk>'):
    """
  """

    # input
    sys.stderr.write('# Input from %s.*\n' % (in_prefix))
    src_file = in_prefix + '.' + src_lang
    src_inf = codecs.open(src_file, 'r', 'utf-8')
    tgt_file = in_prefix + '.' + tgt_lang
    tgt_inf = codecs.open(tgt_file, 'r', 'utf-8')
    align_inf = codecs.open(in_prefix + '.align', 'r', 'utf-8')

    # one of src_vocab_size / freq must be positive, else no vocab file name is set
    if src_vocab_size > 0:
        src_vocab_file = in_prefix + '.' + src_lang + '.vocab.' + str(
            src_vocab_size)
    elif freq > 0:
        src_vocab_file = in_prefix + '.' + src_lang + '.vocab.f' + str(freq)
    (src_words, src_vocab_map,
     src_vocab_size) = text.get_vocab(src_file, src_vocab_file, freq,
                                      src_vocab_size, unk_symbol)

    if tgt_vocab_size > 0:
        tgt_vocab_file = in_prefix + '.' + tgt_lang + '.vocab.' + str(
            tgt_vocab_size)
    elif freq > 0:
        tgt_vocab_file = in_prefix + '.' + tgt_lang + '.vocab.f' + str(freq)
    (tgt_words, tgt_vocab_map,
     tgt_vocab_size) = text.get_vocab(tgt_file, tgt_vocab_file, freq,
                                      tgt_vocab_size, unk_symbol)

    # process corpus
    line_id = 0
    debug = True
    bi_counts = {}  # bi_counts[src_id][tgt_id]
    src_counts = {}
    tgt_counts = {}
    total_count = 0  # total alignment links
    for src_line in src_inf:
        src_line = src_line.strip()
        tgt_line = tgt_inf.readline().strip()
        src_tokens = re.split(r'\s+', src_line)
        tgt_tokens = re.split(r'\s+', tgt_line)
        if opt == 1:  # reversed alignment tgtId-srcId
            (t2s, s2t) = text.aggregate_alignments(align_inf.readline())
        else:  # normal alignment srcId-tgtId
            (s2t, t2s) = text.aggregate_alignments(align_inf.readline())

        # process alignments
        for tgt_pos in t2s.keys():
            for src_pos in t2s[tgt_pos]:
                # same word
                src_token = src_tokens[src_pos]
                tgt_token = tgt_tokens[tgt_pos]
                if src_token in src_vocab_map and tgt_token in tgt_vocab_map:  # both known
                    src_id = src_vocab_map[src_token]
                    tgt_id = tgt_vocab_map[tgt_token]
                    if src_id not in bi_counts:
                        bi_counts[src_id] = {}
                        src_counts[src_id] = 0
                    if tgt_id not in tgt_counts:
                        tgt_counts[tgt_id] = 0
                    if tgt_id not in bi_counts[src_id]:
                        bi_counts[src_id][tgt_id] = 0

                    # update
                    bi_counts[src_id][tgt_id] += 1
                    src_counts[src_id] += 1
                    tgt_counts[tgt_id] += 1
                    total_count += 1

        line_id = line_id + 1
        if (line_id % 100000 == 0):
            sys.stderr.write(' (%d) ' % line_id)
    sys.stderr.write('  num lines=%d, total links=%d\n' %
                     (line_id, total_count))

    # output
    check_dir(out_prefix)
    dict_file = out_prefix + '.' + src_lang + '-' + tgt_lang + '.dict'
    dict_ouf = codecs.open(dict_file, 'w', 'utf-8')
    sys.stderr.write('# Output to %s*\n' % dict_file)

    # compute src_probs
    src_probs = {}
    for src_id in src_counts.keys():
        src_probs[src_id] = float(src_counts[src_id]) / float(total_count)

    # compute tgt_probs
    tgt_probs = {}
    for tgt_id in tgt_counts.keys():
        tgt_probs[tgt_id] = float(tgt_counts[tgt_id]) / float(total_count)

    # compute joint prob
    for src_id in bi_counts.keys():
        for tgt_id in bi_counts[src_id].keys():
            bi_count = bi_counts[src_id][tgt_id]
            if bi_count < 10: continue  # skip rare pairs
            p_src_given_tgt = float(bi_count) / float(tgt_counts[tgt_id])
            p_tgt_given_src = float(bi_count) / float(src_counts[src_id])

            # normalized pmi
            p_src_tgt = float(bi_count) / float(total_count)  # joint
            p_src = src_probs[src_id]
            p_tgt = tgt_probs[tgt_id]
            pmi = math.log(p_src_tgt / (p_src * p_tgt))
            npmi = -pmi / math.log(p_src_tgt)

            # print
            src_token = src_words[src_id]
            tgt_token = tgt_words[tgt_id]
            dict_ouf.write(
                '%s %s %g %g %g %g %g\n' %
                (src_token, tgt_token, p_tgt_given_src, p_src_given_tgt,
                 (p_src_given_tgt + p_tgt_given_src) / 2, pmi, npmi))
            #dict_ouf.write('%s %s %g\n' % (src_token, tgt_token, (p_src_given_tgt+p_tgt_given_src)/2))

    #text.write_vocab(out_prefix + '.vocab.' + src_lang, src_words)
    #text.write_vocab(out_prefix + '.vocab.' + tgt_lang, tgt_words)

    src_inf.close()
    tgt_inf.close()
    align_inf.close()

    dict_ouf.close()
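
As with Example #3, a hypothetical invocation (the prefix, languages, and sizes are placeholders): this reads data/train.en, data/train.de, and data/train.align, builds or loads 50000-word vocabularies, and writes one scored line per retained word pair to dict/train.en-de.dict.

# Hypothetical call: freq=0 means the vocabularies are size-capped rather
# than frequency-cut, opt=0 means alignments are in srcId-tgtId order.
process_files('data/train', 'en', 'de', 'dict/train',
              0, 0, 50000, 50000)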