def main():
  source_file = codecs.open(sys.argv[1], encoding="utf-8", mode="r")
  target_file = codecs.open(sys.argv[2], encoding="utf-8", mode="r")
  source_out = codecs.open(sys.argv[3], encoding="utf-8", mode="w")
  target_out = codecs.open(sys.argv[4], encoding="utf-8", mode="w")

  #source = [line.lstrip() for line in source_file.readlines()]
  #target = [line.lstrip() for line in target_file.readlines()]
  source = source_file.readlines()
  target = target_file.readlines()

  aligner = PyGaleChurchAligner()
  (cost, aligned_source, aligned_target) = aligner.align(source, target)
  print cost, len(aligned_source), len(aligned_target)
  for i in xrange(0, len(aligned_source)):
    source_out.write(aligned_source[i] + "\n")
    target_out.write(aligned_target[i] + "\n")
  source_file.close()
  target_file.close()
  source_out.close()
  target_out.close()
Example #2
0
def main():
    source_file = codecs.open(sys.argv[1], encoding="utf-8", mode="r")
    target_file = codecs.open(sys.argv[2], encoding="utf-8", mode="r")
    source_out = codecs.open(sys.argv[3], encoding="utf-8", mode="w")
    target_out = codecs.open(sys.argv[4], encoding="utf-8", mode="w")

    #source = [line.lstrip() for line in source_file.readlines()]
    #target = [line.lstrip() for line in target_file.readlines()]
    source = source_file.readlines()
    target = target_file.readlines()

    aligner = PyGaleChurchAligner()
    (cost, aligned_source, aligned_target) = aligner.align(source, target)
    print(cost, len(aligned_source), len(aligned_target))
    for i in range(0, len(aligned_source)):
        source_out.write(aligned_source[i] + "\n")
        target_out.write(aligned_target[i] + "\n")
    source_file.close()
    target_file.close()
    source_out.close()
    target_out.close()
Example #3
0
def main():
  parser = optparse.OptionParser()
  parser.add_option("-i", "--input-file", dest="input_file", default="",
      type="string", help="Location of the uncompressed mined webpages")
  parser.add_option("-n", "--num_entries", dest="num_entries", default=0,
      type="int", 
      help="Maximum number of entries to examine, set to 0 for no limit")
  parser.add_option("--language-pair", dest="language_pair",
      default="English,Spanish",
      help="Prints parallel data for the comma-separated language pair"
      + " when used with the --out-prefix option")

  parser.add_option("--out-prefix", dest="out_prefix", default="",
      help="Parallel data will be output to this location")

  parser.add_option("--annotate", dest="annotate", default="",
      help="Prints random pages to the given directory for annotation")
  parser.add_option("--annotate_amount", dest="annotate_amount", default=100,
      type="int", help="Number of document pairs to annotate")

  parser.add_option("--annotation_file", dest="annotation_file", default="",
      help="A file containing annotations of web page pairs")
  parser.add_option("--annotation_dir", dest="annotation_dir", default="",
      help="The location of the HTML for the annotated web pages")

  (opts, args) = parser.parse_args()

  # Initialize the HTML parsers
  cleanup_parser = etree.HTMLParser(encoding = "utf-8",
      target = parsers.CleanupTarget())
  plaintext_parser = etree.HTMLParser(encoding = "utf-8",
      target = parsers.PlaintextTarget())
  strand_parser = etree.HTMLParser(encoding = "utf-8",
      target = parsers.StrandTarget())
  # Gale Church aligner
  gc_aligner = PyGaleChurchAligner()
  strand_aligner = strand.StrandAligner()
  

  if opts.annotation_file and opts.annotation_dir:
    data = read_annotated_data(strand_aligner, strand_parser,
        opts.annotation_file, opts.annotation_dir)

    folds = 5
    features = []
    true_pos = 0
    false_pos = 0
    total_pos = 0
    correct = 0
    total = len(data)
    stats = {}
    for fold in xrange(0, folds):
      print "Fold %d:" % (fold + 1)
      training_data = []
      test_data = []
      for i in xrange(0, len(data)):
        if i % folds == fold:
          test_data.append(data[i])
        else:
          training_data.append(data[i])
      strand_aligner.me_model.set_training_data(training_data)
      strand_aligner.me_model.lbfgs_train()
      features.append(strand_aligner.me_model.get_features())
      for example in test_data:
        predicted_label = strand_aligner.me_model.get_label(example)
        true_label = example.true_instance
        print true_label, predicted_label
        print strand_aligner.me_model.get_probs(example)
        print example.instances[predicted_label]
        if true_label == 0:
          total_pos += 1
          if predicted_label == 0:
            true_pos += 1
        elif true_label != predicted_label:
          false_pos += 1
    correct = true_pos + total - total_pos - false_pos
    stats["Positives"] = total_pos
    stats["Accuracy"] = (100.0 * correct) / total
    stats["Precision"] = 0.0
    if true_pos + false_pos > 0:
      stats["Precision"] = (100.0 * true_pos) / (true_pos + false_pos)
    stats["Recall"] = (100.0 * true_pos) / total_pos
    stats["F1"] = 0.0
    if stats["Recall"] + stats["Precision"] > 0.0:
      stats["F1"] = 2 * stats["Precision"] * stats["Recall"] / (stats["Recall"]
          + stats["Precision"])
    for i in xrange(0, len(features)):
      print "Fold %d" % (i + 1)
      print features[i]
    print stats

  if opts.input_file == "":
    print "No input file given"
    return

  language_pair = None
  if opts.language_pair:
    languages = opts.language_pair.split(",")
    if len(languages) != 2:
      print "Error in language pair:", opts.language_pair
      return
    # TODO: Language codes
    language_pair = (languages[0], languages[1])

  data_to_annotate = []

  aligned_strand_out = None
  aligned_plaintext_out = None
  plaintext_docs_out = None
  segmenters = None
  if opts.out_prefix and language_pair:
    aligned_strand_out = []
    aligned_plaintext_out = []
    plaintext_docs_out = []
    segmenters = []
    for lang in language_pair:
      aligned_strand_out.append(codecs.open(
          "%s.strand.%s" % (opts.out_prefix, lang),
          encoding="utf-8", mode="w"))
      #aligned_plaintext_out.append(codecs.open(
      #    "%s.text.%s" % (opts.out_prefix, lang),
      #    encoding="utf-8", mode="w"))
      #plaintext_docs_out.append(codecs.open(
      #    "%s.docs.%s" % (opts.out_prefix, lang),
      #    encoding="utf-8", mode="w"))
      segmenters.append(Segmenter(lang))

  in_file = open(opts.input_file, "r")
  linecount = 0
  for line in in_file:
    (key, webpages) = parse_entry(line)
    if len(key) == 0:
      print "Malformed entry at line", linecount
    else:
      # default behavior for now: just print the URL
      #print url_to_filename(key).encode('utf-8')

      data_by_language = {}
      for webpage in webpages:
        if webpage['language'] not in data_by_language:
          data_by_language[ webpage['language'] ] = {}
        if opts.annotate:
          try:
            clean_html = apply_parser(webpage['html'], cleanup_parser)
            data_by_language[ webpage['language'] ]["html"] = clean_html
          except:
            pass

        if opts.out_prefix:
          #plaintext = apply_parser(webpage['html'], plaintext_parser)
          #data_by_language[ webpage['language'] ]["text"] = plaintext

          lang = webpage['language']
          if lang in language_pair:
            try:
              tagchunks = apply_parser(webpage['html'], strand_parser)
              data_by_language[lang]["strand"] = tagchunks
            except:
              pass

      if language_pair[0] in data_by_language and language_pair[1] in data_by_language:
        if opts.annotate:
          data_to_annotate.append((key,
              data_by_language[ language_pair[0] ]["html"],
              data_by_language[ language_pair[1] ]["html"]))
        if opts.out_prefix and ("strand" in
            data_by_language[ language_pair[0]]) and ("strand" in
            data_by_language[ language_pair[1]]):
          en_output = data_by_language[ language_pair[0] ]["strand"].split("\n")
          es_output = data_by_language[ language_pair[1] ]["strand"].split("\n")
          en_tagchunks = strand_aligner.create_tag_chunk_stream(en_output)
          es_tagchunks = strand_aligner.create_tag_chunk_stream(es_output)
          alignment = strand_aligner.align(en_tagchunks, es_tagchunks)
          for (s, t) in alignment:
            if (s and s.tc_type == strand.TCType.CHUNK
                and t and t.tc_type == strand.TCType.CHUNK):
              source_sents = segmenters[0].process(unicode(s.chunk_data))
              target_sents = segmenters[1].process(unicode(t.chunk_data))
              (cost, aligned_source, aligned_target) = gc_aligner.align(
                  source_sents, target_sents)
              for i in xrange(0, len(aligned_source)):
                s_sent = aligned_source[i]
                t_sent = aligned_target[i]
                if alpha_min_length(s_sent, t_sent) >= 5 and end_punc(s_sent, t_sent) == 1:
                  aligned_strand_out[0].write(s_sent + "\n")
                  aligned_strand_out[1].write(t_sent + "\n")
          # Plain text output and alignment (TODO)
          # Document output and alignment (TODO)

    linecount += 1
    if linecount == opts.num_entries:
      break
  in_file.close()

  if opts.out_prefix:
    for out_file in aligned_strand_out:
      out_file.close()
    for out_file in aligned_plaintext_out:
      out_file.close()
    for out_file in plaintext_docs_out:
      out_file.close()

  if opts.annotate:
    mkdir_p(opts.annotate)
    mkdir_p(opts.annotate + "/source")
    mkdir_p(opts.annotate + "/target")
    annotation_file = codecs.open(opts.annotate + "/annotation",
        encoding="utf-8", mode="w")
    shuffle(data_to_annotate)
    for i in xrange(0, opts.annotate_amount):
      (key, source, target) = data_to_annotate[i]
      count_str = "%04d_" % i
      out_source = codecs.open(opts.annotate + "/source/" + count_str +
          url_to_filename(key), encoding="utf-8", mode="w")
      out_target = codecs.open(opts.annotate + "/target/" + count_str +
          url_to_filename(key), encoding="utf-8", mode="w")
      out_source.write(source)
      out_target.write(target)
      annotation_file.write(key)
      annotation_file.write("\n\n")
      out_source.close()
      out_target.close()
    annotation_file.close()