import codecs
import sys

# PyGaleChurchAligner is this repo's Gale-Church wrapper; the import path
# below is an assumption -- adjust it to wherever the class lives locally.
from gale_church import PyGaleChurchAligner


def main():
    # Sentence-align two parallel UTF-8 files given on the command line:
    # argv[1]/argv[2] are the source/target inputs, argv[3]/argv[4] the outputs.
    source_file = codecs.open(sys.argv[1], encoding="utf-8", mode="r")
    target_file = codecs.open(sys.argv[2], encoding="utf-8", mode="r")
    source_out = codecs.open(sys.argv[3], encoding="utf-8", mode="w")
    target_out = codecs.open(sys.argv[4], encoding="utf-8", mode="w")

    #source = [line.lstrip() for line in source_file.readlines()]
    #target = [line.lstrip() for line in target_file.readlines()]
    source = source_file.readlines()
    target = target_file.readlines()

    aligner = PyGaleChurchAligner()
    (cost, aligned_source, aligned_target) = aligner.align(source, target)
    print(cost, len(aligned_source), len(aligned_target))
    for i in range(len(aligned_source)):
        source_out.write(aligned_source[i] + "\n")
        target_out.write(aligned_target[i] + "\n")

    source_file.close()
    target_file.close()
    source_out.close()
    target_out.close()
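# A minimal usage sketch for the aligner (hypothetical sentences; assumes,
# as the driver above does, that align() takes two lists of sentence strings
# and returns a cost plus two equal-length lists of aligned segments):
#
#   aligner = PyGaleChurchAligner()
#   (cost, src, tgt) = aligner.align(
#       ["Hello world.", "How are you?"],
#       ["Hola mundo.", "¿Cómo estás?"])
#   assert len(src) == len(tgt)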
import optparse
from random import shuffle

from lxml import etree

# Local modules assumed by this driver: the HTML parser targets ("parsers"),
# the STRAND aligner ("strand"), the Gale-Church wrapper above, and helpers
# (read_annotated_data, parse_entry, apply_parser, url_to_filename, mkdir_p,
# alpha_min_length, end_punc, Segmenter) defined elsewhere in the repo.
import parsers
import strand


def main():
    parser = optparse.OptionParser()
    parser.add_option("-i", "--input-file", dest="input_file", default="",
                      type="string",
                      help="Location of the uncompressed mined webpages")
    parser.add_option("-n", "--num_entries", dest="num_entries", default=0,
                      type="int",
                      help="Maximum number of entries to examine, set to 0 for no limit")
    parser.add_option("--language-pair", dest="language_pair",
                      default="English,Spanish",
                      help="Prints parallel data for the comma-separated language pair"
                           + " when used with the --out-prefix option")
    parser.add_option("--out-prefix", dest="out_prefix", default="",
                      help="Parallel data will be output to this location")
    parser.add_option("--annotate", dest="annotate", default="",
                      help="Prints random pages to the given directory for annotation")
    parser.add_option("--annotate_amount", dest="annotate_amount", default=100,
                      type="int", help="Number of document pairs to annotate")
    parser.add_option("--annotation_file", dest="annotation_file", default="",
                      help="A file containing annotations of web page pairs")
    parser.add_option("--annotation_dir", dest="annotation_dir", default="",
                      help="The location of the HTML for the annotated web pages")
    (opts, args) = parser.parse_args()

    # Initialize the HTML parsers
    cleanup_parser = etree.HTMLParser(encoding="utf-8",
                                      target=parsers.CleanupTarget())
    plaintext_parser = etree.HTMLParser(encoding="utf-8",
                                        target=parsers.PlaintextTarget())
    strand_parser = etree.HTMLParser(encoding="utf-8",
                                     target=parsers.StrandTarget())

    # Gale-Church sentence aligner and STRAND structural aligner
    gc_aligner = PyGaleChurchAligner()
    strand_aligner = strand.StrandAligner()

    # When gold annotations are supplied, evaluate the STRAND maxent model
    # by 5-fold cross-validation.
    if opts.annotation_file and opts.annotation_dir:
        data = read_annotated_data(strand_aligner, strand_parser,
                                   opts.annotation_file, opts.annotation_dir)
        folds = 5
        features = []
        true_pos = 0
        false_pos = 0
        total_pos = 0
        correct = 0
        total = len(data)
        stats = {}
        for fold in range(folds):
            print("Fold %d:" % (fold + 1))
            training_data = []
            test_data = []
            for i in range(len(data)):
                if i % folds == fold:
                    test_data.append(data[i])
                else:
                    training_data.append(data[i])
            strand_aligner.me_model.set_training_data(training_data)
            strand_aligner.me_model.lbfgs_train()
            features.append(strand_aligner.me_model.get_features())
            for example in test_data:
                predicted_label = strand_aligner.me_model.get_label(example)
                true_label = example.true_instance
                print(true_label, predicted_label)
                print(strand_aligner.me_model.get_probs(example))
                print(example.instances[predicted_label])
                # Label 0 is the positive (parallel) class.
                if true_label == 0:
                    total_pos += 1
                    if predicted_label == 0:
                        true_pos += 1
                elif true_label != predicted_label:
                    false_pos += 1
        correct = true_pos + total - total_pos - false_pos
        stats["Positives"] = total_pos
        stats["Accuracy"] = (100.0 * correct) / total
        stats["Precision"] = 0.0
        if true_pos + false_pos > 0:
            stats["Precision"] = (100.0 * true_pos) / (true_pos + false_pos)
        stats["Recall"] = (100.0 * true_pos) / total_pos
        stats["F1"] = 0.0
        if stats["Recall"] + stats["Precision"] > 0.0:
            stats["F1"] = (2 * stats["Precision"] * stats["Recall"]
                           / (stats["Recall"] + stats["Precision"]))
        for i in range(len(features)):
            print("Fold %d" % (i + 1))
            print(features[i])
        print(stats)

    if opts.input_file == "":
        print("No input file given")
        return

    language_pair = None
    if opts.language_pair:
        languages = opts.language_pair.split(",")
        if len(languages) != 2:
            print("Error in language pair:", opts.language_pair)
            return
        # TODO: Language codes
        language_pair = (languages[0], languages[1])

    data_to_annotate = []
    aligned_strand_out = None
    aligned_plaintext_out = None
    plaintext_docs_out = None
    segmenters = None
    if opts.out_prefix and language_pair:
        aligned_strand_out = []
        aligned_plaintext_out = []
        plaintext_docs_out = []
        segmenters = []
        for lang in language_pair:
            aligned_strand_out.append(codecs.open(
                "%s.strand.%s" % (opts.out_prefix, lang),
                encoding="utf-8", mode="w"))
            #aligned_plaintext_out.append(codecs.open(
            #    "%s.text.%s" % (opts.out_prefix, lang),
            #    encoding="utf-8", mode="w"))
            #plaintext_docs_out.append(codecs.open(
            #    "%s.docs.%s" % (opts.out_prefix, lang),
            #    encoding="utf-8", mode="w"))
            segmenters.append(Segmenter(lang))

    in_file = open(opts.input_file, "r")
    linecount = 0
    for line in in_file:
        (key, webpages) = parse_entry(line)
        if len(key) == 0:
            print("Malformed entry at line", linecount)
        else:
            # default behavior for now: just print the URL
            #print(url_to_filename(key).encode('utf-8'))
            data_by_language = {}
            for webpage in webpages:
                if webpage['language'] not in data_by_language:
                    data_by_language[webpage['language']] = {}
                if opts.annotate:
                    try:
                        clean_html = apply_parser(webpage['html'], cleanup_parser)
                        data_by_language[webpage['language']]["html"] = clean_html
                    except Exception:
                        pass  # skip pages whose HTML fails to parse
                if opts.out_prefix:
                    #plaintext = apply_parser(webpage['html'], plaintext_parser)
                    #data_by_language[webpage['language']]["text"] = plaintext
                    lang = webpage['language']
                    if lang in language_pair:
                        try:
                            tagchunks = apply_parser(webpage['html'], strand_parser)
                            data_by_language[lang]["strand"] = tagchunks
                        except Exception:
                            pass

            if (language_pair[0] in data_by_language
                    and language_pair[1] in data_by_language):
                if opts.annotate:
                    data_to_annotate.append(
                        (key,
                         data_by_language[language_pair[0]]["html"],
                         data_by_language[language_pair[1]]["html"]))
                if (opts.out_prefix
                        and "strand" in data_by_language[language_pair[0]]
                        and "strand" in data_by_language[language_pair[1]]):
                    en_output = data_by_language[language_pair[0]]["strand"].split("\n")
                    es_output = data_by_language[language_pair[1]]["strand"].split("\n")
                    en_tagchunks = strand_aligner.create_tag_chunk_stream(en_output)
                    es_tagchunks = strand_aligner.create_tag_chunk_stream(es_output)
                    alignment = strand_aligner.align(en_tagchunks, es_tagchunks)
                    for (s, t) in alignment:
                        if (s and s.tc_type == strand.TCType.CHUNK
                                and t and t.tc_type == strand.TCType.CHUNK):
                            # Sentence-split each aligned chunk pair, then run
                            # Gale-Church within the pair.
                            source_sents = segmenters[0].process(str(s.chunk_data))
                            target_sents = segmenters[1].process(str(t.chunk_data))
                            (cost, aligned_source, aligned_target) = gc_aligner.align(
                                source_sents, target_sents)
                            for i in range(len(aligned_source)):
                                s_sent = aligned_source[i]
                                t_sent = aligned_target[i]
                                if (alpha_min_length(s_sent, t_sent) >= 5
                                        and end_punc(s_sent, t_sent) == 1):
                                    aligned_strand_out[0].write(s_sent + "\n")
                                    aligned_strand_out[1].write(t_sent + "\n")
                # Plain text output and alignment (TODO)
                # Document output and alignment (TODO)
        linecount += 1
        if linecount == opts.num_entries:
            break
    in_file.close()

    if opts.out_prefix:
        for out_file in aligned_strand_out:
            out_file.close()
        for out_file in aligned_plaintext_out:
            out_file.close()
        for out_file in plaintext_docs_out:
            out_file.close()

    if opts.annotate:
        # Sample document pairs for manual annotation and write each side to
        # its own numbered file, plus an index of URLs.
        mkdir_p(opts.annotate)
        mkdir_p(opts.annotate + "/source")
        mkdir_p(opts.annotate + "/target")
        annotation_file = codecs.open(opts.annotate + "/annotation",
                                      encoding="utf-8", mode="w")
        shuffle(data_to_annotate)
        for i in range(opts.annotate_amount):
            (key, source, target) = data_to_annotate[i]
            count_str = "%04d_" % i
            out_source = codecs.open(
                opts.annotate + "/source/" + count_str + url_to_filename(key),
                encoding="utf-8", mode="w")
            out_target = codecs.open(
                opts.annotate + "/target/" + count_str + url_to_filename(key),
                encoding="utf-8", mode="w")
            out_source.write(source)
            out_target.write(target)
            annotation_file.write(key)
            annotation_file.write("\n\n")
            out_source.close()
            out_target.close()
        annotation_file.close()