#
# [sgm]
#   the .sgm file
#
# [apf]
#   the .apf file
#
# Output: Written into the standard output.
#   The line number of the selected sentence and the sentence separated by a space.

import sys
import re
import bisect
from xml.dom.minidom import parse
import serif
import datetime


def main(argv):
    """Pick the best time expression of one document and print its sentence.

    argv is the full argument vector (program name first) and must hold
    exactly three arguments: the reference date, the .sgm file and the .apf
    file.

    Returns the process exit status: 1 when the argument count is wrong,
    0 otherwise (including when no time expression was selected, in which
    case nothing is printed).
    """
    if len(argv) != 4:
        # Diagnostics go to stderr: stdout is reserved for the result line
        # described in the file header, so a usage error must not look like
        # data to whatever consumes the output.
        sys.stderr.write("Usage: pick_recent_date.py [date] [text] [apf]\n")
        return 1

    date = serif.convert_to_date(argv[1])
    text = serif.read_sgm(argv[2])
    data = serif.read_apf(text, argv[3])

    btimex = serif.find_best_timex(date, text, data)
    if not btimex:
        return 0

    # Widen the selected time expression's span with text.expand(), then let
    # serif.resolveCoref() produce the sentence to print for that span.
    start, end = text.expand(btimex.start, btimex.end)
    line = serif.resolveCoref(text, data, start, end)

    # Emits "<number> <sentence>\n"; per the file header the number is the
    # line number of the selected sentence. The sentence is encoded to UTF-8
    # before being written.
    sys.stdout.write("%s %s\n" % (text.find(start) - 1, line.encode("utf-8")))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
return clusters if __name__=='__main__': if len(sys.argv) != 4: print "Usage: pick_recent_date.py [YYYY-MM-DD] [/path/to/cluster/file] [/path/to/apf/directory/]" sys.exit(1) date = serif.convert_to_date(sys.argv[1]) clusters = read_clusters(open(sys.argv[2], 'r')) path = sys.argv[3] for cluster in clusters: btimex = None for article in cluster: article = article.decode('utf-8') try: text = serif.read_sgm(os.path.join(path, article + '.sentences.sgm')) data = serif.read_apf(os.path.join(path, article + '.sentences.sgm.apf')) old_btimex = btimex btimex = serif.find_best_timex(date, text, data, btimex) if btimex != old_btimex: start, end = text.expand(btimex.start, btimex.end) line = serif.resolveCoref(text, data, start, end) print article.encode('utf-8') except IOError: print article.encode('utf-8') + ' # exempted due to error. check if the apf file exists.' if btimex and line: print line.encode('utf-8') print