def __init__(self, tag_file, phr_feats_file, year, lang,
             filter_p=True, chunker_rules='en', compress=True):
    """Record the document settings, reset the indexes and build the chunks.

    tag_file       -- stored as self.input
    phr_feats_file -- stored as self.output
    year           -- stored as self.year
    lang           -- stored as self.lang
    filter_p       -- handed on unchanged to self.process_doc()
    chunker_rules  -- passed to sentence.chunk_schema() to obtain
                      self.chunk_schema, and also handed to process_doc()
    compress       -- stored as self.compress

    The last step calls self.process_doc(filter_p, chunker_rules), so
    every attribute set here is in place before that call runs.
    """
    # Paths and per-document settings supplied by the caller.
    self.input = tag_file
    self.output = phr_feats_file
    self.year = year
    self.chunk_schema = sentence.chunk_schema(chunker_rules)
    self.lang = lang
    self.compress = compress

    # Maps a field name to a list of sent instances.  The field name is
    # the header string with its FH_ and ':' affixes removed.
    self.d_field = {}

    # Lookup tables: sent id -> sent instance, chunk id -> chunk instance.
    # Two separate dict objects on purpose.
    self.d_sent, self.d_chunk = {}, {}

    # Counters holding the next sent / chunk id; both start at zero.
    self.next_sent_id = 0
    self.next_chunk_id = 0

    # Lowercased noun tokens that appear in the title.
    self.l_lc_title_noun = []

    # Everything is initialised; now create the chunks.
    self.process_doc(filter_p, chunker_rules)
def test_t2c_de_tag_sig():
    """Construct a Doc for the German tag_sig test file and return it.

    All paths, the year ("1982") and the language ("de") are hard-coded.
    """
    lang = "de"
    year = "1982"
    tag_file = "/home/j/anick/fuse/data/patents/de_test/tag_sig_test.xml"
    phr_occ_file = "/home/j/anick/fuse/data/patents/de_test/tag_sig_test.phr_occ"
    phr_feats_file = "/home/j/anick/fuse/data/patents/de_test/tag_sig_test.phr_feats"

    # The schema object is not used below; the call is retained as in the
    # original in case it has effects this function cannot see.
    schema = sentence.chunk_schema(lang)

    # NOTE(review): the __init__ visible in this file takes a single output
    # path (tag_file, phr_feats_file, year, lang, ...).  If that is Doc's
    # constructor, the extra phr_occ argument shifts year/lang by one
    # position -- confirm against the Doc class before relying on this test.
    return Doc(tag_file, phr_occ_file, phr_feats_file, year, lang)
def test_t2c_de(filter_p):
    """Construct a Doc for the German patent DE3102424A1 and return it.

    filter_p -- passed through to Doc as its final positional argument;
                the caller chooses the value.

    The paths, year and language are hard-coded.
    """
    lang = "de"
    # NOTE(review): the year is "1980" although the input path below lives
    # under tag/1982 -- confirm which one is intended.
    year = "1980"
    tag_file = "/home/j/anick/fuse/data/patents/de/tag/1982/DE3102424A1.xml"
    phr_occ_file = "/home/j/anick/fuse/data/patents/de_test/DE3102424A1.phr_occ"
    phr_feats_file = "/home/j/anick/fuse/data/patents/de_test/DE3102424A1.phr_feats"

    # The schema object is not used below; the call is retained as in the
    # original in case it has effects this function cannot see.
    schema = sentence.chunk_schema(lang)

    # NOTE(review): the __init__ visible in this file takes a single output
    # path (tag_file, phr_feats_file, year, lang, filter_p, ...).  If that is
    # Doc's constructor, the extra phr_occ argument shifts the later
    # arguments by one position -- confirm against the Doc class.
    return Doc(tag_file, phr_occ_file, phr_feats_file, year, lang, filter_p)
def patent_tag2chunk_dir(patent_path, language, filter_p=True):
    """Run tag2chunk_dir() once per entry found under <patent_path>/<language>/tag.

    patent_path -- root directory; the language subdirectory is appended to it
    language    -- subdirectory name, also passed to sentence.chunk_schema()
                   and on to tag2chunk_dir()
    filter_p    -- handed on unchanged to tag2chunk_dir()

    For each entry (treated as a year) returned by os.listdir() on the tag
    directory, the matching phr_occ/<year> and phr_feats/<year> paths are
    built and passed to tag2chunk_dir().  Progress messages are printed
    before each call and once after the loop.
    """
    lang_root = patent_path + "/" + language
    tag_root = lang_root + "/tag"
    occ_root = lang_root + "/phr_occ"
    feats_root = lang_root + "/phr_feats"

    # The schema object is not used below; the call is retained as in the
    # original in case it has effects this function cannot see.
    c_schema = sentence.chunk_schema(language)

    for year in os.listdir(tag_root):
        tag_year = tag_root + "/" + year
        occ_year = occ_root + "/" + year
        feats_year = feats_root + "/" + year
        # Parenthesised single-expression form: prints the same text as the
        # bare print statement under Python 2.
        print("[patent_tag2chunk_dir]calling tag2chunk, filter_p: %s, output dirs: %s, %s"
              % (filter_p, feats_year, occ_year))
        tag2chunk_dir(tag_year, occ_year, feats_year, year, language, filter_p)

    print("[patent_tag2chunk_dir]finished writing chunked data to %s and %s"
          % (occ_root, feats_root))