Exemple #1
0
 def __init__(self, tag_file, phr_feats_file, year, lang,
              filter_p=True, chunker_rules='en', compress=True):
     """Record the document settings, set up empty indexes, then chunk.

     tag_file is kept as self.input and phr_feats_file as self.output;
     year, lang and compress are stored under their own names.
     chunker_rules is handed to sentence.chunk_schema() to produce
     self.chunk_schema and is also forwarded, together with filter_p,
     to self.process_doc(), which is invoked as the final step.
     """
     # Caller-supplied settings.
     self.input, self.output = tag_file, phr_feats_file
     self.year = year
     self.chunk_schema = sentence.chunk_schema(chunker_rules)
     self.lang, self.compress = lang, compress

     # Maps field_name -> list of sent instances; the field name is the
     # header string without its FH_ or : affixes.
     self.d_field = {}
     # Lookup tables: sent id -> sent instance, chunk id -> chunk instance.
     # Two separate dict objects on purpose (they must not be aliased).
     self.d_sent, self.d_chunk = {}, {}
     # Both id counters start at zero.
     self.next_sent_id = self.next_chunk_id = 0
     # Lower-cased noun tokens appearing in the title.
     self.l_lc_title_noun = []

     # All state is initialised; now create the chunks.
     self.process_doc(filter_p, chunker_rules)
Exemple #2
0
def test_t2c_de_tag_sig():
    """Build and return a Doc for the "de" tag_sig_test sample.

    The tagged input path, both output paths, the year ("1982") and the
    language ("de") are hard-coded; Doc receives them positionally in that
    order.
    """
    tagged_xml = "/home/j/anick/fuse/data/patents/de_test/tag_sig_test.xml"
    phr_occ_out = "/home/j/anick/fuse/data/patents/de_test/tag_sig_test.phr_occ"
    phr_feats_out = "/home/j/anick/fuse/data/patents/de_test/tag_sig_test.phr_feats"
    # The schema object is not used below; the call is retained so this
    # function performs exactly the same steps as before.
    schema = sentence.chunk_schema("de")
    return Doc(tagged_xml, phr_occ_out, phr_feats_out, "1982", "de")
Exemple #3
0
def test_t2c_de(filter_p):
    """Build and return a Doc for the DE3102424A1 sample patent.

    filter_p is passed straight through to Doc as its sixth positional
    argument; the paths, the year ("1980") and the language ("de") are
    hard-coded.
    """
    tagged_xml = "/home/j/anick/fuse/data/patents/de/tag/1982/DE3102424A1.xml"
    phr_occ_out = "/home/j/anick/fuse/data/patents/de_test/DE3102424A1.phr_occ"
    phr_feats_out = "/home/j/anick/fuse/data/patents/de_test/DE3102424A1.phr_feats"
    # The schema object is not used below; the call is retained so this
    # function performs exactly the same steps as before.
    schema = sentence.chunk_schema("de")
    # NOTE(review): the input lives under tag/1982 but the year passed on
    # is "1980" -- confirm which one is intended.
    doc_year = "1980"
    return Doc(tagged_xml, phr_occ_out, phr_feats_out, doc_year, "de",
               filter_p)
Exemple #4
0
def patent_tag2chunk_dir(patent_path, language, filter_p=True):
    """Call tag2chunk_dir once per entry of <patent_path>/<language>/tag.

    Every name returned by os.listdir() for the tag directory is treated
    as a year. For each one, tag2chunk_dir receives the year's tag,
    phr_occ and phr_feats directories, the year name, the language and
    filter_p. A progress line is printed before each call and a summary
    line once the loop is done.
    """
    lang_root = "/".join((patent_path, language))
    occ_root = lang_root + "/phr_occ"
    feats_root = lang_root + "/phr_feats"
    tag_root = lang_root + "/tag"
    # The schema object is not used below; the call is retained so this
    # function performs exactly the same steps as before.
    schema = sentence.chunk_schema(language)

    for year in os.listdir(tag_root):
        occ_dir = "/".join((occ_root, year))
        feats_dir = "/".join((feats_root, year))
        tag_dir = "/".join((tag_root, year))
        # A parenthesised single-argument print emits the same text under
        # the Python 2 print statement and the Python 3 print function.
        print("[patent_tag2chunk_dir]calling tag2chunk, filter_p: %s, output dirs: %s, %s"
              % (filter_p, feats_dir, occ_dir))
        tag2chunk_dir(tag_dir, occ_dir, feats_dir, year, language, filter_p)

    print("[patent_tag2chunk_dir]finished writing chunked data to %s and %s"
          % (occ_root, feats_root))