def get_tagger(language):
    """Return the tagger instance configured for a language code.

    Used by batch.py.

    Args:
        language: language code; "en", "de" and "cn" are supported.

    Returns:
        An sdp.STagger built on the tagger model file for that language.

    Raises:
        SystemExit: if there is no tagger for the language. The exit
            message names the offending language code.
    """
    model_by_language = {
        "en": "english-caseless-left3words-distsim.tagger",
        "de": "german-fast.tagger",
        "cn": "chinese.tagger",
    }
    if language not in model_by_language:
        # sys.exit rather than the site-provided exit(): the latter is a
        # convenience for the interactive shell and does not exist when the
        # interpreter runs without the site module.
        import sys
        sys.exit("There is no tagger for language=%s" % language)
    return sdp.STagger(model_by_language[language])
def test_tag_en(input=None, output=None):
    """Run the English tagger over one file.

    Args:
        input: path of the file to tag; when None, a fixed sample patent
            under en_test/txt is used.
        output: path the tagged result is written to; when None, the
            matching file under en_test/tag is used.

    NOTE(review): a later definition of test_tag_en in this file rebinds
    the name, so this version is only reachable if that one is removed.
    """
    default_source = "/home/j/anick/fuse/data/patents/en_test/txt/US20110052365A1.xml"
    default_target = "/home/j/anick/fuse/data/patents/en_test/tag/US20110052365A1.xml"
    source = default_source if input is None else input
    target = default_target if output is None else output
    english_tagger = sdp.STagger("english-caseless-left3words-distsim.tagger")
    tag(source, target, english_tagger)
def process_patent_dir(patent_sent_dir, output_dir):
    """Run process_patent_sent_file over every entry of a directory.

    One chunker schema and one English tagger are created up front and
    shared by all files.

    Args:
        patent_sent_dir: directory whose entries are processed.
        output_dir: directory receiving one output file per input entry,
            under the same file name.
    """
    # Shared by all files: the chunker schema first, then the tagger.
    chunker_schema = sdp.chunker_tech()
    english_tagger = sdp.STagger("english-caseless-left3words-distsim.tagger")
    for entry in os.listdir(patent_sent_dir):
        destination = "/".join((output_dir, entry))
        process_patent_sent_file(
            patent_sent_dir, entry, english_tagger, chunker_schema, destination
        )
def patent_txt2tag_dir(lang_path, language):
    """Tag every subdirectory of <lang_path>/<language>/txt.

    Each entry of <lang_path>/<language>/txt (one per year, going by the
    loop variable) is handed to txt2tag_dir together with the directory of
    the same name under <lang_path>/<language>/tag.

    Args:
        lang_path: directory holding one subdirectory per language.
        language: language code; must be one that get_tagger() supports.

    Raises:
        SystemExit: raised by get_tagger() when there is no tagger for
            the language.
    """
    # Resolve the tagger through the shared helper before touching any
    # directory: an unsupported language is rejected immediately instead of
    # leaving `tagger` unbound for the loop below.
    # note: for "de" the helper picks german-fast, which is much faster than
    # german-dewac although 4% poorer in dealing with unknown words.
    tagger = get_tagger(language)

    language_root = lang_path + "/" + language
    txt_path = language_root + "/txt"
    tag_path = language_root + "/tag"
    for year in os.listdir(txt_path):
        txt_year_dir = txt_path + "/" + year
        tag_year_dir = tag_path + "/" + year
        print("[patent_txt2tag_dir]calling txt2tag for dir: %s" % txt_year_dir)
        txt2tag_dir(txt_year_dir, tag_year_dir, tagger)
    print("[patent_txt2tag_dir]finished writing tagged data to %s" % tag_path)
def test_pm():
    """Run process_patent_sent_file on the fixed PubMed sample file.

    Reads pubmed_lines.txt from the pubmed data directory and passes
    chunks.txt in the same directory as the output file, using a chunker
    schema and the English tagger.
    """
    pubmed_dir = "/home/j/anick/fuse/data/pubmed"
    # alternative input file: "pubmed_lines_test_1.txt"
    lines_name = "pubmed_lines.txt"
    chunks_path = "/home/j/anick/fuse/data/pubmed/chunks.txt"
    # Chunker schema is created before the tagger.
    chunker_schema = sdp.chunker_tech()
    english_tagger = sdp.STagger("english-caseless-left3words-distsim.tagger")
    process_patent_sent_file(
        pubmed_dir, lines_name, english_tagger, chunker_schema, chunks_path
    )
def test_tag_de():
    """Tag three casing variants of one German sample with german-fast.

    For each variant, <name>.xml in the tmp data directory is tagged into
    <name>.tag next to it, and the created path is printed.
    """
    german_tagger = sdp.STagger("german-fast.tagger")
    data_dir = "/home/j/anick/fuse/data/tmp"
    variants = (
        "DE3102424A1_all_caps",
        "DE3102424A1_all_lower",
        "DE3102424A1_first_cap",
    )
    for variant in variants:
        stem = data_dir + "/" + variant
        xml_path = stem + ".xml"
        tagged_path = stem + ".tag"
        tag(xml_path, tagged_path, german_tagger)
        print("Created %s" % tagged_path)
def pipeline_txt2tag_dir(root, language):
    """Tag every file in <root>/txt into a same-named file in <root>/tag.

    Args:
        root: directory containing the "txt" input directory and the "tag"
            output directory.
        language: language code; must be one that get_tagger() supports.

    Raises:
        SystemExit: raised by get_tagger() when there is no tagger for
            the language.
    """
    # Resolve the tagger through the shared helper before listing anything:
    # an unsupported language is rejected immediately instead of leaving
    # `tagger` unbound for the loop below.
    # note: for "de" the helper picks german-fast, which is much faster than
    # german-dewac although 4% poorer in dealing with unknown words.
    tagger = get_tagger(language)

    source_path = os.path.join(root, "txt")
    target_path = os.path.join(root, "tag")
    for file_name in os.listdir(source_path):
        source_file = os.path.join(source_path, file_name)
        target_file = os.path.join(target_path, file_name)
        print("[txt2tag_dir]from %s to %s" % (source_file, target_file))
        tag(source_file, target_file, tagger)
    print("[txt2tag_dir]done")
def test_tag_cn():
    """Tag the fixed Chinese sample file with the Chinese tagger.

    Reads the .seg file under patents/tmp/cn and passes the .tag path next
    to it as the output.
    """
    segmented_path = "/home/j/anick/fuse/data/patents/tmp/cn/CN1394959A-tf.seg"
    tagged_path = "/home/j/anick/fuse/data/patents/tmp/cn/CN1394959A-tf.tag"
    chinese_tagger = sdp.STagger("chinese.tagger")
    tag(segmented_path, tagged_path, chinese_tagger)
def test_tag_en(input=None, output=None):
    """Run the English tagger over one file.

    An earlier definition of the same name appears in this file; because
    this one comes later it is the binding in effect, so it keeps that
    definition's optional parameters and defaults.

    Args:
        input: path of the file to tag; when None, a fixed sample patent
            under en_test/txt is used.
        output: path the tagged result is written to; when None, the
            matching file under en_test/tag is used.
    """
    if input is None:
        input = "/home/j/anick/fuse/data/patents/en_test/txt/US20110052365A1.xml"
    if output is None:
        output = "/home/j/anick/fuse/data/patents/en_test/tag/US20110052365A1.xml"
    # English input needs the English model; this function previously
    # loaded "chinese.tagger", apparently copied from test_tag_cn.
    tagger = sdp.STagger("english-caseless-left3words-distsim.tagger")
    tag(input, output, tagger)