Ejemplo n.º 1
0
def get_tagger(language):
    """Return an sdp.STagger loaded with the model for `language`.

    Used by batch.py. The recognized language codes are "en", "de" and
    "cn". For any other value the process is terminated through exit()
    with a message naming the language.
    """
    known_models = (
        ("en", "english-caseless-left3words-distsim.tagger"),
        ("de", "german-fast.tagger"),
        ("cn", "chinese.tagger"),
    )
    for code, model_name in known_models:
        if language == code:
            return sdp.STagger(model_name)
    # No entry matched: same termination path as an unmatched if/elif chain.
    exit("There is no tagger for language=%s" % language)
Ejemplo n.º 2
0
def test_tag_en(input=None, output=None):
    """Tag one English patent file with the caseless English model.

    `input` and `output` are the two paths handed to tag(). When either
    is None, the hard-coded US20110052365A1.xml path under en_test/txt
    (for input) or en_test/tag (for output) is used instead.
    """
    default_source = "/home/j/anick/fuse/data/patents/en_test/txt/US20110052365A1.xml"
    default_target = "/home/j/anick/fuse/data/patents/en_test/tag/US20110052365A1.xml"
    source = default_source if input is None else input
    target = default_target if output is None else output
    english_tagger = sdp.STagger("english-caseless-left3words-distsim.tagger")
    tag(source, target, english_tagger)
Ejemplo n.º 3
0
def process_patent_dir(patent_sent_dir, output_dir):
    """Call process_patent_sent_file for every entry of patent_sent_dir.

    One chunker schema (sdp.chunker_tech) and one English caseless tagger
    are created up front and shared by all calls. The output path passed
    for each entry is output_dir + "/" + the entry's name.
    """
    chunk_schema = sdp.chunker_tech()
    english_tagger = sdp.STagger("english-caseless-left3words-distsim.tagger")

    for entry in os.listdir(patent_sent_dir):
        process_patent_sent_file(
            patent_sent_dir, entry, english_tagger, chunk_schema,
            "/".join((output_dir, entry)))
Ejemplo n.º 4
0
def patent_txt2tag_dir(lang_path, language):
    """Run txt2tag_dir on every year directory of one language.

    For each entry `year` of <lang_path>/<language>/txt, txt2tag_dir is
    called with that directory, the matching
    <lang_path>/<language>/tag/<year> directory, and a tagger for
    `language`. Progress lines are printed before each call and once at
    the end.

    Args:
        lang_path: directory that contains the per-language subdirectories.
        language: language code; "en", "de" or "cn".

    Raises:
        ValueError: if there is no tagger model for `language`.
    """
    # note: german-fast is much faster than german-dewac although 4% poorer
    # in dealing with unknown words.
    models = {
        "en": "english-caseless-left3words-distsim.tagger",
        "de": "german-fast.tagger",
        "cn": "chinese.tagger",
    }
    if language not in models:
        # Fail fast: without this check an unsupported language left `tagger`
        # unbound and only surfaced as an UnboundLocalError inside the loop
        # (or not at all when the txt directory was empty).
        raise ValueError("There is no tagger for language=%s" % language)
    tagger = sdp.STagger(models[language])

    txt_path = lang_path + "/" + language + "/txt"
    tag_path = lang_path + "/" + language + "/tag"
    for year in os.listdir(txt_path):
        txt_year_dir = txt_path + "/" + year
        tag_year_dir = tag_path + "/" + year
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("[patent_txt2tag_dir]calling txt2tag for dir: %s" % txt_year_dir)
        txt2tag_dir(txt_year_dir, tag_year_dir, tagger)
    print("[patent_txt2tag_dir]finished writing tagged data to %s" % tag_path)
Ejemplo n.º 5
0
def test_pm():
    """Run process_patent_sent_file on the hard-coded pubmed sample.

    The directory and file name of pubmed_lines.txt are passed as input,
    together with an English caseless tagger, a chunker schema from
    sdp.chunker_tech, and chunks.txt in the same directory as output_file.
    """
    pubmed_dir = "/home/j/anick/fuse/data/pubmed"
    # Alternative input name left by the original author:
    # "pubmed_lines_test_1.txt"
    lines_name = "pubmed_lines.txt"
    chunks_path = "/home/j/anick/fuse/data/pubmed/chunks.txt"

    chunk_schema = sdp.chunker_tech()
    english_tagger = sdp.STagger("english-caseless-left3words-distsim.tagger")

    process_patent_sent_file(
        pubmed_dir, lines_name, english_tagger, chunk_schema, chunks_path)
Ejemplo n.º 6
0
def test_tag_de():
    """Tag three case variants of DE3102424A1 with the German fast model.

    For each variant, <name>.xml in the tmp data directory is passed to
    tag() with <name>.tag as the output path, and a "Created ..." line
    naming that output path is printed afterwards.
    """
    german_tagger = sdp.STagger("german-fast.tagger")
    tmp_dir = "/home/j/anick/fuse/data/tmp"
    variants = (
        "DE3102424A1_all_caps",
        "DE3102424A1_all_lower",
        "DE3102424A1_first_cap",
    )
    for variant in variants:
        stem = tmp_dir + "/" + variant
        tagged_path = stem + ".tag"
        tag(stem + ".xml", tagged_path, german_tagger)
        print("Created %s" % tagged_path)
Ejemplo n.º 7
0
def pipeline_txt2tag_dir(root, language):
    """Tag every file of <root>/txt into <root>/tag.

    Each entry of <root>/txt is passed to tag() together with the path of
    the same name under <root>/tag and a tagger for `language`. A progress
    line is printed per file and once at the end.

    Args:
        root: directory that contains the "txt" and "tag" subdirectories.
        language: language code; "en", "de" or "cn".

    Raises:
        ValueError: if there is no tagger model for `language`.
    """
    # note: german-fast is much faster than german-dewac although 4% poorer
    # in dealing with unknown words.
    models = {
        "en": "english-caseless-left3words-distsim.tagger",
        "de": "german-fast.tagger",
        "cn": "chinese.tagger",
    }
    if language not in models:
        # Fail fast: without this check an unsupported language left `tagger`
        # unbound and only surfaced as an UnboundLocalError inside the loop
        # (or not at all when the txt directory was empty).
        raise ValueError("There is no tagger for language=%s" % language)

    source_path = os.path.join(root, "txt")
    target_path = os.path.join(root, "tag")
    tagger = sdp.STagger(models[language])

    for file_name in os.listdir(source_path):
        source_file = source_path + "/" + file_name
        target_file = target_path + "/" + file_name
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("[txt2tag_dir]from %s to %s" % (source_file, target_file))
        tag(source_file, target_file, tagger)
    print("[txt2tag_dir]done")
Ejemplo n.º 8
0
def test_tag_cn():
    """Tag the CN1394959A-tf.seg sample with the Chinese model.

    The hard-coded .seg path is passed to tag() as input and the .tag path
    next to it as output.
    """
    seg_path = "/home/j/anick/fuse/data/patents/tmp/cn/CN1394959A-tf.seg"
    tagged_path = "/home/j/anick/fuse/data/patents/tmp/cn/CN1394959A-tf.tag"
    tag(seg_path, tagged_path, sdp.STagger("chinese.tagger"))
Ejemplo n.º 9
0
def test_tag_en():
    """Tag the US20110052365A1.xml English sample with the English model.

    The hard-coded en_test/txt path is passed to tag() as input and the
    en_test/tag path as output.
    """
    input = "/home/j/anick/fuse/data/patents/en_test/txt/US20110052365A1.xml"
    output = "/home/j/anick/fuse/data/patents/en_test/tag/US20110052365A1.xml"
    # This English test previously loaded "chinese.tagger" (copy-paste from
    # the Chinese test); use the English caseless model that the other
    # English code paths load.
    tagger = sdp.STagger("english-caseless-left3words-distsim.tagger")
    tag(input, output, tagger)