Example #1
def run_txt2tag(target_path, language, limit):
    """Takes txt files and runs the tagger (and segmenter for Chinese) on them. Adds files to
    the language/tag and language/seg directories. Works on pasiphae but not on chalciope."""
    print "[--txt2tag] on %s/%s/txt/" % (target_path, language)
    stages = read_stages(target_path, language)
    tagger = txt2tag.get_tagger(language)
    segmenter = sdp.Segmenter()
    fnames = files_to_process(target_path, language, stages, '--txt2tag', limit)
    count = 0
    for year, fname in fnames:
        count += 1
        txt_file = os.path.join(target_path, language, 'txt', year, fname)
        seg_file = os.path.join(target_path, language, 'seg', year, fname)
        tag_file = os.path.join(target_path, language, 'tag', year, fname)
        if language == 'cn':
            if verbose:
                print "[--txt2tag] %04d creating %s" % (count, seg_file)
            cn_txt2seg.seg(txt_file, seg_file, segmenter)
            if verbose:
                print "[--txt2tag] %04d creating %s" % (count, tag_file)
            cn_seg2tag.tag(seg_file, tag_file, tagger)
        else:
            if verbose:
                print "[--txt2tag] %04d creating %s" % (count, tag_file)
            txt2tag.tag(txt_file, tag_file, tagger)
    update_stages(target_path, language, '--txt2tag', limit)
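A minimal sketch of how this function might be invoked, assuming the <target_path>/<language>/{txt,seg,tag}/<year>/ layout implied by the code above; the path and limit below are hypothetical:

# Hypothetical call: process at most 100 pending Chinese files found under
# /data/patents/cn/txt/<year>/, writing output to the parallel seg/ and tag/ trees.
run_txt2tag("/data/patents", "cn", 100)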
Example #2
def run_txt2seg(rconfig, limit, options, verbose):
    """Takes txt files and runs the Chinese segmenter on them."""

    input_dataset = find_input_dataset(TXT2SEG, rconfig)
    output_dataset = find_output_dataset(TXT2SEG, rconfig)
    print_datasets(TXT2SEG, input_dataset, output_dataset)
    check_file_counts(input_dataset, output_dataset, limit)

    count = 0
    segmenter = sdp.Segmenter()
    swrapper = cn_txt2seg.SegmenterWrapper(segmenter)

    fspecs = get_lines(rconfig.filenames, output_dataset.files_processed,
                       limit)
    for fspec in fspecs:
        count += 1
        filename = fspec.target
        print_file_progress(TXT2SEG, rconfig.corpus, count, filename, verbose)
        file_in, file_out = prepare_io(filename, input_dataset, output_dataset)
        uncompress(file_in)
        #cn_txt2seg.seg(file_in, file_out, segmenter)
        swrapper.process(file_in, file_out)
        compress(file_in, file_out)
        if count % STEP == 0:
            output_dataset.update_processed_count(STEP)

    return (count % STEP, [output_dataset])
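A rough sketch of how this step might be driven, assuming rconfig is the pipeline's runtime-configuration object exposing the corpus and filenames attributes used above; the limit of 500 is made up, and flushing the returned remainder is a guess based on the return value:

# rconfig is assumed to be the runtime configuration object referenced above
# (it must expose .corpus and .filenames); the limit of 500 is hypothetical.
leftover, datasets = run_txt2seg(rconfig, 500, {}, verbose=True)
# presumably the caller records the files not yet counted in full STEP batches
datasets[0].update_processed_count(leftover)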
Example #3
def test_seg_cn():
    input = "/home/j/yzhou/patentWork/data/cn/txt/1999/CN1214051A.xml"
    #input = "/home/j/yzhou/patentWork/data/cn/txt/2009/CN101573383A.xml"
    output = "/home/j/yzhou/patentWork/data/cn/seg/1999/CN1214051A.xml"
    #output = "/home/j/yzhou/patentWork/data/cn/seg/2009/CN101573383A.xml"
    # segment using the Stanford segmenter with the Chinese treebank model
    segmenter = sdp.Segmenter()
    seg(input, output, segmenter)
Example #4
def patent_txt2seg_dir(lang_path, language):
    segmenter = sdp.Segmenter()
    print "Allowing 10 seconds for the segmenter to load its models..."
    sleep(10)
    txt_path = os.path.join(lang_path, language, "txt")
    seg_path = os.path.join(lang_path, language, "seg")
    for year in os.listdir(txt_path):
        txt_year_dir = os.path.join(txt_path, year)
        seg_year_dir = os.path.join(seg_path, year)
        print "[patent_txt2seg_dir] calling txt2seg for dir: %s" % txt_year_dir
        txt2seg_dir(txt_year_dir, seg_year_dir, segmenter)
    print "[patent_txt2seg_dir] finished writing segmented data to %s" % seg_path
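A short usage sketch for this directory walker, assuming a <lang_path>/<language>/txt/<year>/ tree already exists on disk; the path below is hypothetical:

# Hypothetical call: segment every year directory under /data/patents/cn/txt/,
# writing the results to the parallel /data/patents/cn/seg/ tree.
patent_txt2seg_dir("/data/patents", "cn")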
Example #5
 def __init__(self):
     self.segmenter = sdp.Segmenter()
     self.s_input = None
     self.s_output = None
     self.lines = []
Example #6
            else:
                # this is a hack: the segmenter has a normalization error for
                # non-breaking spaces, so replace them with regular spaces here
                line = line.replace(unichr(160), ' ')
                l_seg_string = segmenter.seg(line)
                if l_seg_string != '':
                    s_output.write("%s" % l_seg_string)
    s_input.close()
    s_output.close()


def is_omitable(s):
    """Do not segment strings longer than 500 characters or strings that consist
    entirely of 8-bit (Latin-1) characters."""
    if len(s) > 500:
        return True
    return all(ord(c) < 256 for c in s)


if __name__ == '__main__':

    files_in = sys.argv[1:]
    sdp_segmenter = sdp.Segmenter()
    swrapper = Segmenter()
    use_old = False
    for file_in in files_in:
        file_out = file_in + '.seg'
        if use_old:
            seg(file_in, file_out, sdp_segmenter)
        else:
            swrapper.process(file_in, file_out, verbose=True)
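For illustration, a sketch of how the is_omitable filter above behaves; the sample strings are made up, and in this Python 2 module the lines handed to the segmenter are unicode objects:

is_omitable(u"FIG. 1 shows a block diagram")   # True:  Latin-1 characters only, skipped
is_omitable(u"本发明涉及一种分词装置")           # False: contains CJK characters, segmented
is_omitable(u"x" * 501)                        # True:  longer than 500 characters, skipped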
Example #7
 def __init__(self, segmenter=None):
     self.segmenter = segmenter
     if segmenter is None:
         self.segmenter = sdp.Segmenter()
     self.model_loaded = False
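A brief usage sketch, assuming this initializer belongs to the SegmenterWrapper class seen in Example #2 and that the wrapper exposes the process(file_in, file_out) method used there; the file names below are hypothetical:

# Reuse a single Stanford segmenter instance across several files via the wrapper.
shared_segmenter = sdp.Segmenter()
wrapper = SegmenterWrapper(shared_segmenter)   # or SegmenterWrapper() to let it create its own
wrapper.process("patent_1.txt", "patent_1.seg")
wrapper.process("patent_2.txt", "patent_2.seg")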