def run_txt2tag(target_path, language, limit): """Takes txt files and runs the tagger (and segmenter for Chinese) on them. Adds files to the language/tag and language/seg directories. Works on pasiphae but not on chalciope.""" print "[--txt2tag] on %s/%s/txt/" % (target_path, language) stages = read_stages(target_path, language) tagger = txt2tag.get_tagger(language) segmenter = sdp.Segmenter() fnames = files_to_process(target_path, language, stages, '--txt2tag', limit) count = 0 for year, fname in fnames: count += 1 txt_file = os.path.join(target_path, language, 'txt', year, fname) seg_file = os.path.join(target_path, language, 'seg', year, fname) tag_file = os.path.join(target_path, language, 'tag', year, fname) if language == 'cn': if verbose: print "[--txt2tag] %04d creating %s" % (count, seg_file) cn_txt2seg.seg(txt_file, seg_file, segmenter) if verbose: print "[--txt2tag] %04d creating %s" % (count, tag_file) cn_seg2tag.tag(seg_file, tag_file, tagger) else: if verbose: print "[--txt2tag] %04d creating %s" % (count, tag_file) txt2tag.tag(txt_file, tag_file, tagger) update_stages(target_path, language, '--txt2tag', limit)
def run_txt2seg(rconfig, limit, options, verbose):
    """Takes txt files and runs the Chinese segmenter on them."""
    input_dataset = find_input_dataset(TXT2SEG, rconfig)
    output_dataset = find_output_dataset(TXT2SEG, rconfig)
    print_datasets(TXT2SEG, input_dataset, output_dataset)
    check_file_counts(input_dataset, output_dataset, limit)
    # one segmenter instance is shared across all files in this run
    swrapper = cn_txt2seg.SegmenterWrapper(sdp.Segmenter())
    fspecs = get_lines(rconfig.filenames, output_dataset.files_processed, limit)
    count = 0
    for count, fspec in enumerate(fspecs, 1):
        fname = fspec.target
        print_file_progress(TXT2SEG, rconfig.corpus, count, fname, verbose)
        file_in, file_out = prepare_io(fname, input_dataset, output_dataset)
        uncompress(file_in)
        swrapper.process(file_in, file_out)
        compress(file_in, file_out)
        # checkpoint the processed-file count every STEP files
        if count % STEP == 0:
            output_dataset.update_processed_count(STEP)
    # remainder not yet checkpointed, plus the datasets that were touched
    return (count % STEP, [output_dataset])
def test_seg_cn():
    """Smoke test: segment one Chinese patent file with the Stanford
    segmenter using the Chinese treebank model."""
    # renamed from `input`/`output` so the `input` builtin is not shadowed
    infile = "/home/j/yzhou/patentWork/data/cn/txt/1999/CN1214051A.xml"
    outfile = "/home/j/yzhou/patentWork/data/cn/seg/1999/CN1214051A.xml"
    # alternative test file:
    #   .../cn/txt/2009/CN101573383A.xml -> .../cn/seg/2009/CN101573383A.xml
    segmenter = sdp.Segmenter()
    seg(infile, outfile, segmenter)
def patent_txt2seg_dir(lang_path, language): segmenter = sdp.Segmenter() print "Allowing 10 seconds for segmenter to load stuff..." sleep(10) txt_path = lang_path + "/" + language + "/txt" seg_path = lang_path + "/" + language + "/seg" for year in os.listdir(txt_path): txt_year_dir = txt_path + "/" + year seg_year_dir = seg_path + "/" + year print "[patent_txt2seg_dir]calling txt2seg for dir: %s" % txt_year_dir txt2seg_dir(txt_year_dir, seg_year_dir, segmenter) print "[patent_txt2seg_dir]finished writing segmented data to %s" % seg_path
def __init__(self):
    """Initialize with a fresh sdp.Segmenter and empty per-document state."""
    # input/output handles and line buffer are filled in later,
    # one document at a time
    self.s_input = None
    self.s_output = None
    self.lines = []
    self.segmenter = sdp.Segmenter()
        else:
            # HACK: the segmenter has a normalization error for
            # non-breaking spaces, so replace them with regular spaces
            # before segmenting.
            # NOTE(review): nesting reconstructed from a collapsed source
            # line; the enclosing def/loop headers are not visible here —
            # confirm indentation against the original file.
            line = line.replace(unichr(160), ' ')
            l_seg_string = segmenter.seg(line)
            if l_seg_string != '':
                s_output.write("%s" % l_seg_string)
    s_input.close()
    s_output.close()


def is_omitable(s):
    """Do not segment anything over 500 characters or with ascii-8 only."""
    if len(s) > 500:
        return True
    # True when every character fits in 8 bits (no CJK content to segment)
    return all(ord(c) < 256 for c in s)


if __name__ == '__main__':
    # Segment each file given on the command line into <file>.seg.
    files_in = sys.argv[1:]
    sdp_segmenter = sdp.Segmenter()
    swrapper = Segmenter()
    # toggle to fall back to the old function-based segmentation path
    use_old = False
    for file_in in files_in:
        file_out = file_in + '.seg'
        if use_old:
            seg(file_in, file_out, sdp_segmenter)
        else:
            swrapper.process(file_in, file_out, verbose=True)
def __init__(self, segmenter=None):
    """Wrap a segmenter; create a default sdp.Segmenter when none is given."""
    if segmenter is None:
        segmenter = sdp.Segmenter()
    self.segmenter = segmenter
    self.model_loaded = False