import glob
from pathlib import Path

from progressbar import ProgressBar
from ucto import Tokenizer


def uctoTokenizer(input_text: str, output_text: str):
    """
    Create a FoLiA file from a text file (or from every file in a folder).

    :param input_text: path to a text file or to a folder of text files
    :param output_text: name for the FoLiA file (or for the output folder)
    """
    configurationfile = "../ucto_config/tokconfig_eng_ch"
    tokenizer = Tokenizer(configurationfile, foliaoutput=True)

    folia_file_P = Path(input_text)
    isFolder = folia_file_P.is_dir()
    if isFolder:
        # files = [f for f in glob.glob(folia_file + "**/*.xml", recursive=True)]
        files = [f for f in glob.glob(input_text + "**/*.*", recursive=True)]
        pbar = ProgressBar()
        path = Path(output_text)
        if not path.exists():
            path.mkdir()
        for f in pbar(files):
            # Keep only the file name and map the .txt extension to .folia.xml
            path_out = f.split("/")
            name_out = path / path_out[-1]
            out = str(name_out).replace(".txt", ".folia.xml")
            tokenizer.tokenize(f, str(out))
    else:
        tokenizer.tokenize(input_text, output_text)
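# A minimal usage sketch (not part of the original snippet; the folder names are
# hypothetical). Assuming python-ucto is installed and the configuration file
# referenced above exists, this would tokenize every file under "stories_txt/"
# and write FoLiA XML output into "stories_folia/".
if __name__ == "__main__":
    uctoTokenizer("stories_txt/", "stories_folia")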
import csv
import re
import sys

from lxml import etree
from ucto import Tokenizer

tokenizer = Tokenizer('-L nl -n -Q')
data = etree.parse(sys.argv[1])

with open(sys.argv[1] + ".csv", 'w') as output:
    writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
    for row in data.getroot():
        _, idnumber, text, summary = list(row.iterchildren())
        motifs = set()
        if summary.text is None:
            continue
        # Motif codes are listed between square brackets in the summary field
        for motiflist in re.findall(r'\[.*?\]', summary.text):
            for motif in re.findall(r'[^\s\[\],]+', motiflist):
                if not re.search('[0-9]', motif):
                    continue
                motifs.add(motif)
        if motifs:
            print(idnumber.text)
            text = tokenizer.tokenize(text.text, verbose=False)
            writer.writerow([idnumber.text, ' '.join(motifs), ' '.join(text)])
import codecs
import os
import sys
import time

from ucto import Tokenizer

# args, settings, config and classifier are defined elsewhere in the original script.

# Check if trainingfile or instancebase is an existing file and add it to the
# configuration. If no file is given we stick to the default file that comes
# with a particular classifier.
if args.trainingfile:
    if not os.path.isfile(args.trainingfile):
        raise IOError('Trainingfile not found')
    settings['f'] = args.trainingfile
    del settings['i']
elif args.instancebase:
    if not os.path.isfile(args.instancebase):
        raise IOError('Instancebase not found')
    settings['i'] = args.instancebase

tokenizer = Tokenizer('-L nl -n -Q')
with classifier(config.HOST, config.PORT, settings) as program:
    args.output.write(codecs.BOM_UTF8)
    for i, line in enumerate(codecs.open(args.testfile, encoding=config.ENCODING)):
        words = tokenizer.tokenize(line.strip(), tokens=lambda s: s.split())
        output = []
        for word in words:
            results = program.classify(word)
            output.append(program.pprint_results(results))
        for word, result in zip(words, output):
            args.output.write(
                u'{0}\t{1}\n'.format(word, result).encode(config.ENCODING))
        if (i + 1) % 25 == 0:
            sys.stderr.write(
                'Processed: {0} words @ {1}\n'.format(i, time.ctime()))