def extract_features(txt_io, feat_io, cword_io, train=False, factor_files=None):
    """Extract a feature string for every confusion word found in the input.

    Builds a confusion-set pair and a feature extractor from the module-level
    ``config``, verifies that the factors required by the extractor are
    covered by ``factor_files``, and then walks the factorized input. For
    each confusion word found, the feature string produced by the extractor
    is written to ``feat_io`` and the word is passed to a ``CWordReader``
    wrapping ``cword_io``.

    Args:
        txt_io: Input text stream. Must expose a ``name`` attribute (used in
            the log message) and is handed to ``each_factorized_input``.
        feat_io: Output stream; receives one ``write`` call per confusion
            word with the extracted feature string.
        cword_io: Stream wrapped by ``CWordReader``; ``reader.format`` is
            called once per confusion word with the sentence id and the word.
        train: Forwarded unchanged to ``CWordFinder``.
        factor_files: Forwarded to ``check_factor_requirements`` and
            ``each_factorized_input`` -- presumably a mapping from factor
            name to file; verify against those helpers. ``None`` (the
            default) is treated as an empty dict.

    Returns:
        None. Results are emitted through ``feat_io`` and ``cword_io``; the
        number of confusion words found is only logged.
    """
    # Use a fresh dict per call instead of a shared mutable default, so that
    # nothing done to it downstream can leak into later calls.
    if factor_files is None:
        factor_files = {}

    csets = CSetPair(config['source-cset'], config['target-cset'])
    extractor = FeatureExtractor(csets, config['features'], config['costs'])
    # Check factor requirements up front, before any input is processed.
    check_factor_requirements(extractor.required_factors(), factor_files)

    finder = CWordFinder(csets, train)
    if config['nulls-ngrams']:
        # Optional extra finder, enabled only when 'nulls-ngrams' is truthy.
        null_finder = NullFinder(csets.src, config['nulls-ngrams'])
        finder.add_extra_finder(null_finder)
    reader = CWordReader(cword_io)

    log.info("Extract features from {}".format(txt_io.name))

    count = 0
    for sid, line, fact_sent in each_factorized_input(txt_io, factor_files):
        for cword in finder.find_confusion_words(line, fact_sent):
            feat_str = extractor.extract_features(cword, fact_sent)
            feat_io.write(feat_str)
            reader.format(sid, cword)
            count += 1

    log.info("Found {} confusion words".format(count))