def main(args):
    """Build a byte scanner from a feature list and pickle it to disk.

    args: argparse namespace with `input` (a feature file, or a directory
    containing a 'PragerFeats' file) and an optional `output` path.  The
    pickled scanner is the tuple (tk_nextmove, tk_output, nb_features).
    """
    if os.path.isdir(args.input):
        # Directory input: the feature list lives at a conventional name.
        input_path = os.path.join(args.input, 'PragerFeats')
    else:
        input_path = args.input

    if args.output:
        output_path = args.output
    else:
        output_path = input_path + '.scanner'

    # display paths
    logger.info("input path: {0}".format(input_path))
    logger.info("output path: {0}".format(output_path))

    nb_features = read_features(input_path)
    tk_nextmove, tk_output = build_scanner(nb_features)
    scanner = tk_nextmove, tk_output, nb_features

    # Pickle data is binary: open in 'wb' (the original used 'w', which
    # corrupts the dump on platforms that translate newlines).
    with open(output_path, 'wb') as f:
        cPickle.dump(scanner, f)
    logger.info("wrote scanner to {0}".format(output_path))
args = parser.parse_args()

# Exactly one of domain (-d) / language (-l) may be selected: reject both
# the none-given and the both-given cases.
if not (args.domain or args.lang) or (args.domain and args.lang):
    parser.error("exactly one of domain(-d) or language (-l) must be specified")

# Feature list: explicit path, or the conventional name inside the model dir.
if args.features:
    feature_path = args.features
else:
    feature_path = os.path.join(args.model, "DFfeats")
bucketlist_path = os.path.join(args.model, "bucketlist")

if not os.path.exists(feature_path):
    parser.error("{0} does not exist".format(feature_path))

# Read the bucket list via a context manager so the file handle is closed
# (the original `map(str.strip, open(...))` left the handle dangling).
with open(bucketlist_path) as f:
    bucketlist = [line.strip() for line in f]
features = read_features(feature_path)

if args.domain:
    index_path = os.path.join(args.model, "domain_index")
    suffix = ".domain"
elif args.lang:
    index_path = os.path.join(args.model, "lang_index")
    suffix = ".lang"
else:
    # Unreachable after the exactly-one check above; kept as a defensive guard.
    raise ValueError("no event specified")

# Weights: explicit path, or IGweights[.domain|.lang][.bin] in the model dir.
if args.weights:
    weights_path = args.weights
else:
    weights_path = os.path.join(args.model, "IGweights" + suffix + (".bin" if args.binarize else ""))
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("input", metavar="INPUT", help="build a scanner for INPUT. If input is a directory, read INPUT/LDfeats") parser.add_argument("-o","--output", help="output scanner to OUTFILE", metavar="OUTFILE") args = parser.parse_args() if os.path.isdir(args.input): input_path = os.path.join(args.input, 'LDfeats') else: input_path = args.input if args.output: output_path = args.output else: output_path = input_path + '.scanner' # display paths if not SILENT: print "input path:", input_path print "output path:", output_path nb_features = read_features(input_path) tk_nextmove, tk_output = build_scanner(nb_features) scanner = tk_nextmove, tk_output, nb_features with open(output_path, 'w') as f: cPickle.dump(scanner, f) if not SILENT: print "wrote scanner to {0}".format(output_path)
# Output directory: user-supplied (created if needed), else the model dir.
if args.output:
    makedir(args.output)
    out_dir = args.output
else:
    out_dir = model_dir

# Stable language ordering so downstream indices are reproducible.
langs = sorted(all_langs)

# display paths
print "languages({1}): {0}".format(langs, len(langs))
print "model path:", model_dir
print "feature path:", feat_path
print "output path:", out_dir
print "temp (buckets) path:", buckets_dir

feats = read_features(feat_path)

# Index the corpus; keep only (domain, lang, path) — the count field is dropped.
indexer = CorpusIndexer(args.corpus, langs = langs)
items = [ (d,l,p) for (d,l,n,p) in indexer.items ]
if len(items) == 0:
    raise ValueError("found no files!")

print "will process {0} features across {1} paths".format(len(feats), len(items))

# produce a scanner over all the features
tk_nextmove, tk_output = build_scanner(feats)

# Generate a class map over all the languages we are dealing with
cm = generate_cm([ (l,p) for d,l,p in items], len(langs))

# Compute P(t|C)
"exactly one of domain(-d) or language (-l) must be specified")

# Feature list: explicit path, or the conventional name inside the model dir.
if args.features:
    feature_path = args.features
else:
    feature_path = os.path.join(args.model, 'DFfeats')

# Bucket lists: user-supplied list of paths, or the single default bucketlist.
if args.buckets:
    bucketlist_paths = args.buckets
else:
    bucketlist_paths = [os.path.join(args.model, 'bucketlist')]

if not os.path.exists(feature_path):
    parser.error('{0} does not exist'.format(feature_path))

features = read_features(feature_path)

# Select the index file and filename suffix for the chosen event type.
if args.domain:
    index_path = os.path.join(args.model, 'domain_index')
    suffix = '.domain'
elif args.lang:
    index_path = os.path.join(args.model, 'lang_index')
    suffix = '.lang'
else:
    # Defensive guard; the exactly-one check above should prevent this.
    raise ValueError("no event specified")

# Weights: explicit path, or a default derived from the model dir.
if args.weights:
    weights_path = args.weights
else:
    weights_path = os.path.join(
        args.model,
# Persist the item index as CSV rows for later pipeline stages.
with open(index_path, "w") as f:
    writer = csv.writer(f)
    writer.writerows(items)

# Bucket scratch space: user-supplied temp dir, else <model_dir>/buckets.
if args.temp:
    buckets_dir = args.temp
else:
    buckets_dir = os.path.join(model_dir, "buckets")
makedir(buckets_dir)

bucketlist_path = os.path.join(model_dir, "bucketlist")
index_path = os.path.join(model_dir, "paths")

if args.ld_feats:
    # LD features are pre-specified. We are basically just building the NB model.
    LDfeats = read_features(args.ld_feats)
else:
    # LD features not pre-specified, so we compute them.
    # Tokenize
    DFfeats = None
    print "will tokenize %d files" % len(items)
    # TODO: Custom tokenizer if doing custom first-pass features
    if args.df_feats:
        # Custom first-pass features supplied by the user.
        print "reading custom features from:", args.df_feats
        DFfeats = read_features(args.df_feats)
        print "building tokenizer for custom list of {0} features".format(len(DFfeats))
        tk = Scanner(DFfeats)
    elif args.word:
        print "using word tokenizer"
help= "build a scanner for INPUT. If input is a directory, read INPUT/LDfeats" ) parser.add_argument("-o", "--output", help="output scanner to OUTFILE", metavar="OUTFILE") args = parser.parse_args() if os.path.isdir(args.input): input_path = os.path.join(args.input, 'LDfeats') else: input_path = args.input if args.output: output_path = args.output else: output_path = input_path + '.scanner' # display paths print "input path:", input_path print "output path:", output_path nb_features = read_features(input_path) tk_nextmove, tk_output = build_scanner(nb_features) scanner = tk_nextmove, tk_output, nb_features with open(output_path, 'w') as f: cPickle.dump(scanner, f) print "wrote scanner to {0}".format(output_path)
# Persist the item index as CSV rows for later pipeline stages.
with open(index_path, 'w') as f:
    writer = csv.writer(f)
    writer.writerows(items)

# Bucket scratch space: user-supplied temp dir, else <model_dir>/buckets.
if args.temp:
    buckets_dir = args.temp
else:
    buckets_dir = os.path.join(model_dir, 'buckets')
makedir(buckets_dir)

bucketlist_path = os.path.join(model_dir, 'bucketlist')
index_path = os.path.join(model_dir, 'paths')

if args.ld_feats:
    # LD features are pre-specified. We are basically just building the NB model.
    LDfeats = read_features(args.ld_feats)
else:
    # LD features not pre-specified, so we compute them.
    # Tokenize
    DFfeats = None
    print "will tokenize %d files" % len(items)
    # TODO: Custom tokenizer if doing custom first-pass features
    if args.df_feats:
        # Custom first-pass features supplied by the user.
        print "reading custom features from:", args.df_feats
        DFfeats = read_features(args.df_feats)
        print "building tokenizer for custom list of {0} features".format(
            len(DFfeats))
        tk = Scanner(DFfeats)
    elif args.word: