# print("running here 0") #给bucket构造路径 buckets_dir = os.path.join(model_dir, 'buckets') makedir(buckets_dir) print "建成完毕." #计算得到特征 # Tokenize DFfeats = None print "will tokenize %d documents" % len(items) print "using byte NGram tokenizer, max_order: {0}".format(4) tk = NGramTokenizer(1, 4) # 首次通过标记化,用于确定特征的DF tk_dir = os.path.join(buckets_dir, 'tokenize-pass1') makedir(tk_dir) b_dirs = build_index(items, tk, tk_dir, 64, None, 50, None, 140, False) print("running here 0") doc_count = tally(b_dirs, None) DFfeats = ngram_select(doc_count, 4, 15000) shutil.rmtree(tk_dir) # 再次仅为所选的特征计数 DF_scanner = Scanner(DFfeats) df_dir = os.path.join(buckets_dir, 'tokenize-pass2') makedir(df_dir) b_dirs = build_index(items, DF_scanner, df_dir, 64, None, 50) b_dirs = [[d] for d in b_dirs] # 计算向量值 domain_dist_vec = numpy.array([ domain_dist[domain_index[d]]
if args.df_feats:
  print "reading custom features from:", args.df_feats
  DFfeats = read_features(args.df_feats)
  print "building tokenizer for custom list of {0} features".format(len(DFfeats))
  tk = Scanner(DFfeats)
elif args.word:
  print "using word tokenizer"
  tk = str.split
else:
  print "using byte NGram tokenizer, max_order: {0}".format(args.max_order)
  tk = NGramTokenizer(1, args.max_order)

# First-pass tokenization, used to determine DF of features
b_dirs = build_index(items, tk, buckets_dir, args.buckets, args.jobs, args.chunksize)

if args.debug:
  # output the paths to the buckets
  with open(bucketlist_path, 'w') as f:
    for d in b_dirs:
      f.write(d + '\n')

# We need to compute a tally if we are selecting features by DF, but also if
# we want full debug output.
if DFfeats is None or args.debug:
  # Compute DF per-feature
  doc_count = tally(b_dirs, args.jobs)

  if args.debug:
    doc_count_path = os.path.join(model_dir, 'DF_all')
    write_weights(doc_count, doc_count_path)
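For reference, the byte NGramTokenizer selected in the else branch conceptually emits every contiguous byte slice between the minimum and maximum order. The function below is a simplified stand-in to make the feature space concrete, not langid.py's actual implementation:

def byte_ngrams(data, min_order=1, max_order=4):
  # data is a byte string; yield every contiguous slice of length min_order..max_order
  for i in range(len(data)):
    for n in range(min_order, max_order + 1):
      if i + n <= len(data):
        yield data[i:i + n]

print(sorted(set(byte_ngrams("ab", max_order=2))))
# -> ['a', 'ab', 'b']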
# TODO: Custom tokenizer if doing custom first-pass features
if args.df_feats:
  print "reading custom features from:", args.df_feats
  DFfeats = read_features(args.df_feats)
  print "building tokenizer for custom list of {0} features".format(len(DFfeats))
  tk = Scanner(DFfeats)
elif args.word:
  print "using word tokenizer"
  tk = str.split
else:
  print "using byte NGram tokenizer, max_order: {0}".format(args.max_order)
  tk = NGramTokenizer(1, args.max_order)

# First-pass tokenization, used to determine DF of features
b_dirs = build_index(
  items,
  tk,
  buckets_dir,
  args.buckets,
  args.jobs,
  args.chunksize,
  args.sample_count,
  args.sample_size,
)

if args.debug:
  # output the paths to the buckets
  with open(bucketlist_path, "w") as f:
    for d in b_dirs:
      f.write(d + "\n")

# We need to compute a tally if we are selecting features by DF, but also if
# we want full debug output.
if DFfeats is None or args.debug:
  # Compute DF per-feature
  doc_count = tally(b_dirs, args.jobs)

  if args.debug:
    doc_count_path = os.path.join(model_dir, "DF_all")
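The tally step computes a per-feature document frequency: the number of documents in which a feature occurs at least once, not its total number of occurrences. A toy, in-memory version of that idea (the real tally() works over the bucket directories produced by build_index rather than over Python lists):

from collections import Counter

def toy_tally(tokenized_docs):
  df = Counter()
  for doc_feats in tokenized_docs:
    df.update(set(doc_feats))   # set(): count each feature at most once per document
  return df

docs = [['th', 'he', 'th'], ['he', 'lo'], ['th']]
print(toy_tally(docs))
# 'th' and 'he' occur in 2 documents each, 'lo' in 1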
if args.df_feats:
  print "reading custom features from:", args.df_feats
  DFfeats = read_features(args.df_feats)
  print "building tokenizer for custom list of {0} features".format(len(DFfeats))
  tk = Scanner(DFfeats)
elif args.word:
  print "using word tokenizer"
  tk = str.split
else:
  print "using byte NGram tokenizer, max_order: {0}".format(args.max_order)
  tk = NGramTokenizer(1, args.max_order)

# First-pass tokenization, used to determine DF of features
tk_dir = os.path.join(buckets_dir, 'tokenize-pass1')
makedir(tk_dir)
b_dirs = build_index(items, tk, tk_dir, args.buckets, args.jobs, args.chunksize, args.sample_count, args.sample_size, args.line)

if args.debug:
  # output the paths to the buckets
  bucketlist_path = os.path.join(model_dir, 'bucketlist')
  with open(bucketlist_path, 'w') as f:
    for d in b_dirs:
      f.write(d + '\n')

# We need to compute a tally if we are selecting features by DF, but also if
# we want full debug output.
if DFfeats is None or args.debug:
  # Compute DF per-feature
  doc_count = tally(b_dirs, args.jobs)

  if args.debug:
    doc_count_path = os.path.join(model_dir, 'DF_all')
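Everything in this block is driven by the argparse namespace. Purely as an illustration, here is a namespace carrying the attributes the code reads, with values mirroring the hard-coded call in the modified block at the top; these are assumptions for the sketch, not the tool's documented defaults:

import argparse

args = argparse.Namespace(
  df_feats=None,      # no custom feature list, so fall through to NGramTokenizer
  word=False,         # not using the word tokenizer
  max_order=4,        # byte n-grams up to length 4
  buckets=64,
  jobs=None,          # let the worker pool pick its own size
  chunksize=50,
  sample_count=None,
  sample_size=140,
  line=False,
  debug=False,
)
# args.max_order, args.buckets, etc. are then consumed exactly as in the block above.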