# 计算IG权重 ig_params = [ ('lang', lang_dist_vec, '.lang', True), ] print("come here 111") ig_params.append(('domain', domain_dist_vec, '.domain', False)) ig_vals = {} for label, dist, suffix, binarize in ig_params: print "Computing information gain for {0}".format(label) ig = compute_IG(b_dirs, DFfeats, dist, binarize, suffix, None) ig_vals[label] = dict((row[0], numpy.array(row[1].flat)) for row in ig) # 根据LD选择特征 features_per_lang = select_LD_features(ig_vals['lang'], ig_vals.get('domain'), 300, ignore_domain=canshu.no_domain_ig) LDfeats = reduce(set.union, map(set, features_per_lang.values())) print 'selected %d features' % len(LDfeats) # 编译LD特征的扫描值 tk_nextmove, tk_output = build_scanner(LDfeats) # 组合成NB模型 langs = sorted(lang_index, key=lang_index.get) nb_classes = langs nb_dir = os.path.join(buckets_dir, 'NBtrain') makedir(nb_dir) nb_pc, nb_ptc = learn_nb_params([(int(l), p) for _, l, p in items], len(langs), tk_nextmove, tk_output, nb_dir,
('lang', lang_dist_vec, '.lang', True), ] if not args.no_domain_ig: ig_params.append( ('domain', domain_dist_vec, '.domain', False) ) ig_vals = {} for label, dist, suffix, binarize in ig_params: print "Computing information gain for {0}".format(label) ig = compute_IG(b_dirs, DFfeats, dist, binarize, suffix, args.jobs) if args.debug: weights_path = os.path.join(model_dir, 'IGweights' + suffix + ('.bin' if binarize else '')) write_weights(ig, weights_path) ig_vals[label] = dict((row[0], numpy.array(row[1].flat)) for row in ig) # Select features according to the LD criteria features_per_lang = select_LD_features(ig_vals['lang'], ig_vals['domain'], args.feats_per_lang, ignore_domain = args.no_domain_ig) LDfeats = reduce(set.union, map(set, features_per_lang.values())) print 'selected %d features' % len(LDfeats) if args.debug: feature_path = os.path.join(model_dir, 'LDfeats') write_features(sorted(LDfeats), feature_path) print 'wrote LD features to "%s"' % feature_path with open(feature_path + '.perlang', 'w') as f: writer = csv.writer(f) for i in range(len(features_per_lang)): writer.writerow(map(repr,features_per_lang[i])) print 'wrote LD.perlang features to "%s"' % feature_path + '.perlang' # Compile a scanner for the LDfeats
# Information-gain (IG) configuration; each entry is unpacked below as
# (label, dist, suffix, binarize).
ig_params = [("lang", lang_dist_vec, ".lang", True)]
# The domain IG is only computed when it has not been disabled.
if not args.no_domain_ig:
    ig_params.append(("domain", domain_dist_vec, ".domain", False))

ig_vals = {}
for label, dist, suffix, binarize in ig_params:
    print("Computing information gain for {0}".format(label))
    ig = compute_IG(b_dirs, DFfeats, dist, binarize, suffix, args.jobs)
    if args.debug:
        # Debug dump: IGweights<suffix>, with ".bin" appended when binarized.
        weights_name = "IGweights" + suffix + (".bin" if binarize else "")
        weights_path = os.path.join(model_dir, weights_name)
        write_weights(ig, weights_path)
    # Index the rows by their first element; the second element is
    # flattened into a 1-D array.
    ig_vals[label] = dict((row[0], numpy.array(row[1].flat)) for row in ig)

# Select features according to the LD criteria.
# .get() yields None for the domain weights when they were not computed.
features_per_lang = select_LD_features(
    ig_vals["lang"],
    ig_vals.get("domain"),
    args.feats_per_lang,
    ignore_domain=args.no_domain_ig
)
# Union of the per-language selections; set().union(*...) also copes with an
# empty mapping, where reduce() without an initial value would raise.
LDfeats = set().union(*features_per_lang.values())
print("selected %d features" % len(LDfeats))

if args.debug:
    feature_path = os.path.join(model_dir, "LDfeats")
    write_features(sorted(LDfeats), feature_path)
    print('wrote LD features to "%s"' % feature_path)

    perlang_path = feature_path + ".perlang"
    with open(perlang_path, "w") as f:
        writer = csv.writer(f)
        # One CSV row per entry, looked up by consecutive integer index
        # starting at 0 (presumably language ids -- confirm against
        # select_LD_features).
        for i in range(len(features_per_lang)):
            writer.writerow([repr(feat) for feat in features_per_lang[i]])
    # Format the complete path: '%' binds tighter than '+', so appending the
    # extension after formatting would leave it outside the closing quote.
    print('wrote LD.perlang features to "%s"' % perlang_path)
# Compute the information-gain (IG) weights for every configured entry,
# unpacked as (label, dist, suffix, binarize).
ig_vals = {}
for label, dist, suffix, binarize in ig_params:
    print("Computing information gain for {0}".format(label))
    ig = compute_IG(b_dirs, DFfeats, dist, binarize, suffix, args.jobs)
    if args.debug:
        # Debug dump: IGweights<suffix>, with '.bin' appended when binarized.
        weights_name = 'IGweights' + suffix + ('.bin' if binarize else '')
        weights_path = os.path.join(model_dir, weights_name)
        write_weights(ig, weights_path)
    # Index the rows by their first element; the second element is
    # flattened into a 1-D array.
    ig_vals[label] = dict((row[0], numpy.array(row[1].flat)) for row in ig)

# Select features according to the LD criteria.
# The call passes ignore_domain=args.no_domain_ig, so the 'domain' weights may
# legitimately be missing; .get() yields None instead of raising KeyError.
features_per_lang = select_LD_features(
    ig_vals['lang'],
    ig_vals.get('domain'),
    args.feats_per_lang,
    ignore_domain=args.no_domain_ig
)
# Union of the per-language selections; set().union(*...) also copes with an
# empty mapping, where reduce() without an initial value would raise.
LDfeats = set().union(*features_per_lang.values())
print('selected %d features' % len(LDfeats))

if args.debug:
    feature_path = os.path.join(model_dir, 'LDfeats')
    write_features(sorted(LDfeats), feature_path)
    print('wrote LD features to "%s"' % feature_path)

    perlang_path = feature_path + '.perlang'
    with open(perlang_path, 'w') as f:
        writer = csv.writer(f)
        # One CSV row per entry, looked up by consecutive integer index
        # starting at 0 (presumably language ids -- confirm against
        # select_LD_features).
        for i in range(len(features_per_lang)):
            writer.writerow([repr(feat) for feat in features_per_lang[i]])
    # Format the complete path: '%' binds tighter than '+', so appending the
    # extension after formatting would leave it outside the closing quote.
    print('wrote LD.perlang features to "%s"' % perlang_path)