bucketlist = map(str.strip, open(bucketlist_path)) features = read_features(feature_path) if args.domain: index_path = os.path.join(args.model, "domain_index") suffix = ".domain" elif args.lang: index_path = os.path.join(args.model, "lang_index") suffix = ".lang" else: raise ValueError("no event specified") if args.weights: weights_path = args.weights else: weights_path = os.path.join(args.model, "IGweights" + suffix + (".bin" if args.binarize else "")) # display paths print "model path:", args.model print "buckets path:", bucketlist_path print "features path:", feature_path print "weights path:", weights_path print "index path:", index_path print "suffix:", suffix print "computing information gain" dist = read_dist(index_path) ig = compute_IG(bucketlist, features, dist, args.binarize, suffix, args.jobs) write_weights(ig, weights_path)
print "features output path:", feature_path if args.tokens_per_order: print "max ngram order:", args.max_order print "tokens per order:", args.tokens_per_order else: print "tokens:", args.tokens with open(bucketlist_path) as f: bucketlist = map(str.strip, f) doc_count = tally(bucketlist, args.jobs) print "unique features:", len(doc_count) if args.doc_count: # The constant true is used to indicate output to default location doc_count_path = os.path.join( args.model, 'DF_all') if args.doc_count == True else args.doc_count write_weights(doc_count, doc_count_path) print "wrote DF counts for all features to:", doc_count_path if args.tokens_per_order: # Choose a number of features for each length of token feats = ngram_select(doc_count, args.max_order, args.tokens_per_order) else: # Choose a number of features overall feats = sorted( sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens]) print "selected features: ", len(feats) write_features(feats, feature_path) print 'wrote features to "%s"' % feature_path
# Compute P(t|C) print "learning P(t|C)" paths = zip(*items)[2] nb_ptc = learn_ptc(paths, tk_nextmove, tk_output, cm, buckets_dir, args) nb_ptc = np.array(nb_ptc).reshape(len(feats), len(langs)) # Normalize to 1 on the term axis print "renormalizing P(t|C)" for i in range(nb_ptc.shape[1]): # had to de-vectorize this due to memory consumption newval = np.empty_like(nb_ptc[:,i]) for j in range(newval.shape[0]): newval[j] = (1/np.exp(nb_ptc[:,i] - nb_ptc[j,i]).sum()) nb_ptc[:,i] = newval assert (1.0 - newval.sum()) < 0.0001 print "doing per-pair output" for lang1, lang2 in pairs: # Where to do output if args.no_norm: weights_path = os.path.join(out_dir, ('BLfeats.no_norm.{0}.{1}'.format(lang1, lang2))) else: weights_path = os.path.join(out_dir, ('BLfeats.{0}.{1}'.format(lang1, lang2))) i1 = indexer.lang_index[lang1] i2 = indexer.lang_index[lang2] w = dict(zip(feats, np.abs((nb_ptc[:,i1] - nb_ptc[:,i2]) / (nb_ptc.sum(1) if not args.no_norm else 1)))) write_weights(w, weights_path) print "wrote weights to {0}".format(weights_path)
print "features output path:", feature_path if args.tokens_per_order: print "max ngram order:", args.max_order print "tokens per order:", args.tokens_per_order else: print "tokens:", args.tokens with open(bucketlist_path) as f: bucketlist = map(str.strip, f) doc_count = tally(bucketlist, args.jobs) print "unique features:", len(doc_count) if args.doc_count: # The constant true is used to indicate output to default location doc_count_path = os.path.join(args.model, 'DF_all') if args.doc_count == True else args.doc_count write_weights(doc_count, doc_count_path) print "wrote DF counts for all features to:", doc_count_path if args.tokens_per_order: # Choose a number of features for each length of token feats = ngram_select(doc_count, args.max_order, args.tokens_per_order) else: # Choose a number of features overall feats = sorted( sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens] ) print "selected features: ", len(feats) write_features(feats, feature_path) print 'wrote features to "%s"' % feature_path
else: raise ValueError("no event specified") if args.weights: weights_path = args.weights else: weights_path = os.path.join( args.model, 'IGweights' + suffix + ('.bin' if args.binarize else '')) # display paths print "model path:", args.model print "buckets path:", bucketlist_paths print "features path:", feature_path print "weights path:", weights_path print "index path:", index_path print "suffix:", suffix print "computing information gain" # Compile buckets together bucketlist = zip(*(map(str.strip, open(p)) for p in bucketlist_paths)) # Check that each bucketlist has the same number of buckets assert len(set(map(len, bucketlist))) == 1, "incompatible bucketlists!" dist = read_dist(index_path) ig = compute_IG(bucketlist, features, dist, args.binarize, suffix, args.jobs) write_weights(ig, weights_path)