parser.set_defaults(maxn=sys.maxint) parser.add_argument("--maxN", dest="maxn", type=int, help="") parser.set_defaults(knn=1) parser.add_argument("-k", "--knn", dest="knn", type=int, help="use k of the nearest neighbors (default 1)") args = parser.parse_args() test_file = args.test topK = args.knn maxN = args.maxn if args.modelfile: logging.info("load trained model file") modelfile = args.modelfile model = Category2Vec.load(modelfile) else: input_file = args.train[0] p_dir = re.compile("^.*/") basename = p_dir.sub("", input_file) if args.outdir: outdir = args.outdir else: m = p_dir.search(input_file) outdir = m.group(0) if m else "" if args.split: input_file = args.train logging.info("train from input file") model = Category2Vec(CatSentence(input_file, cont_col=3, split=args.split),
parser.add_argument("-k", "--knn", dest="knn", type=int, help="use k of the nearest neighbors (default 1)") args = parser.parse_args() test_file = args.test topK = args.knn maxN = args.maxn if not args.modelfile1 or not args.modelfile2: print "Specify modelfile1 and modelfile2" quit(-1) logging.info("load trained model file") modelfile1 = args.modelfile1 model1 = Category2Vec.load(modelfile1) modelfile2 = args.modelfile2 model2 = Category2Vec.load(modelfile2) logging.info("initializing pairnorm") model1.init_pairnorm() model2.init_pairnorm() #pairtable = np.empty((model1.pair_len, model1.layer1_size * 2), dtype=REAL) #init_joint_pairtable(model1, model2, pairtable) test_sentences = CatSentence(test_file) confusion_mtx = {} def prepare_sentences(): count = 0 for sent_tuple in test_sentences: yield sent_tuple
if not os.path.isfile(wikip_data): logger.info("downloading Wikipedia data") urllib.urlretrieve(wiki_url, wikip_data) logger.info("downloaded in %s" % wikip_data) sentences = WikiSentence(wikip_data) if not os.path.isfile(c2v_model_name): model = Category2Vec(sentences, iteration=20, model="cb", hs=1, negative=0, size=300) model.save(c2v_model_name) else: model = Category2Vec.load(c2v_model_name) print "Input a category name or an article title (type EXIT to exit)" sys.stdout.write("Name: ") line = sys.stdin.readline() while line: line = utils.to_unicode(line.rstrip()) if line == "EXIT": break try: if model.cat_no_hash.has_key(line): cat_no = model.cat_no_hash[line] cat_vec = model.cats[cat_no] ncats = model.most_similar_category(cat_vec, 11) print "Similar categories similarity" print "-" * 45