# --- Train a Category2Vec model from the input corpus and prepare evaluation ---
# NOTE(review): this chunk's original newlines were lost; line structure restored.
p_dir = re.compile("^.*/")  # matches the directory prefix of a path
basename = p_dir.sub("", input_file)  # strip directory -> bare file name
if args.outdir:
    outdir = args.outdir
else:
    # Default the output directory to the input file's directory ("" if none).
    m = p_dir.search(input_file)
    outdir = m.group(0) if m else ""
if args.split:
    # In split mode the training corpus comes from --train, not input_file.
    input_file = args.train
logging.info("train from input file")
model = Category2Vec(
    CatSentence(input_file, cont_col=3, split=args.split),
    iteration=args.iteration,
    model=args.model,
    hs=args.hs,
    negative=args.neg,
    workers=args.thread,
    alpha=args.alpha,
    size=args.dim,
    update_mode=args.update,
    normalize_each_epoch=args.norm,
)
# Model file name encodes the output dir, corpus name and model settings.
modelfile = "%s%s_%s.model" % (outdir, basename, model.identifier())
model.save(modelfile)
logging.info("initializing pairnorm")
model.init_pairnorm()
# Evaluation corpus; confusion_mtx collects per-category results.
# NOTE(review): test_file is presumably defined earlier in the file — confirm.
test_sentences = CatSentence(test_file)
confusion_mtx = {}


def prepare_sentences():
    count = 0
level=logging.INFO) current_dir = os.path.dirname(os.path.realpath(__file__)) wikip_data = current_dir + "/" + wiki_name c2v_model_name = current_dir + "/" + model_dir + "/" + wiki_name + "_cat.model" if not os.path.exists(current_dir + "/" + model_dir): os.mkdir(current_dir + "/" + model_dir) if not os.path.isfile(wikip_data): logger.info("downloading Wikipedia data") urllib.urlretrieve(wiki_url, wikip_data) logger.info("downloaded in %s" % wikip_data) sentences = WikiSentence(wikip_data) if not os.path.isfile(c2v_model_name): model = Category2Vec(sentences, iteration=20, model="cb", hs=1, negative=0, size=300) model.save(c2v_model_name) else: model = Category2Vec.load(c2v_model_name) print "Input a category name or an article title (type EXIT to exit)" sys.stdout.write("Name: ") line = sys.stdin.readline() while line: line = utils.to_unicode(line.rstrip()) if line == "EXIT": break try: if model.cat_no_hash.has_key(line):