# --- Command-line options (continues the argparse parser built above) ---
parser.add_argument("--maxN", dest="maxn", type=int, help="")
parser.set_defaults(thread=cpu_count())
parser.add_argument("-t", "--thread", dest="thread", type=int,
                    help="the number of threads")
parser.set_defaults(knn=1)
parser.add_argument("-k", "--knn", dest="knn", type=int,
                    help="use k of the nearest neighbors (default 1)")

args = parser.parse_args()
test_file = args.test
topK = args.knn
maxN = args.maxn

# Both trained model files are mandatory for the evaluation below.
if not args.modelfile1 or not args.modelfile2:
    # print(...) with one argument behaves identically on Python 2 and 3.
    print("Specify modelfile1 and modelfile2")
    sys.exit(-1)  # was quit(-1); quit() comes from site.py and may be absent

logging.info("load trained model file")
modelfile1 = args.modelfile1
model1 = Sentence2Vec.load(modelfile1)
modelfile2 = args.modelfile2
model2 = Sentence2Vec.load(modelfile2)

# sentence id -> category id mapping read from the training corpus.
sent_cat = readSentence(CatSentence(args.train, cont_col=3, split=args.split))
test_sentences = CatSentence(test_file)
confusion_mtx = {}


def prepare_sentences():
    """Yield tuples from test_sentences, stopping once maxN is exceeded.

    NOTE(review): the counter is checked *after* the yield, so maxN + 1
    items are produced in total -- confirm the off-by-one is intentional.
    """
    count = 0
    for sent_tuple in test_sentences:
        yield sent_tuple
        count += 1
        if count > maxN:
            break
# Interactive demo main block (whitespace-mangled Python 2 fragment; original
# line breaks and indentation were lost, and the chunk is TRUNCATED mid-`try`
# inside the input loop -- do not restructure without the missing tail).
# Flow visible here: configure logging; download the Wikipedia dump via
# urllib.urlretrieve if `wikip_data` is missing; train a Sentence2Vec model
# (cbow, hierarchical softmax, size=300, 10 iterations) if no saved model
# exists, otherwise load it; then read article titles from stdin until the
# literal string "EXIT" and look each one up in `model.sent_no_hash`.
# NOTE(review): Python-2-only constructs -- `print` statement,
# `dict.has_key` (removed in Py3; use `line in model.sent_no_hash`), and
# `urllib.urlretrieve` (moved to urllib.request in Py3) -- flag for porting.
# NOTE(review): `wiki_name`, `model_dir`, `wiki_url`, `logger`, `utils` and
# `WikiSentence` are defined elsewhere in the file -- not visible here.
if __name__ == "__main__": logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) current_dir = os.path.dirname(os.path.realpath(__file__)) wikip_data = current_dir+"/"+wiki_name s2v_model_name = current_dir+"/"+model_dir+"/"+ wiki_name + "_sent.model" if not os.path.exists(current_dir+"/"+model_dir): os.mkdir(current_dir+"/"+model_dir) if not os.path.isfile(wikip_data): logger.info("downloading Wikipedia data") urllib.urlretrieve(wiki_url, wikip_data) logger.info("downloaded in %s" % wikip_data) sentences = WikiSentence(wikip_data) if not os.path.isfile(s2v_model_name): model = Sentence2Vec(sentences,iteration=10, model="cb", hs = 1, negative = 0, size=300, update_mode = 0) model.save(s2v_model_name) else: model = Sentence2Vec.load(s2v_model_name) print "Input an article title (type EXIT to exit)" sys.stdout.write("Name: ") line = sys.stdin.readline() while line: line = utils.to_unicode(line.rstrip()) if line == "EXIT": break try: if model.sent_no_hash.has_key(line): sent_no = model.sent_no_hash[line] sent_vec = model.sents[sent_no]
# Whitespace-mangled Python 2 fragment, TRUNCATED AT BOTH ENDS: it opens with
# the tail of a function that returns the `sent_cat` dict (its `def` header is
# outside this view) and ends mid-call inside `parser.add_argument("-k", ...`.
# Visible flow of the __main__ part: configure logging, echo the command line,
# print the module docstring and exit when invoked with no arguments, then
# extend the parser returned by Sentence2Vec.arg_parser() with --split,
# --modelfile, --test and --maxN options.
# NOTE(review): `sys.maxint` is Python-2-only (use sys.maxsize when porting).
sent_cat[sent_id] = cat_id return sent_cat if __name__ == "__main__": logging.basicConfig( format= '%(asctime)s %(relativeCreated)d : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info("running %s" % " ".join(sys.argv)) program = os.path.basename(sys.argv[0]) if len(sys.argv) < 2: print(globals()['__doc__'] % locals()) sys.exit(1) parser = Sentence2Vec.arg_parser() parser.add_argument("--split", dest="split", action="store_true", help="use this option for split training data", default=False) parser.add_argument("--modelfile", dest="modelfile", type=str, help="trained model file") parser.add_argument("--test", dest="test", type=str, help="test file") parser.set_defaults(maxn=sys.maxint) parser.add_argument("--maxN", dest="maxn", type=int, help="") parser.set_defaults(knn=1) parser.add_argument("-k", "--knn",
# Whitespace-mangled Python 2 fragment. It opens MID-FUNCTION (the `def`
# header is outside this view): the visible loop builds a dict mapping
# tpl[1] -> tpl[2] -- presumably sentence id -> category id, matching the
# `sent_cat` name -- and returns it. Then a __main__ block: configure
# logging, print usage and exit when called with no arguments, build the
# argparse parser from Sentence2Vec.arg_parser() with --split/--modelfile/
# --test/--maxN/-k options, parse, and abort when --train was not given.
# NOTE(review): Python-2-only constructs -- `sys.maxint` (use sys.maxsize)
# and the `print` statement. Bare `quit()` relies on site.py; prefer
# sys.exit() when porting.
sent_cat = {} for tpl in sent: sent_id = tpl[1] cat_id = tpl[2] sent_cat[sent_id] = cat_id return sent_cat if __name__ == "__main__": logging.basicConfig(format='%(asctime)s %(relativeCreated)d : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info("running %s" % " ".join(sys.argv)) program = os.path.basename(sys.argv[0]) if len(sys.argv) < 2: print(globals()['__doc__'] % locals()) sys.exit(1) parser = Sentence2Vec.arg_parser() parser.add_argument("--split", dest="split", action="store_true", help="use this option for split training data", default=False) parser.add_argument("--modelfile", dest="modelfile", type=str, help="trained model file") parser.add_argument("--test", dest="test", type=str, help="test file") parser.set_defaults(maxn=sys.maxint) parser.add_argument("--maxN", dest="maxn", type=int, help="") parser.set_defaults(knn=1) parser.add_argument("-k","--knn", dest="knn", type=int, help="use k of the nearest neighbors (default 1)") args = parser.parse_args() test_file = args.test topK = args.knn maxN = args.maxn if not args.train: print "ERROR: specify training set" quit()
# --- Final CLI option, then model loading for the evaluation run ---
parser.add_argument("-k", "--knn", dest="knn", type=int,
                    help="use k of the nearest neighbors (default 1)")

args = parser.parse_args()
test_file = args.test
topK = args.knn
maxN = args.maxn

# Both trained model files are mandatory for the evaluation below.
if not args.modelfile1 or not args.modelfile2:
    # print(...) with one argument behaves identically on Python 2 and 3.
    print("Specify modelfile1 and modelfile2")
    sys.exit(-1)  # was quit(-1); quit() comes from site.py and may be absent

logging.info("load trained model file")
modelfile1 = args.modelfile1
model1 = Sentence2Vec.load(modelfile1)
modelfile2 = args.modelfile2
model2 = Sentence2Vec.load(modelfile2)

# sentence id -> category id mapping read from the training corpus.
sent_cat = readSentence(
    CatSentence(args.train, cont_col=3, split=args.split))
test_sentences = CatSentence(test_file)
confusion_mtx = {}


def prepare_sentences():
    """Yield tuples from test_sentences, stopping once maxN is exceeded.

    NOTE(review): the counter is checked *after* the yield, so maxN + 1
    items are produced in total -- confirm the off-by-one is intentional.
    """
    count = 0
    for sent_tuple in test_sentences:
        yield sent_tuple
        count += 1
        if count > maxN:
            break