# --- CLI options: evaluation limit, thread count, and kNN parameter ---
parser.add_argument("--maxN", dest="maxn", type=int, help="")
parser.set_defaults(thread=cpu_count())
parser.add_argument("-t", "--thread", dest="thread", type=int,
                    help="the number of threads")
parser.set_defaults(knn=1)
parser.add_argument("-k", "--knn", dest="knn", type=int,
                    help="use k of the nearest neighbors (default 1)")
args = parser.parse_args()

test_file = args.test
topK = args.knn
maxN = args.maxn

# Both trained models are required for the comparison below.
if not args.modelfile1 or not args.modelfile2:
    print("Specify modelfile1 and modelfile2")
    import sys
    sys.exit(-1)  # was quit(-1): quit() is a site-module helper meant for the interactive shell

logging.info("load trained model file")
modelfile1 = args.modelfile1
model1 = Sentence2Vec.load(modelfile1)
modelfile2 = args.modelfile2
model2 = Sentence2Vec.load(modelfile2)

# Category labels come from the training set; the test set is scored against them.
sent_cat = readSentence(CatSentence(args.train, cont_col=3, split=args.split))
test_sentences = CatSentence(test_file)
confusion_mtx = {}

def prepare_sentences():
    """Yield test-sentence tuples, stopping once the maxN limit is exceeded.

    NOTE(review): the counter is checked after the yield, so maxN + 1
    sentences are produced — confirm whether this off-by-one is intended.
    """
    count = 0
    for sent_tuple in test_sentences:
        yield sent_tuple
        count += 1
        if count > maxN:
            break
# Resolve data and model paths relative to this script's own directory.
current_dir = os.path.dirname(os.path.realpath(__file__))
wikip_data = current_dir+"/"+wiki_name
s2v_model_name = current_dir+"/"+model_dir+"/"+ wiki_name + "_sent.model"

# Make sure the model directory exists before anything is written into it.
if not os.path.exists(current_dir+"/"+model_dir):
    os.mkdir(current_dir+"/"+model_dir)

# Download the Wikipedia dump only on first run.
# NOTE(review): urllib.urlretrieve is Python 2 only
# (urllib.request.urlretrieve in Python 3) — confirm the target interpreter.
if not os.path.isfile(wikip_data):
    logger.info("downloading Wikipedia data")
    urllib.urlretrieve(wiki_url, wikip_data)
    logger.info("downloaded in %s" % wikip_data)

sentences = WikiSentence(wikip_data)

# Train a sentence2vec model once and cache it on disk; later runs reload it.
if not os.path.isfile(s2v_model_name):
    model = Sentence2Vec(sentences,iteration=10, model="cb", hs = 1, negative = 0, size=300, update_mode = 0)
    model.save(s2v_model_name)
else:
    model = Sentence2Vec.load(s2v_model_name)

# Interactive loop: read an article title and show its nearest neighbors.
print "Input an article title (type EXIT to exit)"
sys.stdout.write("Name: ")
line = sys.stdin.readline()
while line:
    line = utils.to_unicode(line.rstrip())
    if line == "EXIT":
        break
    try:
        # sent_no_hash maps article title -> sentence index.
        # NOTE(review): dict.has_key is Python 2 only; `line in ...` works in both.
        if model.sent_no_hash.has_key(line):
            sent_no = model.sent_no_hash[line]
            sent_vec = model.sents[sent_no]
            # 11 neighbors: presumably the article itself plus its 10 nearest — TODO confirm
            nsents = model.most_similar_sentence(sent_vec, 11)
            print "Similar articles similarity"
            print "-"*45
# (continuation of a parser.add_argument("-k", "--knn", ...) call opened above)
dest="knn", type=int, help="use k of the nearest neighbors (default 1)")
args = parser.parse_args()

test_file = args.test
topK = args.knn
maxN = args.maxn

# A training set is mandatory; abort otherwise.
if not args.train:
    print "ERROR: specify training set"
    quit()
input_file = args.train[0]

# Either reuse a previously trained model, or train a new one below.
if args.modelfile:
    logging.info("load trained model file")
    modelfile = args.modelfile
    model = Sentence2Vec.load(modelfile)
else:
    # Derive the output directory and base file name from the input path.
    p_dir = re.compile("^.*/")
    basename = p_dir.sub("", input_file)
    if args.outdir:
        outdir = args.outdir
    else:
        m = p_dir.search(input_file)
        outdir = m.group(0) if m else ""
    # With --split, CatSentence receives the full argument list instead of one file.
    if args.split:
        input_file = args.train
    logging.info("train from input file")
    model = Sentence2Vec(CatSentence(input_file, cont_col=3, split=args.split),
                         iteration=args.iteration,
# --- CLI options: evaluation limit and kNN parameter ---
parser.add_argument("--maxN", dest="maxn", type=int, help="")
parser.set_defaults(knn=1)
parser.add_argument("-k", "--knn", dest="knn", type=int,
                    help="use k of the nearest neighbors (default 1)")
args = parser.parse_args()

test_file = args.test
topK = args.knn
maxN = args.maxn

# A training set is mandatory; exit with a nonzero status on this error path.
if not args.train:
    print("ERROR: specify training set")
    import sys
    sys.exit(1)  # was quit(): quit() exits with status 0, masking the error from callers
input_file = args.train[0]

if args.modelfile:
    # Reuse a previously trained model.
    logging.info("load trained model file")
    modelfile = args.modelfile
    model = Sentence2Vec.load(modelfile)
else:
    # Derive the output directory and base file name from the input path.
    p_dir = re.compile("^.*/")
    basename = p_dir.sub("", input_file)
    if args.outdir:
        outdir = args.outdir
    else:
        m = p_dir.search(input_file)
        outdir = m.group(0) if m else ""
    # With --split, CatSentence receives the full argument list instead of one file.
    if args.split:
        input_file = args.train
    logging.info("train from input file")
    model = Sentence2Vec(CatSentence(input_file, cont_col=3, split=args.split),
                         iteration=args.iteration, model=args.model,
                         hs=args.hs, negative=args.neg, workers=args.thread,
                         alpha=args.alpha, size=args.dim,
                         update_mode=args.update)
    # Persist under a name that encodes where it came from and how it was trained.
    modelfile = "%s%s_%s.model" % (outdir, basename, model.identifier())
    model.save(modelfile)
# --- CLI option: kNN parameter ---
parser.add_argument("-k", "--knn", dest="knn", type=int,
                    help="use k of the nearest neighbors (default 1)")
args = parser.parse_args()

test_file = args.test
topK = args.knn
maxN = args.maxn

# Both trained models are required for the comparison below.
if not args.modelfile1 or not args.modelfile2:
    print("Specify modelfile1 and modelfile2")
    import sys
    sys.exit(-1)  # was quit(-1): quit() is a site-module helper meant for the interactive shell

logging.info("load trained model file")
modelfile1 = args.modelfile1
model1 = Sentence2Vec.load(modelfile1)
modelfile2 = args.modelfile2
model2 = Sentence2Vec.load(modelfile2)

# Category labels come from the training set; the test set is scored against them.
sent_cat = readSentence( CatSentence(args.train, cont_col=3, split=args.split))
test_sentences = CatSentence(test_file)
confusion_mtx = {}

def prepare_sentences():
    """Yield test-sentence tuples, stopping once the maxN limit is exceeded.

    NOTE(review): the counter is checked after the yield, so maxN + 1
    sentences are produced — confirm whether this off-by-one is intended.
    """
    count = 0
    for sent_tuple in test_sentences:
        yield sent_tuple
        count += 1
        if count > maxN:
            break