コード例 #1
0
    # -- CLI options (the argparse parser is created above, outside this excerpt) --
    # NOTE(review): --maxN has an empty help string and no default, so
    # args.maxn is None when the flag is omitted -- see prepare_sentences() below.
    parser.add_argument("--maxN", dest="maxn", type=int, help="")
    parser.set_defaults(thread=cpu_count())  # default worker count = CPU cores
    parser.add_argument("-t", "--thread", dest="thread", type=int, help="the number of threads")
    parser.set_defaults(knn=1)
    parser.add_argument("-k", "--knn", dest="knn", type=int, help="use k of the nearest neighbors (default 1)")
    args = parser.parse_args()
    test_file = args.test
    topK = args.knn
    maxN = args.maxn
    # Both model files are mandatory; abort with a non-zero status otherwise.
    # (Python 2 print statement; quit(-1) terminates the interpreter.)
    if not args.modelfile1 or not args.modelfile2:
        print "Specify modelfile1 and modelfile2"
        quit(-1)

    logging.info("load trained model file")
    # Load the two pre-trained models used further down (past this excerpt).
    modelfile1 = args.modelfile1
    model1 = Sentence2Vec.load(modelfile1)
    modelfile2 = args.modelfile2
    model2 = Sentence2Vec.load(modelfile2)

    # Labelled training sentences; CatSentence/readSentence are project
    # helpers whose definitions are not visible in this excerpt.
    sent_cat = readSentence(CatSentence(args.train, cont_col=3, split=args.split))
    test_sentences = CatSentence(test_file)
    confusion_mtx = {}

    def prepare_sentences():
        # Generator: re-yield test sentence tuples, stopping after maxN items.
        # NOTE(review): when --maxN is omitted, maxN is None and in Python 2
        # "count > None" is always True, so only the FIRST sentence is
        # yielded -- confirm whether an unset maxN should mean "no limit".
        count = 0
        for sent_tuple in test_sentences:
            yield sent_tuple
            count += 1
            if count > maxN:
                break
コード例 #2
0
 # Resolve all paths relative to the directory containing this script.
 current_dir = os.path.dirname(os.path.realpath(__file__))
 wikip_data = current_dir+"/"+wiki_name
 s2v_model_name = current_dir+"/"+model_dir+"/"+ wiki_name + "_sent.model"
 if not os.path.exists(current_dir+"/"+model_dir):
     os.mkdir(current_dir+"/"+model_dir)
 # Download the Wikipedia dump once if it is not already on disk.
 # (urllib.urlretrieve is the Python 2 API; it moved to urllib.request in Py3.)
 if not os.path.isfile(wikip_data):
     logger.info("downloading Wikipedia data")
     urllib.urlretrieve(wiki_url, wikip_data)
     logger.info("downloaded in %s" % wikip_data)
 
 # Train a sentence-vector model on the dump, or reuse a previously saved one.
 # WikiSentence/Sentence2Vec are project classes defined outside this excerpt.
 sentences = WikiSentence(wikip_data)
 if not os.path.isfile(s2v_model_name):
     model = Sentence2Vec(sentences,iteration=10, model="cb", hs = 1, negative = 0, size=300, update_mode = 0)
     model.save(s2v_model_name)
 else:
     model = Sentence2Vec.load(s2v_model_name)
 
 # Interactive loop: read article titles from stdin and print similar articles.
 # (Python 2 print statements; loop body continues past this excerpt.)
 print "Input an article title (type EXIT to exit)"
 sys.stdout.write("Name: ")
 line = sys.stdin.readline()
 while line:
     line = utils.to_unicode(line.rstrip())
     if line == "EXIT":
         break
     try:
         # dict.has_key is Python 2-only (removed in Python 3).
         if model.sent_no_hash.has_key(line):
             sent_no = model.sent_no_hash[line]
             sent_vec = model.sents[sent_no]
             # Asks for 11 neighbours -- presumably because the query article
             # itself is included in the results; TODO confirm.
             nsents = model.most_similar_sentence(sent_vec, 11)
             print "Similar articles              similarity"
             print "-"*45
コード例 #3
0
                        dest="knn",
                        type=int,
                        help="use k of the nearest neighbors (default 1)")
    # (The continuation lines above close a parser.add_argument("-k", ...)
    # call whose opening line is outside this excerpt.)
    args = parser.parse_args()
    test_file = args.test
    topK = args.knn
    maxN = args.maxn
    # A training set is mandatory (Python 2 print statement; quit() exits).
    if not args.train:
        print "ERROR: specify training set"
        quit()

    input_file = args.train[0]
    if args.modelfile:
        # Reuse a previously trained model instead of retraining.
        logging.info("load trained model file")
        modelfile = args.modelfile
        model = Sentence2Vec.load(modelfile)
    else:
        # Derive output naming from the input path: basename without the
        # directory part, and the directory itself as the default outdir.
        p_dir = re.compile("^.*/")
        basename = p_dir.sub("", input_file)
        if args.outdir:
            outdir = args.outdir
        else:
            m = p_dir.search(input_file)
            outdir = m.group(0) if m else ""
        # With --split, hand the whole args.train list to CatSentence.
        if args.split:
            input_file = args.train
        logging.info("train from input file")
        # Train a new model; this call continues past the end of the excerpt.
        model = Sentence2Vec(CatSentence(input_file,
                                         cont_col=3,
                                         split=args.split),
                             iteration=args.iteration,
コード例 #4
0
    # -- CLI options (the argparse parser is created above, outside this excerpt) --
    parser.add_argument("--maxN", dest="maxn", type=int, help="")
    parser.set_defaults(knn=1)  # default: use the single nearest neighbour
    parser.add_argument("-k","--knn", dest="knn", type=int, help="use k of the nearest neighbors (default 1)")
    args = parser.parse_args()
    test_file = args.test
    topK = args.knn
    maxN = args.maxn
    # A training set is mandatory (Python 2 print statement; quit() exits).
    if not args.train:
        print "ERROR: specify training set"
        quit()

    input_file = args.train[0]
    if args.modelfile:
        # Reuse a previously trained model instead of retraining.
        logging.info("load trained model file")
        modelfile = args.modelfile
        model = Sentence2Vec.load(modelfile)
    else:
        # Derive output naming from the input path: basename without the
        # directory part, and the directory itself as the default outdir.
        p_dir = re.compile("^.*/")
        basename = p_dir.sub("",input_file)
        if args.outdir:
            outdir = args.outdir
        else:
            m = p_dir.search(input_file)
            outdir = m.group(0) if m else ""
        # With --split, hand the whole args.train list to CatSentence.
        if args.split:
            input_file = args.train
        logging.info("train from input file")
        # Train with hyperparameters taken straight from the CLI; Sentence2Vec
        # and CatSentence are project classes defined outside this excerpt.
        model = Sentence2Vec(CatSentence(input_file, cont_col=3, split=args.split), iteration=args.iteration, model=args.model, hs = args.hs, negative = args.neg, workers = args.thread, alpha=args.alpha, size=args.dim, update_mode = args.update)
        # Persist the model as "<outdir><basename>_<identifier>.model".
        modelfile = "%s%s_%s.model" % (outdir, basename, model.identifier())
        model.save(modelfile)
コード例 #5
0
    parser.add_argument("-k",
                        "--knn",
                        dest="knn",
                        type=int,
                        help="use k of the nearest neighbors (default 1)")
    args = parser.parse_args()
    test_file = args.test
    topK = args.knn
    maxN = args.maxn  # None when --maxN was not supplied on the command line
    # Both model files are mandatory; abort with a non-zero status otherwise.
    # (Python 2 print statement; quit(-1) terminates the interpreter.)
    if not args.modelfile1 or not args.modelfile2:
        print "Specify modelfile1 and modelfile2"
        quit(-1)

    logging.info("load trained model file")
    # Load the two pre-trained models used further down (past this excerpt).
    modelfile1 = args.modelfile1
    model1 = Sentence2Vec.load(modelfile1)
    modelfile2 = args.modelfile2
    model2 = Sentence2Vec.load(modelfile2)

    # Labelled training sentences; CatSentence/readSentence are project
    # helpers whose definitions are not visible in this excerpt.
    sent_cat = readSentence(
        CatSentence(args.train, cont_col=3, split=args.split))
    test_sentences = CatSentence(test_file)
    confusion_mtx = {}

    def prepare_sentences():
        # Generator: re-yield test sentence tuples, stopping after maxN items.
        # NOTE(review): when --maxN is omitted, maxN is None and in Python 2
        # "count > None" is always True, so only the FIRST sentence is
        # yielded -- confirm whether an unset maxN should mean "no limit".
        count = 0
        for sent_tuple in test_sentences:
            yield sent_tuple
            count += 1
            if count > maxN: break