예제 #1
0
    # Register the -k/--knn option: an integer stored on the parsed
    # namespace as `knn`.
    parser.add_argument("-k",
                        "--knn",
                        dest="knn",
                        type=int,
                        help="use k of the nearest neighbors (default 1)")
    # Parse the command line and copy the options used later into
    # script-level names.
    args = parser.parse_args()
    test_file = args.test
    topK = args.knn
    maxN = args.maxn
    # Both model files are required: when either one is missing (or empty),
    # print a message and leave via quit(-1).
    if not args.modelfile1 or not args.modelfile2:
        print "Specify modelfile1 and modelfile2"
        quit(-1)

    # Load the two trained models from the paths given on the command line.
    logging.info("load trained model file")
    modelfile1 = args.modelfile1
    model1 = Category2Vec.load(modelfile1)
    modelfile2 = args.modelfile2
    model2 = Category2Vec.load(modelfile2)

    # NOTE(review): init_pairnorm() is defined on Category2Vec, outside this
    # excerpt; presumably it prepares normalized pair vectors -- confirm there.
    logging.info("initializing pairnorm")
    model1.init_pairnorm()
    model2.init_pairnorm()
    # Disabled code: building a joint pair table from both models.
    #pairtable = np.empty((model1.pair_len, model1.layer1_size * 2), dtype=REAL)
    #init_joint_pairtable(model1, model2, pairtable)
    # Wrap the test file in a CatSentence object, which is iterated over by
    # prepare_sentences() below.
    test_sentences = CatSentence(test_file)
    # Starts out empty; how it is filled is not visible in this excerpt.
    confusion_mtx = {}

    def prepare_sentences():
        count = 0
        for sent_tuple in test_sentences:
            yield sent_tuple
예제 #2
0
import cPickle as pickle

logger = logging.getLogger("cat_predict_eval")

# Script entry point: configure logging, check that arguments were given, and
# start registering the command-line options.
if __name__ == "__main__":
    # Log at INFO level with timestamp, elapsed milliseconds, thread name and
    # level name in front of each message.
    logging.basicConfig(
        format=
        '%(asctime)s %(relativeCreated)d : %(threadName)s : %(levelname)s : %(message)s',
        level=logging.INFO)
    logging.info("running %s" % " ".join(sys.argv))
    # `program` is not used directly below; it is available to the usage text
    # through the locals() mapping used for interpolation.
    program = os.path.basename(sys.argv[0])
    # No command-line arguments at all: print the module docstring
    # (interpolated with locals()) and exit with status 1.
    if len(sys.argv) < 2:
        print(globals()['__doc__'] % locals())
        sys.exit(1)

    # Start from the parser supplied by Category2Vec and add the options that
    # are specific to this script.
    parser = Category2Vec.arg_parser()
    parser.add_argument("--split",
                        dest="split",
                        action="store_true",
                        help="use this option for split training data",
                        default=False)
    parser.add_argument("--modelfile",
                        dest="modelfile",
                        type=str,
                        help="trained model file")
    parser.add_argument("--test", dest="test", type=str, help="test file")
    # Defaults are registered with set_defaults() rather than default=:
    # sys.maxint for `maxn`, 1 for `knn`.
    parser.set_defaults(maxn=sys.maxint)
    parser.add_argument("--maxN", dest="maxn", type=int, help="")
    parser.set_defaults(knn=1)
    parser.add_argument("-k",
                        "--knn",
예제 #3
0
 # Resolve the data file and the model file relative to the directory that
 # contains this script. wiki_name and model_dir are defined outside this
 # excerpt.
 current_dir = os.path.dirname(os.path.realpath(__file__))
 wikip_data = current_dir+"/"+wiki_name
 c2v_model_name = current_dir+"/"+model_dir+"/"+ wiki_name + "_cat.model"
 # Create the model directory if it does not exist yet.
 if not os.path.exists(current_dir+"/"+model_dir):
     os.mkdir(current_dir+"/"+model_dir)
 # Fetch the data file from wiki_url only when it is not already on disk.
 if not os.path.isfile(wikip_data):
     logger.info("downloading Wikipedia data")
     urllib.urlretrieve(wiki_url, wikip_data)
     logger.info("downloaded in %s" % wikip_data)
 
 sentences = WikiSentence(wikip_data)
 # Build a model from the sentences and save it when no saved model file
 # exists; otherwise load the saved one.
 # NOTE(review): the meaning of iteration/model/hs/negative/size is defined
 # by Category2Vec, which is not visible here.
 if not os.path.isfile(c2v_model_name):
     model = Category2Vec(sentences,iteration=20, model="cb", hs = 1, negative = 0, size=300)
     model.save(c2v_model_name)
 else:
     model = Category2Vec.load(c2v_model_name)
 
 # Show the instructions and the prompt, then read the first query line from
 # standard input.
 print "Input a category name or an article title (type EXIT to exit)"
 sys.stdout.write("Name: ")
 line = sys.stdin.readline()
 while line:
     line = utils.to_unicode(line.rstrip())
     if line == "EXIT":
         break
     try:
         if model.cat_no_hash.has_key(line):
             cat_no = model.cat_no_hash[line]
             cat_vec = model.cats[cat_no]
             ncats = model.most_similar_category(cat_vec, 11)
             print "Similar categories            similarity"
             print "-"*45
예제 #4
0
        level=logging.INFO)
    # Resolve the data file and the model file relative to the directory that
    # contains this script. wiki_name and model_dir are defined outside this
    # excerpt.
    current_dir = os.path.dirname(os.path.realpath(__file__))
    wikip_data = current_dir + "/" + wiki_name
    c2v_model_name = current_dir + "/" + model_dir + "/" + wiki_name + "_cat.model"
    # Create the model directory if it does not exist yet.
    if not os.path.exists(current_dir + "/" + model_dir):
        os.mkdir(current_dir + "/" + model_dir)
    # Fetch the data file from wiki_url only when it is not already on disk.
    if not os.path.isfile(wikip_data):
        logger.info("downloading Wikipedia data")
        urllib.urlretrieve(wiki_url, wikip_data)
        logger.info("downloaded in %s" % wikip_data)

    sentences = WikiSentence(wikip_data)
    # Build a model from the sentences and save it when no saved model file
    # exists; otherwise load the saved one.
    # NOTE(review): the meaning of iteration/model/hs/negative/size is defined
    # by Category2Vec, which is not visible here.
    if not os.path.isfile(c2v_model_name):
        model = Category2Vec(sentences,
                             iteration=20,
                             model="cb",
                             hs=1,
                             negative=0,
                             size=300)
        model.save(c2v_model_name)
    else:
        model = Category2Vec.load(c2v_model_name)

    # Show the instructions and the prompt, then read the first query line
    # from standard input.
    print "Input a category name or an article title (type EXIT to exit)"
    sys.stdout.write("Name: ")
    line = sys.stdin.readline()
    while line:
        line = utils.to_unicode(line.rstrip())
        if line == "EXIT":
            break
        try:
            if model.cat_no_hash.has_key(line):
예제 #5
0
from Queue import Queue
import numpy as np
import time,re
import cPickle as pickle

logger = logging.getLogger("cat_predict_eval")

if __name__ == "__main__":
    # Script entry point: configure logging, parse the command line, and load
    # the trained model when --modelfile is supplied.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(relativeCreated)d : %(threadName)s : %(levelname)s : %(message)s',
    )
    logging.info("running %s" % " ".join(sys.argv))

    # Bound before the usage text is rendered so that it is part of the
    # locals() mapping used for interpolation.
    program = os.path.basename(sys.argv[0])
    if not sys.argv[1:]:
        # Invoked without any argument: print the module docstring and stop
        # with exit status 1.
        print(__doc__ % locals())
        sys.exit(1)

    # Extend the parser provided by Category2Vec with the options specific to
    # this script.
    parser = Category2Vec.arg_parser()
    parser.add_argument(
        "--split",
        action="store_true",
        default=False,
        dest="split",
        help="use this option for split training data",
    )
    parser.add_argument(
        "--modelfile",
        type=str,
        dest="modelfile",
        help="trained model file",
    )
    parser.add_argument("--test", type=str, dest="test", help="test file")

    # Defaults go through set_defaults() ahead of the matching option:
    # sys.maxint for --maxN and 1 for -k/--knn.
    parser.set_defaults(maxn=sys.maxint)
    parser.add_argument("--maxN", type=int, dest="maxn", help="")
    parser.set_defaults(knn=1)
    parser.add_argument(
        "-k",
        "--knn",
        type=int,
        dest="knn",
        help="use k of the nearest neighbors (default 1)",
    )

    # Copy the parsed options into script-level names.
    args = parser.parse_args()
    maxN = args.maxn
    topK = args.knn
    test_file = args.test

    # A model is loaded only when a model file was given on the command line.
    if args.modelfile:
        modelfile = args.modelfile
        logging.info("load trained model file")
        model = Category2Vec.load(modelfile)