Ejemplo n.º 1
0
 # --maxN: integer option stored as args.maxn; falls back to sys.maxint when
 # the flag is omitted. NOTE: sys.maxint exists only on Python 2.
 # NOTE(review): the help text is empty, so the option's purpose is not
 # stated here -- presumably an upper bound on processed items; confirm.
 parser.set_defaults(maxn=sys.maxint)
 parser.add_argument("--maxN", dest="maxn", type=int, help="")
 # -k/--knn: integer option stored as args.knn; falls back to 1.
 parser.set_defaults(knn=1)
 parser.add_argument("-k",
                     "--knn",
                     dest="knn",
                     type=int,
                     help="use k of the nearest neighbors (default 1)")
 args = parser.parse_args()
 # Copy the parsed values into local names used by the rest of the routine.
 test_file = args.test
 topK = args.knn
 maxN = args.maxn
 # Obtain the model: load a saved one when a model file was given, otherwise
 # prepare to train from the training input.
 if args.modelfile:
     logging.info("load trained model file")
     modelfile = args.modelfile
     model = Category2Vec.load(modelfile)
 else:
     # Start from the first training path; basename and the default outdir
     # are both derived from it.
     input_file = args.train[0]
     # Greedy match of everything up to and including the last "/".
     p_dir = re.compile("^.*/")
     # File name with any leading directory part removed.
     basename = p_dir.sub("", input_file)
     if args.outdir:
         outdir = args.outdir
     else:
         # Fall back to the input file's directory prefix (trailing "/"
         # included), or "" when the path contains no "/".
         m = p_dir.search(input_file)
         outdir = m.group(0) if m else ""
     if args.split:
         # When args.split is truthy, pass the whole args.train value on
         # instead of only its first element.
         input_file = args.train
     logging.info("train from input file")
     model = Category2Vec(CatSentence(input_file,
                                      cont_col=3,
                                      split=args.split),
Ejemplo n.º 2
0
    # -k/--knn: integer option stored as args.knn. The help text states a
    # default of 1; the default itself is not set in the visible lines.
    parser.add_argument("-k",
                        "--knn",
                        dest="knn",
                        type=int,
                        help="use k of the nearest neighbors (default 1)")
    args = parser.parse_args()
    # Copy the parsed values into local names used by the rest of the routine.
    test_file = args.test
    topK = args.knn
    maxN = args.maxn
    # Both model files are required; stop early with a non-zero exit status
    # when either one is missing.
    if not (args.modelfile1 and args.modelfile2):
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("Specify modelfile1 and modelfile2")
        # Raise SystemExit directly: quit() is a helper injected by the site
        # module for interactive sessions and is not guaranteed to exist
        # (for example under "python -S").
        raise SystemExit(-1)

    # Load the two trained models from the paths given on the command line.
    logging.info("load trained model file")
    modelfile1 = args.modelfile1
    model1 = Category2Vec.load(modelfile1)
    modelfile2 = args.modelfile2
    model2 = Category2Vec.load(modelfile2)

    # Run each model's pairnorm initialisation before it is used below.
    logging.info("initializing pairnorm")
    model1.init_pairnorm()
    model2.init_pairnorm()
    # NOTE(review): the two lines below are disabled code for building a
    # joint pair table from both models; kept as found.
    #pairtable = np.empty((model1.pair_len, model1.layer1_size * 2), dtype=REAL)
    #init_joint_pairtable(model1, model2, pairtable)
    # Sentence source over the test file.
    test_sentences = CatSentence(test_file)
    # Starts empty; presumably filled with confusion counts during
    # evaluation further down -- confirm against the rest of the routine.
    confusion_mtx = {}

    def prepare_sentences():
        count = 0
        for sent_tuple in test_sentences:
            yield sent_tuple
Ejemplo n.º 3
0
    # Fetch the Wikipedia data only when it is not already on disk.
    if not os.path.isfile(wikip_data):
        logger.info("downloading Wikipedia data")
        # Download to a temporary name and rename on success, so that an
        # interrupted transfer cannot leave a truncated file at wikip_data
        # that the isfile() check would accept on the next run.
        partial_path = wikip_data + ".part"
        urllib.urlretrieve(wiki_url, partial_path)
        os.rename(partial_path, wikip_data)
        # Pass the path as a logging argument so formatting is deferred to
        # the logging framework; the emitted message is unchanged.
        logger.info("downloaded in %s", wikip_data)

    # Build the sentence source, then reuse a saved model when one exists;
    # otherwise train a new model and save it under c2v_model_name.
    sentences = WikiSentence(wikip_data)
    if os.path.isfile(c2v_model_name):
        model = Category2Vec.load(c2v_model_name)
    else:
        # Training settings passed to the Category2Vec constructor.
        train_params = dict(iteration=20, model="cb", hs=1, negative=0, size=300)
        model = Category2Vec(sentences, **train_params)
        model.save(c2v_model_name)

    # Show the usage banner, then prompt and read the first query line.
    print "Input a category name or an article title (type EXIT to exit)"
    # write() is used so that no newline follows the "Name: " prompt.
    sys.stdout.write("Name: ")
    line = sys.stdin.readline()
    while line:
        line = utils.to_unicode(line.rstrip())
        if line == "EXIT":
            break
        try:
            if model.cat_no_hash.has_key(line):
                cat_no = model.cat_no_hash[line]
                cat_vec = model.cats[cat_no]
                ncats = model.most_similar_category(cat_vec, 11)
                print "Similar categories            similarity"
                print "-" * 45