p_dir = re.compile("^.*/")
        basename = p_dir.sub("", input_file)
        if args.outdir:
            outdir = args.outdir
        else:
            m = p_dir.search(input_file)
            outdir = m.group(0) if m else ""
        if args.split:
            input_file = args.train
        logging.info("train from input file")
        model = Category2Vec(CatSentence(input_file,
                                         cont_col=3,
                                         split=args.split),
                             iteration=args.iteration,
                             model=args.model,
                             hs=args.hs,
                             negative=args.neg,
                             workers=args.thread,
                             alpha=args.alpha,
                             size=args.dim,
                             update_mode=args.update,
                             normalize_each_epoch=args.norm)
        modelfile = "%s%s_%s.model" % (outdir, basename, model.identifier())
        model.save(modelfile)

    # Runs once a model object is available from the code above.
    # NOTE(review): init_pairnorm() is defined on the model class outside this
    # excerpt; presumably it precomputes norms used during evaluation -- confirm.
    logging.info("initializing pairnorm")
    model.init_pairnorm()
    # Test data is read with CatSentence's default arguments (no cont_col or
    # split override, unlike the training reader above).
    test_sentences = CatSentence(test_file)
    # Starts empty here; it is populated by code outside this excerpt.
    confusion_mtx = {}

    def prepare_sentences():
        count = 0
# --- Example no. 2 (start of a separate snippet; listing residue) ---
# 0
        level=logging.INFO)
    # Keep both the downloaded corpus and the trained model next to this
    # script, so the demo works regardless of the current working directory.
    current_dir = os.path.dirname(os.path.realpath(__file__))
    model_dir_path = os.path.join(current_dir, model_dir)
    wikip_data = os.path.join(current_dir, wiki_name)
    c2v_model_name = os.path.join(model_dir_path, wiki_name + "_cat.model")
    if not os.path.exists(model_dir_path):
        os.mkdir(model_dir_path)
    if not os.path.isfile(wikip_data):
        # First run only: fetch the corpus; later runs reuse the local copy.
        # NOTE(review): an interrupted download leaves a partial file that
        # this isfile() check will accept on the next run.
        logger.info("downloading Wikipedia data")
        urllib.urlretrieve(wiki_url, wikip_data)
        # Pass the path as a logging argument so formatting is deferred to
        # the logging framework.
        logger.info("downloaded in %s", wikip_data)

    sentences = WikiSentence(wikip_data)
    if not os.path.isfile(c2v_model_name):
        # No cached model yet: train with the demo's fixed hyperparameters
        # and cache the result for subsequent runs.
        model = Category2Vec(sentences,
                             iteration=20,
                             model="cb",
                             hs=1,
                             negative=0,
                             size=300)
        model.save(c2v_model_name)
    else:
        # Reuse the model saved by an earlier run.
        model = Category2Vec.load(c2v_model_name)

    # Interactive lookup: print the usage hint once, then show the prompt and
    # read the first query. The loop that follows keeps going while the line
    # read is non-empty (readline() returns "" at end of input).
    print "Input a category name or an article title (type EXIT to exit)"
    # sys.stdout.write is used so the prompt is not followed by a newline.
    sys.stdout.write("Name: ")
    line = sys.stdin.readline()
    while line:
        line = utils.to_unicode(line.rstrip())
        if line == "EXIT":
            break
        try:
            if model.cat_no_hash.has_key(line):