Esempio n. 1
0
    # Resolve the spots JSON file; fall back to the data folder next to this script.
    path = args.path or os.path.join(os.path.dirname(__file__), "../data/spots.json")
    cityspots = CitySpots.load(path)

    # Build one plain record per city: the city code plus the codes of its spots.
    city_records = [
        {
            "id": entry.city.code,
            "spots": [spot.code for spot in entry.spots]
        }
        for entry in cityspots
    ]

    # Convert the loaded data into a document and preview its vocabulary.
    corpus_doc = CitySpots.to_doc(cityspots)

    print("show city spots corpus")
    corpus_doc.show_vocab(limit=20)

    # Persist the document as a pickle in the same directory as the input file.
    pickle_target = os.path.join(os.path.dirname(path), "./cityspots_doc.pickle")
    doc_store = PickleResource(pickle_target)
    doc_store.save(corpus_doc)

    # Persist the per-city records as UTF-8 encoded JSON in the same directory.
    payload = json.dumps(city_records, indent=2, ensure_ascii=False)
    json_target = os.path.join(os.path.dirname(path), "./cityspots.json")
    with open(json_target, "wb") as out:
        out.write(payload.encode("utf-8"))
    # Training options: --iter and --burn are forwarded to train() below,
    # --epoch is the number of models built per topic count.
    parser.add_argument("--iter", type=int, default=100, help="number of training iteration.")
    parser.add_argument("--burn", type=int, default=1000, help="number of burn.")
    parser.add_argument("--epoch", type=int, default=10, help="number of epoch.")

    args = parser.parse_args()
    # Use the given document path, or fall back to the edited corpus pickle
    # in the data folder next to this script.
    path = args.path
    if not path:
        path = os.path.join(os.path.dirname(__file__), "../data/cityspots_doc_edited.pickle")

    # make resource files
    # make_path(p) yields "<dir of path>/./<file stem of path>_<p>", so derived
    # files sit beside the input document and share its base name.
    fname = os.path.basename(path)
    make_path = lambda p: os.path.join(os.path.dirname(path), "./" + (os.path.splitext(fname)[0]) + "_" + p)
    r = GensimResource(make_path("model.gensim"))

    # document (corpus)
    p = PickleResource(path)
    doc = p.load()
    # NOTE(review): presumably holds out 30% of the document as the test part;
    # confirm against the split() implementation.
    training, test = doc.split(right_rate_or_size=0.3, compact=False)

    # NOTE(review): model/perplexity are not updated in the visible lines;
    # they look like "best so far" trackers for code that follows - confirm.
    model = None
    perplexity = 1e5
    # Train for a single topic count, or for every count in
    # [args.topics, args.till] (inclusive) when --till is given.
    topics = [args.topics]
    if args.till:
        topics = range(args.topics, args.till + 1)

    for t in topics:
        print("topic count = {0}".format(t))
        for e in range(args.epoch):
            # make model
            # A fresh model is constructed on every epoch iteration and
            # trained on the training part only.
            m = GTopicModel(t, training, resource=r)
            m.train(iter=args.iter, burn=args.burn)
Esempio n. 3
0
if __name__ == "__main__":
    # Command-line entry point: load a pickled spot document and prune its
    # vocabulary according to the options given below.

    def _parse_bool(value):
        """Convert a command-line string to a boolean.

        ``type=bool`` would turn every non-empty string (including "false")
        into True, so the usual negative spellings are recognised here.
        Any other value is treated as True, as before.
        """
        return value.strip().lower() not in ("", "0", "false", "f", "no", "n", "off")

    parser = argparse.ArgumentParser(description="Edit corpus of spot document.")

    parser.add_argument("--path", type=str, help="path to spots document file (pickle).")
    # A bare "--save" yields const=True; an explicit value goes through _parse_bool.
    parser.add_argument("--save", nargs="?", type=_parse_bool, const=True, default=False, help="save result.")
    parser.add_argument("--under", type=int, default=-1, help="cut under the n count word.")
    # argparse %-formats help strings, so a literal percent sign must be
    # written as "%%"; a bare "%" breaks help formatting.
    parser.add_argument("--above", type=float, default=-1, help="cut above the n or n%% count word.")
    parser.add_argument("--freq", type=float, default=-1, help="cut above the n%% frequency word in documents.")
    parser.add_argument("--ignore", type=str, default="", help="ignore words list file (only file name, locate in path folder).")

    args = parser.parse_args()
    # Use the given document path, or fall back to the corpus pickle in the
    # data folder next to this script.
    path = args.path
    if not path:
        path = os.path.join(os.path.dirname(__file__), "../data/cityspots_doc.pickle")

    p = PickleResource(path)
    doc = p.load()

    # The numeric filters default to -1, so each one only runs when a
    # positive value was supplied.
    if args.freq > 0:
        doc.cut_frequent(args.freq)

    # Always applied. The labels are Japanese part-of-speech tags:
    # pos: verb, adverb; class1: suffix, adverb-possible;
    # class2: person name, region, adverb-possible.
    doc.cut_pos({"pos": ["動詞", "副詞"], "class1": ["接尾", "副詞可能"], "class2": ["人名", "地域", "副詞可能"]})

    if args.under > 0:
        doc.cut_under(args.under)

    if args.above > 0:
        doc.cut_above(args.above)

    # The ignore list is given as a bare file name and resolved relative to
    # the directory that holds the document.
    if args.ignore:
        ig_path = os.path.join(os.path.dirname(path), args.ignore)