# load tours data
path = args.path
if not path:
    path = os.path.join(os.path.dirname(__file__), "../data/spots.json")
cityspots = CitySpots.load(path)

# create city information data
cityspotsj = []
for cs in cityspots:
    j = {
        "id": cs.city.code,
        "spots": [s.code for s in cs.spots]
    }
    cityspotsj.append(j)

# save as document
cityspots_doc = CitySpots.to_doc(cityspots)
print("show city spots corpus")
cityspots_doc.show_vocab(limit=20)
doc_path = os.path.join(os.path.dirname(path), "./cityspots_doc.pickle")
p = PickleResource(doc_path)
p.save(cityspots_doc)

# save as json file
j = json.dumps(cityspotsj, indent=2, ensure_ascii=False)
data_path = os.path.join(os.path.dirname(path), "./cityspots.json")
with open(data_path, "wb") as f:
    f.write(j.encode("utf-8"))
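# A minimal sketch (not part of the original script) of reading the two files
# written above back in, using only the PickleResource class already used here
# and the standard json module; doc_path/data_path are the variables defined above:
#
#   reloaded_doc = PickleResource(doc_path).load()
#   with open(data_path, encoding="utf-8") as f:
#       city_index = json.load(f)  # list of {"id": ..., "spots": [...]} dicts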
parser.add_argument("--iter", type=int, default=100, help="number of training iterations.")
parser.add_argument("--burn", type=int, default=1000, help="number of burn-in iterations.")
parser.add_argument("--epoch", type=int, default=10, help="number of epochs (training runs per topic count).")
args = parser.parse_args()

path = args.path
if not path:
    path = os.path.join(os.path.dirname(__file__), "../data/cityspots_doc_edited.pickle")

# resource file for the trained gensim model
fname = os.path.basename(path)

def make_path(p):
    return os.path.join(os.path.dirname(path), "./" + os.path.splitext(fname)[0] + "_" + p)

r = GensimResource(make_path("model.gensim"))

# document (corpus): split into training and held-out test documents
p = PickleResource(path)
doc = p.load()
training, test = doc.split(right_rate_or_size=0.3, compact=False)

# best model found so far and its perplexity (initial sentinel value)
model = None
perplexity = 1e5

topics = [args.topics]
if args.till:
    topics = range(args.topics, args.till + 1)

for t in topics:
    print("topic count = {0}".format(t))
    for e in range(args.epoch):
        # make model
        m = GTopicModel(t, training, resource=r)
        m.train(iter=args.iter, burn=args.burn)
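# The model/perplexity variables above suggest a "keep the lowest-perplexity model"
# selection step follows the truncated training loop. A minimal sketch of that step,
# indented as it would sit inside the epoch loop; the calc_perplexity method name is
# hypothetical (an assumption, not the project's confirmed GTopicModel API):
#
#       p_test = m.calc_perplexity(test)
#       print(" perplexity at epoch {0}: {1}".format(e, p_test))
#       if p_test < perplexity:
#           perplexity = p_test
#           model = m
#
# After the loops, `model` would hold the best-scoring model, ready to be saved
# through its GensimResource.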
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Edit corpus of spot document.")
    parser.add_argument("--path", type=str, help="path to spots document file (pickle).")
    parser.add_argument("--save", nargs="?", type=bool, const=True, default=False, help="save the result.")
    parser.add_argument("--under", type=int, default=-1, help="cut words whose count is under n.")
    parser.add_argument("--above", type=float, default=-1, help="cut words whose count or count ratio is above n.")
    parser.add_argument("--freq", type=float, default=-1, help="cut words whose document frequency is above n.")
    parser.add_argument("--ignore", type=str, default="", help="ignore-words list file (file name only, located in the same folder as --path).")
    args = parser.parse_args()

    path = args.path
    if not path:
        path = os.path.join(os.path.dirname(__file__), "../data/cityspots_doc.pickle")

    p = PickleResource(path)
    doc = p.load()

    if args.freq > 0:
        doc.cut_frequent(args.freq)

    # drop verbs/adverbs, suffixes, adverbial nouns, person names and region names
    # (MeCab part-of-speech tags)
    doc.cut_pos({"pos": ["動詞", "副詞"], "class1": ["接尾", "副詞可能"], "class2": ["人名", "地域", "副詞可能"]})

    if args.under > 0:
        doc.cut_under(args.under)
    if args.above > 0:
        doc.cut_above(args.above)
    if args.ignore:
        ig_path = os.path.join(os.path.dirname(path), args.ignore)
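    # Example invocation (illustrative only; the script filename and threshold
    # values below are placeholders, not taken from the project):
    #
    #   python edit_corpus.py --under 2 --above 0.5 --freq 0.3 \
    #       --ignore ignores.txt --save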