# Read titles from a text file, attach a category to each one, and write the
# result out in MessagePack format.
#   "title\n..." -> [[category, title], ...]

import sys

import msgpackutil

# Three positional arguments are required; sys.argv[0] is the script name.
if len(sys.argv) <= 3:
    print("Usage: " + sys.argv[0] + " category infile outfile", file=sys.stderr)
    # Use sys.exit() rather than the bare exit(): the latter is injected by
    # the `site` module for interactive sessions and is not guaranteed to
    # exist (e.g. under `python -S`).
    sys.exit(1)

category = sys.argv[1]
infile = sys.argv[2]
outfile = sys.argv[3]
print("category = " + category)
print("infile   = " + infile)
print("outfile  = " + outfile)

print("loading...")
# Decode explicitly as UTF-8 so that non-ASCII titles are read the same way
# regardless of the platform's locale default.
# NOTE(review): assumes the input file is UTF-8 encoded -- confirm.
with open(infile, "r", encoding="utf-8") as file:
    # One title per line; rstrip() removes the newline together with any
    # other trailing whitespace.
    in_records = [line.rstrip() for line in file]

print("processing...")
# Pair every title with the category, then sort the [category, title] pairs.
out_records = [[category, title] for title in in_records]
out_records.sort()

print("dumping...")
msgpackutil.dump(outfile, out_records)
print("ok")
# Report how many records of each kind are available.
print("rail_records.len  = " + str(len(rail_records)))
print("other_records.len = " + str(len(other_records)))

# Fix the random seed, then shuffle both pools in place.
random.seed(0)
random.shuffle(rail_records)
random.shuffle(other_records)

# Carve out the test data: the first `num_of_test_records` entries of each
# shuffled pool, which are then removed from the pools.
num_of_test_records = 1000
test_records = (
    rail_records[:num_of_test_records] + other_records[:num_of_test_records]
)
del rail_records[:num_of_test_records]
del other_records[:num_of_test_records]
random.shuffle(test_records)

# Whatever is left in the two pools becomes the training data.
train_records = rail_records + other_records
random.shuffle(train_records)

print("test_records.len  = " + str(len(test_records)))
print("train_records.len = " + str(len(train_records)))

print("dumping...")
msgpackutil.dump(testfile, test_records)
msgpackutil.dump(trainfile, train_records)
def save(self, filename):
    """Write ``self.terms`` to *filename* by passing both to ``msgpackutil.dump``."""
    terms = self.terms
    msgpackutil.dump(filename, terms)