コード例 #1
0
# テキストファイルからタイトルを読み込み、カテゴリを付加し、MessagePack形式で書き出す。
# "title\n..." -> [[category, title], ...]

import sys
import msgpackutil

# At least three positional arguments (category, infile, outfile) are needed.
if len(sys.argv) <= 3:
    print("Usage: " + sys.argv[0] + " category infile outfile", file=sys.stderr)
    # Use sys.exit rather than the bare exit() helper: exit() is injected by
    # the site module and is not available under `python -S` or frozen builds.
    sys.exit(1)

category = sys.argv[1]
infile   = sys.argv[2]
outfile  = sys.argv[3]
print("category = " + category)
print("infile   = " + infile)
print("outfile  = " + outfile)

print("loading...")
# Decode explicitly as UTF-8 so that non-ASCII titles are read the same way on
# every platform instead of depending on the locale's default encoding.
# NOTE(review): assumes the title files are UTF-8 encoded -- confirm.
with open(infile, "r", encoding="utf-8") as file:
    # One title per line; trailing whitespace (including the newline) is dropped.
    in_records = [line.rstrip() for line in file]

print("processing...")
# Attach the category to every title: "title" -> [category, title].
out_records = [[category, title] for title in in_records]
# Every record shares the same category, so this effectively sorts by title.
out_records.sort()

print("dumping...")
msgpackutil.dump(outfile, out_records)

print("ok")
コード例 #2
0
print("rail_records.len  = " + str(len(rail_records)))
print("other_records.len = " + str(len(other_records)))

# Seed the generator with a fixed value so the split is reproducible.
random.seed(0)

for _source in (rail_records, other_records):
    random.shuffle(_source)

# Move the first `num_of_test_records` entries of each (shuffled) source list
# into the test set; the moved entries are removed from the source in place.
num_of_test_records = 1000
test_records = []
for _source in (rail_records, other_records):
    test_records += _source[:num_of_test_records]
    del _source[:num_of_test_records]
random.shuffle(test_records)

# Whatever is left in both source lists becomes the training set.
train_records = [*rail_records, *other_records]
random.shuffle(train_records)

print("test_records.len  = " + str(len(test_records)))
print("train_records.len = " + str(len(train_records)))

print("dumping...")
for _path, _records in ((testfile, test_records), (trainfile, train_records)):
    msgpackutil.dump(_path, _records)
コード例 #3
0
print("rail_records.len  = " + str(len(rail_records)))
print("other_records.len = " + str(len(other_records)))

# A fixed seed keeps the shuffles below deterministic across runs.
random.seed(0)
random.shuffle(rail_records)
random.shuffle(other_records)

# Build the test set from the head of each shuffled list, removing the
# selected entries from the originals as we go.
num_of_test_records = 1000
test_records = list(rail_records[:num_of_test_records])
rail_records[:num_of_test_records] = []
test_records += other_records[:num_of_test_records]
other_records[:num_of_test_records] = []
random.shuffle(test_records)

# The training set is everything that was not moved into the test set.
train_records = list(rail_records) + list(other_records)
random.shuffle(train_records)

print("test_records.len  = " + str(len(test_records)))
print("train_records.len = " + str(len(train_records)))

print("dumping...")
msgpackutil.dump(testfile, test_records)
msgpackutil.dump(trainfile, train_records)
コード例 #4
0
 def save(self, filename):
     """Write ``self.terms`` to ``filename`` using ``msgpackutil.dump``.

     Args:
         filename: Destination handed unchanged to ``msgpackutil.dump`` as
             its first argument (presumably a file path -- confirm against
             ``msgpackutil``).
     """
     write = msgpackutil.dump
     write(filename, self.terms)