# normalize_isbn.py
# Forked from torresal/gcis-isbn-validation.
import requests
from isbnlib import EAN13, clean, canonical
import os
import json
import datetime
import sys
# Sort a GCIS book dump by the kind of ISBN each record carries.
#
# Usage:  python normalize_isbn.py <gcis-dump-dir>
#
# Every JSON file found under <gcis-dump-dir>/book/ is copied into one of
#   <gcis-dump-dir>/book-normalized/isbn-13/     ISBN accepted by isbnlib.EAN13
#                                                (stored in EAN-13 form)
#   <gcis-dump-dir>/book-normalized/isbn-other/  has an ISBN, but not a valid one
#                                                (stored cleaned, hyphens removed)
#   <gcis-dump-dir>/book-normalized/isbn-none/   ISBN is null, "None" or missing
# and a summary is written to book-normalized/normalize_log.txt.


def _ensure_dir(path):
    """Create *path* (and parents) if it is not already a directory; return it."""
    if not os.path.isdir(path):
        os.makedirs(path)
    return path


def _write_book_json(out_dir, file_name, record):
    """Write *record* to ``out_dir/file_name`` as sorted, 4-space indented JSON."""
    with open(os.path.join(out_dir, file_name), 'w') as json_file:
        json_file.write(json.dumps(record, sort_keys=True, indent=4,
                                   separators=(',', ': ')))


def _sort_books(book_dir, isbn_13_path, isbn_other_path, isbn_none_path):
    """Copy each book record under *book_dir* into the matching output folder.

    Returns ``(isbn13_count, other_isbn, no_isbn)`` where the two lists hold
    the identifiers of the records filed under isbn-other and isbn-none.
    """
    isbn13_count = 0
    other_isbn = []
    no_isbn = []

    for root, _dirs, files in os.walk(book_dir):
        for file_name in files:
            # Join with the walked directory, not book_dir, so files that live
            # in sub-directories of book/ can actually be opened.
            with open(os.path.join(root, file_name)) as item:
                record = json.load(item)

            # Fall back to the file name so a record without an identifier
            # still shows up in the log instead of raising KeyError.
            identifier = record.get('identifier', file_name)
            raw_isbn = record.get('isbn')

            # The dump uses JSON null, the literal string "None", or no key
            # at all to mean "no ISBN".
            if raw_isbn is None or raw_isbn == "None":
                _write_book_json(isbn_none_path, file_name, record)
                no_isbn.append(identifier)
                continue

            cleaned_isbn = clean(raw_isbn)
            # NOTE(review): isbnlib returns a falsy value for an invalid ISBN
            # (None in older releases, '' in newer ones -- confirm against the
            # pinned version). Truthiness covers both.
            ean = EAN13(cleaned_isbn)
            if ean:
                record['isbn'] = ean
                _write_book_json(isbn_13_path, file_name, record)
                isbn13_count += 1
            else:
                record['isbn'] = cleaned_isbn.replace("-", "")
                _write_book_json(isbn_other_path, file_name, record)
                other_isbn.append(identifier)

    return isbn13_count, other_isbn, no_isbn


def _write_summary(problem_log, isbn13_count, other_isbn, no_isbn):
    """Write the per-category counts and identifier lists to *problem_log*."""
    problem_log.write("ISBN13 files: %s\n" % isbn13_count)

    problem_log.write(
        "\nFiles that have an ISBN, but are not ISBN-13 format: %s\n"
        % len(other_isbn))
    for non13_id in other_isbn:
        problem_log.write("\t%s\n" % (non13_id,))

    problem_log.write("\nFiles without any ISBN: %s\n" % len(no_isbn))
    for problem_id in no_isbn:
        problem_log.write("\t%s\n" % (problem_id,))

    total_count = isbn13_count + len(other_isbn) + len(no_isbn)
    problem_log.write("\nTotal files: %s" % total_count)


def _normalize_book_dump(book_top_path):
    """Build the book-normalized/ tree for the dump rooted at *book_top_path*."""
    book_dir = os.path.join(book_top_path, "book")

    norm_top_path = _ensure_dir(os.path.join(book_top_path, "book-normalized"))
    isbn_13_path = _ensure_dir(os.path.join(norm_top_path, "isbn-13"))
    isbn_other_path = _ensure_dir(os.path.join(norm_top_path, "isbn-other"))
    isbn_none_path = _ensure_dir(os.path.join(norm_top_path, "isbn-none"))

    # The log is opened (and truncated) before any record is processed and is
    # closed by the context manager even if a record fails to load.
    log_path = os.path.join(norm_top_path, "normalize_log.txt")
    with open(log_path, "w") as problem_log:
        problem_log.write("Normalization Log \n\n")
        isbn13_count, other_isbn, no_isbn = _sort_books(
            book_dir, isbn_13_path, isbn_other_path, isbn_none_path)
        _write_summary(problem_log, isbn13_count, other_isbn, no_isbn)


def main(argv):
    """Run the normalization for the dump directory named in ``argv[1]``.

    Prints a message and does nothing else when the argument is missing or is
    not an existing directory.
    """
    if len(argv) < 2:
        print("Requires parameter for GCIS book dump")
        return

    book_top_path = argv[1]
    if not os.path.isdir(book_top_path):
        print("Path parameter for GCIS book dump is not valid")
        return

    _normalize_book_dump(book_top_path)


if __name__ == "__main__":
    main(sys.argv)