/
updating.py
232 lines (197 loc) · 8.93 KB
/
updating.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
"""Updating module to update and upload CDS records."""
import argparse
from json import load
from os.path import join
from invenio.bibtask import task_low_level_submission
from invenio.search_engine import get_record
from invenio.bibrecord import (
field_add_subfield, field_xml_output,
record_get_field_instances, field_get_subfield_values)
def extend_author_field(author_field, cds_id):
    """Attach a CDS authority id and the Beard marker to an author field.

    Appends the MARC subfields
        $$0:AUTHOR|(CDS)<cds_id>
        $$9:#BEARD#
    to `author_field`, unless that exact authority id is already present.

    :param author_field: a bibrecord field tuple, e.g.
        ([('a', 'Ellis, John'),
          ('u', "King's Coll. London"),
          ('u', 'CERN')], ' ', ' ', '', 32)
        (as returned by `record_get_field_instances`); mutated in place
    :param str cds_id: digits of the CDS profile id, e.g. '2108556'
    :return: True if the subfields were appended, False if the authority
        id was already there
    """
    cds_authority_id = "AUTHOR|(CDS){0}".format(cds_id)
    # Guard: nothing to do when the id is already attached.
    if cds_authority_id in field_get_subfield_values(author_field, '0'):
        return False
    field_add_subfield(author_field, "0", cds_authority_id)
    field_add_subfield(author_field, "9", "#BEARD#")
    return True
def update_record(record_id, authors):
    """Update authors in CDS record.
    :param int record_id: record to update author datafields
    Example:
    record_id = 2150939
    :param dict authors: dictionary where keys are author full names and
    values the CDS profile ids to be updated in the given record
    Example:
    authors = {'Ellis, John': '2108556'}
    :return: string representing the record XML element containing
    author (`100`) and/or co-author (`700`) datafields. Empty string if
    nothing to update
    Example:
    '<record>
    <controlfield tag="001">2150939</controlfield>
    <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Ellis, John</subfield>
    <subfield code="u">King's Coll. London</subfield>
    <subfield code="u">CERN</subfield>
    <subfield code="0">AUTHOR|(CDS)2108556</subfield>
    <subfield code="9">#BEARD#</subfield>
    </datafield>
    </record>'
    """
    record = get_record(record_id)
    record_author = record_get_field_instances(record, "100")
    record_coauthors = record_get_field_instances(record, "700")
    # A record is expected to carry at most one `100` (main author) field;
    # bail out rather than guess which one to update.
    if len(record_author) > 1:
        print ("Oops: several '100' (main author) fields have been found in "
               "record '{0}'".format(record_id))
        return ""
    datafields = ""
    # Tracks whether at least one field (100 or 700) was actually extended.
    author = False
    for author_field in record_author:
        try:
            author_name = field_get_subfield_values(author_field, 'a')[0]
            try:
                cds_id = authors[author_name]
                if extend_author_field(author_field, cds_id):
                    # The (single) main-author field is only emitted when it
                    # was actually extended.
                    datafields += field_xml_output(author_field, "100")
                    author = True
            except KeyError:
                # Author name is not among the ones to link; skip it.
                pass
        except IndexError:
            # Author field (`100`) does not have subfield `a`
            pass
    # Only walk the co-author fields when there can be matches left:
    # either more than one author to link, or the main author did not match.
    if len(authors) > 1 or not author:
        for coauthor_field in record_coauthors:
            try:
                coauthor_name = field_get_subfield_values(
                    coauthor_field, 'a')[0]
                try:
                    cds_id = authors[coauthor_name]
                    if extend_author_field(coauthor_field, cds_id):
                        author = True
                except KeyError:
                    # Co-author not among the ones to link; still re-emitted
                    # below.
                    pass
            except IndexError:
                # Co-author field (`700`) does not have subfield `a`
                pass
            # NOTE(review): every `700` field is re-emitted, not only the
            # extended ones — presumably required so a correcting upload
            # keeps the untouched co-author fields; confirm against
            # bibupload `--correct` semantics.
            datafields += field_xml_output(coauthor_field, "700")
    # Nothing to update
    if not author:
        # print "No authors to update in record '{0}'".format(record_id)
        return ""
    record = ('<record><controlfield tag="001">{0}</controlfield>{1}'
              '</record>'.format(record_id, datafields))
    return record
def swap_clusters(linked_clusters):
    """Swap linked clusters.

    :param dict linked_clusters: contains the linked clusters
        created by the linking process. Keys are CDS profile ids to which the
        cluster belongs and values is a list containing `signature_id`s of
        the shape "<publication_id>_<author_name>_<signature_number>"
    Example:
        {'2108556': ['1000048_Ellis, Jonathan Richard_3262364',
                     '100545_Ellis, Jonathan Richard_13778',
                     '1042975_John Ellis_3414782', ...],
         '2094406': ['2127658_Betti, Federico_8687791',
                     '5701_Betti, F_4574', ...],
         ...}
    :return: dictionary representing linked clusters in another format.
        Keys are CDS record ids which have to be updated and values is a
        dictionary mapping the author's full name to its CDS profile id
    Example:
        {'1000048': {'Ellis, Jonathan Richard': '2108556'},
         '100545': {'Ellis, Jonathan Richard': '2108556'},
         '1042975': {'John Ellis': '2108556'},
         '2127658': {'Betti, Federico': '2094406'},
         '5701': {'Betti, F': '2094406'},
         ...}
    """
    clusters_swapped = {}
    for cds_id, signature_ids in linked_clusters.items():
        for signature_id in signature_ids:
            # Split off the first segment (publication id) and the last
            # segment (signature number) only, so that author names which
            # themselves contain underscores survive intact. A plain
            # split("_")[1] would truncate such names at the first "_".
            publication_id, remainder = signature_id.split("_", 1)
            author_name = remainder.rsplit("_", 1)[0]
            # TODO check for same author names in one record
            clusters_swapped.setdefault(publication_id, {})[author_name] = \
                cds_id
    return clusters_swapped
def update(input_clusters, output_updates_dir, chunk_size=1000, upload=False):
    """Update authors which are linked to CDS profiles.

    :param str input_clusters: file path to JSON file containing the
        linked clusters for updating the signatures
    :param str output_updates_dir: existing directory path to write XML files
        containing record updates used for bibupload
    :param int chunk_size: number of record updates to write to one file
    :param bool upload: send updates (`output_updates`) to bibupload if enabled
    """
    # Load linked clusters
    with open(input_clusters) as f:
        linked_clusters = load(f)
    print("{0} clusters have been loaded".format(len(linked_clusters)))
    # Swap linked clusters so they are keyed by record id
    clusters_swapped = swap_clusters(linked_clusters)
    print("{0} records to update".format(len(clusters_swapped)))
    # Update records and write them to (multiple) chunked file(s)
    file_handle = None
    chunk = 0
    records_in_current_file = 0
    for cluster_id in clusters_swapped:
        # Open a new chunk file on the very first write and whenever the
        # current file is full. Counting actual writes (instead of loop
        # iterations) guarantees exactly `chunk_size` records per file,
        # fixing the previous off-by-one (chunk_size - 1 records per file).
        if file_handle is None or records_in_current_file >= chunk_size:
            if file_handle:
                file_handle.close()
            file_path = join(output_updates_dir,
                             "record_updates_{0}.xml".format(chunk))
            file_handle = open(file_path, "w")
            chunk += 1
            records_in_current_file = 0  # Reset counter
        # cluster_id is representing the record id
        record_id = int(cluster_id)
        record = update_record(record_id, clusters_swapped[cluster_id])
        if record:
            file_handle.write(record)
            records_in_current_file += 1
    # Guard: with an empty input no file was ever opened, so `file_handle`
    # is still None and the unconditional close() would raise.
    if file_handle:
        file_handle.close()
    # TODO upload: call bibupload (--correct), if file contains data and
    # `upload` is enabled
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--input_clusters", required=True, type=str,
help="Linked clusters by authority ids of signatures")
parser.add_argument("--output_updates_dir", required=True, type=str,
help="Updates as MARC XML records used for bibupload")
parser.add_argument("--chunk_size", default=1000, type=int,
help="Number of records to write to one file")
parser.add_argument("--upload", default=0, type=int,
help="Whether it should be sent to bibupload or not")
args = parser.parse_args()
update(args.input_clusters, args.output_updates_dir,
chunk_size = args.chunk_size, upload=args.upload == 1)