def process(self, data):
    """Write one registration, as a line of JSON, to the stream it belongs in.

    The registration is built from `data` with `Registration.from_json`
    and `self.disposition` picks the stream it should go to. If the
    registration has a UUID, that stream is remembered in
    `self.output_for_uuid` so later registrations naming this one as
    their parent can look it up.

    :param data: Already-parsed JSON data describing one registration.
    """
    reg = Registration.from_json(data)
    destination = self.disposition(reg)

    # Record the stream initially chosen for this registration (before
    # any parent-based override below).
    if reg.uuid:
        self.output_for_uuid[reg.uuid] = destination

    if reg.parent:
        # The previous step emitted children right after their
        # parents, so by the time a child shows up here its parent
        # has already been handled and recorded.
        parent_destination = self.output_for_uuid[reg.parent['uuid']]

        # Children are normally independent registrations. But the
        # parent is the registration with the most data available,
        # so when it was judged out of range, the children probably
        # are too -- even if they look in range on their own.
        follow_parent = (
            parent_destination
            and destination == self.in_range
            and parent_destination != self.in_range
        )
        if follow_parent:
            reg.disposition = "Classified with parent."
            reg.warnings.append(
                "This registration seems to be in range, but it was associated with a registration which was a foreign publication or not in range. To be safe, this registration will be put in the same category as its 'parent'; it should be checked manually."
            )
            destination = parent_destination

    # One JSON document per line.
    json.dump(reg.jsonable(require_disposition=True), destination)
    destination.write("\n")
if penalty > 0: # Beyond "a couple typoes", the Levenshtein distance # basically means there's no match, so we cap the penalty # at a pretty low level. penalty = min(penalty, 0.20) return penalty comparator = Comparator("output/ia-0-texts.ndjson") output = open("output/ia-1-matched.ndjson", "w") for filename in ["FINAL-not-renewed.ndjson" ]: #"FINAL-possibly-renewed.ndjson"]: for i in open("output/%s" % filename): cce = Registration.from_json(json.loads(i)) title = cce.title if not title or not comparator.normalize(title): continue matches = list(comparator.matches(cce)) # If there are a huge number of IA matches for a CCE title, # penalize them -- it's probably a big mess that must be dealt # with separately. Give a slight boost if there's only a single # match. if len(matches) == 1: num_matches_coefficient = 1.1 elif len(matches) <= MATCH_CUTOFF: num_matches_coefficient = 1 else: num_matches_coefficient = 1 - (len(matches) -
def convert(self, input_file):
    """Write a header row, then one row per registration in `input_file`.

    The header is `Registration.csv_row_labels` followed by
    `Renewal.csv_row_labels`. Each line of the input is parsed with
    `json.loads`, turned into a `Registration`, and its `csv_row` is
    written to `self.out`.

    :param input_file: Path to a file with one JSON document per line.
    """
    # NOTE(review): self.out is presumably a csv.writer; only its
    # writerow() method is used here.
    self.out.writerow(Registration.csv_row_labels + Renewal.csv_row_labels)

    # Open the input in a context manager so the file handle is closed
    # when the loop finishes or if a line fails to parse.
    with open(input_file) as lines:
        for line in lines:
            registration = Registration.from_json(json.loads(line))
            self.out.writerow(registration.csv_row)