def run_update():
    """Download and record subjects for cables whose subject is missing.

    The cable ids read from ``_FILE_MISSING_SUBJECTS`` are intersected with
    the ids returned by ``_aftenposten_cable_ids()`` and
    ``_russianreporter_cable_ids()``.  For every remaining id the cable's
    HTML page is fetched, its subject is extracted with ``parse_subject``
    and the pair ``(subject, iri)`` is stored in ``_additional_subjects``.
    Finally ``_write_fixed_subjects(_additional_subjects)`` is called.

    A cable id present in both id sets is fetched from ``_AP_IRI`` only.
    For those cables the stored IRI is group 1 of ``_AP_SOURCE_PATTERN``
    matched against the page; for the others it is the fetched page's IRI.

    Cables whose page yields no subject (or, for ``_AP_IRI`` pages, no
    source IRI) are reported on stdout and skipped, so one bad page does
    not prevent the remaining results from being written.
    """
    ap_cables = _aftenposten_cable_ids()
    rr_cables = _russianreporter_cable_ids()
    missing_subjects = _file_as_set(_FILE_MISSING_SUBJECTS)
    cable_ids = missing_subjects & (ap_cables | rr_cables)
    for cable_id in cable_ids:
        # Every id is in at least one of the two sets; the first one wins.
        from_ap = cable_id in ap_cables
        source_iri = (_AP_IRI if from_ap else _RR_IRI) + cable_id + '.html'
        response = urllib2.urlopen(source_iri)
        try:
            page = response.read().decode('utf-8', 'ignore')
        finally:
            # Release the HTTP handle even if reading/decoding fails.
            response.close()
        subject_iri = source_iri
        if from_ap:
            match = _AP_SOURCE_PATTERN.search(page)
            if match is None:
                # Report and skip instead of dying with AttributeError.
                print('-------- ERROR, cannot find source IRI in '
                      + source_iri)
                continue
            subject_iri = match.group(1)
        subject = parse_subject(page)
        if not subject and from_ap and cable_id == u'09OSLO399':
            # This cable is malformed
            subject = u'NORWAYS RUSSIA POLICY: WISHFUL THINKING'
        if not subject:
            print('-------- ERROR, cannot parse subject of ' + source_iri)
            continue
        _additional_subjects[cable_id] = (subject, subject_iri)
    _write_fixed_subjects(_additional_subjects)
def subject(self):
    """Return the result of ``reader.parse_subject`` for this object.

    The parser receives ``self.content`` and ``self.reference_id``, in that
    order, and its return value is passed through unchanged.
    """
    parse = reader.parse_subject
    return parse(self.content, self.reference_id)
def check(expected, input):
    """Assert via ``eq_`` that ``parse_subject(input)`` equals `expected`."""
    assert_equal = eq_
    actual = parse_subject(input)
    assert_equal(expected, actual)
def check(content, clean, expected):
    """Assert via ``eq_`` that parsing `content` yields `expected`.

    `clean` is forwarded to ``parse_subject`` as its ``clean`` keyword
    argument.
    """
    assert_equal = eq_
    actual = parse_subject(content, clean=clean)
    assert_equal(expected, actual)