def process(self, *args, **kwargs):
    """Run every raw item in the scraped items file through the item wrapper.

    Logs the file being processed up front and the created/updated/error
    tallies (accumulated on ``self`` by ``_process_item_wrapper``) at the end.
    """
    logger.info("Processing file {}".format(self.items_file_path))
    # ``counter`` must exist even when the file yields nothing.
    counter = 0
    for counter, item in enumerate(file_wrapper(self.items_file_path), 1):
        self._process_item_wrapper(item)
    logger.info(
        "{} items processed, {} created, {} updated, {} errors".format(
            counter, self._count_created, self._count_updated,
            self._count_error))
def process(self, *args, **kwargs):
    """Run every raw item in the scraped items file through the member processor.

    Logs the file being processed up front and the created/updated tallies
    (accumulated on ``self`` by ``_process_member``) at the end.
    """
    logger.info("Processing file {}".format(self.items_file_path))
    # ``counter`` must exist even when the file yields nothing.
    counter = 0
    for counter, item in enumerate(file_wrapper(self.items_file_path), 1):
        self._process_member(item)
    logger.info("{} items processed, {} created, {} updated".format(
        counter, self._count_created, self._count_updated))
def process(self, *args, **kwargs):
    """Process library agenda items from the scraped items file.

    Result-page entries are skipped; Ombudsman agendas are filtered out;
    every remaining ``LibraryAgenda`` item is handed to
    ``_process_agenda_item``.
    """
    logger.info("Processing file {}".format(self.items_file_path))
    counter = 0
    for item in file_wrapper(self.items_file_path):
        counter += 1
        if item['type'] == 'LibraryResultPage':
            # Ignore these entries.  Was ``pass``, which fell through to the
            # next check; ``continue`` makes the skip explicit (matching the
            # hansard processor) and avoids the redundant second comparison.
            continue
        if item['type'] == 'LibraryAgenda':
            # Filter out ombudsman agendas
            if 'Ombudsman' not in item['title_en']:
                self._process_agenda_item(item)
    logger.info("{} items processed, {} created, {} updated".format(
        counter, self._count_created, self._count_updated))
def process(self, *args, **kwargs):
    """Process library hansard items, then merge multi-part hansards.

    Result-page entries are skipped; each ``LibraryHansard`` item is handed
    to ``_process_hansard_item``.  Once all downloaded hansards have been
    created/updated, ``_merge_parts`` combines the ones that are parts of a
    single hansard.
    """
    logger.info("Processing file {}".format(self.items_file_path))
    counter = 0
    for item in file_wrapper(self.items_file_path):
        counter += 1
        item_type = item['type']
        if item_type == 'LibraryResultPage':
            # Ignore these entries
            continue
        if item_type == 'LibraryHansard':
            self._process_hansard_item(item)
    # After all downloaded hansards are created/updated, merge the ones that
    # are parts of a hansard.
    self._merge_parts()
    logger.info(
        "{} (raw) items processed, {} created, {} updated, {} warnings".format(
            counter, self._count_created, self._count_updated,
            self._count_warning))
    logger.info("{} merged items created/updated.".format(self._count_merged))
def process(self, *args, **kwargs):
    """Process library agenda items from the scraped items file.

    Result-page entries are skipped; Ombudsman agendas are filtered out;
    every remaining ``LibraryAgenda`` item is handed to
    ``_process_agenda_item``.
    """
    logger.info("Processing file {}".format(self.items_file_path))
    counter = 0
    for item in file_wrapper(self.items_file_path):
        counter += 1
        if item['type'] == 'LibraryResultPage':
            # Ignore these entries.  Was ``pass``, which fell through to the
            # next check; ``continue`` makes the skip explicit (matching the
            # hansard processor) and avoids the redundant second comparison.
            continue
        if item['type'] == 'LibraryAgenda':
            # Filter out ombudsman agendas
            if 'Ombudsman' not in item['title_en']:
                self._process_agenda_item(item)
    logger.info("{} items processed, {} created, {} updated".format(
        counter, self._count_created, self._count_updated))
def process(self):
    """Process council question items into ``RawCouncilQuestion`` objects.

    For each question: get-or-create by generated uid, copy the raw fields
    over via ``field_map``, resolve the subject link to an absolute URL,
    set the language, and try to match the asker name against a
    ``RawMember`` matcher.  Per-item KeyError/RuntimeError is counted and
    logged without aborting the run.
    """
    logger.info("Processing file {}".format(self.items_file_path))
    counter = 0
    # keys are fields in the jsonlines item, values are the fields in the
    # model object
    field_map = {
        'asker': 'raw_asker',
        'reply_link': 'reply_link',
        'number_and_type': 'number_and_type',
        'date': 'raw_date',
        'source_url': 'crawled_from',
        'subject': 'subject',
    }
    matcher_en = RawMember.get_matcher()
    matcher_cn = RawMember.get_matcher(False)
    for item in file_wrapper(self.items_file_path):
        try:
            counter += 1
            # For each question, fill in the raw values, then try to match
            # against a RawMember instance
            # Generate a uid and get the object
            uid = self._generate_uid(item)
            obj, created = RawCouncilQuestion.objects.get_or_create(uid=uid)
            if created:
                self._count_created += 1
            else:
                self._count_updated += 1
            # Fill in the last parsed and last crawled values
            if self.job is not None:
                obj.last_crawled = self.job.completed
            obj.last_parsed = now()
            # Fill in the items that can be copied directly
            for k, v in field_map.items():
                val = item.get(k, None)
                setattr(obj, v, val)
            if obj.reply_link is None:
                obj.reply_link = u''
            # the subject_link is sometimes a relative path, so convert it
            # to an absolute url
            subject_link = item.get('subject_link', u'')
            if subject_link != u'':
                abs_url = urljoin(item['source_url'], subject_link)
                obj.subject_link = abs_url
            # Convert the language from the string to the constants
            lang = LANG_CN if item['language'] == u'C' else LANG_EN
            obj.language = lang
            if lang == LANG_CN:
                matcher = matcher_cn
            else:
                matcher = matcher_en
            # Try to find the RawMember object that matches the asker
            raw_name = item['asker']
            name = MemberName(raw_name)
            match = matcher.match(name)
            if match is not None:
                member = match[1]
                obj.asker = member
            # Finally save
            obj.save()
        except (KeyError, RuntimeError) as e:
            self._count_error += 1
            # BUG FIX: use .get() here — indexing item['number_and_type'] /
            # item['date'] inside the handler re-raised KeyError when those
            # keys were the reason we got here, killing the whole run.
            logger.warn(
                u'Could not process question {} from date {}'.format(
                    item.get('number_and_type'), item.get('date')))
            logger.warn(unicode(e))
            continue
    logger.info(
        "{} items processed, {} created, {} updated, {} errors".format(
            counter, self._count_created, self._count_updated,
            self._count_error))
def process(self):
    """Process downloaded hansard items into the Raw* hansard models.

    Items with a type and at least one downloaded file are dispatched by
    ``_generate_uid``'s main type to the matching model
    (agenda/minutes/floor/hansard); the language is derived from the uid
    suffix and the local file path recorded.  Per-item
    KeyError/RuntimeError is counted and logged without aborting the run.
    """
    logger.info("Processing file {}".format(self.items_file_path))
    counter = 0
    for item in file_wrapper(self.items_file_path):
        if item['type'] and item['file_urls']:  # if we downloaded something
            counter += 1
            uid, main_type = self._generate_uid(item)
            # get/create the object
            if main_type == 'agenda':
                obj, created = RawHansardAgenda.objects.get_or_create(uid=uid)
            elif main_type == 'minutes':
                obj, created = RawHansardMinutes.objects.get_or_create(uid=uid)
            elif main_type == 'floor':
                obj, created = RawHansardFloorRecord.objects.get_or_create(uid=uid)
            elif main_type == 'hansard':
                obj, created = RawHansardFormalRecord.objects.get_or_create(uid=uid)
            else:
                # Typo fixed ("Harsard" -> "Hansard").
                logger.warning('Unknown Hansard type:{}'.format(main_type))
                # BUG FIX: previously fell through with ``obj`` undefined and
                # raised NameError below; skip unknown types instead.
                continue
            # bookkeeping
            if created:
                self._count_created += 1
            else:
                self._count_updated += 1
            # Fill in model fields
            try:
                # Fill in the last parsed and last crawled values
                if self.job is not None:
                    obj.last_crawled = self.job.completed
                obj.last_parsed = now()
                # Fill in the items that can be copied directly.
                # NOTE(review): ``field_map`` is not defined in this method —
                # presumably a module- or class-level mapping; verify it is
                # actually in scope, otherwise this raises NameError.
                for k, v in field_map.items():
                    val = item.get(k, None)
                    setattr(obj, v, val)
                # Fill in language field from the uid suffix
                lang = uid.split('-')[-1]
                if lang == 'e':
                    obj.language = LANG_EN
                elif lang == 'c':
                    obj.language = LANG_CN
                elif lang == 'ec':
                    obj.language = LANG_BOTH
                # Fill in URL link to file
                obj.url = item['file_urls'][0]
                # Fill in the local path
                try:
                    obj.local_filename = item['files'][0]['path']
                except IndexError:
                    logger.warn(
                        u'Could not get local path for Hansard object {} from date {}'.format(
                            item['type'], item['date']))
                # Finally save
                obj.save()
            except (KeyError, RuntimeError) as e:
                self._count_error += 1
                logger.warn(
                    u'Could not process Hansard object {} from date {}'.format(
                        item.get('type'), item.get('date')))
                logger.warn(unicode(e))
                continue
        else:
            # Typo fixed ("Harsard" -> "Hansard").
            logger.warning('The Hansard type is not specified:{}'.format(item))
    logger.info("{} items processed, {} created, {} updated".format(
        counter, self._count_created, self._count_updated))
def process(self):
    """Process council question items into ``RawCouncilQuestion`` objects.

    For each question: get-or-create by generated uid, copy the raw fields
    over via ``field_map``, resolve the subject link to an absolute URL,
    set the language, and try to match the asker name against a
    ``RawMember`` matcher.  Per-item KeyError/RuntimeError is counted and
    logged without aborting the run.
    """
    logger.info("Processing file {}".format(self.items_file_path))
    counter = 0
    # keys are fields in the jsonlines item, values are the fields in the
    # model object
    field_map = {
        'asker': 'raw_asker',
        'reply_link': 'reply_link',
        'number_and_type': 'number_and_type',
        'date': 'raw_date',
        'source_url': 'crawled_from',
        'subject': 'subject',
    }
    matcher_en = RawMember.get_matcher()
    matcher_cn = RawMember.get_matcher(False)
    for item in file_wrapper(self.items_file_path):
        try:
            counter += 1
            # For each question, fill in the raw values, then try to match
            # against a RawMember instance
            # Generate a uid and get the object
            uid = self._generate_uid(item)
            obj, created = RawCouncilQuestion.objects.get_or_create(uid=uid)
            if created:
                self._count_created += 1
            else:
                self._count_updated += 1
            # Fill in the last parsed and last crawled values
            if self.job is not None:
                obj.last_crawled = self.job.completed
            obj.last_parsed = now()
            # Fill in the items that can be copied directly
            for k, v in field_map.items():
                val = item.get(k, None)
                setattr(obj, v, val)
            if obj.reply_link is None:
                obj.reply_link = u''
            # the subject_link is sometimes a relative path, so convert it
            # to an absolute url
            subject_link = item.get('subject_link', u'')
            if subject_link != u'':
                abs_url = urljoin(item['source_url'], subject_link)
                obj.subject_link = abs_url
            # Convert the language from the string to the constants
            lang = LANG_CN if item['language'] == u'C' else LANG_EN
            obj.language = lang
            if lang == LANG_CN:
                matcher = matcher_cn
            else:
                matcher = matcher_en
            # Try to find the RawMember object that matches the asker
            raw_name = item['asker']
            name = MemberName(raw_name)
            match = matcher.match(name)
            if match is not None:
                member = match[1]
                obj.asker = member
            # Finally save
            obj.save()
        except (KeyError, RuntimeError) as e:
            self._count_error += 1
            # BUG FIX: use .get() here — indexing item['number_and_type'] /
            # item['date'] inside the handler re-raised KeyError when those
            # keys were the reason we got here, killing the whole run.
            logger.warn(
                u'Could not process question {} from date {}'.format(
                    item.get('number_and_type'), item.get('date')))
            logger.warn(unicode(e))
            continue
    logger.info(
        "{} items processed, {} created, {} updated, {} errors".format(
            counter, self._count_created, self._count_updated,
            self._count_error))
def process(self):
    """Process council question items, with asker-name cleanup and matching.

    Like the basic question processor, but additionally strips honorifics
    from the asker name before matching, records the local path of the
    reply file, only saves questions whose reply file was downloaded, and
    finally runs ``RawCouncilQuestion.fix_asker_by_parser`` to repair
    unmatched askers.
    """
    logger.info("Processing file {}".format(self.items_file_path))
    counter = 0
    # keys are fields in the jsonlines item, values are the fields in the model object
    field_map = {
        "asker": "raw_asker",
        "reply_link": "reply_link",
        "number_and_type": "number_and_type",
        "date": "raw_date",
        "source_url": "crawled_from",
        "subject": "subject",
    }
    matcher_en = RawMember.get_matcher()
    matcher_cn = RawMember.get_matcher(False)
    for item in file_wrapper(self.items_file_path):
        try:
            counter += 1
            # For each question, fill in the raw values, then try to match
            # against a RawMember instance
            # Generate a uid and get the object
            uid = self._generate_uid(item)
            obj, created = RawCouncilQuestion.objects.get_or_create(uid=uid)
            if created:
                self._count_created += 1
            else:
                self._count_updated += 1
            # Fill in the last parsed and last crawled values
            if self.job is not None:
                obj.last_crawled = self.job.completed
            obj.last_parsed = now()
            # Fill in the items that can be copied directly
            for k, v in field_map.items():
                val = item.get(k, None)
                setattr(obj, v, val)
            if obj.reply_link is None:
                obj.reply_link = u""
            # the subject_link is sometimes a relative path, so convert it
            # to an absolute url
            subject_link = item.get("subject_link", u"")
            if subject_link != u"":
                abs_url = urljoin(item["source_url"], subject_link)
                obj.subject_link = abs_url
            # Convert the language from the string to the constants
            lang = LANG_CN if item["language"] == u"C" else LANG_EN
            obj.language = lang
            if lang == LANG_CN:
                matcher = matcher_cn
            else:
                matcher = matcher_en
            # Try to find the RawMember object that matches the asker
            # There will still be some askers not matched - we will use
            # parser to fix them soon
            raw_name = item["asker"]
            # Get rid of 'Hon' and '議員' honorifics, then trim whitespace.
            # BUG FIX: .strip() replaces the old single-space index checks
            # (raw_name[0] / raw_name[-1]), which raised an uncaught
            # IndexError when the name became empty after the replacements.
            raw_name = raw_name.replace(u"Hon", u"")
            raw_name = raw_name.replace(u"議員", u"")
            raw_name = raw_name.strip()
            # Try to match the name with RawMember
            name = MemberName(raw_name)
            match = matcher.match(name)
            if match is not None:
                member = match[1]
                obj.asker = member
            else:
                pass
                # logger.warn(u'Cannot match asker "{}" with members in database'.format(raw_name))
            # Get the local path of reply content
            try:
                obj.local_filename = item["files"][0]["path"]
            except IndexError:
                obj.local_filename = None
                logger.warn(
                    u"Could not get local path for question {} from date {}".format(
                        item["number_and_type"], item["date"]
                    )
                )
            # Sometimes the reply link is not available yet,
            # and sometimes the meeting was cancelled or deferred
            # In these cases, forget about them.
            if obj.local_filename is not None:
                obj.save()
        except (KeyError, RuntimeError) as e:
            self._count_error += 1
            # BUG FIX: use .get() here — indexing item['number_and_type'] /
            # item['date'] inside the handler re-raised KeyError when those
            # keys were the reason we got here, killing the whole run.
            logger.warn(
                u"Could not process question {} from date {}".format(
                    item.get("number_and_type"), item.get("date")))
            logger.warn(unicode(e))
            continue
    # After saving all items, use parser to fix missing askers
    no_asker_list = RawCouncilQuestion.fix_asker_by_parser()
    logger.info(
        u"{} items processed, {} created, {} updated, {} errors, {} questions without asker".format(
            counter, self._count_created, self._count_updated,
            self._count_error, len(no_asker_list)
        )
    )
    # for debugging
    print(no_asker_list)