def get_context_data(self, **kwargs):
    """Extend the base context with the question parser and the asker match.

    ``parser`` is added only when ``self.object.get_parser()`` returns a
    truthy value. ``name`` is added only when that parser has a truthy
    ``asker``; it holds the result of matching the asker against the
    English matcher, or against the Chinese matcher when the English one
    finds nothing (so it may still be ``None``).
    """
    context = super(RawCouncilQuestionDetailView, self).get_context_data(**kwargs)
    parser = self.object.get_parser()

    # Nothing to add when the object has no usable parser.
    if not parser:
        return context
    context['parser'] = parser

    # Without an asker there is no name to look up.
    if not parser.asker:
        return context

    asker_name = MemberName(parser.asker)
    found = RawMember.get_matcher().match(asker_name)
    if found is None:
        # English lookup failed; retry against the Chinese names.
        found = RawMember.get_matcher(english=False).match(asker_name)
    context['name'] = found
    return context
def get_context_data(self, **kwargs):
    """Extend the base context with the Hansard parser and its president.

    ``parser`` is always added (it may be ``None``). ``president`` is added
    as a ``(parser.president, match)`` tuple only when the parser reports a
    non-empty president and its first entry matches a ``RawMember``.
    """
    context = super(RawCouncilHansardDetailView, self).get_context_data(**kwargs)
    parser = self.object.get_parser()
    context['parser'] = parser

    # Covers a missing parser as well as a missing or empty president, so the
    # ``[0]`` lookup below cannot raise IndexError.
    if parser is None or not parser.president:
        return context

    # Only Chinese documents use the Chinese matcher; every other language
    # value falls back to English. Previously an unexpected language left
    # ``matcher`` unbound and the lookup below raised UnboundLocalError.
    if parser.language == LANG_CN:
        matcher = RawMember.get_matcher(english=False)
    else:
        matcher = RawMember.get_matcher()

    match = matcher.match(MemberName(parser.president[0]))
    if match is not None:
        context['president'] = (parser.president, match)
    return context
def get_context_data(self, **kwargs):
    """Extend the base context with the agenda parser and its questions.

    ``parser`` is always added (it may be ``None``). ``questions`` is a list
    of ``(question, match)`` tuples, one per entry in ``parser.questions``,
    where ``match`` is the result of looking up the asker with the English
    matcher and, failing that, the Chinese matcher (``None`` when neither
    matches). The list is empty when there is no parser or no questions.
    """
    context = super(RawCouncilAgendaDetailView, self).get_context_data(**kwargs)
    parser = self.object.get_parser()
    context['parser'] = parser

    questions = []
    # Guard against a missing parser, as the sibling detail views do.
    if parser is not None and parser.questions is not None:
        matcher_en = RawMember.get_matcher()
        # Built lazily, and only once, the first time an English lookup fails.
        matcher_cn = None
        for q in parser.questions:
            name = MemberName(q.asker)
            # Always try English first. The old code rebound a single
            # ``matcher`` variable to the Chinese matcher after the first
            # miss, so every later question skipped the English lookup.
            match = matcher_en.match(name)
            if match is None:
                # Try Chinese.
                # This will be better handled when we have different language display.
                if matcher_cn is None:
                    matcher_cn = RawMember.get_matcher(english=False)
                match = matcher_cn.match(name)
            questions.append((q, match))

    context['questions'] = questions
    return context
def get_context_data(self, **kwargs):
    """Extend the base context with the agenda parser and its questions.

    ``parser`` holds the object's parser. ``questions`` is a list of
    ``(question, match)`` tuples pairing each parsed question with the
    result of matching its asker against the default ``RawMember`` matcher;
    it is empty when ``parser.questions`` is ``None``.
    """
    context = super(RawCouncilAgendaDetailView, self).get_context_data(**kwargs)
    agenda_parser = self.object.get_parser()
    context['parser'] = agenda_parser
    member_matcher = RawMember.get_matcher()

    if agenda_parser.questions is None:
        paired = []
    else:
        paired = [
            (question, member_matcher.match(MemberName(question.asker)))
            for question in agenda_parser.questions
        ]

    context['questions'] = paired
    return context
def _get_member_object(self, uid):
    """Return the ``RawMember`` for ``uid``, creating an unsaved one if absent.

    Increments ``self._count_updated`` when an existing row is found and
    ``self._count_created`` when a new (unsaved) instance is built. When
    several rows share the uid, a ``RuntimeWarning`` is issued and ``None``
    is returned.
    """
    try:
        member = RawMember.objects.get(uid=uid)
    except RawMember.DoesNotExist:
        member = RawMember(uid=uid)
        self._count_created += 1
    except RawMember.MultipleObjectsReturned:
        message = "Found more than one item with raw id {}".format(uid)
        warnings.warn(message, RuntimeWarning)
        member = None
    else:
        # Lookup succeeded, so the caller will be updating an existing row.
        self._count_updated += 1
    return member
def process(self):
    """Load crawled council questions from the items file into the database.

    Each item yielded by ``file_wrapper(self.items_file_path)`` is mapped
    onto a ``RawCouncilQuestion`` (fetched or created by generated uid),
    its asker is matched against ``RawMember`` using the matcher for the
    item's language, and the object is saved. ``KeyError`` and
    ``RuntimeError`` raised while handling an item are counted in
    ``self._count_error`` and logged; processing then moves on to the next
    item. A summary line is logged at the end.
    """
    logger.info("Processing file {}".format(self.items_file_path))
    counter = 0
    # keys are fields in the jsonlines item, values are the fields in the model object
    field_map = {
        'asker': 'raw_asker',
        'reply_link': 'reply_link',
        'number_and_type': 'number_and_type',
        'date': 'raw_date',
        'source_url': 'crawled_from',
        'subject': 'subject',
    }
    matcher_en = RawMember.get_matcher()
    matcher_cn = RawMember.get_matcher(english=False)
    for item in file_wrapper(self.items_file_path):
        try:
            counter += 1
            # For each question, fill in the raw values, then try to match
            # against a RawMember instance.
            # Generate a uid and get the object
            uid = self._generate_uid(item)
            obj, created = RawCouncilQuestion.objects.get_or_create(uid=uid)
            if created:
                self._count_created += 1
            else:
                self._count_updated += 1

            # Fill in the last parsed and last crawled values
            if self.job is not None:
                obj.last_crawled = self.job.completed
            obj.last_parsed = now()

            # Fill in the items that can be copied directly
            for item_key, model_field in field_map.items():
                setattr(obj, model_field, item.get(item_key))
            if obj.reply_link is None:
                obj.reply_link = u''

            # the subject_link is sometimes a relative path, so convert it to an absolute url
            subject_link = item.get('subject_link', u'')
            if subject_link != u'':
                obj.subject_link = urljoin(item['source_url'], subject_link)

            # Convert the language from the string to the constants
            lang = LANG_CN if item['language'] == u'C' else LANG_EN
            obj.language = lang
            matcher = matcher_cn if lang == LANG_CN else matcher_en

            # Try to find the RawMember object that matches the asker
            match = matcher.match(MemberName(item['asker']))
            if match is not None:
                # The member object is the second element of the match.
                obj.asker = match[1]

            # Finally save
            obj.save()
        except (KeyError, RuntimeError) as e:
            self._count_error += 1
            # Use .get() here: the KeyError being handled may be for one of
            # these very keys, and indexing would raise again inside the
            # handler and abort the whole run.
            logger.warning(
                u'Could not process question {} from date {}'.format(
                    item.get('number_and_type'), item.get('date')))
            logger.warning(unicode(e))
    logger.info(
        "{} items processed, {} created, {} updated, {} errors".format(
            counter, self._count_created, self._count_updated, self._count_error))
def process(self):
    """Load crawled council questions from the items file into the database.

    Every item read through ``file_wrapper(self.items_file_path)`` is
    copied onto a ``RawCouncilQuestion`` looked up (or created) by its
    generated uid. The asker is matched against ``RawMember`` with the
    English or Chinese matcher depending on the item's language, and the
    object is saved. ``KeyError`` / ``RuntimeError`` for a single item are
    counted in ``self._count_error`` and logged without stopping the run.
    A summary line is logged once all items have been handled.
    """
    logger.info("Processing file {}".format(self.items_file_path))
    counter = 0
    # keys are fields in the jsonlines item, values are the fields in the model object
    field_map = {
        'asker': 'raw_asker',
        'reply_link': 'reply_link',
        'number_and_type': 'number_and_type',
        'date': 'raw_date',
        'source_url': 'crawled_from',
        'subject': 'subject',
    }
    matcher_en = RawMember.get_matcher()
    matcher_cn = RawMember.get_matcher(english=False)
    for item in file_wrapper(self.items_file_path):
        try:
            counter += 1
            # For each question, fill in the raw values, then try to match
            # against a RawMember instance.
            # Generate a uid and get the object
            uid = self._generate_uid(item)
            obj, created = RawCouncilQuestion.objects.get_or_create(uid=uid)
            if created:
                self._count_created += 1
            else:
                self._count_updated += 1

            # Fill in the last parsed and last crawled values
            if self.job is not None:
                obj.last_crawled = self.job.completed
            obj.last_parsed = now()

            # Fill in the items that can be copied directly
            for source_key, target_attr in field_map.items():
                setattr(obj, target_attr, item.get(source_key))
            if obj.reply_link is None:
                obj.reply_link = u''

            # the subject_link is sometimes a relative path, so convert it to an absolute url
            subject_link = item.get('subject_link', u'')
            if subject_link != u'':
                obj.subject_link = urljoin(item['source_url'], subject_link)

            # Convert the language from the string to the constants
            lang = LANG_CN if item['language'] == u'C' else LANG_EN
            obj.language = lang
            matcher = matcher_cn if lang == LANG_CN else matcher_en

            # Try to find the RawMember object that matches the asker
            match = matcher.match(MemberName(item['asker']))
            if match is not None:
                # The member object is the second element of the match.
                obj.asker = match[1]

            # Finally save
            obj.save()
        except (KeyError, RuntimeError) as e:
            self._count_error += 1
            # Look the fields up with .get(): if the failure was a KeyError
            # on one of them, indexing here would raise a second, unhandled
            # KeyError and terminate the import.
            logger.warning(u'Could not process question {} from date {}'.format(
                item.get('number_and_type'), item.get('date')))
            logger.warning(unicode(e))
    logger.info("{} items processed, {} created, {} updated, {} errors".format(
        counter, self._count_created, self._count_updated, self._count_error))
def process(self):
    """Load crawled council questions from the items file into the database.

    For each item yielded by ``file_wrapper(self.items_file_path)``:

    * fetch or create the ``RawCouncilQuestion`` for the generated uid;
    * copy the raw fields, absolute subject link and language onto it;
    * strip honorifics from the asker and try to match it to a
      ``RawMember`` using the matcher for the item's language;
    * record the local path of the downloaded reply and save the object
      only when such a path exists.

    ``KeyError`` and ``RuntimeError`` raised for a single item are counted
    in ``self._count_error`` and logged; the run continues with the next
    item. After the loop, ``RawCouncilQuestion.fix_asker_by_parser()`` is
    called and the number of entries it returns is reported in the summary
    log line as questions without an asker.
    """
    logger.info("Processing file {}".format(self.items_file_path))
    counter = 0
    # keys are fields in the jsonlines item, values are the fields in the model object
    field_map = {
        "asker": "raw_asker",
        "reply_link": "reply_link",
        "number_and_type": "number_and_type",
        "date": "raw_date",
        "source_url": "crawled_from",
        "subject": "subject",
    }
    matcher_en = RawMember.get_matcher()
    matcher_cn = RawMember.get_matcher(english=False)
    for item in file_wrapper(self.items_file_path):
        try:
            counter += 1
            # For each question, fill in the raw values, then try to match
            # against a RawMember instance.
            # Generate a uid and get the object
            uid = self._generate_uid(item)
            obj, created = RawCouncilQuestion.objects.get_or_create(uid=uid)
            if created:
                self._count_created += 1
            else:
                self._count_updated += 1

            # Fill in the last parsed and last crawled values
            if self.job is not None:
                obj.last_crawled = self.job.completed
            obj.last_parsed = now()

            # Fill in the items that can be copied directly
            for item_key, model_field in field_map.items():
                setattr(obj, model_field, item.get(item_key))
            if obj.reply_link is None:
                obj.reply_link = u""

            # the subject_link is sometimes a relative path, so convert it to an absolute url
            subject_link = item.get("subject_link", u"")
            if subject_link != u"":
                obj.subject_link = urljoin(item["source_url"], subject_link)

            # Convert the language from the string to the constants
            lang = LANG_CN if item["language"] == u"C" else LANG_EN
            obj.language = lang
            matcher = matcher_cn if lang == LANG_CN else matcher_en

            # Try to find the RawMember object that matches the asker.
            # There will still be some askers not matched - the parser-based
            # fix after the loop handles those.
            # A null asker is treated as an empty name rather than crashing
            # on ``.replace``.
            raw_name = item["asker"] or u""
            # Drop the honorifics 'Hon' and '議員', then trim surrounding
            # whitespace. ``strip()`` is safe on an empty string and removes
            # every leading/trailing space; the old ``raw_name[0]`` /
            # ``raw_name[-1]`` checks raised IndexError on an empty name and
            # removed at most one space per side.
            raw_name = raw_name.replace(u"Hon", u"").replace(u"議員", u"").strip()

            # Try to match the name with RawMember; an empty name cannot match.
            match = matcher.match(MemberName(raw_name)) if raw_name else None
            if match is not None:
                # The member object is the second element of the match.
                obj.asker = match[1]

            # Get the local path of reply content
            try:
                obj.local_filename = item["files"][0]["path"]
            except IndexError:
                obj.local_filename = None
                logger.warning(
                    u"Could not get local path for question {} from date {}".format(
                        item.get("number_and_type"), item.get("date")
                    )
                )

            # Sometimes the reply link is not available yet,
            # and sometimes the meeting was cancelled or deferred.
            # In these cases, forget about them.
            if obj.local_filename is not None:
                obj.save()
        except (KeyError, RuntimeError) as e:
            self._count_error += 1
            # Use .get(): the KeyError being handled may concern one of these
            # keys, and indexing would raise again inside the handler and
            # abort the whole run.
            logger.warning(
                u"Could not process question {} from date {}".format(
                    item.get("number_and_type"), item.get("date")
                )
            )
            logger.warning(unicode(e))

    # After saving all items, use parser to fix missing askers
    no_asker_list = RawCouncilQuestion.fix_asker_by_parser()
    logger.info(
        u"{} items processed, {} created, {} updated, {} errors, {} questions without asker".format(
            counter, self._count_created, self._count_updated, self._count_error, len(no_asker_list)
        )
    )
    # Debug aid: routed through the logger instead of printing to stdout.
    logger.debug("Questions without asker: %s", no_asker_list)