Example #1
 def process(self, *args, **kwargs):
     logger.info("Processing file {}".format(self.items_file_path))
     counter = 0
     for item in file_wrapper(self.items_file_path):
         counter += 1
         self._process_item_wrapper(item)
     logger.info("{} items processed, {} created, {} updated, {} errors".format(counter, self._count_created, self._count_updated, self._count_error))
Example #2
 def process(self, *args, **kwargs):
     logger.info("Processing file {}".format(self.items_file_path))
     counter = 0
     for item in file_wrapper(self.items_file_path):
         counter += 1
         self._process_member(item)
     logger.info("{} items processed, {} created, {} updated".format(
         counter, self._count_created, self._count_updated))
Example #3
 def process(self, *args, **kwargs):
     logger.info("Processing file {}".format(self.items_file_path))
     counter = 0
     for item in file_wrapper(self.items_file_path):
         counter += 1
         if item['type'] == 'LibraryResultPage':
             # Ignore these entries
             continue
         if item['type'] == 'LibraryAgenda':
             # Filter out ombudsman agendas
             if 'Ombudsman' not in item['title_en']:
                 self._process_agenda_item(item)
     logger.info("{} items processed, {} created, {} updated".format(counter, self._count_created, self._count_updated))
Example #4
 def process(self, *args, **kwargs):
     logger.info("Processing file {}".format(self.items_file_path))
     counter = 0
     for item in file_wrapper(self.items_file_path):
         counter += 1
         if item['type'] == 'LibraryResultPage':
             # Ignore these entries
             continue
         if item['type'] == 'LibraryHansard':
             self._process_hansard_item(item)
     # After all downloaded hansards are created/updated, merge the ones that are parts of a hansard.
     self._merge_parts()
     logger.info("{} (raw) items processed, {} created, {} updated, {} warnings".format(counter, self._count_created, self._count_updated, self._count_warning))
     logger.info("{} merged items created/updated.".format(self._count_merged))
Example #5
    def process(self):
        logger.info("Processing file {}".format(self.items_file_path))
        counter = 0
        # keys are fields in the jsonlines item, values are the fields in the model object
        field_map = {
            'asker': 'raw_asker',
            'reply_link': 'reply_link',
            'number_and_type': 'number_and_type',
            'date': 'raw_date',
            'source_url': 'crawled_from',
            'subject': 'subject',
        }
        matcher_en = RawMember.get_matcher()
        matcher_cn = RawMember.get_matcher(False)
        for item in file_wrapper(self.items_file_path):
            try:
                counter += 1
                # For each question, fill in the raw values, then try to match against a RawMember instance

                # Generate a uid and get the object
                uid = self._generate_uid(item)
                obj, created = RawCouncilQuestion.objects.get_or_create(
                    uid=uid)
                if created:
                    self._count_created += 1
                else:
                    self._count_updated += 1

                # Fill in the last parsed and last crawled values
                if self.job is not None:
                    obj.last_crawled = self.job.completed
                obj.last_parsed = now()

                # Fill in the items that can be copied directly
                for k, v in field_map.items():
                    val = item.get(k, None)
                    setattr(obj, v, val)

                if obj.reply_link is None:
                    obj.reply_link = u''

                # the subject_link is sometimes a relative path, so convert it to an absolute url
                subject_link = item.get('subject_link', u'')
                if subject_link != u'':
                    abs_url = urljoin(item['source_url'], subject_link)
                    obj.subject_link = abs_url

                # Convert the language from the string to the constants
                lang = LANG_CN if item['language'] == u'C' else LANG_EN
                obj.language = lang
                if lang == LANG_CN:
                    matcher = matcher_cn
                else:
                    matcher = matcher_en

                # Try to find the RawMember object that matches the asker
                raw_name = item['asker']
                name = MemberName(raw_name)
                match = matcher.match(name)
                if match is not None:
                    member = match[1]
                    obj.asker = member

                # Finally save
                obj.save()
            except (KeyError, RuntimeError) as e:
                self._count_error += 1
                logger.warn(
                    u'Could not process question {} from date {}'.format(
                        item['number_and_type'], item['date']))
                logger.warn(unicode(e))
                continue
        logger.info(
            "{} items processed, {} created, {} updated, {} errors".format(
                counter, self._count_created, self._count_updated,
                self._count_error))
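From the keys this processor reads, each item in the JSON Lines file is expected to look roughly like the sketch below. Only the key names come from the code above; the values are placeholders for illustration:

    example_item = {
        'asker': u'Hon CHAN Tai-man',             # matched against RawMember names
        'reply_link': u'http://example.org/reply.htm',
        'number_and_type': u'Question 1 (Oral)',
        'date': u'2013-01-01',
        'source_url': u'http://example.org/questions.htm',
        'subject': u'An example subject',
        'subject_link': u'papers/q1.pdf',         # may be relative; resolved with urljoin
        'language': u'E',                         # u'C' for Chinese, otherwise treated as English
    }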
Example #6
 def process(self):
     logger.info("Processing file {}".format(self.items_file_path))
     counter = 0
     for item in file_wrapper(self.items_file_path):
         if item['type'] and item['file_urls']: # if we downloaded something
             counter += 1
             uid, main_type = self._generate_uid(item)
             #get/create the object
             if main_type == 'agenda':
                 obj, created = RawHansardAgenda.objects.get_or_create(uid=uid)
             elif main_type == 'minutes':
                 obj, created = RawHansardMinutes.objects.get_or_create(uid=uid)
             elif main_type == 'floor':
                 obj, created = RawHansardFloorRecord.objects.get_or_create(uid=uid)
             elif main_type == 'hansard':
                 obj, created = RawHansardFormalRecord.objects.get_or_create(uid=uid)
             else:
                 logger.warning('Unknown Hansard type: {}'.format(main_type))
                 continue
             
             #bookkeeping
             if created:
                 self._count_created += 1
             else:
                 self._count_updated += 1
             
             # Fill in model fields
             try:
                 # Fill in the last parsed and last crawled values
                 if self.job is not None:
                     obj.last_crawled = self.job.completed
                 obj.last_parsed = now()
                 
                 # Fill in the items that can be copied directly
                 # (field_map is assumed to be defined elsewhere, e.g. as a class-level
                 # mapping of jsonlines keys to model field names, as in the other examples)
                 for k, v in field_map.items():
                     val = item.get(k, None)
                     setattr(obj, v, val)
                 
                 # Fill in language field
                 lang = uid.split('-')[-1]
                 if lang == 'e':
                     obj.language = LANG_EN
                 elif lang == 'c':
                     obj.language = LANG_CN
                 elif lang == 'ec':
                     obj.language = LANG_BOTH
                 
                 # Fill in URL link to file
                 obj.url = item['file_urls'][0]
                 
                 # Fill in the local path
                 try:
                     obj.local_filename = item['files'][0]['path']
                 except IndexError:
                     logger.warn(u'Could not get local path for Hansard object {} from date {}'.format(item['type'], item['date']))
             
                 # Finally save
                 obj.save()
                 
             except (KeyError, RuntimeError) as e:
                 self._count_error += 1
                 logger.warn(u'Could not process Hansard object {} from date {}'.format(item['type'], item['date']))
                 logger.warn(unicode(e))
                 continue
         else:
             logger.warning('The Hansard type is not specified: {}'.format(item))
             
     logger.info("{} items processed, {} created, {} updated".format(counter, self._count_created, self._count_updated))
Example #7
    def process(self):
        logger.info("Processing file {}".format(self.items_file_path))
        counter = 0
        # keys are fields in the jsonlines item, values are the fields in the model object
        field_map = {
            "asker": "raw_asker",
            "reply_link": "reply_link",
            "number_and_type": "number_and_type",
            "date": "raw_date",
            "source_url": "crawled_from",
            "subject": "subject",
        }
        matcher_en = RawMember.get_matcher()
        matcher_cn = RawMember.get_matcher(False)
        for item in file_wrapper(self.items_file_path):
            try:
                counter += 1
                # For each question, fill in the raw values, then try to match against a RawMember instance

                # Generate a uid and get the object
                uid = self._generate_uid(item)
                obj, created = RawCouncilQuestion.objects.get_or_create(uid=uid)
                if created:
                    self._count_created += 1
                else:
                    self._count_updated += 1

                # Fill in the last parsed and last crawled values
                if self.job is not None:
                    obj.last_crawled = self.job.completed
                obj.last_parsed = now()

                # Fill in the items that can be copied directly
                for k, v in field_map.items():
                    val = item.get(k, None)
                    setattr(obj, v, val)

                if obj.reply_link is None:
                    obj.reply_link = u""

                # the subject_link is sometimes a relative path, so convert it to an absolute url
                subject_link = item.get("subject_link", u"")
                if subject_link != u"":
                    abs_url = urljoin(item["source_url"], subject_link)
                    obj.subject_link = abs_url

                # Convert the language from the string to the constants
                lang = LANG_CN if item["language"] == u"C" else LANG_EN
                obj.language = lang
                if lang == LANG_CN:
                    matcher = matcher_cn
                else:
                    matcher = matcher_en

                # Try to find the RawMember object that matches the asker
                # There will still be some askers not matched - we will use parser to fix them soon
                raw_name = item["asker"]
                # Some postprocessing: strip honorifics such as u"Hon" and u"議員"
                raw_name = raw_name.replace(u"Hon", u"")
                raw_name = raw_name.replace(u"議員", u"")

                # Strip leading and trailing whitespace
                raw_name = raw_name.strip()

                # Try to match the name with RawMember
                name = MemberName(raw_name)
                match = matcher.match(name)
                if match is not None:
                    member = match[1]
                    obj.asker = member
                else:
                    pass
                    # logger.warn(u'Cannot match asker "{}" with members in database'.format(raw_name))

                # Get the local path of reply content
                try:
                    obj.local_filename = item["files"][0]["path"]
                except IndexError:
                    obj.local_filename = None
                    logger.warn(
                        u"Could not get local path for question {} from date {}".format(
                            item["number_and_type"], item["date"]
                        )
                    )

                # Sometimes the reply link is not available yet,
                # and sometimes the meeting was cancelled or deferred
                # In these cases, forget about them.
                if obj.local_filename is not None:
                    obj.save()

            except (KeyError, RuntimeError) as e:
                self._count_error += 1
                logger.warn(u"Could not process question {} from date {}".format(item["number_and_type"], item["date"]))
                logger.warn(unicode(e))
                continue
        # After saving all items, use parser to fix missing askers
        no_asker_list = RawCouncilQuestion.fix_asker_by_parser()

        logger.info(
            u"{} items processed, {} created, {} updated, {} errors, {} questions without asker".format(
                counter, self._count_created, self._count_updated, self._count_error, len(no_asker_list)
            )
        )
        # for debugging
        print(no_asker_list)