Esempio n. 1
0
 def get_context_data(self, **kwargs):
     """Extend the detail-view context with the parser and the matched asker.

     When the object yields a truthy parser it is stored under ``'parser'``.
     If that parser has a truthy ``asker``, the asker is wrapped in a
     ``MemberName`` and matched against the default matcher first and the
     ``english=False`` matcher second; the result (possibly ``None`` when
     both lookups miss) is stored under ``'name'``.
     """
     ctx = super(RawCouncilQuestionDetailView, self).get_context_data(**kwargs)
     question_parser = self.object.get_parser()
     if not question_parser:
         return ctx

     ctx['parser'] = question_parser
     if not question_parser.asker:
         return ctx

     asker_name = MemberName(question_parser.asker)
     # Default matcher first; fall back to the english=False one on a miss.
     found = RawMember.get_matcher().match(asker_name)
     if found is None:
         found = RawMember.get_matcher(english=False).match(asker_name)
     ctx['name'] = found
     return ctx
Esempio n. 2
0
 def get_context_data(self, **kwargs):
     """Extend the detail-view context with the parser and the matched president.

     ``context['parser']`` is always set (it may be ``None``). When a parser
     is available and its language is ``LANG_EN`` or ``LANG_CN``, the first
     element of ``parser.president`` is matched against the corresponding
     ``RawMember`` matcher; on success ``context['president']`` is set to
     the tuple ``(parser.president, match)``.

     A parser whose language is neither constant is left unmatched instead
     of failing on an unbound ``matcher``.
     """
     context = super(RawCouncilHansardDetailView, self).get_context_data(**kwargs)
     parser = self.object.get_parser()
     context['parser'] = parser
     if parser is None:
         return context

     # Choose the matcher for the document language; an unrecognised
     # language gets no matcher, so the president lookup is skipped.
     if parser.language == LANG_EN:
         matcher = RawMember.get_matcher()
     elif parser.language == LANG_CN:
         matcher = RawMember.get_matcher(english=False)
     else:
         matcher = None

     if matcher is not None and parser.president is not None:
         # NOTE(review): assumes parser.president is a non-empty sequence
         # whose first element is the name - confirm against the parser.
         name = MemberName(parser.president[0])
         match = matcher.match(name)
         if match is not None:
             context['president'] = (parser.president, match)
     return context
Esempio n. 3
0
 def get_context_data(self, **kwargs):
     """Extend the detail-view context with the parser and matched questions.

     ``context['parser']`` is always set (it may be ``None``).
     ``context['questions']`` is a list of ``(question, match)`` tuples, one
     per entry in ``parser.questions``; ``match`` is ``None`` when neither
     matcher recognises the asker. The list is empty when there is no
     parser or the parser has no questions.

     Every asker is tried against the English matcher first and the Chinese
     matcher second. The two matchers are kept in separate variables so a
     miss on one question does not change which matcher the following
     questions start with.
     """
     context = super(RawCouncilAgendaDetailView, self).get_context_data(**kwargs)
     parser = self.object.get_parser()
     context['parser'] = parser

     questions = []
     if parser is not None and parser.questions is not None:
         matcher_en = RawMember.get_matcher()
         # Built on the first English miss and then reused, rather than
         # being rebuilt for every unmatched asker.
         matcher_cn = None
         for q in parser.questions:
             name = MemberName(q.asker)
             match = matcher_en.match(name)
             if match is None:
                 # Try Chinese. This will be better handled when we have
                 # different language display.
                 if matcher_cn is None:
                     matcher_cn = RawMember.get_matcher(english=False)
                 match = matcher_cn.match(name)
             questions.append((q, match))
     context['questions'] = questions
     return context
Esempio n. 4
0
 def get_context_data(self, **kwargs):
     """Extend the detail-view context with the parser and matched questions.

     Stores the object's parser under ``'parser'`` and, under
     ``'questions'``, a list of ``(question, match)`` tuples where ``match``
     is whatever the default ``RawMember`` matcher returns for the
     question's asker. The list is empty when ``parser.questions`` is
     ``None``.
     """
     ctx = super(RawCouncilAgendaDetailView, self).get_context_data(**kwargs)
     agenda_parser = self.object.get_parser()
     ctx['parser'] = agenda_parser
     member_matcher = RawMember.get_matcher()
     if agenda_parser.questions is None:
         ctx['questions'] = []
     else:
         # Pair every question with the matcher's result for its asker.
         ctx['questions'] = [
             (question, member_matcher.match(MemberName(question.asker)))
             for question in agenda_parser.questions
         ]
     return ctx
Esempio n. 5
0
    def _get_member_object(self, uid):
        """Return the ``RawMember`` for ``uid``, creating an unsaved one if absent.

        An existing row is returned and counted as updated. When no row
        exists, a new unsaved ``RawMember(uid=uid)`` is returned and counted
        as created. When several rows share the uid, a ``RuntimeWarning`` is
        issued and ``None`` is returned.
        """
        try:
            member = RawMember.objects.get(uid=uid)
        except RawMember.DoesNotExist:
            member = RawMember(uid=uid)
            self._count_created += 1
            return member
        except RawMember.MultipleObjectsReturned:
            message = "Found more than one item with raw id {}".format(uid)
            warnings.warn(message, RuntimeWarning)
            return None

        # Only reached when exactly one existing row was found.
        self._count_updated += 1
        return member
Esempio n. 6
0
 def get_context_data(self, **kwargs):
     """Add the agenda parser and its asker-matched questions to the context.

     ``context['parser']`` receives the object's parser and
     ``context['questions']`` a list of ``(question, match)`` tuples built
     with the default ``RawMember`` matcher. The list stays empty when
     ``parser.questions`` is ``None``.
     """
     base_view = super(RawCouncilAgendaDetailView, self)
     context = base_view.get_context_data(**kwargs)
     parser = self.object.get_parser()
     context['parser'] = parser
     matcher = RawMember.get_matcher()

     def pair_with_member(question):
         # Tuple of the question and the matcher's result for its asker.
         return question, matcher.match(MemberName(question.asker))

     matched = []
     if parser.questions is not None:
         matched.extend(pair_with_member(q) for q in parser.questions)
     context['questions'] = matched
     return context
Esempio n. 7
0
    def process(self):
        """Load crawled question items and store them as ``RawCouncilQuestion`` rows.

        For every item yielded by ``file_wrapper(self.items_file_path)`` the
        matching row is fetched or created by uid, the raw fields are copied
        across, the subject link is made absolute, the language constant is
        set and the asker is matched against the ``RawMember`` matcher for
        that language. Items that raise ``KeyError`` or ``RuntimeError`` are
        counted as errors, logged and skipped. A summary line is logged at
        the end.
        """
        logger.info("Processing file {}".format(self.items_file_path))
        counter = 0
        # keys are fields in the jsonlines item, values are the fields in the model object
        field_map = {
            'asker': 'raw_asker',
            'reply_link': 'reply_link',
            'number_and_type': 'number_and_type',
            'date': 'raw_date',
            'source_url': 'crawled_from',
            'subject': 'subject',
        }
        matcher_en = RawMember.get_matcher()
        matcher_cn = RawMember.get_matcher(False)
        for item in file_wrapper(self.items_file_path):
            try:
                counter += 1
                # For each question, fill in the raw values, then try to match against a RawMember instance

                # Generate a uid and get the object
                uid = self._generate_uid(item)
                obj, created = RawCouncilQuestion.objects.get_or_create(
                    uid=uid)
                if created:
                    self._count_created += 1
                else:
                    self._count_updated += 1

                # Fill in the last parsed and last crawled values
                if self.job is not None:
                    obj.last_crawled = self.job.completed
                obj.last_parsed = now()

                # Fill in the items that can be copied directly
                for k, v in field_map.items():
                    setattr(obj, v, item.get(k, None))

                # A missing reply link is stored as an empty string, not None
                if obj.reply_link is None:
                    obj.reply_link = u''

                # the subject_link is sometimes a relative path, so convert it to an absolute url
                subject_link = item.get('subject_link', u'')
                if subject_link != u'':
                    obj.subject_link = urljoin(item['source_url'], subject_link)

                # Convert the language from the string to the constants
                lang = LANG_CN if item['language'] == u'C' else LANG_EN
                obj.language = lang
                matcher = matcher_cn if lang == LANG_CN else matcher_en

                # Try to find the RawMember object that matches the asker
                name = MemberName(item['asker'])
                match = matcher.match(name)
                if match is not None:
                    # The matched member is the second element of the match result
                    obj.asker = match[1]

                # Finally save
                obj.save()
            except (KeyError, RuntimeError) as e:
                self._count_error += 1
                # Use .get() here: the failure may itself be a missing key,
                # and indexing again would raise inside the handler and
                # abort the whole run instead of skipping this item.
                logger.warning(
                    u'Could not process question {} from date {}'.format(
                        item.get('number_and_type'), item.get('date')))
                logger.warning(unicode(e))
                continue
        logger.info(
            "{} items processed, {} created, {} updated, {} errors".format(
                counter, self._count_created, self._count_updated,
                self._count_error))
Esempio n. 8
0
    def process(self):
        """Import crawled council questions into ``RawCouncilQuestion``.

        Iterates the items from ``file_wrapper(self.items_file_path)``. Each
        item is fetched or created by its generated uid, populated from the
        raw item fields, given an absolute subject link and a language
        constant, linked to a matching ``RawMember`` asker when one is found,
        and saved. ``KeyError`` and ``RuntimeError`` for a single item are
        logged and counted, and processing continues with the next item.
        """
        logger.info("Processing file {}".format(self.items_file_path))
        counter = 0
        # keys are fields in the jsonlines item, values are the fields in the model object
        field_map = {
            'asker': 'raw_asker',
            'reply_link': 'reply_link',
            'number_and_type': 'number_and_type',
            'date': 'raw_date',
            'source_url': 'crawled_from',
            'subject': 'subject',
        }
        matcher_en = RawMember.get_matcher()
        matcher_cn = RawMember.get_matcher(False)
        for item in file_wrapper(self.items_file_path):
            try:
                counter += 1
                # For each question, fill in the raw values, then try to match against a RawMember instance

                # Generate a uid and get the object
                uid = self._generate_uid(item)
                obj, created = RawCouncilQuestion.objects.get_or_create(uid=uid)
                if created:
                    self._count_created += 1
                else:
                    self._count_updated += 1

                # Fill in the last parsed and last crawled values
                if self.job is not None:
                    obj.last_crawled = self.job.completed
                obj.last_parsed = now()

                # Fill in the items that can be copied directly
                for k, v in field_map.items():
                    setattr(obj, v, item.get(k, None))

                # Normalise a missing reply link to an empty string
                if obj.reply_link is None:
                    obj.reply_link = u''

                # the subject_link is sometimes a relative path, so convert it to an absolute url
                subject_link = item.get('subject_link', u'')
                if subject_link != u'':
                    obj.subject_link = urljoin(item['source_url'], subject_link)

                # Convert the language from the string to the constants
                lang = LANG_CN if item['language'] == u'C' else LANG_EN
                obj.language = lang
                matcher = matcher_cn if lang == LANG_CN else matcher_en

                # Try to find the RawMember object that matches the asker
                name = MemberName(item['asker'])
                match = matcher.match(name)
                if match is not None:
                    # The member object is the second element of the match result
                    obj.asker = match[1]

                # Finally save
                obj.save()
            except (KeyError, RuntimeError) as e:
                self._count_error += 1
                # The item may be missing the very keys used in this message;
                # .get() keeps the handler from raising a second KeyError
                # that would escape the loop.
                logger.warning(u'Could not process question {} from date {}'.format(
                    item.get('number_and_type'), item.get('date')))
                logger.warning(unicode(e))
                continue
        logger.info("{} items processed, {} created, {} updated, {} errors".format(
            counter, self._count_created, self._count_updated, self._count_error))
Esempio n. 9
0
    def process(self):
        """Import crawled council questions, then repair missing askers.

        For each item from ``file_wrapper(self.items_file_path)`` the row is
        fetched or created by uid, filled from the raw item fields, given an
        absolute subject link and a language constant, and linked to a
        ``RawMember`` asker when the cleaned-up asker name matches. The row
        is saved only when a local reply file path is available. Items that
        raise ``KeyError`` or ``RuntimeError`` are logged, counted and
        skipped.

        After the loop, ``RawCouncilQuestion.fix_asker_by_parser()`` is
        called and the length of its return value is reported in the summary
        log line as the number of questions without an asker.
        """
        logger.info("Processing file {}".format(self.items_file_path))
        counter = 0
        # keys are fields in the jsonlines item, values are the fields in the model object
        field_map = {
            "asker": "raw_asker",
            "reply_link": "reply_link",
            "number_and_type": "number_and_type",
            "date": "raw_date",
            "source_url": "crawled_from",
            "subject": "subject",
        }
        matcher_en = RawMember.get_matcher()
        matcher_cn = RawMember.get_matcher(False)
        for item in file_wrapper(self.items_file_path):
            try:
                counter += 1
                # For each question, fill in the raw values, then try to match against a RawMember instance

                # Generate a uid and get the object
                uid = self._generate_uid(item)
                obj, created = RawCouncilQuestion.objects.get_or_create(uid=uid)
                if created:
                    self._count_created += 1
                else:
                    self._count_updated += 1

                # Fill in the last parsed and last crawled values
                if self.job is not None:
                    obj.last_crawled = self.job.completed
                obj.last_parsed = now()

                # Fill in the items that can be copied directly
                for k, v in field_map.items():
                    setattr(obj, v, item.get(k, None))

                # Normalise a missing reply link to an empty string
                if obj.reply_link is None:
                    obj.reply_link = u""

                # the subject_link is sometimes a relative path, so convert it to an absolute url
                subject_link = item.get("subject_link", u"")
                if subject_link != u"":
                    obj.subject_link = urljoin(item["source_url"], subject_link)

                # Convert the language from the string to the constants
                lang = LANG_CN if item["language"] == u"C" else LANG_EN
                obj.language = lang
                matcher = matcher_cn if lang == LANG_CN else matcher_en

                # Try to find the RawMember object that matches the asker
                # There will still be some askers not matched - we will use parser to fix them soon
                raw_name = item["asker"]
                # Some postprocessing: drop the honorifics 'Hon' and '議員'
                raw_name = raw_name.replace(u"Hon", u"")
                raw_name = raw_name.replace(u"議員", u"")

                # Get rid of leading and trailing whitespace. strip() is also
                # safe when nothing is left after removing the honorifics,
                # where indexing raw_name[0] would raise an uncaught
                # IndexError and abort the run.
                raw_name = raw_name.strip()

                # Try to match the name with RawMember; unmatched askers are
                # left for fix_asker_by_parser() after the loop.
                name = MemberName(raw_name)
                match = matcher.match(name)
                if match is not None:
                    # The member object is the second element of the match result
                    obj.asker = match[1]

                # Get the local path of reply content
                try:
                    obj.local_filename = item["files"][0]["path"]
                except IndexError:
                    obj.local_filename = None
                    logger.warning(
                        u"Could not get local path for question {} from date {}".format(
                            item["number_and_type"], item["date"]
                        )
                    )

                # Sometimes the reply link is not available yet,
                # and sometimes the meeting was cancelled or deferred
                # In these cases, forget about them.
                if obj.local_filename is not None:
                    obj.save()

            except (KeyError, RuntimeError) as e:
                self._count_error += 1
                # The item may lack the keys used in this message; .get()
                # stops the handler from raising a second KeyError that
                # would escape the loop.
                logger.warning(
                    u"Could not process question {} from date {}".format(
                        item.get("number_and_type"), item.get("date")
                    )
                )
                logger.warning(unicode(e))
                continue
        # After saving all items, use parser to fix missing askers
        no_asker_list = RawCouncilQuestion.fix_asker_by_parser()

        logger.info(
            u"{} items processed, {} created, {} updated, {} errors, {} questions without asker".format(
                counter, self._count_created, self._count_updated, self._count_error, len(no_asker_list)
            )
        )
        # for debugging
        print(no_asker_list)