Beispiel #1
0
    def import_doc(self, info):
        """Create or refresh the Document described by ``info``.

        ``info`` is a dict-like record that must carry ``type`` and ``id``.
        It gains an ``info_link`` entry here and is replaced by the result of
        ``fetch_processing_info`` before the document fields are filled in.

        Returns the saved document; the untouched existing document when
        ``info`` has no ``update_time`` and replacing is off; or ``None``
        when the document is skipped, is already up to date, or
        ``import_sgml_doc`` returns a falsy value.

        Raises ParseError when the document ends up without a date.
        """
        # The link is built from the raw identifiers, before quirk fixing.
        info['info_link'] = DOC_DL_URL % (info['type'], info['id'])
        self.fix_id_quirks(info)
        if not should_download_doc(info):
            self.logger.warning("skipping %s %s" % (info['type'], info['id']))
            return None

        doc_type, doc_id = info['type'], info['id']
        origin_id = "%s %s" % (doc_type, doc_id)
        try:
            doc = Document.objects.get(origin_id=origin_id)
        except Document.DoesNotExist:
            doc = Document(origin_id=origin_id)

        if 'update_time' in info:
            doc.mark_checked()
            remote_stamp = info['update_time']
            local_stamp = doc.last_modified_time
            unchanged = local_stamp and local_stamp >= remote_stamp
            if unchanged and not self.replace:
                self.logger.debug("%s %s not updated" % (doc_type, doc_id))
                # Only the check timestamp needs persisting in this case.
                doc.save(update_fields=['last_checked_time'])
                return None
            self.logger.debug(
                "%s %s updated %s (checked %s)" %
                (doc_type, doc_id, remote_stamp, local_stamp))
        elif doc.pk and not self.replace:
            # Already stored and no timestamp to compare against.
            return doc

        doc.type = DOC_TYPES[doc_type]
        doc.name = origin_id

        info = self.fetch_processing_info(info)

        if info['type'] == 'HE':
            self.import_he(info)
        elif not self.import_sgml_doc(info, current_version=doc.version):
            return None

        doc.version = info.get('doc_version')
        doc.subject = info['subject']
        optional_fields = ('summary', 'question', 'answer', 'answerer_name',
                           'answerer_title')
        for field in optional_fields:
            if field in info:
                setattr(doc, field, info[field])
        doc.error = info['error'] if 'error' in info else None

        # Figure out the document date through the intro stage.
        intro = next(
            (phase for phase in info['phases'] if phase['phase'] == 'intro'),
            None)
        if intro is not None:
            doc.date = intro['date']
        if doc.date is None:
            raise ParseError("Document date could not be determined")

        doc.info_link = info['info_link']
        if 'sgml_link' in info:
            doc.sgml_link = info['sgml_link']
        if 'author' in info:
            author_id = info['author']['id']
            doc.author = Member.objects.get(origin_id=author_id)

        doc.mark_modified()
        doc.save()

        self.save_stages(doc, info)
        self.save_keywords(doc, info)
        if 'signatures' in info:
            self.save_signatures(doc, info)

        # The keywords are saved only at this point. We'll save it again in order
        # to create the proper KeywordActivity objects.
        doc._updated = True
        doc.save()

        return doc
Beispiel #2
0
    def import_doc(self, info):
        """Import one document record, creating or updating its Document row.

        ``info`` must provide ``type`` and ``id``. An ``info_link`` entry is
        added to it, and ``fetch_processing_info`` supplies the record used
        to populate the document.

        Returns ``None`` if the record is skipped, if the stored copy is at
        least as new as ``info['update_time']`` (and ``self.replace`` is
        off), or if ``import_sgml_doc`` returns a falsy value. Returns the
        existing document unchanged when there is no ``update_time`` and
        ``self.replace`` is off. Otherwise returns the saved document.

        Raises ParseError if no date is available for the document.
        """
        link = DOC_DL_URL % (info['type'], info['id'])
        info['info_link'] = link
        self.fix_id_quirks(info)
        wanted = should_download_doc(info)
        if not wanted:
            self.logger.warning("skipping %s %s" % (info['type'], info['id']))
            return None

        origin_id = "%s %s" % (info['type'], info['id'])
        try:
            doc = Document.objects.get(origin_id=origin_id)
        except Document.DoesNotExist:
            doc = Document(origin_id=origin_id)

        if 'update_time' in info:
            doc.mark_checked()
            checked = doc.last_modified_time
            # Proceed when there is no stored timestamp, the stored one is
            # not at least as new as the remote one, or replacing is forced.
            if not checked or not (checked >= info['update_time']) or self.replace:
                self.logger.debug("%s %s updated %s (checked %s)" % (
                    info['type'], info['id'], info['update_time'], checked
                ))
            else:
                self.logger.debug("%s %s not updated" % (info['type'], info['id']))
                doc.save(update_fields=['last_checked_time'])
                return None
        elif doc.pk and not self.replace:
            return doc

        doc.type = DOC_TYPES[info['type']]
        doc.name = origin_id

        info = self.fetch_processing_info(info)

        if info['type'] != 'HE':
            imported = self.import_sgml_doc(info, current_version=doc.version)
            if not imported:
                return None
        else:
            self.import_he(info)

        doc.version = info.get('doc_version', None)
        doc.subject = info['subject']
        # Copy over whichever of the optional text fields are present.
        for key in ('summary', 'question', 'answer', 'answerer_name', 'answerer_title'):
            if key not in info:
                continue
            setattr(doc, key, info[key])
        doc.error = info['error'] if 'error' in info else None

        # Figure out the document date through the intro stage.
        for phase in info['phases']:
            if phase['phase'] != 'intro':
                continue
            doc.date = phase['date']
            break
        if doc.date is None:
            raise ParseError("Document date could not be determined")

        doc.info_link = info['info_link']
        if 'sgml_link' in info:
            doc.sgml_link = info['sgml_link']
        if 'author' in info:
            doc.author = Member.objects.get(origin_id=info['author']['id'])

        doc.mark_modified()
        doc.save()

        self.save_stages(doc, info)
        self.save_keywords(doc, info)
        if 'signatures' in info:
            self.save_signatures(doc, info)

        # The keywords are saved only at this point. We'll save it again in order
        # to create the proper KeywordActivity objects.
        doc._updated = True
        doc.save()

        return doc
Beispiel #3
0
Datei: doc.py Projekt: ideak/kamu
    def import_doc(self, info):
        """Download and store the document described by ``info``.

        ``info`` must provide ``type`` and ``id``. An ``info_link`` entry is
        added to it, and ``fetch_processing_info`` supplies the record used
        to populate the document. Each entry of ``info['phases']`` is
        unpacked as a 3-tuple whose second and third items are the stage
        name and its date.

        Returns ``None`` if the record is skipped or ``import_sgml_doc``
        returns a falsy value. Returns the existing document unchanged when
        it is already stored and ``self.replace`` is off. Otherwise returns
        the saved document.

        Raises ParseError if no date is available for the document.
        """
        # The link is built from the raw identifiers, before quirk fixing.
        url = DOC_DL_URL % (info['type'], info['id'])
        info['info_link'] = url
        self.fix_quirks(info)
        if not should_download_doc(info):
            self.logger.warning("skipping %s %s" % (info['type'], info['id']))
            return None
        self.logger.info("downloading %s %s" % (info['type'], info['id']))

        origin_id = "%s %s" % (info['type'], info['id'])
        # Keep the try body down to the lookup that can actually raise.
        try:
            doc = Document.objects.get(origin_id=origin_id)
        except Document.DoesNotExist:
            doc = Document(origin_id=origin_id)
        else:
            if not self.replace:
                return doc

        doc.type = DOC_TYPES[info['type']]
        doc.name = origin_id

        info = self.fetch_processing_info(info)

        if info['type'] == 'HE':
            self.import_he(info)
        else:
            ret = self.import_sgml_doc(info)
            if not ret:
                return None
        doc.subject = info['subject']
        if 'summary' in info:
            doc.summary = info['summary']
        if 'error' in info:
            doc.error = info['error']
        else:
            doc.error = None
        # Figure out the document date through the intro stage.
        for _idx, stage, date in info['phases']:
            if stage == 'intro':
                doc.date = date
                break
        if doc.date is None:
            raise ParseError("Document date could not be determined")
        doc.info_link = info['info_link']
        if 'sgml_link' in info:
            doc.sgml_link = info['sgml_link']
        if 'author' in info:
            doc.author = Member.objects.get(origin_id=info['author']['id'])

        doc.save()

        self.save_stages(doc, info)
        self.save_keywords(doc, info)
        if 'signatures' in info:
            self.save_signatures(doc, info)

        # The keywords are saved only at this point. We'll save it again in order
        # to create the proper KeywordActivity objects.
        doc._updated = True
        doc.save()

        return doc