Example #1
0
    def update_activities_count(self):
        """
        Refresh the activity counters (and IATI version) stored on this object.

        Downloads the XML at ``self.source_url`` and stores:

        * ``activities_count_in_xml``: the number of ``iati-activity``
          children of the XML root;
        * ``activities_count_in_database``: the size of ``self.activity_set``;
        * ``iati_version``: the root's ``version`` attribute, but only when
          ``self.iati_version`` is still empty and the attribute is present.

        The object is then saved with ``process=False``. Any exception is
        logged and swallowed, in which case nothing is saved.
        """
        try:
            response = FileGrabber().get_the_file(self.source_url)

            # Parse the document a single time. huge_tree is enabled so that
            # very large files do not fail here (the version lookup used to
            # re-parse the same bytes with huge_tree after a default parse
            # that would already have rejected such files).
            parser = etree.XMLParser(huge_tree=True)
            root = etree.fromstring(response.content, parser=parser)

            if not self.iati_version:
                version = root.get('version')
                # Leave the field untouched when the attribute is missing
                # instead of storing an empty xpath result list in it.
                if version is not None:
                    self.iati_version = version

            # Activity count in the XML
            self.activities_count_in_xml = len(root.findall('iati-activity'))

            # Activity count in the database
            self.activities_count_in_database = self.activity_set.all().count()

            self.save(process=False)
        except Exception as e:
            logger.error(e)
Example #2
0
    def get_xml_activity_amount(self, url):
        """
        Count the ``</iati-identifier>`` closing tags in the file at ``url``.

        The file is fetched through ``FileGrabber`` and scanned line by line.
        Returns the total number of occurrences, or ``None`` when anything
        goes wrong (the error is printed, not raised).
        """
        closing_tag = "</iati-identifier>"
        try:
            source_lines = FileGrabber().get_the_file(url)
            total = sum(
                line.count(closing_tag)
                for line in source_lines
                if closing_tag in line
            )

            # Drop our reference to the download and collect immediately.
            del source_lines
            gc.collect()
            return total

        except Exception as err:
            if err.args:
                print(err.args[0])
            print("ERROR IN GET_XML_ACTIVITY_AMOUNT, FILE URL " + url)
            return None
Example #3
0
def handle_response(tar, source):
    """
    Download ``source.source_url`` and add it to ``tar`` as ``<ref>.xml``.

    When the download returns nothing or a status code other than 200, a
    message is printed and nothing is added. If the response carries a
    ``Last-Modified`` header, it is used as the archive member's mtime.
    """
    response = FileGrabber().get_the_file(source.source_url)
    if not (response and response.code == 200):
        print("source url {} down or doesn't exist".format(source.source_url))
        return

    last_modified = response.info().get('Last-Modified')
    payload = StringIO.StringIO(response.read())

    # tarfile needs the member size up front; take it from the buffered data.
    member = tarfile.TarInfo(name="{}.xml".format(source.ref))
    member.size = len(payload.buf)
    if last_modified:
        member.mtime = _parse_http_datetime(last_modified)

    tar.addfile(tarinfo=member, fileobj=payload)
Example #4
0
    def get_xml_activity_amount(self, url):
        """
        Return how many ``</iati-identifier>`` closing tags the file at
        ``url`` contains.

        The download is iterated line by line and the per-line counts are
        added up. On any exception the error is printed and ``None`` is
        returned instead of a count.
        """
        closing_tag = "</iati-identifier>"
        try:
            source_lines = FileGrabber().get_the_file(url)
            total = sum(
                line.count(closing_tag)
                for line in source_lines
                if closing_tag in line
            )

            # Release the downloaded file before returning.
            del source_lines
            gc.collect()
            return total

        except Exception as err:
            if err.args:
                print(err.args[0])
            print("ERROR IN GET_XML_ACTIVITY_AMOUNT, FILE URL " + url)
            return None
Example #5
0
    def update_activities_count(self):
        """
        Refresh the activity counters stored on this object.

        Downloads the XML at ``self.source_url`` and stores the number of
        ``iati-activity`` children of the root in ``activities_count_in_xml``
        and the size of ``self.activity_set`` in
        ``activities_count_in_database``, then saves with ``process=False``.

        Any exception is logged and swallowed, in which case nothing is saved.
        """
        try:
            response = FileGrabber().get_the_file(self.source_url)

            # Parse to XML tree
            root = etree.fromstring(response.content)

            # Count the activity elements themselves. The previous
            # "number of children minus one" estimate also counted comments
            # and non-activity children, and was off by one for a file that
            # contains nothing but activities.
            self.activities_count_in_xml = len(root.findall('iati-activity'))

            # Activity count in the database
            self.activities_count_in_database = self.activity_set.all().count()

            self.save(process=False)
        except Exception as e:
            logger.error(e)
Example #6
0
    def parse_url(self, source):
        """
        Download the XML behind ``source.source_url`` and run it through the
        parser returned by ``self.prepare_parser``.

        Nothing is parsed when the download yields a falsy result. Exceptions
        raised while downloading or parsing are handed to
        ``exception_handler`` instead of propagating.
        """
        source_url = source.source_url
        # Only needed by the deletion step that is currently disabled (below).
        xml_source_ref = source.ref

        try:
            iati_file = FileGrabber().get_the_file(source_url)
            if not iati_file:
                return

            # TODO: delete old activities; determine this in the parser based
            #       on last-updated-datetime (this used to be done with
            #       Deleter().delete_by_source(xml_source_ref)).
            # TODO: also, throw away all narratives

            root = etree.fromstring(str(iati_file.read()))
            self.prepare_parser(root, source).load_and_parse(root)

            # In debug mode Django keeps every executed query; clear that log
            # so memory does not keep growing.
            if settings.DEBUG:
                from django import db
                db.reset_queries()

        except Exception as err:
            exception_handler(err, "parse url", "parse_url")
    def __init__(self, dataset, root=None, force_reparse=False):
        """
        Given an IATI dataset, prepare an IATI parser.

        When ``root`` is supplied it is used as the XML root and nothing is
        downloaded. Otherwise the file at ``dataset.source_url`` is fetched,
        its SHA-1 is compared with ``dataset.sha1`` (``hash_changed`` becomes
        False when they match, otherwise the new hash is saved) and the
        content is parsed. When the download or the XML parsing fails,
        ``valid_dataset`` is set to False, a ``DatasetNote`` describing the
        problem is stored and ``dataset.sha1`` is blanked.

        Raises ``ParserDisabledError`` when ``settings.IATI_PARSER_DISABLED``
        is set.
        """
        if settings.IATI_PARSER_DISABLED:
            raise ParserDisabledError(
                "The parser is disabled on this instance of OIPA")

        self.dataset = dataset
        self.url = dataset.source_url
        self.force_reparse = force_reparse
        self.hash_changed = True
        self.valid_dataset = True

        # A ready-made tree was supplied: no download needed.
        if root is not None:
            self.root = root
            self.parser = self._prepare_parser(self.root, dataset)
            return

        response = FileGrabber().get_the_file(self.url)

        from iati_synchroniser.models import DatasetNote

        def reject_dataset(message, exception_type, drop_old_notes):
            # Flag the dataset as invalid and store a note explaining why.
            self.valid_dataset = False
            if drop_old_notes:
                DatasetNote.objects.filter(dataset=self.dataset).delete()
            DatasetNote(
                dataset=self.dataset,
                iati_identifier="n/a",
                model="n/a",
                field="n/a",
                message=message,
                exception_type=exception_type,
                line_number=None
            ).save()
            self.dataset.note_count = 1
            # Without a usable XML file the sha1 should be blank
            self.dataset.sha1 = ''
            self.dataset.save()

        if not (response and response.status_code == 200):
            reject_dataset("Cannot access the URL", 'UrlError',
                           drop_old_notes=False)
            return

        raw_content = response.content

        # 1. Turn the bytestring into text, trying utf-8 first:
        try:
            iati_file = smart_text(raw_content, 'utf-8')
        except UnicodeDecodeError:
            # XXX: some files contain non utf-8 characters
            # FIXME: the fallback encoding is hardcoded
            iati_file = smart_text(raw_content, 'latin-1')

        # 2. Hash the utf-8 encoded text:
        sha1 = hashlib.sha1(iati_file.encode('utf-8')).hexdigest()

        if dataset.sha1 == sha1:
            # dataset did not change, no need to reparse normally
            self.hash_changed = False
        else:
            # Store the new hash right away
            dataset.sha1 = sha1
            dataset.save()

        try:
            self.root = etree.fromstring(raw_content)
            self.parser = self._prepare_parser(self.root, dataset)

            if settings.ERROR_LOGS_ENABLED:
                self.xsd_validate()

        # TODO: when moving error messages to frontend, create a separate
        # error for wrong file type:
        except etree.XMLSyntaxError:
            reject_dataset(
                "This file contains XML syntax errors or it's not an "
                "XML file",
                'XMLSyntaxError',
                drop_old_notes=True)
            return
Example #8
0
    def __init__(self, dataset, root=None, force_reparse=False):
        """
        Given an IATI dataset, prepare an IATI parser.

        When ``root`` is supplied it is used as the XML root and nothing is
        downloaded. Otherwise the file at ``dataset.source_url`` is fetched,
        its SHA-1 is compared with ``dataset.sha1`` (``hash_changed`` becomes
        False when they match, otherwise ``dataset.sha1`` is updated in
        memory) and the content is parsed. When the download or the XML
        parsing fails, ``valid_dataset`` is set to False and the dataset's
        notes are replaced by a single ``DatasetNote`` describing the problem.

        Raises ``ParserDisabledError`` when ``settings.IATI_PARSER_DISABLED``
        is set.
        """
        if settings.IATI_PARSER_DISABLED:
            raise ParserDisabledError(
                "The parser is disabled on this instance of OIPA")

        self.dataset = dataset
        self.url = dataset.source_url
        self.force_reparse = force_reparse
        self.hash_changed = True
        self.valid_dataset = True

        # A ready-made tree was supplied: no download needed.
        if root is not None:
            self.root = root
            self.parser = self._prepare_parser(self.root, dataset)
            return

        response = FileGrabber().get_the_file(self.url)
        from iati_synchroniser.models import DatasetNote

        def reject_dataset(message, exception_type):
            # Flag the dataset as invalid and replace its notes with one
            # note explaining why.
            self.valid_dataset = False
            DatasetNote.objects.filter(dataset=self.dataset).delete()
            DatasetNote(dataset=self.dataset,
                        iati_identifier="n/a",
                        model="n/a",
                        field="n/a",
                        message=message,
                        exception_type=exception_type,
                        line_number=None).save()
            self.dataset.note_count = 1
            self.dataset.save()

        if not (response and response.code == 200):
            reject_dataset("URL down or does not exist", 'UrlError')
            return

        payload = str(response.read())
        sha1 = hashlib.sha1(payload).hexdigest()

        if dataset.sha1 == sha1:
            # dataset did not change, no need to reparse normally
            self.hash_changed = False
        else:
            dataset.sha1 = sha1

        try:
            self.root = etree.fromstring(payload)
            self.parser = self._prepare_parser(self.root, dataset)
        except etree.XMLSyntaxError:
            reject_dataset("This file contains XML syntax errors",
                           'XMLSyntaxError')
            return
Example #9
0
    def __init__(self, dataset, root=None, force_reparse=False):
        """
        Given an IATI dataset, prepare an IATI parser.

        When ``root`` is supplied it is used as the XML root and nothing is
        downloaded. Otherwise the file at ``dataset.source_url`` is fetched,
        its SHA-1 is compared with ``dataset.sha1`` (``hash_changed`` becomes
        False when they match, otherwise the new hash is saved) and the
        content is parsed with a ``huge_tree`` XML parser. When the download
        or the XML parsing fails, ``valid_dataset`` is set to False, a
        ``DatasetNote`` describing the problem is stored and ``dataset.sha1``
        is blanked.

        Raises ``ParserDisabledError`` when ``settings.IATI_PARSER_DISABLED``
        is set.
        """
        if settings.IATI_PARSER_DISABLED:
            raise ParserDisabledError(
                "The parser is disabled on this instance of OIPA")

        self.dataset = dataset
        self.url = dataset.source_url
        self.force_reparse = force_reparse
        self.hash_changed = True
        self.valid_dataset = True

        # A ready-made tree was supplied: no download needed.
        if root is not None:
            self.root = root
            self.parser = self._prepare_parser(self.root, dataset)
            return

        response = FileGrabber().get_the_file(self.url)

        from iati_synchroniser.models import DatasetNote

        def reject_dataset(message, exception_type, drop_old_notes):
            # Flag the dataset as invalid and store a note explaining why.
            self.valid_dataset = False
            if drop_old_notes:
                DatasetNote.objects.filter(dataset=self.dataset).delete()
            DatasetNote(dataset=self.dataset,
                        iati_identifier="n/a",
                        model="n/a",
                        field="n/a",
                        message=message,
                        exception_type=exception_type,
                        line_number=None).save()
            self.dataset.note_count = 1
            # Without a usable XML file the sha1 should be blank
            self.dataset.sha1 = ''
            self.dataset.save()

        if not (response and response.status_code == 200):
            reject_dataset("Cannot access the URL", 'UrlError',
                           drop_old_notes=False)
            return

        raw_content = response.content

        # 1. Turn the bytestring into text, trying utf-8 first:
        try:
            iati_file = smart_text(raw_content, 'utf-8')
        except UnicodeDecodeError:
            # XXX: some files contain non utf-8 characters
            # FIXME: the fallback encoding is hardcoded
            iati_file = smart_text(raw_content, 'latin-1')

        # 2. Hash the utf-8 encoded text:
        sha1 = hashlib.sha1(iati_file.encode('utf-8')).hexdigest()

        if dataset.sha1 == sha1:
            # dataset did not change, no need to reparse normally
            self.hash_changed = False
        else:
            # Store the new hash right away
            dataset.sha1 = sha1
            dataset.save()

        try:
            huge_parser = etree.XMLParser(huge_tree=True)
            document = etree.parse(BytesIO(raw_content), huge_parser)
            self.root = document.getroot()
            self.parser = self._prepare_parser(self.root, dataset)

            if settings.ERROR_LOGS_ENABLED:
                self.xsd_validate()

        # TODO: when moving error messages to frontend, create a separate
        # error for wrong file type:
        except etree.XMLSyntaxError:
            reject_dataset(
                "This file contains XML syntax errors or it's not an "
                "XML file",
                'XMLSyntaxError',
                drop_old_notes=True)
            return