def test_formatter_wrapper(formatter, values, expected_value):
    """
    Checks that XmlDumpConverter.formatter_wrapper returns the expected value.
    :param formatter: Formatter passed through to formatter_wrapper.
    :param values: Values passed through to formatter_wrapper.
    :param expected_value: Value the wrapper is expected to return.
    """
    result = XmlDumpConverter.formatter_wrapper(formatter, values)

    assert result == expected_value
 def __init__(self, is_quiet=False):
     """
     Sets up a new GndDumpConverter together with its XML dump converter.
     :param is_quiet: If set to True, console output will be suppressed.
     """
     self.is_quiet = is_quiet

     # Positional arguments for the XML converter, built from the class-level
     # XML settings, the property mapping and the quiet flag.
     converter_arguments = (
         self.XML_ENTITIES_PATH,
         self.XML_ENTITY_ID_XPATH,
         propertymappings.mapping,
         self.XML_NAMESPACES,
         is_quiet,
     )
     self.xml_dump_converter = XmlDumpConverter(*converter_arguments)
def test_write_external_data():
    """
    Checks that write_external_data passes every value yielded by the XML dump
    converter on to the result writer, together with the dump id.
    """
    expected_dump_id = "foobar"
    expected_value_triple = ("foobar", "P42", ["foobar"])

    gnd_converter = GndDumpConverter(False)

    # Replace the XML converter by a stub that yields exactly one known triple.
    xml_dump_converter_mock = XmlDumpConverter(None, None, None)

    def process_dump_mock(dump_file):
        yield expected_value_triple
    xml_dump_converter_mock.process_dump = process_dump_mock
    gnd_converter.xml_dump_converter = xml_dump_converter_mock

    result_mock = ResultWriter(StringIO(), StringIO())
    # Collects the values handed to the writer, so the test fails if the
    # writer is never called (the in-stub assertions alone would pass then).
    written_values = []

    def write_external_value_mock(dump_id, external_id,
                                  property_id, external_value):
        assert expected_dump_id == dump_id
        assert expected_value_triple[0] == external_id
        assert expected_value_triple[1] == property_id
        assert external_value in expected_value_triple[2]
        written_values.append(external_value)
    result_mock.write_external_value = write_external_value_mock

    gnd_converter.write_external_data(expected_dump_id, None, result_mock)

    # Every expected value must have been written exactly once, in order.
    assert expected_value_triple[2] == written_values
def test_run_formatter(formatter, values, expected_values):
    """
    Checks that XmlDumpConverter.run_formatter returns the expected values.
    :param formatter: Formatter passed through to run_formatter.
    :param values: Values passed through to run_formatter.
    :param expected_values: Values run_formatter is expected to return.
    """
    assert expected_values == XmlDumpConverter.run_formatter(formatter, values)
class GndDumpConverter:
    """
    Dump converter for dumps of the Integrated Authority File (GND) of the
    German National Library. Downloads latest dump files, processes entities and
    hands the extracted values to a result writer.
    """
    DATA_SOURCE_ITEM_ID = "Q36578"
    IDENTIFIER_PROPERTY_ID = "P227"
    LANGUAGE = "de"
    LICENSE_ITEM_ID = "Q6938433"

    # Maps dump id -> file prefix used in the download url.
    FILE_PREFIXES = {
        "GND-Tpgesamt": "Tpgesamt",
        "GND-Tggesamt": "Tggesamt",
        "GND-Tugesamt": "Tugesamt"
    }
    # Placeholders: {0} two-digit year, {1} index of the dump within the year,
    # {2} file prefix, {3} two-digit month of the dump.
    URL_FORMAT = "http://datendienst.dnb.de/cgi-bin/mabit.pl?cmd=fetch&userID=GNDxml&pass=gndmarcxml{0}{1}&mabheft={2}{0}{3}gndmrc.xml.gz"

    XML_ENTITIES_PATH = "ns:collection/ns:record"
    XML_ENTITY_ID_XPATH = "substring-after(ns:datafield[@tag='035']/ns:subfield[@code='a' and starts-with(./text(), '(DE-588)')], '(DE-588)')"
    XML_NAMESPACES = {
        "ns": "http://www.loc.gov/MARC21/slim"
    }

    def __init__(self, is_quiet=False):
        """
        Creates new GndDumpConverter instance.
        :param is_quiet: If set to True, console output will be suppressed.
        """
        self.is_quiet = is_quiet
        self.xml_dump_converter = XmlDumpConverter(self.XML_ENTITIES_PATH,
                                                   self.XML_ENTITY_ID_XPATH,
                                                   propertymappings.mapping,
                                                   self.XML_NAMESPACES,
                                                   is_quiet)

    def execute(self, result_writer):
        """
        Starts whole convert process: downloads every dump listed in
        FILE_PREFIXES, writes its external values and its dump information.
        :param result_writer: Writer for output of result.
        """
        for dump_id, file_prefix in self.FILE_PREFIXES.items():
            if not self.is_quiet:
                print("Start to convert '{0}'".format(file_prefix))

            dump_file, dump_url, dump_size = self.download_dump(file_prefix)
            # Make sure both file objects are released even if processing
            # or writing fails halfway through.
            try:
                uncompressed_dump_file = GzipFile(mode="rb", fileobj=dump_file)
                try:
                    self.write_external_data(
                        dump_id,
                        uncompressed_dump_file,
                        result_writer)
                    result_writer.write_dump_information(
                        dump_id,
                        self.DATA_SOURCE_ITEM_ID,
                        [self.IDENTIFIER_PROPERTY_ID],
                        self.LANGUAGE,
                        dump_url,
                        dump_size,
                        self.LICENSE_ITEM_ID)
                finally:
                    uncompressed_dump_file.close()
            finally:
                dump_file.close()

            if not self.is_quiet:
                # Blank line separating the console output of two dumps.
                print("")

    def get_dump_url(self, file_prefix, fallback=False, date=None):
        """
        Returns url of the latest dump with specified prefix.
        If dump file does not exist, fallback option can be set True to
        build url of previous dump.
        This will be applicable, if new dump should be already available,
        but was not published yet.
        :param file_prefix: Prefix of the dump file.
        :param fallback: If set to True, url of previous dump will be returned.
        :param date: Date, on which the url generation is based on.
                     Defaults to the current date at call time.
        :return: Url of the dump
        """
        # Resolve the default per call; a default of date.today() in the
        # signature would be frozen at the time the class is defined.
        if date is None:
            date = datetime.date.today()

        if fallback:
            # Step back four months (the distance between two dumps), which
            # always lands in the period of the previous dump.
            month_count = date.year * 12 + (date.month - 1) - 4
            date = datetime.date(month_count // 12, month_count % 12 + 1, 1)

        year = date.year
        if date.month == 1:
            # January still belongs to the October dump of the previous year.
            index = 3
            month = "10"
            year -= 1
        elif date.month < 6:
            index = 1
            month = "02"
        elif date.month < 10:
            index = 2
            month = "06"
        else:
            index = 3
            month = "10"
        year = str(year)[-2:]

        return self.URL_FORMAT.format(year, index, file_prefix, month)

    def _download_to(self, dump_url, dump_file):
        """
        Downloads the given url into the given file object.
        :param dump_url: Url of the dump.
        :param dump_file: File object the dump is written to.
        :return: Value returned by downloadutils.download_file
                 (used by callers as the size of the download).
        """
        return downloadutils.download_file(
            dump_url, dump_file,
            is_quiet=self.is_quiet,
            progress_message="Downloading database dump...{0}")

    def download_dump(self, file_prefix):
        """
        Downloads a dump identified by file prefix to a temporary file.
        If the latest dump cannot be fetched (status code 400 or 500), the
        previous dump is downloaded instead.
        :param file_prefix: Prefix of the dump file.
        :return: Tuple of file object, url and size of downloaded file.
        :raises DownloadError: If the download fails with another status code
                               or the fallback download fails as well.
        """
        dump_file = TemporaryFile()
        try:
            dump_url = self.get_dump_url(file_prefix)
            try:
                dump_size = self._download_to(dump_url, dump_file)
            except DownloadError as e:
                if e.status_code not in (400, 500):
                    raise
                # Discard anything the failed attempt may already have
                # written, so the fallback dump starts with an empty file.
                dump_file.seek(0)
                dump_file.truncate()
                dump_url = self.get_dump_url(file_prefix, fallback=True)
                dump_size = self._download_to(dump_url, dump_file)
        except BaseException:
            # Nobody receives the temporary file on failure, so release it.
            dump_file.close()
            raise

        return dump_file, dump_url, dump_size

    def write_external_data(self, dump_id, dump_file, result_writer):
        """
        Processes dump and writes external values to file.
        :param dump_id: Id of the processing dump.
        :param dump_file: File object of the dump.
        :param result_writer: Current result writer.
        """
        external_data = self.xml_dump_converter.process_dump(dump_file)
        for external_id, property_id, external_values in external_data:
            # One entity/property pair may carry several values; each one is
            # written separately.
            for external_value in external_values:
                result_writer.write_external_value(dump_id, external_id,
                                                   property_id, external_value)