Example #1
    def run(self):
        target = os.path.join(os.path.dirname(self.output().path), str(self.date))
        if not os.path.exists(target):
            os.makedirs(target)

        _, errorlog = tempfile.mkstemp(prefix="siskin-")
        stylesheet = self.input().get("stylesheet").path
        size = wc(self.input().get("filelist").path)

        with self.input().get("filelist").open() as handle:
            for i, row in enumerate(handle.iter_tsv(cols=("path",)), start=1):
                basename = os.path.basename(row.path)
                name = basename.replace(".xml", ".marcxml")
                destination = os.path.join(target, name)
                if not os.path.exists(destination):
                    try:
                        output = shellout("xsltproc {xsl} {input} > {output}", input=row.path, xsl=stylesheet)
                        luigi.File(output).move(destination)
                    except RuntimeError as err:
                        self.logger.error("{0}: {1}".format(row.path, err))
                        with open(errorlog, "a") as log:
                            log.write("%s\t%s\n" % (row.path, err))
                self.logger.debug("{0}/{1} {2}".format(i, size, row.path))

        # write receipt
        with self.output().open("w") as output:
            for path in iterfiles(target):
                output.write_tsv(path)

        # this is just a temporary artefact for now
        self.logger.debug("Conversion errors logged at: {0}".format(errorlog))
Example #2
 def run(self):
     source = os.path.join(config.get('core', 'swb-mirror'), 'nationallizenzen')
     target = os.path.dirname(self.output().path)
     shellout("rsync -avz {source} {target}", source=source, target=target)
     with self.output().open('w') as output:
         for path in iterfiles(target):
             output.write_tsv(path)
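The receipt loop depends on a write_tsv method on the opened target. Inferred from usage, it simply joins its arguments with tabs and appends a newline; a hedged sketch as a standalone function:

    def write_tsv(stream, *values):
        # Assumed behavior: one tab-separated row per call.
        stream.write('\t'.join(str(v) for v in values) + '\n')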
Example #3
 def run(self):
     target = os.path.dirname(self.output().path)
     pattern = config.get('lfer', 'glob')
     shellout("rsync -avz {src} {target}", src=pattern, target=target)
     with self.output().open('w') as output:
         for path in sorted(iterfiles(target)):
             output.write_tsv(path)
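iterfiles appears in every receipt loop above and below. A sketch of the assumed behavior (the name and the fun keyword are taken from usage, e.g. in Examples #6 and #9; the recursive walk is an assumption): yield all file paths under a directory, optionally filtered by a predicate.

    import os

    def iterfiles(directory='.', fun=None):
        # Yield paths of all files below directory for which fun(path) is truthy.
        if fun is None:
            fun = lambda path: True
        for root, _, files in os.walk(directory):
            for name in files:
                path = os.path.join(root, name)
                if fun(path):
                    yield path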
Example #4
    def run(self):
        """ The indicator is always recreated, while the subdir
        for a given (host, username, base, pattern) is just synced. """
        base = os.path.dirname(self.output().path)
        subdir = hashlib.sha1('{host}:{username}:{base}:{pattern}'.format(
            host=self.host, username=self.username, base=self.base,
            pattern=self.pattern)).hexdigest()
        # target is the root of the mirror
        target = os.path.join(base, subdir)
        if not os.path.exists(target):
            os.makedirs(target)

        command = """lftp -u {username},{password}
        -e "set net:max-retries {max_retries}; set net:timeout {timeout}; mirror --verbose=0
        --only-newer -I {pattern} {base} {target}; exit" {host}"""

        shellout(command, host=self.host, username=pipes.quote(self.username),
                 password=pipes.quote(self.password),
                 pattern=pipes.quote(self.pattern),
                 target=pipes.quote(target),
                 base=pipes.quote(self.base),
                 max_retries=self.max_retries,
                 timeout=self.timeout)

        with self.output().open('w') as output:
            for path in iterfiles(target):
                self.logger.debug("Mirrored: %s" % path)
                output.write_tsv(path)
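The sha1 digest gives each (host, username, base, pattern) combination its own stable mirror subdirectory: re-running the task syncs into the same place, while different configurations never collide. A standalone illustration with made-up values:

    import hashlib

    # Hypothetical parameters, for illustration only.
    key = '{host}:{username}:{base}:{pattern}'.format(
        host='ftp.example.com', username='alice', base='/pub', pattern='*.xml')
    subdir = hashlib.sha1(key.encode('utf-8')).hexdigest()
    # A stable 40-character hex name for exactly this combination.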
Example #5
    def run(self):
        # create target subdirectory
        target = os.path.join(os.path.dirname(self.output().path), str(self.closest()))
        if not os.path.exists(target):
            os.makedirs(target)
        size = wc(self.input().path)

        with self.input().open() as handle:
            for i, row in enumerate(handle.iter_tsv(cols=("url",)), start=1):
                name = os.path.join(target, row.url.split("/")[-2])
                destination = "{name}.xml".format(name=name)
                if not os.path.exists(destination):
                    output = shellout("wget -q --retry-connrefused {url} -O {output}", url=row.url)
                    luigi.File(output).move(destination)
                self.logger.debug("{0}/{1} {2}".format(i, size, row.url))

        # write "receipt"
        with self.output().open("w") as output:
            for path in iterfiles(target):
                if path.endswith(".xml"):
                    output.write_tsv(path)
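The wc helper that drives the progress logging in Examples #1 and #5 is presumably just a line counter in the spirit of wc -l; a sketch:

    def wc(path):
        # Count lines, so the loop can log 'i/size' style progress.
        with open(path) as handle:
            return sum(1 for _ in handle)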
Example #6
 def run(self):
     stopover = tempfile.mkdtemp(prefix='siskin-')
     shellout("scp {origin} {stopover}".format(origin=config.get('pao', 'scp-src'), stopover=stopover))
     _, combined = tempfile.mkstemp(prefix='siskin-')
     for path in iterfiles(directory=stopover,
                           fun=lambda path: re.search(r'pao\d\.mrc', path)):
         shellout("cat {path} >> {output}", path=path, output=combined)
     luigi.File(combined).move(self.output().path)
Example #7
    def run(self):
        stopover = tempfile.mkdtemp(prefix='siskin-')
        origin = config.get('gbv', 'scp-src').format(tag=self.tag)
        shellout("scp {origin} {output}", origin=origin, output=stopover)

        _, combined = tempfile.mkstemp(prefix='siskin-')
        for path in iterfiles(stopover):
            shellout("cat {input} >> {output}", input=path, output=combined)
        luigi.File(combined).move(self.output().path)
Example #8
 def run(self):
     target = os.path.join(self.taskdir(), str(self.date))
     if not os.path.exists(target):
         os.makedirs(target)
     _, stopover = tempfile.mkstemp(prefix='siskin-')
     shellout("wget --retry-connrefused -O {stopover} '{url}' && unzip -o -d {dir} {stopover}", dir=target, stopover=stopover, url=self.url)
     files = list(iterfiles(target))
     if len(files) != 1:
         raise RuntimeError('expected exactly one file, got %d' % len(files))
     luigi.File(files[0]).move(self.output().path)
Example #9
 def run(self):
     directory = self.config.get('elsevierjournals', 'backlog-dir')
     _, output = tempfile.mkstemp(prefix='siskin-')
     for path in sorted(
             iterfiles(directory, fun=lambda p: p.endswith('.tar'))):
         shellout(
             "span-import -i elsevier-tar {input} | pigz -c >> {output}",
             input=path,
             output=output)
     luigi.LocalTarget(output).move(self.output().path)
Example #10
 def run(self):
     # gather files
     stopover = tempfile.mkdtemp(prefix='siskin-')
     shellout("scp {origin} {output}", origin=config.get('ksd', 'scp-src'), output=stopover)
     # combine files
     _, combined = tempfile.mkstemp(prefix='siskin-')
     for path in sorted(iterfiles(stopover), reverse=True):
         shellout("cat {input} >> {output}", input=path, output=combined)
     # clean dups
     output = shellout("marcuniq {input} > {output}", input=combined)
     luigi.File(output).move(self.output().path)
Example #11
    def run(self):
        target = os.path.join(self.taskdir(), 'mirror')
        shellout("mkdir -p {target} && rsync {rsync_options} {src} {target}",
                 rsync_options=self.config.get('gbi', 'rsync-options', fallback='-avzP'),
                 src=self.config.get('gbi', 'scp-src'), target=target)

        if not os.path.exists(self.taskdir()):
            os.makedirs(self.taskdir())

        with self.output().open('w') as output:
            for path in iterfiles(target):
                output.write_tsv(path)
Example #12
    def run(self):
        filemap = {'all': 'http://www.textgrid.de/fileadmin/digitale-bibliothek/literatur.zip',
                   'v1': 'http://www.textgrid.de/fileadmin/digitale-bibliothek/literatur-nur-texte-1.zip',
                   'v2': 'http://www.textgrid.de/fileadmin/digitale-bibliothek/literatur-nur-texte-2.zip'}

        if self.corpus not in filemap:
            raise RuntimeError('available corpus ids: all, v1, v2')

        output = shellout("wget --retry-connrefused '{url}' -O {output}", url=filemap[self.corpus])
        shellout("unzip -d '{dir}' {input}", dir=self.input().get('dir').path, input=output)
        with self.output().open('w') as output:
            for path in iterfiles(self.input().get('dir').path):
                output.write_tsv(path.encode('utf-8'))
Example #13
    def run(self):
        target = os.path.join(self.taskdir(), 'mirror')
        shellout("mkdir -p {target} && rsync {rsync_options} {src} {target}",
                 rsync_options=self.config.get('cambridge', 'rsync-options', fallback='-avzP'),
                 src=self.config.get('cambridge', 'scp-src'),
                 target=target)

        if not os.path.exists(self.taskdir()):
            os.makedirs(self.taskdir())

        with self.output().open('w') as output:
            for path in iterfiles(target):
                output.write_tsv(path)
Example #14
 def run(self):
     prefix = '{0}-'.format(random_string())
     shellout("cd {tmp} && split -l {lines} -a 8 {input} {prefix} && cd -",
              lines=self.lines, tmp=tempfile.gettempdir(),
              input=self.input().path, prefix=prefix)
     target = self.taskdir()
     if not os.path.exists(target):
         os.makedirs(target)
     with self.output().open('w') as output:
         for path in iterfiles(tempfile.gettempdir()):
             filename = os.path.basename(path)
             if filename.startswith(prefix):
                 dst = os.path.join(target, filename)
                 shutil.move(path, dst)
                 output.write_tsv(dst)
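random_string only has to make the split-file prefix unique, so that concurrent runs do not clash in the shared temp directory and the startswith(prefix) filter picks up exactly this run's chunks. A sketch under that assumption (length and alphabet are guesses):

    import random
    import string

    def random_string(num=16):
        # Hypothetical: any collision-unlikely tag works as a split prefix.
        return ''.join(random.choice(string.ascii_lowercase) for _ in range(num))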
Example #15
    def run(self):
        with self.input().open() as handle:
            doc = json.load(handle)
        tempdir = tempfile.mkdtemp(prefix='tmp-siskin-')
        for attachment in doc['issue']['attachments']:
            target = os.path.join(tempdir, os.path.basename(attachment["content_url"]))
            shellout("""curl -vL --fail -H "X-Redmine-API-Key:{apikey}" -o {target} "{url}" """,
                     url=attachment["content_url"],
                     apikey=self.config.get("redmine", "apikey"),
                     target=target)

        with self.output().open('w') as output:
            for path in iterfiles(tempdir):
                self.logger.debug("Downloaded: %s", path)
                output.write_tsv(path)
Example #16
 def run(self):
     target = os.path.join(self.taskdir(), self.version, self.format)
     if not os.path.exists(target):
         os.makedirs(target)
     url = os.path.join(self.base, self.version, "datasets", self.format)
     stopover = tempfile.mkdtemp(prefix='siskin-')
     shellout("""wget -q -nd -P {directory} -rc -np -A.{format}.gz '{url}'""",
                 url=url, directory=stopover, format=self.format)
     for path in glob.glob(unicode(os.path.join(stopover, '*'))):
         dst = os.path.join(target, os.path.basename(path))
         if not os.path.exists(dst):
             # this is atomic given path and target are on the same device
             shutil.move(path, target)
     with self.output().open('w') as output:
         for path in iterfiles(target, fun=lambda p: p.endswith('nt.gz')):
             output.write_tsv(self.version, self.format, path)
Example #17
    def run(self):
        base = "http://www.universitypressscholarship.com/"

        with self.input().open() as handle:
            for row in handle.iter_tsv(cols=('path',)):
                dirname, basename = row.path.split('/')[-2:]
                slugged = dirname.replace('%20', '-').lower()
                url = urlparse.urljoin(base, row.path)
                dst = os.path.join(self.taskdir(), '{0}-{1}'.format(slugged, basename))
                if os.path.exists(dst):
                    continue
                output = shellout("""wget --retry-connrefused "{url}" -O {output} """, url=url)
                luigi.File(output).move(dst)

        with self.output().open('w') as output:
            for path in iterfiles(self.taskdir()):
                if not path.endswith('mrc'):
                    continue
                output.write_tsv(path)
Example #18
    def run(self):
        """
        The indicator is always recreated, while the subdir
        for a given (host, username, base, pattern) is just synced.
        """
        base = os.path.dirname(self.output().path)
        subdir = hashlib.sha1('{host}:{username}:{base}:{pattern}'.format(
            host=self.host, username=self.username, base=self.base, pattern=self.pattern).encode('utf-8')).hexdigest()

        target = os.path.join(base, subdir)  # target is the root of the mirror
        if not os.path.exists(target):
            os.makedirs(target)

        exclude_glob = ""
        if not self.exclude_glob == "":
            exclude_glob = "--exclude-glob %s" % self.exclude_glob

        command = """lftp -u {username},{password}
        -e "
            set sftp:auto-confirm yes;
            set net:max-retries {max_retries};
            set net:timeout {timeout};
            set mirror:parallel-directories 1;
            set ssl:verify-certificate no;
            set ftp:ssl-protect-data true;

        mirror --verbose=0 --only-newer {exclude_glob} -I {pattern} {base} {target}; exit" {host}"""

        shellout(command,
                 host=self.host,
                 username=pipes.quote(self.username),
                 password=pipes.quote(self.password),
                 pattern=pipes.quote(self.pattern),
                 target=pipes.quote(target),
                 base=pipes.quote(self.base),
                 max_retries=self.max_retries,
                 timeout=self.timeout,
                 exclude_glob=exclude_glob)

        with self.output().open('w') as output:
            for path in iterfiles(target):
                self.logger.debug("Mirrored: %s", path)
                output.write_tsv(path)
Example #19
 def run(self):
     shellout("rsync -avz {src} {dst}", src=config.get('liberec', 'src'), dst=self.taskdir())
     with self.output().open('w') as output:
         for path in iterfiles(self.taskdir(), fun=lambda p: '-luigi-tmp-' not in p):
             output.write_tsv(path)
Example #20
    def run(self):
        """
        Iterate over all zipfiles in reverse, convert and concat binary marc
        into tempfile, then deduplicate.
        """

        # Load all deletions into set.
        deleted = set()

        deldir = os.path.dirname(self.input().get('deletions').path)
        for path in sorted(iterfiles(deldir), reverse=True):
            with open(path) as handle:
                for line in handle:
                    line = line.strip()
                    if len(line) > 20:
                        self.logger.warn("suspicious id: %s", line)
                    deleted.add(line)

        # Load updates.
        pattern = re.compile(r'^date-[0-9]{4}-[0-9]{2}-[0-9]{2}\.zip$')
        datadir = os.path.dirname(self.input().get('data').path)

        # Combine all binary MARC records in this file.
        _, combined = tempfile.mkstemp(prefix='siskin-')

        for path in sorted(iterfiles(datadir), reverse=True):
            filename = os.path.basename(path)

            if not pattern.match(filename):
                self.logger.warn("ignoring invalid filename: %s", path)
                continue
            if os.stat(path).st_size < 22:
                self.logger.warn("ignoring possibly empty zip file: %s", path)
                continue

            with zipfile.ZipFile(path) as zf:
                for name in zf.namelist():
                    with zf.open(name) as handle:
                        with tempfile.NamedTemporaryFile(delete=False) as dst:
                            shutil.copyfileobj(handle, dst)
                        shellout(
                            "yaz-marcdump -i marcxml -o marc {input} >> {output}",
                            input=dst.name,
                            output=combined,
                            ignoremap={5: 'expected error from yaz'})
                        os.remove(dst.name)

        # Finally, concatenate initial dump.
        shellout("cat {input} >> {output}",
                 input=self.input().get('dump').path,
                 output=combined)

        # Already seen identifier.
        seen = set()

        with self.output().open('wb') as output:
            writer = pymarc.MARCWriter(output)

            # Iterate over MARC records (which are newest to oldest, keep track of seen identifiers).
            with open(combined, 'rb') as handle:
                reader = pymarc.MARCReader(handle,
                                           force_utf8=True,
                                           to_unicode=True)
                for record in reader:
                    field = record["001"]
                    if not field:
                        self.logger.debug("missing identifier")
                        continue

                    id = field.value()

                    if id in seen:
                        self.logger.debug("skipping duplicate: %s", id)
                        continue
                    if id in deleted:
                        self.logger.debug("skipping deleted: %s", id)
                        continue

                    self.logger.debug("adding %s", id)
                    writer.write(record)
                    seen.add(id)

        self.logger.debug(
            "found %s unique records (deletion list contained %s ids)",
            len(seen), len(deleted))
        os.remove(combined)
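The ignoremap={5: 'expected error from yaz'} argument above suggests that shellout can treat selected non-zero exit codes as tolerable instead of fatal. Extending the earlier shellout sketch under that assumption:

    import subprocess
    import tempfile

    def shellout_tolerant(template, ignoremap=None, **kwargs):
        # Sketch: exit codes listed in ignoremap are tolerated, everything
        # else still raises, mirroring the yaz-marcdump call above.
        if 'output' not in kwargs:
            _, kwargs['output'] = tempfile.mkstemp(prefix='siskin-')
        command = template.format(**kwargs)
        code = subprocess.call(command, shell=True)
        if code != 0 and (ignoremap is None or code not in ignoremap):
            raise RuntimeError('command failed with %s: %s' % (code, command))
        return kwargs['output']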