def run(self):
    """
    Write the pairwise overlap of all input sets as TSV.

    For every unordered pair of input names (in the order the input
    mapping yields its keys), the values common to both sets are written
    in sorted order, one row per value, with the columns
    (first name, second name, value). The values are called ISSNs by the
    loop variable; nothing here validates that.
    """
    with self.output().open('w') as output:
        inputs = self.input()
        names = list(inputs.keys())
        # Load every set exactly once. Loading inside the pair loop would
        # re-read each target once per pair it takes part in.
        # NOTE: this keeps all input sets in memory at the same time.
        loaded = dict((name, load_set_from_target(inputs.get(name)))
                      for name in names)
        for k1, k2 in itertools.combinations(names, 2):
            for issn in sorted(loaded[k1].intersection(loaded[k2])):
                output.write_tsv(k1, k2, issn)
def run(self):
    """
    Dump the 'amsl' and 'crossref' input sets and their overlap as JSON.

    The output is one JSON object with the keys 'amsl', 'crossref',
    'amsl_only', 'crossref_only' and 'both', serialized with SetEncoder.
    """
    amsl = load_set_from_target(self.input().get('amsl'))
    crossref = load_set_from_target(self.input().get('crossref'))

    with self.output().open('w') as output:
        only_in_amsl = amsl.difference(crossref)
        only_in_crossref = crossref.difference(amsl)
        in_both = amsl.intersection(crossref)
        stats = {
            'amsl': amsl,
            'crossref': crossref,
            'amsl_only': only_in_amsl,
            'crossref_only': only_in_crossref,
            'both': in_both,
        }
        serialized = json.dumps(stats, cls=SetEncoder)
        output.write(serialized)
def run(self):
    """
    Reduce the 'mapping' input to prefixes contained in the 'seen' input.

    Rows are read as (prefix, name, current). A row whose prefix is not in
    the 'seen' set is logged at debug level and dropped. For the remaining
    rows the first two columns are written as TSV, each distinct pair only
    once.
    """
    seen = load_set_from_target(self.input().get('seen'))
    emitted = set()

    mapping = self.input().get('mapping')
    with mapping.open() as handle:
        with self.output().open('w') as output:
            for row in handle.iter_tsv(cols=('prefix', 'name', 'current')):
                if row.prefix not in seen:
                    self.logger.debug("not seen: %s", row.prefix)
                    continue
                # First two columns; the tuple doubles as deduplication key.
                pair = tuple(row[:2])
                if pair in emitted:
                    continue
                output.write_tsv(*pair)
                emitted.add(pair)
def run(self):
    """
    Copy the 'dump' input to the output, leaving out unwanted records.

    Each input line is parsed as JSON. The line is dropped if the record's
    'id' is in the 'blacklist' input set, or if any entry of
    record["bibjson"]["journal"]["issns"] (dashes removed, surrounding
    whitespace stripped) is contained in the set loaded from the
    028_doaj_filter.tsv asset. All other lines are written unchanged.
    """
    blocked_ids = load_set_from_target(self.input().get('blacklist'))
    excludes = load_set_from_file(self.assets('028_doaj_filter.tsv'),
                                  func=lambda line: line.replace("-", ""))

    with self.output().open('w') as output:
        with self.input().get('dump').open() as handle:
            for line in handle:
                record = json.loads(line)
                if record['id'] in blocked_ids:
                    continue
                issns = record["bibjson"]["journal"]["issns"]
                # any() stops at the first excluded ISSN.
                if any(issn.replace("-", "").strip() in excludes
                       for issn in issns):
                    continue
                output.write(line)