def compare_outputs(duplicate_report):
    original = "/home/richard/tmp/doaj/article_duplicates_2019-02-27/actions-2019-04-02.csv"
    compare = "/home/richard/tmp/doaj/article_duplicates_2019-02-27/actions-2019-04-04.csv"
    missing_out = "/home/richard/tmp/doaj/article_duplicates_2019-02-27/missing.csv"
    extra_out = "/home/richard/tmp/doaj/article_duplicates_2019-02-27/extra.csv"
    reference = "/home/richard/tmp/doaj/article_duplicates_2019-02-27/reference.csv"

    with codecs.open(original, "rb", "utf-8") as f1:
        r1 = clcsv.UnicodeReader(f1)
        r1.next()
        id1 = [x[0] for x in r1]

    with codecs.open(compare, "rb", "utf-8") as f2:
        r2 = clcsv.UnicodeReader(f2)
        r2.next()
        id2 = [x[0] for x in r2]

    missing = [x for x in id1 if x not in id2]
    print("missing {x}".format(x=len(missing)))
    with codecs.open(missing_out, "wb", "utf-8") as f3:
        f3.write("\n".join(missing))

    extra = [x for x in id2 if x not in id1]
    print("extra {x}".format(x=len(extra)))
    with codecs.open(extra_out, "wb", "utf-8") as f4:
        f4.write("\n".join(extra))

    with codecs.open(duplicate_report, "rb", "utf-8") as f5, \
            codecs.open(reference, "wb", "utf-8") as f6:
        r5 = clcsv.UnicodeReader(f5)
        w6 = clcsv.UnicodeWriter(f6)
        headers = r5.next()
        w6.writerow(headers)
        w6.writerow([])

        seen_roots = []
        next_row = None
        while True:
            match_set, next_row = _read_match_set(r5, next_row)

            for m in missing:
                if match_set.contains_id(m):
                    root_id = match_set.root["id"]
                    if root_id in seen_roots:
                        continue
                    seen_roots.append(root_id)
                    print("Reference set for root id {x}".format(x=root_id))
                    rows = match_set.to_rows()
                    for row in rows:
                        w6.writerow(row)
                    w6.writerow([])

            if next_row is None:
                break
def history_records_assemble(id, source, out_dir):
    with codecs.open(source, "rb", "utf-8") as f:
        reader = clcsv.UnicodeReader(f)
        for row in reader:
            if row[0] == id:
                fn = row[1] + "_" + row[3]
                out = os.path.join(out_dir, fn)
                shutil.copy(row[2], out)
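# Illustrative layout of the source index CSV read above, inferred from the column
# positions used (record id, date, path to the stored file, file id); there is no
# header row, real exports may carry more columns, and the values below are invented.
#
#   1234567890abcdef,2019-02-27,/data/history/2019-02-27/abc.json,abc
#
# history_records_assemble("1234567890abcdef", "history_index.csv", "/tmp/history_out")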
def create_users(source):
    with codecs.open(source, "rb", "utf-8") as f:
        reader = clcsv.UnicodeReader(f)
        for row in reader:
            username = row[0]
            email = row[1]
            password = row[2] if row[2] != "" else None
            roles = [r.strip() for r in row[3].split(",")]
            create_user(username, email, password, roles)
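# A minimal sketch of the user CSV that create_users() expects, based on the column
# positions used above (username, email, optional password, comma-separated roles);
# there is no header row, and the file name and values are invented for illustration.
#
#   jbloggs,jbloggs@example.com,,admin
#   asmith,asmith@example.com,s3cret,"publisher, api"
#
# create_users("/tmp/users.csv")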
def test_01_publishers_with_consent(self):
    # output file to save csv
    output_file = os.path.join(self.tmp_dir, 'accounts.csv')

    # Create accounts with marketing consent not set
    for i in range(20):
        pubsource = AccountFixtureFactory.make_publisher_source()
        pubaccount = models.Account(**pubsource)
        pubaccount.set_id()
        pubaccount.save()

    # Create accounts with marketing consent set to False
    for i in range(20):
        pubsource = AccountFixtureFactory.make_publisher_source()
        pubaccount = models.Account(**pubsource)
        pubaccount.set_id()
        pubaccount.set_marketing_consent(False)
        pubaccount.save()

    # Create accounts with marketing consent set to True
    expected_data = [[
        u'ID', u'Name', u'Email', u'Created', u'Last Updated', u'Updated Since Create?'
    ]]
    for i in range(20):
        pubsource = AccountFixtureFactory.make_publisher_source()
        pubaccount = models.Account(**pubsource)
        pubaccount.set_id()
        pubaccount.set_marketing_consent(True)
        if i == 19:
            pubaccount.save(blocking=True)
        else:
            pubaccount.save()
        expected_data.append([
            unicode(pubaccount.id),
            unicode(pubaccount.name),
            unicode(pubaccount.email),
            unicode(pubaccount.created_date),
            unicode(pubaccount.last_updated),
            unicode('False')
        ])

    publishers_with_consent(output_file)
    assert os.path.exists(output_file)

    table = []
    with codecs.open(output_file, "rb", "utf-8") as f:
        reader = clcsv.UnicodeReader(f)
        for row in reader:
            table.append(row)

    assert len(table) == 21
    self.assertItemsEqual(table, expected_data)
def test_03_apps_by_country(self):
    apps = ApplicationFixtureFactory.make_application_spread(APPLICATION_YEAR_OUTPUT, "year")
    for a in apps:
        a.save()
    time.sleep(2)

    outfiles = reporting.content_reports("1970-01-01T00:00:00Z", dates.now(), TMP_DIR)

    assert len(outfiles) == 1
    assert os.path.exists(outfiles[0])

    table = []
    with codecs.open(outfiles[0], "rb", "utf-8") as f:
        reader = clcsv.UnicodeReader(f)
        for row in reader:
            table.append(row)

    expected = self._as_output(APPLICATION_YEAR_OUTPUT)
    assert table == expected
unique_deduplicated = []
genuine_unique_ids = []
genuine_unique_deduplicated = []
bad_data_unique_ids = []
bad_data_unique_deduplicated = []
genuine_count = 0
bad_data_count = 0

with codecs.open(GENUINE, "wb", "utf-8") as a:
    awriter = clcsv.UnicodeWriter(a)
    with codecs.open(BAD_DATA, "wb", "utf-8") as b:
        bwriter = clcsv.UnicodeWriter(b)
        with codecs.open(IN, "rb", "utf-8") as f:
            reader = clcsv.UnicodeReader(f)
            headers = reader.next()
            awriter.writerow(headers)
            bwriter.writerow(headers)

            i = 0
            for row in reader:
                print(i)
                i += 1
                data = _to_dict(headers, row)
                aid = data["article_id"]
                mid = data["match_id"]

                if aid not in unique_ids:
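# _to_dict and the unique_ids list are not shown in this excerpt; a minimal sketch of
# what _to_dict is assumed to do (pair the header row with a data row so that columns
# such as "article_id" and "match_id" can be addressed by name):
#
# def _to_dict(headers, row):
#     return dict(zip(headers, row))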
def test_journal_csv(self, name, kwargs):
    prune_arg = kwargs.get("prune")
    tmp_write_arg = kwargs.get("tmp_write")
    main_write_arg = kwargs.get("main_write")
    journals_arg = kwargs.get("journals")
    journals_no_issn_arg = kwargs.get("journals_no_issn")
    not_in_doaj_arg = kwargs.get("not_in_doaj")
    journals_with_articles_arg = kwargs.get("journals_with_articles")
    raises_arg = kwargs.get("raises")

    ###############################################
    ## set up

    raises = EXCEPTIONS.get(raises_arg)
    prune = True if prune_arg == "True" else False if prune_arg == "False" else None
    journal_count = int(journals_arg)
    journals_no_issn_count = int(journals_no_issn_arg)
    not_in_doaj_count = int(not_in_doaj_arg)
    journals_with_articles_count = int(journals_with_articles_arg)

    if tmp_write_arg == "fail":
        app.config["STORE_TMP_IMPL"] = StoreMockFactory.no_writes_classpath()

    if main_write_arg == "fail":
        app.config["STORE_IMPL"] = StoreMockFactory.no_writes_classpath()

    journals = []
    if journal_count > 0:
        journals += [
            models.Journal(**s)
            for s in JournalFixtureFactory.make_many_journal_sources(count=journal_count, in_doaj=True)
        ]

    comparisons = {}
    articles = []
    for i in range(len(journals)):
        journal = journals[i]
        bj = journal.bibjson()
        bj.alternative_title = u"Заглавие на журнала"  # checking mixed unicode
        issns = journal.bibjson().issns()

        source1 = ArticleFixtureFactory.make_article_source(eissn=issns[0], pissn=issns[1], with_id=False, in_doaj=False)
        articles.append(models.Article(**source1))
        comparisons[issns[0]] = {"issns": issns, "article_count": 0, "article_latest": ""}

        if i < journals_with_articles_count:
            source2 = ArticleFixtureFactory.make_article_source(eissn=issns[0], pissn=issns[1], with_id=False, in_doaj=True)
            article2 = models.Article(**source2)
            article2.set_created("2019-0{i}-01T00:00:00Z".format(i=i + 1))
            articles.append(article2)

            source3 = ArticleFixtureFactory.make_article_source(eissn=issns[0], pissn=issns[1], with_id=False, in_doaj=True)
            article3 = models.Article(**source3)
            article3.set_created("2019-0{i}-02T00:00:00Z".format(i=i + 1))
            articles.append(article3)

            comparisons[issns[0]]["article_count"] = 2
            comparisons[issns[0]]["article_latest"] = "2019-0{i}-02T00:00:00Z".format(i=i + 1)

    if journals_no_issn_count > 0:
        noissns = [
            models.Journal(**s)
            for s in JournalFixtureFactory.make_many_journal_sources(count=journals_no_issn_count, in_doaj=True)
        ]
        for i in range(len(noissns)):
            noissn = noissns[i]
            bj = noissn.bibjson()
            bj.remove_identifiers(idtype=bj.P_ISSN)
            bj.remove_identifiers(idtype=bj.E_ISSN)
            noissn.set_id("no_issn_{i}".format(i=i))
        journals += noissns

    if not_in_doaj_count > 0:
        nots = [
            models.Journal(**s)
            for s in JournalFixtureFactory.make_many_journal_sources(count=not_in_doaj_count, in_doaj=False)
        ]
        for i in range(len(nots)):
            n = nots[i]
            n.set_id("not_in_doaj_{i}".format(i=i))
        journals += nots

    jids = []
    for i in range(len(journals)):
        journals[i].save()
        jids.append((journals[i].id, journals[i].last_updated))

    aids = []
    for i in range(len(articles)):
        articles[i].save()
        aids.append((articles[i].id, articles[i].last_updated))

    if prune:
        self.localStore.store(self.container_id, "journalcsv__doaj_20180101_0000_utf8.csv",
                              source_stream=StringIO("test1"))
        self.localStore.store(self.container_id, "journalcsv__doaj_20180601_0000_utf8.csv",
                              source_stream=StringIO("test2"))

    models.Journal.blockall(jids)
    models.Article.blockall(aids)

    ###########################################################
    # Execution

    if raises is not None:
        with self.assertRaises(raises):
            self.svc.csv(prune)

        tempFiles = self.tmpStore.list(self.container_id)
        assert len(tempFiles) == 0
    else:
        url = self.svc.csv(prune)
        assert url is not None

        csv_info = models.cache.Cache.get_latest_csv()
        assert csv_info.get("url") == url

        filenames = self.localStore.list(self.container_id)
        if prune:
            assert len(filenames) == 2
            assert "journalcsv__doaj_20180101_0000_utf8.csv" not in filenames

        latest = None
        for fn in filenames:
            if fn != "journalcsv__doaj_20180601_0000_utf8.csv":
                latest = fn
                break

        handle = self.localStore.get(self.container_id, latest, encoding="utf-8")
        reader = clcsv.UnicodeReader(handle)
        rows = [r for r in reader]

        if len(comparisons) > 0:
            expected_headers = JournalFixtureFactory.csv_headers()
            for i in range(len(expected_headers)):
                h = expected_headers[i]
                if h != rows[0][i]:
                    print("{x} - {y}".format(x=h, y=rows[0][i]))
            assert rows[0] == expected_headers
            assert len(rows) == journal_count + 1

            for i in range(1, len(rows)):
                row = rows[i]
                alt_title = row[2]
                issn = row[3]
                eissn = row[4]
                article_count = int(row[57])
                article_latest = row[58]

                assert alt_title == u"Заглавие на журнала"
                assert issn in comparisons[issn]["issns"]
                assert eissn in comparisons[issn]["issns"]
                assert article_count == comparisons[issn]["article_count"], (article_count, comparisons[issn]["article_count"])
                assert article_latest == comparisons[issn]["article_latest"]
        else:
            assert len(rows) == 0
def finalise(source, report_out, articles_dir, final_actions):
    if not os.path.exists(articles_dir):
        os.makedirs(articles_dir)

    actions = ActionRegister()

    with codecs.open(source, "rb", "utf-8") as s:
        reader = clcsv.UnicodeReader(s)
        headers = reader.next()

        accounts = {}
        for row in reader:
            article_id = row[0]
            article_doi = row[2]
            article_ft = row[3]
            article_owner = row[4]
            match_type = row[8]
            match_id = row[9]
            match_doi = row[11]
            match_ft = row[12]
            match_owner = row[13]

            # both sides of the match are marked for deletion
            actions.set_action(article_id, "delete", "could not be automatically cleaned up")
            actions.set_action(match_id, "delete", "could not be automatically cleaned up")

            # record the duplicate against the owner of the article...
            if article_owner not in accounts:
                accounts[article_owner] = []
            reason = ""
            if match_type == "doi":
                reason = "DOI appears in multiple articles"
            elif match_type == "fulltext":
                reason = "Fulltext URL appears in multiple articles"
            else:
                reason = "Fulltext URL and DOI both appear in multiple articles"
            accounts[article_owner].append([article_doi, article_ft, reason])

            # ...and against the owner of the matched article
            if match_owner not in accounts:
                accounts[match_owner] = []
            reason = ""
            if match_type == "doi":
                reason = "DOI appears in multiple articles"
            elif match_type == "fulltext":
                reason = "Fulltext URL appears in multiple articles"
            else:
                reason = "Fulltext URL and DOI both appear in multiple articles"
            accounts[match_owner].append([match_doi, match_ft, reason])

    final_instructions = {}
    actions.export_to(final_instructions)

    with codecs.open(final_actions, "wb", "utf-8") as fa:
        fawriter = clcsv.UnicodeWriter(fa)
        fawriter.writerow(["id", "action", "reason"])
        for k, v in final_instructions.iteritems():
            fawriter.writerow([k, v["action"], v["reason"]])

    with codecs.open(report_out, "wb", "utf-8") as ro:
        writer = clcsv.UnicodeWriter(ro)
        writer.writerow(["account", "articles to delete", "article_details"])

        for k, v in accounts.iteritems():
            fn = k + "_articles.csv"
            with codecs.open(os.path.join(articles_dir, fn), "wb", "utf-8") as a:
                awriter = clcsv.UnicodeWriter(a)
                awriter.writerow(["DOI", "Fulltext", "Reason for removal", "Number of duplicated articles"])

                # collapse repeated DOI/fulltext pairs and count how often each appears
                dedupe = []
                for article in v:
                    found = -1
                    for i in range(len(dedupe)):
                        d = dedupe[i]
                        if d[0] == article[0] and d[1] == article[1]:
                            found = i
                            break
                    if found > -1:
                        dedupe[found][3] += 1
                    else:
                        dedupe.append(article + [1])

                for d in dedupe:
                    awriter.writerow(d)

            writer.writerow([k, len(v), fn])
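# Example invocation (paths are illustrative): this is likely fed the nocleanup report
# produced by analyse() below, and it writes a per-account summary, one CSV of affected
# articles per publisher account, and a final id/action/reason instruction file.
#
# finalise("/tmp/nocleanup.csv", "/tmp/account_report.csv",
#          "/tmp/account_articles", "/tmp/final_actions.csv")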
def analyse(duplicate_report, noids_report, out, noaction, nocleanup, log):
    with codecs.open(out, "wb", "utf-8") as o, \
            codecs.open(log, "wb", "utf-8") as l, \
            codecs.open(duplicate_report, "rb", "utf-8") as f, \
            codecs.open(noaction, "wb", "utf-8") as g, \
            codecs.open(nocleanup, "wb", "utf-8") as h:

        reader = clcsv.UnicodeReader(f)
        noaction_writer = clcsv.UnicodeWriter(g)
        nocleanup_writer = clcsv.UnicodeWriter(h)

        headers = reader.next()
        noaction_writer.writerow(headers)
        noaction_writer.writerow([])
        nocleanup_writer.writerow(headers)

        final_instructions = {}
        next_row = None
        while True:
            match_set, next_row = _read_match_set(reader, next_row)
            ids = [m["id"] for m in match_set.matches]
            l.write("--" + str(len(ids)) + "-- " + ",".join(ids) + "\n\n")

            actions = ActionRegister()

            # get rid of any articles from the match set that are not in doaj
            _eliminate_not_in_doaj(match_set, actions)

            set_size = len(match_set.matches)
            while True:
                cont = True
                if len(match_set.matches) == 1:
                    _sanitise(match_set, actions)
                    cont = False

                if cont:
                    _clean_matching_dois(match_set, actions)
                    _clean_matching_fulltexts(match_set, actions)
                    _sanitise(match_set, actions)
                    if len(match_set.matches) == 1:
                        cont = False

                if cont:
                    _remove_old(match_set, actions)

                if len(match_set.matches) == set_size or len(match_set.matches) == 0:
                    break
                set_size = len(match_set.matches)

            # report on the actions on this match set
            if actions.has_actions():
                l.write(actions.report())
                l.write("\n\n")
                actions.export_to(final_instructions)

            # write the noaction report file and the almost identical nocleanup file
            # (which can be actioned by another part of this script)
            if len(match_set.matches) > 1:
                rows = match_set.to_rows()
                for row in rows:
                    noaction_writer.writerow(row)
                    nocleanup_writer.writerow(row)
                for note in match_set.notes:
                    noaction_writer.writerow([note])
                noaction_writer.writerow([])

            if next_row is None:
                break

        with codecs.open(noids_report, "rb", "utf-8") as n:
            nreader = clcsv.UnicodeReader(n)
            headers = nreader.next()
            for row in nreader:
                final_instructions[row[0]] = {"action": "delete", "reason": "no doi or fulltext"}

        writer = clcsv.UnicodeWriter(o)
        writer.writerow(["id", "action", "reason"])
        for k, v in final_instructions.iteritems():
            writer.writerow([k, v["action"], v["reason"]])
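# The "out" file written above is a flat instruction list consumed downstream; an
# illustrative fragment (the id is invented, and "no doi or fulltext" is the reason
# set verbatim for records taken from the noids report):
#
#   id,action,reason
#   00000000000000000000000000000001,delete,no doi or fulltext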
def history_records_assemble(id, csv_dir, tar_dir, out_dir, assemble, do_diff):
    if assemble:
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        os.makedirs(out_dir)

        csvs = [c for c in os.listdir(csv_dir) if c.endswith(".csv")]
        paths = []

        # find all the files from the index csvs
        for c in csvs:
            tarname = c.rsplit(".", 1)[0] + ".tar.gz"
            with codecs.open(os.path.join(csv_dir, c), "rb", "utf-8") as f:
                reader = clcsv.UnicodeReader(f)
                for row in reader:
                    if row[0] == id:
                        paths.append({
                            "csv": c,
                            "tarname": tarname,
                            "tarpath": row[2],
                            "date": row[1],
                            "fileid": row[3]
                        })

        # gather all the files in the target directory
        with codecs.open(os.path.join(out_dir, "_index." + id + ".csv"), "wb", "utf-8") as g:
            writer = clcsv.UnicodeWriter(g)
            writer.writerow(["CSV", "Tar Name", "Tar Path", "Date", "File ID"])

            for p in paths:
                tarball = tarfile.open(os.path.join(tar_dir, p["tarname"]), "r:gz")
                member = tarball.getmember(p["tarpath"])
                handle = tarball.extractfile(member)
                out = os.path.join(out_dir, p["date"] + "_" + p["fileid"] + ".json")
                with codecs.open(out, "wb", "utf-8") as f:
                    shutil.copyfileobj(handle, f)
                writer.writerow([p["csv"], p["tarname"], p["tarpath"], p["date"], p["fileid"]])

    if do_diff:
        difffile = os.path.join(out_dir, "_diff." + id + ".json")
        if os.path.exists(difffile):
            os.remove(difffile)

        # order the files and diff them into a single summary file
        # FIXME: note that this is not the standardised form of jsondiff, for some reason, but it
        # will do for now.
        changes = []
        files = [f for f in os.listdir(out_dir) if f.endswith(".json")]
        files.sort()
        for i in range(len(files) - 1):
            f1 = files[i]
            f2 = files[i + 1]
            with codecs.open(os.path.join(out_dir, f1), "rb", "utf-8") as r1, \
                    codecs.open(os.path.join(out_dir, f2), "rb", "utf-8") as r2:
                j1 = json.loads(r1.read())
                j2 = json.loads(r2.read())
                d = diff(j1, j2)
                d["_from"] = f1
                d["_to"] = f2
                d = _fix_symbols(d)
                changes.append(d)

        with codecs.open(difffile, "wb", "utf-8") as o:
            o.write(json.dumps(changes, indent=2, sort_keys=True))
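# Example invocation (the id and paths are illustrative): rebuild every stored version
# of one record from the index CSVs and their companion tarballs, then summarise the
# differences between consecutive versions into a single JSON file.
#
# history_records_assemble("00000000000000000000000000000000", "/data/history/csvs",
#                          "/data/history/tars", "/tmp/history_record",
#                          assemble=True, do_diff=True)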
def history_records_analyse(source, out_dir, reverted_only=False, date=None):
    ids = set()
    if date is not None:
        with codecs.open(source, "rb", "utf-8") as f:
            reader = clcsv.UnicodeReader(f)
            for row in reader:
                if row[1] == date:
                    ids.add(row[0])

    records = {}
    with codecs.open(source, "rb", "utf-8") as f:
        reader = clcsv.UnicodeReader(f)
        reader.next()
        for row in reader:
            if date is None or row[0] in ids:
                if row[0] not in records:
                    records[row[0]] = []
                records[row[0]].append(row[:3])

    count = 1
    out = os.path.join(out_dir, "owners.csv")
    with codecs.open(out, "wb", "utf-8") as o:
        writer = clcsv.UnicodeWriter(o)
        writer.writerow(["count", "id", "reverted", "change history"])
        writer.writerow([])

        for id, rows in records.iteritems():
            rows = sorted(rows, key=lambda x: x[1])

            owners = []
            lastOwner = False
            ownerTransitions = []
            flagged = False
            for row in rows:
                with codecs.open(row[2], "rb", "utf-8") as f:
                    data = json.load(f)
                owner = data.get("admin", {}).get("owner")
                if len(ownerTransitions) == 0 or owner != ownerTransitions[-1]:
                    ownerTransitions.append(owner)
                if owner != lastOwner and row[1] == date:
                    flagged = True
                owners.append((row[1], owner))
                lastOwner = owner

            out_row_1 = [o[0] for o in owners]
            out_row_2 = [o[1] for o in owners]
            owner_set = set(out_row_2)

            if date is None:
                flagged = True

            if len(owner_set) > 1 and flagged:
                reverted = False
                for i in range(len(ownerTransitions)):
                    o = ownerTransitions[i]
                    if i + 2 < len(ownerTransitions):
                        for j in range(i + 2, len(ownerTransitions)):
                            comp = ownerTransitions[j]
                            if o == comp:
                                reverted = True
                                break
                    if reverted:
                        break

                if not reverted_only or (reverted_only and reverted):
                    writer.writerow([count, id, "X" if reverted else ""] + out_row_1)
                    writer.writerow(["", "", "X" if reverted else ""] + out_row_2)
                    writer.writerow([])
                    count += 1
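# Example invocation (the paths and date are illustrative): restrict the analysis to
# records that changed on a given day and only report ids whose ownership was later
# reverted to an earlier owner.
#
# history_records_analyse("/data/history/owners_index.csv", "/tmp/owners_out",
#                         reverted_only=True, date="2019-02-01")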