Example #1
def compare_outputs(duplicate_report):
    original = "/home/richard/tmp/doaj/article_duplicates_2019-02-27/actions-2019-04-02.csv"
    compare = "/home/richard/tmp/doaj/article_duplicates_2019-02-27/actions-2019-04-04.csv"
    missing_out = "/home/richard/tmp/doaj/article_duplicates_2019-02-27/missing.csv"
    extra_out = "/home/richard/tmp/doaj/article_duplicates_2019-02-27/extra.csv"
    reference = "/home/richard/tmp/doaj/article_duplicates_2019-02-27/reference.csv"

    with codecs.open(original, "rb", "utf-8") as f1:
        r1 = clcsv.UnicodeReader(f1)
        r1.next()
        id1 = [x[0] for x in r1]

    with codecs.open(compare, "rb", "utf-8") as f2:
        r2 = clcsv.UnicodeReader(f2)
        r2.next()
        id2 = [x[0] for x in r2]

    missing = [x for x in id1 if x not in id2]
    print("missing {x}".format(x=len(missing)))
    with codecs.open(missing_out, "wb", "utf-8") as f3:
        f3.write("\n".join(missing))

    extra = [x for x in id2 if x not in id1]
    print("extra {x}".format(x=len(extra)))
    with codecs.open(extra_out, "wb", "utf-8") as f4:
        f4.write("\n".join(extra))

    with codecs.open(duplicate_report, "rb", "utf-8") as f5, \
            codecs.open(reference, "wb", "utf-8") as f6:
        r5 = clcsv.UnicodeReader(f5)
        w6 = clcsv.UnicodeWriter(f6)
        headers = r5.next()
        w6.writerow(headers)
        w6.writerow([])

        seen_roots = []
        next_row = None
        while True:
            match_set, next_row = _read_match_set(r5, next_row)
            for m in missing:
                if match_set.contains_id(m):
                    root_id = match_set.root["id"]
                    if root_id in seen_roots:
                        continue
                    seen_roots.append(root_id)

                    print("Reference set for root id {x}".format(x=root_id))
                    rows = match_set.to_rows()
                    for row in rows:
                        w6.writerow(row)
                    w6.writerow([])

            if next_row is None:
                break
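
Examples #1 and #9 both depend on a _read_match_set helper and the match set object it returns, neither of which is reproduced on this page. Purely as an orientation aid, here is a hypothetical stand-in covering only the attributes and methods those examples call; the real DOAJ class may differ.

# Hypothetical stand-in for the match set object used in Examples #1 and #9.
# It mirrors only the calls made on this page, not the real implementation.
class MatchSet(object):
    def __init__(self, root, matches, notes=None):
        self.root = root            # dict for the root article, e.g. {"id": "..."}
        self.matches = matches      # list of dicts, each with at least an "id" key
        self.notes = notes or []    # free-text notes attached to the set

    def contains_id(self, article_id):
        # Example #1 uses this to check whether a missing id belongs to the set
        return any(m["id"] == article_id for m in self.matches)

    def to_rows(self):
        # flatten the set back into CSV rows; the real column layout is not shown here
        return [[m["id"]] for m in self.matches]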
Example #2
def history_records_assemble(id, source, out_dir):
    with codecs.open(source, "rb", "utf-8") as f:
        reader = clcsv.UnicodeReader(f)
        for row in reader:
            if row[0] == id:
                fn = row[1] + "_" + row[3]
                out = os.path.join(out_dir, fn)
                shutil.copy(row[2], out)
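
A minimal usage sketch for the function above, assuming the index CSV carries the columns the loop reads: record id, date, path to the stored file, and a file id (the same layout Example #10 relies on). The id value and paths are placeholders.

# Illustrative call only; the id and paths below are placeholders.
history_records_assemble(
    id="f00000000000000000000000000000ba",
    source="/path/to/history_index.csv",
    out_dir="/path/to/assembled_records",
)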
Example #3
def create_users(source):
    with codecs.open(source, "rb", "utf-8") as f:
        reader = clcsv.UnicodeReader(f)
        for row in reader:
            username = row[0]
            email = row[1]
            password = row[2] if row[2] != "" else None
            roles = [r.strip() for r in row[3].split(",")]
            create_user(username, email, password, roles)
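
The input is one account per row, in the column order the loop unpacks: username, email, password (an empty field is passed through to create_user as None), and a comma-separated list of roles. A hedged sketch of such a file and the call:

# accounts.csv (illustrative contents only):
#   user1,user1@example.com,,publisher
#   user2,user2@example.com,s3cret,"publisher, api"
create_users("/path/to/accounts.csv")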
Example #4
    def test_01_publishers_with_consent(self):
        # output file to save csv
        output_file = os.path.join(self.tmp_dir, 'accounts.csv')
        # Create accounts with marketing consent not set
        for i in range(20):
            pubsource = AccountFixtureFactory.make_publisher_source()
            pubaccount = models.Account(**pubsource)
            pubaccount.set_id()
            pubaccount.save()
        # Create accounts with marketing consent set to False
        for i in range(20):
            pubsource = AccountFixtureFactory.make_publisher_source()
            pubaccount = models.Account(**pubsource)
            pubaccount.set_id()
            pubaccount.set_marketing_consent(False)
            pubaccount.save()
        # Create accounts with marketing consent set to True
        expected_data = [[
          u'ID',
          u'Name',
          u'Email',
          u'Created',
          u'Last Updated',
          u'Updated Since Create?'
        ]]
        for i in range(20):
            pubsource = AccountFixtureFactory.make_publisher_source()
            pubaccount = models.Account(**pubsource)
            pubaccount.set_id()
            pubaccount.set_marketing_consent(True)
            if i == 19:
                pubaccount.save(blocking=True)
            else:
                pubaccount.save()
            expected_data.append([
              unicode(pubaccount.id),
              unicode(pubaccount.name),
              unicode(pubaccount.email),
              unicode(pubaccount.created_date),
              unicode(pubaccount.last_updated),
              unicode('False')
            ])

        publishers_with_consent(output_file)

        assert os.path.exists(output_file)

        table = []
        with codecs.open(output_file, "rb", "utf-8") as f:
            reader = clcsv.UnicodeReader(f)
            for row in reader:
                table.append(row)
        assert len(table) == 21
        self.assertItemsEqual(table, expected_data)
Example #5
    def test_03_apps_by_country(self):
        apps = ApplicationFixtureFactory.make_application_spread(APPLICATION_YEAR_OUTPUT, "year")
        for a in apps:
            a.save()
        time.sleep(2)

        outfiles = reporting.content_reports("1970-01-01T00:00:00Z", dates.now(), TMP_DIR)

        assert len(outfiles) == 1
        assert os.path.exists(outfiles[0])

        table = []
        with codecs.open(outfiles[0], "rb", "utf-8") as f:
            reader = clcsv.UnicodeReader(f)
            for row in reader:
                table.append(row)

        expected = self._as_output(APPLICATION_YEAR_OUTPUT)
        assert table == expected
Example #6
unique_deduplicated = []
genuine_unique_ids = []
genuine_unique_deduplicated = []
bad_data_unique_ids = []
bad_data_unique_deduplicated = []
genuine_count = 0
bad_data_count = 0

with codecs.open(GENUINE, "wb", "utf-8") as a:
    awriter = clcsv.UnicodeWriter(a)

    with codecs.open(BAD_DATA, "wb", "utf-8") as b:
        bwriter = clcsv.UnicodeWriter(b)

        with codecs.open(IN, "rb", "utf-8") as f:
            reader = clcsv.UnicodeReader(f)

            headers = reader.next()
            awriter.writerow(headers)
            bwriter.writerow(headers)

            i = 0
            for row in reader:
                print(i)
                i += 1

                data = _to_dict(headers, row)
                aid = data["article_id"]
                mid = data["match_id"]

                if aid not in unique_ids:
Example #7
    def test_journal_csv(self, name, kwargs):

        prune_arg = kwargs.get("prune")
        tmp_write_arg = kwargs.get("tmp_write")
        main_write_arg = kwargs.get("main_write")
        journals_arg = kwargs.get("journals")
        journals_no_issn_arg = kwargs.get("journals_no_issn")
        not_in_doaj_arg = kwargs.get("not_in_doaj")
        journals_with_articles_arg = kwargs.get("journals_with_articles")

        raises_arg = kwargs.get("raises")

        ###############################################
        ## set up

        raises = EXCEPTIONS.get(raises_arg)
        prune = True if prune_arg == "True" else False if prune_arg == "False" else None
        journal_count = int(journals_arg)
        journals_no_issn_count = int(journals_no_issn_arg)
        not_in_doaj_count = int(not_in_doaj_arg)
        journals_with_articles_count = int(journals_with_articles_arg)

        if tmp_write_arg == "fail":
            app.config["STORE_TMP_IMPL"] = StoreMockFactory.no_writes_classpath()

        if main_write_arg == "fail":
            app.config["STORE_IMPL"] = StoreMockFactory.no_writes_classpath()

        journals = []
        if journal_count > 0:
            journals += [
                models.Journal(**s)
                for s in JournalFixtureFactory.make_many_journal_sources(
                    count=journal_count, in_doaj=True)
            ]

        comparisons = {}
        articles = []
        for i in range(len(journals)):
            journal = journals[i]
            bj = journal.bibjson()
            bj.alternative_title = u"Заглавие на журнала"  # checking mixed unicode
            issns = journal.bibjson().issns()
            source1 = ArticleFixtureFactory.make_article_source(eissn=issns[0],
                                                                pissn=issns[1],
                                                                with_id=False,
                                                                in_doaj=False)
            articles.append(models.Article(**source1))
            comparisons[issns[0]] = {
                "issns": issns,
                "article_count": 0,
                "article_latest": ""
            }
            if i < journals_with_articles_count:
                source2 = ArticleFixtureFactory.make_article_source(
                    eissn=issns[0],
                    pissn=issns[1],
                    with_id=False,
                    in_doaj=True)
                article2 = models.Article(**source2)
                article2.set_created("2019-0{i}-01T00:00:00Z".format(i=i + 1))
                articles.append(article2)

                source3 = ArticleFixtureFactory.make_article_source(
                    eissn=issns[0],
                    pissn=issns[1],
                    with_id=False,
                    in_doaj=True)
                article3 = models.Article(**source3)
                article3.set_created("2019-0{i}-02T00:00:00Z".format(i=i + 1))
                articles.append(article3)

                comparisons[issns[0]]["article_count"] = 2
                comparisons[issns[0]]["article_latest"] = "2019-0{i}-02T00:00:00Z".format(i=i + 1)

        if journals_no_issn_count > 0:
            noissns = [
                models.Journal(**s)
                for s in JournalFixtureFactory.make_many_journal_sources(
                    count=journals_no_issn_count, in_doaj=True)
            ]
            for i in range(len(noissns)):
                noissn = noissns[i]
                bj = noissn.bibjson()
                bj.remove_identifiers(idtype=bj.P_ISSN)
                bj.remove_identifiers(idtype=bj.E_ISSN)
                noissn.set_id("no_issn_{i}".format(i=i))
            journals += noissns

        if not_in_doaj_count > 0:
            nots = [
                models.Journal(**s)
                for s in JournalFixtureFactory.make_many_journal_sources(
                    count=not_in_doaj_count, in_doaj=False)
            ]
            for i in range(len(nots)):
                n = nots[i]
                n.set_id("not_in_doaj_{i}".format(i=i))
            journals += nots

        jids = []
        for i in range(len(journals)):
            journals[i].save()
            jids.append((journals[i].id, journals[i].last_updated))

        aids = []
        for i in range(len(articles)):
            articles[i].save()
            aids.append((articles[i].id, articles[i].last_updated))

        if prune:
            self.localStore.store(self.container_id,
                                  "journalcsv__doaj_20180101_0000_utf8.csv",
                                  source_stream=StringIO("test1"))
            self.localStore.store(self.container_id,
                                  "journalcsv__doaj_20180601_0000_utf8.csv",
                                  source_stream=StringIO("test2"))

        models.Journal.blockall(jids)
        models.Article.blockall(aids)

        ###########################################################
        # Execution

        if raises is not None:
            with self.assertRaises(raises):
                self.svc.csv(prune)

                tempFiles = self.tmpStore.list(self.container_id)
                assert len(tempFiles) == 0
        else:
            url = self.svc.csv(prune)
            assert url is not None

            csv_info = models.cache.Cache.get_latest_csv()
            assert csv_info.get("url") == url

            filenames = self.localStore.list(self.container_id)
            if prune:
                assert len(filenames) == 2
                assert "journalcsv__doaj_20180101_0000_utf8.csv" not in filenames

            latest = None
            for fn in filenames:
                if fn != "journalcsv__doaj_20180601_0000_utf8.csv":
                    latest = fn
                    break

            handle = self.localStore.get(self.container_id,
                                         latest,
                                         encoding="utf-8")
            reader = clcsv.UnicodeReader(handle)
            rows = [r for r in reader]

            if len(comparisons) > 0:
                expected_headers = JournalFixtureFactory.csv_headers()
                for i in range(len(expected_headers)):
                    h = expected_headers[i]
                    if h != rows[0][i]:
                        print("{x} - {y}".format(x=h, y=rows[0][i]))
                assert rows[0] == expected_headers

                assert len(rows) == journal_count + 1

                for i in range(1, len(rows)):
                    row = rows[i]
                    alt_title = row[2]
                    issn = row[3]
                    eissn = row[4]
                    article_count = int(row[57])
                    article_latest = row[58]

                    assert alt_title == u"Заглавие на журнала"
                    assert issn in comparisons[issn]["issns"]
                    assert eissn in comparisons[issn]["issns"]
                    assert article_count == comparisons[issn]["article_count"], \
                        (article_count, comparisons[issn]["article_count"])
                    assert article_latest == comparisons[issn]["article_latest"]

            else:
                assert len(rows) == 0
Example #8
def finalise(source, report_out, articles_dir, final_actions):
    if not os.path.exists(articles_dir):
        os.makedirs(articles_dir)

    actions = ActionRegister()
    with codecs.open(source, "rb", "utf-8") as s:
        reader = clcsv.UnicodeReader(s)
        headers = reader.next()

        accounts = {}
        for row in reader:
            article_id = row[0]
            article_doi = row[2]
            article_ft = row[3]
            article_owner = row[4]
            match_type = row[8]
            match_id = row[9]
            match_doi = row[11]
            match_ft = row[12]
            match_owner = row[13]

            actions.set_action(article_id, "delete",
                               "could not be automatically cleaned up")
            actions.set_action(match_id, "delete",
                               "could not be automatically cleaned up")

            if article_owner not in accounts:
                accounts[article_owner] = []
            reason = ""
            if match_type == "doi":
                reason = "DOI appears in multiple articles"
            elif match_type == "fulltext":
                reason = "Fulltext URL appears in multiple articles"
            else:
                reason = "Fulltext URL and DOI both appear in multiple articles"
            accounts[article_owner].append([article_doi, article_ft, reason])

            if match_owner not in accounts:
                accounts[match_owner] = []
            reason = ""
            if match_type == "doi":
                reason = "DOI appears in multiple articles"
            elif match_type == "fulltext":
                reason = "Fulltext URL appears in multiple articles"
            else:
                reason = "Fulltext URL and DOI both appear in multiple articles"
            accounts[match_owner].append([match_doi, match_ft, reason])

    final_instructions = {}
    actions.export_to(final_instructions)

    with codecs.open(final_actions, "wb", "utf-8") as fa:
        fawriter = clcsv.UnicodeWriter(fa)
        fawriter.writerow(["id", "action", "reason"])
        for k, v in final_instructions.iteritems():
            fawriter.writerow([k, v["action"], v["reason"]])

    with codecs.open(report_out, "wb", "utf-8") as ro:
        writer = clcsv.UnicodeWriter(ro)
        writer.writerow(["account", "articles to delete", "article_details"])
        for k, v in accounts.iteritems():
            fn = k + "_articles.csv"
            with codecs.open(os.path.join(articles_dir, fn), "wb",
                             "utf-8") as a:
                awriter = clcsv.UnicodeWriter(a)
                awriter.writerow([
                    "DOI", "Fulltext", "Reason for removal",
                    "Number of duplicated articles"
                ])
                dedupe = []
                for article in v:
                    found = -1
                    for i in range(len(dedupe)):
                        d = dedupe[i]
                        if d[0] == article[0] and d[1] == article[1]:
                            found = i
                            break
                    if found > -1:
                        dedupe[found][3] += 1
                    else:
                        dedupe.append(article + [1])
                for d in dedupe:
                    awriter.writerow(d)
            writer.writerow([k, len(v), fn])
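
Examples #8 and #9 record their decisions through an ActionRegister that is defined elsewhere. As a rough guide to its contract, a hypothetical stand-in matching only the calls made on this page; the real class may resolve conflicting actions quite differently.

# Hypothetical stand-in for the ActionRegister used in Examples #8 and #9.
class ActionRegister(object):
    def __init__(self):
        self._actions = {}

    def set_action(self, record_id, action, reason):
        # last writer wins here; the real register may apply precedence rules
        self._actions[record_id] = {"action": action, "reason": reason}

    def has_actions(self):
        return len(self._actions) > 0

    def report(self):
        # human-readable summary, written to the log file in Example #9
        return "\n".join("{i}: {a} ({r})".format(i=k, a=v["action"], r=v["reason"])
                         for k, v in self._actions.items())

    def export_to(self, instructions):
        # merge into a shared {id: {"action": ..., "reason": ...}} dict
        instructions.update(self._actions)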
Example #9
def analyse(duplicate_report, noids_report, out, noaction, nocleanup, log):

    with codecs.open(out, "wb", "utf-8") as o, \
            codecs.open(log, "wb", "utf-8") as l, \
            codecs.open(duplicate_report, "rb", "utf-8") as f, \
            codecs.open(noaction, "wb", "utf-8") as g, \
            codecs.open(nocleanup, "wb", "utf-8") as h:

        reader = clcsv.UnicodeReader(f)
        noaction_writer = clcsv.UnicodeWriter(g)
        nocleanup_writer = clcsv.UnicodeWriter(h)
        headers = reader.next()
        noaction_writer.writerow(headers)
        noaction_writer.writerow([])
        nocleanup_writer.writerow(headers)

        final_instructions = {}
        next_row = None
        while True:
            match_set, next_row = _read_match_set(reader, next_row)
            ids = [m["id"] for m in match_set.matches]
            l.write("--" + str(len(ids)) + "-- " + ",".join(ids) + "\n\n")
            actions = ActionRegister()

            # get rid of any articles from the match set that are not in doaj
            _eliminate_not_in_doaj(match_set, actions)

            set_size = len(match_set.matches)
            while True:
                cont = True
                if len(match_set.matches) == 1:
                    _sanitise(match_set, actions)
                    cont = False

                if cont:
                    _clean_matching_dois(match_set, actions)
                    _clean_matching_fulltexts(match_set, actions)
                    _sanitise(match_set, actions)

                if len(match_set.matches) == 1:
                    cont = False

                if cont:
                    _remove_old(match_set, actions)

                if len(match_set.matches) == set_size or len(match_set.matches) == 0:
                    break
                set_size = len(match_set.matches)

            # report on the actions on this match set
            if actions.has_actions():
                l.write(actions.report())
                l.write("\n\n")

            actions.export_to(final_instructions)

            # write the noaction report file and the almost identical nocleanup file (which can be actioned by another part of this script)
            if len(match_set.matches) > 1:
                rows = match_set.to_rows()
                for row in rows:
                    noaction_writer.writerow(row)
                    nocleanup_writer.writerow(row)
                for note in match_set.notes:
                    noaction_writer.writerow([note])
                noaction_writer.writerow([])

            if next_row is None:
                break

        with codecs.open(noids_report, "rb", "utf-8") as n:
            nreader = clcsv.UnicodeReader(n)
            headers = nreader.next()
            for row in nreader:
                final_instructions[row[0]] = {
                    "action": "delete",
                    "reason": "no doi or fulltext"
                }

        writer = clcsv.UnicodeWriter(o)
        writer.writerow(["id", "action", "reason"])
        for k, v in final_instructions.iteritems():
            writer.writerow([k, v["action"], v["reason"]])
Example #10
def history_records_assemble(id, csv_dir, tar_dir, out_dir, assemble, do_diff):

    if assemble:
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        os.makedirs(out_dir)

        csvs = [c for c in os.listdir(csv_dir) if c.endswith(".csv")]
        paths = []

        # find all the files from the index csvs
        for c in csvs:
            tarname = c.rsplit(".", 1)[0] + ".tar.gz"
            with codecs.open(os.path.join(csv_dir, c), "rb", "utf-8") as f:
                reader = clcsv.UnicodeReader(f)
                for row in reader:
                    if row[0] == id:
                        paths.append({
                            "csv" : c,
                            "tarname" : tarname,
                            "tarpath" : row[2],
                            "date" : row[1],
                            "fileid" : row[3]
                        })

        # gather all the files in the target directory
        with codecs.open(os.path.join(out_dir, "_index." + id + ".csv"), "wb", "utf-8") as g:
            writer = clcsv.UnicodeWriter(g)
            writer.writerow(["CSV", "Tar Name", "Tar Path", "Date", "File ID"])
            for p in paths:
                tarball = tarfile.open(os.path.join(tar_dir, p["tarname"]), "r:gz")
                member = tarball.getmember(p["tarpath"])
                handle = tarball.extractfile(member)
                out = os.path.join(out_dir, p["date"] + "_" + p["fileid"] + ".json")
                with codecs.open(out, "wb", "utf-8") as f:
                    shutil.copyfileobj(handle, f)
                writer.writerow([p["csv"], p["tarname"], p["tarpath"], p["date"], p["fileid"]])

    if do_diff:
        difffile = os.path.join(out_dir, "_diff." + id + ".json")
        if os.path.exists(difffile):
            os.remove(difffile)

        # order the files and diff them into a single summary file
        # FIXME: note that this is not the standardised form of jsondiff, for some reason, but it
        # will do for now.
        changes = []
        files = [f for f in os.listdir(out_dir) if f.endswith(".json")]
        files.sort()
        for i in range(len(files) - 1):
            f1 = files[i]
            f2 = files[i + 1]
            with codecs.open(os.path.join(out_dir, f1), "rb", "utf-8") as r1, \
                    codecs.open(os.path.join(out_dir, f2), "rb", "utf-8") as r2:
                j1 = json.loads(r1.read())
                j2 = json.loads(r2.read())
                d = diff(j1, j2)
                d["_from"] = f1
                d["_to"] = f2
                d = _fix_symbols(d)
                changes.append(d)

        with codecs.open(difffile, "wb", "utf-8") as o:
            o.write(json.dumps(changes, indent=2, sort_keys=True))
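
The diff call above appears to come from the jsondiff package, whose output can contain Symbol keys (insert, delete, and so on) that json.dumps cannot serialize; that is presumably what _fix_symbols works around. A hedged sketch of such a helper:

# Hypothetical sketch of _fix_symbols: coerce non-string keys (e.g. jsondiff
# Symbols) to strings so the change summary can be dumped as JSON.
def _fix_symbols(d):
    if isinstance(d, dict):
        return {str(k): _fix_symbols(v) for k, v in d.items()}
    if isinstance(d, list):
        return [_fix_symbols(x) for x in d]
    return d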
Example #11
def history_records_analyse(source, out_dir, reverted_only=False, date=None):
    ids = set()
    if date is not None:
        with codecs.open(source, "rb", "utf-8") as f:
            reader = clcsv.UnicodeReader(f)
            for row in reader:
                if row[1] == date:
                    ids.add(row[0])

    records = {}
    with codecs.open(source, "rb", "utf-8") as f:
        reader = clcsv.UnicodeReader(f)
        reader.next()
        for row in reader:
            if date is None or row[0] in ids:
                if row[0] not in records:
                    records[row[0]] = []
                records[row[0]].append(row[:3])

    count = 1
    out = os.path.join(out_dir, "owners.csv")
    with codecs.open(out, "wb", "utf-8") as o:
        writer = clcsv.UnicodeWriter(o)
        writer.writerow(["count", "id", "reverted", "change history"])
        writer.writerow([])

        for id, rows in records.iteritems():
            rows = sorted(rows, key=lambda x: x[1])
            owners = []
            lastOwner = False
            ownerTransitions = []
            flagged = False
            for row in rows:
                with codecs.open(row[2], "rb", "utf-8") as f:
                    data = json.load(f)
                owner = data.get("admin", {}).get("owner")
                if len(ownerTransitions) == 0 or owner != ownerTransitions[-1]:
                    ownerTransitions.append(owner)
                if owner != lastOwner and row[1] == date:
                    flagged = True
                owners.append((row[1], owner))
                lastOwner = owner

            out_row_1 = [o[0] for o in owners]
            out_row_2 = [o[1] for o in owners]
            owner_set = set(out_row_2)

            if date is None: flagged = True

            if len(owner_set) > 1 and flagged:
                reverted = False
                for i in range(len(ownerTransitions)):
                    o = ownerTransitions[i]
                    if i + 2 < len(ownerTransitions):
                        for j in range(i + 2, len(ownerTransitions)):
                            comp = ownerTransitions[j]
                            if o == comp:
                                reverted = True
                                break
                    if reverted:
                        break

                if not reverted_only or (reverted_only and reverted):
                    writer.writerow([count, id, "X" if reverted else ""] + out_row_1)
                    writer.writerow(["", "", "X" if reverted else ""] + out_row_2)
                    writer.writerow([])
                    count += 1
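
A minimal invocation sketch, assuming the source CSV is the per-record history index used in the other examples (id, date, path, ...) and that the date argument is compared as a plain string against the second column. The paths and date are placeholders.

# Illustrative call only; the date string just has to match what the CSV stores.
history_records_analyse(
    source="/path/to/history_index.csv",
    out_dir="/path/to/analysis",
    reverted_only=True,
    date="2019-03-01",
)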