Example #1
    def make_status_spread(cls, desired_output, period, role_map):
        desired_output = deepcopy(desired_output)
        header = desired_output[0]
        del desired_output[0]
        del header[0]
        ranges = []
        for h in header:
            start = None
            end = None
            if period == "month":
                startts = dates.parse(h, "%Y-%m")
                year, month = divmod(startts.month+1, 12)
                if month == 0:
                    month = 12
                    year = year - 1
                endts = datetime(startts.year + year, month, 1)
                start = dates.format(startts)
                end = dates.format(endts)
            elif period == "year":
                startts = dates.parse(h, "%Y")
                endts = datetime(startts.year + 1, 1, 1)
                start = dates.format(startts)
                end = dates.format(endts)

            ranges.append((start, end))

        provs = []
        for row in desired_output:
            user = row[0]
            del row[0]
            role = role_map[user]
            for i in range(len(row)):
                count = row[i]
                start, end = ranges[i]
                status = None
                if role == "associate_editor":
                    status = constants.APPLICATION_STATUS_COMPLETED
                elif role == "editor":
                    status = constants.APPLICATION_STATUS_READY
                elif role == "admin":
                    status = ADMIN_STATUSES[randint(0, len(ADMIN_STATUSES) - 1)]
                for j in range(count):
                    p = Provenance()
                    p.set_created(dates.random_date(start, end))
                    p.user = user
                    p.roles = [role]
                    p.type = "suggestion"
                    p.action = "status:" + status
                    p.resource_id = uuid.uuid4().hex
                    provs.append(p)

        return provs
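
A minimal usage sketch of the fixture above, with illustrative data; the class name ProvenanceFixtureFactory is used here only for illustration. The first row of desired_output is a header of period labels (its first cell is discarded), and each following row starts with a user id followed by per-period counts.

desired_output = [
    ["User", "2015-01", "2015-02", "2015-03"],   # header row; the first cell is discarded
    ["alice", 2, 0, 1],
    ["bob", 1, 3, 0],
]
role_map = {"alice": "associate_editor", "bob": "editor"}

provs = ProvenanceFixtureFactory.make_status_spread(desired_output, "month", role_map)  # illustrative class name
assert len(provs) == 7   # one Provenance record per counted cell
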
Example #2
def type_map(t):
    type = DO_TYPE_TO_JSON_TYPE.get(t, "string")
    if type == "timestamp":
        return dates.now()
    elif type == "datestamp":
        return dates.format(datetime.utcnow(), "%Y-%m-%d")
    return type
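
A minimal usage sketch. The entries in DO_TYPE_TO_JSON_TYPE below are illustrative only; the real mapping lives alongside type_map in the codebase.

DO_TYPE_TO_JSON_TYPE = {           # illustrative entries only
    "unicode": "string",
    "utcdatetime": "timestamp",
    "bigenddate": "datestamp",
}

type_map("unicode")         # -> "string"
type_map("utcdatetime")     # -> the current timestamp from dates.now()
type_map("bigenddate")      # -> today's date formatted as "%Y-%m-%d"
type_map("something_else")  # -> "string" (the default)
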
Example #3
def datify(val):
    if val is None or val == "":
        return None
    if isinstance(val, (date, datetime)):
        return dates.format(val, format=out_format)
    else:
        return dates.reformat(val, in_format=in_format, out_format=out_format)
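
datify is a closure: in_format and out_format come from its enclosing scope. A minimal sketch of that context, with illustrative format strings only:

from datetime import date, datetime

in_format = "%Y-%m-%dT%H:%M:%SZ"     # illustrative
out_format = "%d %B %Y"              # illustrative

datify(None)                         # -> None (empty values pass through)
datify(datetime(2020, 1, 1))         # already a date/datetime: formatted with out_format
datify("2020-01-01T00:00:00Z")       # a string: reparsed with in_format, rewritten with out_format
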
Example #4
    def make_action_spread(cls, desired_output, action, period):
        desired_output = deepcopy(desired_output)
        header = desired_output[0]
        del desired_output[0]
        del header[0]
        ranges = []
        for h in header:
            start = None
            end = None
            if period == "month":
                startts = dates.parse(h, "%Y-%m")
                year, month = divmod(startts.month+1, 12)
                if month == 0:
                    month = 12
                    year = year - 1
                endts = datetime(startts.year + year, month, 1)
                start = dates.format(startts)
                end = dates.format(endts)
            elif period == "year":
                startts = dates.parse(h, "%Y")
                endts = datetime(startts.year + 1, 1, 1)
                start = dates.format(startts)
                end = dates.format(endts)

            ranges.append((start, end))

        provs = []
        for row in desired_output:
            user = row[0]
            del row[0]
            for i in range(len(row)):
                count = row[i]
                start, end = ranges[i]
                for j in range(count):
                    p = Provenance()
                    p.set_created(dates.random_date(start, end))
                    p.user = user
                    p.type = "suggestion"
                    p.action = action
                    p.resource_id = uuid.uuid4().hex
                    provs.append(p)

        return provs
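
A minimal usage sketch with illustrative data; the class name and the action string are hypothetical. Each counted cell becomes one Provenance record with that action, created at a random date inside the corresponding period.

desired_output = [
    ["User", "2014", "2015"],
    ["carol", 5, 0],
]
provs = ProvenanceFixtureFactory.make_action_spread(desired_output, "edit", "year")  # hypothetical names
assert len(provs) == 5
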
Example #5
    def make_action_spread(cls, desired_output, action, period):
        desired_output = deepcopy(desired_output)
        header = desired_output[0]
        del desired_output[0]
        del header[0]
        ranges = []
        for h in header:
            start = None
            end = None
            if period == "month":
                startts = dates.parse(h, "%Y-%m")
                year, month = divmod(startts.month + 1, 12)
                if month == 0:
                    month = 12
                    year = year - 1
                endts = datetime(startts.year + year, month, 1)
                start = dates.format(startts)
                end = dates.format(endts)
            elif period == "year":
                startts = dates.parse(h, "%Y")
                endts = datetime(startts.year + 1, 1, 1)
                start = dates.format(startts)
                end = dates.format(endts)

            ranges.append((start, end))

        provs = []
        for row in desired_output:
            user = row[0]
            del row[0]
            for i in range(len(row)):
                count = row[i]
                start, end = ranges[i]
                for j in range(count):
                    p = Provenance()
                    p.set_created(dates.random_date(start, end))
                    p.user = user
                    p.type = "suggestion"
                    p.action = action
                    p.resource_id = uuid.uuid4().hex
                    provs.append(p)

        return provs
Example #6
    def make_application_spread(cls, desired_output, period):
        desired_output = deepcopy(desired_output)
        header = desired_output[0]
        del desired_output[0]
        del header[0]
        ranges = []
        for h in header:
            start = None
            end = None
            if period == "month":
                startts = dates.parse(h, "%Y-%m")
                year, month = divmod(startts.month+1, 12)
                if month == 0:
                    month = 12
                    year = year - 1
                endts = datetime(startts.year + year, month, 1)
                start = dates.format(startts)
                end = dates.format(endts)
            elif period == "year":
                startts = dates.parse(h, "%Y")
                endts = datetime(startts.year + 1, 1, 1)
                start = dates.format(startts)
                end = dates.format(endts)

            ranges.append((start, end))

        apps = []
        for row in desired_output:
            country = row[0]
            del row[0]
            for i in range(len(row)):
                count = row[i]
                start, end = ranges[i]
                for j in range(count):
                    s = Suggestion()
                    s.set_created(dates.random_date(start, end))
                    s.bibjson().country = country
                    apps.append(s)

        return apps
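
A minimal usage sketch with illustrative data; the class name is hypothetical. Rows are keyed by country rather than user, and each count becomes one Suggestion created at a random date inside the period.

desired_output = [
    ["Country", "2016-01", "2016-02"],
    ["FR", 1, 2],
    ["DE", 0, 4],
]
apps = ApplicationFixtureFactory.make_application_spread(desired_output, "month")  # hypothetical class name
assert len(apps) == 7
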
Example #7
    def run(self):
        """
        Execute the task as specified by the background_job
        :return:
        """
        job = self.background_job

        # Connection to the ES index
        conn = Connection(app.config.get("ELASTIC_SEARCH_HOST"), index='_snapshot')

        snap_ttl = app.config.get('ELASTIC_SEARCH_SNAPSHOT_TTL', 366)
        snap_thresh = datetime.utcnow() - timedelta(days=snap_ttl)
        job.add_audit_message('Deleting backups older than {}'.format(dates.format(snap_thresh)))

        client = ESSnapshotsClient(conn, app.config['ELASTIC_SEARCH_SNAPSHOT_REPOSITORY'])
        client.prune_snapshots(snap_ttl, self.report_deleted_closure(job))
Example #8
    def run(self):
        """
        Execute the task as specified by the background_job
        :return:
        """
        job = self.background_job

        # Connection to the ES index
        conn = Connection(app.config.get("ELASTIC_SEARCH_HOST"),
                          index='_snapshot')

        snap_ttl = app.config.get('ELASTIC_SEARCH_SNAPSHOT_TTL', 366)
        snap_thresh = datetime.utcnow() - timedelta(days=snap_ttl)
        job.add_audit_message('Deleting backups older than {}'.format(
            dates.format(snap_thresh)))

        client = ESSnapshotsClient(
            conn, app.config['ELASTIC_SEARCH_SNAPSHOT_REPOSITORY'])
        client.prune_snapshots(snap_ttl, self.report_deleted_closure(job))
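
For reference, a sketch of the settings this task reads. The key names come from the code above; the values are illustrative, and only ELASTIC_SEARCH_SNAPSHOT_TTL has a built-in default (366 days) in the task itself.

ELASTIC_SEARCH_HOST = "http://localhost:9200"        # illustrative
ELASTIC_SEARCH_SNAPSHOT_TTL = 366                     # days of snapshots to keep
ELASTIC_SEARCH_SNAPSHOT_REPOSITORY = "doaj-backups"   # hypothetical repository name
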
Example #9
def status():
    res = {
        'stable': True,
        'ping': {
            'apps': {},
            'indices': {}
        },
        'background': {
            'status': 'Background jobs are stable',
            'info': []
        },
        'notes': []
    }

    # to get monitoring on this, use uptime robot or similar to check that the status page
    # contains the 'stable': True string and the following note strings

    app_note = 'apps reachable'
    app_unreachable = 0
    inodes_note = 'inode use on app machines below 95%'
    inodes_high = 0
    writable_note = 'app machines can write to disk'
    not_writable = 0
    #disk_note = 'disk use on app machines below 95%'
    #disk_high = 0
    #memory_note = 'memory use on app machines below 95%'
    #memory_high = 0
    es_note = 'indexes stable'
    es_unreachable = 0
    indexable_note = 'index accepts index/delete operations'
    cluster_note = 'cluster stable'

    for addr in app.config.get('APP_MACHINES_INTERNAL_IPS', []):
        if not addr.startswith('http'): addr = 'http://' + addr
        addr += url_for('.stats')
        r = requests.get(addr)
        res['ping']['apps'][addr] = r.status_code if r.status_code != 200 else r.json()
        try:
            if res['ping']['apps'][addr].get('inode_used_pc', 0) >= 95:
                inodes_high += 1
                inodes_note = 'INODE GREATER THAN 95% ON ' + str(inodes_high) + ' APP MACHINES'
            if res['ping']['apps'][addr].get('writable', False) != True:
                not_writable += 1
                writable_note = 'WRITE FAILURE ON ' + str(not_writable) + ' APP MACHINES'
            #if res['ping']['apps'][addr].get('disk_used_pc',0) >= 95:
            #    disk_high += 1
            #    disk_note = 'DISK USE GREATER THAN 95% ON ' + disk_high + ' APP MACHINES'
            #if res['ping']['apps'][addr].get('memory_used_pc',0) >= 95:
            #    memory_high += 1
            #    memory_note = 'MEMORY USE GREATER THAN 95% ON ' + memory_high + ' APP MACHINES'
        except:
            pass
        if r.status_code != 200:
            res['stable'] = False
            app_unreachable += 1
            app_note = str(app_unreachable) + ' APPS UNREACHABLE'
    res['notes'].append(app_note)
    res['notes'].append(inodes_note)
    res['notes'].append(writable_note)
    #res['notes'].append(disk_note)
    #res['notes'].append(memory_note)

    # check that all necessary ES nodes can actually be pinged from this machine
    es_hosts = app.config['ELASTIC_SEARCH_HOST']
    for eddr in ([es_hosts] if isinstance(es_hosts, str) else es_hosts):
        if not eddr.startswith('http'): eddr = 'http://' + eddr
        if not eddr.endswith(':9200'): eddr += ':9200'
        r = requests.get(eddr)
        res['ping']['indices'][eddr] = r.status_code
        if r.status_code != 200:
            res['stable'] = False
            es_unreachable += 1
            es_note = str(es_unreachable) + ' INDEXES UNREACHABLE'
    res['notes'].append(es_note)

    # query ES for cluster health and nodes up
    es_addr = str(app.config['ELASTIC_SEARCH_HOST'][0]
                  if not isinstance(app.config['ELASTIC_SEARCH_HOST'], str)
                  else app.config['ELASTIC_SEARCH_HOST']).rstrip('/')
    if not es_addr.startswith('http'): es_addr = 'http://' + es_addr
    if not es_addr.endswith(':9200'): es_addr += ':9200'
    try:
        es = requests.get(es_addr + '/_status').json()
        res['index'] = {
            'cluster': {},
            'shards': {
                'total': es['_shards']['total'],
                'successful': es['_shards']['successful']
            },
            'indices': {}
        }
        for k, v in es['indices'].items():
            res['index']['indices'][k] = {
                'docs': v['docs']['num_docs'],
                'size': int(math.ceil(v['index']['primary_size_in_bytes']) / 1024 / 1024)
            }
        try:
            ces = requests.get(es_addr + '/_cluster/health')
            res['index']['cluster'] = ces.json()
            if res['index']['cluster']['status'] != 'green':
                res['stable'] = False
                cluster_note = 'CLUSTER UNSTABLE'
        except:
            res['stable'] = False
            cluster_note = 'CLUSTER UNSTABLE'
    except:
        res['stable'] = False
        cluster_note = 'CLUSTER UNSTABLE'
    res['notes'].append(cluster_note)

    if False:  # remove this False if happy to test write to the index (could be a setting)
        if res['stable'] and False:
            try:
                ts = str(int(time.time()))
                test_index = 'status_test_writable_' + ts
                test_type = 'test_' + ts
                test_id = ts
                rp = requests.put(es_addr + '/' + test_index + '/' +
                                  test_type + '/' + test_id,
                                  json={'hello': 'world'})
                if rp.status_code != 201:
                    indexable_note = 'NEW INDEX WRITE OPERATION FAILED TO WRITE, RETURNED ' + str(rp.status_code)
                else:
                    try:
                        rr = requests.get(es_addr + '/' + test_index + '/' +
                                          test_type + '/' + test_id).json()
                        if rr['hello'] != 'world':
                            indexable_note = 'INDEX READ DID NOT FIND EXPECTED VALUE IN NEW WRITTEN RECORD'
                        try:
                            rd = requests.delete(es_addr + '/' + test_index)
                            if rd.status_code != 200:
                                indexable_note = 'INDEX DELETE OF TEST INDEX RETURNED UNEXPECTED STATUS CODE OF ' + str(rd.status_code)
                            try:
                                rg = requests.get(es_addr + '/' + test_index)
                                if rg.status_code != 404:
                                    indexable_note = 'INDEX READ AFTER DELETE TEST RETURNED UNEXPECTED STATUS CODE OF ' + str(rg.status_code)
                            except:
                                pass
                        except:
                            indexable_note = 'INDEX DELETE OF TEST INDEX FAILED'
                    except:
                        indexable_note = 'INDEX READ OF NEW WRITTEN RECORD DID NOT SUCCEED'
            except:
                indexable_note = 'INDEX/DELETE OPERATIONS CAUSED EXCEPTION'
        else:
            indexable_note = 'INDEX/DELETE OPERATIONS NOT TESTED DUE TO SYSTEM ALREADY UNSTABLE'
        res['notes'].append(indexable_note)

    # check background jobs
    try:
        # check if journal_csv, which should run at half past every hour on the main queue, has completed in the last 2 hours (which confirms main queue)
        qcsv = {
            "query": {"bool": {"must": [
                {"term": {"status": "complete"}},
                {"term": {"action": "journal_csv"}},
                {"range": {"created_date": {"gte": dates.format(dates.before(datetime.utcnow(), 7200))}}}
            ]}},
            "size": 1,
            "sort": {"created_date": {"order": "desc"}}
        }
        rcsv = models.BackgroundJob.send_query(qcsv)['hits']['hits'][0]['_source']
        res['background']['info'].append('journal_csv has run in the last 2 hours, confirming main queue is running')
    except:
        res['background']['status'] = 'Unstable'
        res['background']['info'].append('Error when trying to check background job journal_csv in the last 2 hours - could be a problem with this job or with main queue')
        res['stable'] = False
    try:
        # check if prune_es_backups, which should run at 9.30am every day, has completed in the last 24 hours (which confirms long running queue)
        qprune = {
            "query": {"bool": {"must": [
                {"term": {"status": "complete"}},
                {"term": {"action": "prune_es_backups"}},
                {"range": {"created_date": {"gte": dates.format(dates.before(datetime.utcnow(), 86400))}}}
            ]}},
            "size": 1,
            "sort": {"created_date": {"order": "desc"}}
        }
        rprune = models.BackgroundJob.send_query(qprune)['hits']['hits'][0]['_source']
        res['background']['info'].append('prune_es_backups has run in the last 24 hours, confirming long running queue is running')
    except:
        res['background']['status'] = 'Unstable'
        res['background']['info'].append('Error when trying to check background job prune_es_backups in the last 24 hours - could be a problem with this job or with long running queue')
        res['stable'] = False
    # try:         #fixme: commented out by SE - this isn't working well, it should probably be a background task itself
    #     # remove old jobs if there are too many - remove anything over six months and complete
    #     old_seconds = app.config.get("STATUS_OLD_REMOVE_SECONDS", 15552000)
    #     qbg = {"query": {"bool": {"must": [
    #         {"term": {"status": "complete"}},
    #         {"range": {"created_date": {"lte": dates.format(dates.before(datetime.utcnow(), old_seconds))}}}
    #     ]}}, "size": 10000, "sort": {"created_date": {"order": "desc"}}, "fields": "id"}
    #     rbg = models.BackgroundJob.send_query(qbg)
    #     for job in rbg.get('hits', {}).get('hits', []):
    #         models.BackgroundJob.remove_by_id(job['fields']['id'][0])
    #     res['background']['info'].append('Removed {0} old complete background jobs'.format(rbg.get('hits', {}).get('total', 0)))
    # except:
    #     res['background']['status'] = 'Unstable'
    #     res['background']['info'].append('Error when trying to remove old background jobs')
    #     res['stable'] = False
    try:
        # alert about errors in the last ten minutes - assuming we are going to use uptimerobot to check this every ten minutes
        error_seconds = app.config.get("STATUS_ERROR_CHECK_SECONDS", 600)
        error_ignore = app.config.get("STATUS_ERROR_IGNORE", [])  # configure a list of strings that denote something to ignore
        error_ignore = [error_ignore] if isinstance(error_ignore, str) else error_ignore
        error_ignore_fields = app.config.get("STATUS_ERROR_IGNORE_FIELDS_TO_CHECK", False)  # which fields to get in the query, to check for the strings provided above
        error_ignore_fields = [error_ignore_fields] if isinstance(error_ignore_fields, str) else error_ignore_fields
        error_means_unstable = app.config.get("STATUS_ERROR_MEANS_UNSTABLE", True)
        qer = {
            "query": {"bool": {"must": [
                {"term": {"status": "error"}},
                {"range": {"created_date": {"gte": dates.format(dates.before(datetime.utcnow(), error_seconds))}}}
            ]}},
            "size": 10000,
            "sort": {"created_date": {"order": "desc"}}
        }  # this could be customised with a fields list if we only want to check certain fields for ignore types
        if error_ignore_fields != False:
            qer["fields"] = error_ignore_fields
        rer = models.BackgroundJob.send_query(qer)
        error_count = 0
        for job in rer.get('hits', {}).get('hits', []):
            countable = True
            jsj = json.dumps(job)
            for ig in error_ignore:
                if ig in jsj:
                    countable = False
                    break
            if countable:
                error_count += 1
        if error_count != 0:
            res['background']['status'] = 'Unstable'
            res['background']['info'].append('Background jobs are causing errors')
            res['stable'] = error_means_unstable
        emsg = 'Found {0} background jobs in error status in the last {1} seconds'.format(error_count, error_seconds)
        if len(error_ignore) != 0:
            emsg += '. Ignoring ' + ', '.join(error_ignore) + ' which reduced the error count from ' + str(rer.get('hits', {}).get('total', 0))
        res['background']['info'].append(emsg)
    except:
        res['background']['status'] = 'Unstable'
        res['background']['info'].append('Error when trying to check background jobs for errors')
        res['stable'] = False

    resp = make_response(json.dumps(res))
    resp.mimetype = "application/json"
    return resp
Example #10
    def run(self):
        job = self.background_job
        params = job.params

        # Set up the files we need to run this task - a dir to place the report, and a place to write the article csv
        outdir = self.get_param(params, "outdir",
                                "article_duplicates_" + dates.today())
        job.add_audit_message("Saving reports to " + outdir)
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        # Location for our interim CSV file of articles
        tmpdir = self.get_param(params, "tmpdir",
                                'tmp_article_duplicate_report')
        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)

        tmp_csvname = self.get_param(params, "article_csv", False)
        tmp_csvpath, total = self._make_csv_dump(tmpdir, tmp_csvname)

        # Initialise our reports
        global_reportfile = 'duplicate_articles_global_' + dates.today() + '.csv'
        global_reportpath = os.path.join(outdir, global_reportfile)
        f = open(global_reportpath, "w", encoding="utf-8")
        global_report = csv.writer(f)
        header = [
            "article_id", "article_created", "article_doi", "article_fulltext",
            "article_owner", "article_issns", "article_in_doaj", "n_matches",
            "match_type", "match_id", "match_created", "match_doi",
            "match_fulltext", "match_owner", "match_issns", "match_in_doaj",
            "owners_match", "titles_match", "article_title", "match_title"
        ]
        global_report.writerow(header)

        noids_reportfile = 'noids_' + dates.today() + '.csv'
        noids_reportpath = os.path.join(outdir, noids_reportfile)
        g = open(noids_reportpath, "w", encoding="utf-8")
        noids_report = csv.writer(g)
        header = [
            "article_id", "article_created", "article_owner", "article_issns",
            "article_in_doaj"
        ]
        noids_report.writerow(header)

        # Record the sets of duplicated articles
        global_matches = []

        a_count = 0

        articleService = DOAJ.articleService()

        # Read back in the article csv file we created earlier
        with open(tmp_csvpath, 'r', encoding='utf-8') as t:
            article_reader = csv.reader(t)

            start = datetime.now()
            estimated_finish = ""
            for a in article_reader:
                if a_count > 1 and a_count % 100 == 0:
                    n = datetime.now()
                    diff = (n - start).total_seconds()
                    expected_total = ((diff / a_count) * total)
                    estimated_finish = dates.format(
                        dates.after(start, expected_total))
                a_count += 1

                article = models.Article(
                    _source={
                        'id': a[0],
                        'created_date': a[1],
                        'bibjson': {
                            'identifier': json.loads(a[2]),
                            'link': json.loads(a[3]),
                            'title': a[4]
                        },
                        'admin': {
                            'in_doaj': json.loads(a[5])
                        }
                    })

                # Get the global duplicates
                try:
                    global_duplicates = articleService.discover_duplicates(
                        article,
                        results_per_match_type=10000,
                        include_article=False)
                except exceptions.DuplicateArticleException:
                    # this means the article did not have any ids that could be used for deduplication
                    owner = self._lookup_owner(article)
                    noids_report.writerow([
                        article.id, article.created_date, owner,
                        ','.join(article.bibjson().issns()),
                        article.is_in_doaj()
                    ])
                    continue

                dupcount = 0
                if global_duplicates:

                    # Look up an article's owner
                    owner = self._lookup_owner(article)

                    # Deduplicate the DOI and fulltext duplicate lists
                    s = set([article.id] + [
                        d.id for d in global_duplicates.get('doi', []) +
                        global_duplicates.get('fulltext', [])
                    ])
                    # remove article's own id from global_duplicates
                    dupcount = len(s) - 1
                    if s not in global_matches:
                        self._write_rows_from_duplicates(
                            article, owner, global_duplicates, global_report)
                        global_matches.append(s)

                app.logger.debug('{0}/{1} {2} {3} {4} {5}'.format(
                    a_count, total, article.id, dupcount, len(global_matches),
                    estimated_finish))

        job.add_audit_message(
            '{0} articles processed for duplicates. {1} global duplicate sets found.'
            .format(a_count, len(global_matches)))
        f.close()
        g.close()

        # Delete the transient temporary files.
        shutil.rmtree(tmpdir)

        # Email the reports if that parameter has been set.
        send_email = self.get_param(params, "email", False)
        if send_email:
            archive_name = "article_duplicates_" + dates.today()
            email_archive(outdir, archive_name)
            job.add_audit_message("email alert sent")
        else:
            job.add_audit_message("no email alert sent")
Example #11
    def run(self):
        job = self.background_job
        params = job.params

        # Set up the files we need to run this task - a dir to place the report, and a place to write the article csv
        outdir = self.get_param(params, "outdir", "article_duplicates_" + dates.today())
        job.add_audit_message("Saving reports to " + outdir)
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        # Location for our interim CSV file of articles
        tmpdir = self.get_param(params, "tmpdir", 'tmp_article_duplicate_report')
        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)

        tmp_csvname = self.get_param(params, "article_csv", False)
        tmp_csvpath, total = self._make_csv_dump(tmpdir, tmp_csvname)

        # Initialise our reports
        global_reportfile = 'duplicate_articles_global_' + dates.today() + '.csv'
        global_reportpath = os.path.join(outdir, global_reportfile)
        f = codecs.open(global_reportpath, "wb", "utf-8")
        global_report = UnicodeWriter(f)
        header = ["article_id", "article_created", "article_doi", "article_fulltext", "article_owner", "article_issns", "article_in_doaj", "n_matches", "match_type", "match_id", "match_created", "match_doi", "match_fulltext", "match_owner", "match_issns", "match_in_doaj", "owners_match", "titles_match", "article_title", "match_title"]
        global_report.writerow(header)

        noids_reportfile = 'noids_' + dates.today() + '.csv'
        noids_reportpath = os.path.join(outdir, noids_reportfile)
        g = codecs.open(noids_reportpath, "wb", "utf-8")
        noids_report = UnicodeWriter(g)
        header = ["article_id", "article_created", "article_owner", "article_issns", "article_in_doaj"]
        noids_report.writerow(header)

        # Record the sets of duplicated articles
        global_matches = []

        a_count = 0

        articleService = DOAJ.articleService()

        # Read back in the article csv file we created earlier
        with codecs.open(tmp_csvpath, 'rb', 'utf-8') as t:
            article_reader = UnicodeReader(t)

            start = datetime.now()
            estimated_finish = ""
            for a in article_reader:
                if a_count > 1 and a_count % 100 == 0:
                    n = datetime.now()
                    diff = (n - start).total_seconds()
                    expected_total = ((diff / a_count) * total)
                    estimated_finish = dates.format(dates.after(start, expected_total))
                a_count += 1

                article = models.Article(_source={'id': a[0], 'created_date': a[1], 'bibjson': {'identifier': json.loads(a[2]), 'link': json.loads(a[3]), 'title': a[4]}, 'admin': {'in_doaj': json.loads(a[5])}})

                # Get the global duplicates
                try:
                    global_duplicates = articleService.discover_duplicates(article, owner=None, results_per_match_type=10000)
                except exceptions.DuplicateArticleException:
                    # this means the article did not have any ids that could be used for deduplication
                    owner = self._lookup_owner(article)
                    noids_report.writerow([article.id, article.created_date, owner, ','.join(article.bibjson().issns()), article.is_in_doaj()])
                    continue

                dupcount = 0
                if global_duplicates:

                    # Look up an article's owner
                    owner = self._lookup_owner(article)

                    # Deduplicate the DOI and fulltext duplicate lists
                    s = set([article.id] + [d.id for d in global_duplicates.get('doi', []) + global_duplicates.get('fulltext', [])])
                    dupcount = len(s) - 1
                    if s not in global_matches:
                        self._write_rows_from_duplicates(article, owner, global_duplicates, global_report)
                        global_matches.append(s)

                app.logger.debug('{0}/{1} {2} {3} {4} {5}'.format(a_count, total, article.id, dupcount, len(global_matches), estimated_finish))

        job.add_audit_message('{0} articles processed for duplicates. {1} global duplicate sets found.'.format(a_count, len(global_matches)))
        f.close()
        g.close()

        # Delete the transient temporary files.
        shutil.rmtree(tmpdir)

        # Email the reports if that parameter has been set.
        send_email = self.get_param(params, "email", False)
        if send_email:
            archive_name = "article_duplicates_" + dates.today()
            email_archive(outdir, archive_name)
            job.add_audit_message("email alert sent")
        else:
            job.add_audit_message("no email alert sent")