@classmethod
def make_status_spread(cls, desired_output, period, role_map):
    """ Build Provenance fixtures whose status actions match the desired per-period counts. """
    desired_output = deepcopy(desired_output)
    header = desired_output[0]
    del desired_output[0]
    del header[0]    # the first header cell is a label, not a period

    # Convert each period header ("YYYY-MM" or "YYYY") into a (start, end) date range
    ranges = []
    for h in header:
        start = None
        end = None
        if period == "month":
            startts = dates.parse(h, "%Y-%m")
            year, month = divmod(startts.month + 1, 12)
            if month == 0:
                month = 12
                year = year - 1
            endts = datetime(startts.year + year, month, 1)
            start = dates.format(startts)
            end = dates.format(endts)
        elif period == "year":
            startts = dates.parse(h, "%Y")
            endts = datetime(startts.year + 1, 1, 1)
            start = dates.format(startts)
            end = dates.format(endts)
        ranges.append((start, end))

    # For each user row, create `count` Provenance records per period, with a status action
    # appropriate to the user's role
    provs = []
    for row in desired_output:
        user = row[0]
        del row[0]
        role = role_map[user]
        for i in range(len(row)):
            count = row[i]
            start, end = ranges[i]
            status = None
            if role == "associate_editor":
                status = constants.APPLICATION_STATUS_COMPLETED
            elif role == "editor":
                status = constants.APPLICATION_STATUS_READY
            elif role == "admin":
                status = ADMIN_STATUSES[randint(0, len(ADMIN_STATUSES) - 1)]
            for j in range(count):
                p = Provenance()
                p.set_created(dates.random_date(start, end))
                p.user = user
                p.roles = [role]
                p.type = "suggestion"
                p.action = "status:" + status
                p.resource_id = uuid.uuid4().hex
                provs.append(p)

    return provs
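# A minimal usage sketch for make_status_spread (illustrative only, not taken from the source).
# Assumption: the method lives on a fixture class, referred to here as ProvenanceFixtureFactory.
# The first header cell and the first cell of each row are labels, which the method discards.
def _example_status_spread():
    desired_output = [
        ["User", "2015-01", "2015-02"],     # period="month", so headers are %Y-%m strings
        ["eddie", 2, 0],
        ["andy", 1, 3],
    ]
    role_map = {"eddie": "editor", "andy": "admin"}
    provs = ProvenanceFixtureFactory.make_status_spread(desired_output, "month", role_map)
    return provs    # 6 Provenance objects, each dated randomly within its column's month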
def type_map(t):
    # Map a DataObj type name onto an example JSON-compatible value: timestamps and datestamps
    # get a concrete current value, everything else gets its JSON type name
    json_type = DO_TYPE_TO_JSON_TYPE.get(t, "string")
    if json_type == "timestamp":
        return dates.now()
    elif json_type == "datestamp":
        return dates.format(datetime.utcnow(), "%Y-%m-%d")
    return json_type
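# Behaviour sketch for type_map (illustrative). Only the fallback branch is shown, since the keys
# of DO_TYPE_TO_JSON_TYPE are not visible in this snippet; the input is a hypothetical type name.
print(type_map("no_such_type"))    # unmapped types fall back to the JSON type name "string"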
def datify(val):
    # Normalise a value to a date string in out_format: empty values pass through as None,
    # date/datetime objects are formatted directly, and strings are reparsed from in_format
    if val is None or val == "":
        return None
    if isinstance(val, (date, datetime)):
        return dates.format(val, format=out_format)
    else:
        return dates.reformat(val, in_format=in_format, out_format=out_format)
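# A small usage sketch for datify (illustrative only). datify reads in_format / out_format from
# its enclosing scope, so the example defines them locally; both format strings are assumptions.
from datetime import datetime

in_format = "%Y-%m-%dT%H:%M:%SZ"
out_format = "%d/%m/%Y"

assert datify(None) is None and datify("") is None    # empty values pass through
print(datify(datetime(2019, 1, 31)))                  # formatted with out_format, e.g. 31/01/2019
print(datify("2019-01-31T00:00:00Z"))                 # parsed with in_format, re-emitted in out_format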
@classmethod
def make_action_spread(cls, desired_output, action, period):
    """ Build Provenance fixtures for a single action, spread across the desired per-period counts. """
    desired_output = deepcopy(desired_output)
    header = desired_output[0]
    del desired_output[0]
    del header[0]    # the first header cell is a label, not a period

    # Convert each period header ("YYYY-MM" or "YYYY") into a (start, end) date range
    ranges = []
    for h in header:
        start = None
        end = None
        if period == "month":
            startts = dates.parse(h, "%Y-%m")
            year, month = divmod(startts.month + 1, 12)
            if month == 0:
                month = 12
                year = year - 1
            endts = datetime(startts.year + year, month, 1)
            start = dates.format(startts)
            end = dates.format(endts)
        elif period == "year":
            startts = dates.parse(h, "%Y")
            endts = datetime(startts.year + 1, 1, 1)
            start = dates.format(startts)
            end = dates.format(endts)
        ranges.append((start, end))

    # For each user row, create `count` Provenance records per period with the given action
    provs = []
    for row in desired_output:
        user = row[0]
        del row[0]
        for i in range(len(row)):
            count = row[i]
            start, end = ranges[i]
            for j in range(count):
                p = Provenance()
                p.set_created(dates.random_date(start, end))
                p.user = user
                p.type = "suggestion"
                p.action = action
                p.resource_id = uuid.uuid4().hex
                provs.append(p)

    return provs
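# A minimal usage sketch for make_action_spread (illustrative only). As above, the fixture class
# name ProvenanceFixtureFactory is an assumption; the action string is arbitrary.
def _example_action_spread():
    desired_output = [
        ["User", "2014", "2015"],           # period="year", so headers are %Y strings
        ["clare", 3, 1],
    ]
    provs = ProvenanceFixtureFactory.make_action_spread(desired_output, "edit", "year")
    return provs    # 4 Provenance objects whose action is "edit", dated within the matching year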
@classmethod
def make_application_spread(cls, desired_output, period):
    """ Build Suggestion fixtures per country, spread across the desired per-period counts. """
    desired_output = deepcopy(desired_output)
    header = desired_output[0]
    del desired_output[0]
    del header[0]    # the first header cell is a label, not a period

    # Convert each period header ("YYYY-MM" or "YYYY") into a (start, end) date range
    ranges = []
    for h in header:
        start = None
        end = None
        if period == "month":
            startts = dates.parse(h, "%Y-%m")
            year, month = divmod(startts.month + 1, 12)
            if month == 0:
                month = 12
                year = year - 1
            endts = datetime(startts.year + year, month, 1)
            start = dates.format(startts)
            end = dates.format(endts)
        elif period == "year":
            startts = dates.parse(h, "%Y")
            endts = datetime(startts.year + 1, 1, 1)
            start = dates.format(startts)
            end = dates.format(endts)
        ranges.append((start, end))

    # For each country row, create `count` Suggestion records per period
    apps = []
    for row in desired_output:
        country = row[0]
        del row[0]
        for i in range(len(row)):
            count = row[i]
            start, end = ranges[i]
            for j in range(count):
                s = Suggestion()
                s.set_created(dates.random_date(start, end))
                s.bibjson().country = country
                apps.append(s)

    return apps
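# A minimal usage sketch for make_application_spread (illustrative only). The fixture class name
# ApplicationFixtureFactory is an assumption; rows are keyed by country code rather than user.
def _example_application_spread():
    desired_output = [
        ["Country", "2016-05", "2016-06"],
        ["FR", 1, 2],
        ["DE", 0, 4],
    ]
    apps = ApplicationFixtureFactory.make_application_spread(desired_output, "month")
    return apps    # 7 Suggestion objects with the given countries, dated within the requested months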
def run(self): """ Execute the task as specified by the background_job :return: """ job = self.background_job # Connection to the ES index conn = Connection(app.config.get("ELASTIC_SEARCH_HOST"), index='_snapshot') snap_ttl = app.config.get('ELASTIC_SEARCH_SNAPSHOT_TTL', 366) snap_thresh = datetime.utcnow() - timedelta(days=snap_ttl) job.add_audit_message('Deleting backups older than {}'.format(dates.format(snap_thresh))) client = ESSnapshotsClient(conn, app.config['ELASTIC_SEARCH_SNAPSHOT_REPOSITORY']) client.prune_snapshots(snap_ttl, self.report_deleted_closure(job))
def run(self): """ Execute the task as specified by the background_job :return: """ job = self.background_job # Connection to the ES index conn = Connection(app.config.get("ELASTIC_SEARCH_HOST"), index='_snapshot') snap_ttl = app.config.get('ELASTIC_SEARCH_SNAPSHOT_TTL', 366) snap_thresh = datetime.utcnow() - timedelta(days=snap_ttl) job.add_audit_message('Deleting backups older than {}'.format( dates.format(snap_thresh))) client = ESSnapshotsClient( conn, app.config['ELASTIC_SEARCH_SNAPSHOT_REPOSITORY']) client.prune_snapshots(snap_ttl, self.report_deleted_closure(job))
def status():
    res = {
        'stable': True,
        'ping': {'apps': {}, 'indices': {}},
        'background': {'status': 'Background jobs are stable', 'info': []},
        'notes': []
    }

    # to get monitoring on this, use uptime robot or similar to check that the status page
    # contains the 'stable': True string and the following note strings
    app_note = 'apps reachable'
    app_unreachable = 0
    inodes_note = 'inode use on app machines below 95%'
    inodes_high = 0
    writable_note = 'app machines can write to disk'
    not_writable = 0
    #disk_note = 'disk use on app machines below 95%'
    #disk_high = 0
    #memory_note = 'memory use on app machines below 95%'
    #memory_high = 0
    es_note = 'indexes stable'
    es_unreachable = 0
    indexable_note = 'index accepts index/delete operations'
    cluster_note = 'cluster stable'

    for addr in app.config.get('APP_MACHINES_INTERNAL_IPS', []):
        if not addr.startswith('http'):
            addr = 'http://' + addr
        addr += url_for('.stats')
        r = requests.get(addr)
        res['ping']['apps'][addr] = r.status_code if r.status_code != 200 else r.json()
        try:
            if res['ping']['apps'][addr].get('inode_used_pc', 0) >= 95:
                inodes_high += 1
                inodes_note = 'INODE GREATER THAN 95% ON ' + str(inodes_high) + ' APP MACHINES'
            if res['ping']['apps'][addr].get('writable', False) != True:
                not_writable += 1
                writable_note = 'WRITE FAILURE ON ' + str(not_writable) + ' APP MACHINES'
            #if res['ping']['apps'][addr].get('disk_used_pc', 0) >= 95:
            #    disk_high += 1
            #    disk_note = 'DISK USE GREATER THAN 95% ON ' + disk_high + ' APP MACHINES'
            #if res['ping']['apps'][addr].get('memory_used_pc', 0) >= 95:
            #    memory_high += 1
            #    memory_note = 'MEMORY USE GREATER THAN 95% ON ' + memory_high + ' APP MACHINES'
        except:
            pass
        if r.status_code != 200:
            res['stable'] = False
            app_unreachable += 1
            app_note = str(app_unreachable) + ' APPS UNREACHABLE'
    res['notes'].append(app_note)
    res['notes'].append(inodes_note)
    res['notes'].append(writable_note)
    #res['notes'].append(disk_note)
    #res['notes'].append(memory_note)

    # check that all necessary ES nodes can actually be pinged from this machine
    for eddr in [app.config['ELASTIC_SEARCH_HOST']] if isinstance(app.config['ELASTIC_SEARCH_HOST'], str) else app.config['ELASTIC_SEARCH_HOST']:
        if not eddr.startswith('http'):
            eddr = 'http://' + eddr
        if not eddr.endswith(':9200'):
            eddr += ':9200'
        r = requests.get(eddr)
        res['ping']['indices'][eddr] = r.status_code
        res['stable'] = r.status_code == 200
        if r.status_code != 200:
            res['stable'] = False
            es_unreachable += 1
            es_note = str(es_unreachable) + ' INDEXES UNREACHABLE'
    res['notes'].append(es_note)

    # query ES for cluster health and nodes up
    es_addr = str(app.config['ELASTIC_SEARCH_HOST'][0] if not isinstance(app.config['ELASTIC_SEARCH_HOST'], str) else app.config['ELASTIC_SEARCH_HOST']).rstrip('/')
    if not es_addr.startswith('http'):
        es_addr = 'http://' + es_addr
    if not es_addr.endswith(':9200'):
        es_addr += ':9200'
    try:
        es = requests.get(es_addr + '/_status').json()
        res['index'] = {
            'cluster': {},
            'shards': {
                'total': es['_shards']['total'],
                'successful': es['_shards']['successful']
            },
            'indices': {}
        }
        for k, v in es['indices'].items():
            res['index']['indices'][k] = {
                'docs': v['docs']['num_docs'],
                'size': int(math.ceil(v['index']['primary_size_in_bytes']) / 1024 / 1024)
            }
        try:
            ces = requests.get(es_addr + '/_cluster/health')
            res['index']['cluster'] = ces.json()
            res['stable'] = res['index']['cluster']['status'] == 'green'
            if res['index']['cluster']['status'] != 'green':
                cluster_note = 'CLUSTER UNSTABLE'
        except:
            res['stable'] = False
            cluster_note = 'CLUSTER UNSTABLE'
    except:
        res['stable'] = False
        cluster_note = 'CLUSTER UNSTABLE'
    res['notes'].append(cluster_note)

    if False:  # remove this False if happy to test write to the index (could be a setting)
        if res['stable'] and False:
            try:
                ts = str(int(time.time()))
                test_index = 'status_test_writable_' + ts
                test_type = 'test_' + ts
                test_id = ts
                rp = requests.put(es_addr + '/' + test_index + '/' + test_type + '/' + test_id, json={'hello': 'world'})
                if rp.status_code != 201:
                    indexable_note = 'NEW INDEX WRITE OPERATION FAILED TO WRITE, RETURNED ' + str(rp.status_code)
                else:
                    try:
                        rr = requests.get(es_addr + '/' + test_index + '/' + test_type + '/' + test_id).json()
                        if rr['hello'] != 'world':
                            indexable_note = 'INDEX READ DID NOT FIND EXPECTED VALUE IN NEW WRITTEN RECORD'
                        try:
                            rd = requests.delete(es_addr + '/' + test_index)
                            if rd.status_code != 200:
                                indexable_note = 'INDEX DELETE OF TEST INDEX RETURNED UNEXPECTED STATUS CODE OF ' + str(rd.status_code)
                            try:
                                rg = requests.get(es_addr + '/' + test_index)
                                if rg.status_code != 404:
                                    indexable_note = 'INDEX READ AFTER DELETE TEST RETURNED UNEXPECTED STATUS CODE OF ' + str(rg.status_code)
                            except:
                                pass
                        except:
                            indexable_note = 'INDEX DELETE OF TEST INDEX FAILED'
                    except:
                        indexable_note = 'INDEX READ OF NEW WRITTEN RECORD DID NOT SUCCEED'
            except:
                indexable_note = 'INDEX/DELETE OPERATIONS CAUSED EXCEPTION'
        else:
            indexable_note = 'INDEX/DELETE OPERATIONS NOT TESTED DUE TO SYSTEM ALREADY UNSTABLE'
        res['notes'].append(indexable_note)

    # check background jobs
    try:
        # check if journal_csv, which should run at half past every hour on the main queue,
        # has completed in the last 2 hours (which confirms main queue)
        qcsv = {
            "query": {
                "bool": {
                    "must": [
                        {"term": {"status": "complete"}},
                        {"term": {"action": "journal_csv"}},
                        {"range": {"created_date": {"gte": dates.format(dates.before(datetime.utcnow(), 7200))}}}
                    ]
                }
            },
            "size": 1,
            "sort": {"created_date": {"order": "desc"}}
        }
        # raises if there are no hits, which drops us into the except branch below
        rcsv = models.BackgroundJob.send_query(qcsv)['hits']['hits'][0]['_source']
        res['background']['info'].append('journal_csv has run in the last 2 hours, confirming main queue is running')
    except:
        res['background']['status'] = 'Unstable'
        res['background']['info'].append('Error when trying to check background job journal_csv in the last 2 hours - could be a problem with this job or with main queue')
        res['stable'] = False

    try:
        # check if prune_es_backups, which should run at 9.30am every day, has completed in the
        # last 24 hours (which confirms long running queue)
        qprune = {
            "query": {
                "bool": {
                    "must": [
                        {"term": {"status": "complete"}},
                        {"term": {"action": "prune_es_backups"}},
                        {"range": {"created_date": {"gte": dates.format(dates.before(datetime.utcnow(), 86400))}}}
                    ]
                }
            },
            "size": 1,
            "sort": {"created_date": {"order": "desc"}}
        }
        # raises if there are no hits, which drops us into the except branch below
        rprune = models.BackgroundJob.send_query(qprune)['hits']['hits'][0]['_source']
        res['background']['info'].append('prune_es_backups has run in the last 24 hours, confirming long running queue is running')
    except:
        res['background']['status'] = 'Unstable'
        res['background']['info'].append('Error when trying to check background job prune_es_backups in the last 24 hours - could be a problem with this job or with long running queue')
        res['stable'] = False

    # try:  #fixme: commented out by SE - this isn't working well, it should probably be a background task itself
    #     # remove old jobs if there are too many - remove anything over six months and complete
    #     old_seconds = app.config.get("STATUS_OLD_REMOVE_SECONDS", 15552000)
    #     qbg = {"query": {"bool": {"must": [
    #         {"term": {"status": "complete"}},
    #         {"range": {"created_date": {"lte": dates.format(dates.before(datetime.utcnow(), old_seconds))}}}
    #     ]}}, "size": 10000, "sort": {"created_date": {"order": "desc"}}, "fields": "id"}
    #     rbg = models.BackgroundJob.send_query(qbg)
    #     for job in rbg.get('hits', {}).get('hits', []):
    #         models.BackgroundJob.remove_by_id(job['fields']['id'][0])
    #     res['background']['info'].append('Removed {0} old complete background jobs'.format(rbg.get('hits', {}).get('total', 0)))
    # except:
    #     res['background']['status'] = 'Unstable'
    #     res['background']['info'].append('Error when trying to remove old background jobs')
    #     res['stable'] = False

    try:
        # alert about errors in the last ten minutes - assuming we are going to use uptimerobot
        # to check this every ten minutes
        error_seconds = app.config.get("STATUS_ERROR_CHECK_SECONDS", 600)
        error_ignore = app.config.get("STATUS_ERROR_IGNORE", [])  # configure a list of strings that denote something to ignore
        error_ignore = [error_ignore] if isinstance(error_ignore, str) else error_ignore
        error_ignore_fields = app.config.get("STATUS_ERROR_IGNORE_FIELDS_TO_CHECK", False)  # which fields to get in the query, to check for the strings provided above
        error_ignore_fields = [error_ignore_fields] if isinstance(error_ignore_fields, str) else error_ignore_fields
        error_means_unstable = app.config.get("STATUS_ERROR_MEANS_UNSTABLE", True)
        qer = {
            "query": {
                "bool": {
                    "must": [
                        {"term": {"status": "error"}},
                        {"range": {"created_date": {"gte": dates.format(dates.before(datetime.utcnow(), error_seconds))}}}
                    ]
                }
            },
            "size": 10000,
            "sort": {"created_date": {"order": "desc"}}
        }
        # this could be customised with a fields list if we only want to check certain fields for ignore types
        if error_ignore_fields != False:
            qer["fields"] = error_ignore_fields
        rer = models.BackgroundJob.send_query(qer)
        error_count = 0
        for job in rer.get('hits', {}).get('hits', []):
            countable = True
            jsj = json.dumps(job)
            for ig in error_ignore:
                if ig in jsj:
                    countable = False
                    break
            if countable:
                error_count += 1
        if error_count != 0:
            res['background']['status'] = 'Unstable'
            res['background']['info'].append('Background jobs are causing errors')
            res['stable'] = error_means_unstable
        emsg = 'Found {0} background jobs in error status in the last {1} seconds'.format(error_count, error_seconds)
        if len(error_ignore) != 0:
            emsg += '. Ignoring ' + ', '.join(error_ignore) + ' which reduced the error count from ' + str(rer.get('hits', {}).get('total', 0))
        res['background']['info'].append(emsg)
    except:
        res['background']['status'] = 'Unstable'
        res['background']['info'].append('Error when trying to check background jobs for errors')
        res['stable'] = False

    resp = make_response(json.dumps(res))
    resp.mimetype = "application/json"
    return resp
def run(self):
    job = self.background_job
    params = job.params

    # Set up the files we need to run this task - a dir to place the report, and a place to write the article csv
    outdir = self.get_param(params, "outdir", "article_duplicates_" + dates.today())
    job.add_audit_message("Saving reports to " + outdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # Location for our interim CSV file of articles
    tmpdir = self.get_param(params, "tmpdir", 'tmp_article_duplicate_report')
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    tmp_csvname = self.get_param(params, "article_csv", False)
    tmp_csvpath, total = self._make_csv_dump(tmpdir, tmp_csvname)

    # Initialise our reports
    global_reportfile = 'duplicate_articles_global_' + dates.today() + '.csv'
    global_reportpath = os.path.join(outdir, global_reportfile)
    f = open(global_reportpath, "w", encoding="utf-8")
    global_report = csv.writer(f)
    header = ["article_id", "article_created", "article_doi", "article_fulltext", "article_owner",
              "article_issns", "article_in_doaj", "n_matches", "match_type", "match_id",
              "match_created", "match_doi", "match_fulltext", "match_owner", "match_issns",
              "match_in_doaj", "owners_match", "titles_match", "article_title", "match_title"]
    global_report.writerow(header)

    noids_reportfile = 'noids_' + dates.today() + '.csv'
    noids_reportpath = os.path.join(outdir, noids_reportfile)
    g = open(noids_reportpath, "w", encoding="utf-8")
    noids_report = csv.writer(g)
    header = ["article_id", "article_created", "article_owner", "article_issns", "article_in_doaj"]
    noids_report.writerow(header)

    # Record the sets of duplicated articles
    global_matches = []

    a_count = 0

    articleService = DOAJ.articleService()

    # Read back in the article csv file we created earlier
    with open(tmp_csvpath, 'r', encoding='utf-8') as t:
        article_reader = csv.reader(t)

        start = datetime.now()
        estimated_finish = ""
        for a in article_reader:
            if a_count > 1 and a_count % 100 == 0:
                n = datetime.now()
                diff = (n - start).total_seconds()
                expected_total = ((diff / a_count) * total)
                estimated_finish = dates.format(dates.after(start, expected_total))
            a_count += 1

            article = models.Article(_source={
                'id': a[0],
                'created_date': a[1],
                'bibjson': {'identifier': json.loads(a[2]), 'link': json.loads(a[3]), 'title': a[4]},
                'admin': {'in_doaj': json.loads(a[5])}
            })

            # Get the global duplicates
            try:
                global_duplicates = articleService.discover_duplicates(article, results_per_match_type=10000, include_article=False)
            except exceptions.DuplicateArticleException:
                # this means the article did not have any ids that could be used for deduplication
                owner = self._lookup_owner(article)
                noids_report.writerow([article.id, article.created_date, owner, ','.join(article.bibjson().issns()), article.is_in_doaj()])
                continue

            dupcount = 0
            if global_duplicates:
                # Look up an article's owner
                owner = self._lookup_owner(article)

                # Deduplicate the DOI and fulltext duplicate lists
                s = set([article.id] + [d.id for d in global_duplicates.get('doi', []) + global_duplicates.get('fulltext', [])])
                # remove article's own id from global_duplicates
                dupcount = len(s) - 1
                if s not in global_matches:
                    self._write_rows_from_duplicates(article, owner, global_duplicates, global_report)
                    global_matches.append(s)

            app.logger.debug('{0}/{1} {2} {3} {4} {5}'.format(a_count, total, article.id, dupcount, len(global_matches), estimated_finish))

    job.add_audit_message('{0} articles processed for duplicates. {1} global duplicate sets found.'.format(a_count, len(global_matches)))
    f.close()
    g.close()

    # Delete the transient temporary files.
    shutil.rmtree(tmpdir)

    # Email the reports if that parameter has been set.
    send_email = self.get_param(params, "email", False)
    if send_email:
        archive_name = "article_duplicates_" + dates.today()
        email_archive(outdir, archive_name)
        job.add_audit_message("email alert sent")
    else:
        job.add_audit_message("no email alert sent")
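# A sketch of one interim-CSV row in the layout the loop above expects (inferred from the
# indexing a[0]..a[5]; every concrete value here is illustrative only).
import json

example_row = [
    "a1b2c3d4e5f6",                                                        # a[0] article id
    "2018-05-01T12:00:00Z",                                                # a[1] created_date
    json.dumps([{"type": "doi", "id": "10.1234/example"}]),                # a[2] bibjson.identifier (JSON)
    json.dumps([{"type": "fulltext", "url": "https://example.com/ft"}]),   # a[3] bibjson.link (JSON)
    "An example article title",                                            # a[4] bibjson.title
    json.dumps(True),                                                      # a[5] admin.in_doaj (JSON)
]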
def run(self):
    job = self.background_job
    params = job.params

    # Set up the files we need to run this task - a dir to place the report, and a place to write the article csv
    outdir = self.get_param(params, "outdir", "article_duplicates_" + dates.today())
    job.add_audit_message("Saving reports to " + outdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # Location for our interim CSV file of articles
    tmpdir = self.get_param(params, "tmpdir", 'tmp_article_duplicate_report')
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    tmp_csvname = self.get_param(params, "article_csv", False)
    tmp_csvpath, total = self._make_csv_dump(tmpdir, tmp_csvname)

    # Initialise our reports
    global_reportfile = 'duplicate_articles_global_' + dates.today() + '.csv'
    global_reportpath = os.path.join(outdir, global_reportfile)
    f = codecs.open(global_reportpath, "wb", "utf-8")
    global_report = UnicodeWriter(f)
    header = ["article_id", "article_created", "article_doi", "article_fulltext", "article_owner",
              "article_issns", "article_in_doaj", "n_matches", "match_type", "match_id",
              "match_created", "match_doi", "match_fulltext", "match_owner", "match_issns",
              "match_in_doaj", "owners_match", "titles_match", "article_title", "match_title"]
    global_report.writerow(header)

    noids_reportfile = 'noids_' + dates.today() + '.csv'
    noids_reportpath = os.path.join(outdir, noids_reportfile)
    g = codecs.open(noids_reportpath, "wb", "utf-8")
    noids_report = UnicodeWriter(g)
    header = ["article_id", "article_created", "article_owner", "article_issns", "article_in_doaj"]
    noids_report.writerow(header)

    # Record the sets of duplicated articles
    global_matches = []

    a_count = 0

    articleService = DOAJ.articleService()

    # Read back in the article csv file we created earlier
    with codecs.open(tmp_csvpath, 'rb', 'utf-8') as t:
        article_reader = UnicodeReader(t)

        start = datetime.now()
        estimated_finish = ""
        for a in article_reader:
            if a_count > 1 and a_count % 100 == 0:
                n = datetime.now()
                diff = (n - start).total_seconds()
                expected_total = ((diff / a_count) * total)
                estimated_finish = dates.format(dates.after(start, expected_total))
            a_count += 1

            article = models.Article(_source={
                'id': a[0],
                'created_date': a[1],
                'bibjson': {'identifier': json.loads(a[2]), 'link': json.loads(a[3]), 'title': a[4]},
                'admin': {'in_doaj': json.loads(a[5])}
            })

            # Get the global duplicates
            try:
                global_duplicates = articleService.discover_duplicates(article, owner=None, results_per_match_type=10000)
            except exceptions.DuplicateArticleException:
                # this means the article did not have any ids that could be used for deduplication
                owner = self._lookup_owner(article)
                noids_report.writerow([article.id, article.created_date, owner, ','.join(article.bibjson().issns()), article.is_in_doaj()])
                continue

            dupcount = 0
            if global_duplicates:
                # Look up an article's owner
                owner = self._lookup_owner(article)

                # Deduplicate the DOI and fulltext duplicate lists
                s = set([article.id] + [d.id for d in global_duplicates.get('doi', []) + global_duplicates.get('fulltext', [])])
                dupcount = len(s) - 1
                if s not in global_matches:
                    self._write_rows_from_duplicates(article, owner, global_duplicates, global_report)
                    global_matches.append(s)

            app.logger.debug('{0}/{1} {2} {3} {4} {5}'.format(a_count, total, article.id, dupcount, len(global_matches), estimated_finish))

    job.add_audit_message('{0} articles processed for duplicates. {1} global duplicate sets found.'.format(a_count, len(global_matches)))
    f.close()
    g.close()

    # Delete the transient temporary files.
    shutil.rmtree(tmpdir)

    # Email the reports if that parameter has been set.
    send_email = self.get_param(params, "email", False)
    if send_email:
        archive_name = "article_duplicates_" + dates.today()
        email_archive(outdir, archive_name)
        job.add_audit_message("email alert sent")
    else:
        job.add_audit_message("no email alert sent")