def test_02_local_large(self):
    """Round-trip two large generated streams through the local store and check sizes."""
    store = StoreFactory.get(None)

    text_size = 100000000
    bin_size = 90000000

    # Stream large generated payloads into the store, one text and one binary.
    store.store("sludge", "sludge.txt", source_stream=SludgePump(text_size, format="text"))
    store.store("sludge", "sludge.bin", source_stream=SludgePump(bin_size, format="bytes"))

    # The store must report exactly the number of bytes the pumps produced.
    assert store.size("sludge", "sludge.txt") == text_size
    assert store.size("sludge", "sludge.bin") == bin_size
def test_01_local(self):
    """Store and retrieve both string and byte content via the local store."""
    store = StoreFactory.get(None)

    # Text round-trip: stored from a StringIO, read back raw and with decoding.
    store.store("string", "string.txt", source_stream=StringIO("test"))
    raw_stream = store.get("string", "string.txt")
    assert raw_stream.read().decode("utf-8") == "test"
    decoded_stream = store.get("string", "string.txt", encoding="utf-8")
    assert decoded_stream.read() == "test"

    # Binary round-trip: stored from a BytesIO, read back as bytes.
    store.store("bytes", "bytes.bin", source_stream=BytesIO(b"here are some bytes"))
    bytes_stream = store.get("bytes", "bytes.bin")
    assert bytes_stream.read() == b"here are some bytes"
def csv(self, prune=True):
    """
    Generate the Journal CSV, store it in the cache container, and update the cache record.

    :param prune: whether to remove older CSV files from the cache container (the two newest are kept)
    :return: Tuple of (url, action_register)
    """
    # first validate the incoming arguments to ensure that we've got the right thing
    argvalidate("csv", [{
        "arg": prune,
        "allow_none": False,
        "arg_name": "prune"
    }], exceptions.ArgumentException)

    # timestamped filename so successive runs do not collide
    filename = 'journalcsv__doaj_' + datetime.strftime(
        datetime.utcnow(), '%Y%m%d_%H%M') + '_utf8.csv'
    container_id = app.config.get("STORE_CACHE_CONTAINER")

    # build the file in temp storage first, then copy it into the main cache store
    tmpStore = StoreFactory.tmp()
    out = tmpStore.path(container_id,
                        filename,
                        create_container=True,
                        must_exist=False)

    # mapping of journal boolean/None values to the strings used in the CSV
    YES_NO = {True: 'Yes', False: 'No', None: '', '': ''}

    def _make_journals_csv(file_object):
        """ Make a CSV file of information for all journals.
        :param file_object: a utf8 encoded file object.
        """
        # accumulate one row of (question, value) pairs per journal, keyed by ISSN
        cols = {}
        for j in models.Journal.all_in_doaj(
                page_size=100000
        ):  # 10x how many journals we have right now
            assert isinstance(
                j, models.Journal)  # for pycharm type inspection
            bj = j.bibjson()
            # prefer the print ISSN, fall back to the electronic ISSN
            issn = bj.get_one_identifier(idtype=bj.P_ISSN)
            if issn is None:
                issn = bj.get_one_identifier(idtype=bj.E_ISSN)
            if issn is None:
                # journals with no ISSN at all cannot be keyed into the CSV
                continue

            kvs = Journal2QuestionXwalk.journal2question(j)
            meta_kvs = _get_doaj_meta_kvs(j)
            article_kvs = _get_article_kvs(j)
            cols[issn] = kvs + meta_kvs + article_kvs

        issns = cols.keys()

        csvwriter = csv.writer(file_object)
        qs = None
        for i in sorted(issns):
            if qs is None:
                # header row is taken from the first journal's question list
                qs = [q for q, _ in cols[i]]
                csvwriter.writerow(qs)
            vs = [v for _, v in cols[i]]
            csvwriter.writerow(vs)

    def _get_doaj_meta_kvs(journal):
        """ Get key, value pairs for some meta information we want from the journal object
        :param journal: a models.Journal
        :return: a list of (key, value) tuples for our metadata
        """
        kvs = [("DOAJ Seal", YES_NO.get(journal.has_seal(), "")),
               ("Tick: Accepted after March 2014",
                YES_NO.get(journal.is_ticked(), "")),
               ("Added on Date", journal.created_date),
               ("Subjects", ' | '.join(journal.bibjson().lcc_paths()))]
        return kvs

    def _get_article_kvs(journal):
        # article-level statistics for this journal's row
        stats = journal.article_stats()
        kvs = [("Number of Article Records", str(stats.get("total"))),
               ("Most Recent Article Added", stats.get("latest"))]
        return kvs

    with open(out, 'w', encoding='utf-8') as csvfile:
        _make_journals_csv(csvfile)

    # copy the finished file into the main cache store, always cleaning up the temp copy
    mainStore = StoreFactory.get("cache")
    try:
        mainStore.store(container_id, filename, source_path=out)
        url = mainStore.url(container_id, filename)
    finally:
        tmpStore.delete_file(
            container_id, filename
        )  # don't delete the container, just in case someone else is writing to it

    action_register = []
    if prune:
        def sort(filelist):
            # order by the timestamp embedded in the filename, newest first
            rx = "journalcsv__doaj_(.+?)_utf8.csv"
            # NOTE(review): .groups(1)[0] works here but .group(1) is the conventional form
            return sorted(filelist,
                          key=lambda x: datetime.strptime(
                              re.match(rx, x).groups(1)[0], '%Y%m%d_%H%M'),
                          reverse=True)

        def filter(filename):
            return filename.startswith("journalcsv__")

        action_register = prune_container(mainStore,
                                          container_id,
                                          sort,
                                          filter=filter,
                                          keep=2)

    # update the ES record to point to the new file
    models.Cache.cache_csv(url)
    return url, action_register
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-l", "--limit", type=int, help="Number of records to export from each type. If you specify e.g. 100, then only the first 100 accounts, 100 journals, 100 articles etc. will be exported. The \"first\" 100 will be ordered by whatever the esprit iterate functionality uses as default ordering, usually alphabetically by record id.")
parser.add_argument("-c", "--clean", action="store_true", help="Clean any pre-existing output before continuing")
parser.add_argument("-b", "--batch", default=100000, type=int, help="Output batch sizes")
args = parser.parse_args()

# FIX: args.limit is None when -l is not supplied (no default set), and
# `None > 0` raises TypeError on Python 3 — guard explicitly.
if args.limit is not None and args.limit > 0:
    limit = args.limit
else:
    limit = None

conn = esprit.raw.make_connection(None, app.config["ELASTIC_SEARCH_HOST"], None, app.config["ELASTIC_SEARCH_DB"])

tmpStore = StoreFactory.tmp()
mainStore = StoreFactory.get("anon_data")
container = app.config.get("STORE_ANON_DATA_CONTAINER")

# optionally wipe any previous export before starting
if args.clean:
    mainStore.delete_container(container)

# dump each ES type to batched bulk files, anonymising where a procedure is defined
for type_ in esprit.raw.list_types(connection=conn):
    filename = type_ + ".bulk"
    output_file = tmpStore.path(container, filename, create_container=True, must_exist=False)
    print(dates.now() + " " + type_ + " => " + output_file + ".*")
    if type_ in anonymisation_procedures:
        transform = anonymisation_procedures[type_]
        filenames = esprit.tasks.dump(conn, type_, limit=limit, transform=transform, out_template=output_file, out_batch_sizes=args.batch, out_rollover_callback=_copy_on_complete, es_bulk_fields=["_id"])
def do_import(config):
    """
    Import anonymised data into the Elasticsearch index described by `config`.

    Reads gzipped bulk files per type from the anon_data store, unzips them in
    temp storage and bulk-loads them, optionally limiting the number of records
    per type. Destructive: deletes the existing types before importing.

    :param config: dict with keys "types", "confirm", optional ES host/index
                   overrides and "max_content_length"
    """
    # allow the config to override the application's ES connection settings
    host = app.config["ELASTIC_SEARCH_HOST"]
    index = app.config["ELASTIC_SEARCH_DB"]
    if config.get("elastic_search_host") is not None:
        host = config.get("elastic_search_host")
        app.config["ELASTIC_SEARCH_HOST"] = host
    if config.get("elastic_search_db") is not None:
        index = config.get("elastic_search_db")
        app.config["ELASTIC_SEARCH_DB"] = index

    print("\n")
    print("Using host {x} and index {y}\n".format(x=host, y=index))
    conn = esprit.raw.make_connection(None, host, None, index)

    # filter for the types we are going to work with
    # FIX: dict.iteritems() does not exist on Python 3 — use .items()
    import_types = {}
    for t, s in config.get("types", {}).items():
        if s.get("import", False) is True:
            import_types[t] = s

    print("==Carrying out the following import==")
    for import_type, cfg in import_types.items():
        count = "All" if cfg.get("limit", -1) == -1 else cfg.get("limit")
        print("{x} from {y}".format(x=count, y=import_type))
    print("\n")

    if config.get("confirm", True):
        # FIX: raw_input() was renamed to input() on Python 3
        text = input("Continue? [y/N] ")
        if text.lower() != "y":
            exit()

    # remove all the types that we are going to import
    for import_type in import_types.keys():
        esprit.raw.delete(conn, import_type)

    # re-initialise the index (sorting out mappings, etc)
    print("==Initialising Index for Mappings==")
    initialise_index(app)

    mainStore = StoreFactory.get("anon_data")
    tempStore = StoreFactory.tmp()
    container = app.config.get("STORE_ANON_DATA_CONTAINER")

    print("\n==Importing==")
    for import_type, cfg in import_types.items():
        count = "all" if cfg.get("limit", -1) == -1 else cfg.get("limit")
        print("Importing {x} from {y}".format(x=count, y=import_type))
        print("Obtaining {x} from storage".format(x=import_type))

        # a limit of -1 means "no limit"
        limit = cfg.get("limit", -1)
        limit = None if limit == -1 else limit

        # bulk files are numbered .bulk.1, .bulk.2, ... — stop at the first gap
        n = 1
        while True:
            filename = import_type + ".bulk" + "." + str(n)
            handle = mainStore.get(container, filename)
            if handle is None:
                break
            tempStore.store(container, filename + ".gz", source_stream=handle)
            print("Retrieved {x} from storage".format(x=filename))
            handle.close()

            print("Unzipping {x} in temporary store".format(x=filename))
            compressed_file = tempStore.path(container, filename + ".gz")
            uncompressed_file = tempStore.path(container, filename, must_exist=False)
            with gzip.open(compressed_file, "rb") as f_in, open(uncompressed_file, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
            tempStore.delete(container, filename + ".gz")

            print("Importing from {x}".format(x=filename))
            imported_count = esprit.tasks.bulk_load(conn, import_type, uncompressed_file, limit=limit, max_content_length=config.get("max_content_length", 100000000))
            tempStore.delete(container, filename)

            # decrement the remaining limit by what we actually imported
            if limit is not None and imported_count != -1:
                limit -= imported_count
            if limit is not None and limit <= 0:
                break
            n += 1

    # clean up the temporary storage area once all types are imported
    tempStore.delete(container)
def run(self):
    """
    Execute the task as specified by the background_job.

    Exports article and/or journal records (per the job's 'types' param) as
    JSON files bundled into per-type tar.gz archives, copies the archives to
    the public data dump store, optionally prunes old dumps, and updates the
    cache record with the new URLs and sizes.
    :return:
    """
    job = self.background_job
    params = job.params

    clean = self.get_param(params, 'clean')
    prune = self.get_param(params, 'prune')
    types = self.get_param(params, 'types')

    tmpStore = StoreFactory.tmp()
    mainStore = StoreFactory.get("public_data_dump")
    container = app.config.get("STORE_PUBLIC_DATA_DUMP_CONTAINER")

    if clean:
        mainStore.delete_container(container)

    # create dir with today's date
    day_at_start = dates.today()

    # Do the search and save it
    page_size = app.config.get("DISCOVERY_BULK_PAGE_SIZE", 1000)
    records_per_file = app.config.get('DISCOVERY_RECORDS_PER_FILE', 100000)

    if types == 'all':
        types = ['article', 'journal']
    else:
        types = [types]

    urls = {"article" : None, "journal" : None}
    sizes = {"article" : None, "journal" : None}

    # Scroll for article and/or journal
    for typ in types:
        job.add_audit_message(dates.now() + u": Starting export of " + typ)

        out_dir = tmpStore.path(container, "doaj_" + typ + "_data_" + day_at_start, create_container=True, must_exist=False)
        out_name = os.path.basename(out_dir)
        zipped_name = out_name + ".tar.gz"
        zip_dir = os.path.dirname(out_dir)
        zipped_path = os.path.join(zip_dir, zipped_name)
        tarball = tarfile.open(zipped_path, "w:gz")

        file_num = 1
        out_file, path, filename = self._start_new_file(tmpStore, container, typ, day_at_start, file_num)

        # write scroll results as a comma-separated JSON stream, rolling over
        # to a new file every records_per_file records
        first_in_file = True
        count = 0
        for result in DiscoveryApi.scroll(typ, None, None, page_size, scan=True):
            if not first_in_file:
                out_file.write(",\n")
            else:
                first_in_file = False
            out_file.write(json.dumps(result))
            count += 1

            if count >= records_per_file:
                file_num += 1
                self._finish_file(tmpStore, container, filename, path, out_file, tarball)
                out_file, path, filename = self._start_new_file(tmpStore, container, typ, day_at_start, file_num)
                first_in_file = True
                count = 0

        # flush the final partial file, if it has any records
        if count > 0:
            self._finish_file(tmpStore, container, filename, path, out_file, tarball)

        tarball.close()

        # Copy the source directory to main store
        try:
            filesize = self._copy_on_complete(mainStore, tmpStore, container, zipped_path)
        except Exception as e:
            tmpStore.delete_container(container)
            # FIX: exceptions have no .message attribute on Python 3 (it would
            # raise AttributeError and mask the real error) — use str(e),
            # consistent with the other version of this task in this file
            raise BackgroundException("Error copying {0} data on complete {1}\n".format(typ, str(e)))

        store_url = mainStore.url(container, zipped_name)
        urls[typ] = store_url
        sizes[typ] = filesize

    if prune:
        self._prune_container(mainStore, container, day_at_start, types)

    self.background_job.add_audit_message(u"Removing temp store container {x}".format(x=container))
    tmpStore.delete_container(container)

    # finally update the cache
    cache.Cache.cache_public_data_dump(urls["article"], sizes["article"], urls["journal"], sizes["journal"])

    job.add_audit_message(dates.now() + u": done")
def run(self):
    """
    Execute the task as specified by the background_job.

    Exports article and/or journal records (per the job's 'types' param) as
    JSON files bundled into per-type tar.gz archives, copies the archives to
    the public data dump store, optionally prunes old dumps, and updates the
    cache record with the new URLs and sizes. Saves the job after each major
    step so audit messages are persisted as the export progresses.
    :return:
    """
    job = self.background_job
    params = job.params

    clean = self.get_param(params, 'clean')
    prune = self.get_param(params, 'prune')
    types = self.get_param(params, 'types')

    tmpStore = StoreFactory.tmp()
    mainStore = StoreFactory.get("public_data_dump")
    container = app.config.get("STORE_PUBLIC_DATA_DUMP_CONTAINER")

    if clean:
        mainStore.delete_container(container)
        job.add_audit_message("Deleted existing data dump files")
        job.save()

    # create dir with today's date
    day_at_start = dates.today()

    # Do the search and save it
    page_size = app.config.get("DISCOVERY_BULK_PAGE_SIZE", 1000)
    records_per_file = app.config.get('DISCOVERY_RECORDS_PER_FILE', 100000)

    if types == 'all':
        types = ['article', 'journal']
    else:
        types = [types]

    urls = {"article": None, "journal": None}
    sizes = {"article": None, "journal": None}

    # Scroll for article and/or journal
    for typ in types:
        job.add_audit_message(dates.now() + ": Starting export of " + typ)
        job.save()

        out_dir = tmpStore.path(container, "doaj_" + typ + "_data_" + day_at_start, create_container=True, must_exist=False)
        out_name = os.path.basename(out_dir)
        zipped_name = out_name + ".tar.gz"
        zip_dir = os.path.dirname(out_dir)
        zipped_path = os.path.join(zip_dir, zipped_name)
        tarball = tarfile.open(zipped_path, "w:gz")

        file_num = 1
        out_file, path, filename = self._start_new_file(
            tmpStore, container, typ, day_at_start, file_num)

        # write scroll results as a comma-separated JSON stream, rolling over
        # to a new file every records_per_file records
        first_in_file = True
        count = 0
        for result in DiscoveryApi.scroll(typ, None, None, page_size, scan=True):
            if not first_in_file:
                out_file.write(",\n")
            else:
                first_in_file = False
            out_file.write(json.dumps(result))
            count += 1

            if count >= records_per_file:
                file_num += 1
                self._finish_file(tmpStore, container, filename, path, out_file, tarball)
                job.save()
                out_file, path, filename = self._start_new_file(
                    tmpStore, container, typ, day_at_start, file_num)
                first_in_file = True
                count = 0

        # flush the final partial file, if it has any records
        if count > 0:
            self._finish_file(tmpStore, container, filename, path, out_file, tarball)
            job.save()

        tarball.close()

        # Copy the source directory to main store
        try:
            filesize = self._copy_on_complete(mainStore, tmpStore, container, zipped_path)
            job.save()
        except Exception as e:
            # on failure, clean up the temp container before surfacing the error
            tmpStore.delete_container(container)
            raise BackgroundException(
                "Error copying {0} data on complete {1}\n".format(
                    typ, str(e)))

        store_url = mainStore.url(container, zipped_name)
        urls[typ] = store_url
        sizes[typ] = filesize

    if prune:
        self._prune_container(mainStore, container, day_at_start, types)
        job.save()

    self.background_job.add_audit_message(
        "Removing temp store container {x}".format(x=container))
    tmpStore.delete_container(container)

    # finally update the cache
    cache.Cache.cache_public_data_dump(urls["article"], sizes["article"], urls["journal"], sizes["journal"])

    job.add_audit_message(dates.now() + ": done")
def csv(self, prune=True):
    """
    Generate the Journal CSV, store it in the cache container, and update the cache record.

    :param prune: whether to remove older CSV files from the cache container (the two newest are kept)
    :return: Tuple of (url, action_register)
    """
    # first validate the incoming arguments to ensure that we've got the right thing
    argvalidate("csv", [
        {"arg": prune, "allow_none" : False, "arg_name" : "prune"}
    ], exceptions.ArgumentException)

    # timestamped filename so successive runs do not collide
    filename = 'journalcsv__doaj_' + datetime.strftime(datetime.utcnow(), '%Y%m%d_%H%M') + '_utf8.csv'
    container_id = app.config.get("STORE_CACHE_CONTAINER")

    # build the file in temp storage first, then copy it into the main cache store
    tmpStore = StoreFactory.tmp()
    out = tmpStore.path(container_id, filename, create_container=True, must_exist=False)

    # mapping of journal boolean/None values to the strings used in the CSV
    YES_NO = {True: 'Yes', False: 'No', None: '', '': ''}

    def _make_journals_csv(file_object):
        """
        Make a CSV file of information for all journals.
        :param file_object: a utf8 encoded file object.
        """
        # accumulate one row of (question, value) pairs per journal, keyed by ISSN
        cols = {}
        for j in models.Journal.all_in_doaj(page_size=100000):          # 10x how many journals we have right now
            assert isinstance(j, models.Journal)                        # for pycharm type inspection
            bj = j.bibjson()
            # prefer the print ISSN, fall back to the electronic ISSN
            issn = bj.get_one_identifier(idtype=bj.P_ISSN)
            if issn is None:
                issn = bj.get_one_identifier(idtype=bj.E_ISSN)
            if issn is None:
                # journals with no ISSN at all cannot be keyed into the CSV
                continue

            kvs = Journal2QuestionXwalk.journal2question(j)
            meta_kvs = _get_doaj_meta_kvs(j)
            article_kvs = _get_article_kvs(j)
            cols[issn] = kvs + meta_kvs + article_kvs

        # FIX: dict.keys() returns a view on Python 3, which has no .sort() —
        # sorted() gives the same ordering and works on both Python 2 and 3
        issns = sorted(cols.keys())

        csvwriter = clcsv.UnicodeWriter(file_object)
        qs = None
        for i in issns:
            if qs is None:
                # header row is taken from the first journal's question list
                qs = [q for q, _ in cols[i]]
                csvwriter.writerow(qs)
            vs = [v for _, v in cols[i]]
            csvwriter.writerow(vs)

    def _get_doaj_meta_kvs(journal):
        """
        Get key, value pairs for some meta information we want from the journal object
        :param journal: a models.Journal
        :return: a list of (key, value) tuples for our metadata
        """
        kvs = [
            ("DOAJ Seal", YES_NO.get(journal.has_seal(), "")),
            ("Tick: Accepted after March 2014", YES_NO.get(journal.is_ticked(), "")),
            ("Added on Date", journal.created_date),
            ("Subjects", ' | '.join(journal.bibjson().lcc_paths()))
        ]
        return kvs

    def _get_article_kvs(journal):
        # article-level statistics for this journal's row
        stats = journal.article_stats()
        kvs = [
            ("Number of Article Records", str(stats.get("total"))),
            ("Most Recent Article Added", stats.get("latest"))
        ]
        return kvs

    with codecs.open(out, 'wb', encoding='utf-8') as csvfile:
        _make_journals_csv(csvfile)

    # copy the finished file into the main cache store, always cleaning up the temp copy
    mainStore = StoreFactory.get("cache")
    try:
        mainStore.store(container_id, filename, source_path=out)
        url = mainStore.url(container_id, filename)
    finally:
        tmpStore.delete_file(container_id, filename)    # don't delete the container, just in case someone else is writing to it

    action_register = []
    if prune:
        def sort(filelist):
            # order by the timestamp embedded in the filename, newest first.
            # FIX: .group(1) is the intended call (.groups(1)[0] happened to
            # behave identically here, but misuses the default-value argument)
            rx = "journalcsv__doaj_(.+?)_utf8.csv"
            return sorted(filelist, key=lambda x: datetime.strptime(re.match(rx, x).group(1), '%Y%m%d_%H%M'), reverse=True)

        def filter(filename):
            return filename.startswith("journalcsv__")

        action_register = prune_container(mainStore, container_id, sort, filter=filter, keep=2)

    # update the ES record to point to the new file
    models.Cache.cache_csv(url)
    return url, action_register
def do_import(config):
    """
    Import anonymised data into the Elasticsearch index described by `config`.

    Reads gzipped bulk files per type from the anon_data store, unzips them in
    temp storage and bulk-loads them, optionally limiting the number of records
    per type. Destructive: deletes the existing types before importing.

    :param config: dict with keys "types", "confirm", optional ES host/index
                   overrides and "max_content_length"
    """
    # allow the config to override the application's ES connection settings
    host = app.config["ELASTIC_SEARCH_HOST"]
    index = app.config["ELASTIC_SEARCH_DB"]
    if config.get("elastic_search_host") is not None:
        host = config.get("elastic_search_host")
        app.config["ELASTIC_SEARCH_HOST"] = host
    if config.get("elastic_search_db") is not None:
        index = config.get("elastic_search_db")
        app.config["ELASTIC_SEARCH_DB"] = index

    print("\n")
    print("Using host {x} and index {y}\n".format(x=host, y=index))
    conn = esprit.raw.make_connection(None, host, None, index)

    # filter for the types we are going to work with
    # FIX: dict.iteritems() does not exist on Python 3 — use .items()
    import_types = {}
    for t, s in config.get("types", {}).items():
        if s.get("import", False) is True:
            import_types[t] = s

    print("==Carrying out the following import==")
    for import_type, cfg in import_types.items():
        count = "All" if cfg.get("limit", -1) == -1 else cfg.get("limit")
        print("{x} from {y}".format(x=count, y=import_type))
    print("\n")

    if config.get("confirm", True):
        # FIX: raw_input() was renamed to input() on Python 3
        text = input("Continue? [y/N] ")
        if text.lower() != "y":
            exit()

    # remove all the types that we are going to import
    for import_type in import_types.keys():
        esprit.raw.delete(conn, import_type)

    # re-initialise the index (sorting out mappings, etc)
    print("==Initialising Index for Mappings==")
    initialise_index(app)

    mainStore = StoreFactory.get("anon_data")
    tempStore = StoreFactory.tmp()
    container = app.config.get("STORE_ANON_DATA_CONTAINER")

    print("\n==Importing==")
    for import_type, cfg in import_types.items():
        count = "all" if cfg.get("limit", -1) == -1 else cfg.get("limit")
        print("Importing {x} from {y}".format(x=count, y=import_type))
        print("Obtaining {x} from storage".format(x=import_type))

        # a limit of -1 means "no limit"
        limit = cfg.get("limit", -1)
        limit = None if limit == -1 else limit

        # bulk files are numbered .bulk.1, .bulk.2, ... — stop at the first gap
        n = 1
        while True:
            filename = import_type + ".bulk" + "." + str(n)
            handle = mainStore.get(container, filename)
            if handle is None:
                break
            tempStore.store(container, filename + ".gz", source_stream=handle)
            print("Retrieved {x} from storage".format(x=filename))
            handle.close()

            print("Unzipping {x} in temporary store".format(x=filename))
            compressed_file = tempStore.path(container, filename + ".gz")
            uncompressed_file = tempStore.path(container, filename, must_exist=False)
            with gzip.open(compressed_file, "rb") as f_in, open(uncompressed_file, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
            tempStore.delete_file(container, filename + ".gz")

            print("Importing from {x}".format(x=filename))
            imported_count = esprit.tasks.bulk_load(conn, import_type, uncompressed_file, limit=limit, max_content_length=config.get("max_content_length", 100000000))
            tempStore.delete_file(container, filename)

            # decrement the remaining limit by what we actually imported
            if limit is not None and imported_count != -1:
                limit -= imported_count
            if limit is not None and limit <= 0:
                break
            n += 1

    # clean up the temporary storage area once all types are imported
    tempStore.delete_file(container)