Code example #1
    def test_02_local_large(self):
        local = StoreFactory.get(None)

        local.store("sludge",
                    "sludge.txt",
                    source_stream=SludgePump(100000000, format="text"))
        local.store("sludge",
                    "sludge.bin",
                    source_stream=SludgePump(90000000, format="bytes"))

        assert local.size("sludge", "sludge.txt") == 100000000
        assert local.size("sludge", "sludge.bin") == 90000000
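
SludgePump itself is not shown in this excerpt. Below is a minimal, hypothetical sketch of what such a streaming test fixture could look like; only the constructor arguments and the read-only, file-like behaviour are inferred from the test above, the rest is assumption.

class FakeSludgePump(object):
    """Hypothetical stand-in for SludgePump: a read-only, file-like object
    that produces `size` units of filler data without holding them in memory."""

    def __init__(self, size, format="bytes"):
        self._remaining = size
        self._format = format

    def read(self, n=-1):
        # Return up to n bytes/characters of filler, or everything left if n < 0
        if self._remaining <= 0:
            return b"" if self._format == "bytes" else ""
        if n is None or n < 0:
            n = self._remaining
        n = min(n, self._remaining)
        self._remaining -= n
        return b"x" * n if self._format == "bytes" else "x" * n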
Code example #2
    def test_01_local(self):
        local = StoreFactory.get(None)

        stringin = StringIO("test")
        local.store("string", "string.txt", source_stream=stringin)

        stringout_bin = local.get("string", "string.txt")
        assert stringout_bin.read().decode("utf-8") == "test"

        stringout_utf8 = local.get("string", "string.txt", encoding="utf-8")
        assert stringout_utf8.read() == "test"

        bytesin = BytesIO(b"here are some bytes")
        local.store("bytes", "bytes.bin", source_stream=bytesin)

        bytesout_bin = local.get("bytes", "bytes.bin")
        assert bytesout_bin.read() == b"here are some bytes"
Code example #3
    def csv(self, prune=True):
        """
        Generate the Journal CSV

        :param set_cache: whether to update the cache
        :param out_dir: the directory to output the file to.  If set_cache is True, this argument will be overridden by the cache container
        :return: Tuple of (attachment_name, URL)
        """
        # first validate the incoming arguments to ensure that we've got the right thing
        argvalidate("csv", [{
            "arg": prune,
            "allow_none": False,
            "arg_name": "prune"
        }], exceptions.ArgumentException)

        filename = 'journalcsv__doaj_' + datetime.strftime(
            datetime.utcnow(), '%Y%m%d_%H%M') + '_utf8.csv'
        container_id = app.config.get("STORE_CACHE_CONTAINER")
        tmpStore = StoreFactory.tmp()
        out = tmpStore.path(container_id,
                            filename,
                            create_container=True,
                            must_exist=False)

        YES_NO = {True: 'Yes', False: 'No', None: '', '': ''}

        def _make_journals_csv(file_object):
            """
            Make a CSV file of information for all journals.
            :param file_object: a utf8 encoded file object.
            """

            cols = {}
            for j in models.Journal.all_in_doaj(
                    page_size=100000
            ):  # 10x how many journals we have right now
                assert isinstance(
                    j, models.Journal)  # for pycharm type inspection
                bj = j.bibjson()
                issn = bj.get_one_identifier(idtype=bj.P_ISSN)
                if issn is None:
                    issn = bj.get_one_identifier(idtype=bj.E_ISSN)
                if issn is None:
                    continue

                kvs = Journal2QuestionXwalk.journal2question(j)
                meta_kvs = _get_doaj_meta_kvs(j)
                article_kvs = _get_article_kvs(j)
                cols[issn] = kvs + meta_kvs + article_kvs

            issns = cols.keys()

            csvwriter = csv.writer(file_object)
            qs = None
            for i in sorted(issns):
                if qs is None:
                    qs = [q for q, _ in cols[i]]
                    csvwriter.writerow(qs)
                vs = [v for _, v in cols[i]]
                csvwriter.writerow(vs)

        def _get_doaj_meta_kvs(journal):
            """
            Get key, value pairs for some meta information we want from the journal object
            :param journal: a models.Journal
            :return: a list of (key, value) tuples for our metadata
            """
            kvs = [("DOAJ Seal", YES_NO.get(journal.has_seal(), "")),
                   ("Tick: Accepted after March 2014",
                    YES_NO.get(journal.is_ticked(), "")),
                   ("Added on Date", journal.created_date),
                   ("Subjects", ' | '.join(journal.bibjson().lcc_paths()))]
            return kvs

        def _get_article_kvs(journal):
            stats = journal.article_stats()
            kvs = [("Number of Article Records", str(stats.get("total"))),
                   ("Most Recent Article Added", stats.get("latest"))]
            return kvs

        with open(out, 'w', encoding='utf-8') as csvfile:
            _make_journals_csv(csvfile)

        mainStore = StoreFactory.get("cache")
        try:
            mainStore.store(container_id, filename, source_path=out)
            url = mainStore.url(container_id, filename)
        finally:
            tmpStore.delete_file(
                container_id, filename
            )  # don't delete the container, just in case someone else is writing to it

        action_register = []
        if prune:

            def sort(filelist):
                rx = "journalcsv__doaj_(.+?)_utf8.csv"
                return sorted(filelist,
                              key=lambda x: datetime.strptime(
                                  re.match(rx, x).groups(1)[0], '%Y%m%d_%H%M'),
                              reverse=True)

            def filter(filename):
                return filename.startswith("journalcsv__")

            action_register = prune_container(mainStore,
                                              container_id,
                                              sort,
                                              filter=filter,
                                              keep=2)

        # update the ES record to point to the new file
        models.Cache.cache_csv(url)
        return url, action_register
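
The prune step depends on being able to recover the timestamp embedded in the generated filename. Here is a small self-contained illustration of that round trip, using the same regex and format string as above (the date value is arbitrary):

import re
from datetime import datetime

stamp_in = datetime(2020, 1, 31, 9, 30)
name = 'journalcsv__doaj_' + datetime.strftime(stamp_in, '%Y%m%d_%H%M') + '_utf8.csv'
# name == 'journalcsv__doaj_20200131_0930_utf8.csv'
stamp_out = re.match("journalcsv__doaj_(.+?)_utf8.csv", name).group(1)
assert datetime.strptime(stamp_out, '%Y%m%d_%H%M') == stamp_in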
Code example #4
    import argparse
    parser = argparse.ArgumentParser()

    parser.add_argument("-l", "--limit", type=int, help="Number of records to export from each type. If you specify e.g. 100, then only the first 100 accounts, 100 journals, 100 articles etc. will be exported. The \"first\" 100 will be ordered by whatever the esprit iterate functionality uses as default ordering, usually alphabetically by record id.")
    parser.add_argument("-c", "--clean", action="store_true", help="Clean any pre-existing output before continuing")
    parser.add_argument("-b", "--batch", default=100000, type=int, help="Output batch sizes")
    args = parser.parse_args()
    if args.limit is not None and args.limit > 0:
        limit = args.limit
    else:
        limit = None

    conn = esprit.raw.make_connection(None, app.config["ELASTIC_SEARCH_HOST"], None, app.config["ELASTIC_SEARCH_DB"])

    tmpStore = StoreFactory.tmp()
    mainStore = StoreFactory.get("anon_data")
    container = app.config.get("STORE_ANON_DATA_CONTAINER")

    if args.clean:
        mainStore.delete_container(container)

    for type_ in esprit.raw.list_types(connection=conn):
        filename = type_ + ".bulk"
        output_file = tmpStore.path(container, filename, create_container=True, must_exist=False)
        print(dates.now() + " " + type_ + " => " + output_file + ".*")
        if type_ in anonymisation_procedures:
            transform = anonymisation_procedures[type_]
            filenames = esprit.tasks.dump(conn, type_, limit=limit, transform=transform,
                                          out_template=output_file, out_batch_sizes=args.batch, out_rollover_callback=_copy_on_complete,
                                          es_bulk_fields=["_id"])
Code example #5
def do_import(config):
    host = app.config["ELASTIC_SEARCH_HOST"]
    index = app.config["ELASTIC_SEARCH_DB"]
    if config.get("elastic_search_host") is not None:
        host = config.get("elastic_search_host")
        app.config["ELASTIC_SEARCH_HOST"] = host
    if config.get("elastic_search_db") is not None:
        index = config.get("elastic_search_db")
        app.config["ELASTIC_SEARCH_DB"] = index

    print("\n")
    print("Using host {x} and index {y}\n".format(x=host, y=index))
    conn = esprit.raw.make_connection(None, host, None, index)

    # filter for the types we are going to work with
    import_types = {}
    for t, s in config.get("types", {}).iteritems():
        if s.get("import", False) is True:
            import_types[t] = s

    print("==Carrying out the following import==")
    for import_type, cfg in import_types.iteritems():
        count = "All" if cfg.get("limit", -1) == -1 else cfg.get("limit")
        print("{x} from {y}".format(x=count, y=import_type))
    print("\n")

    if config.get("confirm", True):
        text = raw_input("Continue? [y/N] ")
        if text.lower() != "y":
            exit()

    # remove all the types that we are going to import
    for import_type in import_types.keys():
        esprit.raw.delete(conn, import_type)

    # re-initialise the index (sorting out mappings, etc)
    print("==Initialising Index for Mappings==")
    initialise_index(app)

    mainStore = StoreFactory.get("anon_data")
    tempStore = StoreFactory.tmp()
    container = app.config.get("STORE_ANON_DATA_CONTAINER")

    print("\n==Importing==")
    for import_type, cfg in import_types.iteritems():
        count = "all" if cfg.get("limit", -1) == -1 else cfg.get("limit")
        print("Importing {x} from {y}".format(x=count, y=import_type))
        print("Obtaining {x} from storage".format(x=import_type))

        limit = cfg.get("limit", -1)
        limit = None if limit == -1 else limit

        n = 1
        while True:
            filename = import_type + ".bulk" + "." + str(n)
            handle = mainStore.get(container, filename)
            if handle is None:
                break
            tempStore.store(container, filename + ".gz", source_stream=handle)
            print("Retrieved {x} from storage".format(x=filename))
            handle.close()

            print("Unzipping {x} in temporary store".format(x=filename))
            compressed_file = tempStore.path(container, filename + ".gz")
            uncompressed_file = tempStore.path(container, filename, must_exist=False)
            with gzip.open(compressed_file, "rb") as f_in, open(uncompressed_file, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
            tempStore.delete(container, filename + ".gz")

            print("Importing from {x}".format(x=filename))
            imported_count = esprit.tasks.bulk_load(conn, import_type, uncompressed_file,
                                                    limit=limit, max_content_length=config.get("max_content_length", 100000000))
            tempStore.delete(container, filename)

            if limit is not None and imported_count != -1:
                limit -= imported_count
            if limit is not None and limit <= 0:
                break

            n += 1

    tempStore.delete(container)
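
This version of do_import is Python 2 code: dict.iteritems() and raw_input() do not exist in Python 3. A sketch of what the type-filtering and confirmation steps would look like under Python 3, with a stand-in config added purely for illustration:

# Stand-in config for illustration only; the real one is supplied by the caller.
config = {"types": {"journal": {"import": True, "limit": -1}}, "confirm": False}

# dict.iteritems() -> dict.items(), raw_input() -> input()
import_types = {t: s for t, s in config.get("types", {}).items()
                if s.get("import", False) is True}

if config.get("confirm", True):
    text = input("Continue? [y/N] ")
    if text.lower() != "y":
        raise SystemExit()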
Code example #6
File: public_data_dump.py  Project: DOAJ/doaj
    def run(self):
        """
        Execute the task as specified by the background_job
        :return:
        """
        job = self.background_job
        params = job.params

        clean = self.get_param(params, 'clean')
        prune = self.get_param(params, 'prune')
        types = self.get_param(params, 'types')

        tmpStore = StoreFactory.tmp()
        mainStore = StoreFactory.get("public_data_dump")
        container = app.config.get("STORE_PUBLIC_DATA_DUMP_CONTAINER")

        if clean:
            mainStore.delete_container(container)

        # create dir with today's date
        day_at_start = dates.today()

        # Do the search and save it
        page_size = app.config.get("DISCOVERY_BULK_PAGE_SIZE", 1000)
        records_per_file = app.config.get('DISCOVERY_RECORDS_PER_FILE', 100000)

        if types == 'all':
            types = ['article', 'journal']
        else:
            types = [types]

        urls = {"article" : None, "journal" : None}
        sizes = {"article" : None, "journal" : None}

        # Scroll for article and/or journal
        for typ in types:
            job.add_audit_message(dates.now() + u": Starting export of " + typ)

            out_dir = tmpStore.path(container, "doaj_" + typ + "_data_" + day_at_start, create_container=True, must_exist=False)
            out_name = os.path.basename(out_dir)
            zipped_name = out_name + ".tar.gz"
            zip_dir = os.path.dirname(out_dir)
            zipped_path = os.path.join(zip_dir, zipped_name)
            tarball = tarfile.open(zipped_path, "w:gz")

            file_num = 1
            out_file, path, filename = self._start_new_file(tmpStore, container, typ, day_at_start, file_num)

            first_in_file = True
            count = 0
            for result in DiscoveryApi.scroll(typ, None, None, page_size, scan=True):
                if not first_in_file:
                    out_file.write(",\n")
                else:
                    first_in_file = False
                out_file.write(json.dumps(result))
                count += 1

                if count >= records_per_file:
                    file_num += 1
                    self._finish_file(tmpStore, container, filename, path, out_file, tarball)
                    out_file, path, filename = self._start_new_file(tmpStore, container, typ, day_at_start, file_num)
                    first_in_file = True
                    count = 0

            if count > 0:
                self._finish_file(tmpStore, container, filename, path, out_file, tarball)

            tarball.close()

            # Copy the source directory to main store
            try:
                filesize = self._copy_on_complete(mainStore, tmpStore, container, zipped_path)
            except Exception as e:
                tmpStore.delete_container(container)
                raise BackgroundException("Error copying {0} data on complete {1}\n".format(typ, e.message))

            store_url = mainStore.url(container, zipped_name)
            urls[typ] = store_url
            sizes[typ] = filesize

        if prune:
            self._prune_container(mainStore, container, day_at_start, types)

        self.background_job.add_audit_message(u"Removing temp store container {x}".format(x=container))
        tmpStore.delete_container(container)

        # finally update the cache
        cache.Cache.cache_public_data_dump(urls["article"], sizes["article"], urls["journal"], sizes["journal"])

        job.add_audit_message(dates.now() + u": done")
Code example #7
File: public_data_dump.py  Project: mauromsl/doaj
    def run(self):
        """
        Execute the task as specified by the background_job
        :return:
        """
        job = self.background_job
        params = job.params

        clean = self.get_param(params, 'clean')
        prune = self.get_param(params, 'prune')
        types = self.get_param(params, 'types')

        tmpStore = StoreFactory.tmp()
        mainStore = StoreFactory.get("public_data_dump")
        container = app.config.get("STORE_PUBLIC_DATA_DUMP_CONTAINER")

        if clean:
            mainStore.delete_container(container)
            job.add_audit_message("Deleted existing data dump files")
            job.save()

        # create dir with today's date
        day_at_start = dates.today()

        # Do the search and save it
        page_size = app.config.get("DISCOVERY_BULK_PAGE_SIZE", 1000)
        records_per_file = app.config.get('DISCOVERY_RECORDS_PER_FILE', 100000)

        if types == 'all':
            types = ['article', 'journal']
        else:
            types = [types]

        urls = {"article": None, "journal": None}
        sizes = {"article": None, "journal": None}

        # Scroll for article and/or journal
        for typ in types:
            job.add_audit_message(dates.now() + ": Starting export of " + typ)
            job.save()

            out_dir = tmpStore.path(container,
                                    "doaj_" + typ + "_data_" + day_at_start,
                                    create_container=True,
                                    must_exist=False)
            out_name = os.path.basename(out_dir)
            zipped_name = out_name + ".tar.gz"
            zip_dir = os.path.dirname(out_dir)
            zipped_path = os.path.join(zip_dir, zipped_name)
            tarball = tarfile.open(zipped_path, "w:gz")

            file_num = 1
            out_file, path, filename = self._start_new_file(
                tmpStore, container, typ, day_at_start, file_num)

            first_in_file = True
            count = 0
            for result in DiscoveryApi.scroll(typ,
                                              None,
                                              None,
                                              page_size,
                                              scan=True):
                if not first_in_file:
                    out_file.write(",\n")
                else:
                    first_in_file = False
                out_file.write(json.dumps(result))
                count += 1

                if count >= records_per_file:
                    file_num += 1
                    self._finish_file(tmpStore, container, filename, path,
                                      out_file, tarball)
                    job.save()
                    out_file, path, filename = self._start_new_file(
                        tmpStore, container, typ, day_at_start, file_num)
                    first_in_file = True
                    count = 0

            if count > 0:
                self._finish_file(tmpStore, container, filename, path,
                                  out_file, tarball)
                job.save()

            tarball.close()

            # Copy the source directory to main store
            try:
                filesize = self._copy_on_complete(mainStore, tmpStore,
                                                  container, zipped_path)
                job.save()
            except Exception as e:
                tmpStore.delete_container(container)
                raise BackgroundException(
                    "Error copying {0} data on complete {1}\n".format(
                        typ, str(e)))

            store_url = mainStore.url(container, zipped_name)
            urls[typ] = store_url
            sizes[typ] = filesize

        if prune:
            self._prune_container(mainStore, container, day_at_start, types)
            job.save()

        self.background_job.add_audit_message(
            "Removing temp store container {x}".format(x=container))
        tmpStore.delete_container(container)

        # finally update the cache
        cache.Cache.cache_public_data_dump(urls["article"], sizes["article"],
                                           urls["journal"], sizes["journal"])

        job.add_audit_message(dates.now() + ": done")
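
_start_new_file and _finish_file are methods of the task class and are not included in this excerpt. The sketch below is hypothetical, inferred only from how they are called above: the first is assumed to open a fresh batch file in the temporary store and return (out_file, path, filename); the second is assumed to close the file, add it to the tarball and tidy the temporary store. The file-naming scheme and the JSON array brackets are assumptions, not the project's actual implementation.

    def _start_new_file(self, store, container, typ, day_at_start, file_num):
        # Hypothetical naming scheme; the real one is not shown in this excerpt
        filename = "doaj_{t}_data_{d}_{n}.json".format(t=typ, d=day_at_start, n=file_num)
        path = store.path(container, filename, create_container=True, must_exist=False)
        out_file = open(path, "w", encoding="utf-8")
        out_file.write("[")          # assumed: each batch is written as a JSON array
        return out_file, path, filename

    def _finish_file(self, store, container, filename, path, out_file, tarball):
        out_file.write("]")          # assumed: close the JSON array
        out_file.close()
        tarball.add(path, arcname=filename)      # pack the finished batch into the .tar.gz
        store.delete_file(container, filename)   # keep the temporary store tidy as we go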
Code example #8
File: journal.py  Project: DOAJ/doaj
    def csv(self, prune=True):
        """
        Generate the Journal CSV

        :param set_cache: whether to update the cache
        :param out_dir: the directory to output the file to.  If set_cache is True, this argument will be overridden by the cache container
        :return: Tuple of (attachment_name, URL)
        """
        # first validate the incoming arguments to ensure that we've got the right thing
        argvalidate("csv", [
            {"arg": prune, "allow_none" : False, "arg_name" : "prune"}
        ], exceptions.ArgumentException)

        filename = 'journalcsv__doaj_' + datetime.strftime(datetime.utcnow(), '%Y%m%d_%H%M') + '_utf8.csv'
        container_id = app.config.get("STORE_CACHE_CONTAINER")
        tmpStore = StoreFactory.tmp()
        out = tmpStore.path(container_id, filename, create_container=True, must_exist=False)

        YES_NO = {True: 'Yes', False: 'No', None: '', '': ''}

        def _make_journals_csv(file_object):
            """
            Make a CSV file of information for all journals.
            :param file_object: a utf8 encoded file object.
            """

            cols = {}
            for j in models.Journal.all_in_doaj(page_size=100000):                     # 10x how many journals we have right now
                assert isinstance(j, models.Journal)                                               # for pycharm type inspection
                bj = j.bibjson()
                issn = bj.get_one_identifier(idtype=bj.P_ISSN)
                if issn is None:
                    issn = bj.get_one_identifier(idtype=bj.E_ISSN)
                if issn is None:
                    continue

                kvs = Journal2QuestionXwalk.journal2question(j)
                meta_kvs = _get_doaj_meta_kvs(j)
                article_kvs = _get_article_kvs(j)
                cols[issn] = kvs + meta_kvs + article_kvs

            issns = cols.keys()
            issns.sort()

            csvwriter = clcsv.UnicodeWriter(file_object)
            qs = None
            for i in issns:
                if qs is None:
                    qs = [q for q, _ in cols[i]]
                    csvwriter.writerow(qs)
                vs = [v for _, v in cols[i]]
                csvwriter.writerow(vs)

        def _get_doaj_meta_kvs(journal):
            """
            Get key, value pairs for some meta information we want from the journal object
            :param journal: a models.Journal
            :return: a list of (key, value) tuples for our metadata
            """
            kvs = [
                ("DOAJ Seal", YES_NO.get(journal.has_seal(), "")),
                ("Tick: Accepted after March 2014", YES_NO.get(journal.is_ticked(), "")),
                ("Added on Date", journal.created_date),
                ("Subjects", ' | '.join(journal.bibjson().lcc_paths()))
            ]
            return kvs

        def _get_article_kvs(journal):
            stats = journal.article_stats()
            kvs = [
                ("Number of Article Records", str(stats.get("total"))),
                ("Most Recent Article Added", stats.get("latest"))
            ]
            return kvs

        with codecs.open(out, 'wb', encoding='utf-8') as csvfile:
            _make_journals_csv(csvfile)

        mainStore = StoreFactory.get("cache")
        try:
            mainStore.store(container_id, filename, source_path=out)
            url = mainStore.url(container_id, filename)
        finally:
            tmpStore.delete_file(container_id, filename) # don't delete the container, just in case someone else is writing to it

        action_register = []
        if prune:
            def sort(filelist):
                rx = "journalcsv__doaj_(.+?)_utf8.csv"
                return sorted(filelist, key=lambda x: datetime.strptime(re.match(rx, x).groups(1)[0], '%Y%m%d_%H%M'), reverse=True)
            def filter(filename):
                return filename.startswith("journalcsv__")
            action_register = prune_container(mainStore, container_id, sort, filter=filter, keep=2)

        # update the ES record to point to the new file
        models.Cache.cache_csv(url)
        return url, action_register
Code example #9
File: anon_import.py  Project: DOAJ/doaj
def do_import(config):
    host = app.config["ELASTIC_SEARCH_HOST"]
    index = app.config["ELASTIC_SEARCH_DB"]
    if config.get("elastic_search_host") is not None:
        host = config.get("elastic_search_host")
        app.config["ELASTIC_SEARCH_HOST"] = host
    if config.get("elastic_search_db") is not None:
        index = config.get("elastic_search_db")
        app.config["ELASTIC_SEARCH_DB"] = index

    print("\n")
    print("Using host {x} and index {y}\n".format(x=host, y=index))
    conn = esprit.raw.make_connection(None, host, None, index)

    # filter for the types we are going to work with
    import_types = {}
    for t, s in config.get("types", {}).iteritems():
        if s.get("import", False) is True:
            import_types[t] = s

    print("==Carrying out the following import==")
    for import_type, cfg in import_types.iteritems():
        count = "All" if cfg.get("limit", -1) == -1 else cfg.get("limit")
        print("{x} from {y}".format(x=count, y=import_type))
    print("\n")

    if config.get("confirm", True):
        text = raw_input("Continue? [y/N] ")
        if text.lower() != "y":
            exit()

    # remove all the types that we are going to import
    for import_type in import_types.keys():
        esprit.raw.delete(conn, import_type)

    # re-initialise the index (sorting out mappings, etc)
    print("==Initialising Index for Mappings==")
    initialise_index(app)

    mainStore = StoreFactory.get("anon_data")
    tempStore = StoreFactory.tmp()
    container = app.config.get("STORE_ANON_DATA_CONTAINER")

    print("\n==Importing==")
    for import_type, cfg in import_types.iteritems():
        count = "all" if cfg.get("limit", -1) == -1 else cfg.get("limit")
        print("Importing {x} from {y}".format(x=count, y=import_type))
        print("Obtaining {x} from storage".format(x=import_type))

        limit = cfg.get("limit", -1)
        limit = None if limit == -1 else limit

        n = 1
        while True:
            filename = import_type + ".bulk" + "." + str(n)
            handle = mainStore.get(container, filename)
            if handle is None:
                break
            tempStore.store(container, filename + ".gz", source_stream=handle)
            print("Retrieved {x} from storage".format(x=filename))
            handle.close()

            print("Unzipping {x} in temporary store".format(x=filename))
            compressed_file = tempStore.path(container, filename + ".gz")
            uncompressed_file = tempStore.path(container, filename, must_exist=False)
            with gzip.open(compressed_file, "rb") as f_in, open(uncompressed_file, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
            tempStore.delete_file(container, filename + ".gz")

            print("Importing from {x}".format(x=filename))
            imported_count = esprit.tasks.bulk_load(conn, import_type, uncompressed_file,
                                                    limit=limit, max_content_length=config.get("max_content_length", 100000000))
            tempStore.delete_file(container, filename)

            if limit is not None and imported_count != -1:
                limit -= imported_count
            if limit is not None and limit <= 0:
                break

            n += 1

    tempStore.delete_file(container)
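
For reference, a hypothetical example of the config dict that do_import() expects, inferred only from the keys the function reads above; the real configurations ship with the DOAJ repository.

example_config = {
    "elastic_search_host": "http://localhost:9200",   # optional override of the app config
    "elastic_search_db": "doaj",                       # optional override of the app config
    "confirm": True,                                   # prompt before deleting and importing
    "max_content_length": 100000000,                   # passed through to esprit.tasks.bulk_load
    "types": {
        "journal": {"import": True, "limit": -1},      # -1 means import everything
        "article": {"import": True, "limit": 1000},
        "account": {"import": False}
    }
}
# do_import(example_config)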