Example #1
def reloadRecords(ctx, status_code):
    L = getLogger()
    L.info("reloadRecords, status_code = %s", status_code)
    # Not implemented yet: the raise below short-circuits the session handling beneath it.
    raise NotImplementedError("reloadRecords")
    session = SQLModelDAO(ctx.obj["db_url"]).get_session()
    try:
        pass

    finally:
        session.close()
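These command functions take a click-style ctx whose obj dict carries the database URL. Below is a minimal sketch of how such a context could be wired up; click is assumed, and the group, command, and option names are made up for illustration rather than taken from the project's actual CLI.

import click


@click.group()
@click.option("--db_url", default=None, help="database connection URL (assumed option name)")
@click.pass_context
def main(ctx, db_url):
    # Store shared settings on ctx.obj so subcommands can read ctx.obj["db_url"].
    ctx.ensure_object(dict)
    ctx.obj["db_url"] = db_url


@main.command("reload")
@click.option("--status_code", default=500, type=int)
@click.pass_context
def reload_command(ctx, status_code):
    # Delegates to the reloadRecords function shown in Example #1.
    reloadRecords(ctx, status_code)


if __name__ == "__main__":
    main()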
Example #2
def reparseRecords(ctx):
    def _yieldRecordsByPage(qry, pk):
        # Walk the query in fixed-size pages ordered by primary key,
        # stopping when a page comes back empty.
        offset = 0
        page_size = 5000
        while True:
            n = 0
            for rec in qry.order_by(pk).offset(offset).limit(page_size):
                n += 1
                yield rec
            if n == 0:
                break
            offset += page_size

    L = getLogger()
    batch_size = 50
    L.info("reparseRecords with batch size: %s", batch_size)
    session = SQLModelDAO(ctx.obj["db_url"]).get_session()
    try:
        i = 0
        qry = session.query(Thing)
        pk = Thing.id
        for thing in _yieldRecordsByPage(qry, pk):
            itype = thing.item_type
            isb_lib.sesar_adapter.reparseThing(thing)
            L.info("%s: reparse %s, %s -> %s", i, thing.id, itype, thing.item_type)
            i += 1
            if i % batch_size == 0:
                session.commit()
        # don't forget to commit the remainder!
        session.commit()
    finally:
        session.close()
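The nested _yieldRecordsByPage generator implements plain offset/limit paging: fetch one page ordered by primary key, yield its rows, and stop as soon as a page comes back empty, while the outer loop commits every batch_size records. The following self-contained sketch shows the same paging pattern against a throwaway SQLAlchemy model and an in-memory SQLite database; every name in it is made up for illustration.

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()


class Item(Base):
    __tablename__ = "item"
    id = Column(Integer, primary_key=True)
    name = Column(String)


engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()
session.add_all([Item(name=f"item-{i}") for i in range(12)])
session.commit()


def yield_records_by_page(qry, pk, page_size=5):
    # Same shape as _yieldRecordsByPage above: page through the query
    # ordered by primary key until an empty page is returned.
    offset = 0
    while True:
        n = 0
        for rec in qry.order_by(pk).offset(offset).limit(page_size):
            n += 1
            yield rec
        if n == 0:
            break
        offset += page_size


for item in yield_records_by_page(session.query(Item), Item.id):
    print(item.id, item.name)

Offset paging rescans the rows it skips, so for very large tables keyset pagination (filtering on pk greater than the last id seen) is usually cheaper; the sketch simply mirrors the examples as written.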
Example #3
def reparseRelations(ctx):
    def _yieldRecordsByPage(qry, pk):
        # Walk the query in fixed-size pages ordered by primary key,
        # stopping when a page comes back empty.
        offset = 0
        page_size = 5000
        while True:
            n = 0
            for rec in qry.order_by(pk).offset(offset).limit(page_size):
                n += 1
                yield rec
            if n == 0:
                break
            offset += page_size

    L = getLogger()
    rsession = requests.session()
    batch_size = 1000
    L.info("reparseRecords with batch size: %s", batch_size)
    session = SQLModelDAO(ctx.obj["db_url"]).get_session()
    allkeys = set()
    try:
        i = 0
        n = 0
        qry = session.query(Thing).filter(
            Thing.authority_id == isb_lib.geome_adapter.GEOMEItem.AUTHORITY_ID)
        pk = Thing.id
        relations = []
        for thing in _yieldRecordsByPage(qry, pk):
            batch = isb_lib.geome_adapter.reparseRelations(thing)
            relations = relations + batch
            for relation in relations:
                allkeys.add(relation["id"])
            _rel_len = len(relations)
            n += len(batch)
            if i % 25 == 0:
                L.info("%s: relations id:%s num_rel:%s, total:%s", i, thing.id,
                       _rel_len, n)
            if _rel_len > batch_size:
                isb_lib.core.solrAddRecords(
                    rsession, relations, "http://localhost:8983/solr/isb_rel/")
                relations = []
            i += 1
        # don't forget to add the remainder!
        isb_lib.core.solrAddRecords(rsession, relations,
                                    "http://localhost:8983/solr/isb_rel/")
        L.info("%s: relations num_rel:%s, total:%s", i, len(relations), n)
        print(f"Total keys= {len(allkeys)}")
        isb_lib.core.solrCommit(rsession,
                                "http://localhost:8983/solr/isb_rel/")
    finally:
        session.close()
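Both reparse functions accumulate records in a list and flush it to Solr whenever it grows past batch_size, with one final flush after the loop so the remainder is not lost. The same accumulate-and-flush pattern in generic form, where flush stands in for isb_lib.core.solrAddRecords:

def flush_in_batches(items, batch_size, flush):
    buffer = []
    for item in items:
        buffer.append(item)
        if len(buffer) >= batch_size:
            flush(buffer)
            buffer = []
    if buffer:
        # Flush whatever is left after the loop, mirroring the final
        # solrAddRecords call in the examples above.
        flush(buffer)


# Example run: print batches of three numbers.
flush_in_batches(range(10), 3, print)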
Example #4
def reparseRelations(ctx):
    L = getLogger()
    rsession = requests.session()
    batch_size = 5000
    L.info("reparseRecords with batch size: %s", batch_size)
    session = SQLModelDAO(ctx.obj["db_url"]).get_session()
    allkeys = set()
    try:
        i = 0
        n = 0
        relations = []
        thing_iterator = isb_lib.core.ThingRecordIterator(
            session,
            authority_id=isb_lib.sesar_adapter.SESARItem.AUTHORITY_ID,
            page_size=batch_size,
            offset=0,
        )
        for thing in thing_iterator.yieldRecordsByPage():
            batch = isb_lib.sesar_adapter.reparseRelations(thing, as_solr=True)
            relations = relations + batch
            for r in relations:
                allkeys.add(r["id"])
            _rel_len = len(relations)
            n += len(batch)
            if i % 25 == 0:
                L.info(
                    "%s: relations id:%s num_rel:%s, total:%s", i, thing.id, _rel_len, n
                )
            if _rel_len > batch_size:
                isb_lib.core.solrAddRecords(
                    rsession, relations, "http://localhost:8983/solr/isb_rel/"
                )
                relations = []
            i += 1
        isb_lib.core.solrAddRecords(
            rsession, relations, "http://localhost:8983/solr/isb_rel/"
        )
        isb_lib.core.solrCommit(rsession, "http://localhost:8983/solr/isb_rel/")
        print(f"Total keys= {len(allkeys)}")
        # verify records
        # for verifying that all records were added to solr
        # found = 0
        # for _id in allkeys:
        #    res = rsession.get(f"http://localhost:8983/solr/isb_rel/get?id={_id}").json()
        #    if res.get("doc",{}).get("id") == _id:
        #        found = found +1
        #    else:
        #        print(f"Missed: {_id}")
        # print(f"Found = {found}")
    finally:
        session.close()
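solrAddRecords and solrCommit are project helpers; presumably they post a JSON batch of documents to the core's update handler and then issue a commit. A hedged sketch of doing the same directly with requests follows; the core URL is taken from the examples above, the documents are made up, and the equivalence to the helpers is an assumption.

import requests

# Assumption: this approximates what isb_lib.core.solrAddRecords / solrCommit do.
solr_core_url = "http://localhost:8983/solr/isb_rel/"  # from the examples above
docs = [{"id": "rel-1", "source": "demo"}]  # made-up documents

rsession = requests.session()

# Solr's /update handler accepts a JSON array of documents to add or replace.
resp = rsession.post(solr_core_url + "update", json=docs)
resp.raise_for_status()

# Commit so the new documents become visible to searches.
rsession.get(solr_core_url + "update", params={"commit": "true"}).raise_for_status()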
Example #5
def loadRecords(ctx, max_records):
    L = getLogger()
    L.info("loadRecords, max = %s", max_records)
    if max_records == -1:
        max_records = 999999999
    session = SQLModelDAO(ctx.obj["db_url"]).get_session()
    try:
        oldest_record = sqlmodel_database.last_time_thing_created(
            session, isb_lib.sesar_adapter.SESARItem.AUTHORITY_ID
        )
        logging.info("Oldest = %s", oldest_record)
        time.sleep(1)
        loadSesarEntries(session, max_records, start_from=oldest_record)
    finally:
        session.close()
Example #6
def populateIsbCoreSolr(ctx):
    L = getLogger()
    rsession = requests.session()
    db_batch_size = 1000
    solr_batch_size = 20
    L.info("reparseRecords with batch size: %s", db_batch_size)
    session = SQLModelDAO(ctx.obj["db_url"]).get_session()
    solr_url = ctx.obj["solr_url"]
    allkeys = set()
    try:
        offset = 0
        all_core_records = []
        thing_iterator = isb_lib.core.ThingRecordIterator(
            session,
            authority_id=isb_lib.geome_adapter.GEOMEItem.AUTHORITY_ID,
            page_size=db_batch_size,
            offset=offset,
        )
        for thing in thing_iterator.yieldRecordsByPage():
            core_records = isb_lib.geome_adapter.reparseAsCoreRecord(thing)
            print("Just added core_records: %s", str(core_records))
            all_core_records.extend(core_records)
            for r in all_core_records:
                allkeys.add(r["id"])
            batch_size = len(all_core_records)
            if batch_size > solr_batch_size:
                isb_lib.core.solrAddRecords(rsession,
                                            all_core_records,
                                            url=solr_url)
                all_core_records = []
        if len(all_core_records) > 0:
            isb_lib.core.solrAddRecords(rsession,
                                        all_core_records,
                                        url=solr_url)
        isb_lib.core.solrCommit(rsession, url=solr_url)
        print(f"Total keys= {len(allkeys)}")
        # verify records
        # for verifying that all records were added to solr
        # found = 0
        # for _id in allkeys:
        #    res = rsession.get(f"http://localhost:8983/solr/isb_rel/get?id={_id}").json()
        #    if res.get("doc",{}).get("id") == _id:
        #        found = found +1
        #    else:
        #        print(f"Missed: {_id}")
        # print(f"Found = {found}")
    finally:
        session.close()
Example #7
def load_records(ctx, max_records):
    L = get_logger()
    session = SQLModelDAO(ctx.obj["db_url"]).get_session()
    max_created = sqlmodel_database.last_time_thing_created(
        session, isb_lib.opencontext_adapter.OpenContextItem.AUTHORITY_ID)
    L.info("loadRecords: %s", str(session))
    # ctx.obj["db_url"] = db_url
    load_open_context_entries(session, max_records, max_created)
Example #8
def loadRecords(ctx, max_records):
    L = getLogger()
    L.info("loadRecords, max = %s", max_records)
    if max_records == -1:
        max_records = 999999999

    session = SQLModelDAO(ctx.obj["db_url"]).get_session()
    try:
        oldest_record = None
        # Most recent Thing creation time; loading resumes from this timestamp.
        res = session.query(Thing).order_by(Thing.tcreated.desc()).first()
        if res is not None:
            oldest_record = res.tcreated
        logging.info("Oldest = %s", oldest_record)
        time.sleep(1)
        loadGEOMEEntries(session, max_records, start_from=oldest_record)
    finally:
        session.close()
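The order_by(Thing.tcreated.desc()).first() idiom above fetches the most recently created Thing; the same timestamp can be read with an aggregate, assuming the session and the Thing model from the example are in scope.

from sqlalchemy import func

# Equivalent to ordering by tcreated descending and taking the first row.
newest_created = session.query(func.max(Thing.tcreated)).scalar()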
Example #9
def __init__(
    self,
    db_url: typing.AnyStr,
    authority_id: typing.AnyStr,
    db_batch_size: int,
    solr_batch_size: int,
    solr_url: typing.AnyStr,
    offset: int = 0,
    min_time_created: datetime.datetime = None,
):
    self._db_session = SQLModelDAO(db_url).get_session()
    self._authority_id = authority_id
    self._min_time_created = min_time_created
    self._thing_iterator = ThingRecordIterator(
        self._db_session,
        authority_id=self._authority_id,
        page_size=db_batch_size,
        offset=offset,
        min_time_created=min_time_created,
    )
    self._db_batch_size = db_batch_size
    self._solr_batch_size = solr_batch_size
    self._solr_url = solr_url
Example #10
class CoreSolrImporter:
    def __init__(
        self,
        db_url: typing.AnyStr,
        authority_id: typing.AnyStr,
        db_batch_size: int,
        solr_batch_size: int,
        solr_url: typing.AnyStr,
        offset: int = 0,
        min_time_created: datetime.datetime = None,
    ):
        self._db_session = SQLModelDAO(db_url).get_session()
        self._authority_id = authority_id
        self._min_time_created = min_time_created
        self._thing_iterator = ThingRecordIterator(
            self._db_session,
            authority_id=self._authority_id,
            page_size=db_batch_size,
            offset=offset,
            min_time_created=min_time_created,
        )
        self._db_batch_size = db_batch_size
        self._solr_batch_size = solr_batch_size
        self._solr_url = solr_url

    def run_solr_import(
            self, core_record_function: typing.Callable
    ) -> typing.Set[typing.AnyStr]:
        getLogger().info(
            "importing solr records with db batch size: %s, solr batch size: %s",
            self._db_batch_size,
            self._solr_batch_size,
        )
        allkeys = set()
        rsession = requests.session()
        try:
            core_records = []
            for thing in self._thing_iterator.yieldRecordsByPage():
                try:
                    core_record = core_record_function(thing)
                except Exception as e:
                    getLogger().error(
                        "Failed trying to run transformer, skipping record %s: %s",
                        str(thing.resolved_content),
                        e,
                    )
                    continue

                core_record["source"] = self._authority_id
                core_records.append(core_record)
                for r in core_records:
                    allkeys.add(r["id"])
                batch_size = len(core_records)
                if batch_size > self._solr_batch_size:
                    solrAddRecords(
                        rsession,
                        core_records,
                        url=self._solr_url,
                    )
                    getLogger().info(
                        "Just added solr records, length of all keys is %d",
                        len(allkeys),
                    )
                    core_records = []
            if len(core_records) > 0:
                solrAddRecords(
                    rsession,
                    core_records,
                    url=self._solr_url,
                )
            solrCommit(rsession, url=self._solr_url)
            # verify records
            # for verifying that all records were added to solr
            # found = 0
            # for _id in allkeys:
            #    res = rsession.get(f"http://localhost:8983/solr/isb_rel/get?id={_id}").json()
            #    if res.get("doc",{}).get("id") == _id:
            #        found = found +1
            #    else:
            #        print(f"Missed: {_id}")
            # print(f"Found = {found}")
        finally:
            self._db_session.close()
        return allkeys
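A sketch of how CoreSolrImporter might be used. The import path, connection strings, and the transformer are assumptions for illustration; run_solr_import only needs a callable that turns a Thing into a dict containing at least an "id" key.

import isb_lib.sesar_adapter
from isb_lib.core import CoreSolrImporter  # assumed module path for the class above


def to_core_record(thing):
    # Hypothetical transformer: return one dict per Thing with at least an "id".
    return {"id": thing.id, "item_type": thing.item_type}


importer = CoreSolrImporter(
    db_url="postgresql+psycopg2://user:pass@localhost/isb",  # made-up database URL
    authority_id=isb_lib.sesar_adapter.SESARItem.AUTHORITY_ID,
    db_batch_size=1000,
    solr_batch_size=50,
    solr_url="http://localhost:8983/solr/isb_core_records/",  # made-up core name
)
added_ids = importer.run_solr_import(to_core_record)
print(f"Added {len(added_ids)} records to Solr")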
Example #11
def load_records(ctx, max_records, file):
    session = SQLModelDAO(ctx.obj["db_url"]).get_session()
    logging.info("loadRecords: %s", str(session))
    load_smithsonian_entries(session, max_records, file, None)
Example #12
import os

import fastapi
import fastapi.middleware.cors
import fastapi.staticfiles
import fastapi.templating

from isb_web import config  # assumed location of the Settings class used below
from isb_web.sqlmodel_database import SQLModelDAO

THIS_PATH = os.path.dirname(os.path.abspath(__file__))
WEB_ROOT = config.Settings().web_root
MEDIA_JSON = "application/json"
MEDIA_NQUADS = "application/n-quads"
MEDIA_GEO_JSON = "application/geo+json"

tags_metadata = [
    {
        "name": "heatmaps",
        "description": "Heatmap representations of Things, suitable for consumption by mapping APIs",
    }
]
app = fastapi.FastAPI(root_path=WEB_ROOT, openapi_tags=tags_metadata)
dao = SQLModelDAO(None)

app.add_middleware(
    fastapi.middleware.cors.CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

app.mount(
    "/static",
    fastapi.staticfiles.StaticFiles(directory=os.path.join(THIS_PATH, "static")),
    name="static",
)
templates = fastapi.templating.Jinja2Templates(
    directory=os.path.join(THIS_PATH, "templates")