def make_anno(data, dbdocs):
    """Build a ``models.Annotation`` from already-validated h api data.

    :param data: dict of annotation fields; the caller must already have
        removed the 'document' key so it cannot clobber batch-loaded docs
    :param dbdocs: mapping of normalized target uri -> existing Document
        row; the matching document id is assigned to the new annotation
    :return: a models.Annotation instance (not yet added to any session)
    """
    annotation = models.Annotation(**data)
    # FIXME for batch loads the per-annotation normalization overhead here
    # is stupid beyond belief, but it is what keeps the document fk correct
    annotation.document_id = dbdocs[uri_normalize(annotation.target_uri)].id
    # NOTE: the previous version carried unreachable code after this return
    # that called update_document_metadata with names undefined in this
    # scope; it has been removed.
    return annotation
def find_by_uris(cls, session, uris):
    """Return a query of Documents claimed by any uri in *uris*.

    Each uri is normalized before lookup so spelling variants still match.
    """
    normalized = list(map(uri_normalize, uris))
    claims = session.query(DocumentURI)
    claims = claims.filter(DocumentURI.uri_normalized.in_(normalized))
    claims = claims.distinct(DocumentURI.document_id)
    matching = claims.subquery()
    return session.query(Document).join(matching)
def find_by_uris(cls, session, uris):
    """Find documents matching any of *uris* (normalized before comparison)."""
    wanted = [uri_normalize(uri) for uri in uris]
    in_wanted = DocumentURI.uri_normalized.in_(wanted)
    matching_claims = (session.query(DocumentURI)
                       .filter(in_wanted)
                       .distinct(DocumentURI.document_id)
                       .subquery())
    return session.query(Document).join(matching_claims)
def sync_anno_stream(self, search_after=None, stop_at=None):
    """Streaming, one-annotation-at-a-time version of sync.

    :param search_after: updated-timestamp to resume the api stream from
    :param stop_at: timestamp at which to stop pulling rows
    :yield: (row, 'TODO') pairs; per-row conversion to db records is not
        implemented yet (see the sketch below)
    """
    # BUG FIX: previously this passed search_after=last_updated, but
    # last_updated is undefined in this scope -> NameError on first call;
    # the intent was clearly to forward the search_after parameter.
    for row in self.yield_from_api(search_after=search_after, stop_at=stop_at):
        yield row, 'TODO'
        # TODO the conversion below was unreachable dead code (it followed a
        # bare continue) and referenced undefined names (validate, dbdocs,
        # datas); kept as a commented sketch of the intended implementation:
        # datum = validate(row)  # roughly 30x slower than quickload
        # the h code being called assumes these are new annos
        # datum['id'] = row['id']
        # datum['created'] = row['created']
        # datum['updated'] = row['updated']
        # document_dict = datum.pop('document')
        # document_uri_dicts = document_dict['document_uri_dicts']
        # document_meta_dicts = document_dict['document_meta_dicts']
        # a = [models.Annotation(
        #          **d, document_id=dbdocs[uri_normalize(d['target_uri'])].id)
        #      for d in datas]  # slow
        # self.log.debug('making annotations')
        # self.session.add_all(a)
        # self.log.debug('adding all annotations')
def do_check(): api_rows # so that it is accessible in function scope self.log.debug('checking for consistency') annos = self.session.query(models.Annotation).\ filter(models.Annotation.groupid == self.group).all() #docs = self.session.query(models.Document).all() durs = self.session.query(models.DocumentURI).all() doc_uris = defaultdict(set) _ = [doc_uris[d.document_id].add(d.uri) for d in durs] doc_uris = dict(doc_uris) #dms = self.session.query(models.DocumentMeta).all() #doc_mismatch = [a for a in annos if anno_id_to_doc_id[a.id] != a.document.id] # super slow due to orm fetches doc_missing = [a for a in annos if a.id not in anno_id_to_doc_id] assert not doc_missing doc_mismatch = [ a for a in annos if anno_id_to_doc_id[a.id] != a.document_id ] assert not doc_mismatch, doc_mismatch # don't use the orm to do this, it is too slow even if you send the other queries above #embed() uri_mismatch = [(a.target_uri, doc_uris[a.document_id], a) for a in annos if a.target_uri not in doc_uris[a.document_id]] # NOTE hypothesis only allows 1 record per normalized uri, so we have to normalize here as well maybe_mismatch = set( frozenset(s) for u, s, a in uri_mismatch if not s.add(u)) h_mismatch = set(s for s in maybe_mismatch if len(frozenset(uri_normalize(u) for u in s)) > 1) self.log.debug(f'h mismatch has {len(h_mismatch)} cases') # the above normalization is not sufficient for cases where there are two # hypothes.is normalized uris AND a scibot normalized uri as well super_mismatch = set( s for s in h_mismatch if len(frozenset(uri_normalization(u) for u in s)) > 1) assert not super_mismatch, super_mismatch
def uri_normalization(uri):
    """Produce a scibot-normalized identity key for *uri*.

    NOTE: this does NOT produce uris -- the scheme is stripped and
    publisher-specific suffixes are removed, so the result is only useful
    for grouping/equality checks, never for dereferencing.

    :param uri: the uri string to normalize
    :return: the normalized (scheme-less) form, or the original uri for
        urn:x-pdf: identifiers, or a sentinel string for known-bad uris
    :raises TypeError: when *uri* cannot be split like a uri and is not a
        pdf urn or a member of bad_uris
    """
    try:
        # strip hypothesis extension prefix
        if uri.startswith('chrome-extension://bjfhmglciegochdpefhhlphglcehbmek/content/web/viewer.html?file='):
            junk, uri = uri.split('=', 1)

        # universal fixes
        no_fragment, *_frag = uri.rsplit('#', 1)
        no_trailing_slash = no_fragment.rstrip('/')  # annoying
        _scheme, no_scheme = no_trailing_slash.split('://', 1)

        # special cases
        if 'frontiersin.org' in no_scheme:
            # og:url on frontiers is incorrect
            no_scheme = no_scheme.replace('article/', 'articles/')
        elif 'fasebj.org' in no_scheme:
            # FIXME this one has _all_ the variants :/
            no_scheme = (no_scheme
                         .replace('.abstract', '')
                         .replace('.full', '')
                         .replace('.pdf', '')
                         )
        elif no_scheme.endswith('?needAccess=true'):
            no_scheme = no_scheme[:-len('?needAccess=true')]
        elif '?systemMessage' in no_scheme:
            no_scheme, junk = no_scheme.rsplit('?systemMessage', 1)

        # specific fixes
        if anyMembers(no_scheme,
                      'acs.org',
                      'ahajournals.org',
                      'biologicalpsychiatryjournal.com',
                      'ebiomedicine.com',
                      'fasebj.org',
                      'frontiersin.org',
                      'future-science.com',
                      'hindawi.com',
                      'ieee.org',
                      'jclinepi.com',
                      'jpeds.com',
                      'liebertpub.com',
                      'mitpressjournals.org',
                      'molbiolcell.org',
                      'molmetab.com',
                      'neurobiologyofaging.org',
                      'physiology.org',
                      'sagepub.com',
                      'sciencedirect.com',
                      'tandfonline.com',
                      'theriojournal.com',
                      'wiley.com',):
            # NOTE not all the above hit all of these
            # almost all still resolve
            normalized = (no_scheme
                          .replace('/abstract', '')
                          .replace('/abs', '')
                          .replace('/fulltext', '')
                          .replace('/full', '')
                          .replace('/pdf', ''))
        #elif ('sciencedirect.com' in no_scheme):
            #normalized = (no_scheme
                          #.replace('/abs', ''))
        elif ('cell.com' in no_scheme):
            normalized = (no_scheme  # FIXME looks like cell uses /abstract in og:url
                          .replace('/abstract', '/XXX')
                          .replace('/fulltext', '/XXX'))
        elif 'jneurosci.org' in no_scheme:
            # TODO content/early -> resolution_chain(doi)
            normalized = (no_scheme
                          .replace('.short', '')
                          .replace('.long', '')
                          .replace('.full', '')
                          .replace('.pdf', '')  # note .full.pdf is a thing
                          )
        elif 'pnas.org' in no_scheme:
            normalized = (no_scheme
                          .replace('.short', '')
                          .replace('.long', '')
                          .replace('.full', ''))
        elif 'mdpi.com' in no_scheme:
            normalized = (no_scheme
                          .replace('/htm', ''))
        elif 'f1000research.com' in no_scheme:
            # you should be ashamed of yourselves for being in here for this reason
            normalized, *maybe_version = no_scheme.rsplit('/v', 1)
        elif 'academic.oup.com' in no_scheme:
            normalized, *maybesr = no_scheme.rsplit('?searchresult=', 1)
            _normalized, maybe_junk = normalized.rsplit('/', 1)
            numbers = '0123456789'
            if (maybe_junk[0] not in numbers or  # various ways to detect the human readable junk after the id
                maybe_junk[-1] not in numbers or
                '-' in maybe_junk or
                len(maybe_junk) > 20):
                normalized = _normalized
        elif anyMembers(no_scheme, 'jci.org', 'nature.com'):
            # cases where safe to remove query fragment
            normalized, *_query = no_scheme.rsplit('?', 1)
            normalized, *table_number = normalized.rsplit('/tables/', 1)
        elif 'pubmed/?term=' in no_scheme and noneMembers(no_scheme, ' ', '+'):
            normalized = no_scheme.replace('?term=', '')
        elif 'nih.gov/pubmed/?' in no_scheme:
            # FIXME scibot vs client norm?
            normalized = no_scheme.replace(' ', '+')
        elif 'govhttp' in no_scheme:
            # lol oh dear
            # NOTE(review): repairs uris where two urls were fused together,
            # e.g. '...nih.govhttp://...'; assumes 'govhttp' occurs exactly
            # once (split would yield >2 parts otherwise) -- TODO confirm
            hrm, oops = no_scheme.split('govhttp')
            ded, wat = oops.split('//', 1)
            blargh, suffix = wat.split('/', 1)
            normalized = hrm + 'gov/pmc/' + suffix
        elif 'table/undtbl' in no_scheme:
            # NOTE(review): rsplit here has no maxsplit; a uri containing
            # 'table/undtbl' twice would raise ValueError -- TODO confirm
            normalized, table_number = no_scheme.rsplit('table/undtbl')
        elif anyMembers(no_scheme,
                        'index.php?',
                        ):
            # cases where we just use hypothes.is normalization
            _scheme, normalized = uri_normalize(uri).split('://')  # FIXME h dependency
        else:
            normalized = no_scheme

        # NOTE(review): the bare string literals below are no-op expression
        # statements kept as examples of problem uris not yet handled above
        'onlinelibrary.wiley.com/doi/10.1002/cne.23727?wol1URL=/doi/10.1002/cne.23727®ionCode=US-CA&identityKey=e2523300-b934-48c9-b08e-940de05d7335'
        'www.jove.com/video/55441/?language=Japanese'
        'www.nature.com/neuro/journal/v19/n5/full/nn.4282.html'
        'www.nature.com/cr/journal/vaop/ncurrent/full/cr201669a.html'
        'https://www.nature.com/articles/cr201669'

        #{'www.ingentaconnect.com/content/umrsmas/bullmar/2017/00000093/00000002/art00006':
        #[OntId('DOI:10.5343/bms.2016.1044'), OntId('DOI:info:doi/10.5343/bms.2016.1044')]}

        # pmid extract from pmc
        #<meta name="citation_pmid" content="28955177">

        return normalized

    except ValueError as e:  # split fail
        pdf_prefix = 'urn:x-pdf:'
        if uri.startswith(pdf_prefix):
            return uri
        elif uri in bad_uris:
            print('AAAAAAAAAAAAAAAAAAAAAAAAAAA', uri)
            return 'THIS URI IS GARBAGE AND THIS IS ITS NORMALIZED FORM'
        else:
            raise TypeError(uri) from e
def q_prepare_docs(self, rows):
    """Yield the db records needed to load annotation *rows* quickly.

    Generator that, for each api row (sorted by created time), yields
    ``(annotation_id, Document)`` for the document the annotation belongs
    to, then possibly ``(None, DocumentURI)`` for a new self-claim uri and
    ``(None, DocumentMeta)`` for each metadata claim on a brand-new
    document. Existing document_uri rows are read up front with raw SQL to
    avoid orm overhead.

    NOTE(review): assumes self.convert[0] parses the created/updated
    columns and self.uri_records unpacks an api row -- both defined
    elsewhere, confirm there.
    """
    # snapshot of what is already in the db: uri -> (docid, created, updated)
    existing_unnormed = {
        r.uri: (r.document_id,
                self.convert[0](r.created),
                self.convert[0](r.updated))
        for r in self.session.execute('SELECT uri, document_id, created, '
                                      'updated FROM document_uri')
    }
    created_updated = {
        docid: (created, updated)
        for _, (docid, created, updated) in existing_unnormed.items()
    }
    # group existing docids by scibot-normalized uri; each group should
    # contain exactly one docid
    _existing = defaultdict(set)
    _ = [
        _existing[uri_normalization(uri)].add(docid)
        for uri, (docid, created, updated) in existing_unnormed.items()
    ]
    assert not [_ for _ in _existing.values()
                if len(_) > 1]  # TODO proper handling for this case
    # h-normalized uri -> docid, used to decide whether a self-claim
    # DocumentURI needs to be created for a row
    h_existing_unnormed = {
        uri_normalize(uri): docid
        for uri, (docid, created, updated) in existing_unnormed.items()
    }
    existing = {k: next(iter(v))
                for k, v in _existing.items()}  # FIXME issues when things get big
    latest_existing = max(
        u for c, u in created_updated.values()) if created_updated else None

    new_docs = {}  # FIXME this is completely opaque since it is not persisted anywhere
    for row in sorted(rows, key=lambda r: r['created']):
        id = row['id']
        uri, uri_normed, (created, updated, claims) = self.uri_records(row)
        try:
            # document already known: rebuild a transient Document with the
            # stored id/timestamps rather than fetching it via the orm
            docid = existing[uri_normed]
            dc, du = created_updated[docid]
            doc = models.Document(id=docid, created=dc, updated=du)
            if doc.updated < updated:
                # FIXME TODO update the record?
                #self.log.warning('YOU ARE NOT UPDATING A DOC WHEN YOU SHOULD!!!!!!\n'
                                 #f'{docid} {doc.updated} {updated}')
                pass
            do_claims = False
        except KeyError as e:
            # unknown uri: either a genuinely new document or one already
            # created earlier in this batch
            if existing:
                if row['updated'] <= latest_existing:
                    # only need to worry if we are recreating
                    raise e
            if uri_normed not in new_docs:
                do_claims = True
                doc = models.Document(created=created, updated=updated)
                self.session.add(doc)  # TODO perf testing vs add_all
                new_docs[uri_normed] = doc
            else:
                do_claims = False
                doc = new_docs[uri_normed]

        #if type(doc.created) == str:
            #embed()

        yield id, doc

        if uri_normalize(uri) not in h_existing_unnormed:
            # NOTE allowing only the normalized uri can cause confusion (i.e.
            # see checks in sync_annos)
            h_existing_unnormed[uri_normalize(uri)] = doc
            # TODO do these get added automatically if their doc gets added but exists?
            doc_uri = models.DocumentURI(document=doc,
                                         claimant=uri,
                                         uri=uri,
                                         type='self-claim',
                                         created=created,
                                         updated=updated)
            yield None, doc_uri

        # because of how this schema is designed
        # the only way that this can be fast is
        # if we assume that all claims are identical
        # FIXME if there is a new claim type then we are toast though :/
        # the modelling here assumes that title etc can't change
        #print(id, uri, uri_normed, row['user'], row['uri'], row['created'])
        if do_claims:
            for claim in claims:
                #print(id, uri, uri_normed, claim['claimant'], claim['type'], claim['value'])
                dm = models.DocumentMeta(document=doc,
                                         created=created,
                                         updated=updated,
                                         **claim)
                yield None, dm
def create_or_update_document_meta(
    session, claimant, type, value, document, created, updated
):
    """Upsert a DocumentMeta identified by (normalized claimant, type).

    If a DocumentMeta with the same normalized claimant and the same type
    already exists, its value and updated time are overwritten (its value,
    document and timestamps need not have matched the given ones; a
    differing document is only logged). Otherwise a new DocumentMeta built
    from all the given parameters is added to *session*.

    As a side effect, a "title" claim fills in ``document.title`` when the
    document has no title yet.

    :param session: the database session
    :param claimant: claimant for a newly created DocumentMeta
    :param type: type of the new or existing DocumentMeta
    :param value: value (list of unicode strings) to store
    :param document: document for a newly created DocumentMeta
    :param created: created time for a newly created DocumentMeta
    :param updated: updated time for the new or existing DocumentMeta
    :raises ConcurrentUpdateError: if the flush fails with an integrity
        error caused by a concurrent update
    """
    meta_record = (
        session.query(DocumentMeta)
        .filter(
            DocumentMeta.claimant_normalized == uri_normalize(claimant),
            DocumentMeta.type == type,
        )
        .one_or_none()
    )

    if meta_record is not None:
        meta_record.value = value
        meta_record.updated = updated
        if not meta_record.document == document:
            log.warning(
                "Found DocumentMeta (id: %d)'s document_id (%d) doesn't "
                "match given Document's id (%d)",
                meta_record.id,
                meta_record.document_id,
                document.id,
            )
    else:
        session.add(
            DocumentMeta(
                claimant=claimant,
                type=type,
                value=value,
                document=document,
                created=created,
                updated=updated,
            )
        )

    if type == "title" and value and not document.title:
        document.title = value[0]

    try:
        session.flush()
    except sa.exc.IntegrityError:
        raise ConcurrentUpdateError("concurrent document meta updates")
def create_or_update_document_uri(
    session, claimant, uri, type, content_type, document, created, updated
):
    """Create a DocumentURI, or refresh the equivalent existing one.

    "Equivalent" means the same normalized claimant, normalized uri, type
    and content_type; the owning Document may differ. An existing match
    only has its updated time refreshed (a document mismatch is logged but
    left alone); otherwise a new DocumentURI is built from all the given
    parameters and added to *session*.

    :param session: the database session
    :param claimant: the .claimant property of the DocumentURI
    :param uri: the .uri property of the DocumentURI
    :param type: the .type property of the DocumentURI
    :param content_type: the .content_type property of the DocumentURI
    :param document: document for a newly created DocumentURI
    :param created: created time for a newly created DocumentURI
    :param updated: updated time for the new or existing DocumentURI
    :raises ConcurrentUpdateError: if the flush hits an integrity error
        from a concurrent update
    """
    claimant_norm = uri_normalize(claimant)
    uri_norm = uri_normalize(uri)
    match = (
        session.query(DocumentURI)
        .filter(DocumentURI.claimant_normalized == claimant_norm)
        .filter(DocumentURI.uri_normalized == uri_norm)
        .filter(DocumentURI.type == type)
        .filter(DocumentURI.content_type == content_type)
        .first()
    )

    if match is not None:
        if not match.document == document:
            log.warning(
                "Found DocumentURI (id: %d)'s document_id (%d) doesn't match "
                "given Document's id (%d)",
                match.id,
                match.document_id,
                document.id,
            )
        match.updated = updated
    else:
        match = DocumentURI(
            claimant=claimant,
            uri=uri,
            type=type,
            content_type=content_type,
            document=document,
            created=created,
            updated=updated,
        )
        session.add(match)

    try:
        session.flush()
    except sa.exc.IntegrityError:
        raise ConcurrentUpdateError("concurrent document uri updates")
def claimant(self, value):
    """Set the claimant, keeping its normalized form in sync."""
    normalized = uri_normalize(value)
    self._claimant = value
    self._claimant_normalized = normalized
def uri(self, value):
    """Set the uri and store its normalized counterpart alongside it."""
    self._uri = value
    self._uri_normalized = uri_normalize(self._uri)
def create_or_update_document_meta(session, claimant, type, value, document,
                                   created, updated):
    """Create a DocumentMeta, or update the equivalent existing one.

    An existing DocumentMeta is "equivalent" when it has the given claimant
    (after normalization) and type; its value, document and timestamps need
    not match. An equivalent record gets the new value and updated time (a
    mismatched document is logged but not changed); otherwise a new record
    is created from all the given parameters and added to *session*. When
    the claim type is "title" and the document lacks a title, the first
    value is also copied onto the document.

    :param session: the database session
    :param claimant: claimant to use if a new DocumentMeta is created
    :param type: type of the new or existing DocumentMeta
    :param value: value (list of unicode strings) to set
    :param document: document to use if a new DocumentMeta is created
    :param created: created time to use if a new DocumentMeta is created
    :param updated: updated time for the new or existing DocumentMeta
    :raises ConcurrentUpdateError: if a concurrent update makes the flush
        violate a database constraint
    """
    query = session.query(DocumentMeta)
    query = query.filter(
        DocumentMeta.claimant_normalized == uri_normalize(claimant),
        DocumentMeta.type == type,
    )
    existing_dm = query.one_or_none()

    if existing_dm is None:
        new_dm = DocumentMeta(
            claimant=claimant,
            type=type,
            value=value,
            document=document,
            created=created,
            updated=updated,
        )
        session.add(new_dm)
    else:
        existing_dm.value = value
        existing_dm.updated = updated
        if not existing_dm.document == document:
            log.warning(
                "Found DocumentMeta (id: %d)'s document_id (%d) doesn't "
                "match given Document's id (%d)",
                existing_dm.id,
                existing_dm.document_id,
                document.id,
            )

    if type == "title" and value and not document.title:
        document.title = value[0]

    try:
        session.flush()
    except sa.exc.IntegrityError:
        raise ConcurrentUpdateError("concurrent document meta updates")
def create_or_update_document_uri(session, claimant, uri, type, content_type,
                                  document, created, updated):
    """Ensure a DocumentURI row equivalent to the given parameters exists.

    Equivalence is judged on the normalized claimant, normalized uri, type
    and content_type; the owning Document may differ. An existing
    equivalent row only has its updated time refreshed (a differing
    document is logged, not changed); otherwise a new DocumentURI built
    from all the parameters is added to *session*.

    :param session: the database session
    :param claimant: the .claimant property of the DocumentURI
    :param uri: the .uri property of the DocumentURI
    :param type: the .type property of the DocumentURI
    :param content_type: the .content_type property of the DocumentURI
    :param document: document to use if a new DocumentURI is created
    :param created: created time to use if a new DocumentURI is created
    :param updated: updated time for the new or existing DocumentURI
    :raises ConcurrentUpdateError: if flushing hits an integrity error
        from a concurrent update
    """
    criteria = [
        DocumentURI.claimant_normalized == uri_normalize(claimant),
        DocumentURI.uri_normalized == uri_normalize(uri),
        DocumentURI.type == type,
        DocumentURI.content_type == content_type,
    ]
    docuri = session.query(DocumentURI).filter(*criteria).first()

    if docuri is None:
        docuri = DocumentURI(
            claimant=claimant,
            uri=uri,
            type=type,
            content_type=content_type,
            document=document,
            created=created,
            updated=updated,
        )
        session.add(docuri)
    elif not docuri.document == document:
        log.warning(
            "Found DocumentURI (id: %d)'s document_id (%d) doesn't match "
            "given Document's id (%d)",
            docuri.id,
            docuri.document_id,
            document.id,
        )

    docuri.updated = updated

    try:
        session.flush()
    except sa.exc.IntegrityError:
        raise ConcurrentUpdateError("concurrent document uri updates")