Exemple #1
0
def _normalize_document_meta_window(session, window):
    """Re-normalize the claimant of each DocumentMeta row updated in `window`.

    Rows are visited in ascending ``updated`` order. A row is deleted when a
    different row of the same type already stores the same normalized
    claimant; otherwise its ``_claimant_normalized`` is overwritten with the
    freshly normalized claimant. The session is flushed after every row.

    :param session: database session used to query, delete and flush
    :param window: object exposing the ``start`` and ``end`` bounds that are
        applied to ``DocumentMeta.updated``
    """
    rows = (
        session.query(models.DocumentMeta)
        .filter(models.DocumentMeta.updated.between(window.start, window.end))
        .order_by(models.DocumentMeta.updated.asc())
    )

    for docmeta in rows:
        # Normalize once per row; the value feeds both the duplicate lookup
        # and the write-back below.
        claimant_normalized = uri.normalize(docmeta.claimant)

        duplicates = session.query(models.DocumentMeta).filter(
            models.DocumentMeta.id != docmeta.id,
            models.DocumentMeta.claimant_normalized == claimant_normalized,
            models.DocumentMeta.type == docmeta.type,
        )

        if duplicates.count() > 0:
            session.delete(docmeta)
        else:
            docmeta._claimant_normalized = (  # pylint: disable=protected-access
                claimant_normalized
            )

        # NOTE(review): flushing per row presumably lets the duplicate lookup
        # of later rows see this row's delete/update - confirm.
        session.flush()
def _normalize_document_uris_window(session, window):
    """Re-normalize DocumentURI rows whose ``updated`` lies inside `window`.

    For each row, in ascending ``updated`` order:

    * if more than one document is found for the row's URI, those documents
      are handed to ``merge_documents``;
    * if another row with the same document, normalized claimant, normalized
      URI, type and content type exists, this row is deleted;
    * otherwise ``_claimant_normalized`` and ``_uri_normalized`` are rewritten
      with freshly normalized values.

    The session is flushed after every row.

    :param session: database session used to query, delete and flush
    :param window: object exposing ``start`` and ``end`` bounds
    """
    rows = (
        session.query(models.DocumentURI)
        .filter(models.DocumentURI.updated.between(window.start, window.end))
        .order_by(models.DocumentURI.updated.asc())
    )

    for docuri in rows:
        documents = models.Document.find_by_uris(session, [docuri.uri])
        if documents.count() > 1:
            merge_documents(session, documents)

        # Normalize once (after any merge) and reuse the values for both the
        # duplicate lookup and the write-back.
        claimant_normalized = uri.normalize(docuri.claimant)
        uri_normalized = uri.normalize(docuri.uri)

        duplicates = session.query(models.DocumentURI).filter(
            models.DocumentURI.id != docuri.id,
            models.DocumentURI.document_id == docuri.document_id,
            models.DocumentURI.claimant_normalized == claimant_normalized,
            models.DocumentURI.uri_normalized == uri_normalized,
            models.DocumentURI.type == docuri.type,
            models.DocumentURI.content_type == docuri.content_type,
        )

        if duplicates.count() > 0:
            session.delete(docuri)
        else:
            docuri._claimant_normalized = claimant_normalized
            docuri._uri_normalized = uri_normalized

        session.flush()
Exemple #3
0
def _normalize_document_uris_window(session, window):
    """Bring the normalized columns of DocumentURI rows in `window` up to date.

    Every DocumentURI whose ``updated`` value lies between ``window.start``
    and ``window.end`` is processed in ascending ``updated`` order. Documents
    found for the row's URI are merged when there is more than one. The row
    is then either deleted (an equivalent row already exists for the same
    document, normalized claimant, normalized URI, type and content type) or
    has its ``_claimant_normalized`` / ``_uri_normalized`` values rewritten.
    The session is flushed once per row.

    :param session: database session used to query, delete and flush
    :param window: object exposing ``start`` and ``end`` bounds
    """
    updated = models.DocumentURI.updated
    in_window = (
        session.query(models.DocumentURI)
        .filter(updated.between(window.start, window.end))
        .order_by(updated.asc())
    )

    for docuri in in_window:
        matching_docs = models.Document.find_by_uris(session, [docuri.uri])
        if matching_docs.count() > 1:
            merge_documents(session, matching_docs)

        # Each value is computed a single time per row and shared between the
        # duplicate check and the assignment further down.
        normalized_claimant = uri.normalize(docuri.claimant)
        normalized_uri = uri.normalize(docuri.uri)

        equivalents = session.query(models.DocumentURI).filter(
            models.DocumentURI.id != docuri.id,
            models.DocumentURI.document_id == docuri.document_id,
            models.DocumentURI.claimant_normalized == normalized_claimant,
            models.DocumentURI.uri_normalized == normalized_uri,
            models.DocumentURI.type == docuri.type,
            models.DocumentURI.content_type == docuri.content_type,
        )

        if equivalents.count() > 0:
            session.delete(docuri)
        else:
            docuri._claimant_normalized = normalized_claimant
            docuri._uri_normalized = normalized_uri

        session.flush()
Exemple #4
0
def _normalize_document_meta_window(session, window):
    """Refresh ``_claimant_normalized`` for DocumentMeta rows in `window`.

    Rows with ``updated`` between ``window.start`` and ``window.end`` are
    handled oldest first. If a different row of the same type already holds
    the same normalized claimant, the current row is deleted; otherwise its
    ``_claimant_normalized`` is set to the newly normalized claimant. The
    session is flushed after each row.

    :param session: database session used to query, delete and flush
    :param window: object exposing ``start`` and ``end`` bounds
    """
    in_window = (
        session.query(models.DocumentMeta)
        .filter(models.DocumentMeta.updated.between(window.start, window.end))
        .order_by(models.DocumentMeta.updated.asc())
    )

    for docmeta in in_window:
        # Computed once per row and used for both the lookup and the update.
        normalized_claimant = uri.normalize(docmeta.claimant)

        others = session.query(models.DocumentMeta).filter(
            models.DocumentMeta.id != docmeta.id,
            models.DocumentMeta.claimant_normalized == normalized_claimant,
            models.DocumentMeta.type == docmeta.type,
        )

        if others.count() > 0:
            session.delete(docmeta)
        else:
            docmeta._claimant_normalized = normalized_claimant

        session.flush()
Exemple #5
0
def _fetch_document_uri_canonical_self_claim(session, uri_):
    """Return the self-claim / rel-canonical DocumentURI rows for `uri_`.

    Rows are matched on the normalized form of `uri_` and limited to the
    types ``self-claim`` and ``rel-canonical``. The result is a list.
    """
    normalized = uri.normalize(uri_)
    wanted_types = [u"self-claim", u"rel-canonical"]

    matches = session.query(models.DocumentURI).filter(
        models.DocumentURI.uri_normalized == normalized,
        models.DocumentURI.type.in_(wanted_types),
    )
    return matches.all()
Exemple #6
0
def _has_uri_ever_been_annotated(db, uri):
    """Return `True` if `document_uri` holds a row for the normalized `uri`."""

    # Raw SQL is deliberate: it guarantees an efficient query and keeps
    # SQLAlchemy overhead low. The lookup goes through
    # `document_uri.uri_normalized` rather than
    # `annotation.target_uri_normalized` because `uri_normalized` already has
    # an index.
    sql = 'SELECT EXISTS(SELECT 1 FROM document_uri WHERE uri_normalized = :uri)'
    bind_params = {'uri': normalize(uri)}
    row = db.execute(sql, bind_params).first()
    return row[0] is True
Exemple #7
0
def _has_uri_ever_been_annotated(db, uri):
    """Tell whether the normalized `uri` is present in `document_uri`."""

    # Plain SQL keeps this check cheap and avoids SQLAlchemy overhead.
    # `document_uri.uri_normalized` is queried (not
    # `annotation.target_uri_normalized`) since an index exists on
    # `uri_normalized`.
    exists_flag = db.execute(
        "SELECT EXISTS(SELECT 1 FROM document_uri WHERE uri_normalized = :uri)",
        {"uri": normalize(uri)},
    ).first()[0]
    return exists_flag is True
Exemple #8
0
Fichier : query.py Projet : kael/h
    def __call__(self, search, params):
        """Restrict `search` to the URIs given by the `uri`/`url` params.

        When neither key is present, `search` is returned untouched. Otherwise
        all `uri` and `url` values are removed from `params` via `popall`,
        each is expanded with `storage.expand_uri`, the expansions are
        normalized, and a `terms` filter on `target.scope` is applied.
        """
        if not ('uri' in params or 'url' in params):
            return search

        requested = popall(params, 'uri') + popall(params, 'url')

        scopes = set()
        for requested_uri in requested:
            equivalents = storage.expand_uri(self.request.db, requested_uri)
            scopes.update([uri.normalize(candidate) for candidate in equivalents])

        return search.filter('terms', **{'target.scope': list(scopes)})
def _normalize_annotations_window(session, window):
    """Re-normalize target URIs of annotations updated inside `window`.

    Annotations are read in ascending ``updated`` order. Whenever the freshly
    normalized target URI differs from the stored one, the stored value is
    replaced. Returns the set of ids of the annotations that were changed.
    """
    updated = models.Annotation.updated
    annotations = (
        session.query(models.Annotation)
        .filter(updated.between(window.start, window.end))
        .order_by(updated.asc())
    )

    changed_ids = set()
    for annotation in annotations:
        fresh = uri.normalize(annotation.target_uri)
        if fresh != annotation.target_uri_normalized:
            annotation._target_uri_normalized = fresh
            changed_ids.add(annotation.id)

    return changed_ids
Exemple #10
0
def _normalize_annotations_window(session, window):
    """Update stale ``_target_uri_normalized`` values for one time window.

    Considers annotations whose ``updated`` lies between ``window.start`` and
    ``window.end`` (oldest first) and returns the ids of those whose stored
    normalized target URI had to be rewritten.
    """
    in_window = models.Annotation.updated.between(window.start, window.end)
    oldest_first = models.Annotation.updated.asc()
    candidates = session.query(models.Annotation).filter(in_window).order_by(
        oldest_first
    )

    touched = set()
    for candidate in candidates:
        expected = uri.normalize(candidate.target_uri)
        if expected != candidate.target_uri_normalized:
            # Stored value is out of date: overwrite it and remember the id.
            candidate._target_uri_normalized = expected
            touched.add(candidate.id)

    return touched
Exemple #11
0
    def __call__(self, params):
        """Build a `terms` clause on `target.scope` from `uri`/`url` params.

        Returns `None` when neither key is present. Otherwise the values of
        both keys are collected, the keys are deleted from `params`, and every
        value is expanded via `storage.expand_uri` and normalized.
        """
        uri_keys = ['uri', 'url']
        if not any(key in params for key in uri_keys):
            return None

        requested = [value for key, value in params.items() if key in uri_keys]
        for key in uri_keys:
            if key in params:
                del params[key]

        scopes = set()
        for requested_uri in requested:
            equivalents = storage.expand_uri(self.request.db, requested_uri)
            scopes.update([uri.normalize(item) for item in equivalents])

        return {"terms": {"target.scope": list(scopes)}}
Exemple #12
0
Fichier : query.py Projet : gnott/h
    def __call__(self, params):
        """Translate `uri`/`url` params into a `target.scope` terms clause.

        If `params` has neither key, `None` is returned. Both keys are removed
        from `params` once their values have been read. Each value is expanded
        with `storage.expand_uri` and the normalized expansions are collected
        without duplicates.
        """
        has_uri = 'uri' in params
        has_url = 'url' in params
        if not has_uri and not has_url:
            return None

        values = [v for k, v in params.items() if k in ['uri', 'url']]
        if has_uri:
            del params['uri']
        if has_url:
            del params['url']

        normalized_uris = set()
        for value in values:
            for expanded_uri in storage.expand_uri(self.request.db, value):
                normalized_uris.add(uri.normalize(expanded_uri))

        return {"terms": {"target.scope": list(normalized_uris)}}
Exemple #13
0
def _normalize_annotations_window(session, window):
    """Rewrite outdated normalized target URIs for annotations in `window`.

    Iterates, oldest first, over annotations whose ``updated`` is between
    ``window.start`` and ``window.end``. Returns the set of annotation ids
    whose ``_target_uri_normalized`` was replaced.
    """
    rows = session.query(models.Annotation)
    rows = rows.filter(models.Annotation.updated.between(window.start, window.end))
    rows = rows.order_by(models.Annotation.updated.asc())

    rewritten = set()
    for row in rows:
        current = uri.normalize(row.target_uri)
        if current != row.target_uri_normalized:
            row._target_uri_normalized = current  # pylint: disable=protected-access
            rewritten.add(row.id)

    return rewritten
Exemple #14
0
    def _wildcard_uri_normalized(self, wildcard_uri):
        """
        Normalize a wildcard URI, then replace every `_` with `?`.

        Elasticsearch's single-character wildcard is `?`, but `?` is a
        reserved URL character, so callers write `_` instead; that sidesteps
        normalization headaches.

        Escaping wildcard characters with a backslash is not currently
        supported: uri.normalize converts the backslash to its encoded URL
        form, which does not behave the same in elasticsearch.
        """
        # A trailing `*` is set aside before normalizing so that something
        # like http://example.com/* becomes http://example.com* and also
        # matches the base url.
        has_trailing_star = wildcard_uri.endswith("*")
        base = wildcard_uri[:-1] if has_trailing_star else wildcard_uri
        suffix = "*" if has_trailing_star else ""
        return (uri.normalize(base) + suffix).replace("_", "?")
Exemple #15
0
    def _wildcard_uri_normalized(self, wildcard_uri):
        """
        Return the normalized form of `wildcard_uri` with `_` turned into `?`.

        `_` stands in for elasticsearch's `?` wildcard because `?` is a
        reserved URL character; using `_` avoids normalization headaches.

        Wildcards cannot currently be escaped: a backslash would be converted
        by uri.normalize to its encoded URL form, which elasticsearch treats
        differently.
        """
        if not wildcard_uri.endswith("*"):
            return uri.normalize(wildcard_uri).replace("_", "?")

        # Normalize without the trailing `*` and re-append it afterwards, so
        # http://example.com/* ends up as http://example.com* and the base
        # url is matched too.
        normalized = uri.normalize(wildcard_uri[:-1]) + "*"
        return normalized.replace("_", "?")
Exemple #16
0
Fichier : query.py Projet : kael/h
    def _wildcard_uri_normalized(self, wildcard_uri):
        """
        Normalize `wildcard_uri` while keeping a trailing `?` or `*`.

        uri.normalize strips a `?` from the end of a uri, and a uri such as
        http://foo.com/* only normalizes to http://foo.com* if the `*` is
        removed first. A trailing wildcard is therefore detached before
        normalization and appended again afterwards.

        Escaping `?` and `*` with a backslash is not currently supported,
        because uri.normalize converts the backslash to its encoded URL form,
        which does not behave the same in elasticsearch.
        """
        if wildcard_uri.endswith(("?", "*")):
            suffix = wildcard_uri[-1]
            base = wildcard_uri[:-1]
        else:
            suffix = ""
            base = wildcard_uri
        return uri.normalize(base) + suffix
Exemple #17
0
 def test_it_strips_fragments(self, url_in, url_out):
     """Check that `uri.normalize(url_in)` equals `url_out`."""
     normalized = uri.normalize(url_in)
     assert normalized == url_out
Exemple #18
0
 def test_it_leaves_invalid_urls_alone(self, url_in, url_out):
     """Normalizing `url_in` is expected to give `url_out`."""
     result = uri.normalize(url_in)
     assert result == url_out
Exemple #19
0
 def test_it_normalises_url_casing(self, url_in, url_out):
     """Assert that `url_in` normalizes to `url_out`."""
     actual = uri.normalize(url_in)
     assert actual == url_out
Exemple #20
0
 def test_it_black_lists_invalid_params(self, url_in, url_out):
     """Verify the normalized form of `url_in` matches `url_out`."""
     normalized_url = uri.normalize(url_in)
     assert normalized_url == url_out
Exemple #21
0
 def test_it_strips_via_urls(self, url_in, url_out):
     """`uri.normalize` applied to `url_in` must return `url_out`."""
     produced = uri.normalize(url_in)
     assert produced == url_out
Exemple #22
0
 def test_it_sorts_params(self, url_in, url_out):
     """Compare `uri.normalize(url_in)` against the expected `url_out`."""
     got = uri.normalize(url_in)
     assert got == url_out
Exemple #23
0
def _fetch_document_uri_claimants(session, uri_):
    """Return all DocumentURI rows whose normalized claimant matches `uri_`."""
    normalized = uri.normalize(uri_)
    by_claimant = models.DocumentURI.claimant_normalized == normalized
    return session.query(models.DocumentURI).filter(by_claimant).all()
Exemple #24
0
def _fetch_document_uri_claimants(session, uri_):
    """List the DocumentURI rows claimed by the normalized form of `uri_`."""
    claimant = uri.normalize(uri_)
    rows = session.query(models.DocumentURI).filter(
        models.DocumentURI.claimant_normalized == claimant
    )
    return rows.all()
Exemple #25
0
def _fetch_annotations(session, uri_):
    """Return all annotations whose normalized target URI matches `uri_`."""
    normalized = uri.normalize(uri_)
    by_target = models.Annotation.target_uri_normalized == normalized
    return session.query(models.Annotation).filter(by_target).all()
Exemple #26
0
 def test_it_removes_ports(self, url_in, url_out):
     """Expect `uri.normalize(url_in)` to be equal to `url_out`."""
     outcome = uri.normalize(url_in)
     assert outcome == url_out
Exemple #27
0
def _fetch_annotations(session, uri_):
    """List the annotations targeting the normalized form of `uri_`."""
    target = uri.normalize(uri_)
    matches = session.query(models.Annotation).filter(
        models.Annotation.target_uri_normalized == target
    )
    return matches.all()
Exemple #28
0
def test_normalize_returns_unicode(url, _):
    """Check that `uri.normalize(url)` yields a `str` instance."""
    normalized = uri.normalize(url)
    assert isinstance(normalized, str)
Exemple #29
0
def _fetch_document_uri_canonical_self_claim(session, uri_):
    """Fetch DocumentURI rows for `uri_` typed self-claim or rel-canonical.

    Matching is done on the normalized form of `uri_`; a list is returned.
    """
    same_uri = models.DocumentURI.uri_normalized == uri.normalize(uri_)
    claim_type = models.DocumentURI.type.in_([u'self-claim', u'rel-canonical'])
    return session.query(models.DocumentURI).filter(same_uri, claim_type).all()
Exemple #30
0
def test_normalize_returns_unicode(url, _):
    """Check that `uri.normalize(url)` yields a `text_type` instance."""
    result = uri.normalize(url)
    assert isinstance(result, text_type)
Exemple #31
0
 def test_it_removes_trailing_slashes(self, url_in, url_out):
     """The normalized `url_in` has to equal `url_out`."""
     cleaned = uri.normalize(url_in)
     assert cleaned == url_out
Exemple #32
0
 def target_uri(self, value):
     """Store `value` as the target URI along with its normalized form."""
     # The raw value is assigned first, before normalization is attempted.
     self._target_uri = value
     normalized_value = uri.normalize(value)
     self._target_uri_normalized = normalized_value
Exemple #33
0
 def test_it_translates_scheme_correctly(self, url_in, url_out):
     """Assert `uri.normalize` turns `url_in` into `url_out`."""
     translated = uri.normalize(url_in)
     assert translated == url_out
Exemple #34
0
 def test_it_handles_invalid_params(self, url_in, url_out):
     """Normalization of `url_in` should result in `url_out`."""
     handled = uri.normalize(url_in)
     assert handled == url_out
Exemple #35
0
def test_normalize(url_in, url_out):
    """Check that `uri.normalize(url_in)` equals `url_out`."""
    normalized = uri.normalize(url_in)
    assert normalized == url_out
Exemple #36
0
 def test_it_decodes_params_correctly(self, url_in, url_out):
     """Verify that normalizing `url_in` yields `url_out`."""
     decoded = uri.normalize(url_in)
     assert decoded == url_out
Exemple #37
0
 def target_uri(self, value):
     """Set the target URI to `value` and refresh its normalized copy."""
     # Keep the original order: raw value first, normalized value second.
     self._target_uri = value
     normalized = uri.normalize(value)
     self._target_uri_normalized = normalized
Exemple #38
0
def test_normalize(url_in, url_out):
    """Assert that `url_in` normalizes to the expected `url_out`."""
    actual = uri.normalize(url_in)
    assert actual == url_out