def first_http_url(type_=None):
    """
    Return the first http(s) URL found in ``self.document_uris``.

    ``self`` is not a parameter; it is looked up in the enclosing scope.

    :param type_: if not None, only entries whose ``.type`` equals this
        value are considered

    :returns: the ``.uri`` of the first matching entry, or None if no entry
        matches
    """
    wanted_schemes = ['http', 'https']
    for candidate in self.document_uris:
        url = candidate.uri
        type_matches = type_ is None or candidate.type == type_
        # Only parse the URL when the entry already passed the type filter.
        if type_matches and urlparse.urlparse(url).scheme in wanted_schemes:
            return url
    return None
Example #2
0
        def first_http_url(type_=None):
            """
            Find the first http(s) URL of the requested type on this document.

            The document's URIs are read from ``self.document_uris``, where
            ``self`` comes from the enclosing scope.

            :param type_: restrict the search to entries whose ``.type``
                equals this value; None (the default) accepts every type

            :returns: the first matching http(s) URL, or None when the
                document has no matching URL

            """
            http_schemes = ['http', 'https']
            matching_urls = (
                docuri.uri
                for docuri in self.document_uris
                if (type_ is None or docuri.type == type_)
                and urlparse.urlparse(docuri.uri).scheme in http_schemes
            )
            # The generator is lazy, so iteration stops at the first match.
            return next(matching_urls, None)
Example #3
0
def create_or_update_document_uri(session, claimant, uri, type, content_type,
                                  document, created, updated):
    """
    Create or update a DocumentURI with the given parameters.

    If an equivalent DocumentURI already exists in the database then its
    updated time will be updated.

    If no equivalent DocumentURI exists in the database then a new one will be
    created and added to the database.

    To be considered "equivalent" an existing DocumentURI must have the same
    claimant, uri, type and content_type, but the Document object that it
    belongs to may be different. The claimant and uri are normalized before
    comparing. When the existing DocumentURI belongs to a different Document
    a warning is logged and the DocumentURI is left attached to its current
    Document.

    As a side effect, if ``document.web_uri`` is falsy and ``uri`` has an
    http or https scheme, ``document.web_uri`` is set to ``uri``.

    :param session: the database session
    :type session: sqlalchemy.orm.session.Session

    :param claimant: the .claimant property of the DocumentURI
    :type claimant: unicode

    :param uri: the .uri property of the DocumentURI
    :type uri: unicode

    :param type: the .type property of the DocumentURI
    :type type: unicode

    :param content_type: the .content_type property of the DocumentURI
    :type content_type: unicode

    :param document: the Document that the new DocumentURI will belong to, if a
        new DocumentURI is created
    :type document: memex.models.Document

    :param created: the time that will be used as the .created time for the new
        DocumentURI, if a new one is created
    :type created: datetime.datetime

    :param updated: the time that will be set as the .updated time for the new
        or existing DocumentURI
    :type updated: datetime.datetime

    :raises ConcurrentUpdateError: if flushing the session raises
        ``sa.exc.IntegrityError``

    """
    docuri = session.query(DocumentURI).filter(
        DocumentURI.claimant_normalized == uri_normalize(claimant),
        DocumentURI.uri_normalized == uri_normalize(uri),
        DocumentURI.type == type,
        DocumentURI.content_type == content_type).first()

    if docuri is None:
        docuri = DocumentURI(claimant=claimant,
                             uri=uri,
                             type=type,
                             content_type=content_type,
                             document=document,
                             created=created,
                             updated=updated)
        session.add(docuri)
    elif not docuri.document == document:
        # ``warning`` is the supported logger method; ``warn`` is a
        # deprecated alias that newer Python versions no longer provide.
        log.warning(
            "Found DocumentURI (id: %d)'s document_id (%d) doesn't match "
            "given Document's id (%d)", docuri.id, docuri.document_id,
            document.id)

    # Applied to both newly created and pre-existing rows.
    docuri.updated = updated

    # Only fill in web_uri when the document doesn't have one yet.
    if not document.web_uri:
        parsed = urlparse.urlparse(uri)
        if parsed.scheme in ['http', 'https']:
            document.web_uri = uri

    # Flush now so that a constraint violation surfaces here and can be
    # reported as a concurrent update.
    try:
        session.flush()
    except sa.exc.IntegrityError:
        raise ConcurrentUpdateError('concurrent document uri updates')
Example #4
0
File: document.py Project: nlisgo/h
def create_or_update_document_uri(session,
                                  claimant,
                                  uri,
                                  type,
                                  content_type,
                                  document,
                                  created,
                                  updated):
    """
    Create or update a DocumentURI with the given parameters.

    If an equivalent DocumentURI already exists in the database then its
    updated time will be updated.

    If no equivalent DocumentURI exists in the database then a new one will be
    created and added to the database.

    To be considered "equivalent" an existing DocumentURI must have the same
    claimant, uri, type and content_type, but the Document object that it
    belongs to may be different. The claimant and uri are normalized before
    comparing. If the equivalent DocumentURI belongs to another Document, a
    warning is logged and its document is not changed.

    This function also sets ``document.web_uri`` to ``uri`` when the document
    has no (truthy) ``web_uri`` yet and ``uri`` uses the http or https scheme.

    :param session: the database session
    :type session: sqlalchemy.orm.session.Session

    :param claimant: the .claimant property of the DocumentURI
    :type claimant: unicode

    :param uri: the .uri property of the DocumentURI
    :type uri: unicode

    :param type: the .type property of the DocumentURI
    :type type: unicode

    :param content_type: the .content_type property of the DocumentURI
    :type content_type: unicode

    :param document: the Document that the new DocumentURI will belong to, if a
        new DocumentURI is created
    :type document: memex.models.Document

    :param created: the time that will be used as the .created time for the new
        DocumentURI, if a new one is created
    :type created: datetime.datetime

    :param updated: the time that will be set as the .updated time for the new
        or existing DocumentURI
    :type updated: datetime.datetime

    :raises ConcurrentUpdateError: when ``session.flush()`` raises
        ``sa.exc.IntegrityError``

    """
    docuri = session.query(DocumentURI).filter(
        DocumentURI.claimant_normalized == uri_normalize(claimant),
        DocumentURI.uri_normalized == uri_normalize(uri),
        DocumentURI.type == type,
        DocumentURI.content_type == content_type).first()

    if docuri is None:
        docuri = DocumentURI(claimant=claimant,
                             uri=uri,
                             type=type,
                             content_type=content_type,
                             document=document,
                             created=created,
                             updated=updated)
        session.add(docuri)
    elif not docuri.document == document:
        # Use ``warning``: ``warn`` is a deprecated alias of it.
        log.warning("Found DocumentURI (id: %d)'s document_id (%d) doesn't "
                    "match given Document's id (%d)",
                    docuri.id, docuri.document_id, document.id)

    # The updated time is refreshed for new and existing rows alike.
    docuri.updated = updated

    # An existing web_uri is never overwritten.
    if not document.web_uri:
        parsed = urlparse.urlparse(uri)
        if parsed.scheme in ['http', 'https']:
            document.web_uri = uri

    # Flushing here turns an integrity violation into a
    # ConcurrentUpdateError for the caller.
    try:
        session.flush()
    except sa.exc.IntegrityError:
        raise ConcurrentUpdateError('concurrent document uri updates')
def _document_web_uri(document):
    for docuri in document.document_uris:
        uri = urlparse.urlparse(docuri.uri)
        if uri.scheme in ['http', 'https']:
            return docuri.uri
def _document_web_uri(document):
    for docuri in document.document_uris:
        uri = urlparse.urlparse(docuri.uri)
        if uri.scheme in ['http', 'https']:
            return docuri.uri