Exemple #1
0
    def find_by_uris(cls, session, uris):
        """
        Return a query for the documents claimed by any of the given uris.

        Each uri is passed through ``uri_normalize`` and matched against
        ``DocumentURI.uri_normalized``. The matching DocumentURI rows are made
        distinct on ``document_id`` and used as a subquery that the Document
        query is joined to.

        :param session: the database session
        :param uris: an iterable of uris to look up

        :returns: the (not yet executed) Document query
        """
        normalized_uris = [uri_normalize(uri) for uri in uris]

        # Build the claims query one step at a time instead of as one chain.
        claims = session.query(DocumentURI)
        claims = claims.filter(
            DocumentURI.uri_normalized.in_(normalized_uris))
        claims = claims.distinct(DocumentURI.document_id)

        return session.query(Document).join(claims.subquery())
Exemple #2
0
    def find_by_uris(cls, session, uris):
        """
        Find documents by a list of uris.

        The uris are normalized with ``uri_normalize`` before being compared
        with ``DocumentURI.uri_normalized``. The result is a Document query
        joined against a subquery of the matching DocumentURI rows, distinct
        on ``document_id``.

        :param session: the database session
        :param uris: an iterable of uris to look up

        :returns: the (not yet executed) Document query
        """
        wanted = list(map(uri_normalize, uris))
        uri_matches = DocumentURI.uri_normalized.in_(wanted)

        claims_subquery = (
            session.query(DocumentURI)
            .filter(uri_matches)
            .distinct(DocumentURI.document_id)
            .subquery()
        )

        documents = session.query(Document)
        return documents.join(claims_subquery)
def upgrade():
    """
    Fill in ``document_id`` on annotations that do not have one yet.

    The work is split into the windows returned by ``_fetch_windows``. For
    each window, the annotations whose ``updated`` time lies between
    ``window.start`` and ``window.end`` and whose ``document_id`` is NULL are
    visited in ascending ``updated`` order:

    * if ``ann.document_through_uri`` is None, a new Document and a
      ``'self-claim'`` DocumentURI for the annotation's ``target_uri`` are
      created and the annotation is attached to the new document;
    * otherwise the annotation's ``document_id`` is set to the id of
      ``ann.document_through_uri``.

    The session is committed once per window and the two counters are logged
    at debug level when all windows are done.
    """
    session = Session(bind=op.get_bind())

    windows = _fetch_windows(session)
    # NOTE(review): presumably this ends the transaction that was used while
    # fetching the windows, so each window runs in a fresh one -- confirm.
    session.rollback()

    new_documents = 0
    document_id_updated = 0

    for window in windows:
        query = session.query(Annotation) \
            .filter(Annotation.updated.between(window.start, window.end)) \
            .filter(Annotation.document_id.is_(None)) \
            .order_by(Annotation.updated.asc())

        for ann in query:
            # Skip annotations that already have a document id by the time
            # they are reached.
            if ann.document_id:
                continue

            existing_document = ann.document_through_uri

            if existing_document is None:
                uri = ann.target_uri
                uri_normalized = uri_normalize(uri)

                doc = Document(created=ann.created, updated=ann.updated)
                # Deliberately not bound to a name: the DocumentURI is linked
                # to ``doc`` through its ``document`` argument and is never
                # referenced again in this function.
                # NOTE(review): presumably that relationship is what gets it
                # persisted on flush -- confirm against the model.
                DocumentURI(created=ann.created,
                            updated=ann.updated,
                            claimant=uri,
                            claimant_normalized=uri_normalized,
                            uri=uri,
                            uri_normalized=uri_normalized,
                            type='self-claim',
                            document=doc)
                ann.document = doc
                # Flush right away so the new rows are written before the
                # next annotation is processed.
                session.flush()
                new_documents += 1
            else:
                ann.document_id = existing_document.id
                document_id_updated += 1

        session.commit()

    # Pass the counters as logging arguments so the message is only formatted
    # when debug logging is enabled.
    log.debug('Created %d new documents', new_documents)
    log.debug('Filled in %d existing document ids', document_id_updated)
Exemple #4
0
def create_or_update_document_meta(session, claimant, type, value, document,
                                   created, updated):
    """
    Create or update a DocumentMeta with the given parameters.

    If an equivalent DocumentMeta already exists in the database then its value
    and updated time will be updated.

    If no equivalent DocumentMeta exists in the database then a new one will be
    created and added to the database.

    To be considered "equivalent" an existing DocumentMeta must have the given
    claimant and type, but its value, document and created and updated times
    needn't match the given ones. The claimant is normalized before comparing.

    If the type is ``'title'``, the value is non-empty and the given document
    has no title yet, the document's title is set to the first item of value.

    :param session: the database session
    :type session: sqlalchemy.orm.session.Session

    :param claimant: the value to use for the DocumentMeta's claimant attribute
        if a new DocumentMeta is created
    :type claimant: unicode

    :param type: the value of the new or existing DocumentMeta's type attribute
    :type type: unicode

    :param value: the value to set the new or existing DocumentMeta's value
        attribute to
    :type value: list of unicode strings

    :param document: the value to use for the DocumentMeta's document if a new
        DocumentMeta is created
    :type document: memex.models.Document

    :param created: the value to use for the DocumentMeta's created attribute
        if a new DocumentMeta is created
    :type created: datetime.datetime

    :param updated: the value to set the new or existing DocumentMeta's updated
        attribute to
    :type updated: datetime.datetime

    :raises ConcurrentUpdateError: if flushing the session raises an
        IntegrityError

    """
    existing_dm = session.query(DocumentMeta).filter(
        DocumentMeta.claimant_normalized == uri_normalize(claimant),
        DocumentMeta.type == type).one_or_none()

    if existing_dm is None:
        session.add(
            DocumentMeta(
                claimant=claimant,
                type=type,
                value=value,
                document=document,
                created=created,
                updated=updated,
            ))
    else:
        existing_dm.value = value
        existing_dm.updated = updated
        # A mismatch is only reported; the existing row keeps its document.
        if not existing_dm.document == document:
            # Logger.warn is a deprecated alias of Logger.warning.
            log.warning(
                "Found DocumentMeta (id: %d)'s document_id (%d) doesn't "
                "match given Document's id (%d)", existing_dm.id,
                existing_dm.document_id, document.id)

    # Use the first title claim as the document title if it has none yet.
    if type == 'title' and value and not document.title:
        document.title = value[0]

    try:
        session.flush()
    except sa.exc.IntegrityError:
        raise ConcurrentUpdateError('concurrent document meta updates')
Exemple #5
0
def create_or_update_document_uri(session, claimant, uri, type, content_type,
                                  document, created, updated):
    """
    Create or update a DocumentURI with the given parameters.

    If an equivalent DocumentURI already exists in the database then its
    updated time will be updated.

    If no equivalent DocumentURI exists in the database then a new one will be
    created and added to the database.

    To be considered "equivalent" an existing DocumentURI must have the same
    claimant, uri, type and content_type, but the Document object that it
    belongs to may be different. The claimant and uri are normalized before
    comparing.

    If the given document has no web_uri yet and the uri's scheme is http or
    https, the document's web_uri is set to the uri.

    :param session: the database session
    :type session: sqlalchemy.orm.session.Session

    :param claimant: the .claimant property of the DocumentURI
    :type claimant: unicode

    :param uri: the .uri property of the DocumentURI
    :type uri: unicode

    :param type: the .type property of the DocumentURI
    :type type: unicode

    :param content_type: the .content_type property of the DocumentURI
    :type content_type: unicode

    :param document: the Document that the new DocumentURI will belong to, if a
        new DocumentURI is created
    :type document: memex.models.Document

    :param created: the time that will be used as the .created time for the new
        DocumentURI, if a new one is created
    :type created: datetime.datetime

    :param updated: the time that will be set as the .updated time for the new
        or existing DocumentURI
    :type updated: datetime.datetime

    :raises ConcurrentUpdateError: if flushing the session raises an
        IntegrityError

    """
    docuri = session.query(DocumentURI).filter(
        DocumentURI.claimant_normalized == uri_normalize(claimant),
        DocumentURI.uri_normalized == uri_normalize(uri),
        DocumentURI.type == type,
        DocumentURI.content_type == content_type).first()

    if docuri is None:
        docuri = DocumentURI(claimant=claimant,
                             uri=uri,
                             type=type,
                             content_type=content_type,
                             document=document,
                             created=created,
                             updated=updated)
        session.add(docuri)
    elif not docuri.document == document:
        # A mismatch is only reported; the existing row keeps its document.
        # Logger.warn is a deprecated alias of Logger.warning.
        log.warning(
            "Found DocumentURI (id: %d)'s document_id (%d) doesn't match "
            "given Document's id (%d)", docuri.id, docuri.document_id,
            document.id)

    # Applies to both a freshly created and a pre-existing DocumentURI.
    docuri.updated = updated

    # Only http(s) uris are used as the document's web_uri.
    if not document.web_uri:
        parsed = urlparse.urlparse(uri)
        if parsed.scheme in ('http', 'https'):
            document.web_uri = uri

    try:
        session.flush()
    except sa.exc.IntegrityError:
        raise ConcurrentUpdateError('concurrent document uri updates')
Exemple #6
0
 def claimant(self, value):
     """
     Store ``value`` as the claimant together with its normalized form.

     The raw value goes to ``_claimant`` and the result of ``uri_normalize``
     goes to ``_claimant_normalized``.
     """
     self._claimant = value
     normalized = uri_normalize(value)
     self._claimant_normalized = normalized
Exemple #7
0
 def uri(self, value):
     """
     Store ``value`` as the uri together with its normalized form.

     The raw value goes to ``_uri`` and the result of ``uri_normalize`` goes
     to ``_uri_normalized``.
     """
     self._uri = value
     normalized = uri_normalize(value)
     self._uri_normalized = normalized
Exemple #8
0
def create_or_update_document_meta(session,
                                   claimant,
                                   type,
                                   value,
                                   document,
                                   created,
                                   updated):
    """
    Create or update a DocumentMeta with the given parameters.

    If an equivalent DocumentMeta already exists in the database then its value
    and updated time will be updated.

    If no equivalent DocumentMeta exists in the database then a new one will be
    created and added to the database.

    To be considered "equivalent" an existing DocumentMeta must have the given
    claimant and type, but its value, document and created and updated times
    needn't match the given ones. The claimant is normalized before comparing.

    When the type is ``'title'`` and the value is non-empty, a document that
    has no title yet gets the first item of value as its title.

    :param session: the database session
    :type session: sqlalchemy.orm.session.Session

    :param claimant: the value to use for the DocumentMeta's claimant attribute
        if a new DocumentMeta is created
    :type claimant: unicode

    :param type: the value of the new or existing DocumentMeta's type attribute
    :type type: unicode

    :param value: the value to set the new or existing DocumentMeta's value
        attribute to
    :type value: list of unicode strings

    :param document: the value to use for the DocumentMeta's document if a new
        DocumentMeta is created
    :type document: memex.models.Document

    :param created: the value to use for the DocumentMeta's created attribute
        if a new DocumentMeta is created
    :type created: datetime.datetime

    :param updated: the value to set the new or existing DocumentMeta's updated
        attribute to
    :type updated: datetime.datetime

    :raises ConcurrentUpdateError: if flushing the session raises an
        IntegrityError

    """
    existing_dm = session.query(DocumentMeta).filter(
        DocumentMeta.claimant_normalized == uri_normalize(claimant),
        DocumentMeta.type == type).one_or_none()

    if existing_dm is None:
        session.add(DocumentMeta(
                    claimant=claimant,
                    type=type,
                    value=value,
                    document=document,
                    created=created,
                    updated=updated,
                    ))
    else:
        existing_dm.value = value
        existing_dm.updated = updated
        # The mismatch is logged only; the existing row is not re-pointed.
        if not existing_dm.document == document:
            # Logger.warning replaces the deprecated Logger.warn alias.
            log.warning("Found DocumentMeta (id: %d)'s document_id (%d) doesn't "
                        "match given Document's id (%d)",
                        existing_dm.id, existing_dm.document_id, document.id)

    # The first title claim seen becomes the title of an untitled document.
    if type == 'title' and value and not document.title:
        document.title = value[0]

    try:
        session.flush()
    except sa.exc.IntegrityError:
        raise ConcurrentUpdateError('concurrent document meta updates')
Exemple #9
0
def create_or_update_document_uri(session,
                                  claimant,
                                  uri,
                                  type,
                                  content_type,
                                  document,
                                  created,
                                  updated):
    """
    Create or update a DocumentURI with the given parameters.

    If an equivalent DocumentURI already exists in the database then its
    updated time will be updated.

    If no equivalent DocumentURI exists in the database then a new one will be
    created and added to the database.

    To be considered "equivalent" an existing DocumentURI must have the same
    claimant, uri, type and content_type, but the Document object that it
    belongs to may be different. The claimant and uri are normalized before
    comparing.

    A document without a web_uri gets the given uri as its web_uri when the
    uri's scheme is http or https.

    :param session: the database session
    :type session: sqlalchemy.orm.session.Session

    :param claimant: the .claimant property of the DocumentURI
    :type claimant: unicode

    :param uri: the .uri property of the DocumentURI
    :type uri: unicode

    :param type: the .type property of the DocumentURI
    :type type: unicode

    :param content_type: the .content_type property of the DocumentURI
    :type content_type: unicode

    :param document: the Document that the new DocumentURI will belong to, if a
        new DocumentURI is created
    :type document: memex.models.Document

    :param created: the time that will be used as the .created time for the new
        DocumentURI, if a new one is created
    :type created: datetime.datetime

    :param updated: the time that will be set as the .updated time for the new
        or existing DocumentURI
    :type updated: datetime.datetime

    :raises ConcurrentUpdateError: if flushing the session raises an
        IntegrityError

    """
    docuri = session.query(DocumentURI).filter(
        DocumentURI.claimant_normalized == uri_normalize(claimant),
        DocumentURI.uri_normalized == uri_normalize(uri),
        DocumentURI.type == type,
        DocumentURI.content_type == content_type).first()

    if docuri is None:
        docuri = DocumentURI(claimant=claimant,
                             uri=uri,
                             type=type,
                             content_type=content_type,
                             document=document,
                             created=created,
                             updated=updated)
        session.add(docuri)
    elif not docuri.document == document:
        # The mismatch is logged only; the existing row is not re-pointed.
        # Logger.warning replaces the deprecated Logger.warn alias.
        log.warning("Found DocumentURI (id: %d)'s document_id (%d) doesn't match "
                    "given Document's id (%d)",
                    docuri.id, docuri.document_id, document.id)

    # Runs for new and existing rows alike.
    docuri.updated = updated

    # Only an http(s) uri may become the document's web_uri.
    if not document.web_uri:
        parsed = urlparse.urlparse(uri)
        if parsed.scheme in ('http', 'https'):
            document.web_uri = uri

    try:
        session.flush()
    except sa.exc.IntegrityError:
        raise ConcurrentUpdateError('concurrent document uri updates')
Exemple #10
0
 def claimant(self, value):
     """
     Set the claimant and keep its normalized copy in step.

     ``_claimant`` receives ``value`` unchanged; ``_claimant_normalized``
     receives ``uri_normalize(value)``.
     """
     self._claimant = value
     claimant_normalized = uri_normalize(value)
     self._claimant_normalized = claimant_normalized
Exemple #11
0
 def uri(self, value):
     """
     Set the uri and keep its normalized copy in step.

     ``_uri`` receives ``value`` unchanged; ``_uri_normalized`` receives
     ``uri_normalize(value)``.
     """
     self._uri = value
     uri_normalized = uri_normalize(value)
     self._uri_normalized = uri_normalized