Example #1
    def get_page_ident_hash(self, page_uuid, page_version,
                            book_uuid, book_version, latest=None):
        """Return a ``(uuid, ident_hash)`` pair for a page reference.

        The book's tree is fetched through the ``tree_to_json`` database
        function and flattened into ident-hashes.  When the page is found
        in that tree the result is the book's uuid plus a combined
        ``<book>:<page>`` ident_hash; otherwise only the page's own uuid
        and ident_hash are returned.

        :param page_uuid: uuid of the referenced page
        :param page_version: version of the page, or ``None`` to accept
            whichever version of the page appears in the book
        :param book_uuid: uuid of the book to search
        :param book_version: version of the book to search
        :param latest: when truthy, the book part of the combined
            ident_hash is the bare ``book_uuid`` rather than the
            versioned book ident_hash
        :returns: two-tuple of ``(uuid, ident_hash)``
        """
        from cnxepub import flatten_tree_to_ident_hashes  # XXX
        plan = self.plpy.prepare(
            'SELECT tree_to_json($1, $2, FALSE)::json', ('text', 'text'))
        tree = self.plpy.execute(
            plan, (book_uuid, book_version))[0]['tree_to_json']
        # The tree may come back as serialized JSON text; decode it so it
        # can be walked.
        if isinstance(tree, basestring):
            tree = json.loads(tree)

        book_ident_hash = join_ident_hash(book_uuid, book_version)
        page_ident_hash = join_ident_hash(page_uuid, page_version)
        book_part = book_uuid if latest else book_ident_hash

        for p_ident_hash in flatten_tree_to_ident_hashes(tree):
            p_id, p_version = split_ident_hash(p_ident_hash)
            if p_id != page_uuid:
                continue
            if page_version is None or page_version == p_version:
                return book_uuid, '{}:{}'.format(book_part, page_ident_hash)

        # The page isn't in the given book, so only return the page.
        return page_uuid, page_ident_hash
Example #2
def get_id_n_version(ident_hash):
    """From the given ``ident_hash`` return the id and version.

    When the ident-hash carries no version, the latest version is looked
    up and used in its place.

    :param ident_hash: ident-hash string, with or without a version
    :returns: two-tuple of ``(id, version)``
    :raises NotFound: when ``ident_hash`` has no version and no latest
        version can be found for it
    """
    try:
        id_, version = split_ident_hash(ident_hash)
    except IdentHashMissingVersion:
        # XXX Don't import from views... And don't use httpexceptions
        from pyramid.httpexceptions import HTTPNotFound
        from cnxarchive.views.helpers import get_latest_version
        try:
            version = get_latest_version(ident_hash)
        except HTTPNotFound:
            raise NotFound(ident_hash)
        id_, version = split_ident_hash(join_ident_hash(ident_hash, version))
    else:
        # NOTE(review): running the verification only for an explicitly
        # versioned ident-hash is inferred from the statement's
        # indentation -- confirm against the upstream source.
        verify_id_n_version(id_, version)

    return id_, version
Example #3
def get_id_n_version(ident_hash):
    """From the given ``ident_hash`` return the id and version.

    When the ident-hash carries no version, the latest version is looked
    up and used in its place.

    :param ident_hash: ident-hash string, with or without a version
    :returns: two-tuple of ``(id, version)``
    :raises NotFound: when ``ident_hash`` has no version and no latest
        version can be found for it
    """
    try:
        id_, version = split_ident_hash(ident_hash)
    except IdentHashMissingVersion:
        # XXX Don't import from views... And don't use httpexceptions
        from pyramid.httpexceptions import HTTPNotFound
        from cnxarchive.views import get_latest_version
        try:
            version = get_latest_version(ident_hash)
        except HTTPNotFound:
            raise NotFound(ident_hash)
        id_, version = split_ident_hash(join_ident_hash(ident_hash, version))
    else:
        # NOTE(review): running the verification only for an explicitly
        # versioned ident-hash is inferred from the statement's
        # indentation -- confirm against the upstream source.
        verify_id_n_version(id_, version)

    return id_, version
def inject_resource(ident_hash, file, filename, media_type):
    """Inject the contents of ``file`` into the database.

    ``file`` (a file-like object) is stored with ``media_type`` and is
    associated, under ``filename``, with the content at ``ident_hash``.

    :param ident_hash: ident-hash of the content that owns the resource
    :param file: file-like object holding the resource's contents
    :param filename: name the resource is attached under
    :param media_type: media type recorded for the file
    :returns: the resource hash reported by ``insert_file``
    """
    # NOTE(review): this value is overwritten by ``insert_file`` below;
    # the call is kept in case ``get_file_sha1`` is relied on for its
    # effect on ``file`` -- confirm before removing.
    resource_hash = get_file_sha1(file)
    with db_connect() as db_conn:
        # The helpers below are not handed the cursor; it is opened only
        # to keep the same connection/cursor scope around the calls.
        with db_conn.cursor():
            id_and_version = split_ident_hash(ident_hash)
            module_ident = lookup_module_ident(*id_and_version)
            fileid, resource_hash = insert_file(file, media_type)
            upsert_module_file(module_ident, fileid, filename)
    return resource_hash
Example #5
def inject_resource(ident_hash, file, filename, media_type):
    """Inject the contents of ``file`` into the database.

    ``file`` (a file-like object) is stored with ``media_type`` and is
    associated, under ``filename``, with the content at ``ident_hash``.

    :param ident_hash: ident-hash of the content that owns the resource
    :param file: file-like object holding the resource's contents
    :param filename: name the resource is attached under
    :param media_type: media type recorded for the file
    :returns: the resource hash reported by ``insert_file``
    """
    # NOTE(review): this value is overwritten by ``insert_file`` below;
    # the call is kept in case ``get_file_sha1`` is relied on for its
    # effect on ``file`` -- confirm before removing.
    resource_hash = get_file_sha1(file)
    with db_connect() as db_conn:
        # The helpers below are not handed the cursor; it is opened only
        # to keep the same connection/cursor scope around the calls.
        with db_conn.cursor():
            id_and_version = split_ident_hash(ident_hash)
            module_ident = lookup_module_ident(*id_and_version)
            fileid, resource_hash = insert_file(file, media_type)
            upsert_module_file(module_ident, fileid, filename)
    return resource_hash
def _insert_metadata(cursor, model, publisher, message):
    """Insert a module record built from the metadata of ``model``.

    The statement parameters start as a copy of ``model.metadata`` and are
    extended with the publisher, the publication message, the portal type
    and the parent ident-hash.  When the model already has an ident-hash,
    its uuid and version (and the legacy ``moduleid``, if one exists) are
    supplied explicitly; otherwise the database defaults are used.

    :param cursor: database cursor used for the lookup and the insertion
    :param model: model object providing ``metadata`` and ``ident_hash``
    :param publisher: value stored as the record's publisher
    :param message: value stored as the publication message
    :returns: the row fetched after executing the insertion statement
    """
    params = model.metadata.copy()
    params['publisher'] = publisher
    params['publication_message'] = message
    params['_portal_type'] = _model_to_portaltype(model)

    # Transform person structs to id lists for database array entry.
    for person_field in ATTRIBUTED_ROLE_KEYS:
        params[person_field] = [parse_user_uri(x['id'])
                                for x in params.get(person_field, [])]
    params['parent_ident_hash'] = parse_parent_ident_hash(model)

    # Assign the id and version if one is known.
    if model.ident_hash is not None:
        # NOTE(review): the ``split_version`` keyword is reconstructed;
        # the unpacking below needs the version as a (major, minor) pair.
        uuid, version = split_ident_hash(model.ident_hash,
                                         split_version=True)
        params['_uuid'] = uuid
        params['_major_version'], params['_minor_version'] = version
        # Lookup legacy ``moduleid``.
        cursor.execute("SELECT moduleid FROM latest_modules WHERE uuid = %s",
                       (uuid,))
        # There is the chance that a uuid and version have been set,
        #   but a previous publication does not exist. Therefore the
        #   moduleid will not be found. This happens on a pre-publication.
        row = cursor.fetchone()
        moduleid = None if row is None else row[0]
        params['_moduleid'] = moduleid

        # Format the statement to accept the identifiers.
        stmt = MODULE_INSERTION_TEMPLATE.format(**{
            '__uuid__': "%(_uuid)s::uuid",
            '__major_version__': "%(_major_version)s",
            '__minor_version__': "%(_minor_version)s",
            '__moduleid__': ("DEFAULT" if moduleid is None
                             else "%(_moduleid)s"),
        })
    else:
        # Format the statement for defaults.
        stmt = MODULE_INSERTION_TEMPLATE.format(**{
            '__uuid__': "DEFAULT",
            '__major_version__': "DEFAULT",
            '__minor_version__': "DEFAULT",
            '__moduleid__': "DEFAULT",
        })

    cursor.execute(stmt, params)
    return cursor.fetchone()
Example #7
def main(argv=None):
    """Count the hits from logfile.

    Parses the command line, reads the log file with the opener selected
    by the log format, hands the log to ``parse_log`` and writes the
    resulting per-ident-hash hit counts to the ``document_hits`` table
    before calling the ``update_hit_ranks()`` database function.

    :param argv: argument list handed to the parser's ``parse_args``
    :returns: ``0``
    """
    parser = create_parser('hits_counter', description=__doc__)
    parser.add_argument('--hostname', default='cnx.org',
                        help="hostname of the site (default: cnx.org)")
    # NOTE(review): the opening ``parser.add_argument(...`` line for the
    # log-format option appears to be missing here (``args.log_format``
    # is read below).
                        default=LOG_FORMAT_GZ, choices=LOG_FORMATS,
                        help="(default: {})".format(LOG_FORMAT_GZ))
    # NOTE(review): the opening ``parser.add_argument(...`` line for the
    # log file argument appears to be missing here (``args.log_file`` is
    # read below).
                        help="path to the logfile.")
    args = parser.parse_args(argv)

    # Pick the callable used to open the log for the chosen format.
    opener = LOG_FORMAT_OPENERS_MAPPING[args.log_format]

    # Build the URL pattern.
    # NOTE(review): '\.' is an unrecognized escape in a non-raw string;
    # a raw string (r'\.') would state the intent explicitly.
    hostname = args.hostname.replace('.', '\.')
    url_pattern = URL_PATTERN_TMPLT.format(hostname)
    url_pattern = re.compile(url_pattern)

    # Parse the log to structured data.
    with opener(args.log_file) as log:
        hits, start_timestamp, end_timestamp = parse_log(log, url_pattern)

    # Parse the configuration file for the postgres connection string.
    settings = get_app_settings_from_arguments(args)

    # Insert the hits into the database.
    connection_string = settings[config.CONNECTION_STRING]
    with psycopg2.connect(connection_string) as db_connection:
        with db_connection.cursor() as cursor:
            for ident_hash, hit_count in hits.items():
                id, version = split_ident_hash(ident_hash)
                # NOTE(review): the ``cursor.execute(`` call and SQL text
                # that look up the module for (id, version) appear to be
                # missing here; only the parameter tuple remains.
                               (id, version))
                # NOTE(review): ``fetchone()`` yields a row (or None), not
                # a bare value -- confirm this is what the insert expects.
                module_ident = cursor.fetchone()
                # NOTE(review): the tail of this tuple (presumably
                # ``hit_count``) and its closing parenthesis are missing.
                payload = (module_ident, start_timestamp, end_timestamp,
                cursor.execute("INSERT INTO document_hits "
                               "  VALUES (%s, %s, %s, %s);",
                # NOTE(review): the parameters argument and the closing
                # parenthesis of the call above are missing.
            cursor.execute("SELECT update_hit_ranks();")
    return 0
def get_previous_publication(cursor, ident_hash):
    """Get the previous publication of the given
    publication as an ident-hash.

    :param cursor: database cursor used to run the lookup
    :param ident_hash: ident-hash of the publication to start from
    :returns: the ident-hash of an earlier publication sharing the same
        uuid, or ``None`` when the query yields no row
    """
    uuid, version = split_ident_hash(ident_hash)
    # NOTE(review): the ``ORDER BY`` clause is reconstructed.  Without an
    # ordering ``LIMIT 1`` would pick an arbitrary earlier publication;
    # ordering by ``module_ident`` matches the filter used just above it.
    cursor.execute("""\
WITH contextual_module AS (
  SELECT uuid, module_ident
  FROM modules
  WHERE uuid = %s AND concat_ws('.', major_version, minor_version) = %s)
SELECT m.uuid||'@'||concat_ws('.', m.major_version, m.minor_version)
FROM modules AS m JOIN contextual_module AS context ON (m.uuid = context.uuid)
WHERE
  m.module_ident < context.module_ident
ORDER BY m.module_ident DESC
LIMIT 1""",
                   (uuid, version,))
    row = cursor.fetchone()
    # No row means there is no earlier publication on record.
    if row is None:
        return None
    return row[0]
def check_REVISED_BOOK_in_archive(test_case, cursor):
    """This checker assumes that the only content in the database
    is the content within the BOOK and REVISED_BOOK use cases.

    Three things are asserted through ``test_case``: the module records,
    the tree produced by ``tree_to_json`` for the latest collection, and
    the resource file attached to the revised document.

    :param test_case: object providing ``assertEqual``
    :param cursor: database cursor used to run the checks
    """
    binder = REVISED_BOOK
    document = REVISED_BOOK[0][0]

    # Check the module records...
    cursor.execute("""\
SELECT uuid, moduleid, major_version, minor_version, version
FROM modules ORDER BY major_version ASC""")
    records = {}
    key_sep = '--'
    for row in cursor.fetchall():
        # Group the version columns under a "<uuid>--<moduleid>" key.
        key = key_sep.join([str(x) for x in row[:2]])
        records.setdefault(key, []).append(list(row[2:]))
    binder_uuid = split_ident_hash(binder.id)[0]
    document_uuid = split_ident_hash(document.id)[0]
    expected_records = {
        # [uuid, moduleid]: [[major_version, minor_version, version], ...]
        key_sep.join([binder_uuid, 'col10000']): [
            [1, 1, '1.1'],  # BOOK
            [2, 1, '1.2'],  # REVISED_BOOK
        ],
        key_sep.join([document_uuid, 'm10000']): [
            [1, None, '1.1'],
            [2, None, '1.2'],
        ],
    }
    test_case.assertEqual(expected_records, records)

    # Check the tree...
    # This also proves that the REVISED_BOOK is in latest_modules
    # by virtual of using the tree_to_json function.
    binder_ident_hash = join_ident_hash(split_ident_hash(binder.id)[0],
                                        (2, 1,))
    document_ident_hash = join_ident_hash(split_ident_hash(document.id)[0],
                                          (2, None,))
    expected_tree = {
        u"id": unicode(binder_ident_hash),
        u"title": u"Book of Infinity",
        u"contents": [
            {u"id": u"subcol",
             u"title": REVISED_BOOK[0].metadata['title'],
             u"contents": [
                 {u"id": unicode(document_ident_hash),
                  u"title": REVISED_BOOK[0].get_title_for_node(document)}]}]}
    cursor.execute("""\
SELECT tree_to_json(uuid::text, concat_ws('.', major_version, minor_version))
FROM latest_modules
WHERE portal_type = 'Collection'""")
    tree = json.loads(cursor.fetchone()[0])
    test_case.assertEqual(expected_tree, tree)

    # Check the resource file...
    resource_hash = hashlib.new(cnxepub.RESOURCE_HASH_TYPE,
                                _read_file(RESOURCE_ONE_FILEPATH).read()) \
                           .hexdigest()
    # FIXME Remove and change assertion after cnx-archive switches to
    # ``cnxepub.RESOURCE_HASH_TYPE`` as hash. Use ``resource_hash`` in the
    # check instead of ``file_md5``.
    file_md5 = hashlib.new('md5',
                           _read_file(RESOURCE_ONE_FILEPATH).read()) \
                      .hexdigest()
    # NOTE(review): the third selected column is reconstructed from the
    # three-way unpacking and the ident-hash comparison below.
    cursor.execute("""\
SELECT f.file, mf.mimetype,
       m.uuid||'@'||concat_ws('.', m.major_version, m.minor_version)
FROM files as f natural join module_files as mf, latest_modules as m
WHERE
  mf.module_ident = m.module_ident
  AND
  f.md5 = %s""", (file_md5,))
    file, mime_type, ident_hash = cursor.fetchone()
    test_case.assertEqual(mime_type, 'image/png')
    test_case.assertEqual(ident_hash, document_ident_hash)
    test_case.assertEqual(file[:], _read_file(RESOURCE_ONE_FILEPATH).read())
def republish_binders(cursor, models):
    """Republish the Binders that share Documents in the publication context.
    This needs to be given all the models in the publication context.

    For every document in the context that has a previous publication,
    the trees containing that previous publication are walked up to their
    root.  Each root found in ``latest_modules`` that is not itself part
    of the context is republished with a minor version bump and has its
    tree rebuilt.

    :param cursor: database cursor used for all lookups and writes
    :param models: list, tuple or set of the model objects being published
    :returns: list of the ident-hashes created by republication
    :raises TypeError: if ``models`` is not a list, tuple or set
    """
    documents = set()
    binders = set()
    history_mapping = {}  # <previous-ident-hash>: <current-ident-hash>
    if not isinstance(models, (list, tuple, set,)):
        raise TypeError("``models`` Must be a sequence of model objects."
                        "We were given: {}".format(models))
    # NOTE(review): the statements filling ``binders`` and ``documents``
    # are reconstructed from how both sets are consumed below, as
    # (uuid, version) pairs -- confirm against the upstream source.
    for model in models:
        if isinstance(model, (cnxepub.Binder,)):
            binders.add(split_ident_hash(model.ident_hash))
            for doc in cnxepub.flatten_to_documents(model):
                documents.add(split_ident_hash(doc.ident_hash))
        else:
            documents.add(split_ident_hash(model.ident_hash))

    to_be_republished = []
    # What binders are these documents a part of?
    for (uuid, version) in documents:
        ident_hash = join_ident_hash(uuid, version)
        previous_ident_hash = get_previous_publication(cursor, ident_hash)
        if previous_ident_hash is None:
            # Has no prior existence.
            continue
        history_mapping[previous_ident_hash] = ident_hash
        # Climb from every tree node holding the previous publication to
        # the tree root; ``path`` guards against revisiting a node.
        cursor.execute("""\
WITH RECURSIVE t(nodeid, parent_id, documentid, path) AS (
  SELECT tr.nodeid, tr.parent_id, tr.documentid, ARRAY[tr.nodeid]
  FROM trees tr
  WHERE tr.documentid = (
    SELECT module_ident FROM modules
    WHERE uuid||'@'||concat_ws('.', major_version, minor_version) = %s)
UNION ALL
  SELECT c.nodeid, c.parent_id, c.documentid, path || ARRAY[c.nodeid]
  FROM trees c JOIN t ON (c.nodeid = t.parent_id)
  WHERE not c.nodeid = ANY(t.path)
)
SELECT uuid||'@'||concat_ws('.', major_version, minor_version)
FROM t JOIN latest_modules m ON (t.documentid = m.module_ident)
WHERE t.parent_id IS NULL
""",
                       (previous_ident_hash,))
        to_be_republished.extend([split_ident_hash(x[0])
                                  for x in cursor.fetchall()])
    to_be_republished = set(to_be_republished)

    republished_ident_hashes = []
    # Republish the Collections set.
    for (uuid, version) in to_be_republished:
        if (uuid, version,) in binders:
            # This binder is already in the publication context,
            # don't try to publish it again.
            continue
        ident_hash = join_ident_hash(uuid, version)
        bumped_version = bump_version(cursor, uuid, is_minor_bump=True)
        # NOTE(review): the trailing argument of this call is
        # reconstructed; the bumped version is assumed to be the third
        # positional parameter of ``republish_collection``.
        republished_ident_hash = republish_collection(cursor, ident_hash,
                                                      bumped_version)
        # Set the identifier history.
        history_mapping[ident_hash] = republished_ident_hash
        rebuild_collection_tree(cursor, ident_hash, history_mapping)
        republished_ident_hashes.append(republished_ident_hash)

    return republished_ident_hashes
Example #11
    def fix_link_references(self):
        """Fix references to internal documents and resources.

        Walks every ``c:link`` element, parses its ``url`` attribute with
        ``parse_html_reference`` and, depending on the reference type,
        assigns ``document``/``version``, ``resource`` or ``url``
        attributes on the link.

        :returns: the ``bad_references`` list built up during the scan
        """
        # Catch the invalid, unparsable, etc. references.
        bad_references = []

        # Note, all c:link will have an @url. We purposely dumb down the xslt
        #   in order to make the scan here easy. Plus the xslt could never
        #   fully match and disassemble the url into the various attributes on
        #   a link tag.
        for link in self.apply_xpath('//c:link'):
            ref = link.get('url')

            if not ref or self._should_ignore_reference(ref):
                # NOTE(review): the body of this ``if`` (presumably
                # ``continue``) is missing from this listing.

                # NOTE(review): a ``try:`` line appears to be missing
                # before the next statement.
                ref_type, payload = parse_html_reference(ref)
            except ValueError:
                exc = InvalidReference(self.document_ident, ref)
                # NOTE(review): ``exc`` is never recorded or raised in the
                # visible code; lines appear to be missing here.

            # Delete the tentative attribute
            # NOTE(review): the statement this comment describes is
            # missing from this listing.

            # TODO handle #{id} refs in the url_frag, which could also contain
            #      path elements, so further parsing is necessary.

            if ref_type == DOCUMENT_REFERENCE:
                id, version, url_frag = payload
                mid, version = self.get_mid_n_version(id, version)
                if mid is None:
                        # NOTE(review): the call that receives this
                        # exception (presumably
                        # ``bad_references.append(``) is missing.
                        ReferenceNotFound("Unable to find a reference to "
                                          "'{}' at version '{}'."
                                          .format(id, version),
                                          self.document_ident, ref))
                    # NOTE(review): an ``else:`` line appears to be
                    # missing before this branch.
                    # Assign the document specific attributes.
                    link.attrib['document'] = mid
                    if version is not None:
                        link.attrib['version'] = version
            elif ref_type == BINDER_REFERENCE:
                id, version, bound_document, url_frag = payload
                mid, version = self.get_mid_n_version(id, version)
                # NOTE(review): the nesting of the branches below cannot
                # be determined from this listing (``else:`` lines and the
                # opener of the final ``ReferenceNotFound`` call are
                # missing) -- verify against the upstream source.
                if mid is None:
                    id, version, id_type = split_ident_hash(bound_document)
                    cid, cversion = self.get_mid_n_version(id, version)
                    if cid is None:
                        # Binder ref doesn't exist, but Document does.
                        # Assign the document specific attributes.
                        link.attrib['document'] = mid
                        if version is not None:
                            link.attrib['version'] = version
                        # Process the ref as an external url.
                        url = "http://legacy.cnx.org/content/" \
                              "{}/{}/?collection={}".format(mid, version, cid)
                        if cversion is not None:
                            url = "{}/{}".format(url, cversion)
                        link.attrib['url'] = url
                        ReferenceNotFound("Unable to find a reference to "
                                          "'{}' at version '{}'."
                                          .format(id, version),
                                          self.document_ident, ref))
            elif ref_type == RESOURCE_REFERENCE:
                # TODO If the filename is in the url_frag, use it
                #      as the filename instead of looking it up.
                    # NOTE(review): a ``try:`` line appears to be missing
                    # before the next two statements.
                    sha1_hash, url_frag = payload
                    filename = self.get_resource_filename(sha1_hash)
                except ReferenceNotFound as exc:
                    # NOTE(review): the handler body and a following
                    # ``else:`` appear to be missing; as listed, the
                    # assignment below would sit inside the handler.
                    link.attrib['resource'] = filename
                # NOTE(review): an ``else:`` for unrecognized reference
                # types appears to be missing before the lines below.
                exc = InvalidReference(self.document_ident, ref)
                # Preserve the content value...
                link.attrib['url'] = ref

        return bad_references