def get_page_ident_hash(self, page_uuid, page_version,
                        book_uuid, book_version, latest=None):
    """Return the uuid of the page and the full ident_hash of the page,
    which may or may not include the book uuid depending on whether the
    page is within the book.
    """
    from cnxepub import flatten_tree_to_ident_hashes  # XXX
    plan = self.plpy.prepare(
        'SELECT tree_to_json($1, $2, FALSE)::json', ('text', 'text'))
    tree = self.plpy.execute(
        plan, (book_uuid, book_version))[0]['tree_to_json']
    if isinstance(tree, basestring):
        tree = json.loads(tree)
    pages = list(flatten_tree_to_ident_hashes(tree))
    book_ident_hash = join_ident_hash(book_uuid, book_version)
    page_ident_hash = join_ident_hash(page_uuid, page_version)
    for p_ident_hash in pages:
        p_id, p_version = split_ident_hash(p_ident_hash)
        if (p_id == page_uuid and
                (page_version is None or page_version == p_version)):
            return book_uuid, '{}:{}'.format(
                latest and book_uuid or book_ident_hash,
                page_ident_hash)
    # The page isn't in the given book, so only return the page.
    return page_uuid, page_ident_hash
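
# Usage sketch (added for illustration; not in the original source).  The
# method above runs inside PL/Python, so it can only be exercised through an
# object exposing ``self.plpy``; the uuid and version values below are
# hypothetical.
#
#   page_uuid, page_ident_hash = shim.get_page_ident_hash(
#       'd395b566-5fe3-4428-bcb2-19016e3aa3ce', '4',    # page
#       'e79ffde3-7fb4-4af3-9ec8-df648b391597', '7.1')  # book
#   # When the page is in the book's tree the result is
#   # ('<page uuid>', '<book ident hash>:<page ident hash>');
#   # otherwise only the page's own ident hash is returned.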
def get_id_n_version(ident_hash):
    """From the given ``ident_hash`` return the id and version."""
    try:
        id, version = split_ident_hash(ident_hash)
    except IdentHashMissingVersion:
        # XXX Don't import from views... And don't use httpexceptions
        from pyramid.httpexceptions import HTTPNotFound
        from cnxarchive.views.helpers import get_latest_version
        try:
            version = get_latest_version(ident_hash)
        except HTTPNotFound:
            raise NotFound(ident_hash)
        id, version = split_ident_hash(join_ident_hash(ident_hash, version))
    else:
        verify_id_n_version(id, version)
    return id, version
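
# Usage sketch (illustrative only; the ident-hash values are hypothetical).
# ``get_id_n_version`` accepts either a full '<uuid>@<version>' ident hash or
# a bare uuid, in which case the latest published version is looked up.
def _example_get_id_n_version():
    # Full ident hash: split directly and verified.
    id, version = get_id_n_version(
        'd395b566-5fe3-4428-bcb2-19016e3aa3ce@4')
    # Bare uuid: the latest version is resolved for it; an unknown uuid
    # raises ``NotFound``.
    id, latest_version = get_id_n_version(
        'd395b566-5fe3-4428-bcb2-19016e3aa3ce')
    return id, version, latest_version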
def inject_resource(ident_hash, file, filename, media_type):
    """Injects the contents of ``file`` (a file-like object) into the
    database as ``filename`` with ``media_type`` in association with the
    content at ``ident_hash``.
    """
    resource_hash = get_file_sha1(file)
    with db_connect() as db_conn:
        with db_conn.cursor() as cursor:
            s_ident_hash = split_ident_hash(ident_hash)
            module_ident = lookup_module_ident(*s_ident_hash)
            fileid, resource_hash = insert_file(file, media_type)
            upsert_module_file(module_ident, fileid, filename)
    return resource_hash
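
# Usage sketch (illustrative only): attach an image to an existing piece of
# content.  The ident hash and file path are hypothetical, and a database
# reachable through ``db_connect`` is assumed.
def _example_inject_resource():
    with open('figure-01.png', 'rb') as resource_file:
        sha1 = inject_resource(
            'd395b566-5fe3-4428-bcb2-19016e3aa3ce@4',  # hypothetical
            resource_file, 'figure-01.png', 'image/png')
    # ``sha1`` identifies the stored resource file.
    return sha1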
def _insert_metadata(cursor, model, publisher, message):
    """Insert a module with the given ``metadata``."""
    params = model.metadata.copy()
    params['publisher'] = publisher
    params['publication_message'] = message
    params['_portal_type'] = _model_to_portaltype(model)

    # Transform person structs to id lists for database array entry.
    for person_field in ATTRIBUTED_ROLE_KEYS:
        params[person_field] = [parse_user_uri(x['id'])
                                for x in params.get(person_field, [])]
    params['parent_ident_hash'] = parse_parent_ident_hash(model)

    # Assign the id and version if one is known.
    if model.ident_hash is not None:
        uuid, version = split_ident_hash(model.ident_hash,
                                         split_version=True)
        params['_uuid'] = uuid
        params['_major_version'], params['_minor_version'] = version
        # Lookup legacy ``moduleid``.
        cursor.execute("SELECT moduleid FROM latest_modules WHERE uuid = %s",
                       (uuid,))
        # There is the chance that a uuid and version have been set,
        # but a previous publication does not exist. Therefore the
        # moduleid will not be found. This happens on a pre-publication.
        try:
            moduleid = cursor.fetchone()[0]
        except TypeError as exc:  # NoneType
            moduleid = None
        params['_moduleid'] = moduleid

        # Format the statement to accept the identifiers.
        stmt = MODULE_INSERTION_TEMPLATE.format(**{
            '__uuid__': "%(_uuid)s::uuid",
            '__major_version__': "%(_major_version)s",
            '__minor_version__': "%(_minor_version)s",
            '__moduleid__': moduleid is None and "DEFAULT" or "%(_moduleid)s",
            })
    else:
        # Format the statement for defaults.
        stmt = MODULE_INSERTION_TEMPLATE.format(**{
            '__uuid__': "DEFAULT",
            '__major_version__': "DEFAULT",
            '__minor_version__': "DEFAULT",
            '__moduleid__': "DEFAULT",
            })

    cursor.execute(stmt, params)
    return cursor.fetchone()
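
# Usage sketch (illustrative only): ``_insert_metadata`` is an internal
# helper normally called while persisting a publication.  It expects an open
# database cursor and a cnx-epub model whose ``metadata`` dict is already
# populated; the publisher id and message below are hypothetical.
def _example_insert_metadata(cursor, document):
    row = _insert_metadata(cursor, document,
                           publisher='user1',
                           message='Initial publication')
    # ``row`` carries the identifiers of the newly inserted module record.
    return row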
def main(argv=None):
    """Count the hits from logfile."""
    parser = create_parser('hits_counter', description=__doc__)
    parser.add_argument('--hostname', default='cnx.org',
                        help="hostname of the site (default: cnx.org)")
    parser.add_argument('--log-format',
                        default=LOG_FORMAT_GZ, choices=LOG_FORMATS,
                        help="(default: {})".format(LOG_FORMAT_GZ))
    parser.add_argument('log_file',
                        help="path to the logfile.")
    args = parser.parse_args(argv)

    opener = LOG_FORMAT_OPENERS_MAPPING[args.log_format]

    # Build the URL pattern.
    hostname = args.hostname.replace('.', '\.')
    url_pattern = URL_PATTERN_TMPLT.format(hostname)
    url_pattern = re.compile(url_pattern)

    # Parse the log to structured data.
    with opener(args.log_file) as log:
        hits, start_timestamp, end_timestamp = parse_log(log, url_pattern)

    # Parse the configuration file for the postgres connection string.
    settings = get_app_settings_from_arguments(args)

    # Insert the hits into the database.
    connection_string = settings[config.CONNECTION_STRING]
    with psycopg2.connect(connection_string) as db_connection:
        with db_connection.cursor() as cursor:
            for ident_hash, hit_count in hits.items():
                id, version = split_ident_hash(ident_hash)
                cursor.execute(SQL_GET_MODULE_IDENT_BY_UUID_N_VERSION,
                               (id, version))
                module_ident = cursor.fetchone()
                payload = (module_ident, start_timestamp, end_timestamp,
                           hit_count,)
                cursor.execute("INSERT INTO document_hits "
                               "  VALUES (%s, %s, %s, %s);",
                               payload)
            cursor.execute("SELECT update_hit_ranks();")
    return 0
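
# Usage sketch (illustrative only).  It is assumed here that the shared
# ``create_parser`` helper adds the application config file as a positional
# argument before ``log_file``; both paths below are hypothetical.
#
#   hits_counter development.ini /var/log/nginx/access.log.gz
#
def _example_run_hits_counter():
    # Drive the script programmatically with the same arguments.
    return main(['development.ini', '/var/log/nginx/access.log.gz'])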
def get_previous_publication(cursor, ident_hash):
    """Get the previous publication of the given publication
    as an ident-hash.
    """
    uuid, version = split_ident_hash(ident_hash)
    cursor.execute("""\
WITH contextual_module AS (
  SELECT uuid, module_ident
  FROM modules
  WHERE uuid = %s AND concat_ws('.', major_version, minor_version) = %s)
SELECT m.uuid||'@'||concat_ws('.', m.major_version, m.minor_version)
FROM modules AS m JOIN contextual_module AS context ON (m.uuid = context.uuid)
WHERE m.module_ident < context.module_ident
ORDER BY revised DESC
LIMIT 1""", (uuid, version,))
    try:
        previous_ident_hash = cursor.fetchone()[0]
    except TypeError:  # NoneType
        previous_ident_hash = None
    return previous_ident_hash
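
# Usage sketch (illustrative only): look up the ident hash published
# immediately before a given publication.  A psycopg2 cursor on the archive
# database is assumed and the ident hash is hypothetical.
def _example_get_previous_publication(cursor):
    previous = get_previous_publication(
        cursor, 'e79ffde3-7fb4-4af3-9ec8-df648b391597@7.1')
    # ``previous`` is e.g. '...@7.0', or None on a first publication.
    return previous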
def check_REVISED_BOOK_in_archive(test_case, cursor):
    """This checker assumes that the only content in the database
    is the content within the BOOK and REVISED_BOOK use cases.
    """
    binder = REVISED_BOOK
    document = REVISED_BOOK[0][0]

    # Check the module records...
    cursor.execute("""\
SELECT uuid, moduleid, major_version, minor_version, version
FROM modules ORDER BY major_version ASC""")
    records = {}
    key_sep = '--'
    for row in cursor.fetchall():
        key = key_sep.join([str(x) for x in row[:2]])
        value = list(row[2:])
        if key not in records:
            records[key] = []
        records[key].append(value)
    binder_uuid = split_ident_hash(binder.id)[0]
    document_uuid = split_ident_hash(document.id)[0]
    expected_records = {
        # [uuid, moduleid]: [[major_version, minor_version, version], ...]
        key_sep.join([binder_uuid, 'col10000']): [
            [1, 1, '1.1'],  # BOOK
            [2, 1, '1.2'],  # REVISED_BOOK
            ],
        key_sep.join([document_uuid, 'm10000']): [
            [1, None, '1.1'],
            [2, None, '1.2'],
            ],
        }
    test_case.assertEqual(expected_records, records)

    # Check the tree...
    # This also proves that the REVISED_BOOK is in latest_modules
    # by virtue of using the tree_to_json function.
    binder_ident_hash = join_ident_hash(split_ident_hash(binder.id)[0],
                                        (2, 1,))
    document_ident_hash = join_ident_hash(split_ident_hash(document.id)[0],
                                          (2, None,))
    expected_tree = {
        u"id": unicode(binder_ident_hash),
        u"title": u"Book of Infinity",
        u"contents": [
            {u"id": u"subcol",
             u"title": REVISED_BOOK[0].metadata['title'],
             u"contents": [
                 {u"id": unicode(document_ident_hash),
                  u"title": REVISED_BOOK[0].get_title_for_node(document)}]}]}
    cursor.execute("""\
SELECT tree_to_json(uuid::text, concat_ws('.', major_version, minor_version))
FROM latest_modules
WHERE portal_type = 'Collection'""")
    tree = json.loads(cursor.fetchone()[0])
    test_case.assertEqual(expected_tree, tree)

    resource_hash = hashlib.new(cnxepub.RESOURCE_HASH_TYPE,
                                _read_file(RESOURCE_ONE_FILEPATH).read()) \
        .hexdigest()
    # FIXME Remove and change assertion after cnx-archive switches to
    #       ``cnxepub.RESOURCE_HASH_TYPE`` as hash. Use ``resource_hash``
    #       in the check instead of ``file_md5``.
    file_md5 = hashlib.new('md5',
                           _read_file(RESOURCE_ONE_FILEPATH).read()) \
        .hexdigest()
    cursor.execute("""\
SELECT f.file, mf.mimetype,
       m.uuid||'@'||concat_ws('.', m.major_version, m.minor_version)
FROM files AS f NATURAL JOIN module_files AS mf, latest_modules AS m
WHERE mf.module_ident = m.module_ident
  AND f.md5 = %s""", (file_md5,))
    file, mime_type, ident_hash = cursor.fetchone()
    test_case.assertEqual(mime_type, 'image/png')
    test_case.assertEqual(ident_hash, document_ident_hash)
    test_case.assertEqual(file[:], _read_file(RESOURCE_ONE_FILEPATH).read())
def republish_binders(cursor, models):
    """Republish the Binders that share Documents in the publication context.

    This needs to be given all the models in the publication context.
    """
    documents = set([])
    binders = set([])
    history_mapping = {}  # <previous-ident-hash>: <current-ident-hash>
    if not isinstance(models, (list, tuple, set,)):
        raise TypeError("``models`` must be a sequence of model objects. "
                        "We were given: {}".format(models))
    for model in models:
        if isinstance(model, (cnxepub.Binder,)):
            binders.add(split_ident_hash(model.ident_hash))
            for doc in cnxepub.flatten_to_documents(model):
                documents.add(split_ident_hash(doc.ident_hash))
        else:
            documents.add(split_ident_hash(model.ident_hash))

    to_be_republished = []
    # What binders are these documents a part of?
    for (uuid, version) in documents:
        ident_hash = join_ident_hash(uuid, version)
        previous_ident_hash = get_previous_publication(cursor, ident_hash)
        if previous_ident_hash is None:
            # Has no prior existence.
            continue
        else:
            history_mapping[previous_ident_hash] = ident_hash
        cursor.execute("""\
WITH RECURSIVE t(nodeid, parent_id, documentid, path) AS (
  SELECT tr.nodeid, tr.parent_id, tr.documentid, ARRAY[tr.nodeid]
  FROM trees tr
  WHERE tr.documentid = (
    SELECT module_ident FROM modules
    WHERE uuid||'@'||concat_ws('.', major_version, minor_version) = %s)
UNION ALL
  SELECT c.nodeid, c.parent_id, c.documentid, path || ARRAY[c.nodeid]
  FROM trees c JOIN t ON (c.nodeid = t.parent_id)
  WHERE not c.nodeid = ANY(t.path)
)
SELECT uuid||'@'||concat_ws('.', major_version, minor_version)
FROM t JOIN latest_modules m ON (t.documentid = m.module_ident)
WHERE t.parent_id IS NULL
""", (previous_ident_hash,))
        to_be_republished.extend([split_ident_hash(x[0])
                                  for x in cursor.fetchall()])
    to_be_republished = set(to_be_republished)

    republished_ident_hashes = []
    # Republish the Collections set.
    for (uuid, version) in to_be_republished:
        if (uuid, version,) in binders:
            # This binder is already in the publication context,
            # don't try to publish it again.
            continue
        ident_hash = join_ident_hash(uuid, version)
        bumped_version = bump_version(cursor, uuid, is_minor_bump=True)
        republished_ident_hash = republish_collection(cursor, ident_hash,
                                                      version=bumped_version)
        # Set the identifier history.
        history_mapping[ident_hash] = republished_ident_hash
        rebuild_collection_tree(cursor, ident_hash, history_mapping)
        republished_ident_hashes.append(republished_ident_hash)

    return republished_ident_hashes
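
# Usage sketch (illustrative only): after inserting the documents of a
# publication, republish any collections that share those documents.  The
# cursor and the ``models`` sequence (cnx-epub Documents/Binders already
# handled in this publication context) are assumed to exist.
def _example_republish_binders(cursor, models):
    republished = republish_binders(cursor, models)
    # e.g. ['<binder uuid>@7.2', ...] -- ident hashes of the collections
    # that received a minor version bump.
    return republished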
def fix_link_references(self):
    """Fix references to internal documents and resources."""
    # Catch the invalid, unparsable, etc. references.
    bad_references = []

    # Note, all c:link will have an @url. We purposely dumb down the xslt
    # in order to make the scan here easy. Plus the xslt could never
    # fully match and disassemble the url into the various attributes on
    # a link tag.
    for link in self.apply_xpath('//c:link'):
        ref = link.get('url')
        if not ref or self._should_ignore_reference(ref):
            continue

        try:
            ref_type, payload = parse_html_reference(ref)
        except ValueError:
            exc = InvalidReference(self.document_ident, ref)
            bad_references.append(exc)
            continue

        # Delete the tentative attribute
        link.attrib.pop('url')

        # TODO handle #{id} refs in the url_frag, which could also contain
        #      path elements, so further parsing is necessary.
        if ref_type == DOCUMENT_REFERENCE:
            id, version, url_frag = payload
            mid, version = self.get_mid_n_version(id, version)
            if mid is None:
                bad_references.append(
                    ReferenceNotFound("Unable to find a reference to "
                                      "'{}' at version '{}'."
                                      .format(id, version),
                                      self.document_ident, ref))
            else:
                # Assign the document specific attributes.
                link.attrib['document'] = mid
                if version is not None:
                    link.attrib['version'] = version
        elif ref_type == BINDER_REFERENCE:
            id, version, bound_document, url_frag = payload
            mid, version = self.get_mid_n_version(id, version)
            if mid is None:
                id, version, id_type = split_ident_hash(bound_document)
                cid, cversion = self.get_mid_n_version(id, version)
                if cid is None:
                    # Binder ref doesn't exist, but Document does.
                    # Assign the document specific attributes.
                    link.attrib['document'] = mid
                    if version is not None:
                        link.attrib['version'] = version
                else:
                    # Process the ref as an external url.
                    url = "http://legacy.cnx.org/content/" \
                          "{}/{}/?collection={}".format(mid, version, cid)
                    if cversion is not None:
                        url = "{}/{}".format(url, cversion)
                    link.attrib['url'] = url
            else:
                bad_references.append(
                    ReferenceNotFound("Unable to find a reference to "
                                      "'{}' at version '{}'."
                                      .format(id, version),
                                      self.document_ident, ref))
        elif ref_type == RESOURCE_REFERENCE:
            # TODO If the filename is in the url_frag, use it
            #      as the filename instead of looking it up.
            try:
                sha1_hash, url_frag = payload
                filename = self.get_resource_filename(sha1_hash)
            except ReferenceNotFound as exc:
                bad_references.append(exc)
            else:
                link.attrib['resource'] = filename
        else:
            exc = InvalidReference(self.document_ident, ref)
            bad_references.append(exc)
            # Preserve the content value...
            link.attrib['url'] = ref

    return bad_references
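
# Usage sketch (illustrative only): the method above belongs to an
# HTML-to-CNXML reference resolver, so it is exercised through that class.
# ``resolver`` stands in for an instance built with the document's content,
# a database connection, and the document's ident; the concrete class name
# is assumed rather than taken from this excerpt.
def _example_fix_link_references(resolver):
    bad_references = resolver.fix_link_references()
    for exc in bad_references:
        # Each entry is an InvalidReference or ReferenceNotFound describing
        # a link that could not be resolved.
        print(exc)
    return bad_references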