def up(cursor): pg_user = cursor.connection.get_dsn_parameters()['user'] with super_user() as super_cursor: super_cursor.execute("""\ CREATE OR REPLACE FUNCTION sha1(file text) RETURNS text LANGUAGE plpythonu IMMUTABLE STRICT AS $function$ import hashlib return hashlib.new('sha1', file).hexdigest() $function$; """) super_cursor.execute(sql.SQL('alter function sha1(text) owner to {}' ).format(sql.Identifier(pg_user))) super_cursor.execute("""\ CREATE OR REPLACE FUNCTION sha1(f bytea) RETURNS text LANGUAGE plpythonu AS $function$ import hashlib return hashlib.new('sha1',f).hexdigest() $function$ """) super_cursor.execute(sql.SQL('alter function sha1(bytea) owner to {}' ).format(sql.Identifier(pg_user)))
def up(cursor): with super_user() as super_cursor: with open_here('../archive-sql/schema/shred_collxml.sql', 'rb') as f: super_cursor.execute(f.read()) super_cursor.execute('drop function shred_collxml(text)') super_cursor.execute('drop function shred_collxml(int)') super_cursor.execute('drop function shred_collxml(int,int)')
def up(cursor): pg_user = cursor.connection.get_dsn_parameters()['user'] with super_user() as super_cursor: super_cursor.execute("""\ CREATE OR REPLACE FUNCTION sha1(file text) RETURNS text LANGUAGE plpythonu IMMUTABLE STRICT AS $function$ import hashlib return hashlib.new('sha1', file).hexdigest() $function$; """) super_cursor.execute( sql.SQL('alter function sha1(text) owner to {}').format( sql.Identifier(pg_user))) super_cursor.execute("""\ CREATE OR REPLACE FUNCTION sha1(f bytea) RETURNS text LANGUAGE plpythonu AS $function$ import hashlib return hashlib.new('sha1',f).hexdigest() $function$ """) super_cursor.execute( sql.SQL('alter function sha1(bytea) owner to {}').format( sql.Identifier(pg_user)))
def up(cursor): with super_user() as super_cursor: super_cursor.execute("""\ CREATE EXTENSION pg_trgm; CREATE INDEX modules_strip_html_name_trgm_gin ON modules \ USING gin(strip_html(name) gin_trgm_ops); """)
def down(cursor): with super_user() as super_cursor: with open_here('shred_collxml_20170912134157_pre.sql', 'rb') as f: super_cursor.execute(f.read()) super_cursor.execute('drop function shred_collxml(text, integer)') super_cursor.execute( 'drop function shred_collxml(integer, integer, bool)')
def up(cursor): """Add subcol metadata support to shredding and data upgrades""" with super_user() as super_cursor: super_cursor.execute(""" CREATE FUNCTION uuid5(namespace uuid, name text) RETURNS uuid LANGUAGE plpythonu AS $_$ import uuid; return uuid.uuid5(uuid.UUID(namespace), name) $_$;""") super_cursor.execute(""" CREATE OR REPLACE FUNCTION public.is_baked(col_uuid uuid, col_ver text) RETURNS boolean LANGUAGE sql AS $function$ SELECT bool_or(is_collated) FROM modules JOIN trees ON module_ident = documentid WHERE uuid = col_uuid AND module_version(major_version, minor_version) = col_ver $function$;""") with open_here('../archive-sql/schema/subcol_uuids_func.sql', 'rb') as f: super_cursor.execute(f.read()) with open_here('../archive-sql/schema/shred_collxml.sql', 'rb') as f: super_cursor.execute(f.read())
def down(cursor): with super_user() as super_cursor: super_cursor.execute('drop function uuid5(uuid, text)') super_cursor.execute('drop function is_baked(uuid, text)') super_cursor.execute('drop function subcol_uuids(text, text)') with open_here('shred_collxml_20170201213850_pre.sql', 'rb') as f: super_cursor.execute(f.read())
def up(cursor): with super_user() as super_cursor: with open_here('../archive-sql/schema/functions.sql', 'rb') as f: super_cursor.execute(f.read()) cursor.execute(""" CREATE INDEX modules_ident_hash on modules(ident_hash(uuid, major_version, minor_version)); CREATE INDEX modules_short_ident_hash on modules(short_ident_hash(uuid, major_version, minor_version));""")
def up(cursor): with super_user() as super_cursor: with open_here('../archive-sql/schema/functions.sql', 'rb') as f: super_cursor.execute(f.read()) cursor.execute(""" CREATE INDEX modules_ident_hash on modules(ident_hash(uuid, major_version, minor_version)); CREATE INDEX modules_short_ident_hash on modules(short_ident_hash(uuid, major_version, minor_version));""" )
def up(cursor): with super_user() as super_cursor: super_cursor.execute("""\ CREATE OR REPLACE FUNCTION strip_html(html_text TEXT) RETURNS text AS $$ import re return re.sub('<[^>]*?>', '', html_text, re.MULTILINE) $$ LANGUAGE plpythonu IMMUTABLE; """)
def up(cursor): cursor.execute('SELECT current_user') username = cursor.fetchall()[0][0] with super_user() as super_cursor: super_cursor.execute(''' ALTER FUNCTION ident_hash (uuid, integer, integer) OWNER TO {user}; ALTER FUNCTION is_baked (uuid, text) OWNER TO {user}; ALTER FUNCTION short_ident_hash (uuid, integer, integer) OWNER TO {user}; ALTER FUNCTION shred_collxml (integer, integer, boolean) OWNER TO {user}; ALTER FUNCTION shred_collxml (text, integer) OWNER TO {user}; ALTER FUNCTION strip_html (text) OWNER TO {user}; ALTER FUNCTION subcol_uuids (uuid, text) OWNER TO {user}; ALTER FUNCTION uuid5 (uuid, text) OWNER TO {user}; '''.format(user=username))
def down(cursor): cursor.execute(""" DROP FUNCTION legacy_collxml (int, bool, text); DROP FUNCTION legacy_subcol (int, text); DROP FUNCTION legacy_content (int, text); DROP FUNCTION legacy_module (int, text); DROP FUNCTION legacy_mdml (int, bool, text); DROP FUNCTION legacy_mdml_inner (int, bool, text); """) with super_user() as super_cursor: super_cursor.execute(""" DROP FUNCTION pretty_print (xml); """)
def up(cursor): with super_user() as super_cursor: with open_here('../archive-sql/schema/shred_collxml.sql', 'rb') as f: super_cursor.execute(f.read())
def down(cursor): with super_user() as super_cursor: super_cursor.execute("""\ DROP INDEX IF EXISTS modules_strip_html_name_trgm_gin; DROP EXTENSION IF EXISTS pg_trgm; """)
def down(cursor): with super_user() as super_cursor: super_cursor.execute( 'drop function ident_hash(uuid, int, int) CASCADE') super_cursor.execute( 'drop function short_ident_hash(uuid, int, int) CASCADE')
def down(cursor): with super_user() as super_cursor: super_cursor.execute("DROP FUNCTION sha1(text)") super_cursor.execute("DROP FUNCTION sha1(bytea)")
def down(cursor): with super_user() as super_cursor: super_cursor.execute("DROP FUNCTION IF EXISTS strip_html(TEXT)")
def up(cursor): with super_user() as super_cursor: super_cursor.execute('SELECT current_user') assert super_cursor.fetchone()[0] == 'postgres'
def down(cursor): with super_user() as super_cursor: with open_here('shred_collxml_20180408175622_pre.sql', 'rb') as f: super_cursor.execute(f.read())
def down(cursor): # rollback to broken function def with super_user() as super_cursor: super_cursor.execute(''' CREATE OR REPLACE FUNCTION subcol_uuids(uuid uuid, version text) RETURNS void LANGUAGE sql AS $function$ WITH RECURSIVE t(node, title, path,value, depth, corder, is_collated) AS ( SELECT nodeid, title, ARRAY[nodeid], documentid, 1, ARRAY[childorder], is_collated FROM trees tr, modules m WHERE m.uuid = $1 AND module_version( m.major_version, m.minor_version) = $2 AND tr.documentid = m.module_ident AND tr.parent_id IS NULL AND tr.is_collated = False UNION ALL SELECT c1.nodeid, c1.title, t.path || ARRAY[c1.nodeid], c1.documentid, t.depth+1, t.corder || ARRAY[c1.childorder], c1.is_collated /* Recursion */ FROM trees c1 JOIN t ON (c1.parent_id = t.node) WHERE not nodeid = any (t.path) AND t.is_collated = c1.is_collated ) INSERT INTO document_controls (uuid) SELECT uuid5($1::uuid, t.title) FROM t WHERE t.value IS NULL AND not exists (select 1 from document_controls where uuid = uuid5($1::uuid, t.title)) WINDOW w as (ORDER BY corder) order by corder; WITH RECURSIVE t(node, title, path,value, depth, corder, is_collated) AS ( SELECT nodeid, title, ARRAY[nodeid], documentid, 1, ARRAY[childorder], is_collated FROM trees tr, modules m WHERE m.uuid = $1 AND module_version( m.major_version, m.minor_version) = $2 AND tr.documentid = m.module_ident AND tr.parent_id IS NULL AND tr.is_collated = False UNION ALL SELECT c1.nodeid, c1.title, t.path || ARRAY[c1.nodeid], c1.documentid, t.depth+1, t.corder || ARRAY[c1.childorder], c1.is_collated /* Recursion */ FROM trees c1 JOIN t ON (c1.parent_id = t.node) WHERE not nodeid = any (t.path) AND t.is_collated = c1.is_collated ) INSERT INTO modules ( doctype, portal_type, moduleid, uuid, version, name, created, revised, licenseid, submitter, submitlog, stateid, parent, language, authors, maintainers, licensors, parentauthors, google_analytics, buylink, major_version, minor_version, print_style) SELECT t.node, 'SubCollection', 'col' || nextval('collectionid_seq'), uuid5($1::uuid, t.title), m.version, t.title, m.created, m.revised, m.licenseid, m.submitter, m.submitlog, m.stateid, m.parent, m.language, m.authors, m.maintainers, m.licensors, m.parentauthors, m.google_analytics, m.buylink, m.major_version, m.minor_version, m.print_style FROM t, modules m WHERE value IS NULL and m.uuid = $1 and module_version(m.major_version, m.minor_version) = $2 and not exists (select 1 from modules where uuid = uuid5($1::uuid, t.title) and module_version(major_version, minor_version) = $2) WINDOW w AS (ORDER BY corder) ORDER BY corder; WITH RECURSIVE t(node, title, path,value, depth, corder, is_collated) AS ( SELECT nodeid, title, ARRAY[nodeid], documentid, 1, ARRAY[childorder], is_collated FROM trees tr, modules m WHERE m.uuid = $1 AND module_version( m.major_version, m.minor_version) = $2 AND tr.documentid = m.module_ident AND tr.parent_id IS NULL AND tr.is_collated = False UNION ALL SELECT c1.nodeid, c1.title, t.path || ARRAY[c1.nodeid], c1.documentid, t.depth+1, t.corder || ARRAY[c1.childorder], c1.is_collated /* Recursion */ FROM trees c1 JOIN t ON (c1.parent_id = t.node) WHERE not nodeid = any (t.path) AND t.is_collated = c1.is_collated ) UPDATE trees set documentid = module_ident FROM t, modules m WHERE nodeid = t.node AND t.value IS NULL and nodeid::text = m.doctype; $function$ ''')
def down(cursor): with super_user() as super_cursor: with open_here('shred_collxml_20170912134157_pre.sql', 'rb') as f: super_cursor.execute(f.read()) super_cursor.execute('drop function shred_collxml(text, integer)') super_cursor.execute('drop function shred_collxml(integer, integer, bool)')
def down(cursor): # Drop the new column to the trees table with super_user() as super_cursor: super_cursor.execute("ALTER TABLE trees DROP COLUMN slug")
def down(cursor): with super_user() as cur: cur.execute("""\ -- Deprecated (3-Feb-2015) Use html_abstract(module_ident int) -- This was deprecated to align the call params with -- synonymous function cnxml_abstract, which requires -- access to the module_ident to perform reference resolution. CREATE OR REPLACE FUNCTION html_abstract(abstract text) RETURNS text AS $$ plpy.warning('This function is deprecated, please use html_abstract(<module_ident>') from cnxarchive.transforms import transform_abstract_to_html html_abstract, warning_messages = transform_abstract_to_html(abstract, None, plpy) if warning_messages: plpy.warning(warning_messages) return html_abstract $$ LANGUAGE plpythonu; CREATE OR REPLACE FUNCTION html_abstract(module_ident int) RETURNS text AS $$ from cnxarchive.transforms import transform_abstract_to_html plan = plpy.prepare("SELECT abstract FROM modules NATURAL JOIN abstracts WHERE module_ident = $1", ('integer',)) abstract = plpy.execute(plan, (module_ident,))[0]['abstract'] html_abstract, warning_messages = transform_abstract_to_html(abstract, module_ident, plpy) if warning_messages: plpy.warning(warning_messages) return html_abstract $$ LANGUAGE plpythonu; -- Deprecated (3-Feb-2015) Use html_content(module_ident int) -- This was deprecated to align the call params with -- synonymous function cnxml_content, which requires -- access to the module_ident to perform reference resolution. CREATE OR REPLACE FUNCTION html_content(cnxml text) RETURNS text AS $$ plpy.warning('This function is deprecated, please use html_content(<module_ident>') from cnxarchive.transforms import transform_module_content html_content, warning_messages = transform_module_content(cnxml, 'cnxml2html', plpy) if warning_messages: plpy.warning(warning_messages) return html_content $$ LANGUAGE plpythonu; CREATE OR REPLACE FUNCTION html_content(module_ident int) RETURNS text AS $$ from cnxarchive.transforms import transform_module_content plan = plpy.prepare("SELECT convert_from(file, 'utf-8') FROM module_files AS mf NATURAL JOIN files AS f WHERE module_ident = $1 AND (filename = 'index.cnxml' OR filename = 'index.html.cnxml')", ('integer',)) cnxml = plpy.execute(plan, (module_ident,))[0]['convert_from'] content, warning_messages = transform_module_content(cnxml, 'cnxml2html', plpy, module_ident) if warning_messages: plpy.warning(warning_messages) return content $$ LANGUAGE plpythonu; CREATE OR REPLACE FUNCTION cnxml_abstract(module_ident int) RETURNS text AS $$ from cnxarchive.transforms import transform_abstract_to_cnxml plan = plpy.prepare("SELECT html FROM modules NATURAL JOIN abstracts WHERE module_ident = $1", ('integer',)) abstract = plpy.execute(plan, (module_ident,))[0]['html'] cnxml_abstract, warning_messages = transform_abstract_to_cnxml(abstract, module_ident, plpy) if warning_messages: plpy.warning(warning_messages) return cnxml_abstract $$ LANGUAGE plpythonu; CREATE OR REPLACE FUNCTION cnxml_content(module_ident int) RETURNS text AS $$ from cnxarchive.transforms import transform_module_content plan = plpy.prepare("SELECT convert_from(file, 'utf-8') FROM module_files AS mf NATURAL JOIN files AS f WHERE module_ident = $1 AND filename = 'index.cnxml.html'", ('integer',)) html = plpy.execute(plan, (module_ident,))[0]['convert_from'] content, warning_messages = transform_module_content(html, 'html2cnxml', plpy, module_ident) if warning_messages: plpy.warning(warning_messages) return content $$ LANGUAGE plpythonu; """)
def up(cursor): # Add the new column to the trees table with super_user() as super_cursor: super_cursor.execute("ALTER TABLE trees ADD COLUMN slug text")
def up(cursor): cursor.execute(""" -- Build MDML from modules table and friends "recurse" param fills in derived-from parent metadata elements -- Note that legacy_mdml_inner returns an xml fragment with an assumed xmlns alias of md for -- http://cnx.rice.edu/mdml, rather than a rooted document, because of how metadata is incorporated into the -- collxml (the metadata root is in collxml NS, not mdml NS) as well as the derived-from (takes child md tags, -- not a full metadata tag) legacy_mdml() and legacy_collxml() define those namespaces CREATE OR REPLACE FUNCTION legacy_mdml_inner ( mod_ident int, recurse bool default False, repo text default 'https://legacy.cnx.org/content') RETURNS XML as $$ SELECT xmlconcat( xmlcomment(E' WARNING! The \\'metadata\\' section is read only. Do not edit below. Changes to the metadata section in the source will not be saved. '), xmlelement(name "md:repository", legacy_mdml_inner.repo), xmlelement(name "md:content-url", legacy_mdml_inner.repo || '/' || m.moduleid||'/'|| m.version), xmlelement(name "md:content-id", m.moduleid), xmlelement(name "md:title", m.name), xmlelement(name "md:version", m.version), xmlelement(name "md:created", m.created), xmlelement(name "md:revised", m.revised), xmlelement(name "md:language", m.language), xmlelement(name "md:license", xmlattributes(l.url as "url"), l.name), xmlcomment(E' For information on license requirements for use or modification, see license url in the above <md:license> element. For information on formatting required attribution, see the URL: CONTENT_URL/content_info#cnx_cite_header where CONTENT_URL is the value provided above in the <md:content-url> element. '), xmlelement(name "md:actors", (SELECT xmlagg( xmlelement(name "md:person", xmlattributes(a.personid as "userid"), xmlelement(name "md:firstname", a.firstname), xmlelement(name "md:surname", a.surname), xmlelement(name "md:fullname", a.fullname), xmlelement(name "md:email", a.email)) ) FROM ( SELECT distinct p.* from persons p, moduleoptionalroles mor right join modules mod on mor.module_ident = mod.module_ident WHERE (p.personid = any (mod.authors) or p.personid = any (mod.maintainers) or p.personid = any (mod.licensors) or p.personid = any (mor.personids) ) and mod.module_ident = m.module_ident ) as a ) ), xmlelement(name "md:roles", xmlelement(name "md:role", xmlattributes('authors' as "type"), array_to_string(m.authors,' ')), xmlelement(name "md:role", xmlattributes('maintainers' as "type"), array_to_string(m.maintainers,' ')), xmlelement(name "md:role", xmlattributes('licensors' as "type"), array_to_string(m.licensors,' ')), (SELECT xmlelement(name "md:role", xmlattributes(roleparam as "type"), array_to_string(personids, ' ')) FROM roles r join moduleoptionalroles mor on r.roleid = mor.roleid WHERE mor.module_ident = m.module_ident ) ), xmlelement(name "md:abstract", ab.abstract), (SELECT xmlelement(name "md:subjectlist", xmlagg( xmlelement(name "md:subject", t.tag)) ) FROM moduletags mt join tags t on mt.tagid = t.tagid WHERE mt.module_ident = m.module_ident and t.scheme = 'ISKME subject' HAVING count(*) > 0 ), (SELECT xmlelement(name "md:keywordlist", xmlagg( xmlelement(name "md:keyword", k.word)) ) FROM modulekeywords mk join keywords k on mk.keywordid = k.keywordid WHERE mk.module_ident = m.module_ident HAVING count(*) > 0 ), (SELECT xmlelement(name "md:derived-from", xmlattributes( legacy_mdml_inner.repo || '/' || p.moduleid||'/'|| p.version as "url"), CASE WHEN legacy_mdml_inner.recurse = True THEN legacy_mdml_inner(m.parent, legacy_mdml_inner.recurse, legacy_mdml_inner.repo) ELSE NULL END ) FROM modules p WHERE p.module_ident = m.parent ) ) FROM modules m join licenses l on m.licenseid = l.licenseid join abstracts ab on m.abstractid = ab.abstractid WHERE m.module_ident = legacy_mdml_inner.mod_ident GROUP BY m.module_ident, m.moduleid, m.name, m.version, m.created, m.revised, m.language, l.url, l.name, ab.abstract $$ LANGUAGE SQL; CREATE OR REPLACE FUNCTION legacy_mdml ( mod_ident int, recurse bool default False, repo text default 'https://legacy.cnx.org/content') RETURNS XML as $$ SELECT xmlelement(name "md:metadata", xmlattributes('http://cnx.rice.edu/mdml' as "xmlns:md"), legacy_mdml_inner(legacy_mdml.mod_ident, legacy_mdml.recurse, legacy_mdml.repo) ) $$ LANGUAGE SQL; -- Now for COLLXML, parsed from trees and modules tables - the reverse of shred, basically -- Each of the sub-tree functions takes a tree nodeid as primary parameter -- this creates a leaf 'module' element CREATE OR REPLACE FUNCTION legacy_module(nodeid int, repo text default 'https://legacy.cnx.org/content') RETURNS xml LANGUAGE SQL AS $$ SELECT xmlelement(name "col:module", xmlattributes(m.moduleid as "document", ( CASE WHEN t.latest THEN 'latest' ELSE m.version END ) as "version", legacy_module.repo as "repository", m.version as "cnxorg:version-at-this-collection-version"), xmlelement(name "md:title", COALESCE (t.title, m.name) ) ) FROM trees t JOIN modules m on t.documentid = module_ident WHERE t.nodeid = legacy_module.nodeid $$; -- to recurse, we need the content tag function and the subcol tag function to refer to each other, so a -- stub function with the correct type signature, here, to be replaced w/ functional code below CREATE OR REPLACE FUNCTION legacy_content(nodeid int, repo text default 'https://legacy.cnx.org/content') RETURNS xml LANGUAGE SQL AS $$ SELECT NULL::xml -- STUB redefined below due to mutual recursion w/ legacy_subcol $$; CREATE OR REPLACE FUNCTION legacy_subcol(nodeid int, repo text default 'https://legacy.cnx.org/content') RETURNS xml LANGUAGE SQL AS $$ SELECT xmlelement(name "col:subcollection", xmlelement(name "md:title", COALESCE (t.title, m.name)), legacy_content(legacy_subcol.nodeid, legacy_subcol.repo) ) FROM trees t JOIN modules m on t.documentid = module_ident WHERE t.nodeid = legacy_subcol.nodeid $$; -- redefine or above, since needs to recurse back to subcol -- recursion terminates on all-module subcols CREATE OR REPLACE FUNCTION legacy_content(nodeid int, repo text default 'https://legacy.cnx.org/content') RETURNS xml LANGUAGE SQL AS $$ SELECT xmlelement(name "col:content", (SELECT xmlagg( CASE WHEN portal_type in ('Module','CompositeModule') THEN legacy_module(t.nodeid, legacy_content.repo) WHEN portal_type in ('SubCollection','CompositeSubCollection') THEN legacy_subcol(t.nodeid, legacy_content.repo) END) FROM trees t join modules m on t.documentid = m.module_ident WHERE parent_id = legacy_content.nodeid ) ) $$; -- Wrap it all together - takes a module_ident as primary parameter. Recurse passed down to mdml function -- to control recursion into derived-from metadata CREATE OR REPLACE FUNCTION legacy_collxml (ident int, recurse bool default False, repo text default 'https://legacy.cnx.org/content') RETURNS xml LANGUAGE SQL AS $$ SELECT xmlelement(name "col:collection", xmlattributes( 'http://cnx.rice.edu/collxml' as "xmlns", 'http://cnx.rice.edu/cnxml' as "xmlns:cnx", 'http://cnx.rice.edu/system-info' as "xmlns:cnxorg", 'http://cnx.rice.edu/mdml' as "xmlns:md", 'http://cnx.rice.edu/collxml' as "xmlns:col", m.language as "xml:lang"), xmlelement(name "metadata", xmlattributes( 'http://cnx.rice.edu/mdml' as "xmlns:md", '0.5' as "mdml-version"), legacy_mdml_inner(legacy_collxml.ident, legacy_collxml.recurse, legacy_collxml.repo) ), xmlelement(name "col:parameters", xmlelement(name "col:param", xmlattributes('print-style' as "name", print_style as "value") ) ), legacy_content(nodeid, legacy_collxml.repo) ) FROM modules m JOIN trees t ON m.module_ident = t.documentid WHERE m.module_ident = legacy_collxml.ident $$; """) cursor.execute('SELECT current_user') username = cursor.fetchall()[0][0] with super_user() as super_cursor: super_cursor.execute(""" CREATE OR REPLACE FUNCTION pretty_print(doc xml) RETURNS xml LANGUAGE plpythonu AS $$ from lxml import etree return etree.tostring(etree.fromstring(doc), pretty_print=True) $$; ALTER FUNCTION pretty_print(xml) OWNER TO {username} """.format(username=username))