def update_comments_opinion_counts(debug=False):
    """
    Refresh the like/dislike counters of every comment in the repository.

    The notes given by the replies to each comment are counted with a single
    SPARQL query, then the counters are written back through
    ``save_items_in_repository``.
    """
    # Count number of likes and dislikes for each comment in the repository
    count_query = u"""
PREFIX sioc: <http://rdfs.org/sioc/ns#>
PREFIX lodep: <https://webgate.acceptance.ec.testa.eu/eparticipation/ontologies/LOD_Eparticipation/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?post
       COUNT(DISTINCT ?reply) AS ?count_tot
       COUNT(DISTINCT ?pos_reply) AS ?count_pos
       COUNT(DISTINCT ?neg_reply) AS ?count_neg
WHERE {
    ?post a sioc:Post .
    ?reply lodep:give_note_to ?post .
    OPTIONAL {
        ?pos_reply lodep:give_note_to ?post ;
                   sioc:note "yes"^^rdfs:Literal .
    }
    OPTIONAL {
        ?neg_reply lodep:give_note_to ?post ;
                   sioc:note "no"^^rdfs:Literal .
    }
}"""
    count_rows = virtuoso_read(count_query)
    # Iterate on the comments and update the ``lodep:num_like`` and
    # ``lodep:num_dislike`` properties with the correct values counted above.
    build_update = partial(build_comment_counts_update_request, debug=debug)
    return save_items_in_repository(count_rows, u"counts", build_update)
def load_documents(doc_base_uri, default_creation_date=DEFAULT_DOC_CREATION_DATE):
    """
    Return a dictionary that gives for each language the tree structure of
    the document with the identifiers of each part of this structure
    (document, section, chapter, articles).

    For each language found in the repository, the document content is
    fetched from the Web application; languages whose content cannot be
    retrieved are skipped (a message is written to stdout).

    Raises ValueError when the content returned by the application is
    inconsistent with the document URI stored in the repository.
    """
    docs = {}
    for lang_code3, lang_code2 in LANG_CODE2.items():
        sys.stdout.write(u" {0} ".format(lang_code2.upper()))
        sys.stdout.flush()
        # Get document content path from repository
        lng_doc_uri = build_doc_part_uri(doc_base_uri, lang_code3)
        req = u"""
PREFIX sioc: <http://rdfs.org/sioc/ns#>
PREFIX lodep: <https://webgate.acceptance.ec.testa.eu/eparticipation/ontologies/LOD_Eparticipation/>
SELECT DISTINCT ?doc_space ?doc_date
WHERE {
    <%(uri)s> sioc:has_space ?doc_space .
    OPTIONAL { <%(uri)s> lodep:created_at ?doc_date . }
}""" % {u"uri": lng_doc_uri, }
        result = virtuoso_read(req)
        if len(result) == 1:
            # Get actual document content from application
            resp = requests.get(LODEPART_LOAD_DOC_URL,
                                params={u"path": result[0][u"doc_space"]})
            # Guard against a non-JSON response (error page, empty body),
            # as check_documents() already does: treat it as "no content".
            try:
                content = resp.json()
            except ValueError:
                content = {}
            if len(content) > 0:
                lng_doc_id = content[u"uri"]
                if not lng_doc_uri.endswith(lng_doc_id):
                    sys.stdout.write(u"error\n\n")
                    raise ValueError(
                        u"Inconsistent document content retrieved from "
                        u"application")
                # Strip "/<doc id>" from the URI to get the base URI
                base_uri = lng_doc_uri[:-(len(lng_doc_id) + 1)]
                docs[lang_code2] = build_doc_structure(content, base_uri)
                if u"doc_date" in result[0]:
                    docs[lang_code2][u"creation_date"] = \
                        datetime.strptime(result[0][u"doc_date"],
                                          u"%Y-%m-%dT%H:%M:%S")
                else:
                    str_d = default_creation_date.strftime(
                        u"%Y-%m-%d %H:%M:%S")
                    sys.stdout.write(
                        u"No creation date in repository, using default one "
                        u"({0})\n ".format(str_d))
                    docs[lang_code2][u"creation_date"] = default_creation_date
                sys.stdout.write(u"ok\n")
            else:
                sys.stdout.write(u"no content retrieved from application\n")
        else:
            sys.stdout.write(u"not found in repository\n")
    return docs
def run():
    """
    Create the test users described in the users JSON data file, then
    activate each successfully created user through the Web application.
    """
    data = {}
    with open(osp.join(DATA_DIR, USERS_FILENAME)) as stream:
        data = json.load(stream)
    # Users creation
    users_to_activate = {}
    num_users = len(data)  # hoisted out of the loop
    for idx, user in enumerate(data.values()):
        sys.stdout.write(u"Adding user {0} ({1}/{2})...".format(
            user[u"user_id"], idx + 1, num_users))
        sys.stdout.flush()
        res = create_user(user)
        if res:
            sys.stdout.write(u" ok\n")
            users_to_activate[user[u"user_id"]] = {
                u"lang": user.get(u"lang", u"en"),
                u"email": user[u"email"],
            }
        else:
            sys.stdout.write(u" ===> ERROR!\n")
        sleep(0.25)
    # Users activation
    sys.stdout.write(u"\n")
    # Bug fix: the query uses foaf:mbox, so the foaf prefix must be
    # declared (as it already is in load_personae's query); relying on
    # the store's predefined namespaces is not portable.
    req = u"""
PREFIX sioc: <http://rdfs.org/sioc/ns#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX lodep: <https://webgate.acceptance.ec.testa.eu/eparticipation/ontologies/LOD_Eparticipation/>
SELECT ?user ?email ?token
WHERE {
    ?user sioc:member_of ?grp ;
          foaf:mbox ?email ;
          lodep:token_id ?token .
    FILTER ( STRENDS(STR(?grp), "%(grp_postfix)s") )
}
""" % {u"grp_postfix": TEST_USERS_GROUP_POSTFIX.replace(u" ", u"_"), }
    results = virtuoso_read(req)
    user_tokens = {
        res_line[u"email"]: res_line[u"token"] for res_line in results
    }
    for idx, (user_id, user_data) in enumerate(users_to_activate.items()):
        sys.stdout.write(u"Activating user {0} ({1}/{2})...".format(
            user_id, idx + 1, len(users_to_activate)))
        sys.stdout.flush()
        # Email addresses are stored in the repository as MD5 digests
        md5_email = hashlib.md5(user_data[u"email"].strip().encode(u"ascii"))\
            .hexdigest()
        params = {
            u"lang": user_data[u"lang"],
            u"token": user_tokens[md5_email],
        }
        resp = requests.get(LODEPART_ACTIVATION_URL, params=params)
        if resp.status_code == requests.codes.ok:
            sys.stdout.write(u" ok\n")
        else:
            sys.stdout.write(u" ===> ERROR!\n")
        sleep(0.25)
    sys.stdout.write(u"\n")
def generate_comments(docs, plays, pers, comments_low_max=COMMENTS_LOW_MAX,
                      comments_high_max=COMMENTS_HIGH_MAX,
                      low_max_percent=COMMENTS_LOW_MAX_PERCENT,
                      replies_max=REPLIES_MAX):
    """
    Generate comments (and replies) on every part of the documents, for each
    language present both in ``docs`` and in ``LANGUAGES``, and return them
    as a list.
    """
    generated = []
    available_langs = sorted(set(docs.keys()).intersection(set(LANGUAGES.keys())))
    # Get URIs already used, so that newly generated comments never clash
    req = u"""
PREFIX sioc: <http://rdfs.org/sioc/ns#>
SELECT ?comm
WHERE {
    ?comm a sioc:Post .
}"""
    used_uris = {row[u"comm"] for row in virtuoso_read(req)}
    # Choose parameters for the comments generation
    lang_medians = choose_language_opinion_medians(available_langs)
    lang_props = choose_language_comments_proportion(available_langs)
    doc_part_devs = {}
    doc_part_nums = {}
    for lang in available_langs:
        doc_part_devs = choose_doc_part_deviations(docs[lang],
                                                   LANGUAGES[lang],
                                                   doc_part_devs)
        doc_part_nums = choose_doc_part_comment_numbers(
            docs[lang], LANGUAGES[lang], lang_props[lang], doc_part_nums,
            low_max=comments_low_max, high_max=comments_high_max,
            low_percent=low_max_percent)
    # Actually generate the comments for all the document parts
    for lang in available_langs:
        doc_struct = docs[lang]
        sys.stdout.write(u" {0} ".format(lang.upper()))
        sys.stdout.flush()
        generated.extend(
            generate_comments_for_item(doc_struct, plays, pers, lang,
                                       doc_struct[u"creation_date"],
                                       lang_medians, doc_part_devs,
                                       doc_part_nums, replies_max,
                                       used_uris=used_uris))
        sys.stdout.write(u"\n")
    return generated
def load_personae():
    """
    Return a dictionary that gives for each play a dictionary containing the
    personae data. This data contains the corresponding user URI stored
    inside the repository.

    Raises ValueError when a persona has no matching user account in the
    repository (i.e. the users have not been created yet).
    """
    personae = {}
    raw_users = {}
    with open(osp.join(DATA_DIR, USERS_FILENAME)) as stream:
        raw_users = json.load(stream)
    copied_keys = (u"play_id", u"user_id", u"persona", u"email",
                   u"password", u"group", u"lang")
    for user in raw_users.values():
        pers_data = {key: user[key] for key in copied_keys}
        play_personae = personae.setdefault(pers_data[u"play_id"], {})
        play_personae[pers_data[u"persona"]] = pers_data
        # Get user URI from repository; emails are stored there as MD5
        # digests of the stripped address.
        md5_email = hashlib.md5(
            pers_data[u"email"].strip().encode(u"ascii")).hexdigest()
        req = u"""
PREFIX sioc: <http://rdfs.org/sioc/ns#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT DISTINCT ?user_uri
WHERE {
    ?user_uri a sioc:UserAccount ;
              foaf:mbox <%(email)s> ;
              sioc:member_of <%(group)s> .
}""" % {u"email": md5_email,
        u"group": build_group_uri(pers_data[u"group"])}
        result = virtuoso_read(req)
        if len(result) != 1:
            raise ValueError(
                u"User {0} ({1}) can't be found in the repository. Have you "
                u"created the users?"
                u"".format(pers_data[u"persona"], pers_data[u"user_id"]))
        pers_data[u"user_uri"] = result[0][u"user_uri"]
        pers_data[u"lodepart_user_id"] = extract_user_id(
            pers_data[u"user_uri"])
    return personae
def run(document_uri, comments_low_max, comments_high_max, low_max_percent,
        replies_max, default_creation_date):
    """
    Generate fake comments on the document designated by ``document_uri``,
    save them in the repository, refresh the opinion counters and finally
    dump the counters stored on the document itself.

    Exits with status 1 when the bounds parameters are inconsistent or when
    the document cannot be found in the repository.
    """
    # Fixes: the first literal of each message now carries the u prefix like
    # every other literal in the file, and both error messages end with a
    # newline like all the other messages written by these scripts.
    if comments_high_max < comments_low_max:
        sys.stderr.write(u"Low maximum number of comments ({0}) should be "
                         u"inferior to high maximum number of comments "
                         u"({1})\n".format(comments_low_max,
                                           comments_high_max))
        sys.exit(1)
    if low_max_percent < 0 or low_max_percent > 100:
        sys.stderr.write(u"Percent of document parts using low maximum of "
                         u"comments ({0}) should be between 0 and 100\n"
                         u"".format(low_max_percent))
        sys.exit(1)
    sys.stdout.write(u"Loading the document in the various languages...\n")
    docs = load_documents(document_uri, default_creation_date)
    if len(docs) == 0:
        sys.stderr.write(
            u"\nDocument with URI <{0}> not found in repository!\n\n"
            u"".format(document_uri))
        sys.exit(1)
    sys.stdout.write(u"Loading users' data...\n")
    pers = load_personae()
    sys.stdout.write(u"Loading users' dialog lines...\n")
    plays = load_plays()
    sys.stdout.write(u"Generating comments...\n")
    comments = generate_comments(docs, plays, pers, comments_low_max,
                                 comments_high_max, low_max_percent,
                                 replies_max)
    sys.stdout.write(u"Saving documents structure...\n")
    save_doc_structure(docs)
    sys.stdout.write(u"Saving comments...\n")
    save_comments(comments)
    sys.stdout.write(u"Updating counts of opinions on comments ...\n")
    update_comments_opinion_counts()
    sys.stdout.write(u"Updating counts of opinions on document parts ...\n")
    update_doc_parts_opinion_counts()
    # Dump the opinion counters now attached to the document, for checking
    req = u"""
PREFIX lodep: <https://webgate.acceptance.ec.testa.eu/eparticipation/ontologies/LOD_Eparticipation/>
SELECT ?doc_uri ?total_na ?yes_na ?no_na ?mixed_na ?total ?yes ?no ?mixed
WHERE {
    OPTIONAL { ?doc_uri lodep:num_items_total_na ?total_na . }
    OPTIONAL { ?doc_uri lodep:num_items_yes_na ?yes_na . }
    OPTIONAL { ?doc_uri lodep:num_items_no_na ?no_na . }
    OPTIONAL { ?doc_uri lodep:num_items_mixed_na ?mixed_na . }
    OPTIONAL { ?doc_uri lodep:num_items_total ?total . }
    OPTIONAL { ?doc_uri lodep:num_items_yes ?yes . }
    OPTIONAL { ?doc_uri lodep:num_items_no ?no . }
    OPTIONAL { ?doc_uri lodep:num_items_mixed ?mixed . }
    FILTER ( ?doc_uri = <%s> )
}""" % document_uri
    res = virtuoso_read(req)
    sys.stdout.write(u"\nEnd of comments creation\n\n")
    sys.stdout.write(u"SPARQL Request:\n{0}\n\n".format(req))
    sys.stdout.write(u"JSON result:\n\n{0}\n\n".format(str(res)))
from const import VIRTUOSO_URL

print("Virtuoso URL: {0}".format(VIRTUOSO_URL))

# Smoke test: dump every triple attached to the two well-known test
# resources to check that the repository answers through virtuoso_read().
for subject in ("_MY_TEST_COMMENT", "_MY_TEST_DOCUMENT"):
    req = """
SELECT ?s ?p ?o
{
    ?s ?p ?o .
    FILTER ( ?s = <%s> )
}
""" % subject
    print(req)
    res = virtuoso_read(req)
    print(u"--> {0}".format(str(res)))
def run():
    """
    Remove all the test data: delete every comment written by a test user,
    then iteratively delete the document parts left without any comment or
    child, and finally refresh the opinion counters.
    """
    sys.stdout.write(u"Getting all test users... \n")
    # Get all test users
    users_req = u"""
PREFIX sioc: <http://rdfs.org/sioc/ns#>
SELECT ?user
WHERE {
    ?user sioc:member_of ?grp .
    FILTER ( STRENDS(STR(?grp), "%(grp_postfix)s") )
}""" % {u"grp_postfix": TEST_USERS_GROUP_POSTFIX.replace(u" ", u"_"), }
    test_users = virtuoso_read(users_req)
    # Delete all the posts of these users
    sys.stdout.write(u"Deleting all comments written by these users... \n")
    for user_row in test_users:
        user_row[u"graph"] = LODEPART_GRAPH
        delete_req = u"""
PREFIX sioc: <http://rdfs.org/sioc/ns#>
WITH <%(graph)s>
DELETE { ?post ?p ?o }
WHERE {
    ?post sioc:has_creator <%(user)s> .
    ?post ?p ?o .
}""" % user_row
        virtuoso_write(delete_req)
    # Delete all the document parts that have no comment and that aren't
    # parent of a document part with comments.
    sys.stdout.write(
        u"Deleting all document parts that now have no comment... "
        u"\n")
    # Selects the document parts with no comment and no child
    sel_req = u"""
PREFIX sioc: <http://rdfs.org/sioc/ns#>
SELECT DISTINCT ?doc_part
WHERE {
    ?doc_part a sioc:Thread .
    FILTER ( NOT EXISTS { ?child sioc:has_parent ?doc_part . }
             && NOT EXISTS { ?post sioc:has_container ?doc_part . } )
}"""
    empty_parts = virtuoso_read(sel_req)
    while len(empty_parts) > 0:
        # Delete these document parts, then select again the parts left with
        # no comment and no child, and so on up the tree.
        for part_row in empty_parts:
            part_row[u"graph"] = LODEPART_GRAPH
            delete_req = u"""
PREFIX sioc: <http://rdfs.org/sioc/ns#>
WITH <%(graph)s>
DELETE { <%(doc_part)s> ?p ?o }
WHERE { <%(doc_part)s> ?p ?o }""" % part_row
            virtuoso_write(delete_req)
        empty_parts = virtuoso_read(sel_req)
    # Finally update the opinion counts on comments and document parts
    sys.stdout.write(u"Updating counts of opinions on comments ...\n")
    update_comments_opinion_counts()
    sys.stdout.write(u"Updating counts of opinions on document parts ...\n")
    update_doc_parts_opinion_counts()
def check_documents():
    """
    Check the consistency between the documents declared in the repository
    and the document contents served by the Web application, writing a
    diagnostic for each language version and a final synthesis to stdout.
    """
    found_docs = {}
    # Find all documents defined in the repository
    req = u"""
PREFIX sioc: <http://rdfs.org/sioc/ns#>
SELECT DISTINCT ?doc_uri
WHERE {
    ?doc_uri a sioc:Forum .
    FILTER( NOT EXISTS { ?doc_uri sioc:has_parent ?parent . } )
}"""
    result = virtuoso_read(req)
    sys.stdout.write(u"\nFound {0:d} document(s) in repository\n"
                     u"".format(len(result)))
    for res_line in result:
        # Search language versions of each document
        doc_uri = res_line[u"doc_uri"]
        sys.stdout.write(u"\n** Document {0}\n".format(doc_uri))
        found_docs[doc_uri] = []
        if not (doc_uri.startswith(LODEPART_BASE_URI)):
            sys.stdout.write(
                u" ---> Document URI doesn't start with the base URI defined "
                u"in the `const.py` file\n <{0}> vs. <{1}>\n"
                u" PLEASE CHECK THIS IS CORRECT!\n"
                u"".format(doc_uri, LODEPART_BASE_URI))
        if LODEPART_BASE_URI.endswith(u"/"):
            sys.stdout.write(
                u" ---> Base URI defined in the `const.py` file ends with "
                u"\"/\"\n <{0}>\n PLEASE CHECK THIS IS CORRECT!\n"
                u"".format(LODEPART_BASE_URI))
        for lang_code3, lang_code2 in LANG_CODE2.items():
            sys.stdout.write(u"\n {0} version:\n".format(lang_code2.upper()))
            # Get document content path from repository
            lng_doc_uri = build_doc_part_uri(doc_uri, lang_code3)
            req = u"""
PREFIX sioc: <http://rdfs.org/sioc/ns#>
SELECT DISTINCT ?doc_space
WHERE {
    <%(uri)s> sioc:has_space ?doc_space .
}""" % {u"uri": lng_doc_uri, }
            # Fix: use a dedicated name here — the original re-bound
            # ``result``, the very sequence the outer loop is iterating,
            # which is fragile and confusing.
            lng_result = virtuoso_read(req)
            if len(lng_result) == 0:
                sys.stdout.write(
                    u" ---> Document <{0}> not found in repository\n"
                    u" SKIPPING DOCUMENT!\n".format(lng_doc_uri))
            elif len(lng_result) > 1:
                sys.stdout.write(
                    u" ---> Multiple documents <{0}> found in repository\n"
                    u" REPOSITORY IS INCONSISTENT! SKIPPING DOCUMENT!"
                    u"\n".format(lng_doc_uri))
            else:
                sys.stdout.write(
                    u" Document <{0}> found in repository; trying to load "
                    u"document content from Web application\n"
                    u"".format(lng_doc_uri))
                # Get actual document content from application
                resp = requests.get(
                    LODEPART_LOAD_DOC_URL,
                    params={u"path": lng_result[0][u"doc_space"]})
                try:
                    content = resp.json()
                except ValueError:
                    content = {}
                if len(content) > 0:
                    lng_doc_id = content[u"uri"]
                    if not lng_doc_uri.endswith(lng_doc_id):
                        sys.stdout.write(
                            u" ---> Document ID read from content is "
                            u"inconsistent with document URI\n"
                            u" \"{0}\" vs <{1}>\n"
                            u" SKIPPING DOCUMENT!\n"
                            u"".format(lng_doc_id, lng_doc_uri))
                    else:
                        sys.stdout.write(
                            u" Document content properly retrieved from "
                            u"Web application\n Ok!\n")
                        found_docs[doc_uri].append(lang_code2)
                else:
                    sys.stdout.write(
                        u" ---> No content found in Web application for "
                        u"document \"{0}\"\n SKIPPING DOCUMENT!\n"
                        u"".format(lng_result[0][u"doc_space"]))
    # Write a synthesis
    sys.stdout.write(u"\nSynthesis\n=========\n\n")
    doc_written = False
    for doc_uri, versions in sorted(found_docs.items()):
        if len(versions) == 0:
            continue
        doc_written = True
        sys.stdout.write(u"* {0}\n".format(doc_uri))
        sys.stdout.write(u" versions: {0}\n"
                         u"".format(u", ".join(
                             [lng.upper() for lng in sorted(versions)])))
    if not doc_written:
        sys.stdout.write(u"No document found\n")
    sys.stdout.write(u"\n")
def update_doc_parts_opinion_counts(debug=False):
    """
    Count the opinions (total / positive / negative / mixed) expressed by
    the comments on each document part, propagate these counts up the
    document trees and save them in the repository.

    Returns the result of ``save_items_in_repository``.
    """
    # Count the positive, negative and mixed notes on the comments for each
    # document part in the repository
    req = u"""
PREFIX sioc: <http://rdfs.org/sioc/ns#>
PREFIX lodep: <https://webgate.acceptance.ec.testa.eu/eparticipation/ontologies/LOD_Eparticipation/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?doc_part
       COUNT(DISTINCT ?comment) AS ?count_tot
       COUNT(DISTINCT ?pos_comment) AS ?count_pos
       COUNT(DISTINCT ?neg_comment) AS ?count_neg
       COUNT(DISTINCT ?mix_comment) AS ?count_mix
WHERE {
    ?comment a sioc:Post ;
             sioc:has_container ?doc_part .
    OPTIONAL {
        ?pos_comment a sioc:Post ;
                     sioc:has_container ?doc_part ;
                     sioc:note "yes"^^rdfs:Literal .
    }
    OPTIONAL {
        ?neg_comment a sioc:Post ;
                     sioc:has_container ?doc_part ;
                     sioc:note "no"^^rdfs:Literal .
    }
    OPTIONAL {
        ?mix_comment a sioc:Post ;
                     sioc:has_container ?doc_part ;
                     sioc:note "mixed"^^rdfs:Literal .
    }
}"""
    results = virtuoso_read(req)
    opinions = {}
    for res in results:
        opinions[res[u"doc_part"]] = {
            u"total": int(res[u"count_tot"]),
            u"positive": int(res[u"count_pos"]),
            u"negative": int(res[u"count_neg"]),
            u"mixed": int(res[u"count_mix"]),
        }
    # Read the document tree structures from the repository thanks to the
    # ``sioc:has_parent`` links
    req = u"""
PREFIX sioc: <http://rdfs.org/sioc/ns#>
SELECT ?doc_part ?doc_type ?parent_doc_part ?parent_doc_type
WHERE {
    ?doc_part a ?doc_type .
    ?doc_part sioc:has_parent ?parent_doc_part .
    ?parent_doc_part a ?parent_doc_type .
}"""
    results = virtuoso_read(req)
    doc_structs = {}
    doc_roots = set()
    doc_not_roots = set()
    for res in results:
        doc_structs.setdefault(
            res[u"doc_part"], {
                u"uri": res[u"doc_part"],
                u"is_forum": res[u"doc_type"].endswith(u"Forum"),
                u"repo_children": []
            })
        # Bug fix: the parent's type is given by ?parent_doc_type; the
        # original wrongly reused ?doc_type (the child's type) here, so a
        # parent could get a wrong ``is_forum`` flag.
        doc_structs.setdefault(
            res[u"parent_doc_part"], {
                u"uri": res[u"parent_doc_part"],
                u"is_forum": res[u"parent_doc_type"].endswith(u"Forum"),
                u"repo_children": []
            })
        doc_structs[res[u"parent_doc_part"]][u"repo_children"].append(
            doc_structs[res[u"doc_part"]])
        doc_roots.discard(res[u"doc_part"])
        doc_not_roots.add(res[u"doc_part"])
        if res[u"parent_doc_part"] not in doc_not_roots:
            doc_roots.add(res[u"parent_doc_part"])
    # Keep only the root structures; the children stay reachable through
    # the ``repo_children`` lists.
    for uri in doc_not_roots:
        doc_structs.pop(uri)
    # The document structures in the repository are very strange and can not
    # directly be used: "doc_root" is parent of "doc_root/eng",
    # "doc_root/fra"; "doc_root/eng" is parent of "doc_root/art_001";
    # "doc_root/fra" is parent of "doc_root/art_001"; "doc_root/art001" is
    # parent of "doc_root/art001/eng", "doc_root/art001/fra"; etc.
    # Correct the document structures to have: "doc_root" is parent of
    # "doc_root/art_001"; "doc_root/fra" is parent_of "doc_root/art001/fra";
    # "doc_root/eng" is parent of "doc_root/art_001/eng"; "doc_root"
    # has language versions "doc_root/eng", "doc_root/fra"; "doc_root/art001"
    # has language versions "doc_root/art001/fra", "doc_root/art001/eng"
    for struct in doc_structs.values():
        correct_doc_structure(struct)
        clean_doc_structure(struct)
    # Count the opinions for each document part inside the structures and
    # insert these counts in the repository
    update_requests = []
    for struct in doc_structs.values():
        set_opinion_counts_in_doc_structure(struct, opinions)
        propagate_opinion_counts_in_doc_structure(struct)
        update_requests.extend(
            build_opinion_counts_update_from_doc_structure(struct,
                                                           debug=debug))
    res = save_items_in_repository(update_requests, u"counts", (lambda x: x))
    return res
def find_doc_items_to_save(doc_struct):
    """
    Given a document structure (containing the document in all the processed
    languages), finds the document parts that don't exist in the repository
    and the missing links between these parts and their parent.

    Returns a ``(doc_parts_to_create, parent_links_to_weave)`` tuple.
    """
    part_and_root_uris = []
    for lang, lng_doc_struct in doc_struct.items():
        part_and_root_uris.extend(
            get_doc_parts_and_parents_uri(lng_doc_struct))
    # Existence check, shared by the two lookups below
    exists_req_tpl = u"""
PREFIX sioc: <http://rdfs.org/sioc/ns#>
SELECT COUNT(*) AS ?cnt
WHERE {
    <%(uri)s> a sioc:Thread .
}"""
    seen_uris = set()
    doc_parts_to_create = []
    parent_links_to_weave = []
    for part_uri, lang_root_uri in part_and_root_uris:
        if lang_root_uri is None:
            # Whole documents (highest level of the tree) are already in the
            # repository.
            continue
        if part_uri in seen_uris:
            continue
        seen_uris.add(part_uri)
        # Check if this document part already exists in the repository
        result = virtuoso_read(exists_req_tpl % {u"uri": part_uri})
        if int(result[0][u"cnt"]) != 0:
            continue
        # Current PHP application stores a strange tree: doc_part is child
        # of generic_doc_part (without language specification) and
        # generic_doc_part is child of doc_lang_root (whole document with
        # language specification).
        generic_uri = part_uri[:part_uri.rfind(u"/")]
        # Create the part and weave its link to the generic parent
        doc_parts_to_create.append(part_uri)
        parent_links_to_weave.append((part_uri, generic_uri))
        if generic_uri not in seen_uris:
            seen_uris.add(generic_uri)
            result = virtuoso_read(exists_req_tpl % {u"uri": generic_uri})
            if int(result[0][u"cnt"]) == 0:
                # The generic part is missing too: create it as well
                doc_parts_to_create.append(generic_uri)
        # In every case, weave the link between the generic part and the
        # language root of the document.
        parent_links_to_weave.append((generic_uri, lang_root_uri))
    return doc_parts_to_create, parent_links_to_weave