def makePostsTriples(self):
    """Aggregate text statistics and entity counts into metadata triples.

    Does nothing when the snapshot holds no text. Char/token/sentence
    totals, means and standard deviations come from the size_*_overall
    lists (presumably filled during message translation — TODO confirm);
    message/participant/url counts come from SPARQL COUNT queries over
    the translation graph. Everything is attached to the snapshot URI in
    the meta graph.
    """
    if not self.hastext:
        return
    # per-message character statistics
    self.totalchars = sum(self.size_chars_overall)
    self.mchars_messages = n.mean(self.size_chars_overall)
    self.dchars_messages = n.std(self.size_chars_overall)
    # per-message token statistics
    self.totaltokens = sum(self.size_tokens_overall)
    self.mtokens_messages = n.mean(self.size_tokens_overall)
    self.dtokens_messages = n.std(self.size_tokens_overall)
    # per-message sentence statistics
    self.totalsentences = sum(self.size_sentences_overall)
    self.msentences_messages = n.mean(self.size_sentences_overall)
    self.dsentences_messages = n.std(self.size_sentences_overall)
    # entity counts straight from the translation graph
    self.nmessages = P.get(
        "SELECT (COUNT(?s) as ?s) WHERE { ?s a po:Message }",
        context=self.translation_graph)
    self.nparticipants = P.get(
        "SELECT (COUNT(?s) as ?s) WHERE { ?s a po:Participant }",
        context=self.translation_graph)
    self.nurls = P.get(
        "SELECT (COUNT(?s) as ?s) WHERE { ?s po:hasUrl ?o }",
        context=self.translation_graph)
    triples = [
        (self.snapshoturi, po.nParticipants, self.nparticipants),
        (self.snapshoturi, po.nMessages, self.nmessages),
        (self.snapshoturi, po.nCharsOverall, self.totalchars),
        (self.snapshoturi, po.mCharsOverall, self.mchars_messages),
        (self.snapshoturi, po.dCharsOverall, self.dchars_messages),
        (self.snapshoturi, po.nTokensOverall, self.totaltokens),
        (self.snapshoturi, po.mTokensOverall, self.mtokens_messages),
        (self.snapshoturi, po.dTokensOverall, self.dtokens_messages),
        (self.snapshoturi, po.nSentencesOverall, self.totalsentences),
        (self.snapshoturi, po.mSentencesOverall, self.msentences_messages),
        (self.snapshoturi, po.dSentencesOverall, self.dsentences_messages),
    ]
    P.add(triples, context=self.meta_graph)
def translateStates(self):
    """Translate the rows of the 'estados' table into po:State triples.

    Triples are flushed to the translation graph every 60 rows and once
    more at the end for any remainder.
    """
    triples = []
    done = 0
    for row in self.data["estados"]:
        gid, nome, abr, created, updated, relevance = row[:6]
        uri = P.rdf.ic(po.State, self.snapshotid+"-"+str(gid),
                       self.translation_graph, self.snapshoturi)
        triples.extend([
            (uri, po.name, nome),
            (uri, po.abbreviation, abr),
            (uri, po.createdAt, created),
            (uri, po.relevance, relevance),
        ])
        if updated != created:
            triples.append((uri, po.updatedAt, updated))
        done += 1
        if not done % 60:
            c("finished states entries:", done, "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            c("finished add of states entries")
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
        c("finished add of states entries")
def translateSupporters(self):
    """Translate 'adesoes' (support/adhesion) rows into po:Support triples.

    Each row links a participant to a topic. Triples are flushed to the
    translation graph every 60 rows and once more at the end.
    """
    count = 0
    triples = []
    for adesao in self.data["adesoes"]:
        tid = adesao[0]  # topic id
        uid = adesao[1]  # participant (user) id
        created = adesao[2]
        updated = adesao[3]
        aid = adesao[4]  # adhesion id, used to mint the Support URI
        uri = P.rdf.ic(po.Support, self.snapshotid+"-"+str(aid),
                       self.translation_graph, self.snapshoturi)
        triples += [
            (uri, po.participant,
             po.Participant+'#'+self.snapshotid+'-'+str(uid)),
            (uri, po.topic, po.Topic+'#'+self.snapshotid+'-'+str(tid)),
            (uri, po.createdAt, created),
        ]
        # an update timestamp is only meaningful when it differs
        if updated != created:
            triples += [
                (uri, po.updatedAt, updated),
            ]
        count += 1
        if count % 60 == 0:
            c("finished supporters entries:", count, "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            c("finished add of supporters entries")
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
        c("finished add of supporters entries")
def translateObservatories(self):
    """Translate 'observatorios' rows into po:Observatory triples.

    Triples are flushed to the translation graph every 60 rows and once
    more at the end.
    """
    count = 0
    triples = []
    for observatorio in self.data["observatorios"]:
        oid = observatorio[0]
        uid = observatorio[1]  # participant (user) id
        created = observatorio[4]
        updated = observatorio[5]
        uri = P.rdf.ic(po.Observatory, self.snapshotid+"-"+str(oid),
                       self.translation_graph, self.snapshoturi)
        # BUG FIX: the participant URI was built with self.snapshoturi
        # instead of self.snapshotid, diverging from every sibling
        # translate* method and producing dangling references.
        triples += [
            (uri, po.participant,
             po.Participant+'#'+self.snapshotid+'-'+str(uid)),
            (uri, po.createdAt, created),
        ]
        if updated != created:
            triples += [
                (uri, po.updatedAt, updated),
            ]
        count += 1
        if count % 60 == 0:
            c("finished observatory entries:", count, "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            c("finished add of observatory entries")
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
        c("finished add of observatory entries")
def minimalTestData():
    """Insert a minimal set of Facebook fixture triples into 'void'."""
    snap = NS.po.SnapshotFoo + "#1"
    P.add([
        (snap, NS.facebook.ego, True),
        (snap, NS.facebook.userID, "1039203918"),
        (snap, NS.facebook.user, NS.facebook.Participant + "Foop"),
    ], context="void")
def translateNeighborhoods(self):
    """Translate 'bairros' rows into po:Neighborhood triples.

    Each neighborhood is linked to its city. Triples are flushed to the
    translation graph every 60 rows and once more at the end.
    """
    count = 0
    triples = []
    for bairro in self.data["bairros"]:
        bid = bairro[0]
        nome = bairro[1]
        cid = bairro[2]  # city id
        created = bairro[3]
        updated = bairro[4]
        relevance = bairro[5]
        uri = P.rdf.ic(po.Neighborhood, self.snapshotid+"-"+str(bid),
                       self.translation_graph, self.snapshoturi)
        triples += [
            (uri, po.name, nome),
            (uri, po.city, po.City+'#'+self.snapshotid+'-'+str(cid)),
            (uri, po.createdAt, created),
            (uri, po.relevance, relevance)
        ]
        if updated != created:
            triples += [
                (uri, po.updatedAt, updated),
            ]
        count += 1
        if count % 60 == 0:
            c("finished neighborhood entries:", count,
              "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            c("finished add of neighborhood entries")
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
        c("finished add of neighborhood entries")
def translateCities(self):
    """Translate 'cidades' rows into po:City triples.

    Each city is linked to its state. Triples are flushed to the
    translation graph every 60 rows and once more at the end.
    """
    count = 0
    triples = []
    for cidade in self.data["cidades"]:
        cid = cidade[0]
        nome = cidade[1]
        eid = cidade[2]  # state id
        slug = cidade[3]
        created = cidade[4]
        updated = cidade[5]
        relevance = cidade[6]
        uri = P.rdf.ic(po.City, self.snapshotid+"-"+str(cid),
                       self.translation_graph, self.snapshoturi)
        # BUG FIX: the state URI was missing the '-' separator
        # (snapshotid+str(eid)), which does not match the State URIs
        # minted as snapshotid+"-"+gid and left dangling references.
        triples += [
            (uri, po.name, nome),
            (uri, po.state, po.State+'#'+self.snapshotid+'-'+str(eid)),
            (uri, po.slug, slug),
            (uri, po.createdAt, created),
            (uri, po.relevance, relevance)
        ]
        if updated != created:
            triples += [
                (uri, po.updatedAt, updated),
            ]
        count += 1
        if count % 60 == 0:
            c("finished cities k entries:", count, "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            c("finished add of cities entries")
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
        c("finished add of cities entries")
def addArticleBody(self, body, articleuri):
    """Attach an article's body text to its URI in the translation graph.

    If the body looks like HTML (matches a <tag>text<tag> pattern), both
    the raw markup and a BeautifulSoup-extracted plain text are stored;
    bodies that break SPARQL serialization (QueryBadFormed) are stored
    percent-encoded under the quoted* properties instead. Non-HTML
    bodies are stored directly as cleanBodyText.
    """
    triples = []
    if re.findall(r"<(.*)>(.*)<(.*)>", body, re.S):
        # HTML-ish body: store the raw markup...
        try:
            P.add((articleuri, po.htmlBodyText, body),
                  context=self.translation_graph)
        except QueryBadFormed:
            # body breaks query serialization; store percent-encoded
            c("QUOTING HTML BODY")
            P.add((articleuri, po.quotedHtmlBodyText,
                   urllib.parse.quote(body)),
                  context=self.translation_graph)
        # ...and also the tag-stripped text, when non-empty
        cleanbody = BeautifulSoup(body, 'html.parser').get_text()
        if cleanbody:
            try:
                P.add((articleuri, po.cleanBodyText, cleanbody),
                      context=self.translation_graph)
            except QueryBadFormed:
                c("QUOTING HTML CLEAN BODY")
                P.add((articleuri, po.quotedCleanBodyText,
                       urllib.parse.quote(cleanbody)),
                      context=self.translation_graph)
    else:
        triples += [
            (articleuri, po.cleanBodyText, body),
        ]
    P.add(triples, context=self.translation_graph)
    self.bodies += [body]
def writeRdf(self):
    """Serialize the translation graph and write snapshot metadata.

    Writes Turtle and RDF/XML renderings of the translation graph into
    ./participabr_snapshot/, then adds snapshot-level metadata triples
    to the meta graph and serializes that alongside.
    """
    pub_dir = './participabr_snapshot/'
    if not os.path.isdir(pub_dir):
        os.mkdir(pub_dir)
    g = P.context(self.translation_graph)
    g.serialize(pub_dir+'participabr.ttl', 'turtle')
    c('participation ttl serialized')
    g.serialize(pub_dir+'participabr.rdf', 'xml')
    c('participation xml serialized')
    # metadados: group, platform,
    triples = [
        (self.snapshoturi, a, po.Snapshot),
        # (self.snapshoturi, a, po.ParticipabrSnapshot),
        (self.snapshoturi, po.snapshotID, self.snapshotid),
        (self.snapshoturi, po.isEgo, False),
        (self.snapshoturi, po.isGroup, True),
        (self.snapshoturi, po.isFriendship, True),
        (self.snapshoturi, po.isInteraction, True),
        (self.snapshoturi, po.isPost, True),
        (self.snapshoturi, po.socialProtocol, 'ParticipaBR'),
        (self.snapshoturi, po.dateObtained, datetime.date(2012, 6, 28)),
    ]
    P.add(triples, self.meta_graph)
    g = P.context(self.meta_graph)
    g.serialize(pub_dir+'participabrMeta.ttl', 'turtle')
    c('participation meta ttl serialized')
    g.serialize(pub_dir+'participabrMeta.rdf', 'xml')
    c('participation meta xml serialized')
def translateLinks(self):
    """Translate 'links' rows into po:Link triples.

    Each link is tied to its topic. Triples are flushed to the
    translation graph every 60 rows and once more at the end.
    """
    count = 0
    triples = []
    for link in self.data['links']:
        lid = link[0]
        nome = link[1]
        url = link[2]
        tid = link[4]  # topic id
        created = link[5]
        updated = link[6]
        uri = P.rdf.ic(po.Link, self.snapshotid+"-"+str(lid),
                       self.translation_graph, self.snapshoturi)
        triples += [
            (uri, po.name, nome),
            (uri, po.url, url),
            (uri, po.topic, po.Topic+'#'+self.snapshotid+'-'+str(tid)),
            (uri, po.createdAt, created)
        ]
        if updated != created:
            triples += [
                (uri, po.updatedAt, updated),
            ]
        count += 1
        if count % 60 == 0:
            c("finished links entries:", count, "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            c("finished add of links entries")
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
        c("finished add of links entries")
def rdfGroupPosts(self, filename_posts_):
    """Load a tab-separated posts file and translate rows into po:Post.

    The file's header must equal the expected column list; underscores
    in the message column encode newlines. Also fills per-post
    char/token statistics consumed later by the meta writer.
    """
    data = [
        i.split("\t")
        for i in open(filename_posts_, "r").read().split("\n")[:-1]
    ]
    tvars = data[0]
    standard_vars = [
        'id', 'type', 'message', 'created_time', 'comments', 'likes',
        'commentsandlikes'
    ]
    # header must match the standard column list position by position
    if len(tvars) != sum([i == j for i, j in zip(tvars, standard_vars)]):
        raise ValueError("the tab file format was not understood")
    data = data[1:]
    triples = []
    self.nposts = 0
    nchars_all = []
    ntokens_all = []
    for post in data:
        ind = P.rdf.ic(po.Post, post[0], self.posts_graph, self.snapshoturi)
        ptext = post[2].replace("_", "\n")  # "_" encodes a newline
        nchars = len(ptext)
        nchars_all += [nchars]
        ntokens = len(k.tokenize.wordpunct_tokenize(ptext))
        ntokens_all += [ntokens]
        triples += [
            (ind, po.snapshot, self.snapshoturi),
            (ind, po.postID, post[0]),
            (ind, po.postType, post[1]),
            (ind, po.postText, ptext),
            (ind, po.createdAt, dateutil.parser.parse(post[3])),
            (ind, po.nComments, int(post[4])),
            (ind, po.nLikes, int(post[5])),
            (ind, po.nChars, nchars),
            (ind, po.nTokens, ntokens),
        ]
        if self.nposts % 200 == 0:
            c("posts: ", self.nposts)
        self.nposts += 1
    self.postsvars = [
        "postID", "postType", "postText", "createdAt", "nComments",
        "nLikes", "nChars", "nTokens"
    ]
    # summary statistics (published by the meta writer, not here)
    self.mcharsposts = n.mean(nchars_all)
    self.dcharsposts = n.std(nchars_all)
    self.totalchars = n.sum(nchars_all)
    self.mtokensposts = n.mean(ntokens_all)
    self.dtokensposts = n.std(ntokens_all)
    self.totaltokens = n.sum(ntokens_all)
    #triples+=[ # went to meta file
    #        (self.snapshoturi,po.mCharsPosts,self.mcharsposts),
    #        (self.snapshoturi,po.dCharsPosts,self.dcharsposts),
    #        (self.snapshoturi,po.totalCharsPosts,self.totalchars),
    #        (self.snapshoturi,po.mTokensPosts,self.mtokensposts),
    #        (self.snapshoturi,po.dTokensPosts,self.dtokensposts),
    #        (self.snapshoturi,po.totalTokensPosts,self.totaltokens),
    #        ]
    P.add(triples, context=self.posts_graph)
def triplesScaffolding(subjects, predicates, objects, context=None):
    """Link subject(s) through predicate(s) to object(s).

    Accepts any combination of one and N values in the inputs, eg:
    triplesScaffolding(participants,NS.po.name,names)   # N 1 N
    triplesScaffolding(participants,name_props,name)    # N N 1
    triplesScaffolding(participant,name_props,names)    # 1 N N
    triplesScaffolding(participant,name_props,name)     # 1 N 1
    triplesScaffolding(participant,NS.po.name,names)    # 1 1 N
    triplesScaffolding(participants,NS.po.name,name)    # N 1 1
    Might be useful for rearranging lists into triples:
    triplesScaffolding(participants,name_props,names)   # N N N
    triplesScaffolding(participant,NS.po.name,names)    # 1 1 1
    If context == "return_triples" the triple list is returned instead
    of being added to the store.
    """
    # only a bare-string subject is promoted to URIRef; predicates and
    # objects are used as given
    if isinstance(subjects, str):
        subjects = r.URIRef(subjects)
    # N = length of the longest list argument; loose URIs contribute 0
    # (the [x, 0][flag] trick selects 0 when the arg is a loose URI)
    N = max([len(subjects), 0][isinstance(subjects, (r.URIRef, r.Namespace))],
            [len(predicates), 0][isinstance(predicates,
                                            (r.URIRef, r.Namespace))],
            [len(objects), 0][isinstance(objects, (r.URIRef, r.Namespace))])
    # every argument must be either a loose URI or a list of length N
    check = sum([((len(i) == N) or isinstance(i, (r.URIRef, r.Namespace)))
                 for i in (subjects, predicates, objects)]) == 3
    if not check:
        raise ValueError(
            "input should be a combination of loose URIs and lists of same size "
        )
    triples = []
    # NOTE(review): check is a bool, so `check == 3` is always False and
    # this branch is dead; the else-branch below handles the all-lists
    # case correctly anyway (len==N lists are left untouched by the *= N
    # padding and zipped directly) — behavior is preserved.
    if check == 3:
        for i, subject in enumerate(subjects):
            predicate = predicates[i]
            object_ = objects[i]
            triples += [(subject, predicate, object_)]
    else:
        # wrap loose URIs into singleton lists, then pad to length N
        if isinstance(subjects, (r.URIRef, r.Namespace)):
            subjects = [subjects]
        if isinstance(predicates, (r.URIRef, r.Namespace)):
            predicates = [predicates]
        if isinstance(objects, (r.URIRef, r.Namespace)):
            objects = [objects]
        if len(subjects) == 1:
            subjects *= N
        if len(predicates) == 1:
            predicates *= N
        if len(objects) == 1:
            objects *= N
        for subject, predicate, object_ in zip(subjects, predicates, objects):
            triples += [(subject, predicate, object_)]
    if context == "return_triples":
        return triples
    # c(outer_frame,dir(outer_frame),outer_frame.f_locals)
    # frames = inspect.getouterframes(inspect.currentframe())
    # outer_frame = frames[1][0]
    # if "triples" in outer_frame.f_locals:
    #     outer_frame.f_locals["triples"]+=triples
    # else:
    #     P.add(triples,context=context)
    P.add(triples, context=context)
def void():
    """Register the Foo snapshot's raw/rdf/void file triples in 'void'."""
    snap = NS.po.SnapshotFoo + "#1"
    triples = [
        (snap, a, NS.po.FacebookSnapshot),
        (snap, NS.po.rawFile, "~/.percolation/data/somedirs/something.raw"),
        (snap, NS.po.rdfFile, "~/.percolation/data/somedirs/something.rdf"),
        (snap, NS.po.voidFile, "~/.percolation/data/somedirs/void.raw"),
    ]
    P.add(triples, context="void")
def minimumTestOntology(context="minimum_ontology"):
    """Add the minimal class/property statements used by the tests."""
    ontology = [
        (NS.po.FacebookSnapshot, NS.rdfs.subClassOf, NS.po.Snapshot),
        (NS.facebook.user, NS.rdfs.range, NS.po.Participant),
        (NS.facebook.ego, NS.rdfs.domain, NS.po.FacebookSnapshot),
        (NS.facebook.userID, NS.rdfs.subPropertyOf, NS.po.userID),
    ]
    P.add(ontology, context=context)
def minimalTestData():
    """Insert minimal Facebook snapshot fixtures into the 'void' context."""
    foo = NS.po.SnapshotFoo + "#1"
    fixtures = [
        (foo, NS.facebook.ego, True),
        (foo, NS.facebook.userID, "1039203918"),
        (foo, NS.facebook.user, NS.facebook.Participant + "Foop"),
    ]
    P.add(fixtures, context="void")
def parseLegacyFiles(data_dir=DATADIR+"twitter/"):
    """Parse legacy pickle files with Twitter tweets.

    Mints one snapshot per file with descriptive metadata, rebuilds the
    'social_twitter' context, logs summary counts queried back from the
    store and returns the set of snapshot URIs.
    """
    filenames = os.listdir(data_dir)
    filenames = [i for i in filenames
                 if i != "ipython_log.py" and not i.endswith(".swp")]
    snapshots = set()
    triples = []
    for filename in filenames:
        snapshotid = "twitter-legacy-"+filename.replace("_", "")
        snapshoturi = po.TwitterSnapshot+"#"+snapshotid
        expressed_classes = [po.Participant, po.Tweet]
        expressed_reference = filename.replace("_", "").replace(".pickle", "")
        name_humanized = "Twitter"+expressed_reference
        filesize = os.path.getsize(data_dir+filename)/10**6  # MB
        fileformat = "pickle"
        fileuri = po.File+"#twitter-file-"+filename
        triples += [
            (snapshoturi, a, po.Snapshot),
            (snapshoturi, a, po.TwitterSnapshot),
            (snapshoturi, po.snapshotID, snapshotid),
            (snapshoturi, po.isEgo, False),
            (snapshoturi, po.isGroup, True),
            (snapshoturi, po.isFriendship, False),
            (snapshoturi, po.isInteraction, True),
            (snapshoturi, po.isPost, True),
            (snapshoturi, po.humanizedName, name_humanized),
            (snapshoturi, po.expressedReference, expressed_reference),
            (snapshoturi, po.rawFile, fileuri),
            (fileuri, po.fileSize, filesize),
            (fileuri, po.fileName, filename),
            (fileuri, po.fileFormat, fileformat),
        ]+[
            (fileuri, po.expressedClass, expressed_class)
            for expressed_class in expressed_classes
        ]
        snapshots.add(snapshoturi)
    nfiles = len(filenames)
    nsnapshots = len(snapshots)
    # rebuild the context from scratch before adding
    P.context("social_twitter", "remove")
    platformuri = P.rdf.ic(po.Platform, "Twitter", context="social_twitter")
    # NOTE(review): the session-count property names say "IRC" although
    # this is the twitter parser — presumably copied; verify upstream
    triples += [
        (NS.social.Session, NS.social.nIRCParsedFiles, nfiles),
        (NS.social.Session, NS.social.nIRCSnapshots, nsnapshots),
        (platformuri, po.dataDir, data_dir),
    ]
    P.add(triples, context="social_twitter")
    c("parsed {} twitter files ({} snapshots) are in percolation graph and 'social_twitter' context".format(nfiles, nsnapshots))
    c("percolation graph have {} triples ({} in social_twitter context)".format(
        len(P.percolation_graph), len(P.context("social_twitter"))))
    # summary counts queried back from the freshly-built context
    negos = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isEgo true } } ")
    ngroups = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isGroup true } } ")
    nfriendships = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isFriendship true } } ")
    ninteractions = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isInteraction true } } ")
    nposts = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isPost true } } ")
    totalsize = sum(P.query(r" SELECT ?size WHERE { GRAPH <social_twitter> { ?s po:fileSize ?size } } "))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship network. {} have an interaction network. {} have post texts and reaction counts
Total raw data size is {:.2f}MB""".format(negos, ngroups, nfriendships,
                                          ninteractions, nposts, totalsize))
    return snapshots
def translateObservatoryTags(self):
    """Link observatories to their tags ('observatorios_tem_tags' join)."""
    prefix = '#'+self.snapshotid+'-'
    triples = [
        (po.Observatory+prefix+str(row[0]),
         po.hasTag,
         po.Tag+prefix+str(row[1]))
        for row in self.data["observatorios_tem_tags"]
    ]
    P.add(triples, self.translation_graph)
    c("finished add of observatory tag entries")
def parseLegacyFiles(data_dir=DATADIR+"irc/"):
    """Parse legacy txt files with irc logs.

    Mints one snapshot per log file with descriptive metadata, rebuilds
    the 'social_irc' context, logs summary counts queried back from the
    store and returns the set of snapshot URIs.
    """
    filenames = os.listdir(data_dir)
    filenames = [i for i in filenames
                 if i != "ipython_log.py" and not i.endswith(".swp")]
    snapshots = set()
    triples = []
    for filename in filenames:
        snapshotid = "irc-legacy-"+filename.replace("#", "")
        # BUG FIX: the snapshot URI was minted under po.TwitterSnapshot
        # (copied from the twitter parser); IRC snapshots belong under
        # po.IRCSnapshot, consistent with the rdf:type asserted below.
        snapshoturi = po.IRCSnapshot+"#"+snapshotid
        expressed_classes = [po.Participant, po.IRCMessage]
        expressed_reference = filename.replace("#", "").replace(
            ".txt", "").replace(".log", "")
        name_humanized = "IRC log of channel "+expressed_reference
        filesize = os.path.getsize(data_dir+filename)/10**6  # MB
        fileformat = "txt"
        fileuri = po.File+"#Irc-log-"+filename.replace("#", "")
        triples += [
            (snapshoturi, a, po.Snapshot),
            (snapshoturi, a, po.IRCSnapshot),
            (snapshoturi, po.snapshotID, snapshotid),
            (snapshoturi, po.isEgo, False),
            (snapshoturi, po.isGroup, True),
            (snapshoturi, po.isFriendship, False),
            (snapshoturi, po.isInteraction, True),
            (snapshoturi, po.isPost, True),
            (snapshoturi, po.humanizedName, name_humanized),
            (snapshoturi, po.expressedReference, expressed_reference),
            (snapshoturi, po.rawFile, fileuri),
            (fileuri, po.fileSize, filesize),
            (fileuri, po.fileName, filename),
            (fileuri, po.fileFormat, fileformat),
        ]+[
            (fileuri, po.expressedClass, expressed_class)
            for expressed_class in expressed_classes
        ]
        snapshots.add(snapshoturi)
    nfiles = len(filenames)
    nsnapshots = len(snapshots)
    # rebuild the context from scratch before adding
    P.context("social_irc", "remove")
    platformuri = P.rdf.ic(po.Platform, "IRC", context="social_irc")
    triples += [
        (NS.social.Session, NS.social.nIRCParsedFiles, nfiles),
        (NS.social.Session, NS.social.nIRCSnapshots, nsnapshots),
        (platformuri, po.dataDir, data_dir),
    ]
    P.add(triples, context="social_irc")
    # BUG FIX: the message referred to a nonexistent 'irc_twitter'
    # context; the data actually lands in 'social_irc'.
    c("parsed {} irc logs files ({} snapshots) are in percolation graph and 'social_irc' context".format(nfiles, nsnapshots))
    c("percolation graph have {} triples ({} in social_irc context)".format(
        len(P.percolation_graph), len(P.context("social_irc"))))
    # summary counts queried back from the freshly-built context
    negos = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isEgo true } } ")
    ngroups = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isGroup true } } ")
    nfriendships = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isFriendship true } } ")
    ninteractions = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isInteraction true } } ")
    nposts = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isPost true } } ")
    totalsize = sum(P.query(r" SELECT ?size WHERE { GRAPH <social_irc> { ?s po:fileSize ?size } } "))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship structures. {} have an interaction structures. {} have texts
Total raw data size is {:.2f}MB""".format(negos, ngroups, nfriendships,
                                          ninteractions, nposts, totalsize))
    return snapshots
def void():
    """Register the Foo snapshot's raw/rdf/void file triples in 'void'."""
    subject = NS.po.SnapshotFoo + "#1"
    P.add(
        [
            (subject, a, NS.po.FacebookSnapshot),
            (subject, NS.po.rawFile,
             "~/.percolation/data/somedirs/something.raw"),
            (subject, NS.po.rdfFile,
             "~/.percolation/data/somedirs/something.rdf"),
            (subject, NS.po.voidFile,
             "~/.percolation/data/somedirs/void.raw"),
        ],
        context="void")
def translateComments(self):
    """Translate comment rows into po:Comment triples.

    Each comment links its author, topic, text and (translated) type.
    Triples are flushed to the translation graph every 60 rows and once
    more at the end. Rows with an empty body are skipped.
    """
    # Portuguese -> English comment types
    trans = {'resposta': 'answer', 'pergunta': 'question',
             'comentario': 'comment', 'ideia': 'idea'}
    triples = []
    count = 0
    for comment in self.data['comments']:
        cid = comment[0]
        tid = comment[1]  # topic id
        body = comment[3]
        if not body:
            continue
        # no-op as written; presumably once stripped a control char
        # lost in transcoding — TODO confirm
        body = body.replace('', '')
        uid = comment[4]
        ctype = comment[8]
        created = comment[9]
        updated = comment[10]
        assert isinstance(cid, int)
        assert isinstance(tid, int)
        assert isinstance(body, str)
        assert isinstance(uid, int)
        assert isinstance(ctype, str)
        assert isinstance(created, datetime.datetime)
        assert isinstance(updated, datetime.datetime)
        commenturi = P.rdf.ic(po.Comment, self.snapshotid+"-"+str(cid),
                              self.translation_graph, self.snapshoturi)
        participanturi = po.Participant+'#'+self.snapshotid+"-"+str(uid)
        # topicuri = self.topicuris[tid]
        topicuri = po.Topic+'#'+self.snapshotid+'-'+str(tid)
        # BUG FIX: created/updated are columns of the *comment* row but
        # were previously attached to the topic URI, overwriting topic
        # timestamps with comment ones.
        triples += [
            (commenturi, po.author, participanturi),
            (commenturi, po.topic, topicuri),
            (commenturi, po.text, body),
            # (commenturi, po.nChars, len(body)),
            (commenturi, po.type, trans[ctype]),
            (commenturi, po.createdAt, created),
        ]
        if updated != created:
            triples.append(
                (commenturi, po.updatedAt, updated),
            )
        count += 1
        if count % 60 == 0:
            c("finished comment entries:", count, "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            c("finished add of comment entries")
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
        c("finished add of comment entries")
def translatePlaces(self):
    """Translate 'locais' rows into po:Place triples.

    Optional neighborhood/city/state/CEP links are asserted only when
    present; the place is linked to the resource it belongs to via the
    polymorphic rtype column. Flushes every 60 rows.
    """
    # maps the polymorphic rtype column onto ontology classes
    accountable_class = {
        "Topico": po.Topic,
        "User": po.User,
        "Competition": po.Competition,
        "Observatorio": po.Observatory,
    }
    count = 0
    triples = []
    for local in self.data["locais"]:
        lid = local[0]
        rid = local[1]   # id of the accountable resource
        rtype = local[2]  # accountable resource type
        bid = local[3]   # neighborhood id
        cid = local[4]   # city id
        created = local[7]
        updated = local[8]
        cep = local[9]
        eid = local[10]  # state id
        uri = P.rdf.ic(po.Place, self.snapshotid+"-"+str(lid),
                       self.translation_graph, self.snapshoturi)
        triples += [(uri, po.createdAt, created)]
        if bid:
            triples.append((uri, po.neighborhood,
                            po.Neighborhood+'#'+self.snapshotid+'-'+str(bid)))
        if cid:
            triples.append((uri, po.city,
                            po.City+'#'+self.snapshotid+'-'+str(cid)))
        if eid:
            triples.append((uri, po.state,
                            po.State+'#'+self.snapshotid+'-'+str(eid)))
        if cep:
            triples.append((uri, po.cep, cep))
        if updated != created:
            triples += [
                (uri, po.updatedAt, updated),
            ]
        # BUG FIX: uri_ was previously only assigned for the four known
        # rtypes, so an unknown non-empty rtype reused the stale value
        # from an earlier row (or raised NameError on the first row).
        # Unknown types are now skipped explicitly.
        if rtype:
            class_ = accountable_class.get(rtype)
            if class_ is not None:
                triples.append((uri, po.accountable,
                                class_+'#'+self.snapshotid+'-'+str(rid)))
        count += 1
        if count % 60 == 0:
            c("finished places entries:", count, "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            c("finished add of places entries")
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
        c("finished add of places entries")
def writeAll(self):
    """Serialize the meta graph and render the publication README.

    Adds a final triple recording the meta graph's own size, writes
    Turtle and RDF/XML renderings, copies the triplify script into the
    publication directory and formats the README from snapshot
    attributes.
    """
    g = P.context(self.meta_graph)
    ntriples = len(g)
    triples = [
        (self.snapshoturi, po.nMetaTriples, ntriples),
    ]
    P.add(triples, context=self.meta_graph)
    g.namespace_manager.bind("po", po)
    g.serialize(self.final_path_+self.snapshotid+"Meta.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_+self.snapshotid+"Meta.rdf", "xml")
    c("serialized meta")
    if not os.path.isdir(self.final_path_+"scripts"):
        os.mkdir(self.final_path_+"scripts")
    shutil.copy(PACKAGEDIR+"/../tests/triplify.py",
                self.final_path_+"scripts/triplify.py")
    # copia do base data
    self.dates = [i.isoformat() for i in self.dates]
    date1 = min(self.dates)
    date2 = max(self.dates)
    # BUG FIX: the template referenced 'tinteraction'/'tposts' which
    # were never defined in this scope (NameError at runtime), and
    # tinteraction was bound to tposts. They are optional README
    # sentences; fall back to empty strings when the snapshot does not
    # provide them.
    tinteraction = getattr(self, "tinteraction", "")
    tposts = getattr(self, "tposts", "")
    with open(self.final_path_+"README", "w") as f:
        f.write("""::: Open Linked Social Data publication \nThis repository is a RDF data expression of the IRC snapshot {snapid} with tweets from {date1} to {date2} (total of {ntrip} triples).{tinteraction}{tposts} \nMetadata for discovery in the RDF/XML file: {mrdf} \nor in the Turtle file:\n{mttl} \nEgo network: {ise} Group network: {isg} Friendship network: {isf} Interaction network: {isi} Has text/posts: {ist} \nAll files should be available at the git repository: {ava} \n{desc} The script that rendered this data publication is on the script/ directory.\n:::""".format(
            snapid=self.snapshotid, date1=date1, date2=date2,
            ntrip=self.ntriples,
            tinteraction=tinteraction,
            tposts=tposts,
            mrdf=self.translation_xml,
            mttl=self.translation_ttl,
            ise=self.isego,
            isg=self.isgroup,
            isf=self.isfriendship,
            isi=self.isinteraction,
            ist=self.hastext,
            ava=self.online_prefix,
            desc=self.desc
        ))
def rdfGroupPosts(self, filename_posts_):
    """Load a tab-separated posts file and translate rows into po:Post.

    The file's header must equal the expected column list; underscores
    in the message column encode newlines. Also fills per-post
    char/token statistics consumed later by the meta writer.
    """
    data = [i.split("\t")
            for i in open(filename_posts_, "r").read().split("\n")[:-1]]
    tvars = data[0]
    standard_vars = ['id', 'type', 'message', 'created_time', 'comments',
                     'likes', 'commentsandlikes']
    # header must match the standard column list position by position
    if len(tvars) != sum([i == j for i, j in zip(tvars, standard_vars)]):
        raise ValueError("the tab file format was not understood")
    data = data[1:]
    triples = []
    self.nposts = 0
    nchars_all = []
    ntokens_all = []
    for post in data:
        ind = P.rdf.ic(po.Post, post[0], self.posts_graph, self.snapshoturi)
        ptext = post[2].replace("_", "\n")  # "_" encodes a newline
        nchars = len(ptext)
        nchars_all += [nchars]
        ntokens = len(k.tokenize.wordpunct_tokenize(ptext))
        ntokens_all += [ntokens]
        triples += [
            (ind, po.snapshot, self.snapshoturi),
            (ind, po.postID, post[0]),
            (ind, po.postType, post[1]),
            (ind, po.postText, ptext),
            (ind, po.createdAt, dateutil.parser.parse(post[3])),
            (ind, po.nComments, int(post[4])),
            (ind, po.nLikes, int(post[5])),
            (ind, po.nChars, nchars),
            (ind, po.nTokens, ntokens),
        ]
        if self.nposts % 200 == 0:
            c("posts: ", self.nposts)
        self.nposts += 1
    self.postsvars = ["postID", "postType", "postText", "createdAt",
                      "nComments", "nLikes", "nChars", "nTokens"]
    # summary statistics (published by the meta writer, not here)
    self.mcharsposts = n.mean(nchars_all)
    self.dcharsposts = n.std(nchars_all)
    self.totalchars = n.sum(nchars_all)
    self.mtokensposts = n.mean(ntokens_all)
    self.dtokensposts = n.std(ntokens_all)
    self.totaltokens = n.sum(ntokens_all)
    #triples+=[ # went to meta file
    #        (self.snapshoturi,po.mCharsPosts,self.mcharsposts),
    #        (self.snapshoturi,po.dCharsPosts,self.dcharsposts),
    #        (self.snapshoturi,po.totalCharsPosts,self.totalchars),
    #        (self.snapshoturi,po.mTokensPosts,self.mtokensposts),
    #        (self.snapshoturi,po.dTokensPosts,self.dtokensposts),
    #        (self.snapshoturi,po.totalTokensPosts,self.totaltokens),
    #        ]
    P.add(triples, context=self.posts_graph)
def translateVotes(self):
    """Translate vote rows into po:Vote triples.

    Votes may target Articles or Comments (voteable_type); voters, when
    recorded, are always Profiles. Triples are flushed to the
    translation graph every 100 rows and once more at the end.
    """
    triples = []
    commentids = set(self.comments_table.get("id"))
    count = 0
    for id_, vote, voteable_id, voteable_type,\
            voter_id, voter_type, created_at in \
            self.votes_table.getMany(
                ("id", "vote", "voteable_id", "voteable_type",
                 "voter_id", "voter_type", "created_at")):
        assert isinstance(id_, int)
        assert isinstance(voteable_id, int)
        assert isinstance(created_at, datetime.datetime)
        voteuri = P.rdf.ic(po.Vote, self.snapshotid+"-"+str(id_),
                           self.translation_graph, self.snapshoturi)
        if voteable_type == "Article":
            # type__ kept for the commented per-type URI variant below
            type__ = self.articletypes[voteable_id].split("::")[-1]
            # referenceuri = \
            #     eval("po."+type__)+"#"+self.snapshotid+"-"+str(voteable_id)
            referenceuri = \
                po.Article+"#"+self.snapshotid+"-"+str(voteable_id)
        elif voteable_type == "Comment":
            assert voteable_id in commentids
            referenceuri = \
                po.Comment+"#"+self.snapshotid+"-"+str(voteable_id)
        else:
            raise ValueError("unexpected voteable type")
        triples += [
            (voteuri, po.createdAt, created_at),
            (voteuri, po.vote, vote),
            (voteuri, po.reference, referenceuri),
        ]
        # anonymous votes carry no author triple
        if voter_id:
            assert voter_type == "Profile"
            assert isinstance(voter_id, int)
            participanturi = po.Participant + '#' + \
                self.snapshotid+"-"+self.profileids[voter_id]
            triples += [
                (voteuri, po.author, participanturi),
            ]
        count += 1
        if count % 100 == 0:
            c("votes done:", count)
            c("ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            c("finished add of votes")
            triples = []
    if triples:
        c("ntriples:", len(triples))
        P.add(triples, self.translation_graph)
def makeMeta(self):
    """Attach snapshot-level metadata triples to the meta graph."""
    snap = self.snapshoturi
    metadata = [
        (snap, a, po.Snapshot),
        (snap, po.snapshotID, self.snapshotid),
        (snap, po.isEgo, False),
        (snap, po.isGroup, True),
        (snap, po.isFriendship, False),
        (snap, po.isInteraction, False),
        (snap, po.isPost, True),
        (snap, po.socialProtocol, 'Algorithmic Autoregulation'),
        (snap, po.dateObtained, datetime.date(2015, 7, 15)),
    ]
    P.add(metadata, self.meta_graph)
def makeMeta(self):
    """Attach snapshot-level metadata triples to the meta graph."""
    uri = self.snapshoturi
    # boolean facets of this snapshot, in the canonical order
    facets = [(po.isEgo, False), (po.isGroup, True),
              (po.isFriendship, False), (po.isInteraction, False),
              (po.isPost, True)]
    metadata = [(uri, a, po.Snapshot),
                (uri, po.snapshotID, self.snapshotid)]
    metadata += [(uri, prop, value) for prop, value in facets]
    metadata += [(uri, po.socialProtocol, 'Cidade Democrática'),
                 (uri, po.dateObtained, datetime.date(2014, 3, 19))]
    P.add(metadata, self.meta_graph)
def translateImages(self):
    """Translate 'imagens' rows into po:Image triples.

    Optional columns (size, content type, filename, dimensions,
    caption) are only asserted when present. Flushes every 60 rows.
    """
    triples = []
    count = 0
    for imagem in self.data["imagens"]:
        iid = imagem[0]
        rid = imagem[1]    # id of the accountable resource
        rtype = imagem[2]  # accountable resource type
        size = imagem[3]
        ctype = imagem[4]
        fname = imagem[5]
        height = imagem[6]
        width = imagem[7]
        legenda = imagem[11]  # caption
        created = imagem[12]
        updated = imagem[13]
        uri = P.rdf.ic(po.Image, self.snapshotid+"-"+str(iid),
                       self.translation_graph, self.snapshoturi)
        triples.append((uri, po.createdAt, created))
        if rtype == "User":
            triples.append((uri, po.accountable,
                            po.Participant+"#"+self.snapshotid+'-'+str(rid)))
        if rtype == "Topico":
            triples.append((uri, po.accountable,
                            po.Topic+"#"+self.snapshotid+'-'+str(rid)))
        if size:
            triples.append((uri, po.size, int(size)))
        if ctype:
            triples.append((uri, po.contentType, ctype))
        if fname:
            triples.append((uri, po.filename, fname))
        if height:
            triples.append((uri, po.height, int(height)))
        if width:
            triples.append((uri, po.width, int(width)))
        if legenda:
            triples.append((uri, po.caption, legenda))
        if updated != created:
            triples.append((uri, po.updatedAt, updated))
        count += 1
        if count % 60 == 0:
            c("finished image entries:", count, "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            c("finished add of image entries")
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
        # BUG FIX: the final log message said "prizes entries"
        # (copy-paste from translatePrizes); it now reports images.
        c("finished add of image entries")
def translateLoginHistory(self):
    """Translate 'historico_de_logins' rows into po:Login individuals."""
    triples = []
    for row in self.data["historico_de_logins"]:
        lid, uid, created, ip = row[:4]
        uri = P.rdf.ic(po.Login, self.snapshotid+"-"+str(lid),
                       self.translation_graph, self.snapshoturi)
        triples.extend([
            (uri, po.participant,
             po.Participant+'#'+self.snapshotid+'-'+str(uid)),
            (uri, po.createdAt, created),
            (uri, po.ip, ip),
        ])
    P.add(triples, self.translation_graph)
    c("finished add of login entries")
def translateMacrotags(self):
    """Translate 'macro_tags' rows into po:Macrotag triples."""
    triples = []
    for mt in self.data["macro_tags"]:
        mtid = mt[0]
        title = mt[1]
        created = mt[2]
        updated = mt[3]
        uri = P.rdf.ic(po.Macrotag, self.snapshotid+"-"+str(mtid),
                       self.translation_graph, self.snapshoturi)
        triples.append((uri, po.createdAt, created))
        if updated != created:
            triples += [
                (uri, po.updatedAt, updated),
            ]
        if title:
            triples.append((uri, po.title, title))
    P.add(triples, self.translation_graph)
    # BUG FIX: the log message said "microtag" in the macrotag
    # translator (copy-paste).
    c("finished add of macrotag entries")
def translateFriendships(self):
    """Translate the friendships table into po:Friendship triples.

    Each unordered pair is translated once (the reversed pair is
    skipped via added_friendships); po:knows is additionally asserted
    for pairs whose reversed row is absent from the table — presumably
    marking non-reciprocated entries, TODO confirm. Flushes every 100
    translated rows.
    """
    triples = []
    # all (person, friend) pairs, used below to test reciprocity
    fids = self.friendships_table.getMany(("person_id", "friend_id"))
    # NOTE(review): list membership makes this loop O(n^2); a set of
    # pair tuples would be linear
    added_friendships = []
    count = 0
    for person_id, friend_id, created_at, group in \
            self.friendships_table.getMany(
                ('person_id', 'friend_id', 'created_at', 'group')):
        if [friend_id, person_id] in added_friendships:
            pass  # reversed pair already translated
        else:
            added_friendships += [[person_id, friend_id]]
            id0 = self.profileids[person_id]
            id1 = self.profileids[friend_id]
            friendshipuri = P.rdf.ic(po.Friendship,
                                     self.snapshotid+'-'+id0+'-'+id1,
                                     self.translation_graph,
                                     self.snapshoturi)
            participanturi0 = po.Participant+"#"+self.snapshotid+"-"+id0
            participanturi1 = po.Participant+"#"+self.snapshotid+"-"+id1
            assert isinstance(created_at, datetime.datetime)
            triples += [
                (friendshipuri, po.member, participanturi0),
                (friendshipuri, po.member, participanturi1),
                (friendshipuri, po.createdAt, created_at),
            ]
            # NOTE(review): if getMany yields tuples, this list-vs-tuple
            # comparison never matches — verify the row type upstream
            if [friend_id, person_id] not in fids:
                triples += [
                    (participanturi0, po.knows, participanturi1),
                ]
            if group:
                triples += [
                    (friendshipuri, po.socialCircle, group),
                ]
            count += 1
            if count % 100 == 0:
                c("done friendships:", count)
                c("ntriples:", len(triples))
                P.add(triples, self.translation_graph)
                c("finished add of friendships")
                triples = []
    if triples:
        c("ntriples:", len(triples))
        P.add(triples, self.translation_graph)
def ic(uriref, string, context=None, snapshoturi=None):
    """Mint an individual of class `uriref` identified by `string`.

    The URI is uriref + '#' + percent-encoded string. An rdf:type
    triple — and, when `snapshoturi` is given, a po:snapshot link — is
    added to `context`. Returns the new URI.
    """
    uri = uriref + "#" + urllib.parse.quote(string, safe="")
    # NOTE: assert is stripped under -O; URI validation is dev-only
    assert rfc3986.is_valid_uri(uri)  # also rfc3986.normalize_uri
    triples = [
        (uri, a, uriref),
    ]
    if snapshoturi:
        triples += [
            (uri, NS.po.snapshot, snapshoturi),
        ]
    # frames = inspect.getouterframes(inspect.currentframe())
    # c(outer_frame,dir(outer_frame),outer_frame.f_locals)
    # outer_frame = frames[1][0]
    # if "triples" in outer_frame.f_locals:
    #     outer_frame.f_locals["triples"]+=triples
    # else:
    #     P.add(triples,context=context)
    P.add(triples, context=context)
    return uri
def translateCompetitions(self):
    """Translate competition records into po:Competition individuals.

    Pulls the relevant columns out of each row of
    self.data['competitions'] and flushes triples to the translation
    graph every 60 rows; the remainder is added after the loop.
    """
    count = 0
    triples = []
    for competition in self.data['competitions']:
        coid = competition[0]
        sdesc = competition[1]
        created = competition[3]
        updated = competition[4]
        start = competition[5]
        title = competition[11]
        ldesc = competition[14]
        adesc = competition[15]
        reg = competition[16]
        aw = competition[17]
        part = competition[18]
        competitionuri = P.rdf.ic(po.Competition,
                                  self.snapshotid+"-"+str(coid),
                                  self.translation_graph, self.snapshoturi)
        triples += [
            (competitionuri, po.shortDescription, sdesc),
            (competitionuri, po.description, ldesc),
            (competitionuri, po.authorDescription, adesc),
            (competitionuri, po.createdAt, created),
            (competitionuri, po.startAt, start),
            (competitionuri, po.title, title),
            (competitionuri, po.regulations, reg),
            (competitionuri, po.awards, aw),
            (competitionuri, po.partners, part),
        ]
        # only record an update timestamp when the row was actually edited
        if updated != created:
            triples.append(
                (competitionuri, po.updatedAt, updated),
            )
        count += 1
        if count % 60 == 0:
            c("finished competition entries:", count,
              "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            c("finished add of competition entries")
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
        # fix: final log message was garbled ("finisheg … competitiok")
        c("finished add of competition entries")
def translatePrizes(self):
    """Translate competition prize records into po:Prize individuals.

    Links each prize to its competition, offering participant and topic.
    Triples are flushed to the translation graph every 60 rows.
    """
    count = 0
    triples = []
    for prize in self.data["competition_prizes"]:
        pid = prize[0]
        name = prize[1]
        description = prize[2]
        competition_id = prize[3]
        offerer_id = prize[4]
        tid = prize[5]
        created = prize[6]
        updated = prize[7]
        prizeuri = P.rdf.ic(po.Prize, self.snapshotid+"-"+str(pid),
                            self.translation_graph, self.snapshoturi)
        # fix: the description triple was emitted twice in the original
        triples += [
            (prizeuri, po.name, name),
            (prizeuri, po.description, description),
            (prizeuri, po.competition,
             po.Competition+"#"+self.snapshotid+'-'+str(competition_id)),
            (prizeuri, po.offerer,
             po.Participant+"#"+self.snapshotid+'-'+str(offerer_id)),
            (prizeuri, po.topic,
             po.Topic+"#"+self.snapshotid+'-'+str(tid)),
            (prizeuri, po.createdAt, created)
        ]
        # only record an update timestamp when the row was actually edited
        if updated != created:
            triples += [
                (prizeuri, po.updatedAt, updated),
            ]
        count += 1
        if count % 60 == 0:
            c("finished prizes entries:", count, "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            c("finished add of prizes entries")
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
        c("finished add of prizes entries")
def rdfTweets(self):
    """Triplify tweets from the legacy pickle files, chunk by chunk.

    Loads the first pickle file whole (when present), then streams the
    second one in chunks of up to 10k tweets; each tweet (and any
    retweeted status) is rendered into triples via self.tweetTriples and
    the chunk is written out before the next one is read.
    """
    tweets = []
    if self.pickle_filename1:
        # first file is loaded in one go; [0] is the tweet list
        tweets += readPickleTweetFile(
            self.data_path + self.pickle_filename1)[0]
    if self.pickle_filename2:
        # limit chunk to 10k tweets; fopen keeps the read position for
        # the subsequent chunks pulled at the bottom of the loop
        tweets, fopen = readPickleTweetChunk(
            self.data_path + self.pickle_filename2, tweets, None, 10000)
    chunk_count = 0
    # self.tweets = tweets  # for debugging only, remove to release memory
    while tweets:
        c("rendering tweets, chunk:", chunk_count, "ntweets:", len(tweets),
          "snapshotid", self.snapshotid)
        count = 0
        for tweet in tweets:
            tweeturi, triples = self.tweetTriples(tweet)
            if "retweeted_status" in tweet.keys():
                # self.nretweets += 1
                # triplify the original tweet as well and link both
                tweeturi0, triples0 = self.tweetTriples(
                    tweet['retweeted_status'])
                triples.extend(triples0)
                triples.append((tweeturi, po.retweetOf, tweeturi0))
            self.ntriples += len(triples)
            P.add(triples, context=self.tweet_graph)
            count += 1
            if count % 1000 == 0:
                c("triplified", count, "tweets")
        c("end of chunk:", chunk_count, "ntriples:", self.ntriples)
        self.writeTweets(chunk_count)
        c("chunk has been written")
        chunk_count += 1
        # if chunk_count == 2:
        #     break
        if self.pickle_filename2:
            # next chunk from the already-open file handle
            tweets, fopen = readPickleTweetChunk(None, [], fopen, 10000)
        else:
            tweets = []
def translateInspirations(self):
    """Translate inspiration records into po:Inspiration individuals.

    Each row links the inspiration to its competition and participant.
    Triples are flushed to the translation graph every 60 rows; any
    remainder is added after the loop.
    """
    pending = []
    nrows = 0
    for row in self.data["inspirations"]:
        iid, cid, desc = row[0], row[1], row[2]
        created, updated = row[3], row[4]
        image, uid, title = row[5], row[6], row[7]
        uri = P.rdf.ic(po.Inspiration, self.snapshotid + "-" + str(iid),
                       self.translation_graph, self.snapshoturi)
        pending.extend([
            (uri, po.competition,
             po.Competition + '#' + self.snapshotid + '-' + str(cid)),
            (uri, po.description, desc),
            (uri, po.createdAt, created),
            (uri, po.participant,
             po.Participant + '#' + self.snapshotid + '-' + str(uid)),
            (uri, po.title, title),
            (uri, po.filename, image),
        ])
        # record an update timestamp only when the row was actually edited
        if updated != created:
            pending.append((uri, po.updatedAt, updated))
        nrows += 1
        if nrows % 60 == 0:
            c("finished inspiration entries:", nrows,
              "ntriples:", len(pending))
            P.add(pending, self.translation_graph)
            c("finished add of inspiration entries")
            pending = []
    if pending:
        P.add(pending, self.translation_graph)
        c("finished add of inspiration entries")
def rdfsInferenceIterate(data_context=None, ontology_context=None,
                         inferred_context=None):
    """One pass of naive RDFS inference over the percolation graph.

    Materializes consequences of rdfs:subClassOf, rdfs:subPropertyOf,
    rdfs:domain and rdfs:range found in *ontology_context*, applied to
    the data in *data_context*, writing inferred triples into
    *inferred_context*.
    """
    contexts = [i.identifier.lower() for i in P.context()]
    if data_context not in contexts:
        c("no data context")
    if ontology_context not in contexts:
        c("no ontology context")
    if inferred_context not in contexts:
        c("inferred context to be created context:", inferred_context)
    # rdfs:subClassOf — members of the subclass are members of the superclass
    for subject, foo, object_ in P.percolation_graph.triples(
            (None, NS.rdfs.subClassOf, None), context=ontology_context):
        for individual, footype, foosubject in P.percolation_graph.triples(
                (None, a, subject), context=data_context):
            P.add((individual, a, object_), context=inferred_context)
        # NOTE(review): also re-asserts statements whose *object* is the
        # subclass with the superclass substituted — and the loop rebinds
        # ``subject`` (harmless here since the query pattern is evaluated
        # once, and the outer loop reassigns it) — confirm this is intended
        for foosubject, fooproperty, subject in P.percolation_graph.triples(
                (None, None, subject), context=data_context):
            P.add((foosubject, fooproperty, object_),
                  context=inferred_context)
    c("finished subclass reasoning")
    # rdfs:subPropertyOf — statements using the subproperty also hold for
    # the superproperty
    for subject, foo, object_ in P.percolation_graph.triples(
            (None, NS.rdfs.subPropertyOf, None), context=ontology_context):
        c(subject, foo, object_)
        for subject2, propertyfoo, object2 in P.percolation_graph.triples(
                (None, subject, None), context=data_context):
            c(subject2, propertyfoo, object2)
            P.add((subject2, object_, object2), context=inferred_context)
    c("finished subproperty reasoning")
    # rdfs:domain — subjects of the property are typed with its domain
    for subject, foo, object_ in P.percolation_graph.triples(
            (None, NS.rdfs.domain, None), context=ontology_context):
        for subject2, predicatefoo, objectfoo in P.percolation_graph.triples(
                (None, subject, None), context=data_context):
            P.add((subject2, a, object_), context=inferred_context)
    c("finished domain reasoning")
    # rdfs:range — objects of the property are typed with its range
    for subject, foo, object_ in P.percolation_graph.triples(
            (None, NS.rdfs.range, None), context=ontology_context):
        for subjectfoo, predicatefoo, object2 in P.percolation_graph.triples(
                (None, subject, None), context=data_context):
            P.add((object2, a, object_), context=inferred_context)
    c("finished range reasoning")
def translateTags(self):
    """Translate tag records into po:Tag individuals.

    Triples are flushed to the translation graph in batches of 160
    rows; any remainder is added after the loop.
    """
    pending = []
    for row_number, row in enumerate(self.data["tags"], 1):
        tag_id, tag_text, tag_relevance = row[0], row[1], row[2]
        tag_uri = P.rdf.ic(po.Tag, self.snapshotid + "-" + str(tag_id),
                           self.translation_graph, self.snapshoturi)
        pending.extend([
            (tag_uri, po.text, tag_text),
            (tag_uri, po.relevance, tag_relevance),
        ])
        if row_number % 160 == 0:
            c("finished tag entries:", row_number,
              "ntriples:", len(pending))
            P.add(pending, self.translation_graph)
            c("finished add of tag entries")
            pending = []
    if pending:
        P.add(pending, self.translation_graph)
        c("finished add of tag entries")
def translateTaggings(self):
    """Translate tagging records (tag applications) into po:Tagging.

    Each row links a tag to its tagger and to the tagged resource — a
    Topic when ``ttype`` is "Topico", otherwise a Macrotag. Triples are
    flushed to the translation graph every 160 rows.
    """
    count = 0
    triples = []
    for tagging in self.data["taggings"]:
        tid_ = tagging[0]
        tid = tagging[1]
        toid = tagging[2]
        uid = tagging[3]
        ttype = tagging[5]
        created = tagging[7]
        # fix: removed a dead assignment that built the URI by hand and
        # was immediately overwritten by this P.rdf.ic() result
        uri = P.rdf.ic(po.Tagging, self.snapshotid+"-"+str(tid_),
                       self.translation_graph, self.snapshoturi)
        triples += [
            (uri, po.tag, po.Tag+"#"+self.snapshotid+'-'+str(tid)),
            (uri, po.tagger,
             po.Participant+"#"+self.snapshotid+'-'+str(uid)),
            (uri, po.createdAt, created)
        ]
        if ttype == "Topico":
            # tagging -> topico
            triples.append((uri, po.tagged,
                            po.Topic+'#'+self.snapshotid+'-'+str(toid)))
        else:
            triples.append((uri, po.tagged,
                            po.Macrotag+"#"+self.snapshotid+'-'+str(toid)))
        count += 1
        if count % 160 == 0:
            c("finished tagging entries:", count, "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            c("finished add of tagging entries")
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
        c("finished add of tagging entries")
def makeMetadata(self):
    """Write snapshot-level metadata for a facebook friendship snapshot.

    Copies the snapshot's (and its raw files') triples from the
    social_facebook context into the meta graph, records the friendship
    file names/URLs and counts, and adds a human-readable description.
    """
    # mirror everything already known about this snapshot into meta_graph
    triples = P.get(self.snapshoturi, None, None, "social_facebook")
    for rawfile in P.get(self.snapshoturi, po.rawFile, None,
                         "social_facebook", strict=True, minimized=True):
        triples += P.get(rawfile, None, None, "social_facebook")
    P.add(triples, context=self.meta_graph)
    # filenames for the original and rendered friendship data
    self.ffile = "base/"+self.filename_friendships
    self.frdf = self.snapshotid+"Friendship.rdf"
    self.fttl = self.snapshotid+"Friendship.ttl"
    triples = [
        (self.snapshoturi, po.onlineOriginalFriendshipFile,
         self.online_prefix+self.ffile),
        (self.snapshoturi, po.originalFriendshipFileName, self.ffile),
        (self.snapshoturi, po.onlineFriendshipXMLFile,
         self.online_prefix+self.frdf),
        (self.snapshoturi, po.onlineFriendshipTTLFile,
         self.online_prefix+self.fttl),
        (self.snapshoturi, po.friendshipXMLFileName, self.frdf),
        (self.snapshoturi, po.friendshipTTLFileName, self.fttl),
        (self.snapshoturi, po.nFriends, self.nfriends),
        (self.snapshoturi, po.nFriendships, self.nfriendships),
        (self.snapshoturi, po.friendshipsAnonymized,
         self.friendships_anonymized),
    ]
    P.add(triples, context=self.meta_graph)
    # one triple per participant attribute variable
    # NOTE(review): "frienship" (missing d) is the predicate's actual
    # spelling; presumably kept for compatibility with existing data —
    # confirm before renaming
    P.rdf.triplesScaffolding(self.snapshoturi,
                             [po.frienshipParticipantAttribute]*len(self.friendsvars),
                             self.friendsvars, context=self.meta_graph)
    self.mrdf = self.snapshotid+"Meta.rdf"
    self.mttl = self.snapshotid+"Meta.ttl"
    # human-readable summary stored as rdfs:comment below
    self.desc = "facebook network with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
        self.snapshotid, self.snapshoturi, self.isego, self.isgroup,)
    self.desc += "\nisFriendship: {}".format(self.isfriendship)
    self.desc += "; nFriends: {}; nFrienships: {}.".format(
        self.nfriends, self.nfriendships,)
    self.desc += "\nisInteraction: {}".format(self.isinteraction)
    self.desc += "\nisPost: {} (alias hasText: {})".format(
        self.hastext, self.hastext)
    triples = [
        (self.snapshoturi, po.triplifiedIn, datetime.datetime.now()),
        (self.snapshoturi, po.triplifiedBy, "scripts/"),
        (self.snapshoturi, po.donatedBy, self.snapshotid[:-4]),
        (self.snapshoturi, po.availableAt, self.online_prefix),
        (self.snapshoturi, po.onlineMetaXMLFile,
         self.online_prefix+self.mrdf),
        (self.snapshoturi, po.onlineMetaTTLFile,
         self.online_prefix+self.mttl),
        (self.snapshoturi, po.metaXMLFileName, self.mrdf),
        (self.snapshoturi, po.metaTTLFileName, self.mttl),
        (self.snapshoturi, po.acquiredThrough, "Netvizz"),
        (self.snapshoturi, po.socialProtocolTag, "Facebook"),
        (self.snapshoturi, po.socialProtocol,
         P.rdf.ic(po.Platform, "Facebook", self.meta_graph,
                  self.snapshoturi)),
        (self.snapshoturi, NS.rdfs.comment, self.desc),
    ]
    P.add(triples, self.meta_graph)
def makeMetadata(self):
    """Write snapshot-level metadata for an IRC channel log snapshot.

    Builds a human-readable description and a small set of flag/identity
    triples in the meta graph. Large portions of the statistics and
    file-listing metadata are currently disabled (kept commented out).
    """
    # triples = P.get(self.snapshoturi, None, None, self.social_graph)
    # for rawfile in P.get(self.snapshoturi, po.rawFile, None, self.social_graph, strict=True, minimized=True):
    #     triples += P.get(rawfile, None, None, self.social_graph)
    # self.totalchars = sum(self.nchars_all)
    # self.mcharsmessages = n.mean(self.nchars_all)
    # self.dcharsmessages = n.std(self.nchars_all)
    # self.totaltokens = sum(self.ntokens_all)
    # self.mtokensmessages = n.mean(self.ntokens_all)
    # self.dtokensmessages = n.std(self.ntokens_all)
    # self.totalsentences = sum(self.nsentences_all)
    # self.msentencesmessages = n.mean(self.nsentences_all)
    # self.dsentencesmessages = n.std(self.nsentences_all)
    # self.nparticipants = len(self.NICKS)
    # self.nmessages = len(self.messageids)
    # self.ntriples = len(P.context(self.irc_graph))
    # triples = [
    #     (self.snapshoturi, po.numberOfParticipants, self.nparticipants),
    #     (self.snapshoturi, po.numberOfMessages, self.nmessages),
    #     (self.snapshoturi, po.numberOfDirectMessages, self.ndirect),
    #     (self.snapshoturi, po.numberOfUserMentions, self.nmention),
    #     (self.snapshoturi, po.numberOfChars, self.totalchars),
    #     (self.snapshoturi, po.meanChars, self.mcharsmessages),
    #     (self.snapshoturi, po.deviationChars, self.dcharsmessages),
    #     (self.snapshoturi, po.numberOfTokens, self.totaltokens),
    #     (self.snapshoturi, po.meanTokens, self.mtokensmessages),
    #     (self.snapshoturi, po.deviationTokens, self.dtokensmessages),
    #     (self.snapshoturi, po.numberOfSentences, self.totalsentences),
    #     (self.snapshoturi, po.meanSentences, self.msentencesmessages),
    #     (self.snapshoturi, po.deviationSentences, self.dsentencesmessages),
    # ]
    # P.add(triples, context=self.meta_graph)
    # P.rdf.triplesScaffolding(
    #     self.snapshoturi,
    #     [po.ircParticipantAttribute]*len(self.participantvars),
    #     self.participantvars, context=self.meta_graph
    # )
    # P.rdf.triplesScaffolding(
    #     self.snapshoturi,
    #     [po.logXMLFilename]*len(self.log_xml)+[po.logTTLFilename]*len(self.log_ttl),
    #     self.log_xml+self.log_ttl, context=self.meta_graph
    # )
    # P.rdf.triplesScaffolding(
    #     self.snapshoturi,
    #     [po.onlineLogXMLFile]*len(self.log_xml)+[po.onlineLogTTLFile]*len(self.log_ttl),
    #     [self.online_prefix+i for i in self.log_xml+self.log_ttl], context=self.meta_graph
    # )
    # rendered metadata file names
    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"
    # human-readable summary stored as po:comment below
    self.desc = "irc dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
        self.snapshotid, self.snapshoturi, self.isego, self.isgroup, )
    self.desc += "\nisFriendship: {}; ".format(self.isfriendship)
    self.desc += "isInteraction: {}.".format(self.isinteraction)
    # self.desc += "\nnParticipants: {}; nInteractions: {} (directed messages+user mentions).".format(
    #     self.nparticipants, self.ndirect+self.nmention)
    self.desc += "\nisPost: {} (alias hasText: {})".format(
        self.hastext, self.hastext)
    # self.desc += "\nnumberOfMessages: {}; ".format(self.nmessages)
    # self.desc += "nDirectedMessages: {}; numberOfUserMentions: {};".format(self.ndirect, self.nmention)
    # self.desc += "\nnumberOfChars: {}; meanChars: {}; deviationChars: {}.".format(
    #     self.totalchars, self.mcharsmessages, self.dcharsmessages)
    # self.desc += "\nnumberOfTokens: {}; meanTokens: {}; deviationTokens: {};"
    #     self.totaltokens, self.mtokensmessages, self.dtokensmessages)
    # self.desc += "\nnSentencesOverall: {}; meanSentences: {}; deviationSentences: {};".format(
    #     self.totalsentences, self.msentencesmessages, self.dsentencesmessages)
    # self.desc += "\nnumberOfURLs: {}; numberOfAAMessages {}.".format(self.nurls, self.naamessages)
    # flags are hard-coded: IRC logs are always group interaction posts
    triples = [
        (self.snapshoturi, po.triplifiedIn, datetime.datetime.now()),
        (self.snapshoturi, a, po.Snapshot),
        (self.snapshoturi, po.snapshotID, self.snapshotid),
        (self.snapshoturi, po.isEgo, False),
        (self.snapshoturi, po.isGroup, True),
        (self.snapshoturi, po.isFriendship, False),
        (self.snapshoturi, po.isInteraction, True),
        (self.snapshoturi, po.isPost, True),
        # channel name recovered from the snapshot id
        (self.snapshoturi, po.channel, '#' +
         self.snapshotid.replace('irc-legacy-', '')),
        # (self.snapshoturi, po.triplifiedBy, "scripts/"),
        # (self.snapshoturi, po.donatedBy, self.snapshotid[:-4]),
        # (self.snapshoturi, po.availableAt, self.online_prefix),
        # (self.snapshoturi, po.onlineMetaXMLFile, self.online_prefix+self.mrdf),
        # (self.snapshoturi, po.onlineMetaTTLFile, self.online_prefix+self.mttl),
        # (self.snapshoturi, po.metaXMLFileName, self.mrdf),
        # (self.snapshoturi, po.metaTTLFileName, self.mttl),
        # (self.snapshoturi, po.totalXMLFileSizeMB, sum(self.size_xml)),
        # (self.snapshoturi, po.totalTTLFileSizeMB, sum(self.size_ttl)),
        (self.snapshoturi, po.acquiredThrough, "channel text log"),
        (self.snapshoturi, po.socialProtocol, "IRC"),
        # (self.snapshoturi, po.socialProtocolTag, "IRC"),
        # (self.snapshoturi, po.socialProtocol, P.rdf.ic(
        #     po.Platform, "IRC", self.meta_graph, self.snapshoturi)),
        # (self.snapshoturi, po.numberOfTriples, self.ntriples),
        (self.snapshoturi, po.comment, self.desc),
    ]
    P.add(triples, self.meta_graph)
def rdfFriendshipNetwork(self, fnet):
    """Triplify a Netvizz friendship network into the friendship graph.

    Writes one po:Participant (with a po:Observation holding its
    attributes) per individual and one po:Friendship per relation.
    Networks whose labels are all fake ("user...") are treated as
    anonymized: label/name columns are dropped from the observations and
    participants are named by snapshot-local ids.
    """
    # anonymized iff every label is a fake "user..." name
    if sum([("user" in i) for i in fnet["individuals"]["label"]]) == \
            len(fnet["individuals"]["label"]):
        # fake names and local ids
        self.friendships_anonymized = True
    else:
        self.friendships_anonymized = False
    tkeys = list(fnet["individuals"].keys())
    if "groupid" in tkeys:
        self.groupid = fnet["individuals"]["groupid"][0]
        tkeys.remove("groupid")
    else:
        self.groupid = None
    if self.friendships_anonymized:
        self.friendsvars = [trans[i] for i in tkeys
                            if i not in ('label', 'name')]
    else:
        self.friendsvars = [trans[i] for i in tkeys]
    insert = {"uris": [], "vals": []}
    # values for each participant are in the same order as insert['uris']
    for tkey in tkeys:
        # fix: replaced eval("po."+name) with getattr — same attribute
        # lookup on the po namespace without evaluating arbitrary code
        insert["uris"].append(getattr(po, trans[tkey]))
        insert["vals"].append(fnet["individuals"][tkey])
    self.nfriends = len(insert["vals"][0])
    iname = tkeys.index("name")
    ilabel = tkeys.index("label")
    for vals_ in zip(*insert["vals"]):
        if self.friendships_anonymized:
            if vals_[ilabel] and ("user" not in vals_[ilabel]):
                raise ValueError("Anonymized networks should have no "
                                 "informative name. Found: " + vals_[ilabel])
            # fix: was self.self.snapshotid (AttributeError at runtime)
            name_ = "{}-{}".format(self.snapshotid, vals_[iname])
            # drop label/name columns from the observation
            insert_uris_ = [el for i, el in enumerate(insert['uris'])
                            if i not in (ilabel, iname)]
            vals_ = [el for i, el in enumerate(vals_)
                     if (i not in (ilabel, iname))]
            obsname = '{}-{}'.format(self.snapshotid, self.observation_count)
            self.observation_count += 1
        else:
            name_ = "{}-{}".format(self.provenance_prefix, vals_[iname])
            insert_uris_ = [el for i, el in enumerate(insert['uris'])]
            vals_ = [el for i, el in enumerate(vals_)]
            obsname = '{}-{}'.format(self.snapshotid, vals_[iname])
        ind = P.rdf.ic(po.Participant, name_,
                       self.friendship_graph, self.snapshoturi)
        obs = P.rdf.ic(po.Observation, obsname,
                       self.friendship_graph, self.snapshoturi)
        P.add([(ind, po.observation, obs)], self.friendship_graph)
        P.rdf.triplesScaffolding(obs, insert_uris_, vals_,
                                 self.friendship_graph)
    c("participants written")
    friendships_ = [fnet["relations"][i] for i in ("node1", "node2")]
    i = 0
    for uid1, uid2 in zip(*friendships_):
        # sort the pair so the friendship label is order-independent
        uids_ = [uid1, uid2]
        uids_.sort()
        if self.friendships_anonymized:
            flabel = "{}-{}-{}".format(self.snapshotid, *uids_)
            uids = [r.URIRef(po.Participant + "#{}-{}".format(
                self.snapshotid, i)) for i in (uid1, uid2)]
        else:
            flabel = "{}-{}-{}".format(self.provenance_prefix, *uids_)
            uids = [r.URIRef(po.Participant + "#{}-{}".format(
                self.provenance_prefix, i)) for i in (uid1, uid2)]
        friendship_uri = P.rdf.ic(po.Friendship, flabel,
                                  self.friendship_graph, self.snapshoturi)
        P.rdf.triplesScaffolding(friendship_uri, [po.member]*2, uids,
                                 self.friendship_graph)
        i += 1
        if (i % 1000) == 0:
            c("friendships", i)
    self.nfriendships = len(friendships_[0])
    c("friendships written")
def probeOntology(endpoint_url, graph_urns, final_dir, one_datatype=True):
    """Probe a SPARQL endpoint and derive a minimal OWL ontology.

    Inspects instance data in *graph_urns* to find classes, property
    domains/ranges, functional properties and existential/universal
    restrictions; draws a diagram of the result with graphviz and
    serializes the derived ontology into *final_dir*.

    Returns ``locals()`` so callers can inspect intermediate results.
    """
    if not os.path.isdir(final_dir):
        os.makedirs(final_dir)
    client = P.rdf.sparql.classes.LegacyClient(endpoint_url)
    from_ = ''
    for graph_urn in graph_urns:
        from_ += '\nFROM <%s>' % (graph_urn, )

    def mkQuery(query, plain=True):
        # inject the FROM clauses right before the WHERE keyword
        query_ = query.split('WHERE')
        query__ = (query_[0], from_, '\nWHERE ' + query_[1])
        query___ = ''.join(query__)
        result = client.retrieveQuery(query___)
        if plain:
            return pl(result)
        else:
            return result['results']['bindings']

    def pick_object(vals, objs):
        # prefer a datatype, fall back to an object class; 0 means "none"
        if len(vals):
            return list(vals)[0]
        if len(objs):
            return list(objs)[0]
        return 0

    c('find all classes')
    q = "SELECT DISTINCT ?class WHERE { ?s a ?class . }"
    classes = mkQuery(q)
    c('antecedents, consequents and restrictions of each class')
    neighbors = {}
    triples = []
    existential_restrictions = {}
    universal_restrictions = {}
    for aclass in classes:
        # classes/properties pointing *to* instances of aclass
        q = "SELECT DISTINCT ?cs ?p WHERE { ?i a <%s> . ?s ?p ?i . OPTIONAL { ?s a ?cs . } }" % (
            aclass, )
        antecedent_property = mkQuery(q)
        # properties leaving aclass instances, with object class or datatype
        q = "SELECT DISTINCT ?ap ?co (datatype(?o) as ?do) WHERE { ?i a <%s> . ?i ?ap ?o . OPTIONAL { ?o a ?co . } }" % (
            aclass, )
        consequent_property__ = mkQuery(q, 0)
        consequent_property = [[i['ap']['value'], i['do']['value']]
                               for i in consequent_property__ if 'do' in i]
        consequent_property_ = [[i['ap']['value'], i['co']['value']]
                                for i in consequent_property__ if 'co' in i]
        neighbors[aclass] = (antecedent_property,
                             consequent_property + consequent_property_)
        # class restrictions
        q = "SELECT DISTINCT ?p WHERE {?s a <%s>. ?s ?p ?o .}" % (aclass, )
        props_c = mkQuery(q)
        q = "SELECT (COUNT(DISTINCT ?s) as ?cs) WHERE {?s a <%s>}" % (
            aclass, )
        ninds = pl(client.retrieveQuery(q))[0]
        for pc in props_c:
            if '22-rdf-syntax' in pc:  # skip rdf:type itself
                continue
            q = "SELECT DISTINCT ?co (datatype(?o) as ?do) WHERE {?s a <%s>. ?s <%s> ?o . OPTIONAL {?o a ?co . }}" % (
                aclass, pc)
            inds2 = mkQuery(q, 0)
            objs = set([i["co"]["value"] for i in inds2 if "co" in i.keys()])
            vals = set([i["do"]["value"] for i in inds2 if "do" in i.keys()])
            q = "SELECT (COUNT(DISTINCT ?s) as ?cs) WHERE {?s a <%s>. ?s <%s> ?o . }" % (
                aclass, pc)
            ninds2 = pl(client.retrieveQuery(q))[0]
            if ninds == ninds2:  # every instance has pc: existential
                ob = pick_object(vals, objs)
                if ob:
                    B = r.BNode()
                    triples += [(aclass, rdfs.subClassOf, B),
                                (B, a, owl.Restriction),
                                (B, owl.onProperty, pc),
                                (B, owl.someValuesFrom, ob)]
                    existential_restrictions.setdefault(
                        aclass, []).append((pc, ob))
            q = "SELECT (COUNT(DISTINCT ?s) as ?cs) WHERE { ?s <%s> ?o . ?s a ?ca . FILTER(str(?ca) != '%s') }" % (
                pc, aclass)
            ninds3 = pl(client.retrieveQuery(q))[0]
            if ninds3 == 0:  # only aclass instances use pc: universal
                ob = pick_object(vals, objs)
                if ob:
                    B = r.BNode()
                    triples += [(aclass, rdfs.subClassOf, B),
                                (B, a, owl.Restriction),
                                (B, owl.onProperty, pc),
                                (B, owl.allValuesFrom, ob)]
                    universal_restrictions.setdefault(
                        aclass, []).append((pc, ob))
    del q, aclass, antecedent_property, consequent_property
    c('find properties')
    q = "SELECT DISTINCT ?p WHERE {?s ?p ?o}"
    properties = mkQuery(q)
    c('check if property is functional and get range and domain')
    functional_properties = set()
    for prop in properties:
        # functional iff every subject has exactly one value
        q = 'SELECT DISTINCT (COUNT(?o) as ?co) WHERE { ?s <%s> ?o } GROUP BY ?s' % (
            prop, )
        is_functional = mkQuery(q)
        if len(is_functional) == 1 and is_functional[0] == 1:
            triples.append((prop, a, owl.FunctionalProperty))
            functional_properties.add(prop)
        # datatype or object properties
        suj = mkQuery("SELECT DISTINCT ?cs WHERE { ?s <%s> ?o . ?s a ?cs . }"
                      % (prop, ))
        obj1 = mkQuery("SELECT DISTINCT ?co WHERE { ?s <%s> ?o . ?o a ?co . }"
                       % (prop, ))
        obj2 = mkQuery("SELECT DISTINCT (datatype(?o) as ?do) WHERE { ?s <%s> ?o . }"
                       % (prop, ))
        obj = obj1 + obj2
        if len(obj) and ("XMLS" in obj[0]):
            # fix: the OWL vocabulary term is owl:DatatypeProperty;
            # owl:DataProperty does not exist
            triples.append((prop, a, owl.DatatypeProperty))
        else:
            triples.append((prop, a, owl.ObjectProperty))
        if len(suj) > 1:
            B = r.BNode()
            triples.append((prop, rdfs.domain, B))
            for ss in suj:
                triples.append((B, owl.unionOf, ss))
        elif suj:
            triples.append((prop, rdfs.domain, suj[0]))
        if len(obj) > 1:
            B = r.BNode()
            triples.append((prop, rdfs.range, B))
            # fix: the original iterated ``suj`` here (copy-paste bug),
            # putting domain classes into the range union
            for oo in obj:
                triples.append((B, owl.unionOf, oo))
        elif obj:
            triples.append((prop, rdfs.range, obj[0]))
    # Drawing
    c('started drawing')
    A = gv.AGraph(directed=True, strict=False)
    q = """PREFIX po: <http://purl.org/socialparticipation/po/>
    SELECT DISTINCT ?snap WHERE { { ?i po:snapshot ?snap } UNION { ?snap po:snapshotID ?idfoo } }"""
    snap = mkQuery(q)[0]
    q = """PREFIX po: <http://purl.org/socialparticipation/po/>
    SELECT ?provenance WHERE { <%s> po:socialProtocol ?provenance }""" % (snap)
    # provenance feeds the (currently disabled) overall graph label
    provenance = pl(client.retrieveQuery(q))[0]
    edge_counter = 1
    node_counter = 1
    data_nodes = {}
    for aclass in classes:
        aclass_ = aclass.split('/')[-1]
        if aclass_ not in A.nodes():
            A.add_node(aclass_, style="filled")
            n = A.get_node(aclass_)
            n.attr['color'] = "#A2F3D1"
        neigh = neighbors[aclass]
        for i in range(len(neigh[1])):  # consequents
            label = neigh[1][i][1].split("/")[-1]
            elabel = neigh[1][i][0]
            elabel_ = elabel.split('/')[-1]
            if "XMLS" in label:
                # datatype node: numbered so datatypes can be merged or
                # duplicated depending on one_datatype
                color = "#FFE4AA"
                if one_datatype:
                    if label in data_nodes:
                        label_ = data_nodes[label]
                    else:
                        label_ = node_counter
                        node_counter += 1
                        data_nodes[label] = label_
                else:
                    label_ = node_counter
                    node_counter += 1
            else:
                label_ = label
                color = "#A2F3D1"
            if label_ not in A.nodes():
                A.add_node(label_, style="filled")
                n = A.get_node(label_)
                n.attr['label'] = label.split("#")[-1]
                n.attr['color'] = color
            ekey = '{}-{}-{}'.format(aclass_, label_, edge_counter)
            edge_counter += 1
            A.add_edge(aclass_, label_, ekey)
            e = A.get_edge(aclass_, label_, key=ekey)
            e.attr["label"] = elabel_
            e.attr["color"] = color
            e.attr["penwidth"] = 2
            # dashed = non-functional; green = existential; inverted
            # arrowhead = universal restriction
            if r.URIRef(elabel) not in functional_properties:
                e.attr["style"] = "dashed"
            if aclass in existential_restrictions.keys():
                restrictions = existential_restrictions[aclass]
                prop = [iii[0] for iii in restrictions]
                if r.URIRef(elabel) in prop:
                    e.attr["color"] = "#A0E0A0"
            if aclass in universal_restrictions.keys():
                restrictions = universal_restrictions[aclass]
                prop = [iii[0] for iii in restrictions]
                if r.URIRef(elabel) in prop:
                    e.attr["arrowhead"] = "inv"
                    e.attr["arrowsize"] = 2.
    A.draw(os.path.join(final_dir, "draw.png"), prog="dot")
    try:
        A.draw(os.path.join(final_dir, "draw_circo.png"), prog="circo")
    except Exception:  # fix: bare except also swallowed KeyboardInterrupt
        pass
    A.draw(os.path.join(final_dir, "draw_twopi.png"), prog="twopi",
           args="-Granksep=4")
    A.write(os.path.join(final_dir, "draw.dot"))
    P.start(False)
    P.context('ontology', 'remove')
    P.add(triples, 'ontology')
    g = P.context('ontology')
    g.serialize(os.path.join(final_dir, 'ontology.owl'))
    g.serialize(os.path.join(final_dir, 'ontology.ttl'), 'turtle')
    return locals()
def parseLegacyFiles(data_dir=DATADIR + "twitter/"):
    """Parse legacy pickle files with Twitter tweets.

    Registers one po:Snapshot (with flags and raw-file link) per pickle
    file found in *data_dir* into the 'social_twitter' context, then
    logs summary counts queried back from that context. Returns the set
    of snapshot URIs.
    """
    filenames = os.listdir(data_dir)
    # skip the ipython log and editor swap files
    filenames = [
        i for i in filenames
        if i != "ipython_log.py" and not i.endswith(".swp")
    ]
    snapshots = set()
    triples = []
    for filename in filenames:
        snapshotid = "twitter-legacy-" + filename.replace("_", "").replace(
            'tw.pickle', '')
        snapshoturi = po.Snapshot + "#" + snapshotid
        # NOTE(review): the next four locals only feed triples that are
        # currently commented out below — kept for when those are
        # re-enabled; confirm before removing
        expressed_classes = [po.Participant, po.Tweet]
        expressed_reference = filename.replace("_", "").replace(".pickle", "")
        name_humanized = "Twitter " + expressed_reference
        filesize = os.path.getsize(data_dir + filename) / 10**6  # MB
        fileformat = "pickle"
        fileuri = po.File + "#twitter-file-" + filename
        triples += [
            (snapshoturi, a, po.Snapshot),
            # (snapshoturi, a, po.TwitterSnapshot),
            (snapshoturi, po.snapshotID, snapshotid),
            (snapshoturi, po.isEgo, False),
            (snapshoturi, po.isGroup, True),
            (snapshoturi, po.isFriendship, False),
            (snapshoturi, po.isInteraction, True),
            (snapshoturi, po.isPost, True),
            # (snapshoturi, po.humanizedName, name_humanized),
            # (snapshoturi, po.expressedReference, expressed_reference),
            (snapshoturi, po.rawFile, fileuri),
            # (fileuri, po.fileSize, filesize),
            (fileuri, po.fileName, filename),
            # (fileuri, po.fileFormat, fileformat),
        ] + [
            # (fileuri, po.expressedClass, expressed_class) for
            # expressed_class in expressed_classes
        ]
        snapshots.add(snapshoturi)
    nfiles = len(filenames)
    nsnapshots = len(snapshots)
    # rebuild the social_twitter context from scratch
    P.context("social_twitter", "remove")
    platformuri = P.rdf.ic(po.Platform, "Twitter", context="social_twitter")
    triples += [
        (NS.social.Session, NS.social.nTwitterParsedFiles, nfiles),
        (NS.social.Session, NS.social.nTwitterSnapshots, nsnapshots),
        (platformuri, po.dataDir, data_dir),
    ]
    P.add(triples, context="social_twitter")
    c("parsed {} twitter files ({} snapshots) are in percolation graph \
and 'social_twitter' context".format(nfiles, nsnapshots))
    c("percolation graph have {} triples ({} in social_twitter context)".
      format(len(P.percolation_graph), len(P.context("social_twitter"))))
    # summary counts queried back from the freshly written context
    negos = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isEgo true } } "
    )
    ngroups = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isGroup true } } "
    )
    nfriendships = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isFriendship true } } "
    )
    ninteractions = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isInteraction true } } "
    )
    nposts = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isPost true } } "
    )
    # NOTE(review): po:fileSize triples are commented out above, so this
    # presumably sums an empty result — confirm intended
    totalsize = sum(
        P.query(
            r" SELECT ?size WHERE { GRAPH <social_twitter> { ?s po:fileSize ?size } } "
        ))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship network. {} have an interaction network. \
{} have post texts and reaction counts.
Total raw data size is {:.2f}MB"""
      .format(negos, ngroups, nfriendships, ninteractions, nposts,
              totalsize))
    return snapshots
def makeMetadata(self):
    """Write metadata triples for this facebook snapshot to meta_graph.

    Copies the snapshot (and raw-file) triples from the 'social_facebook'
    context, then scaffolds file names/URLs, counters, attribute lists and
    the human-readable description for whichever of the friendship /
    interaction / posts facets this snapshot has.

    Raises:
        ValueError: when the friendship and interaction files carry
            different group ids.
    """
    if self.isfriendship and self.groupid and self.groupid2 and (
            self.groupid != self.groupid2):
        raise ValueError("Group IDS are different")
    # put all triples about the snapshot (and its raw files) from
    # social_facebook into self.meta_graph
    triples = P.get(self.snapshoturi, None, None, "social_facebook")
    for rawfile in P.get(self.snapshoturi, po.rawFile, None,
                         "social_facebook", strict=True, minimized=True):
        triples += P.get(rawfile, None, None, "social_facebook")
    P.add(triples, context=self.meta_graph)
    # foo pairs predicate URIs with values position by position; both
    # lists are fed to triplesScaffolding at the end
    foo = {"uris": [], "vals": []}
    if self.isfriendship:
        # NOTE: po.frienshipParticipantAttribute keeps the historical
        # misspelling ("frienship") used elsewhere in the ontology
        foo["uris"] += [
            po.onlineOriginalFriendshipFile,
            po.originalFriendshipFileName,
            po.onlineFriendshipXMLFile,
            po.onlineFriendshipTTLFile,
            po.friendshipXMLFileName,
            po.friendshipTTLFileName,
            po.nFriends,
            po.nFriendships,
            po.friendshipsAnonymized,
        ] + [po.frienshipParticipantAttribute] * len(self.friendsvars)
        self.ffile = "base/" + self.filename_friendships
        self.frdf = self.snapshotid + "Friendship.rdf"
        self.fttl = self.snapshotid + "Friendship.ttl"
        foo["vals"] += [
            self.online_prefix + self.ffile,
            self.ffile,
            self.online_prefix + self.frdf,
            self.online_prefix + self.fttl,
            self.frdf,
            self.fttl,
            self.nfriends,
            self.nfriendships,
            self.friendships_anonymized,
        ] + list(self.friendsvars)
    if self.isinteraction:
        foo["uris"] += [
            po.onlineOriginalInteractionFile,
            po.originalInteractionFileName,
            po.onlineInteractionXMLFile,
            po.onlineInteractionTTLFile,
            po.interactionXMLFileName,
            po.interactionTTLFileName,
            po.nInteracted,
            po.nInteractions,
            po.interactionsAnonymized,
        ] + [po.interactionParticipantAttribute] * len(self.interactionsvars)
        self.ifile = "base/" + self.filename_interactions
        self.irdf = irdf = self.snapshotid + "Interaction.rdf"
        self.ittl = ittl = self.snapshotid + "Interaction.ttl"
        # BUGFIX: values now follow the predicate order above (online URL
        # before local file name; nInteracted before nInteractions) --
        # both pairs were swapped and produced mislabeled metadata
        foo["vals"] += [
            self.online_prefix + self.ifile,
            self.ifile,
            self.online_prefix + irdf,
            self.online_prefix + ittl,
            irdf,
            ittl,
            self.ninteracted,
            self.ninteractions,
            self.interactions_anonymized,
        ] + list(self.interactionsvars)
    if self.hastext:
        foo["uris"] += [
            po.onlineOriginalPostsFile,
            po.originalPostsFileName,
            po.onlinePostsXMLFile,
            po.onlinePostsTTLFile,
            po.postsXMLFileName,
            po.postsTTLFileName,
            po.nPosts,
            po.nCharsOverall,
            po.mCharsOverall,
            po.dCharsOverall,
            po.nTokensOverall,
            po.mTokensOverall,
            po.dTokensOverall,
        ] + [po.postAttribute] * len(self.postsvars)
        self.pfile = "base/" + self.filename_posts
        self.prdf = self.snapshotid + "Post.rdf"
        self.pttl = self.snapshotid + "Post.ttl"
        foo["vals"] += [
            self.online_prefix + self.pfile,
            self.pfile,
            self.online_prefix + self.prdf,
            self.online_prefix + self.pttl,
            self.prdf,
            self.pttl,
            self.nposts,
            int(self.totalchars),
            self.mcharsposts,
            self.dcharsposts,
            int(self.totaltokens),
            self.mtokensposts,
            self.dtokensposts,
        ] + list(self.postsvars)
    foo["uris"] += [
        po.isGroup,
        po.isEgo,
        po.isFriendship,
        po.isInteraction,
        po.hasText,
        po.isPost,
    ]
    self.isego = bool(P.get(r.URIRef(self.snapshoturi), a, po.EgoSnapshot))
    self.isgroup = bool(
        P.get(r.URIRef(self.snapshoturi), a, po.GroupSnapshot))
    # po.hasText and po.isPost are aliases, hence self.hastext twice
    foo["vals"] += [
        self.isgroup,
        self.isego,
        self.isfriendship,
        self.isinteraction,
        self.hastext,
        self.hastext,
    ]
    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"
    self.desc = ("facebook network with snapshotID: {}\nsnapshotURI: {} \n"
                 "isEgo: {}. isGroup: {}.").format(
        self.snapshotid, self.snapshoturi, self.isego, self.isgroup)
    self.desc += "\nisFriendship: {}".format(self.isfriendship)
    if self.isfriendship:
        self.desc += "; nFriends: {}; nFrienships: {}.".format(
            self.nfriends, self.nfriendships)
    self.desc += "\nisInteraction: {}".format(self.isinteraction)
    if self.isinteraction:
        self.desc += "; nInteracted: {}; nInteractions: {}.".format(
            self.ninteracted, self.ninteractions)
    self.desc += "\nisPost: {} (alias hasText: {})".format(
        self.hastext, self.hastext)
    if self.hastext:
        # BUGFIX: the argument list previously started with self.nposts,
        # which shifted every value one label to the left (and silently
        # dropped totaltokens); labels and values now correspond
        self.desc += (";\nmCharsPostsOverall: {}; dCharsPostsOverall: {}; "
                      "totalCharsOverall: {}; \nmTokensPostsOverall: {}; "
                      "dTokensPostsOverall: {}; "
                      "totalTokensOverall: {}").format(
            self.mcharsposts,
            self.dcharsposts,
            self.totalchars,
            self.mtokensposts,
            self.dtokensposts,
            self.totaltokens,
        )
    # provenance predicates first, then the facet-specific ones collected
    # in foo; the two value lists are concatenated in the same order
    P.rdf.triplesScaffolding(self.snapshoturi, [
        po.triplifiedIn,
        po.triplifiedBy,
        po.donatedBy,
        po.availableAt,
        po.onlineMetaXMLFile,
        po.onlineMetaTTLFile,
        po.metaXMLFileName,
        po.metaTTLFileName,
        po.acquiredThrough,
        po.socialProtocolTag,
        po.socialProtocol,
        NS.rdfs.comment,
    ] + foo["uris"], [
        datetime.datetime.now(),
        "scripts/",
        self.snapshotid[:-4],
        self.online_prefix,
        self.online_prefix + self.mrdf,
        self.online_prefix + self.mttl,
        self.mrdf,
        self.mttl,
        "Netvizz",
        "Facebook",
        P.rdf.ic(po.Platform, "Facebook", self.meta_graph, self.snapshoturi),
        self.desc,
    ] + foo["vals"], self.meta_graph)
def writeAllTW(self):
    """Serialize the meta graph to Turtle/RDF-XML and write the README.

    Expects earlier stages to have filled the counters (nparticipants,
    nretweets, ntweets, ...), self.desc, self.dates and the file-name
    attributes (mrdf, mttl, tweet_rdf, tweet_ttl).
    """
    # write meta and readme with self.desc, finished.
    g=P.context(self.meta_graph)
    ntriples=len(g)
    # record the meta graph's own size inside itself before serializing
    triples=[
            (self.snapshoturi,po.nMetaTriples,ntriples) ,
             ]
    P.add(triples,context=self.meta_graph)
    g.namespace_manager.bind("po",po)
    g.serialize(self.final_path_+self.snapshotid+"Meta.ttl","turtle"); c("ttl")
    g.serialize(self.final_path_+self.snapshotid+"Meta.rdf","xml")
    c("serialized meta")
    # copy the script that generates this code
    if not os.path.isdir(self.final_path_+"scripts"):
        os.mkdir(self.final_path_+"scripts")
    shutil.copy(S.PACKAGEDIR+"/../tests/triplify.py",self.final_path_+"scripts/triplify.py")
    # copy of the base data
    tinteraction="""\n\n{} individuals with metadata {}
and {} interactions (retweets: {}, replies: {}, user_mentions: {})
constitute the interaction network in the RDF/XML file(s):
{}
and the Turtle file(s):
{}
(anonymized: {}).""".format(
            self.nparticipants,str(self.participantvars),
            self.nretweets+self.nreplies+self.nuser_mentions,self.nretweets,self.nreplies,self.nuser_mentions,
            self.tweet_rdf,
            self.tweet_ttl,
            self.interactions_anonymized)
    tposts="""\n\nThe dataset consists of {} tweets with metadata {}
{:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {}
{:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {}""".format(
            self.ntweets,str(self.tweetvars),
            self.mcharstweets,self.dcharstweets,self.totalchars,
            self.mtokenstweets,self.dtokenstweets,self.totaltokens,
            )
    # note: self.dates is replaced in place by ISO strings here
    self.dates=[i.isoformat() for i in self.dates]
    date1=min(self.dates)
    date2=max(self.dates)
    with open(self.final_path_+"README","w") as f:
        f.write("""::: Open Linked Social Data publication
\nThis repository is a RDF data expression of the twitter
snapshot {snapid} with tweets from {date1} to {date2}
(total of {ntrip} triples).{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is on the script/ directory.\n:::""".format(
                snapid=self.snapshotid,date1=date1,date2=date2,ntrip=self.ntriples,
                tinteraction=tinteraction,
                tposts=tposts,
                mrdf=self.mrdf,
                mttl=self.mttl,
                ise=self.isego,
                isg=self.isgroup,
                isf=self.isfriendship,
                isi=self.isinteraction,
                ist=self.hastext,
                ava=self.online_prefix,
                desc=self.desc
                ))
def parseLegacyFiles(data_dir=DATADIR):
    """Parse legacy directories of Gmane email-list archives.

    Each subdirectory of data_dir holds numerically named mbox message
    files; one po:GmaneSnapshot is written to the 'gmane' context per
    non-empty directory, then summary counts are queried back and logged.

    Returns:
        set: the snapshot URIs created.
    """
    data_dir = os.path.expanduser(data_dir)
    directories = os.listdir(data_dir)
    directories = [i for i in directories if os.path.isdir(data_dir + i)]
    snapshots = set()
    triples = []
    for directory in directories:
        # message files have purely numeric names
        all_files = [
            i for i in os.listdir(data_dir + directory) if i.isdigit()
        ]
        if all_files:
            all_files.sort()
            # snapshot id carries the first and last message numbers with
            # leading zeros stripped ("0" kept when the name is all zeros)
            foo = all_files[0].lstrip("0")
            if not foo:
                foo = "0"
            snapshotid = "legacy-" + directory + "-" + foo + "-" + all_files[
                -1].lstrip("0")
            snapshoturi = po.GmaneSnapshot + "#" + snapshotid
            expressed_classes = [
                po.GmaneParticipant, po.EmailPeer, po.EmailMessage
            ]
            expressed_reference = directory
            name_humanized = "Gmane email list with id " + expressed_reference
            # get size for all files in dir (megabytes)
            directorysize = sum(
                os.path.getsize(data_dir + directory + "/" + filename)
                for filename in os.listdir(data_dir + directory)) / 10**6
            nfiles = len(all_files)
            fileformat = "mbox"
            directoryuri = po.Directory + "#gmane-" + directory
            # NOTE(review): (snapshoturi, a, po.Snapshot) appears twice
            # below; harmless under RDF set semantics but redundant
            triples += [
                (snapshoturi, a, po.Snapshot),
                (snapshoturi, po.dataDir, data_dir),
                (snapshoturi, a, po.Snapshot),
                (snapshoturi, a, po.GmaneSnapshot),
                (snapshoturi, po.snapshotID, snapshotid),
                (snapshoturi, po.isEgo, False),
                (snapshoturi, po.isGroup, True),
                (snapshoturi, po.isFriendship, False),
                (snapshoturi, po.isInteraction, True),
                (snapshoturi, po.isPost, True),
                (snapshoturi, po.humanizedName, name_humanized),
                (snapshoturi, po.expressedReference, expressed_reference),
                (snapshoturi, po.rawDirectory, directoryuri),
                (directoryuri, po.directorySize, directorysize),
                (directoryuri, po.directoryName, directory),
                (directoryuri, po.fileFormat, fileformat),
            ] + [(directoryuri, po.expressedClass, expressed_class)
                 for expressed_class in expressed_classes]
            snapshots.add(snapshoturi)
    nsnapshots = ndirectories = len(directories)
    #P.context("gmane","remove")
    platformuri = P.rdf.ic(po.Platform, "Gmane", context="gmane")
    triples += [
        (NS.social.Session,
         NS.social.nGmaneParsedDirectories, ndirectories),
        (NS.social.Session, NS.social.nGmaneSnapshots, nsnapshots),
        (NS.social.Session, po.platform, platformuri),
    ]
    P.add(triples, context="gmane")
    c("parsed {} gmane data directories (=={} snapshots) are in percolation graph and 'gmane' context"
      .format(ndirectories, nsnapshots))
    c("percolation graph have {} triples ({} in gmane context)".format(
        len(P.percolation_graph), len(P.context("gmane"))))
    # summary counts are read back from the freshly written context
    negos = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isEgo true } } "
    )
    ngroups = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isGroup true } } "
    )
    nfriendships = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isFriendship true } } "
    )
    ninteractions = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isInteraction true } } "
    )
    nposts = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isPost true } } "
    )
    totalsize = sum(
        P.query(
            r" SELECT ?size WHERE { GRAPH <gmane> { ?s po:directorySize ?size } } "
        ))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship structures. {} have an interaction structures. {} have texts
Total raw data size is {:.2f}MB""".format(negos, ngroups, nfriendships,
                                          ninteractions, nposts, totalsize))
    return snapshots
def makeMetadata(self):
    """Record metadata triples for this ego facebook snapshot.

    Stores the anonymization flag plus the basic snapshot facts (id,
    flags, date obtained, name and any optional numeric/string id or
    url found in the parse graph) in the meta graph.  File-location
    and counter predicates are deliberately not emitted here.
    """
    # derived file names kept as attributes for later stages
    self.ffile = "base/" + self.filename_friendships
    self.frdf = self.snapshotid + "Friendship.rdf"
    self.fttl = self.snapshotid + "Friendship.ttl"
    P.add([(self.snapshoturi, po.friendshipsAnonymized,
            self.friendships_anonymized)], context=self.meta_graph)
    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"
    # human-readable description, also stored under po.comment below
    self.desc = ("facebook network with snapshotID: {}\nsnapshotURI: {} \n"
                 "isEgo: {}. isGroup: {}."
                 "\nisFriendship: {}"
                 "\nisInteraction: {}"
                 "\nisPost: {} (hasText)").format(
                     self.snapshotid, self.snapshoturi, self.isego,
                     self.isgroup, self.isfriendship, self.isinteraction,
                     self.hastext)
    snapshot_ref = r.URIRef(self.snapshoturi)
    date_obtained = P.get(snapshot_ref, po.dateObtained)[2].toPython()
    assert isinstance(date_obtained, datetime.date)
    name = P.get(snapshot_ref, po.name, None,
                 context=self.social_graph)[2]
    meta_triples = [
        (self.snapshoturi, po.triplifiedIn, datetime.datetime.now()),
        (self.snapshoturi, a, po.Snapshot),
        (self.snapshoturi, po.snapshotID, self.snapshotid),
        (self.snapshoturi, po.isEgo, True),
        (self.snapshoturi, po.isGroup, False),
        (self.snapshoturi, po.isFriendship, True),
        (self.snapshoturi, po.isInteraction, False),
        (self.snapshoturi, po.isPost, False),
        (self.snapshoturi, po.dateObtained, date_obtained),
        (self.snapshoturi, po.name, name),
        (self.snapshoturi, po.acquiredThrough, "Netvizz"),
        (self.snapshoturi, po.socialProtocol, "Facebook"),
        (self.snapshoturi, po.comment, self.desc),
    ]
    # optional identifiers are copied over only when present
    for predicate in (po.numericID, po.stringID, po.url):
        found = P.get(snapshot_ref, predicate, None,
                      context=self.social_graph)
        if found:
            meta_triples.append((self.snapshoturi, predicate, found[2]))
    P.add(meta_triples, self.meta_graph)
def makeMetadata(self):
    """Build and store the metadata triples for this AA snapshot.

    Queries the translation graph for the participant and message
    attribute predicates, composes the human-readable description
    (self.desc) and adds the file-name, counter and provenance triples
    to the meta graph.
    """
    self.makePostsTriples()
    # get participant and message vars from snapshot through queries
    self.participantvars = P.get("""SELECT DISTINCT ?p WHERE
        { GRAPH <%s>
        { ?fooparticipant po:snapshot <%s> .
          ?fooparticipant a po:Participant .
          ?fooparticipant ?p ?fooobject . } } """ % (
        self.translation_graph, self.snapshoturi))
    P.rdf.triplesScaffolding(
        self.snapshoturi,
        [po.ParticipantAttribute]*len(self.participantvars),
        self.participantvars, context=self.meta_graph)
    self.messagevars = P.get("""SELECT DISTINCT ?p WHERE
        { GRAPH <%s>
        { ?foomessage po:snapshot <%s> .
          ?foomessage a po:Message .
          ?foomessage ?p ?fooobject . } } """ % (
        self.translation_graph, self.snapshoturi))
    P.rdf.triplesScaffolding(
        self.snapshoturi,
        [po.MessageAttribute]*len(self.messagevars),
        self.messagevars, context=self.meta_graph)
    self.mrdf = self.snapshotid+"Meta.rdf"
    self.mttl = self.snapshotid+"Meta.ttl"
    self.desc = "dataset with snapshotID:\
{}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
        self.snapshotid, self.snapshoturi, self.isego, self.isgroup)
    self.desc += "\nisFriendship: {}; ".format(self.isfriendship)
    self.desc += "isInteraction: {}.".format(self.isinteraction)
    self.desc += "\nhasText: {}".format(self.hastext)
    # BUGFIX: the query previously embedded a literal backslash+newline
    # (raw-string continuation), which is not valid SPARQL
    self.nchecks = P.get(
        r"SELECT (COUNT(?checker) as ?cs) WHERE "
        r"{ ?foosession po:checkParticipant ?checker}",
        context=self.translation_graph)
    self.desc += "\nnParticipants: {}; nInteractions: {} \
(only session checks in first aa).".format(
        self.nparticipants, self.nchecks)
    self.desc += "\nnMessages: {}; ".format(self.nmessages)
    self.desc += "\nnCharsOverall: {}; mCharsOverall: {};\
dCharsOverall: {}.".format(self.totalchars,
                           self.mchars_messages,
                           self.dchars_messages)
    self.desc += "\nnTokensOverall: {}; mTokensOverall: {};\
dTokensOverall: {};".format(self.totaltokens,
                            self.mtokens_messages,
                            self.dtokens_messages)
    self.desc += "\nnSentencesOverall: {}; mSentencesOverall: {};\
dSentencesOverall: {};".format(
        self.totalsentences, self.msentences_messages,
        self.dsentences_messages)
    self.desc += "\nnURLs: {}; nAAMessages {}.".format(
        self.nurls, self.nmessages)
    # BUGFIX: the query was missing the closing braces of the
    # GRAPH/WHERE blocks and carried a stray backslash
    self.dates = P.get("SELECT ?date WHERE { GRAPH <%s> "
                       "{ ?fooshout po:createdAt ?date } }" % (
                           self.translation_graph,))
    # BUGFIX: was min(dates)/max(dates) on an undefined name
    self.desc += "\nReference timespan: {} to {}".format(
        min(self.dates), max(self.dates))
    self.desc += """\nRDF expression in the XML file(s):
{}
and the Turtle file(s):
{}
(anonymized: {}).""".format(self.translation_xml,
                            self.translation_ttl, self.anonymized)
    self.desc += """\nMetadata of this snapshot in the XML file(s):
{}
and the Turtle file(s):
{}.""".format(self.meta_xml, self.meta_ttl)
    # BUGFIX: .format() had no argument for its {} placeholder and
    # raised IndexError; the files live under the online prefix
    self.desc += """\nFiles should be available in: \n{}""".format(
        self.online_prefix)
    # BUGFIX: "n area countings" -> "n are countings" in emitted text
    self.desc += "\n\nNote: numeric variables starting with n are \
countings, with m are means and d are standard deviations."
    if isinstance(self.translation_xml, list):
        # several translation files: scaffold one triple per file name
        # and per online URI
        P.rdf.triplesScaffolding(
            self.snapshoturi,
            [po.translationXMLFilename]*len(self.translation_xml) +
            [po.translationTTLFilename]*len(self.translation_ttl),
            self.translation_xml+self.translation_ttl,
            context=self.meta_graph)
        P.rdf.triplesScaffolding(
            self.snapshoturi,
            [po.onlineTranslationXMLFileURI]*len(self.translation_xml) +
            [po.onlineTranslationTTLFileURI]*len(self.translation_ttl),
            [self.online_prefix+i for i in
             self.translation_xml+self.translation_ttl],
            context=self.meta_graph)
        triples = [
            (self.snapshoturi, po.translationXMLFilesize,
             self.translation_size_xml),
            (self.snapshoturi, po.translationTTLFilesize,
             self.translation_size_ttl),
        ]
    else:
        triples = [
            (self.snapshoturi, po.translationXMLFilename,
             self.translation_xml),
            (self.snapshoturi, po.translationXMLFilesize,
             self.translation_size_xml),
            (self.snapshoturi, po.translationTTLFilename,
             self.translation_ttl),
            (self.snapshoturi, po.translationTTLFilesize,
             self.translation_size_ttl),
        ]
    P.add(triples, self.meta_graph)
    # provenance triples
    triples = [
        (self.snapshoturi, po.triplifiedIn, datetime.datetime.now()),
        (self.snapshoturi, po.triplifiedBy, "scripts/"),
        # (self.snapshoturi, po.donatedBy,
        #  self.snapshotid[:-4]),
        (self.snapshoturi, po.availableAt, self.online_prefix),
        (self.snapshoturi, po.onlineMetaXMLFile,
         self.online_prefix+self.mrdf),
        (self.snapshoturi, po.onlineMetaTTLFile,
         self.online_prefix+self.mttl),
        (self.snapshoturi, po.metaXMLFileName, self.mrdf),
        (self.snapshoturi, po.metaTTLFileName, self.mttl),
        # (self.snapshoturi, po.acquiredThrough,
        #  "aa shouts in "+self.snapshotid),
        (self.snapshoturi, po.socialProtocolTag,
         self.social_protocol),  # AA, fb, etc
        (self.snapshoturi, po.socialProtocol,
         P.rdf.ic(po.Platform, self.social_protocol, self.meta_graph,
                  self.snapshoturi)),
        (self.snapshoturi, po.nTriples, self.ntranslation_triples),
        (self.snapshoturi, NS.rdfs.comment, self.desc),
    ]
    P.add(triples, self.meta_graph)
def parseLegacyFiles(datadir=DATADIR + "facebook/"):
    """Parse legacy gdf, gml and tab files of facebook structures

    Synthax of facebook filenames is:
    <prefix><name><date><suffix><extension>
    where:

    <prefix> used are:
        *) avlab_ for files obtained with participants at AVLAB
        *) posavlab_ for files obtained from participants
        *) page_ for files about facebook pages
        *) ego_ for ego networks
        ommited for gml files and gdf group files.

    <name> is any string name associated with the user or group
    delimiting the structure in the file, e.g. FernandValfro.
    it gets split with spaces before uppercase letter chunks for
    po:humanizedName: REM splits to REM. RFabbri to RFabbri.

    <date> daymonthyear in 2/2/4 digits, e.g. 20032014 for 20/March/2014.

    <suffix> is ommited for friendship .gml .gdf networks,
    .tab are text and activity files.
    _interaction is used if interaction network.

    <extension> is either
        .gml for gml files, all are ego friendship network data
        .gdf for gdf files with group and ego, interaction and friendship
        network data
        .tab for tab files with post data, such as text

    These render snapshots of two classes:
    po:FacebookEgoFriendshipSnapshot from .gml files and gdf files with
    prefix avlab_ posavlab_ or ego_
    po:FacebookGroupFriendshipInteractionSnapshot from .gdf files without
    prefix with and without _interaction suffix and the .tab files.
    They form sets of files, all with friendship and interaction networks
    and some with a .tab file.

    ToDo:
    *) Implement parsing of page files.
    *) Implement parsing of new group files."""
    platformuri = P.rdf.ic(po.Platform, "Facebook", context="social_facebook")
    triples = [
        (platformuri, po.dataDir, datadir),
    ]
    filenames = os.listdir(datadir)
    # skip editor swap files and the ipython log
    filenames = [
        i for i in filenames if not i.endswith("swp") and "ipython_log.py" != i
    ]
    snapshots = set()
    # captures (prefix, name, ddmmyyyy date, suffix, extension)
    regex = re.compile(
        r"^(avlab_|ego_|posavlab_|page_)*(.*?)(\d{8})(_interactions|_comments){0,1}\.(gdf|tab|gml)$"
    )
    # splits camel-case chunks for the humanized name
    regex2 = re.compile(r'([A-Z]{2,}(?=[A-Z]|$)|[A-Z][a-z]*)')
    for filename in filenames:
        prefix, name, date, sufix, format_ = regex.findall(filename)[0]
        if prefix == "page_":
            c("page data currently not supported. Jumping", filename)
            continue
        # size in megabytes
        filesize = os.path.getsize(datadir + filename) / (10**6)
        snapshotid = 'facebook-legacy-' + filename.replace(
            "_interactions.gdf", "").replace(".tab", "").replace(
                '.gml', '').replace('.gdf', '')
        snapshoturi = po.Snapshot + "#" + snapshotid
        date_obtained = datetime.date(int(date[4:]), int(date[2:4]),
                                      int(date[:2]))
        name_humanized = " ".join(regex2.findall(name))
        # per-file metadata table keyed by the friendship .gdf name
        metadata = S.legacy.facebook.files.files_dict[filename.replace(
            "_interactions.gdf", ".gdf").replace(".tab", ".gdf")]
        if metadata[0]:
            triples += [(snapshoturi, po.numericID, metadata[0])]
        if metadata[1]:
            triples += [(snapshoturi, po.stringID, metadata[1])]
        if len(metadata) == 3:
            if not metadata[2]:
                c("group data without a publishing link: ", filename)
            else:
                triples += [(snapshoturi, po.url, metadata[2])]
        if filename.endswith(".gml") or any(
                filename.startswith(i)
                for i in ("ego_", "avlab_", "posavlab_")):
            # ego friendship snapshot
            isego = True
            isgroup = False
            isfriendship = True
            isinteraction = False
            isposts = False
            expressed_classes = (po.Friendship, po.Participant)
            if metadata[0]:
                expressed_reference = po.Participant+"#" + \
                    snapshotid+"-"+metadata[0]
            else:
                if "Mirtes" in filename:
                    # known anonymous ego network without a numeric id
                    expressed_reference = po.Participant+"#" + \
                        snapshotid+"-anon_mirtes"
                else:
                    raise ValueError(
                        "Numeric ID is needed for friendship networks")
            triples += [(expressed_reference, a, po.FacebookParticipant)]
        else:  # group snapshot
            isego = False
            isgroup = True
            # sibling files that together form the group snapshot
            ffilename = prefix + name + date + ".gdf"
            ifilename = prefix + name + date + "_interactions.gdf"
            tfilename = prefix + name + date + ".tab"
            isfriendship = ffilename in filenames
            isinteraction = ifilename in filenames
            isposts = tfilename in filenames
            if metadata[0]:
                expressed_reference = po.FacebookGroup+"#" +\
                    snapshotid+"-"+metadata[0]
            else:
                if metadata[1]:
                    expressed_reference = po.FacebookGroup+"#" +\
                        snapshotid+"-"+metadata[1]
                else:
                    raise ValueError("Numeric or string ID is needed\
for group networks")
            triples += [(expressed_reference, a, po.FacebookGroup)]
            if filename == ffilename:
                expressed_classes = (po.Friendship, po.Participant)
            elif filename == ifilename:
                expressed_classes = (po.Interaction, po.Participant)
            elif format_ == "tab":
                expressed_classes = (po.Post, )
            else:
                raise NameError("filename structure not understood")
        fileuri = NS.po.File + "#" + snapshotid + "-_file_-" + filename
        triples += [
            (snapshoturi, a, po.Snapshot),
            # (snapshoturi, a, po.FacebookSnapshot),
            (snapshoturi, po.snapshotID, snapshotid),
            (snapshoturi, po.isEgo, isego),
            (snapshoturi, po.isGroup, isgroup),
            (snapshoturi, po.isFriendship, isfriendship),
            (snapshoturi, po.isInteraction, isinteraction),
            (snapshoturi, po.isPost, isposts),
            (snapshoturi, po.name, name_humanized),
            (snapshoturi, po.dateObtained, date_obtained),
            # (snapshoturi, po.expressedReference, expressed_reference),
            (snapshoturi, po.rawFile, fileuri),
            # (fileuri, po.fileSize, filesize),
            (fileuri, po.fileName, filename),
            (fileuri, po.fileFormat, format_),
        ]
        triples += [(fileuri, po.expressedClass, expressed_class)
                    for expressed_class in expressed_classes]
        note = theNote(filename)  # for avlab and posavlab
        if note:
            triples += [
                (snapshoturi, NS.rdfs.comment, note),
            ]
        snapshots.add(snapshoturi)
    # data about the overall data in percolation graph
    nfiles = len(filenames)
    nsnapshots = len(snapshots)
    triples += [
        (NS.social.Session,
         NS.social.nFacebookParsedFiles, nfiles),
        (NS.social.Session, NS.social.nFacebookSnapshots, nsnapshots),
    ]
    P.context("social_facebook", "remove")
    P.add(triples, context="social_facebook")
    c("parsed {} facebook files ({} snapshots) are in percolation \
graph and 'social_facebook' context".format(nfiles, nsnapshots))
    c("percolation graph have {} triples ({} in social_facebook context\
)".format(len(P.percolation_graph), len(P.context("social_facebook"))))
    negos = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isEgo true } ")
    ngroups = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isGroup true } ")
    nfriendships = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isFriendship true } ")
    ninteractions = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isInteraction true } ")
    nposts = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isPost true } ")
    # NOTE(review): the (fileuri, po.fileSize, filesize) triple above is
    # commented out, so this sum is always 0 — confirm intent
    totalsize = sum(P.query(r" SELECT ?size WHERE { ?s po:fileSize ?size } "))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship network. {} have an interaction network. \
{} have post texts and reaction counts
Total raw data size is {:.2f}MB""".format(negos, ngroups, nfriendships,
                                          ninteractions, nposts, totalsize))
    return snapshots
def rdfInteractionNetwork(self, fnet):
    """Triplify a facebook interaction network (fnet) into the
    interaction graph.

    fnet is a dict with "individuals" (column name -> list of values)
    and "relations" (node1/node2/weight lists).  Each participant gets
    one po:Participant plus one po:Observation; each edge becomes a
    po:Interaction with from/to links and a weight observation.
    """
    # heuristic: the network counts as anonymized when every label
    # contains the fake "user" marker
    if sum([("user" in i) for i in fnet["individuals"]["label"]]) == \
            len(fnet["individuals"]["label"]):
        # fake names and local ids
        self.interactions_anonymized = True
    else:
        self.interactions_anonymized = False
    tkeys = list(fnet["individuals"].keys())
    # groupid is snapshot-level info, not a participant attribute
    if "groupid" in tkeys:
        self.groupid2 = fnet["individuals"]["groupid"][0]
        tkeys.remove("groupid")
    else:
        self.groupid2 = None
    if self.interactions_anonymized:
        # label/name are fake anyway, so drop them from the var list
        self.varsfriendsinteraction = [
            trans[i] for i in tkeys if i not in ('label', 'name')
        ]
    else:
        self.varsfriendsinteraction = [trans[i] for i in tkeys]
    # insert pairs a predicate URI with the column of values, per key
    insert = {"uris": [], "vals": []}
    for tkey in tkeys:
        insert["uris"].append(eval("po." + trans[tkey]))
        insert["vals"].append(fnet["individuals"][tkey])
    self.ninteracted = len(insert["vals"][0])
    iname = tkeys.index("name")
    ilabel = tkeys.index("label")
    # one row of vals_ per participant
    for vals_ in zip(*insert["vals"]):
        if self.interactions_anonymized:
            name_ = "{}-{}".format(self.snapshotid, vals_[iname])
            # drop the label/name columns from what gets scaffolded
            insert_uris_ = [
                el for i, el in enumerate(insert['uris'])
                if i not in (ilabel, iname)
            ]
            vals__ = [
                el for i, el in enumerate(vals_)
                if i not in (ilabel, iname)
            ]
            name__ = '{}-{}'.format(self.snapshotid,
                                    self.observation_count)
            # NOTE(review): name__ is computed but never used below --
            # obsname is built from vals_[iname] in both branches;
            # possibly the observation was meant to use name__. Confirm.
            self.observation_count += 1
        else:
            name_ = "{}-{}".format(self.provenance_prefix, vals_[iname])
            insert_uris_ = [el for i, el in enumerate(insert['uris'])]
            vals__ = [el for i, el in enumerate(vals_)]
            # uri = insert['uris'][iname]
            # numericID = vals_[iname]
            # P.add([(ind, uri, numericID)], self.interaction_graph)
        obsname = '{}-{}'.format(self.snapshotid, vals_[iname])
        ind = P.rdf.ic(po.Participant, name_, self.interaction_graph,
                       self.snapshoturi)
        obs = P.rdf.ic(po.Observation, obsname, self.interaction_graph,
                       self.snapshoturi)
        P.add([(ind, po.observation, obs)], self.interaction_graph)
        if vals__:
            P.rdf.triplesScaffolding(obs, insert_uris_, vals__,
                                     self.interaction_graph)
        else:
            c(
                "anonymous participant without attributes (besides local id). \
snapshotid:", self.snapshotid, "values:", vals_)
    c("participant written")
    self.interactionsvarsfoo = ["node1", "node2", "weight"]
    interactions_ = [
        fnet["relations"][i] for i in self.interactionsvarsfoo
    ]
    self.ninteractions = len(interactions_[0])
    self.interactionsvars = ["iFrom", "iTo", "weight"]
    i = 0
    for uid1, uid2, weight in zip(*interactions_):
        weight_ = int(weight)
        assert weight_-weight == 0, \
            "float weights in fb interaction networks?"
        # note: the comprehension variable i below shadows the loop
        # counter only inside the comprehension scope (py3), so the
        # progress counter is unaffected
        if self.interactions_anonymized:
            iid = "{}-{}-{}".format(self.snapshotid, uid1, uid2)
            uids = [
                r.URIRef(po.Participant +
                         "#{}-{}".format(self.snapshotid, i))
                for i in (uid1, uid2)
            ]
        else:
            iid = "{}-{}-{}".format(self.provenance_prefix, uid1, uid2)
            uids = [
                r.URIRef(po.Participant +
                         "#{}-{}".format(self.provenance_prefix, i))
                for i in (uid1, uid2)
            ]
        ind = P.rdf.ic(po.Interaction, iid, self.interaction_graph,
                       self.snapshoturi)
        P.rdf.triplesScaffolding(ind,
                                 [po.interactionFrom, po.interactionTo],
                                 uids, self.interaction_graph)
        obsname = '{}-{}-{}'.format(self.snapshotid, uid1, uid2)
        obs = P.rdf.ic(po.Observation, obsname, self.interaction_graph,
                       self.snapshoturi)
        P.add([(ind, po.observation, obs), (obs, po.weight, weight_)],
              self.interaction_graph)
        if (i % 1000) == 0:
            c("interactions: ", i)
        i += 1
    c("escritas interações")
def makeMetadata(self):
    """Record the metadata triples for this legacy twitter snapshot.

    Builds the human-readable description (self.desc), derives the
    snapshot hashtag from the snapshot id, and stores the basic
    snapshot facts in the meta graph.  File-location and counter
    predicates are deliberately not emitted for this snapshot kind.
    """
    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"
    description = ("twitter dataset with snapshotID: {}\nsnapshotURI: "
                   "{} \nisEgo: {}. isGroup: {}.").format(
                       self.snapshotid, self.snapshoturi, self.isego,
                       self.isgroup)
    description += "\nisFriendship: {}; ".format(self.isfriendship)
    description += "isInteraction: {}.".format(self.isinteraction)
    description += "\nisPost: {}".format(self.hastext)
    self.desc = description
    # legacy snapshots are named after the hashtag they tracked
    hashtag = '#' + self.snapshotid.replace('twitter-legacy-', '')
    metadata_triples = (
        (self.snapshoturi, po.triplifiedIn, datetime.datetime.now()),
        (self.snapshoturi, a, po.Snapshot),
        (self.snapshoturi, po.snapshotID, self.snapshotid),
        (self.snapshoturi, po.isEgo, False),
        (self.snapshoturi, po.isGroup, True),
        (self.snapshoturi, po.isFriendship, False),
        (self.snapshoturi, po.isInteraction, True),
        (self.snapshoturi, po.isPost, True),
        (self.snapshoturi, po.hashtag, hashtag),
        (self.snapshoturi, po.socialProtocol, "Twitter"),
        (self.snapshoturi, po.comment, self.desc),
    )
    P.add(list(metadata_triples), self.meta_graph)
def minimumOntology(context="minimum_ontology"):
    """Build the minimal RDFS ontology triples.

    When ``context == "triples"`` the raw triple list is returned to the
    caller; otherwise the triples are added to the named graph ``context``
    and None is returned.
    """
    ontology_triples = rdfsTriples()
    if context == "triples":
        return ontology_triples
    P.add(ontology_triples, context=context)
def writeAllTW(self):
    """Finish publication of a Twitter snapshot: serialize meta graph, write README.

    Records the meta-graph triple count in the meta graph itself, serializes
    it as Turtle and RDF/XML, copies the triplification script alongside the
    published files, and renders a human-readable README from the snapshot
    statistics accumulated earlier (self.desc, counts, dates).
    """
    # write meta and readme with self.desc, finished.
    g = P.context(self.meta_graph)
    ntriples = len(g)
    triples = [
        (self.snapshoturi, po.nMetaTriples, ntriples),
    ]
    P.add(triples, context=self.meta_graph)
    g.namespace_manager.bind("po", po)
    # same graph serialized twice: Turtle and RDF/XML
    g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
    c("serialized meta")
    # copy the script that generates this code
    if not os.path.isdir(self.final_path_ + "scripts"):
        os.mkdir(self.final_path_ + "scripts")
    shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                self.final_path_ + "scripts/triplify.py")
    # copy of the base data
    tinteraction = """\n\n{} individuals with metadata {} and {} interactions (retweets: {}, replies: {}, user_mentions: {}) constitute the interaction network in the RDF/XML file(s): {} and the Turtle file(s): {} (anonymized: {}).""".format(
        self.nparticipants, str(self.participantvars),
        self.nretweets + self.nreplies + self.nuser_mentions,
        self.nretweets, self.nreplies, self.nuser_mentions,
        self.tweet_rdf, self.tweet_ttl, self.interactions_anonymized)
    tposts = """\n\nThe dataset consists of {} tweets with metadata {} {:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {} {:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {}""".format(
        self.ntweets, str(self.tweetvars),
        self.mcharstweets, self.dcharstweets, self.totalchars,
        self.mtokenstweets, self.dtokenstweets, self.totaltokens,
    )
    # ISO-8601 strings sort chronologically, so min/max give the date range
    # (assumes self.dates holds datetime/date objects at this point — TODO confirm)
    self.dates = [i.isoformat() for i in self.dates]
    date1 = min(self.dates)
    date2 = max(self.dates)
    with open(self.final_path_ + "README", "w") as f:
        f.write("""::: Open Linked Social Data publication \nThis repository is a RDF data expression of the twitter snapshot {snapid} with tweets from {date1} to {date2} (total of {ntrip} triples).{tinteraction}{tposts} \nMetadata for discovery in the RDF/XML file: {mrdf} \nor in the Turtle file:\n{mttl} \nEgo network: {ise} Group network: {isg} Friendship network: {isf} Interaction network: {isi} Has text/posts: {ist} \nAll files should be available at the git repository: {ava} \n{desc} The script that rendered this data publication is on the script/ directory.\n:::"""
                .format(snapid=self.snapshotid, date1=date1, date2=date2,
                        ntrip=self.ntriples, tinteraction=tinteraction,
                        tposts=tposts, mrdf=self.mrdf, mttl=self.mttl,
                        ise=self.isego, isg=self.isgroup,
                        isf=self.isfriendship, isi=self.isinteraction,
                        ist=self.hastext, ava=self.online_prefix,
                        desc=self.desc))
def makeMetadata(self):
    """Aggregate Twitter snapshot statistics and publish them in the meta graph.

    Copies the snapshot's triples (and those of each raw file) from the
    social graph into the meta graph, computes corpus-wide character and
    token statistics, scaffolds the file listings, builds the textual
    summary ``self.desc`` and records provenance triples.
    """
    triples = P.get(self.snapshoturi, None, None, self.social_graph)
    for rawfile in P.get(self.snapshoturi, po.rawFile, None,
                         self.social_graph, strict=True, minimized=True):
        triples += P.get(rawfile, None, None, self.social_graph)
    # corpus-wide character and token statistics over all tweets
    self.totalchars = sum(self.nchars_all)
    self.mcharstweets = n.mean(self.nchars_all)
    self.dcharstweets = n.std(self.nchars_all)
    self.totaltokens = sum(self.ntokens_all)
    self.mtokenstweets = n.mean(self.ntokens_all)
    self.dtokenstweets = n.std(self.ntokens_all)
    P.add(triples, context=self.meta_graph)
    triples = [
        (self.snapshoturi, po.nParticipants, self.nparticipants),
        (self.snapshoturi, po.nTweets, self.ntweets),
        (self.snapshoturi, po.nReplies, self.nreplies),
        (self.snapshoturi, po.nRetweets, self.nretweets),
        (self.snapshoturi, po.nCharsOverall, self.totalchars),
        (self.snapshoturi, po.mCharsOverall, self.mcharstweets),
        (self.snapshoturi, po.dCharsOverall, self.dcharstweets),
        (self.snapshoturi, po.nTokensOverall, self.totaltokens),
        (self.snapshoturi, po.mTokensOverall, self.mtokenstweets),
        (self.snapshoturi, po.dTokensOverall, self.dtokenstweets),
    ]
    P.add(triples, context=self.meta_graph)
    # one triple per participant attribute, per rendered file name, and per
    # online URL of those files
    P.rdf.triplesScaffolding(self.snapshoturi,
                             [po.tweetParticipantAttribute] * len(self.participantvars),
                             self.participantvars,
                             context=self.meta_graph)
    P.rdf.triplesScaffolding(self.snapshoturi,
                             [po.tweetXMLFilename] * len(self.tweet_rdf) +
                             [po.tweetTTLFilename] * len(self.tweet_ttl),
                             self.tweet_rdf + self.tweet_ttl,
                             context=self.meta_graph)
    P.rdf.triplesScaffolding(
        self.snapshoturi,
        [po.onlineTweetXMLFile] * len(self.tweet_rdf) +
        [po.onlineTweetTTLFile] * len(self.tweet_ttl),
        [self.online_prefix + i for i in self.tweet_rdf + self.tweet_ttl],
        context=self.meta_graph)
    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"
    # human-readable dataset summary, stored below as an rdfs:comment
    self.desc = "twitter dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
        self.snapshotid, self.snapshoturi, self.isego, self.isgroup,)
    self.desc += "\nisFriendship: {}; ".format(self.isfriendship)
    self.desc += "isInteraction: {}.".format(self.isinteraction)
    self.desc += "\nnParticipants: {}; nInteractions: {} (replies+retweets+user mentions).".format(
        self.nparticipants,
        self.nreplies + self.nretweets + self.nuser_mentions,)
    self.desc += "\nisPost: {} (alias hasText: {})".format(
        self.hastext, self.hastext)
    self.desc += "\nnTweets: {}; ".format(self.ntweets)
    self.desc += "nReplies: {}; nRetweets: {}; nUserMentions: {}.".format(
        self.nreplies, self.nretweets, self.nuser_mentions)
    self.desc += "\nnTokens: {}; mTokens: {}; dTokens: {};".format(
        self.totaltokens, self.mtokenstweets, self.dtokenstweets)
    self.desc += "\nnChars: {}; mChars: {}; dChars: {}.".format(
        self.totalchars, self.mcharstweets, self.dcharstweets)
    self.desc += "\nnHashtags: {}; nMedia: {}; nLinks: {}.".format(
        self.nhashtags, self.nmedia, self.nlinks)
    triples = [
        (self.snapshoturi, po.triplifiedIn, datetime.datetime.now()),
        (self.snapshoturi, po.triplifiedBy, "scripts/"),
        # NOTE(review): [:-4] presumably strips a fixed-length id suffix to
        # recover the donor name — confirm snapshotid format
        (self.snapshoturi, po.donatedBy, self.snapshotid[:-4]),
        (self.snapshoturi, po.availableAt, self.online_prefix),
        (self.snapshoturi, po.onlineMetaXMLFile, self.online_prefix + self.mrdf),
        (self.snapshoturi, po.onlineMetaTTLFile, self.online_prefix + self.mttl),
        (self.snapshoturi, po.metaXMLFileName, self.mrdf),
        (self.snapshoturi, po.metaTTLFileName, self.mttl),
        (self.snapshoturi, po.totalXMLFileSizeMB, sum(self.size_rdf)),
        (self.snapshoturi, po.totalTTLFileSizeMB, sum(self.size_ttl)),
        (self.snapshoturi, po.acquiredThrough, "Twitter APIs"),
        (self.snapshoturi, po.socialProtocolTag, "Twitter"),
        (self.snapshoturi, po.socialProtocol,
         P.rdf.ic(po.Platform, "Twitter", self.meta_graph, self.snapshoturi)),
        (self.snapshoturi, po.nTriples, self.ntriples),
        (self.snapshoturi, NS.rdfs.comment, self.desc),
    ]
    P.add(triples, self.meta_graph)
def writeAllFB(self):
    """Render the full publication of a Facebook snapshot.

    Serializes whichever graphs exist (friendship, interaction, posts) as
    Turtle and RDF/XML, records file sizes and triple counts in the meta
    graph, serializes the meta graph itself, copies the triplification
    script and the original base data into the publication directory, and
    writes a human-readable README.
    """
    c("started rendering of the snapshot publication. snapshotID:",
      self.snapshotid)
    self.final_path_ = "{}{}/".format(self.final_path, self.snapshotid)
    if not os.path.isdir(self.final_path_):
        os.mkdir(self.final_path_)
    # fnet, inet, mnet: friendship / interaction / meta networks
    triples = []
    if self.isfriendship:
        g = P.context(self.friendship_graph)
        g.namespace_manager.bind("po", po)
        g.serialize(self.final_path_ + self.snapshotid + "Friendship.ttl",
                    "turtle")
        c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Friendship.rdf",
                    "xml")
        c("serialized friendships")
        # get filesize and ntriples
        filesizerdf = os.path.getsize(self.final_path_ + self.snapshotid +
                                      "Friendship.rdf") / (10**6)
        filesizettl = os.path.getsize(self.final_path_ + self.snapshotid +
                                      "Friendship.ttl") / (10**6)
        ntriples = len(g)
        triples += [
            (self.snapshoturi, po.friendshipXMLFileSizeMB, filesizerdf),
            (self.snapshoturi, po.friendshipTTLFileSizeMB, filesizettl),
            (self.snapshoturi, po.nFriendshipTriples, ntriples),
        ]
    if self.isinteraction:
        g = P.context(self.interaction_graph)
        g.namespace_manager.bind("po", po)
        g.serialize(self.final_path_ + self.snapshotid + "Interaction.ttl",
                    "turtle")
        c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Interaction.rdf",
                    "xml")
        c("serialized interaction")
        filesizerdf = os.path.getsize(self.final_path_ + self.snapshotid +
                                      "Interaction.rdf") / (10**6)
        filesizettl = os.path.getsize(self.final_path_ + self.snapshotid +
                                      "Interaction.ttl") / (10**6)
        ntriples = len(g)
        triples += [
            (self.snapshoturi, po.interactionXMLFileSizeMB, filesizerdf),
            (self.snapshoturi, po.interactionTTLFileSizeMB, filesizettl),
            (self.snapshoturi, po.nInteractionTriples, ntriples),
        ]
    if self.hastext:
        g = P.context(self.posts_graph)
        g.namespace_manager.bind("po", po)
        g.serialize(self.final_path_ + self.snapshotid + "Posts.ttl",
                    "turtle")
        c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Posts.rdf", "xml")
        c("serialized posts")
        filesizerdf = os.path.getsize(self.final_path_ + self.snapshotid +
                                      "Posts.rdf") / (10**6)
        filesizettl = os.path.getsize(self.final_path_ + self.snapshotid +
                                      "Posts.ttl") / (10**6)
        ntriples = len(g)
        triples += [
            (self.snapshoturi, po.postsXMLFileSizeMB, filesizerdf),
            (self.snapshoturi, po.postsTTLFileSizeMB, filesizettl),
            (self.snapshoturi, po.nPostsTriples, ntriples),
        ]
    # meta graph last: its own triple count is recorded inside it
    g = P.context(self.meta_graph)
    ntriples = len(g)
    triples += [
        (self.snapshoturi, po.nMetaTriples, ntriples),
    ]
    P.add(triples, context=self.meta_graph)
    g.namespace_manager.bind("po", po)
    g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
    c("serialized meta")
    # copy the script that generates this code
    if not os.path.isdir(self.final_path_ + "scripts"):
        os.mkdir(self.final_path_ + "scripts")
    shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                self.final_path_ + "scripts/triplify.py")
    # copy of the base data
    if not os.path.isdir(self.final_path_ + "base"):
        os.mkdir(self.final_path_ + "base")
    originals = ""
    if self.isfriendship:
        shutil.copy(self.data_path + self.filename_friendships,
                    self.final_path_ + "base/")
        originals += "base/{}".format(self.filename_friendships)
        tfriendship = """\n\n{nf} individuals with metadata {fvars} and {nfs} friendships constitute the friendship network in the RDF/XML file: {frdf} \nor in the Turtle file: \n{fttl} (anonymized: {fan}).""".format(
            nf=self.nfriends, fvars=str(self.friendsvars),
            nfs=self.nfriendships, frdf=self.frdf, fttl=self.fttl,
            fan=self.friendships_anonymized,
        )
    else:
        tfriendship = ""
    if self.isinteraction:
        shutil.copy(self.data_path + self.filename_interactions,
                    self.final_path_ + "base/")
        tinteraction = """\n\n{} individuals with metadata {} and {} interactions with metadata {} constitute the interaction network in the RDF/XML file: {} or in the Turtle file: {} (anonymized: {}).""".format(
            self.ninteracted, str(self.varsfriendsinteraction),
            self.ninteractions, str(self.interactionsvars),
            self.irdf, self.ittl, self.interactions_anonymized)
        originals += "\nbase/{}".format(self.filename_interactions)
    else:
        tinteraction = ""
    if self.hastext:
        shutil.copy(self.data_path + self.filename_posts,
                    self.final_path_ + "base/")
        tposts = """\n\n{} posts with {:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {} {:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {} posts data in the RDF/XML file: {} or in the Turtle file: {}""".format(
            self.nposts, self.mcharsposts, self.dcharsposts,
            self.totalchars, self.mtokensposts, self.dtokensposts,
            self.totaltokens, self.prdf, self.pttl)
        originals += "\nbase/{}".format(self.filename_posts)
    else:
        tposts = ""
    # P.rdf.writeAll(mnet,aname+"Meta",fpath_,1)
    # write a README
    # NOTE(review): [2] presumably selects the object of the first matching
    # triple returned by P.get — confirm against P.get's return shape
    datetime_string = P.get(r.URIRef(self.snapshoturi), po.dateObtained,
                            None, context="social_facebook")[2]
    # if not os.path.isdir(self.final_path+"base"):
    #     os.mkdir(self.final_path+"base")
    with open(self.final_path_ + "README", "w") as f:
        f.write("""::: Open Linked Social Data publication \nThis repository is a RDF data expression of the facebook snapshot {snapid} collected around {date}.{tfriendship}{tinteraction}{tposts} \nMetadata for discovery in the RDF/XML file: {mrdf} \nor in the Turtle file:\n{mttl} \nOriginal file(s): {origs} \nEgo network: {ise} Group network: {isg} Friendship network: {isf} Interaction network: {isi} Has text/posts: {ist} \nAll files should be available at the git repository: {ava} \n{desc} The script that rendered this data publication is on the script/ directory.\n:::"""
                .format(snapid=self.snapshotid, date=datetime_string,
                        tfriendship=tfriendship, tinteraction=tinteraction,
                        tposts=tposts, mrdf=self.mrdf, mttl=self.mttl,
                        origs=originals, ise=self.isego, isg=self.isgroup,
                        isf=self.isfriendship, isi=self.isinteraction,
                        ist=self.hastext, ava=self.online_prefix,
                        desc=self.desc))
def makeMetadata(self):
    """Aggregate IRC snapshot statistics and publish them in the meta graph.

    Copies snapshot/raw-file triples from the social graph, computes
    character, token and sentence statistics over all messages, scaffolds
    log-file listings, builds the textual summary ``self.desc`` and records
    provenance triples.
    """
    triples = P.get(self.snapshoturi, None, None, self.social_graph)
    for rawfile in P.get(self.snapshoturi, po.rawFile, None,
                         self.social_graph, strict=True, minimized=True):
        triples += P.get(rawfile, None, None, self.social_graph)
    P.add(triples, context=self.meta_graph)
    # corpus-wide character/token/sentence statistics
    self.totalchars = sum(self.nchars_all)
    self.mcharsmessages = n.mean(self.nchars_all)
    self.dcharsmessages = n.std(self.nchars_all)
    self.totaltokens = sum(self.ntokens_all)
    self.mtokensmessages = n.mean(self.ntokens_all)
    self.dtokensmessages = n.std(self.ntokens_all)
    self.totalsentences = sum(self.nsentences_all)
    self.msentencesmessages = n.mean(self.nsentences_all)
    self.dsentencesmessages = n.std(self.nsentences_all)
    self.nparticipants = len(self.NICKS)
    self.nmessages = len(self.messageids)
    self.ntriples = len(P.context(self.irc_graph))
    triples = [
        (self.snapshoturi, po.nParticipants, self.nparticipants),
        (self.snapshoturi, po.nMessages, self.nmessages),
        (self.snapshoturi, po.nDirectMessages, self.ndirect),
        (self.snapshoturi, po.nUserMentions, self.nmention),
        (self.snapshoturi, po.nCharsOverall, self.totalchars),
        (self.snapshoturi, po.mCharsOverall, self.mcharsmessages),
        (self.snapshoturi, po.dCharsOverall, self.dcharsmessages),
        (self.snapshoturi, po.nTokensOverall, self.totaltokens),
        (self.snapshoturi, po.mTokensOverall, self.mtokensmessages),
        (self.snapshoturi, po.dTokensOverall, self.dtokensmessages),
        (self.snapshoturi, po.nSentencesOverall, self.totalsentences),
        (self.snapshoturi, po.mSentencesOverall, self.msentencesmessages),
        (self.snapshoturi, po.dSentencesOverall, self.dsentencesmessages),
    ]
    P.add(triples, context=self.meta_graph)
    # one triple per participant attribute, per log file, per online URL
    P.rdf.triplesScaffolding(self.snapshoturi,
                             [po.ircParticipantAttribute] * len(self.participantvars),
                             self.participantvars,
                             context=self.meta_graph)
    P.rdf.triplesScaffolding(self.snapshoturi,
                             [po.logXMLFilename] * len(self.log_xml) +
                             [po.logTTLFilename] * len(self.log_ttl),
                             self.log_xml + self.log_ttl,
                             context=self.meta_graph)
    P.rdf.triplesScaffolding(
        self.snapshoturi,
        [po.onlineLogXMLFile] * len(self.log_xml) +
        [po.onlineLogTTLFile] * len(self.log_ttl),
        [self.online_prefix + i for i in self.log_xml + self.log_ttl],
        context=self.meta_graph)
    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"
    # human-readable dataset summary, stored below as an rdfs:comment
    self.desc = "irc dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
        self.snapshotid, self.snapshoturi, self.isego, self.isgroup,)
    self.desc += "\nisFriendship: {}; ".format(self.isfriendship)
    self.desc += "isInteraction: {}.".format(self.isinteraction)
    self.desc += "\nnParticipants: {}; nInteractions: {} (directed messages+user mentions).".format(
        self.nparticipants, self.ndirect + self.nmention)
    self.desc += "\nisPost: {} (alias hasText: {})".format(
        self.hastext, self.hastext)
    self.desc += "\nnMessages: {}; ".format(self.nmessages)
    self.desc += "nDirectedMessages: {}; nUserMentions: {};".format(
        self.ndirect, self.nmention)
    self.desc += "\nnCharsOverall: {}; mCharsOverall: {}; dCharsOverall: {}.".format(
        self.totalchars, self.mcharsmessages, self.dcharsmessages)
    self.desc += "\nnTokensOverall: {}; mTokensOverall: {}; dTokensOverall: {};".format(
        self.totaltokens, self.mtokensmessages, self.dtokensmessages)
    self.desc += "\nnSentencesOverall: {}; mSentencesOverall: {}; dSentencesOverall: {};".format(
        self.totalsentences, self.msentencesmessages, self.dsentencesmessages)
    self.desc += "\nnURLs: {}; nAAMessages {}.".format(
        self.nurls, self.naamessages)
    triples = [
        (self.snapshoturi, po.triplifiedIn, datetime.datetime.now()),
        (self.snapshoturi, po.triplifiedBy, "scripts/"),
        # NOTE(review): [:-4] presumably strips a fixed-length id suffix to
        # recover the donor name — confirm snapshotid format
        (self.snapshoturi, po.donatedBy, self.snapshotid[:-4]),
        (self.snapshoturi, po.availableAt, self.online_prefix),
        (self.snapshoturi, po.onlineMetaXMLFile, self.online_prefix + self.mrdf),
        (self.snapshoturi, po.onlineMetaTTLFile, self.online_prefix + self.mttl),
        (self.snapshoturi, po.metaXMLFileName, self.mrdf),
        (self.snapshoturi, po.metaTTLFileName, self.mttl),
        (self.snapshoturi, po.totalXMLFileSizeMB, sum(self.size_xml)),
        (self.snapshoturi, po.totalTTLFileSizeMB, sum(self.size_ttl)),
        (self.snapshoturi, po.acquiredThrough, "channel text log"),
        (self.snapshoturi, po.socialProtocolTag, "IRC"),
        (self.snapshoturi, po.socialProtocol,
         P.rdf.ic(po.Platform, "IRC", self.meta_graph, self.snapshoturi)),
        (self.snapshoturi, po.nTriples, self.ntriples),
        (self.snapshoturi, NS.rdfs.comment, self.desc),
    ]
    P.add(triples, self.meta_graph)
def rdfMbox(self):
    """Triplify every mbox file of a Gmane email snapshot.

    For each file in self.files, reads the single message it holds and adds
    triples for the message id, author, subject, reply chain, dates,
    references, body text (raw and cleaned, with char/token/sentence
    counts), URLs, content type, organization, cc/to peers and list id.
    Updates the many running counters (nmessages, nreplies, nurls, ...).

    Raises ValueError for messages that lack an id, an author, a parseable
    date or a content type.
    """
    for filecount, file_ in enumerate(self.files):
        if filecount % 100 == 0:  # progress heartbeat
            c(self.snapshoturi, filecount)
        mbox = mailbox.mbox(self.data_path + self.directory + "/" + file_)
        if not mbox.keys():
            # empty mbox: count it and move on
            self.nempty += 1
            mbox.close()
            continue
        if not mbox[0]["Message-Id"]:
            raise ValueError(
                "What to do with nonempy messages without id?")
        message = mbox[0]
        gmaneid = self.makeId(message["Message-Id"])
        if not gmaneid:
            raise ValueError("Message without id")
        messageuri = P.rdf.ic(po.EmailMessage, gmaneid,
                              self.translation_graph, self.snapshoturi)
        self.nmessages += 1
        triples = [
            (messageuri, po.gmaneID, gmaneid),
        ]
        email, name = self.parseParticipant(message["From"])
        if not email:
            raise ValueError("message without author")
        participanturi = P.rdf.ic(po.GmaneParticipant, email,
                                  self.translation_graph, self.snapshoturi)
        # a participant is only counted the first time their address shows up
        if not P.get(participanturi, po.emailAddress, None,
                     self.translation_graph):
            self.nparticipants += 1
        triples += [
            (messageuri, po.author, participanturi),
            (participanturi, po.emailAddress, email),
        ]
        if name:
            triples += [
                (participanturi, po.name, name),
            ]
        subject = message["Subject"]
        if subject:
            subject = decodeHeader(subject)
            assert isinstance(subject, str)
            triples += [
                (messageuri, po.subject, subject),
            ]
        replyid_ = message["In-Reply-To"]
        saneid = self.makeId(replyid_)
        if bool(replyid_) and not bool(saneid):
            # reply header present but unusable: link to a LostEmailMessage
            self.nreplies += 1
            replyid = self.snapshotid + "-" + str(self.nlost_messages)
            self.nlost_messages += 1
            replymessageuri = P.rdf.ic(po.LostEmailMessage, replyid,
                                       self.translation_graph,
                                       self.snapshoturi)
            triples += [
                (replymessageuri, a, po.EmailMessage),
                (replymessageuri, NS.rdfs.comment,
                 "This message registered as having a reply, but the field might be ill-formed: " + replyid_),
                (messageuri, po.replyTo, replymessageuri),
            ]
        elif saneid:
            self.nreplies += 1
            replymessageuri = P.rdf.ic(po.EmailMessage, saneid,
                                       self.translation_graph,
                                       self.snapshoturi)
            triples += [
                (replymessageuri, po.gmaneID, saneid),
                (messageuri, po.replyTo, replymessageuri),
            ]
        # Date header may come as a plain string or an encoded Header object
        if isinstance(message["Date"], str):
            datetime = parseDate(message["Date"])
        elif isinstance(message["Date"], mailbox.email.header.Header):
            datetimestring = decodeHeader(message["Date"])
            if False in [i in string.printable for i in datetimestring]:
                # unprintable garbage: record that the creation date is lost
                datetime = None
                triples += [
                    (messageuri, po.lostCreatedAt, True),
                ]
            else:
                datetime_ = re.findall(r"(.*\d\d:\d\d:\d\d).*",
                                       datetimestring)[0]
                datetime = parseDate(datetime_)
        else:
            raise ValueError("datetime not understood")
        if datetime:
            self.dates += [datetime]
            triples += [
                (messageuri, po.createdAt, datetime),
            ]
        if message["References"]:
            references = message["References"].replace("\n", "").replace(
                "\t", "").replace(" ", "")
            if not re.findall(r"\A<(.*?)>\Z", references):
                c("::: ::: ::: references field not understood",
                  message["References"])
                triples += [
                    (messageuri, po.comment,
                     "the references are not understood (<.*> ids are added anyway): " + message["References"]),
                    (messageuri, po.referencesLost, True),
                ]
            for reference in re.findall(r"<(.*?)>", references):
                self.nreferences += 1
                referenceuri = P.rdf.ic(po.EmailMessage, reference,
                                        self.translation_graph,
                                        self.snapshoturi)
                triples += [
                    (referenceuri, po.gmaneID, reference),
                    (messageuri, po.hasReference, referenceuri),
                ]
            # fallback: bare address-like tokens outside <...> brackets
            for part in message["References"].replace("\n", "").replace(
                    "\t", "").split():
                if validate_email(part):
                    self.nreferences += 1
                    referenceuri = P.rdf.ic(po.EmailMessage, part,
                                            self.translation_graph,
                                            self.snapshoturi)
                    triples += [
                        # BUGFIX: was the stale loop variable `reference`
                        # from the previous loop; the id of THIS reference
                        # is `part`
                        (referenceuri, po.gmaneID, part),
                        (messageuri, po.hasReference, referenceuri),
                    ]
        text = getText(message)
        if text:
            # raw-body statistics
            nchars = len(text)
            ntokens = len(k.wordpunct_tokenize(text))
            nsentences = len(k.sent_tokenize(text))
            triples += [
                (messageuri, po.messageText, text),
                (messageuri, po.nChars, nchars),
                (messageuri, po.nTokens, ntokens),
                (messageuri, po.nSentences, nsentences),
            ]
            self.nchars_all += [nchars]
            self.ntokens_all += [ntokens]
            self.nsentences_all += [nsentences]
            # cleaned-body statistics (quotes/signatures stripped by
            # cleanEmailBody)
            clean_text = cleanEmailBody(text)
            self.nremoved_lines += text.count("\n") - clean_text.count("\n")
            self.nlines += text.count("\n")
            nchars_clean = len(clean_text)
            ntokens_clean = len(k.wordpunct_tokenize(clean_text))
            nsentences_clean = len(k.sent_tokenize(clean_text))
            triples += [
                (messageuri, po.messageTextClean, clean_text),
                (messageuri, po.nCharsClean, nchars_clean),
                (messageuri, po.nTokensClean, ntokens_clean),
                (messageuri, po.nSentencesClean, nsentences_clean),
            ]
            self.nchars_clean_all += [nchars_clean]
            self.ntokens_clean_all += [ntokens_clean]
            self.nsentences_clean_all += [nsentences_clean]
            for url in re.findall(
                    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                    clean_text):
                self.nurls += 1
                triples += [
                    (messageuri, po.hasUrl, url),
                ]
        content_type = message.get_content_type()
        if content_type:
            triples += [(messageuri, po.contentType, content_type)]
        else:
            raise ValueError("/\/\/\/\/\ message without content type")
        organization = message["Organization"]
        if organization:
            if not isinstance(organization, str):
                # encoded Header: keep only printable characters
                organization = "".join(i for i in str(organization)
                                       if i in string.printable)
            triples += [
                (messageuri, po.organization, organization),
            ]
        if message["cc"]:
            cc, unparsed = parseAddresses(message["cc"])
            if unparsed:
                triples += [
                    (messageuri, po.unparsedCC, unparsed),
                ]
            for peeraddress, peername in cc:
                peeraddress = peeraddress.strip()
                assert bool(peeraddress)
                peeruri = P.rdf.ic(po.EmailPeer, peeraddress,
                                   self.translation_graph, self.snapshoturi)
                triples += [
                    (messageuri, po.cc, peeruri),
                    (peeruri, po.emailAddress, peeraddress),
                ]
                self.ncc += 1
                if peername:
                    triples += [
                        (peeruri, po.name, peername.strip()),
                    ]
        if message["to"]:
            to, unparsed = parseAddresses(message["to"])
            if unparsed:
                triples += [
                    (messageuri, po.unparsedTo, unparsed),
                ]
            for peeraddress, peername in to:
                peeraddress = peeraddress.strip()
                assert bool(peeraddress)
                peeruri = P.rdf.ic(po.EmailPeer, peeraddress,
                                   self.translation_graph, self.snapshoturi)
                triples += [
                    (messageuri, po.to, peeruri),
                    (peeruri, po.emailAddress, peeraddress),
                ]
                self.nto += 1
                if peername:
                    triples += [
                        (peeruri, po.name, peername.strip()),
                    ]
        listid = message["list-id"]
        if listid:
            assert isinstance(listid, str)
            listid = listid.replace("\n", "").replace("\t", "")
            # list-id formats: bare id / "name id" / "name <id>"
            if listid.count("<") == listid.count(">") == listid.count(
                    " ") == 0:
                listname = ""
                listid_ = listid
            elif listid.count("<") == listid.count(">") == 0:
                # no brackets: longest whitespace-separated token is the id
                parts = listid.split()
                lens = [len(i) for i in parts]
                listid_ = [i for i in parts if len(i) == max(lens)][0]
                listname = " ".join(i for i in parts if len(i) != max(lens))
            elif listid.count("<") == listid.count(">") == 1:
                listname, listid_ = re.findall(r"(.*) {0,1}<(.*)>",
                                               listid)[0]
            else:
                raise ValueError("Unexpected listid string format")
            listuri = P.rdf.ic(po.EmailList, listid_,
                               self.translation_graph, self.snapshoturi)
            triples += [
                (messageuri, po.emailList, listuri),
                (listuri, po.listID, listid_),
            ]
            if listname:
                triples += [
                    (listuri, po.name, listname.strip()),
                ]
        P.add(triples, self.translation_graph)
        mbox.close()
def makeMetadata(self):
    """Publish aggregate statistics and provenance for a Twitter snapshot.

    Mirrors the snapshot/raw-file triples from the social graph into the
    meta graph, computes per-tweet character and token statistics,
    scaffolds the rendered-file listings, builds the textual summary
    ``self.desc`` and records provenance triples.
    """
    snapshot_triples = P.get(self.snapshoturi, None, None, self.social_graph)
    rawfiles = P.get(self.snapshoturi, po.rawFile, None,
                     self.social_graph, strict=True, minimized=True)
    for rawfile in rawfiles:
        snapshot_triples += P.get(rawfile, None, None, self.social_graph)
    # corpus-wide character and token statistics
    self.totalchars = sum(self.nchars_all)
    self.mcharstweets = n.mean(self.nchars_all)
    self.dcharstweets = n.std(self.nchars_all)
    self.totaltokens = sum(self.ntokens_all)
    self.mtokenstweets = n.mean(self.ntokens_all)
    self.dtokenstweets = n.std(self.ntokens_all)
    P.add(snapshot_triples, context=self.meta_graph)
    stats = [
        (po.nParticipants, self.nparticipants),
        (po.nTweets, self.ntweets),
        (po.nReplies, self.nreplies),
        (po.nRetweets, self.nretweets),
        (po.nCharsOverall, self.totalchars),
        (po.mCharsOverall, self.mcharstweets),
        (po.dCharsOverall, self.dcharstweets),
        (po.nTokensOverall, self.totaltokens),
        (po.mTokensOverall, self.mtokenstweets),
        (po.dTokensOverall, self.dtokenstweets),
    ]
    P.add([(self.snapshoturi, pred, obj) for pred, obj in stats],
          context=self.meta_graph)
    # one triple per participant attribute, per rendered file, per URL
    P.rdf.triplesScaffolding(
        self.snapshoturi,
        [po.tweetParticipantAttribute] * len(self.participantvars),
        self.participantvars, context=self.meta_graph)
    filenames = self.tweet_rdf + self.tweet_ttl
    filename_preds = ([po.tweetXMLFilename] * len(self.tweet_rdf) +
                      [po.tweetTTLFilename] * len(self.tweet_ttl))
    P.rdf.triplesScaffolding(self.snapshoturi, filename_preds, filenames,
                             context=self.meta_graph)
    online_preds = ([po.onlineTweetXMLFile] * len(self.tweet_rdf) +
                    [po.onlineTweetTTLFile] * len(self.tweet_ttl))
    P.rdf.triplesScaffolding(self.snapshoturi, online_preds,
                             [self.online_prefix + i for i in filenames],
                             context=self.meta_graph)
    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"
    # human-readable dataset summary, stored below as an rdfs:comment
    segments = [
        "twitter dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
            self.snapshotid, self.snapshoturi, self.isego, self.isgroup),
        "\nisFriendship: {}; ".format(self.isfriendship),
        "isInteraction: {}.".format(self.isinteraction),
        "\nnParticipants: {}; nInteractions: {} (replies+retweets+user mentions).".format(
            self.nparticipants,
            self.nreplies + self.nretweets + self.nuser_mentions),
        "\nisPost: {} (alias hasText: {})".format(self.hastext, self.hastext),
        "\nnTweets: {}; ".format(self.ntweets),
        "nReplies: {}; nRetweets: {}; nUserMentions: {}.".format(
            self.nreplies, self.nretweets, self.nuser_mentions),
        "\nnTokens: {}; mTokens: {}; dTokens: {};".format(
            self.totaltokens, self.mtokenstweets, self.dtokenstweets),
        "\nnChars: {}; mChars: {}; dChars: {}.".format(
            self.totalchars, self.mcharstweets, self.dcharstweets),
        "\nnHashtags: {}; nMedia: {}; nLinks: {}.".format(
            self.nhashtags, self.nmedia, self.nlinks),
    ]
    self.desc = "".join(segments)
    provenance = [
        (self.snapshoturi, po.triplifiedIn, datetime.datetime.now()),
        (self.snapshoturi, po.triplifiedBy, "scripts/"),
        (self.snapshoturi, po.donatedBy, self.snapshotid[:-4]),
        (self.snapshoturi, po.availableAt, self.online_prefix),
        (self.snapshoturi, po.onlineMetaXMLFile, self.online_prefix + self.mrdf),
        (self.snapshoturi, po.onlineMetaTTLFile, self.online_prefix + self.mttl),
        (self.snapshoturi, po.metaXMLFileName, self.mrdf),
        (self.snapshoturi, po.metaTTLFileName, self.mttl),
        (self.snapshoturi, po.totalXMLFileSizeMB, sum(self.size_rdf)),
        (self.snapshoturi, po.totalTTLFileSizeMB, sum(self.size_ttl)),
        (self.snapshoturi, po.acquiredThrough, "Twitter APIs"),
        (self.snapshoturi, po.socialProtocolTag, "Twitter"),
        (self.snapshoturi, po.socialProtocol,
         P.rdf.ic(po.Platform, "Twitter", self.meta_graph, self.snapshoturi)),
        (self.snapshoturi, po.nTriples, self.ntriples),
        (self.snapshoturi, NS.rdfs.comment, self.desc),
    ]
    P.add(provenance, self.meta_graph)
def makeMetadata(self):
    """Compute corpus statistics for the email snapshot and write all
    snapshot-level metadata triples to ``self.meta_graph``.

    Derives char/token/sentence aggregates (raw and "clean" variants),
    records file names and online locations of the triplified data, builds a
    human-readable description and asserts provenance triples.
    """
    # aggregates over the raw message text
    self.totalchars = sum(self.nchars_all)
    self.mchars_messages = n.mean(self.nchars_all)
    self.dchars_messages = n.std(self.nchars_all)
    self.totaltokens = sum(self.ntokens_all)
    self.mtokens_messages = n.mean(self.ntokens_all)
    self.dtokens_messages = n.std(self.ntokens_all)
    self.totalsentences = sum(self.nsentences_all)
    self.msentences_messages = n.mean(self.nsentences_all)
    self.dsentences_messages = n.std(self.nsentences_all)
    # same aggregates over the cleaned text
    # (what "clean" strips is defined where the *_clean_all lists are
    # filled, outside this view)
    self.totalchars_clean = sum(self.nchars_clean_all)
    self.mchars_messages_clean = n.mean(self.nchars_clean_all)
    self.dchars_messages_clean = n.std(self.nchars_clean_all)
    self.totaltokens_clean = sum(self.ntokens_clean_all)
    self.mtokens_messages_clean = n.mean(self.ntokens_clean_all)
    self.dtokens_messages_clean = n.std(self.ntokens_clean_all)
    self.totalsentences_clean = sum(self.nsentences_clean_all)
    self.msentences_messages_clean = n.mean(self.nsentences_clean_all)
    self.dsentences_messages_clean = n.std(self.nsentences_clean_all)
    # fraction of lines removed by cleaning
    fremoved_lines = self.nremoved_lines / self.nlines
    # snapshot-level count and statistics triples
    triples = [
        (self.snapshoturi, po.nParticipants, self.nparticipants),
        (self.snapshoturi, po.nMessages, self.nmessages),
        (self.snapshoturi, po.nEmptyMessages, self.nempty),
        (self.snapshoturi, po.nReplies, self.nreplies),
        (self.snapshoturi, po.nCC, self.ncc),
        (self.snapshoturi, po.nTo, self.nto),
        (self.snapshoturi, po.nReferences, self.nreferences),
        (self.snapshoturi, po.nUrls, self.nurls),
        (self.snapshoturi, po.nCharsOverall, self.totalchars),
        (self.snapshoturi, po.mCharsOverall, self.mchars_messages),
        (self.snapshoturi, po.dCharsOverall, self.dchars_messages),
        (self.snapshoturi, po.nTokensOverall, self.totaltokens),
        (self.snapshoturi, po.mTokensOverall, self.mtokens_messages),
        (self.snapshoturi, po.dTokensOverall, self.dtokens_messages),
        (self.snapshoturi, po.nSentencesOverall, self.totalsentences),
        (self.snapshoturi, po.mSentencesOverall, self.msentences_messages),
        (self.snapshoturi, po.dSentencesOverall, self.dsentences_messages),
        (self.snapshoturi, po.nCharsOverallClean, self.totalchars_clean),
        (self.snapshoturi, po.mCharsOverallClean, self.mchars_messages_clean),
        (self.snapshoturi, po.dCharsOverallClean, self.dchars_messages_clean),
        (self.snapshoturi, po.nTokensOverallClean, self.totaltokens_clean),
        (self.snapshoturi, po.mTokensOverallClean, self.mtokens_messages_clean),
        (self.snapshoturi, po.dTokensOverallClean, self.dtokens_messages_clean),
        (self.snapshoturi, po.nSentencesOverallClean, self.totalsentences_clean),
        (self.snapshoturi, po.mSentencesOverallClean, self.msentences_messages_clean),
        (self.snapshoturi, po.dSentencesOverallClean, self.dsentences_messages_clean),
        (self.snapshoturi, po.fRemovedLines, fremoved_lines),
    ]
    P.add(triples, context=self.meta_graph)
    # which participant/message attributes were captured
    P.rdf.triplesScaffolding(self.snapshoturi,
                             [po.gmaneParticipantAttribute] * len(self.participantvars),
                             self.participantvars,
                             context=self.meta_graph)
    P.rdf.triplesScaffolding(self.snapshoturi,
                             [po.gmaneMessageAttribute] * len(self.messagevars),
                             self.messagevars,
                             context=self.meta_graph)
    # local filenames of the triplified email data (XML/RDF and TTL)
    P.rdf.triplesScaffolding(self.snapshoturi,
                             [po.emailXMLFilename] * len(self.email_xml) +
                             [po.emailTTLFilename] * len(self.email_ttl),
                             self.email_xml + self.email_ttl,
                             context=self.meta_graph)
    # online locations of those same files
    P.rdf.triplesScaffolding(
        self.snapshoturi,
        [po.onlineEmailXMLFile] * len(self.email_xml) +
        [po.onlineEmailTTLFile] * len(self.email_ttl),
        [self.online_prefix + i for i in self.email_xml + self.email_ttl],
        context=self.meta_graph)
    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"
    # human-readable summary, attached below as rdfs:comment
    # NOTE(review): this literal was split across physical lines in the
    # mangled source; rejoined here as one string — confirm against VCS.
    self.desc = "gmane public email list dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
        self.snapshotid, self.snapshoturi, self.isego, self.isgroup, )
    self.desc += "\nisFriendship: {}; ".format(self.isfriendship)
    self.desc += "isInteraction: {}.".format(self.isinteraction)
    self.desc += "\nnParticipants: {}; nInteractions: {} (replies+references+cc+to).".format(
        self.nparticipants,
        self.nreplies + self.nreferences + self.ncc + self.nto)
    self.desc += "\nisPost: {} (alias hasText: {})".format(
        self.hastext, self.hastext)
    self.desc += "\nnMessages: {} (+ empty: {}); ".format(
        self.nmessages, self.nempty)
    self.desc += "nReplies: {}; nReferences: {}; nTo {}; nCC: {}.".format(
        self.nreplies, self.nreferences, self.ncc, self.nto)
    self.desc += "\nnChars: {}; mChars: {}; dChars: {}.".format(
        self.totalchars, self.mchars_messages, self.dchars_messages)
    self.desc += "\nnTokens: {}; mTokens: {}; dTokens: {};".format(
        self.totaltokens, self.mtokens_messages, self.dtokens_messages)
    self.desc += "\nnSentences: {}; mSentences: {}; dSentences: {}.".format(
        self.totalsentences, self.msentences_messages,
        self.dsentences_messages)
    self.desc += "\nnCharsClean: {}; mCharsClean: {}; dCharsClean: {}.".format(
        self.totalchars_clean, self.mchars_messages_clean,
        self.dchars_messages_clean)
    self.desc += "\nnTokensClean: {}; mTokensClean: {}; dTokensClean: {};".format(
        self.totaltokens_clean, self.mtokens_messages_clean,
        self.dtokens_messages_clean)
    self.desc += "\nnSentencesClean: {}; mSentencesClean: {}; dSentencesClean: {}.".format(
        self.totalsentences_clean, self.msentences_messages_clean,
        self.dsentences_messages_clean)
    self.desc += "\nnUrls: {}; fRemovedLines {};.".format(
        self.nurls, fremoved_lines)
    # count the triples produced by the translation before asserting it
    self.ntriples = len(P.context(self.translation_graph))
    # provenance / availability facts about this triplification
    triples = [
        (self.snapshoturi, po.triplifiedIn, datetime.datetime.now()),
        (self.snapshoturi, po.triplifiedBy, "scripts/"),
        (self.snapshoturi, po.donatedBy, self.snapshotid),
        (self.snapshoturi, po.availableAt, self.online_prefix),
        (self.snapshoturi, po.onlineMetaXMLFile, self.online_prefix + self.mrdf),
        (self.snapshoturi, po.onlineMetaTTLFile, self.online_prefix + self.mttl),
        (self.snapshoturi, po.metaXMLFileName, self.mrdf),
        (self.snapshoturi, po.metaTTLFileName, self.mttl),
        (self.snapshoturi, po.totalXMLFileSizeMB, sum(self.size_xml)),
        (self.snapshoturi, po.totalTTLFileSizeMB, sum(self.size_ttl)),
        (self.snapshoturi, po.acquiredThrough,
         "Gmane public mailing list archive RSS feed"),
        (self.snapshoturi, po.socialProtocolTag, "Gmane"),
        (self.snapshoturi, po.socialProtocol,
         P.rdf.ic(po.Platform, "Gmane", self.meta_graph, self.snapshoturi)),
        (self.snapshoturi, po.nTriples, self.ntriples),
        (self.snapshoturi, NS.rdfs.comment, self.desc),
        (self.snapshoturi, po.gmaneID, self.directory),
    ]
    P.add(triples, context=self.meta_graph)