def makePostsTriples(self):
    """Aggregate the per-message text sizes and add them to the meta graph.

    Returns immediately when ``self.hastext`` is falsy.  Otherwise the sum,
    mean and standard deviation of ``self.size_chars_overall``,
    ``self.size_tokens_overall`` and ``self.size_sentences_overall`` are
    stored on the instance, the message, participant and URL counts are
    queried from ``self.translation_graph``, and the aggregates are added to
    ``self.meta_graph`` with ``self.snapshoturi`` as subject.
    """
    if not self.hastext:
        return

    char_sizes = self.size_chars_overall
    self.totalchars = sum(char_sizes)
    self.mchars_messages = n.mean(char_sizes)
    self.dchars_messages = n.std(char_sizes)

    token_sizes = self.size_tokens_overall
    self.totaltokens = sum(token_sizes)
    self.mtokens_messages = n.mean(token_sizes)
    self.dtokens_messages = n.std(token_sizes)

    sentence_sizes = self.size_sentences_overall
    self.totalsentences = sum(sentence_sizes)
    self.msentences_messages = n.mean(sentence_sizes)
    self.dsentences_messages = n.std(sentence_sizes)

    source_graph = self.translation_graph
    self.nmessages = P.get(
        "SELECT (COUNT(?s) as ?s) WHERE { ?s a po:Message }",
        context=source_graph)
    self.nparticipants = P.get(
        "SELECT (COUNT(?s) as ?s) WHERE { ?s a po:Participant }",
        context=source_graph)
    # The URL count is kept on the instance only; no triple is emitted for it.
    self.nurls = P.get(
        "SELECT (COUNT(?s) as ?s) WHERE { ?s po:hasUrl ?o }",
        context=source_graph)

    subject = self.snapshoturi
    aggregates = [
        (po.nParticipants, self.nparticipants),
        (po.nMessages, self.nmessages),
        (po.nCharsOverall, self.totalchars),
        (po.mCharsOverall, self.mchars_messages),
        (po.dCharsOverall, self.dchars_messages),
        (po.nTokensOverall, self.totaltokens),
        (po.mTokensOverall, self.mtokens_messages),
        (po.dTokensOverall, self.dtokens_messages),
        (po.nSentencesOverall, self.totalsentences),
        (po.mSentencesOverall, self.msentences_messages),
        (po.dSentencesOverall, self.dsentences_messages),
    ]
    P.add([(subject, predicate, value) for predicate, value in aggregates],
          context=self.meta_graph)
def publishAny(snapshoturi):
    """Create the publishing object that matches a Facebook snapshot's files.

    The friendship file's format and name and the snapshot id are fetched
    with a query that passes no context; the interaction and post file names
    are fetched from the ``social_facebook_inferred`` context.

    Returns:
        A ``GdfRdfPublishing`` when the format contains ``"gdf"``, a
        ``GmlRdfPublishing`` when it equals ``"gml"``, otherwise ``None``.
    """
    # publish to umbrelladir
    def _filename_for(expressed_class):
        # File name of the raw file that expresses the given class.
        pattern = [
            (snapshoturi, NS.po.rawFile, "?fileurifoo"),
            ("?fileurifoo", po.expressedClass, expressed_class),
            ("?fileurifoo", NS.po.fileName, "?filename"),
        ]
        return P.get(pattern, context=social_facebook_inferred)

    friendship_pattern = [
        (snapshoturi, po.rawFile, "?fileurifoo"),
        (snapshoturi, po.snapshotID, "?snapshotid"),
        ("?fileurifoo", po.expressedClass, po.Friendship),
        ("?fileurifoo", po.fileFormat, "?fileformat"),
        ("?fileurifoo", po.fileName, "?filename"),
    ]
    fileformat, friendship_filename, snapshotid = P.get(friendship_pattern)
    interaction_filename = _filename_for(po.Interaction)
    posts_filename = _filename_for(po.Post)

    c(fileformat)
    if "gdf" in fileformat:
        c("publish gdf", snapshoturi)
        return GdfRdfPublishing(snapshoturi, snapshotid, friendship_filename,
                                interaction_filename, posts_filename)
    if fileformat == "gml":
        c("publish gml", snapshoturi)
        return GmlRdfPublishing(snapshoturi, snapshotid, friendship_filename)
    return None
def replyTriples(self, tweet, tweeturi):
    """Return the triples linking ``tweet`` to the tweet and author it replies to.

    An empty list is returned when neither ``in_reply_to_user_id_str`` nor
    ``in_reply_to_status_id_str`` is set.  Otherwise ``self.nreplies`` is
    incremented and a participant and a tweet are resolved for the reply
    target:

    * a known user id yields a participant identified by that id, otherwise
      an anonymous participant numbered by ``self.anonymous_user_count``;
    * a known status id yields a tweet identified by that id, otherwise a
      "noid" tweet numbered by ``self.anonymous_tweet_count``.

    ``self.nparticipants`` / ``self.ntweets`` are incremented when ``P.get``
    finds no ``po.numericID`` for the resolved participant / tweet.

    Args:
        tweet: mapping holding the two ``in_reply_to_*`` keys.
        tweeturi: URI of the replying tweet.
    """
    replied_user = tweet["in_reply_to_user_id_str"]
    replied_status = tweet["in_reply_to_status_id_str"]
    triples = []
    if not (replied_user or replied_status):
        return triples

    self.nreplies += 1

    # The participant branch must depend on the *user* id.  It used to test
    # the status id, which raised TypeError for a status id without a user
    # id and anonymised users whose id was known.
    if replied_user:
        userid_reply = self.snapshotid + "-" + replied_user
        useruri_reply = P.rdf.ic(po.Participant, userid_reply,
                                 self.tweet_graph, self.snapshoturi)
        if not P.get(useruri_reply, po.numericID, None):  # new user
            self.nparticipants += 1
            triples.append((useruri_reply, po.numericID, userid_reply))
    else:
        userid_reply = (self.snapshotid + "-anonymous-"
                        + str(self.anonymous_user_count))
        useruri_reply = P.rdf.ic(po.Participant, userid_reply,
                                 self.tweet_graph, self.snapshoturi)
        self.anonymous_user_count += 1
        triples.append((useruri_reply, po.anonymous, True))

    if replied_status:
        tweetid_reply = userid_reply + "-" + replied_status
        tweeturi_reply = P.rdf.ic(po.Tweet, tweetid_reply,
                                  self.tweet_graph, self.snapshoturi)
        if not P.get(tweeturi_reply, po.numericID, None):  # new message
            self.ntweets += 1
            triples.append((tweeturi_reply, po.numericID, tweetid_reply))
    else:
        tweetid_reply = (self.snapshotid + "-noidmsg-"
                         + str(self.anonymous_tweet_count))
        tweeturi_reply = P.rdf.ic(po.Tweet, tweetid_reply,
                                  self.tweet_graph, self.snapshoturi)
        self.anonymous_tweet_count += 1
        triples.append((tweeturi_reply, po.noid, True))

    triples += [
        (tweeturi, po.inReplyToTweet, tweeturi_reply),
        (tweeturi_reply, po.author, useruri_reply),
    ]
    return triples
def countNew(self, tweetid, userid):
    """Bump the tweet and participant counters for ids not found by ``P.get``.

    A tweet is looked up as a ``po.Tweet`` whose ``po.stringID`` is
    ``tweetid``; a participant as a ``po.Participant`` whose ``po.numericID``
    is ``userid``.  Both lookups run before either counter is touched.
    """
    seen_tweet = P.get([
        ("?uri", a, po.Tweet),
        ("?uri", po.stringID, tweetid),
    ])
    seen_participant = P.get([
        ("?uri", a, po.Participant),
        ("?uri", po.numericID, userid),
    ])
    if not seen_tweet:
        self.ntweets += 1
    if not seen_participant:
        self.nparticipants += 1
def publishAll(snapshoturis=None):
    """Express IRC logs as RDF for publishing.

    Args:
        snapshoturis: snapshot URIs to publish.  When empty or ``None`` every
            ``po.IRCSnapshot`` returned by ``P.get`` is published, ordered by
            the summed ``po.fileSize`` of its raw files (smallest first).

    Returns:
        Whatever ``publishAny`` returned for the last snapshot, or ``None``
        when there was nothing to publish.
    """
    if not snapshoturis:
        c("getting irc snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.IRCSnapshot, minimized=True):
            uridict[snapshoturi] = 0
            for rawFile in P.get(snapshoturi, NS.po.rawFile, strict=True,
                                 minimized=True):
                uridict[snapshoturi] += P.get(
                    rawFile, NS.po.fileSize, minimized=True).toPython()
        # The discovered URIs were never assigned before being sorted, so
        # the default call crashed on ``None.sort``.
        snapshoturis = sorted(uridict, key=uridict.get)
    # Pre-bind so an empty snapshot list returns None instead of raising
    # UnboundLocalError.
    triplification_class = None
    for snapshoturi in snapshoturis:
        triplification_class = publishAny(snapshoturi)
    return triplification_class
def makeMetadata(self):
    """Fill ``self.meta_graph`` with the snapshot's metadata triples.

    Copies the triples of the snapshot and of its raw files from the
    "social_facebook" context, adds the friendship file names and counts,
    scaffolds one attribute triple per entry of ``self.friendsvars``, builds
    the textual description ``self.desc`` and finally adds the provenance
    triples (timestamp, donor, online locations, platform, comment).

    Sets ``self.ffile``, ``self.frdf``, ``self.fttl``, ``self.mrdf``,
    ``self.mttl`` and ``self.desc`` as a side effect.
    """
    source_context = "social_facebook"
    copied = P.get(self.snapshoturi, None, None, source_context)
    rawfiles = P.get(self.snapshoturi, po.rawFile, None, source_context,
                     strict=True, minimized=True)
    for rawfile in rawfiles:
        copied += P.get(rawfile, None, None, source_context)
    P.add(copied, context=self.meta_graph)

    self.ffile = "base/" + self.filename_friendships
    self.frdf = self.snapshotid + "Friendship.rdf"
    self.fttl = self.snapshotid + "Friendship.ttl"
    friendship_facts = [
        (po.onlineOriginalFriendshipFile, self.online_prefix + self.ffile),
        (po.originalFriendshipFileName, self.ffile),
        (po.onlineFriendshipXMLFile, self.online_prefix + self.frdf),
        (po.onlineFriendshipTTLFile, self.online_prefix + self.fttl),
        (po.friendshipXMLFileName, self.frdf),
        (po.friendshipTTLFileName, self.fttl),
        (po.nFriends, self.nfriends),
        (po.nFriendships, self.nfriendships),
        (po.friendshipsAnonymized, self.friendships_anonymized),
    ]
    P.add([(self.snapshoturi, predicate, value)
           for predicate, value in friendship_facts],
          context=self.meta_graph)
    # One attribute triple per friendship variable.
    P.rdf.triplesScaffolding(
        self.snapshoturi,
        [po.frienshipParticipantAttribute] * len(self.friendsvars),
        self.friendsvars,
        context=self.meta_graph)

    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"
    description_parts = [
        "facebook network with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
            self.snapshotid, self.snapshoturi, self.isego, self.isgroup),
        "\nisFriendship: {}".format(self.isfriendship),
        "; nFriends: {}; nFrienships: {}.".format(self.nfriends,
                                                   self.nfriendships),
        "\nisInteraction: {}".format(self.isinteraction),
        "\nisPost: {} (alias hasText: {})".format(self.hastext, self.hastext),
    ]
    self.desc = "".join(description_parts)

    provenance = [
        (self.snapshoturi, po.triplifiedIn, datetime.datetime.now()),
        (self.snapshoturi, po.triplifiedBy, "scripts/"),
        (self.snapshoturi, po.donatedBy, self.snapshotid[:-4]),
        (self.snapshoturi, po.availableAt, self.online_prefix),
        (self.snapshoturi, po.onlineMetaXMLFile,
         self.online_prefix + self.mrdf),
        (self.snapshoturi, po.onlineMetaTTLFile,
         self.online_prefix + self.mttl),
        (self.snapshoturi, po.metaXMLFileName, self.mrdf),
        (self.snapshoturi, po.metaTTLFileName, self.mttl),
        (self.snapshoturi, po.acquiredThrough, "Netvizz"),
        (self.snapshoturi, po.socialProtocolTag, "Facebook"),
        (self.snapshoturi, po.socialProtocol,
         P.rdf.ic(po.Platform, "Facebook", self.meta_graph,
                  self.snapshoturi)),
        (self.snapshoturi, NS.rdfs.comment, self.desc),
    ]
    P.add(provenance, self.meta_graph)
def publishAny(snapshoturi):
    """Create a ``PicklePublishing`` for the snapshot's raw files.

    The file names of the snapshot's raw files are fetched as a list and
    sorted in place; the snapshot id is fetched separately.

    Returns:
        The ``PicklePublishing`` object, or ``None`` when the snapshot has no
        raw file names.
    """
    # publish to umbrelladir
    filenames = P.get(
        [
            (snapshoturi, po.rawFile, "?fileurifoo"),
            ("?fileurifoo", po.fileName, "?filename"),
        ],
        join_queries="list",
        strict=True,
    )
    filenames.sort()
    snapshotid = P.get([(snapshoturi, po.snapshotID, "?snapshotid")])
    if not filenames:
        return None
    return PicklePublishing(snapshoturi, snapshotid, filenames)
def publishAll(snapshoturis=None):
    """Express tweets as RDF for publishing.

    Args:
        snapshoturis: snapshot URIs to publish.  When empty or ``None`` the
            ``po.TwitterSnapshot`` URIs returned by ``P.get`` that end in
            ".gml" are published, ordered by the summed ``po.fileSize`` of
            their raw files (smallest first).

    Returns:
        Whatever ``publishAny`` returned for the last snapshot, or ``None``
        when there was nothing to publish.
    """
    if not snapshoturis:
        c("getting twitter snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.TwitterSnapshot,
                                 minimized=True):
            uridict[snapshoturi] = 0
            for rawFile in P.get(snapshoturi, NS.po.rawFile, strict=True,
                                 minimized=True):
                uridict[snapshoturi] += P.get(
                    rawFile, NS.po.fileSize, minimized=True).toPython()
        # NOTE(review): the ".gml" suffix filter is kept as found; confirm
        # it is intended for Twitter snapshot URIs.
        snapshoturis = sorted(
            (uri for uri in uridict if uri.endswith(".gml")),
            key=uridict.get)
    # Pre-bind so an empty snapshot list returns None instead of raising
    # UnboundLocalError.
    triplification_class = None
    for snapshoturi in snapshoturis:
        triplification_class = publishAny(snapshoturi)
    return triplification_class
def countNew(self, tweetid, userid):
    """Bump the tweet and participant counters for ids not found by ``P.get``.

    A tweet is looked up as a ``po.Tweet`` whose ``po.stringID`` is
    ``tweetid``; a participant as a ``po.Participant`` whose ``po.numericID``
    is ``userid``.  Both lookups run before either counter is touched.
    """
    seen_tweet = P.get([
        ("?uri", a, po.Tweet),
        ("?uri", po.stringID, tweetid),
    ])
    seen_participant = P.get([
        ("?uri", a, po.Participant),
        ("?uri", po.numericID, userid),
    ])
    if not seen_tweet:
        self.ntweets += 1
    if not seen_participant:
        self.nparticipants += 1
def publishAny(snapshoturi):
    """Create a ``PicklePublishing`` for the snapshot's raw files.

    The file names of the snapshot's raw files are fetched as a list and
    sorted in place; the snapshot id is fetched separately.

    Returns:
        The ``PicklePublishing`` object, or ``None`` when the snapshot has no
        raw file names.
    """
    # publish to umbrelladir
    filenames = P.get(
        [
            (snapshoturi, po.rawFile, "?fileurifoo"),
            ("?fileurifoo", po.fileName, "?filename"),
        ],
        join_queries="list",
        strict=True,
    )
    filenames.sort()
    snapshotid = P.get([(snapshoturi, po.snapshotID, "?snapshotid")])
    if not filenames:
        return None
    return PicklePublishing(snapshoturi, snapshotid, filenames)
def startSession(context="session"):
    """Open a new session, registering a default participant if none is current.

    The "session" context is removed, then the triples describing the new
    session (its URI, start time, user and the current-status pointers) are
    written to ``context`` with ``P.set_``.  When no current user is found, a
    random nick is generated and registered as a ``DefaultParticipant``.
    Afterwards the minimal test ontology and test data are loaded and RDFS
    inference is run.

    Args:
        context: name of the context the session triples are written to.
    """
    current_user_uri = P.get(NS.per.currentUser)  # from rdf.rdflib OK
    now = datetime.now()

    # The session URI is built from a nick, which used to be bound only when
    # a new user was created; an existing user raised UnboundLocalError.
    # Look the nick up *before* the "session" context is removed.
    # NOTE(review): assumes the nick is stored as (user, per:nick, literal)
    # and that P.get(..., minimized=True) returns that object - confirm.
    stored_nick = None
    if current_user_uri:
        stored_nick = P.get(current_user_uri, NS.per.nick, minimized=True)

    # NOTE(review): always clears "session", even when ``context`` differs.
    P.context("session", "remove")

    if not current_user_uri:
        nick = randomNick()  # OK
        current_user_uri = P.rdf.timestampedURI(
            NS.per.Participant, nick, now)  # rdf.rdflib OK
        triples = [
            (current_user_uri, a, NS.per.DefaultParticipant),
            (current_user_uri, NS.per.nick, nick),
            (current_user_uri, NS.per.registered, now),
        ]
        c("Please create a user with P.utils.createUser() ASAP. Registered for now as {} with URI: {}".format(nick, current_user_uri))
    else:
        triples = []
        if stored_nick:
            nick = (stored_nick.toPython()
                    if hasattr(stored_nick, "toPython") else stored_nick)
        else:
            # No nick on record: label the session with a random one
            # instead of crashing.
            nick = randomNick()

    session_uri = P.rdf.timestampedURI(
        NS.per.Session, nick, now)  # from rdf.rdflib OK
    current_status_uri = NS.per.CurrentStatus  # class in per: ontology OK
    triples += [
        (current_status_uri, NS.per.currentSession, session_uri),
        (session_uri, NS.per.started, now),
        (session_uri, NS.per.user, current_user_uri),
        (current_status_uri, NS.per.currentUser, current_user_uri),
    ]
    P.set_(triples, context=context)  # from rdf.rdflib OK
    # P.rdf.minimumOntology()  # from rdf.ontology
    P.rdf.ontology.minimumTestOntology()  # from rdf.ontology
    # P.legacy.triples.datasets.datasets()  # from legacy.triples
    P.legacy.triples.datasets.minimalTestData()  # from legacy.triples
    P.rdf.inference.performRdfsInference(
        "void", "minimum_ontology",
        "session_legacy_metadata")  # from rdf.inference
def publishAll(snapshoturis=None):
    """Express emails as RDF for publishing.

    Args:
        snapshoturis: snapshot URIs to publish.  When empty or ``None`` every
            ``po.Snapshot`` returned by ``P.get`` is considered, ordered by
            the summed ``po.directorySize`` of its raw files (smallest
            first).

    Returns:
        A list with the result of ``publishAny`` for each of the first ten
        snapshots.
    """
    if not snapshoturis:
        c("getting email snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.Snapshot, minimized=True):
            uridict[snapshoturi] = 0
            for rawFile in P.get(snapshoturi, NS.po.rawFile, strict=True,
                                 minimized=True):
                uridict[snapshoturi] += P.get(
                    rawFile, NS.po.directorySize, minimized=True).toPython()
        # The discovered URIs were never assigned before being sorted, so
        # the default call crashed on ``None.sort``.
        snapshoturis = sorted(uridict, key=uridict.get)
    c("on triplification")
    # Only the first ten snapshots are published.
    return [publishAny(snapshoturi)
            for snapshoturi in list(snapshoturis)[:10]]
def publishAny(snapshoturi):
    """Create a ``LogPublishing`` for the snapshot's raw log file.

    A single query fetches the raw file's name and the snapshot id, which
    are unpacked in that order from the result of ``P.get``.
    """
    pattern = [
        (snapshoturi, po.rawFile, "?fileurifoo"),
        ("?fileurifoo", po.fileName, "?filename"),
        (snapshoturi, po.snapshotID, "?snapshotid"),
    ]
    found = P.get(pattern)
    filename, snapshotid = found
    return LogPublishing(snapshoturi, snapshotid, filename)
def publishAll(snapshoturis=None):
    """Express IRC logs as RDF for publishing.

    Args:
        snapshoturis: snapshot URIs to publish.  When empty or ``None`` every
            ``po.IRCSnapshot`` returned by ``P.get`` is published, ordered by
            the summed ``po.fileSize`` of its raw files (smallest first).

    Returns:
        Whatever ``publishAny`` returned for the last snapshot, or ``None``
        when there was nothing to publish.
    """
    if not snapshoturis:
        c("getting irc snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.IRCSnapshot, minimized=True):
            uridict[snapshoturi] = 0
            for rawFile in P.get(snapshoturi, NS.po.rawFile, strict=True,
                                 minimized=True):
                uridict[snapshoturi] += P.get(
                    rawFile, NS.po.fileSize, minimized=True).toPython()
        # The discovered URIs were never assigned before being sorted, so
        # the default call crashed on ``None.sort``.
        snapshoturis = sorted(uridict, key=uridict.get)
    # Pre-bind so an empty snapshot list returns None instead of raising
    # UnboundLocalError.
    triplification_class = None
    for snapshoturi in snapshoturis:
        triplification_class = publishAny(snapshoturi)
    return triplification_class
def publishAny(snapshoturi):
    """Create an ``MboxPublishing`` for the snapshot's raw directory.

    A single query fetches the data directory, the raw directory's name and
    the snapshot id, which are unpacked in that order from the result of
    ``P.get``.
    """
    # publish to umbrelladir
    pattern = [
        (snapshoturi, po.dataDir, "?datadir"),
        (snapshoturi, po.snapshotID, "?snapshotid"),
        (snapshoturi, po.rawDirectory, "?directoryurifoo"),
        ("?directoryurifoo", po.directoryName, "?directoryname"),
    ]
    found = P.get(pattern)
    data_dir, directory, snapshotid = found
    return MboxPublishing(snapshoturi, snapshotid, directory, data_dir)
def publishAny(snapshoturi):
    """Create a ``LogPublishing`` for the snapshot's raw log file.

    A single query fetches the raw file's name and the snapshot id, which
    are unpacked in that order from the result of ``P.get``.
    """
    # publish to umbrelladir
    pattern = [
        (snapshoturi, po.rawFile, "?fileurifoo"),
        ("?fileurifoo", po.fileName, "?filename"),
        (snapshoturi, po.snapshotID, "?snapshotid"),
    ]
    found = P.get(pattern)
    filename, snapshotid = found
    return LogPublishing(snapshoturi, snapshotid, filename)
def replyTriples(self, tweet, tweeturi):
    """Return the triples linking ``tweet`` to the tweet and author it replies to.

    An empty list is returned when neither ``in_reply_to_user_id_str`` nor
    ``in_reply_to_status_id_str`` is set.  Otherwise ``self.nreplies`` is
    incremented and a participant and a tweet are resolved for the reply
    target:

    * a known user id yields a participant identified by that id, otherwise
      an anonymous participant numbered by ``self.anonymous_user_count``;
    * a known status id yields a tweet identified by that id, otherwise a
      "noid" tweet numbered by ``self.anonymous_tweet_count``.

    ``self.nparticipants`` / ``self.ntweets`` are incremented when ``P.get``
    finds no ``po.numericID`` for the resolved participant / tweet.

    Args:
        tweet: mapping holding the two ``in_reply_to_*`` keys.
        tweeturi: URI of the replying tweet.
    """
    replied_user = tweet["in_reply_to_user_id_str"]
    replied_status = tweet["in_reply_to_status_id_str"]
    triples = []
    if not (replied_user or replied_status):
        return triples

    self.nreplies += 1

    # The participant branch must depend on the *user* id.  It used to test
    # the status id, which raised TypeError for a status id without a user
    # id and anonymised users whose id was known.
    if replied_user:
        userid_reply = self.snapshotid + "-" + replied_user
        useruri_reply = P.rdf.ic(po.Participant, userid_reply,
                                 self.tweet_graph, self.snapshoturi)
        if not P.get(useruri_reply, po.numericID, None):  # new user
            self.nparticipants += 1
            triples.append((useruri_reply, po.numericID, userid_reply))
    else:
        userid_reply = (self.snapshotid + "-anonymous-"
                        + str(self.anonymous_user_count))
        useruri_reply = P.rdf.ic(po.Participant, userid_reply,
                                 self.tweet_graph, self.snapshoturi)
        self.anonymous_user_count += 1
        triples.append((useruri_reply, po.anonymous, True))

    if replied_status:
        tweetid_reply = userid_reply + "-" + replied_status
        tweeturi_reply = P.rdf.ic(po.Tweet, tweetid_reply,
                                  self.tweet_graph, self.snapshoturi)
        if not P.get(tweeturi_reply, po.numericID, None):  # new message
            self.ntweets += 1
            triples.append((tweeturi_reply, po.numericID, tweetid_reply))
    else:
        tweetid_reply = (self.snapshotid + "-noidmsg-"
                         + str(self.anonymous_tweet_count))
        tweeturi_reply = P.rdf.ic(po.Tweet, tweetid_reply,
                                  self.tweet_graph, self.snapshoturi)
        self.anonymous_tweet_count += 1
        triples.append((tweeturi_reply, po.noid, True))

    triples += [
        (tweeturi, po.inReplyToTweet, tweeturi_reply),
        (tweeturi_reply, po.author, useruri_reply),
    ]
    return triples
def publishAll(snapshoturis=None):
    """Express emails as RDF for publishing.

    Args:
        snapshoturis: snapshot URIs to publish.  When empty or ``None`` every
            ``po.GmaneSnapshot`` returned by ``P.get`` is considered, ordered
            by the summed ``po.directorySize`` of its raw files (smallest
            first).

    Returns:
        A list with the result of ``publishAny`` for each of the first ten
        snapshots.
    """
    if not snapshoturis:
        c("getting email snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.GmaneSnapshot,
                                 minimized=True):
            uridict[snapshoturi] = 0
            for rawFile in P.get(snapshoturi, NS.po.rawFile, strict=True,
                                 minimized=True):
                uridict[snapshoturi] += P.get(
                    rawFile, NS.po.directorySize, minimized=True).toPython()
        # The discovered URIs were never assigned before being sorted, so
        # the default call crashed on ``None.sort``.
        snapshoturis = sorted(uridict, key=uridict.get)
    c("on triplification")
    # Only the first ten snapshots are published.
    return [publishAny(snapshoturi)
            for snapshoturi in list(snapshoturis)[:10]]
def publishAll(snapshoturis=None):
    """Express Facebook snapshots as RDF for publishing.

    Args:
        snapshoturis: snapshot URIs to publish.  When empty or ``None`` the
            ``po.FacebookSnapshot`` URIs returned by ``P.get`` that end in
            ".gml" are published, ordered by the summed ``po.fileSize`` of
            their raw files (smallest first).

    Returns:
        Whatever ``publishAny`` returned for the last snapshot, or ``None``
        when there was nothing to publish.
    """
    # Disabled ontology-inference step, kept for reference:
    # triples=S.facebook.ontology.snapshots()
    # P.add(triples,context="facebook_snapshots_ontology")
    # P.rdf.inference.performRdfsInference("social_facebook","facebook_snapshots_ontology",social_facebook_inferred,False)
    if not snapshoturis:
        c("getting facebook snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.FacebookSnapshot,
                                 minimized=True):
            uridict[snapshoturi] = 0
            for rawFile in P.get(snapshoturi, NS.po.rawFile, strict=True,
                                 minimized=True):
                uridict[snapshoturi] += P.get(
                    rawFile, NS.po.fileSize, minimized=True).toPython()
        snapshoturis = sorted(
            (uri for uri in uridict if uri.endswith(".gml")),
            key=uridict.get)
    c("snapuris:", snapshoturis)
    # Pre-bind so an empty snapshot list returns None instead of raising
    # UnboundLocalError.  The unused ``count`` local was dropped.
    triplification_class = None
    for snapshoturi in snapshoturis:
        triplification_class = publishAny(snapshoturi)
    return triplification_class
def publishAll(snapshoturis=None):
    """Express tweets as RDF for publishing.

    Args:
        snapshoturis: snapshot URIs to publish.  When empty or ``None`` the
            ``po.TwitterSnapshot`` URIs returned by ``P.get`` that end in
            ".gml" are published, ordered by the summed ``po.fileSize`` of
            their raw files (smallest first).

    Returns:
        Whatever ``publishAny`` returned for the last snapshot, or ``None``
        when there was nothing to publish.
    """
    if not snapshoturis:
        c("getting twitter snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.TwitterSnapshot,
                                 minimized=True):
            uridict[snapshoturi] = 0
            for rawFile in P.get(snapshoturi, NS.po.rawFile, strict=True,
                                 minimized=True):
                uridict[snapshoturi] += P.get(
                    rawFile, NS.po.fileSize, minimized=True).toPython()
        # NOTE(review): the ".gml" suffix filter is kept as found; confirm
        # it is intended for Twitter snapshot URIs.
        snapshoturis = sorted(
            (uri for uri in uridict if uri.endswith(".gml")),
            key=uridict.get)
    # Pre-bind so an empty snapshot list returns None instead of raising
    # UnboundLocalError.
    triplification_class = None
    for snapshoturi in snapshoturis:
        triplification_class = publishAny(snapshoturi)
    return triplification_class
def publishAll(snapshoturis=None):
    """Express Facebook snapshots as RDF for publishing.

    Args:
        snapshoturis: snapshot URIs to publish.  When empty or ``None`` the
            ``po.FacebookSnapshot`` URIs returned by ``P.get`` that end in
            ".gml" are published, ordered by the summed ``po.fileSize`` of
            their raw files (smallest first).

    Returns:
        Whatever ``publishAny`` returned for the last snapshot, or ``None``
        when there was nothing to publish.
    """
    if not snapshoturis:
        c("getting facebook snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.FacebookSnapshot,
                                 minimized=True):
            uridict[snapshoturi] = 0
            for rawFile in P.get(snapshoturi, NS.po.rawFile, strict=True,
                                 minimized=True):
                uridict[snapshoturi] += P.get(
                    rawFile, NS.po.fileSize, minimized=True).toPython()
        snapshoturis = sorted(
            (uri for uri in uridict if uri.endswith(".gml")),
            key=uridict.get)
    c("snapuris:", snapshoturis)
    # Pre-bind so an empty snapshot list returns None instead of raising
    # UnboundLocalError.  The unused ``count`` local was dropped.
    triplification_class = None
    for snapshoturi in snapshoturis:
        triplification_class = publishAny(snapshoturi)
    return triplification_class
def entityTriples(self, tweet, tweeturi):
    """Return the triples for the hashtags, mentions, URLs and media of a tweet.

    Reads ``tweet["entities"]`` and increments ``self.nhashtags``,
    ``self.nuser_mentions``, ``self.nlinks`` and ``self.nmedia`` once per
    entity of the respective kind.  ``self.nparticipants`` is incremented
    for each mentioned user for which ``P.get`` finds no ``po.numericID``.

    Args:
        tweet: mapping with an "entities" mapping holding "hashtags",
            "user_mentions", "urls" and optionally "media".
        tweeturi: URI of the tweet the entities belong to.
    """
    entities = tweet["entities"]
    triples = []

    for hashtag in entities["hashtags"]:
        self.nhashtags += 1
        triples.append((tweeturi, po.hashtag, hashtag["text"]))

    for user_mention in entities["user_mentions"]:
        self.nuser_mentions += 1
        userid_mention_ = user_mention["id_str"]
        name_mention = user_mention["name"]
        screen_name_mention = user_mention["screen_name"]
        userid_mention = self.snapshotid + "-" + userid_mention_
        useruri_mention = P.rdf.ic(po.Participant, userid_mention,
                                   self.tweet_graph, self.snapshoturi)
        triples += [
            (tweeturi, po.userMention, useruri_mention),
            (useruri_mention, po.name, name_mention),
            (useruri_mention, po.screenName, screen_name_mention),
            (useruri_mention, po.stringID, userid_mention),
        ]
        if not P.get(useruri_mention, po.numericID, None):  # new user
            self.nparticipants += 1
            triples.append((useruri_mention, po.numericID, userid_mention))

    # Only the expanded URL is recorded; the unused ``links`` / ``url``
    # locals were dropped.
    for link in entities["urls"]:
        self.nlinks += 1
        triples.append((tweeturi, po.expandedURL, link["expanded_url"]))

    for media in entities.get("media", ()):
        self.nmedia += 1
        # NOTE(review): this id is prefixed with snapshoturi whereas the
        # participant ids above use snapshotid - confirm this is intended.
        mediaid = self.snapshoturi + "-" + str(self.nmedia)
        mediauri = P.rdf.ic(po.Media, mediaid, self.tweet_graph,
                            self.snapshoturi)
        triples += [
            (tweeturi, po.media, mediauri),
            (mediauri, po.type, media["type"]),
            (mediauri, po.expandedURL, media["expanded_url"]),
        ]
    # symbols?
    return triples
def entityTriples(self, tweet, tweeturi):
    """Return the triples for the hashtags, mentions, URLs and media of a tweet.

    Reads ``tweet["entities"]`` and increments ``self.nhashtags``,
    ``self.nuser_mentions``, ``self.nlinks`` and ``self.nmedia`` once per
    entity of the respective kind.  ``self.nparticipants`` is incremented
    for each mentioned user for which ``P.get`` finds no ``po.numericID``.

    Args:
        tweet: mapping with an "entities" mapping holding "hashtags",
            "user_mentions", "urls" and optionally "media".
        tweeturi: URI of the tweet the entities belong to.
    """
    entities = tweet["entities"]
    triples = []

    for hashtag in entities["hashtags"]:
        self.nhashtags += 1
        triples.append((tweeturi, po.hashtag, hashtag["text"]))

    for user_mention in entities["user_mentions"]:
        self.nuser_mentions += 1
        userid_mention_ = user_mention["id_str"]
        name_mention = user_mention["name"]
        screen_name_mention = user_mention["screen_name"]
        userid_mention = self.snapshotid + "-" + userid_mention_
        useruri_mention = P.rdf.ic(po.Participant, userid_mention,
                                   self.tweet_graph, self.snapshoturi)
        triples += [
            (tweeturi, po.userMention, useruri_mention),
            (useruri_mention, po.name, name_mention),
            (useruri_mention, po.screenName, screen_name_mention),
            (useruri_mention, po.stringID, userid_mention),
        ]
        if not P.get(useruri_mention, po.numericID, None):  # new user
            self.nparticipants += 1
            triples.append((useruri_mention, po.numericID, userid_mention))

    # Only the expanded URL is recorded; the unused ``links`` / ``url``
    # locals were dropped.
    for link in entities["urls"]:
        self.nlinks += 1
        triples.append((tweeturi, po.expandedURL, link["expanded_url"]))

    for media in entities.get("media", ()):
        self.nmedia += 1
        # NOTE(review): this id is prefixed with snapshoturi whereas the
        # participant ids above use snapshotid - confirm this is intended.
        mediaid = self.snapshoturi + "-" + str(self.nmedia)
        mediauri = P.rdf.ic(po.Media, mediaid, self.tweet_graph,
                            self.snapshoturi)
        triples += [
            (tweeturi, po.media, mediauri),
            (mediauri, po.type, media["type"]),
            (mediauri, po.expandedURL, media["expanded_url"]),
        ]
    # symbols?
    return triples
def rdfMbox(self):
    """Translate the first message of every mbox file in ``self.files`` to triples.

    For each file the message id, author, subject, reply target, creation
    date, references, text statistics, URLs, content type, organization,
    cc/to peers and mailing list are expressed as triples and added to
    ``self.translation_graph``.  The instance counters (``nempty``,
    ``nmessages``, ``nparticipants``, ``nreplies``, ``nlost_messages``,
    ``nreferences``, ``nurls``, ``ncc``, ``nto``, ``nlines``,
    ``nremoved_lines``) and the per-message size lists are updated along the
    way.  A file without messages only increments ``self.nempty``.

    Raises:
        ValueError: if a message has no usable ``Message-Id``, no author
            address, a ``Date`` header that is neither ``str`` nor
            ``Header``, no content type, or a ``list-id`` in an unexpected
            format.
    """
    for filecount, file_ in enumerate(self.files):
        if filecount % 100 == 0:
            c(self.snapshoturi, filecount)
        mbox = mailbox.mbox(self.data_path + self.directory + "/" + file_)
        # try/finally closes the mailbox on every exit path, including the
        # ``continue`` for empty files and any ValueError raised below.
        try:
            if not mbox.keys():
                self.nempty += 1
                continue
            if not mbox[0]["Message-Id"]:
                raise ValueError(
                    "What to do with nonempy messages without id?")
            message = mbox[0]
            gmaneid = self.makeId(message["Message-Id"])
            if not gmaneid:
                raise ValueError("Message without id")
            messageuri = P.rdf.ic(po.EmailMessage, gmaneid,
                                  self.translation_graph, self.snapshoturi)
            self.nmessages += 1
            triples = [(messageuri, po.gmaneID, gmaneid)]

            # --- author ---------------------------------------------------
            email, name = self.parseParticipant(message["From"])
            if not email:
                raise ValueError("message without author")
            participanturi = P.rdf.ic(po.GmaneParticipant, email,
                                      self.translation_graph,
                                      self.snapshoturi)
            if not P.get(participanturi, po.emailAddress, None,
                         self.translation_graph):
                self.nparticipants += 1
            triples += [
                (messageuri, po.author, participanturi),
                (participanturi, po.emailAddress, email),
            ]
            if name:
                triples.append((participanturi, po.name, name))

            # --- subject --------------------------------------------------
            subject = message["Subject"]
            if subject:
                subject = decodeHeader(subject)
                assert isinstance(subject, str)
                triples.append((messageuri, po.subject, subject))

            # --- reply target ---------------------------------------------
            replyid_ = message["In-Reply-To"]
            saneid = self.makeId(replyid_)
            if bool(replyid_) and not bool(saneid):
                # A reply header exists but no id could be derived from it.
                self.nreplies += 1
                replyid = self.snapshotid + "-" + str(self.nlost_messages)
                self.nlost_messages += 1
                replymessageuri = P.rdf.ic(po.LostEmailMessage, replyid,
                                           self.translation_graph,
                                           self.snapshoturi)
                triples += [
                    (replymessageuri, a, po.EmailMessage),
                    (replymessageuri, NS.rdfs.comment,
                     "This message registered as having a reply, but the field might be ill-formed: " + replyid_),
                    (messageuri, po.replyTo, replymessageuri),
                ]
            elif saneid:
                self.nreplies += 1
                replymessageuri = P.rdf.ic(po.EmailMessage, saneid,
                                           self.translation_graph,
                                           self.snapshoturi)
                triples += [
                    (replymessageuri, po.gmaneID, saneid),
                    (messageuri, po.replyTo, replymessageuri),
                ]

            # --- creation date --------------------------------------------
            # ``created_at`` replaces a local formerly named ``datetime``,
            # which shadowed the module name inside this function.
            date_header = message["Date"]
            if isinstance(date_header, str):
                created_at = parseDate(date_header)
            elif isinstance(date_header, mailbox.email.header.Header):
                datetimestring = decodeHeader(date_header)
                if not all(ch in string.printable for ch in datetimestring):
                    created_at = None
                    triples.append((messageuri, po.lostCreatedAt, True))
                else:
                    datetime_ = re.findall(r"(.*\d\d:\d\d:\d\d).*",
                                           datetimestring)[0]
                    created_at = parseDate(datetime_)
            else:
                raise ValueError("datetime not understood")
            if created_at:
                self.dates += [created_at]
                triples.append((messageuri, po.createdAt, created_at))

            # --- references -----------------------------------------------
            raw_references = message["References"]
            if raw_references:
                references = (raw_references.replace("\n", "")
                              .replace("\t", "").replace(" ", ""))
                if not re.findall(r"\A<(.*?)>\Z", references):
                    c("::: ::: ::: references field not understood",
                      raw_references)
                    triples += [
                        (messageuri, po.comment,
                         "the references are not understood (<.*> ids are added anyway): " + raw_references),
                        (messageuri, po.referencesLost, True),
                    ]
                for reference in re.findall(r"<(.*?)>", references):
                    self.nreferences += 1
                    referenceuri = P.rdf.ic(po.EmailMessage, reference,
                                            self.translation_graph,
                                            self.snapshoturi)
                    triples += [
                        (referenceuri, po.gmaneID, reference),
                        (messageuri, po.hasReference, referenceuri),
                    ]
                for part in (raw_references.replace("\n", "")
                             .replace("\t", "").split()):
                    if validate_email(part):
                        self.nreferences += 1
                        referenceuri = P.rdf.ic(po.EmailMessage, part,
                                                self.translation_graph,
                                                self.snapshoturi)
                        # The id stored here must be ``part``; the stale
                        # ``reference`` of the previous loop was used before
                        # (or was unbound when that loop never ran).
                        triples += [
                            (referenceuri, po.gmaneID, part),
                            (messageuri, po.hasReference, referenceuri),
                        ]

            # --- text and its statistics ----------------------------------
            text = getText(message)
            if text:
                nchars = len(text)
                ntokens = len(k.wordpunct_tokenize(text))
                nsentences = len(k.sent_tokenize(text))
                triples += [
                    (messageuri, po.messageText, text),
                    (messageuri, po.nChars, nchars),
                    (messageuri, po.nTokens, ntokens),
                    (messageuri, po.nSentences, nsentences),
                ]
                self.nchars_all += [nchars]
                self.ntokens_all += [ntokens]
                self.nsentences_all += [nsentences]

                clean_text = cleanEmailBody(text)
                self.nremoved_lines += (text.count("\n")
                                        - clean_text.count("\n"))
                self.nlines += text.count("\n")
                nchars_clean = len(clean_text)
                ntokens_clean = len(k.wordpunct_tokenize(clean_text))
                nsentences_clean = len(k.sent_tokenize(clean_text))
                triples += [
                    (messageuri, po.messageTextClean, clean_text),
                    (messageuri, po.nCharsClean, nchars_clean),
                    (messageuri, po.nTokensClean, ntokens_clean),
                    (messageuri, po.nSentencesClean, nsentences_clean),
                ]
                self.nchars_clean_all += [nchars_clean]
                self.ntokens_clean_all += [ntokens_clean]
                self.nsentences_clean_all += [nsentences_clean]

                for url in re.findall(
                        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                        clean_text):
                    self.nurls += 1
                    triples.append((messageuri, po.hasUrl, url))

            # --- content type and organization ----------------------------
            content_type = message.get_content_type()
            if content_type:
                triples.append((messageuri, po.contentType, content_type))
            else:
                # Raw string: same text as before, without relying on
                # unrecognised backslash escapes.
                raise ValueError(
                    r"/\/\/\/\/\ message without content type")
            organization = message["Organization"]
            if organization:
                if not isinstance(organization, str):
                    organization = "".join(
                        ch for ch in str(organization)
                        if ch in string.printable)
                triples.append((messageuri, po.organization, organization))

            # --- cc / to peers (identical handling, different predicates) --
            for header, peer_pred, unparsed_pred, counter in (
                    ("cc", po.cc, po.unparsedCC, "ncc"),
                    ("to", po.to, po.unparsedTo, "nto")):
                if not message[header]:
                    continue
                peers, unparsed = parseAddresses(message[header])
                if unparsed:
                    triples.append((messageuri, unparsed_pred, unparsed))
                for peeraddress, peername in peers:
                    peeraddress = peeraddress.strip()
                    assert bool(peeraddress)
                    peeruri = P.rdf.ic(po.EmailPeer, peeraddress,
                                       self.translation_graph,
                                       self.snapshoturi)
                    triples += [
                        (messageuri, peer_pred, peeruri),
                        (peeruri, po.emailAddress, peeraddress),
                    ]
                    setattr(self, counter, getattr(self, counter) + 1)
                    if peername:
                        triples.append((peeruri, po.name, peername.strip()))

            # --- mailing list ---------------------------------------------
            listid = message["list-id"]
            if listid:
                assert isinstance(listid, str)
                listid = listid.replace("\n", "").replace("\t", "")
                if (listid.count("<") == listid.count(">")
                        == listid.count(" ") == 0):
                    listname = ""
                    listid_ = listid
                elif listid.count("<") == listid.count(">") == 0:
                    # No brackets: the longest word is taken as the id and
                    # the remaining words as the name.
                    parts = listid.split()
                    longest = max(len(i) for i in parts)
                    listid_ = [i for i in parts if len(i) == longest][0]
                    listname = " ".join(
                        i for i in parts if len(i) != longest)
                elif listid.count("<") == listid.count(">") == 1:
                    listname, listid_ = re.findall(
                        r"(.*) {0,1}<(.*)>", listid)[0]
                else:
                    raise ValueError("Unexpected listid string format")
                listuri = P.rdf.ic(po.EmailList, listid_,
                                   self.translation_graph, self.snapshoturi)
                triples += [
                    (messageuri, po.emailList, listuri),
                    (listuri, po.listID, listid_),
                ]
                if listname:
                    triples.append((listuri, po.name, listname.strip()))

            P.add(triples, self.translation_graph)
        finally:
            mbox.close()
def writeAllFB(self):
    """Write the snapshot's publication directory.

    Creates ``<final_path><snapshotid>/`` (stored in ``self.final_path_``)
    if it does not exist, serializes the friendship and meta graphs there
    as Turtle and RDF/XML, copies the triplification script into
    ``scripts/`` and the original friendship file into ``base/``, and
    writes a ``README`` describing the publication.

    NOTE(review): the two triple-quoted templates below are reproduced
    exactly as found.  They contain the sequences ``\\i`` and ``\\ ``, which
    are not recognised escapes; presumably line breaks were lost there -
    confirm against the upstream file before relying on the README layout.
    """
    c("started rendering of the snapshot publication. snapshotID:", self.snapshotid)
    self.final_path_ = "{}{}/".format(self.final_path, self.snapshotid)
    if not os.path.isdir(self.final_path_):
        os.mkdir(self.final_path_)
    # Friendship graph: bind the "po" prefix, then write both formats.
    g = P.context(self.friendship_graph)
    g.namespace_manager.bind("po", po)
    g.serialize(self.final_path_ + self.snapshotid + "Friendship.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_ + self.snapshotid + "Friendship.rdf", "xml")
    c("serialized friendships")
    # Disabled: record file sizes and triple counts in the meta graph.
    # get filesize and ntriples
    # filesizerdf = os.path.getsize(self.final_path_+self.snapshotid +
    #                               "Friendship.rdf")/(10**6)
    # filesizettl = os.path.getsize(self.final_path_+self.snapshotid +
    #                               "Friendship.ttl")/(10**6)
    # ntriples = len(g)
    # triples = [
    #     (self.snapshoturi, po.friendshipXMLFileSizeMB, filesizerdf),
    #     (self.snapshoturi, po.friendshipTTLFileSizeMB, filesizettl),
    #     (self.snapshoturi, po.nFriendshipTriples, ntriples),
    # ]
    # Meta graph: written in both formats as well.
    g = P.context(self.meta_graph)
    # ntriples = len(g)
    # triples.append(
    #     (self.snapshoturi, po.nMetaTriples, ntriples+1),
    # )
    # P.add(triples, context=self.meta_graph)
    g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
    c("serialized meta")
    # Ship the script that produced the data alongside it.
    if not os.path.isdir(self.final_path_ + "scripts"):
        os.mkdir(self.final_path_ + "scripts")
    shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py", self.final_path_ + "scripts/triplify.py")
    # copy of the base data
    if not os.path.isdir(self.final_path_ + "base"):
        os.mkdir(self.final_path_ + "base")
    shutil.copy(self.data_path + self.filename_friendships, self.final_path_ + "base/")
    originals = "base/{}".format(self.filename_friendships)
    # Paragraph of the README that describes the friendship network.
    tfriendship = """\n\n{nf} individuals with metadata {fvars} and {nfs} friendships constitute the friendship network in the RDF/XML file: {frdf} \in the Turtle file: \n{fttl} (anonymized {fan}).""".format(
        nf=self.nfriends,
        fvars=str(self.friendsvars),
        nfs=self.nfriendships,
        frdf=self.frdf,
        fttl=self.fttl,
        fan=self.friendships_anonymized,
    )
    # Index 2 of the value returned by P.get is used as the date text.
    datetime_string = P.get(self.snapshoturi, po.dateObtained, None, context=self.social_graph)[2]
    with open(self.final_path_ + "README", "w") as f:
        f.write("""::: Open Linked Social Data publication \nThis repository is a RDF data expression of the facebook snapshot {snapid} collected around {date}.{tfriendship} \nMetadata for discovery in the RDF/XML file: {mrdf} \nor in the Turtle file:\n{mttl} \nOriginal file(s): {origs} \nEgo network: {ise} Group network: {isg} Friendship network: {isf} Interaction network: {isi} Has text/posts: {ist} \nAll files should be available at the git repository: {ava} \n{desc} The script that rendered this data publication is on the script/ \ directory.\n:::""".format(snapid=self.snapshotid, date=datetime_string, tfriendship=tfriendship, mrdf=self.mrdf, mttl=self.mttl, origs=originals, ise=self.isego, isg=self.isgroup, isf=self.isfriendship, isi=self.isinteraction, ist=self.hastext, ava=self.online_prefix, desc=self.desc))
def makeMetadata(self):
    """Describe the facebook snapshot in ``self.meta_graph``.

    Builds two parallel lists (predicates and values) and hands them to
    ``P.rdf.triplesScaffolding`` with ``self.snapshoturi`` as subject.

    Attributes set as a side effect:

    * ``ffile``/``frdf``/``fttl`` when ``self.isfriendship``;
    * ``ifile``/``irdf``/``ittl`` when ``self.isinteraction``;
    * ``pfile``/``prdf``/``pttl`` when ``self.hastext``;
    * ``isego``, ``isgroup``, ``mrdf``, ``mttl`` and ``desc`` always.

    Raises:
        ValueError: for a friendship snapshot whose ``groupid`` and
            ``groupid2`` are both set and differ.
    """
    if self.isfriendship and self.groupid and self.groupid2 and \
            (self.groupid != self.groupid2):
        raise ValueError("Group IDS are different")

    # predicates[i] is paired with values[i]; always extend both together.
    predicates = []
    values = []

    if self.isfriendship:
        self.ffile = "base/" + self.filename_friendships
        self.frdf = self.snapshotid + "Friendship.rdf"
        self.fttl = self.snapshotid + "Friendship.ttl"
        predicates.append(po.friendshipsAnonymized)
        values.append(self.friendships_anonymized)

    if self.isinteraction:
        self.ifile = "base/" + self.filename_interactions
        self.irdf = self.snapshotid + "Interaction.rdf"
        self.ittl = self.snapshotid + "Interaction.ttl"
        predicates.append(po.interactionsAnonymized)
        values.append(self.interactions_anonymized)

    if self.hastext:
        # Only the file names are recorded; no post statistics are
        # published by this variant.
        self.pfile = "base/" + self.filename_posts
        self.prdf = self.snapshotid + "Post.rdf"
        self.pttl = self.snapshotid + "Post.ttl"

    snapshot = r.URIRef(self.snapshoturi)
    self.isego = P.get(snapshot, po.isEgo)[2].toPython()
    self.isgroup = P.get(snapshot, po.isGroup)[2].toPython()
    date_obtained = P.get(snapshot, po.dateObtained)[2].toPython()
    assert isinstance(date_obtained, datetime.date)
    name = P.get(snapshot, po.name, None, context=self.social_graph)[2]

    predicates.extend([
        a,
        po.snapshotID,
        po.isGroup,
        po.isEgo,
        po.isFriendship,
        po.isInteraction,
        po.isPost,
        po.dateObtained,
        po.name,
    ])
    values.extend([
        po.Snapshot,
        self.snapshotid,
        self.isgroup,
        self.isego,
        self.isfriendship,
        self.isinteraction,
        self.hastext,
        date_obtained,
        name,
    ])

    # Optional identifiers: P.get returns something falsy when the
    # snapshot has no such triple, in which case nothing is added.
    for predicate in (po.numericID, po.stringID, po.url):
        found = P.get(snapshot, predicate, None, context=self.social_graph)
        if found:
            predicates.append(predicate)
            values.append(found[2])

    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"

    # The original literal carried a stray "\ " (collapsed line
    # continuation) after "snapshotURI:", which is an invalid escape and
    # put a backslash into the description.
    self.desc = (
        "facebook network with snapshotID: {}\nsnapshotURI: {} "
        "\nisEgo: {}. isGroup: {}."
    ).format(self.snapshotid, self.snapshoturi, self.isego, self.isgroup)
    self.desc += "\nisFriendship: {}".format(self.isfriendship)
    self.desc += "\nisInteraction: {}".format(self.isinteraction)
    self.desc += "\nisPost: {} (has text)".format(self.hastext)

    P.rdf.triplesScaffolding(
        self.snapshoturi,
        [
            po.triplifiedIn,
            po.acquiredThrough,
            po.socialProtocol,
            po.comment,
        ] + predicates,
        [
            datetime.datetime.now(),
            "Netvizz",
            "Facebook",
            self.desc,
        ] + values,
        self.meta_graph)
def makeMetadata(self):
    """Describe the facebook snapshot in ``self.meta_graph``.

    Steps:

    1. Copy every triple about the snapshot and about each of its raw
       files from the ``"social_facebook"`` context into the meta graph.
    2. Collect (predicate, value) pairs for the friendship, interaction
       and posts data that are present, plus the snapshot type flags.
    3. Build the human-readable ``self.desc``.
    4. Emit all pairs with ``P.rdf.triplesScaffolding``.

    Attributes set as a side effect: ``ffile``/``frdf``/``fttl``,
    ``ifile``/``irdf``/``ittl``, ``pfile``/``prdf``/``pttl`` (each group
    only when the corresponding data exists), ``isego``, ``isgroup``,
    ``mrdf``, ``mttl`` and ``desc``.

    Raises:
        ValueError: for a friendship snapshot whose ``groupid`` and
            ``groupid2`` are both set and differ.
    """
    if self.isfriendship and self.groupid and self.groupid2 and \
            (self.groupid != self.groupid2):
        raise ValueError("Group IDS are different")

    # Carry over what is already known about the snapshot and its files.
    triples = P.get(self.snapshoturi, None, None, "social_facebook")
    for rawfile in P.get(self.snapshoturi, po.rawFile, None,
                         "social_facebook", strict=True, minimized=True):
        triples += P.get(rawfile, None, None, "social_facebook")
    P.add(triples, context=self.meta_graph)

    # Predicates and values are kept as pairs so they cannot drift out of
    # alignment (the original parallel lists had the interaction file
    # names and the interaction counts swapped).
    pairs = []

    if self.isfriendship:
        self.ffile = "base/" + self.filename_friendships
        self.frdf = self.snapshotid + "Friendship.rdf"
        self.fttl = self.snapshotid + "Friendship.ttl"
        pairs += [
            (po.onlineOriginalFriendshipFile, self.online_prefix + self.ffile),
            (po.originalFriendshipFileName, self.ffile),
            (po.onlineFriendshipXMLFile, self.online_prefix + self.frdf),
            (po.onlineFriendshipTTLFile, self.online_prefix + self.fttl),
            (po.friendshipXMLFileName, self.frdf),
            (po.friendshipTTLFileName, self.fttl),
            (po.nFriends, self.nfriends),
            (po.nFriendships, self.nfriendships),
            (po.friendshipsAnonymized, self.friendships_anonymized),
        ]
        # NOTE(review): "frienship" spelling is the predicate name used by
        # the original code; confirm against the ontology before renaming.
        pairs += [(po.frienshipParticipantAttribute, var)
                  for var in self.friendsvars]

    if self.isinteraction:
        self.ifile = "base/" + self.filename_interactions
        self.irdf = self.snapshotid + "Interaction.rdf"
        self.ittl = self.snapshotid + "Interaction.ttl"
        pairs += [
            (po.onlineOriginalInteractionFile,
             self.online_prefix + self.ifile),
            (po.originalInteractionFileName, self.ifile),
            (po.onlineInteractionXMLFile, self.online_prefix + self.irdf),
            (po.onlineInteractionTTLFile, self.online_prefix + self.ittl),
            (po.interactionXMLFileName, self.irdf),
            (po.interactionTTLFileName, self.ittl),
            (po.nInteracted, self.ninteracted),
            (po.nInteractions, self.ninteractions),
            (po.interactionsAnonymized, self.interactions_anonymized),
        ]
        pairs += [(po.interactionParticipantAttribute, var)
                  for var in self.interactionsvars]

    if self.hastext:
        self.pfile = "base/" + self.filename_posts
        self.prdf = self.snapshotid + "Post.rdf"
        self.pttl = self.snapshotid + "Post.ttl"
        pairs += [
            (po.onlineOriginalPostsFile, self.online_prefix + self.pfile),
            (po.originalPostsFileName, self.pfile),
            (po.onlinePostsXMLFile, self.online_prefix + self.prdf),
            (po.onlinePostsTTLFile, self.online_prefix + self.pttl),
            (po.postsXMLFileName, self.prdf),
            (po.postsTTLFileName, self.pttl),
            (po.nPosts, self.nposts),
            (po.nCharsOverall, int(self.totalchars)),
            (po.mCharsOverall, self.mcharsposts),
            (po.dCharsOverall, self.dcharsposts),
            (po.nTokensOverall, int(self.totaltokens)),
            (po.mTokensOverall, self.mtokensposts),
            (po.dTokensOverall, self.dtokensposts),
        ]
        pairs += [(po.postAttribute, var) for var in self.postsvars]

    snapshot = r.URIRef(self.snapshoturi)
    self.isego = bool(P.get(snapshot, a, po.EgoSnapshot))
    self.isgroup = bool(P.get(snapshot, a, po.GroupSnapshot))
    pairs += [
        (po.isGroup, self.isgroup),
        (po.isEgo, self.isego),
        (po.isFriendship, self.isfriendship),
        (po.isInteraction, self.isinteraction),
        # hasText and isPost are aliases and carry the same value.
        (po.hasText, self.hastext),
        (po.isPost, self.hastext),
    ]

    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"

    self.desc = (
        "facebook network with snapshotID: {}\nsnapshotURI: {} "
        "\nisEgo: {}. isGroup: {}."
    ).format(self.snapshotid, self.snapshoturi, self.isego, self.isgroup)
    self.desc += "\nisFriendship: {}".format(self.isfriendship)
    if self.isfriendship:
        self.desc += "; nFriends: {}; nFrienships: {}.".format(
            self.nfriends, self.nfriendships)
    self.desc += "\nisInteraction: {}".format(self.isinteraction)
    if self.isinteraction:
        self.desc += "; nInteracted: {}; nInteractions: {}.".format(
            self.ninteracted, self.ninteractions)
    self.desc += "\nisPost: {} (alias hasText: {})".format(
        self.hastext, self.hastext)
    if self.hastext:
        # The original template had six fields for seven arguments, so
        # nposts was printed as the mean and every later value was shifted
        # by one label; an explicit nPosts field restores the alignment.
        self.desc += (
            ";\nnPosts: {}; mCharsPostsOverall: {}; "
            "dCharsPostsOverall: {}; totalCharsOverall: {}; "
            "\nmTokensPostsOverall: {}; dTokensPostsOverall: {}; "
            "totalTokensOverall: {}"
        ).format(
            self.nposts,
            self.mcharsposts, self.dcharsposts, self.totalchars,
            self.mtokensposts, self.dtokensposts, self.totaltokens,
        )

    pairs = [
        (po.triplifiedIn, datetime.datetime.now()),
        (po.triplifiedBy, "scripts/"),
        (po.donatedBy, self.snapshotid[:-4]),
        (po.availableAt, self.online_prefix),
        (po.onlineMetaXMLFile, self.online_prefix + self.mrdf),
        (po.onlineMetaTTLFile, self.online_prefix + self.mttl),
        (po.metaXMLFileName, self.mrdf),
        (po.metaTTLFileName, self.mttl),
        (po.acquiredThrough, "Netvizz"),
        (po.socialProtocolTag, "Facebook"),
        (po.socialProtocol,
         P.rdf.ic(po.Platform, "Facebook", self.meta_graph,
                  self.snapshoturi)),
        (NS.rdfs.comment, self.desc),
    ] + pairs

    P.rdf.triplesScaffolding(
        self.snapshoturi,
        [predicate for predicate, _ in pairs],
        [value for _, value in pairs],
        self.meta_graph)
def makeMetadata(self):
    """Describe the IRC log snapshot in ``self.meta_graph``.

    Copies the triples about the snapshot and its raw files from
    ``self.social_graph``, computes character/token/sentence statistics
    over the messages, records participant attributes and log file names,
    and finally adds provenance triples together with the textual
    description stored in ``self.desc``.

    Sets ``totalchars``, ``mcharsmessages``, ``dcharsmessages``,
    ``totaltokens``, ``mtokensmessages``, ``dtokensmessages``,
    ``totalsentences``, ``msentencesmessages``, ``dsentencesmessages``,
    ``nparticipants``, ``nmessages``, ``ntriples``, ``mrdf``, ``mttl``
    and ``desc`` on the instance.
    """
    snapshot = self.snapshoturi

    # What the social graph already states about the snapshot and each
    # of its raw files.
    known = P.get(snapshot, None, None, self.social_graph)
    raw_files = P.get(snapshot, po.rawFile, None, self.social_graph,
                      strict=True, minimized=True)
    for raw_file in raw_files:
        known += P.get(raw_file, None, None, self.social_graph)
    P.add(known, context=self.meta_graph)

    # Message size statistics: total, mean and standard deviation.
    self.totalchars = sum(self.nchars_all)
    self.mcharsmessages = n.mean(self.nchars_all)
    self.dcharsmessages = n.std(self.nchars_all)
    self.totaltokens = sum(self.ntokens_all)
    self.mtokensmessages = n.mean(self.ntokens_all)
    self.dtokensmessages = n.std(self.ntokens_all)
    self.totalsentences = sum(self.nsentences_all)
    self.msentencesmessages = n.mean(self.nsentences_all)
    self.dsentencesmessages = n.std(self.nsentences_all)
    self.nparticipants = len(self.NICKS)
    self.nmessages = len(self.messageids)
    self.ntriples = len(P.context(self.irc_graph))

    counts = (
        (po.nParticipants, self.nparticipants),
        (po.nMessages, self.nmessages),
        (po.nDirectMessages, self.ndirect),
        (po.nUserMentions, self.nmention),
        (po.nCharsOverall, self.totalchars),
        (po.mCharsOverall, self.mcharsmessages),
        (po.dCharsOverall, self.dcharsmessages),
        (po.nTokensOverall, self.totaltokens),
        (po.mTokensOverall, self.mtokensmessages),
        (po.dTokensOverall, self.dtokensmessages),
        (po.nSentencesOverall, self.totalsentences),
        (po.mSentencesOverall, self.msentencesmessages),
        (po.dSentencesOverall, self.dsentencesmessages),
    )
    P.add([(snapshot, predicate, value) for predicate, value in counts],
          context=self.meta_graph)

    # One attribute triple per participant variable.
    P.rdf.triplesScaffolding(
        snapshot,
        [po.ircParticipantAttribute] * len(self.participantvars),
        self.participantvars,
        context=self.meta_graph)

    # Local names, then online locations, of the serialized log files.
    log_files = self.log_xml + self.log_ttl
    P.rdf.triplesScaffolding(
        snapshot,
        [po.logXMLFilename] * len(self.log_xml)
        + [po.logTTLFilename] * len(self.log_ttl),
        log_files,
        context=self.meta_graph)
    P.rdf.triplesScaffolding(
        snapshot,
        [po.onlineLogXMLFile] * len(self.log_xml)
        + [po.onlineLogTTLFile] * len(self.log_ttl),
        [self.online_prefix + filename for filename in log_files],
        context=self.meta_graph)

    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"

    # Human-readable summary, assembled from its fragments in order.
    fragments = [
        "irc dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
            self.snapshotid, self.snapshoturi, self.isego, self.isgroup),
        "\nisFriendship: {}; ".format(self.isfriendship),
        "isInteraction: {}.".format(self.isinteraction),
        "\nnParticipants: {}; nInteractions: {} (directed messages+user mentions).".format(
            self.nparticipants, self.ndirect + self.nmention),
        "\nisPost: {} (alias hasText: {})".format(self.hastext, self.hastext),
        "\nnMessages: {}; ".format(self.nmessages),
        "nDirectedMessages: {}; nUserMentions: {};".format(
            self.ndirect, self.nmention),
        "\nnCharsOverall: {}; mCharsOverall: {}; dCharsOverall: {}.".format(
            self.totalchars, self.mcharsmessages, self.dcharsmessages),
        "\nnTokensOverall: {}; mTokensOverall: {}; dTokensOverall: {};".format(
            self.totaltokens, self.mtokensmessages, self.dtokensmessages),
        "\nnSentencesOverall: {}; mSentencesOverall: {}; dSentencesOverall: {};".format(
            self.totalsentences, self.msentencesmessages,
            self.dsentencesmessages),
        "\nnURLs: {}; nAAMessages {}.".format(self.nurls, self.naamessages),
    ]
    self.desc = "".join(fragments)

    provenance = (
        (po.triplifiedIn, datetime.datetime.now()),
        (po.triplifiedBy, "scripts/"),
        (po.donatedBy, self.snapshotid[:-4]),
        (po.availableAt, self.online_prefix),
        (po.onlineMetaXMLFile, self.online_prefix + self.mrdf),
        (po.onlineMetaTTLFile, self.online_prefix + self.mttl),
        (po.metaXMLFileName, self.mrdf),
        (po.metaTTLFileName, self.mttl),
        (po.totalXMLFileSizeMB, sum(self.size_xml)),
        (po.totalTTLFileSizeMB, sum(self.size_ttl)),
        (po.acquiredThrough, "channel text log"),
        (po.socialProtocolTag, "IRC"),
        (po.socialProtocol,
         P.rdf.ic(po.Platform, "IRC", self.meta_graph, self.snapshoturi)),
        (po.nTriples, self.ntriples),
        (NS.rdfs.comment, self.desc),
    )
    P.add([(snapshot, predicate, value) for predicate, value in provenance],
          self.meta_graph)
def makeMetadata(self):
    """Describe the Twitter snapshot in ``self.meta_graph``.

    Copies the triples about the snapshot and its raw files from
    ``self.social_graph``, computes character and token statistics over
    the tweets, records participant attributes and the names of the
    serialized tweet files, and finally adds provenance triples together
    with the textual description stored in ``self.desc``.

    Sets ``totalchars``, ``mcharstweets``, ``dcharstweets``,
    ``totaltokens``, ``mtokenstweets``, ``dtokenstweets``, ``mrdf``,
    ``mttl`` and ``desc`` on the instance.
    """
    snapshot = self.snapshoturi

    # What the social graph already states about the snapshot and each
    # of its raw files.
    known = P.get(snapshot, None, None, self.social_graph)
    raw_files = P.get(snapshot, po.rawFile, None, self.social_graph,
                      strict=True, minimized=True)
    for raw_file in raw_files:
        known += P.get(raw_file, None, None, self.social_graph)

    # Tweet size statistics: total, mean and standard deviation.
    self.totalchars = sum(self.nchars_all)
    self.mcharstweets = n.mean(self.nchars_all)
    self.dcharstweets = n.std(self.nchars_all)
    self.totaltokens = sum(self.ntokens_all)
    self.mtokenstweets = n.mean(self.ntokens_all)
    self.dtokenstweets = n.std(self.ntokens_all)

    P.add(known, context=self.meta_graph)

    counts = (
        (po.nParticipants, self.nparticipants),
        (po.nTweets, self.ntweets),
        (po.nReplies, self.nreplies),
        (po.nRetweets, self.nretweets),
        (po.nCharsOverall, self.totalchars),
        (po.mCharsOverall, self.mcharstweets),
        (po.dCharsOverall, self.dcharstweets),
        (po.nTokensOverall, self.totaltokens),
        (po.mTokensOverall, self.mtokenstweets),
        (po.dTokensOverall, self.dtokenstweets),
    )
    P.add([(snapshot, predicate, value) for predicate, value in counts],
          context=self.meta_graph)

    # One attribute triple per participant variable.
    P.rdf.triplesScaffolding(
        snapshot,
        [po.tweetParticipantAttribute] * len(self.participantvars),
        self.participantvars,
        context=self.meta_graph)

    # Local names, then online locations, of the serialized tweet files.
    tweet_files = self.tweet_rdf + self.tweet_ttl
    P.rdf.triplesScaffolding(
        snapshot,
        [po.tweetXMLFilename] * len(self.tweet_rdf)
        + [po.tweetTTLFilename] * len(self.tweet_ttl),
        tweet_files,
        context=self.meta_graph)
    P.rdf.triplesScaffolding(
        snapshot,
        [po.onlineTweetXMLFile] * len(self.tweet_rdf)
        + [po.onlineTweetTTLFile] * len(self.tweet_ttl),
        [self.online_prefix + filename for filename in tweet_files],
        context=self.meta_graph)

    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"

    # Human-readable summary, assembled from its fragments in order.
    fragments = [
        "twitter dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
            self.snapshotid, self.snapshoturi, self.isego, self.isgroup),
        "\nisFriendship: {}; ".format(self.isfriendship),
        "isInteraction: {}.".format(self.isinteraction),
        "\nnParticipants: {}; nInteractions: {} (replies+retweets+user mentions).".format(
            self.nparticipants,
            self.nreplies + self.nretweets + self.nuser_mentions),
        "\nisPost: {} (alias hasText: {})".format(self.hastext, self.hastext),
        "\nnTweets: {}; ".format(self.ntweets),
        "nReplies: {}; nRetweets: {}; nUserMentions: {}.".format(
            self.nreplies, self.nretweets, self.nuser_mentions),
        "\nnTokens: {}; mTokens: {}; dTokens: {};".format(
            self.totaltokens, self.mtokenstweets, self.dtokenstweets),
        "\nnChars: {}; mChars: {}; dChars: {}.".format(
            self.totalchars, self.mcharstweets, self.dcharstweets),
        "\nnHashtags: {}; nMedia: {}; nLinks: {}.".format(
            self.nhashtags, self.nmedia, self.nlinks),
    ]
    self.desc = "".join(fragments)

    provenance = (
        (po.triplifiedIn, datetime.datetime.now()),
        (po.triplifiedBy, "scripts/"),
        (po.donatedBy, self.snapshotid[:-4]),
        (po.availableAt, self.online_prefix),
        (po.onlineMetaXMLFile, self.online_prefix + self.mrdf),
        (po.onlineMetaTTLFile, self.online_prefix + self.mttl),
        (po.metaXMLFileName, self.mrdf),
        (po.metaTTLFileName, self.mttl),
        (po.totalXMLFileSizeMB, sum(self.size_rdf)),
        (po.totalTTLFileSizeMB, sum(self.size_ttl)),
        (po.acquiredThrough, "Twitter APIs"),
        (po.socialProtocolTag, "Twitter"),
        (po.socialProtocol,
         P.rdf.ic(po.Platform, "Twitter", self.meta_graph,
                  self.snapshoturi)),
        (po.nTriples, self.ntriples),
        (NS.rdfs.comment, self.desc),
    )
    P.add([(snapshot, predicate, value) for predicate, value in provenance],
          self.meta_graph)
def writeAllFB(self):
    """Write every graph of the facebook snapshot, plus a README, to disk.

    Creates ``<final_path><snapshotid>/`` (kept in ``self.final_path_``)
    and, depending on ``self.isfriendship``, ``self.isinteraction`` and
    ``self.hastext``, serializes the friendship, interaction and posts
    graphs to Turtle and RDF/XML.  File sizes (in MB) and triple counts
    of what was written are added to ``self.meta_graph``, which is then
    serialized as well.  Finally the triplification script and the
    original data files are copied next to the output and a ``README``
    is rendered.

    Output file names come from ``self.frdf``/``fttl``, ``irdf``/``ittl``,
    ``prdf``/``pttl`` and ``mrdf``/``mttl`` so that the files on disk
    match the names published in the metadata and the README (the posts
    files used to be written as ``Posts.*`` while being advertised under
    ``self.prdf``/``self.pttl``).
    NOTE(review): these attributes are presumably set by ``makeMetadata``
    -- confirm it always runs before this method.
    """
    c("started rendering of the snapshot publication. snapshotID:",
      self.snapshotid)
    self.final_path_ = "{}{}/".format(self.final_path, self.snapshotid)
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # isdir() + mkdir() and also creates missing parent directories.
    os.makedirs(self.final_path_, exist_ok=True)

    def dump(graph_id, ttl_name, rdf_name, done_message):
        """Serialize one graph; return (ntriples, rdf size MB, ttl size MB)."""
        graph = P.context(graph_id)
        graph.namespace_manager.bind("po", po)
        ttl_path = self.final_path_ + ttl_name
        rdf_path = self.final_path_ + rdf_name
        graph.serialize(ttl_path, "turtle")
        c("ttl")
        graph.serialize(rdf_path, "xml")
        c(done_message)
        return (len(graph),
                os.path.getsize(rdf_path) / (10**6),
                os.path.getsize(ttl_path) / (10**6))

    triples = []
    if self.isfriendship:
        ntriples, size_rdf, size_ttl = dump(
            self.friendship_graph, self.fttl, self.frdf,
            "serialized friendships")
        triples += [
            (self.snapshoturi, po.friendshipXMLFileSizeMB, size_rdf),
            (self.snapshoturi, po.friendshipTTLFileSizeMB, size_ttl),
            (self.snapshoturi, po.nFriendshipTriples, ntriples),
        ]
    if self.isinteraction:
        ntriples, size_rdf, size_ttl = dump(
            self.interaction_graph, self.ittl, self.irdf,
            "serialized interaction")
        triples += [
            (self.snapshoturi, po.interactionXMLFileSizeMB, size_rdf),
            (self.snapshoturi, po.interactionTTLFileSizeMB, size_ttl),
            (self.snapshoturi, po.nInteractionTriples, ntriples),
        ]
    if self.hastext:
        ntriples, size_rdf, size_ttl = dump(
            self.posts_graph, self.pttl, self.prdf, "serialized posts")
        triples += [
            (self.snapshoturi, po.postsXMLFileSizeMB, size_rdf),
            (self.snapshoturi, po.postsTTLFileSizeMB, size_ttl),
            (self.snapshoturi, po.nPostsTriples, ntriples),
        ]

    meta = P.context(self.meta_graph)
    # Add the size/count triples first so the count below includes them;
    # the +1 accounts for the nMetaTriples triple itself.  The original
    # counted before adding anything, which published a stale number.
    if triples:
        P.add(triples, context=self.meta_graph)
    P.add([(self.snapshoturi, po.nMetaTriples, len(meta) + 1)],
          context=self.meta_graph)
    meta.namespace_manager.bind("po", po)
    meta.serialize(self.final_path_ + self.mttl, "turtle")
    c("ttl")
    meta.serialize(self.final_path_ + self.mrdf, "xml")
    c("serialized meta")

    # Copy the script that generates this publication.
    os.makedirs(self.final_path_ + "scripts", exist_ok=True)
    shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                self.final_path_ + "scripts/triplify.py")

    # Copy the original (base) data and describe each part for the README.
    os.makedirs(self.final_path_ + "base", exist_ok=True)
    originals = ""
    tfriendship = ""
    tinteraction = ""
    tposts = ""
    if self.isfriendship:
        shutil.copy(self.data_path + self.filename_friendships,
                    self.final_path_ + "base/")
        originals += "base/{}".format(self.filename_friendships)
        tfriendship = (
            "\n\n{nf} individuals with metadata {fvars} "
            "and {nfs} friendships constitute the friendship network "
            "in the RDF/XML file: {frdf} "
            "\nor in the Turtle file: \n{fttl} "
            "(anonymized: {fan})."
        ).format(
            nf=self.nfriends,
            fvars=str(self.friendsvars),
            nfs=self.nfriendships,
            frdf=self.frdf,
            fttl=self.fttl,
            fan=self.friendships_anonymized,
        )
    if self.isinteraction:
        shutil.copy(self.data_path + self.filename_interactions,
                    self.final_path_ + "base/")
        tinteraction = (
            "\n\n{} individuals with metadata {} "
            "and {} interactions with metadata {} constitute the "
            "interaction network in the RDF/XML file: {} "
            "or in the Turtle file: {} "
            "(anonymized: {})."
        ).format(
            self.ninteracted, str(self.varsfriendsinteraction),
            self.ninteractions, str(self.interactionsvars),
            self.irdf, self.ittl,
            self.interactions_anonymized)
        originals += "\nbase/{}".format(self.filename_interactions)
    if self.hastext:
        shutil.copy(self.data_path + self.filename_posts,
                    self.final_path_ + "base/")
        tposts = (
            "\n\n{} posts with {:.3f} characters in average (std: {:.3f}) "
            "and total chars in snapshot: {} "
            "{:.3f} tokens in average (std: {:.3f}) "
            "and total tokens in snapshot: {} "
            "posts data in the RDF/XML file: {} "
            "or in the Turtle file: {}"
        ).format(
            self.nposts, self.mcharsposts, self.dcharsposts,
            self.totalchars,
            self.mtokensposts, self.dtokensposts, self.totaltokens,
            self.prdf, self.pttl)
        originals += "\nbase/{}".format(self.filename_posts)

    # Render the README.
    datetime_string = P.get(r.URIRef(self.snapshoturi), po.dateObtained,
                            None, context="social_facebook")[2]
    readme = (
        "::: Open Linked Social Data publication "
        "\nThis repository is a RDF data expression of the facebook snapshot "
        "{snapid} collected around {date}."
        "{tfriendship}{tinteraction}{tposts} "
        "\nMetadata for discovery in the RDF/XML file: {mrdf} "
        "\nor in the Turtle file:\n{mttl} "
        "\nOriginal file(s): {origs} "
        "\nEgo network: {ise} Group network: {isg} "
        "Friendship network: {isf} Interaction network: {isi} "
        "Has text/posts: {ist} "
        "\nAll files should be available at the git repository: {ava} "
        "\n{desc} The script that rendered this data publication is on the "
        "script/ directory.\n:::"
    ).format(
        snapid=self.snapshotid,
        date=datetime_string,
        tfriendship=tfriendship,
        tinteraction=tinteraction,
        tposts=tposts,
        mrdf=self.mrdf,
        mttl=self.mttl,
        origs=originals,
        ise=self.isego,
        isg=self.isgroup,
        isf=self.isfriendship,
        isi=self.isinteraction,
        ist=self.hastext,
        ava=self.online_prefix,
        desc=self.desc,
    )
    with open(self.final_path_ + "README", "w") as f:
        f.write(readme)
def makeMetadata(self):
    """Describe the translated snapshot in ``self.meta_graph``.

    Calls ``self.makePostsTriples()`` first, then:

    1. queries ``self.translation_graph`` for the distinct predicates used
       on participants and on messages and records them as attribute
       triples;
    2. builds the human-readable ``self.desc``;
    3. records names and sizes of the translation files (``translation_xml``
       / ``translation_ttl`` may each be a single name or a list);
    4. adds provenance triples, including ``self.desc`` as the comment.

    Sets ``participantvars``, ``messagevars``, ``mrdf``, ``mttl``,
    ``nchecks``, ``dates`` and ``desc`` on the instance.
    """
    self.makePostsTriples()

    # Distinct predicates used by the participants of this snapshot.
    self.participantvars = P.get(
        "SELECT DISTINCT ?p WHERE { GRAPH <%s> { "
        "?fooparticipant po:snapshot <%s> . "
        "?fooparticipant a po:Participant . "
        "?fooparticipant ?p ?fooobject . } } " % (
            self.translation_graph, self.snapshoturi))
    P.rdf.triplesScaffolding(
        self.snapshoturi,
        [po.ParticipantAttribute] * len(self.participantvars),
        self.participantvars,
        context=self.meta_graph)

    # Distinct predicates used by the messages of this snapshot.
    self.messagevars = P.get(
        "SELECT DISTINCT ?p WHERE { GRAPH <%s> { "
        "?foomessage po:snapshot <%s> . "
        "?foomessage a po:Message . "
        "?foomessage ?p ?fooobject . } } " % (
            self.translation_graph, self.snapshoturi))
    P.rdf.triplesScaffolding(
        self.snapshoturi,
        [po.MessageAttribute] * len(self.messagevars),
        self.messagevars,
        context=self.meta_graph)

    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"

    # The original literals below used backslash line continuations that
    # had degraded into stray "\ " sequences (invalid escapes in normal
    # strings, literal backslashes in the raw SPARQL strings); they are
    # written here as plain adjacent literals.
    self.desc = (
        "dataset with snapshotID: {}\nsnapshotURI: {} "
        "\nisEgo: {}. isGroup: {}."
    ).format(self.snapshotid, self.snapshoturi, self.isego, self.isgroup)
    self.desc += "\nisFriendship: {}; ".format(self.isfriendship)
    self.desc += "isInteraction: {}.".format(self.isinteraction)
    self.desc += "\nhasText: {}".format(self.hastext)

    self.nchecks = P.get(
        "SELECT (COUNT(?checker) as ?cs) WHERE { "
        "?foosession po:checkParticipant ?checker}",
        context=self.translation_graph)
    self.desc += (
        "\nnParticipants: {}; nInteractions: {} "
        "(only session checks in first aa)."
    ).format(self.nparticipants, self.nchecks)
    self.desc += "\nnMessages: {}; ".format(self.nmessages)
    self.desc += (
        "\nnCharsOverall: {}; mCharsOverall: {}; dCharsOverall: {}."
    ).format(self.totalchars, self.mchars_messages, self.dchars_messages)
    self.desc += (
        "\nnTokensOverall: {}; mTokensOverall: {}; dTokensOverall: {};"
    ).format(self.totaltokens, self.mtokens_messages,
             self.dtokens_messages)
    self.desc += (
        "\nnSentencesOverall: {}; mSentencesOverall: {}; "
        "dSentencesOverall: {};"
    ).format(self.totalsentences, self.msentences_messages,
             self.dsentences_messages)
    self.desc += "\nnURLs: {}; nAAMessages {}.".format(
        self.nurls, self.nmessages)

    # The original query was missing the brace that closes WHERE.
    self.dates = P.get(
        "SELECT ?date WHERE { GRAPH <%s> { "
        "?fooshout po:createdAt ?date } }" % (self.translation_graph,))
    # The original used the undefined name ``dates`` (NameError); min/max
    # would also fail on an empty result, so the line is skipped then.
    if self.dates:
        self.desc += "\nReference timespan: {} to {}".format(
            min(self.dates), max(self.dates))

    self.desc += (
        "\nRDF expression in the XML file(s): {} "
        "and the Turtle file(s): {} "
        "(anonymized: {})."
    ).format(self.translation_xml, self.translation_ttl, self.anonymized)
    self.desc += (
        "\nMetadata of this snapshot in the XML file(s): {} "
        "and the Turtle file(s): {}."
    ).format(self.meta_xml, self.meta_ttl)
    # The original called .format() with no argument (IndexError);
    # online_prefix is the value published as po.availableAt below.
    self.desc += "\nFiles should be available in: \n{}".format(
        self.online_prefix)
    self.desc += (
        "\n\nNote: numeric variables starting with n are "
        "countings, with m are means and d are standard deviations."
    )

    if isinstance(self.translation_xml, list):
        # Several translation files: one name triple per file.
        translation_files = self.translation_xml + self.translation_ttl
        P.rdf.triplesScaffolding(
            self.snapshoturi,
            [po.translationXMLFilename] * len(self.translation_xml)
            + [po.translationTTLFilename] * len(self.translation_ttl),
            translation_files,
            context=self.meta_graph)
        P.rdf.triplesScaffolding(
            self.snapshoturi,
            [po.onlineTranslationXMLFileURI] * len(self.translation_xml)
            + [po.onlineTranslationTTLFileURI] * len(self.translation_ttl),
            [self.online_prefix + filename
             for filename in translation_files],
            context=self.meta_graph)
        triples = [
            (self.snapshoturi, po.translationXMLFilesize,
             self.translation_size_xml),
            (self.snapshoturi, po.translationTTLFilesize,
             self.translation_size_ttl),
        ]
    else:
        triples = [
            (self.snapshoturi, po.translationXMLFilename,
             self.translation_xml),
            (self.snapshoturi, po.translationXMLFilesize,
             self.translation_size_xml),
            (self.snapshoturi, po.translationTTLFilename,
             self.translation_ttl),
            (self.snapshoturi, po.translationTTLFilesize,
             self.translation_size_ttl),
        ]
    P.add(triples, self.meta_graph)

    triples = [
        (self.snapshoturi, po.triplifiedIn, datetime.datetime.now()),
        (self.snapshoturi, po.triplifiedBy, "scripts/"),
        (self.snapshoturi, po.availableAt, self.online_prefix),
        (self.snapshoturi, po.onlineMetaXMLFile,
         self.online_prefix + self.mrdf),
        (self.snapshoturi, po.onlineMetaTTLFile,
         self.online_prefix + self.mttl),
        (self.snapshoturi, po.metaXMLFileName, self.mrdf),
        (self.snapshoturi, po.metaTTLFileName, self.mttl),
        # Protocol tag, e.g. AA, fb, etc.
        (self.snapshoturi, po.socialProtocolTag, self.social_protocol),
        (self.snapshoturi, po.socialProtocol,
         P.rdf.ic(po.Platform, self.social_protocol, self.meta_graph,
                  self.snapshoturi)),
        (self.snapshoturi, po.nTriples, self.ntranslation_triples),
        (self.snapshoturi, NS.rdfs.comment, self.desc),
    ]
    P.add(triples, self.meta_graph)
def writeAllFB(self):
    """Write every graph of the facebook snapshot, plus a README, to disk.

    Creates ``<final_path><snapshotid>/`` (kept in ``self.final_path_``)
    and, depending on ``self.isfriendship``, ``self.isinteraction`` and
    ``self.hastext``, serializes the friendship, interaction and posts
    graphs to Turtle and RDF/XML.  File sizes (in MB) and triple counts
    of what was written are added to ``self.meta_graph``, which is then
    serialized as well.  Finally the triplification script and the
    original data files are copied next to the output and a ``README``
    is rendered.

    Output file names come from ``self.frdf``/``fttl``, ``irdf``/``ittl``,
    ``prdf``/``pttl`` and ``mrdf``/``mttl`` so that the files on disk
    match the names published in the metadata and the README (the posts
    files used to be written as ``Posts.*`` while being advertised under
    ``self.prdf``/``self.pttl``).
    NOTE(review): these attributes are presumably set by ``makeMetadata``
    -- confirm it always runs before this method.
    """
    c("started rendering of the snapshot publication. snapshotID:",
      self.snapshotid)
    self.final_path_ = "{}{}/".format(self.final_path, self.snapshotid)
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # isdir() + mkdir() and also creates missing parent directories.
    os.makedirs(self.final_path_, exist_ok=True)

    def dump(graph_id, ttl_name, rdf_name, done_message):
        """Serialize one graph; return (ntriples, rdf size MB, ttl size MB)."""
        graph = P.context(graph_id)
        graph.namespace_manager.bind("po", po)
        ttl_path = self.final_path_ + ttl_name
        rdf_path = self.final_path_ + rdf_name
        graph.serialize(ttl_path, "turtle")
        c("ttl")
        graph.serialize(rdf_path, "xml")
        c(done_message)
        return (len(graph),
                os.path.getsize(rdf_path) / (10**6),
                os.path.getsize(ttl_path) / (10**6))

    # (enabled?, graph, ttl name, rdf name, log message, predicates for
    # RDF/XML size, Turtle size and triple count) for each data graph.
    triples = []
    if self.isfriendship:
        ntriples, size_rdf, size_ttl = dump(
            self.friendship_graph, self.fttl, self.frdf,
            "serialized friendships")
        triples += [
            (self.snapshoturi, po.friendshipXMLFileSizeMB, size_rdf),
            (self.snapshoturi, po.friendshipTTLFileSizeMB, size_ttl),
            (self.snapshoturi, po.nFriendshipTriples, ntriples),
        ]
    if self.isinteraction:
        ntriples, size_rdf, size_ttl = dump(
            self.interaction_graph, self.ittl, self.irdf,
            "serialized interaction")
        triples += [
            (self.snapshoturi, po.interactionXMLFileSizeMB, size_rdf),
            (self.snapshoturi, po.interactionTTLFileSizeMB, size_ttl),
            (self.snapshoturi, po.nInteractionTriples, ntriples),
        ]
    if self.hastext:
        ntriples, size_rdf, size_ttl = dump(
            self.posts_graph, self.pttl, self.prdf, "serialized posts")
        triples += [
            (self.snapshoturi, po.postsXMLFileSizeMB, size_rdf),
            (self.snapshoturi, po.postsTTLFileSizeMB, size_ttl),
            (self.snapshoturi, po.nPostsTriples, ntriples),
        ]

    meta = P.context(self.meta_graph)
    # Add the size/count triples first so the count below includes them;
    # the +1 accounts for the nMetaTriples triple itself.  The original
    # counted before adding anything, which published a stale number.
    if triples:
        P.add(triples, context=self.meta_graph)
    P.add([(self.snapshoturi, po.nMetaTriples, len(meta) + 1)],
          context=self.meta_graph)
    meta.namespace_manager.bind("po", po)
    meta.serialize(self.final_path_ + self.mttl, "turtle")
    c("ttl")
    meta.serialize(self.final_path_ + self.mrdf, "xml")
    c("serialized meta")

    # Copy the script that generates this publication.
    os.makedirs(self.final_path_ + "scripts", exist_ok=True)
    shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                self.final_path_ + "scripts/triplify.py")

    # Copy the original (base) data and describe each part for the README.
    os.makedirs(self.final_path_ + "base", exist_ok=True)
    originals = ""
    tfriendship = ""
    tinteraction = ""
    tposts = ""
    if self.isfriendship:
        shutil.copy(self.data_path + self.filename_friendships,
                    self.final_path_ + "base/")
        originals += "base/{}".format(self.filename_friendships)
        tfriendship = (
            "\n\n{nf} individuals with metadata {fvars} "
            "and {nfs} friendships constitute the friendship network "
            "in the RDF/XML file: {frdf} "
            "\nor in the Turtle file: \n{fttl} "
            "(anonymized: {fan})."
        ).format(
            nf=self.nfriends,
            fvars=str(self.friendsvars),
            nfs=self.nfriendships,
            frdf=self.frdf,
            fttl=self.fttl,
            fan=self.friendships_anonymized,
        )
    if self.isinteraction:
        shutil.copy(self.data_path + self.filename_interactions,
                    self.final_path_ + "base/")
        tinteraction = (
            "\n\n{} individuals with metadata {} "
            "and {} interactions with metadata {} constitute the "
            "interaction network in the RDF/XML file: {} "
            "or in the Turtle file: {} "
            "(anonymized: {})."
        ).format(
            self.ninteracted, str(self.varsfriendsinteraction),
            self.ninteractions, str(self.interactionsvars),
            self.irdf, self.ittl,
            self.interactions_anonymized)
        originals += "\nbase/{}".format(self.filename_interactions)
    if self.hastext:
        shutil.copy(self.data_path + self.filename_posts,
                    self.final_path_ + "base/")
        tposts = (
            "\n\n{} posts with {:.3f} characters in average (std: {:.3f}) "
            "and total chars in snapshot: {} "
            "{:.3f} tokens in average (std: {:.3f}) "
            "and total tokens in snapshot: {} "
            "posts data in the RDF/XML file: {} "
            "or in the Turtle file: {}"
        ).format(
            self.nposts, self.mcharsposts, self.dcharsposts,
            self.totalchars,
            self.mtokensposts, self.dtokensposts, self.totaltokens,
            self.prdf, self.pttl)
        originals += "\nbase/{}".format(self.filename_posts)

    # Render the README.
    datetime_string = P.get(r.URIRef(self.snapshoturi), po.dateObtained,
                            None, context="social_facebook")[2]
    readme = (
        "::: Open Linked Social Data publication "
        "\nThis repository is a RDF data expression of the facebook snapshot "
        "{snapid} collected around {date}."
        "{tfriendship}{tinteraction}{tposts} "
        "\nMetadata for discovery in the RDF/XML file: {mrdf} "
        "\nor in the Turtle file:\n{mttl} "
        "\nOriginal file(s): {origs} "
        "\nEgo network: {ise} Group network: {isg} "
        "Friendship network: {isf} Interaction network: {isi} "
        "Has text/posts: {ist} "
        "\nAll files should be available at the git repository: {ava} "
        "\n{desc} The script that rendered this data publication is on the "
        "script/ directory.\n:::"
    ).format(
        snapid=self.snapshotid,
        date=datetime_string,
        tfriendship=tfriendship,
        tinteraction=tinteraction,
        tposts=tposts,
        mrdf=self.mrdf,
        mttl=self.mttl,
        origs=originals,
        ise=self.isego,
        isg=self.isgroup,
        isf=self.isfriendship,
        isi=self.isinteraction,
        ist=self.hastext,
        ava=self.online_prefix,
        desc=self.desc,
    )
    with open(self.final_path_ + "README", "w") as f:
        f.write(readme)
def makeMetadata(self):
    """Add message statistics and provenance triples of the IRC snapshot to the meta graph.

    NOTE(review): the bare ``return`` right below makes this method a no-op;
    every statement after it is unreachable.  Presumably the metadata step
    was switched off on purpose -- confirm before removing the ``return``.
    """
    return
    # ---- unreachable from here on (kept exactly as found) ----
    qtriples=[
            ("?fooshout",po.shoutText,"?text"),
            ]
    # Total, mean and standard deviation of characters, tokens and sentences
    # over the per-message size lists.
    self.totalchars=sum( self.size_chars_overall)
    self.mchars_messages=n.mean( self.size_chars_overall)
    self.dchars_messages=n.std( self.size_chars_overall)
    self.totaltokens=sum( self.size_tokens_overall)
    self.mtokens_messages=n.mean( self.size_tokens_overall)
    self.dtokens_messages=n.std( self.size_tokens_overall)
    self.totalsentences=sum( self.size_sentences_overall)
    self.msentences_messages=n.mean(self.size_sentences_overall)
    self.dsentences_messages=n.std( self.size_sentences_overall)
    # Counts queried from the translation graph.
    self.nmessages=P.get("SELECT (COUNT(?s) as ?s) WHERE { ?s a po:Shout }",context=self.translation_graph)
    self.nparticipants=P.get("SELECT (COUNT(?s) as ?s) WHERE { ?s a po:Participant }",context=self.translation_graph)
    self.nurls=P.get("SELECT (COUNT(?s) as ?s) WHERE { ?s po:hasUrl ?o }",context=self.translation_graph)
    triples=[
            (self.snapshoturi, po.nParticipants, self.nparticipants),
            (self.snapshoturi, po.nMessages, self.nmessages),
            (self.snapshoturi, po.nCharsOverall, self.totalchars),
            (self.snapshoturi, po.mCharsOverall, self.mchars_messages),
            (self.snapshoturi, po.dCharsOverall, self.dchars_messages),
            (self.snapshoturi, po.nTokensOverall, self.totaltokens),
            (self.snapshoturi, po.mTokensOverall, self.mtokens_messages),
            (self.snapshoturi, po.dTokensOverall, self.dtokens_messages),
            (self.snapshoturi, po.nSentencesOverall, self.totalsentences),
            (self.snapshoturi, po.mSentencesOverall, self.msentences_messages),
            (self.snapshoturi, po.dSentencesOverall, self.dsentences_messages),
            ]
    P.add(triples,context=self.meta_graph)
    # One triple per participant / message variable name.
    P.rdf.triplesScaffolding(self.snapshoturi,
            [po.ParticipantAttribute]*len(self.participantvars),
            self.participantvars,context=self.meta_graph)
    P.rdf.triplesScaffolding(self.snapshoturi,
            [po.MessageAttribute]*len(self.messagevars),
            self.messagevars,context=self.meta_graph)
    # File names of the translation, first local and then prefixed with online_prefix.
    P.rdf.triplesScaffolding(self.snapshoturi,
            [po.shoutXMLFilename]*len(self.translation_xml)+[po.shoutTTLFilename]*len(self.translation_ttl),
            self.translation_xml+self.translation_ttl,context=self.meta_graph)
    P.rdf.triplesScaffolding(self.snapshoturi,
            [po.onlineShoutXMLFile]*len(self.translation_xml)+[po.onlineShoutTTLFile]*len(self.translation_ttl),
            [self.online_prefix+i for i in self.translation_xml+self.translation_ttl],context=self.meta_graph)

    self.mrdf=self.snapshotid+"Meta.rdf"
    self.mttl=self.snapshotid+"Meta.ttl"
    # Human-readable description, stored below as the rdfs:comment of the snapshot.
    self.desc="irc dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
            self.snapshotid,self.snapshoturi,self.isego,self.isgroup,)
    self.desc+="\nisFriendship: {}; ".format(self.isfriendship)
    self.desc+="isInteraction: {}.".format(self.isinteraction)
    self.nchecks=P.get(r"SELECT (COUNT(?checker) as ?cs) WHERE { ?foosession po:checkParticipant ?checker}",context=self.translation_graph)
    self.desc+="\nnParticipants: {}; nInteractions: {} (only session checks in first aa).".format(self.nparticipants,self.nchecks)
    self.desc+="\nisPost: {} (alias hasText: {})".format(self.hastext,self.hastext)
    self.desc+="\nnMessages: {}; ".format(self.nmessages)
    self.desc+="\nnCharsOverall: {}; mCharsOverall: {}; dCharsOverall: {}.".format(self.totalchars, self.mchars_messages, self.dchars_messages)
    self.desc+="\nnTokensOverall: {}; mTokensOverall: {}; dTokensOverall: {};".format(self.totaltokens, self.mtokens_messages, self.dtokens_messages)
    self.desc+="\nnSentencesOverall: {}; mSentencesOverall: {}; dSentencesOverall: {};".format(self.totalsentences,self.msentences_messages, self.dsentences_messages)
    self.desc+="\nnURLs: {}; nAAMessages {}.".format(self.nurls,self.nmessages)
    # Provenance and location triples of the publication.
    triples=[
            (self.snapshoturi, po.triplifiedIn, datetime.datetime.now()),
            (self.snapshoturi, po.triplifiedBy, "scripts/"),
            (self.snapshoturi, po.donatedBy, self.snapshotid[:-4]),
            (self.snapshoturi, po.availableAt, self.online_prefix),
            (self.snapshoturi, po.onlineMetaXMLFile, self.online_prefix+self.mrdf),
            (self.snapshoturi, po.onlineMetaTTLFile, self.online_prefix+self.mttl),
            (self.snapshoturi, po.metaXMLFileName, self.mrdf),
            (self.snapshoturi, po.metaTTLFileName, self.mttl),
            (self.snapshoturi, po.totalXMLFileSizeMB, self.size_xml),
            (self.snapshoturi, po.totalTTLFileSizeMB, self.size_ttl),
            (self.snapshoturi, po.acquiredThrough, "aa shouts in "+self.snapshotid),
            (self.snapshoturi, po.socialProtocolTag, "AA"),
            (self.snapshoturi, po.socialProtocol, P.rdf.ic(po.Platform,"IRC",self.meta_graph,self.snapshoturi)),
            (self.snapshoturi, po.nTriples, self.ntranslation_triples),
            (self.snapshoturi, NS.rdfs.comment, self.desc),
            ]
    P.add(triples,self.meta_graph)
def writeAll(self):
    """Serialize the meta graph and write the README of the IRC snapshot.

    Steps, in order:

    1. record the current size of the meta graph as ``po.nMetaTriples``;
    2. serialize the meta graph to ``<snapshotid>Meta.ttl`` (Turtle) and
       ``<snapshotid>Meta.rdf`` (RDF/XML) under ``self.final_path_``;
    3. copy the triplification script into ``scripts/``;
    4. query the creation dates of the messages (stored, ISO formatted, in
       ``self.dates``) and write the ``README`` file.

    Raises:
        ValueError: from ``min``/``max`` when the date query returns nothing.

    NOTE(review): the statistics read here are spelled ``mcharsmessages``,
    ``dcharsmessages``, ... while ``makeMetadata`` in this file spells them
    ``mchars_messages``, ``dchars_messages``, ...; confirm which spelling the
    class really defines.  Likewise other writers in this file use
    ``S.PACKAGEDIR`` where this one uses ``PACKAGEDIR``.
    """
    meta = P.context(self.meta_graph)
    # The count is taken before the nMetaTriples triple itself is added.
    ntriples = len(meta)
    P.add([(self.snapshoturi, po.nMetaTriples, ntriples)],
          context=self.meta_graph)
    meta.namespace_manager.bind("po", po)
    meta.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
    c("ttl")
    meta.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
    c("serialized meta")

    scripts_dir = self.final_path_ + "scripts"
    os.makedirs(scripts_dir, exist_ok=True)
    shutil.copy(PACKAGEDIR + "/../tests/triplify.py",
                scripts_dir + "/triplify.py")

    # Interaction paragraph.  The previous template had two placeholders for
    # these seven arguments (so counts were printed where file names were
    # expected) and its result was never used.  The wording of the first
    # clause is reconstructed from the argument list -- adjust if needed.
    tinteraction = (
        "\n\n{} participants with metadata {} and {} checks, "
        "{} direct messages and {} mentions constitute the interaction "
        "structure in the RDF/XML file(s): {} and the Turtle file(s): {} "
        "(anonymized: False \"nicks inteface\")."
    ).format(
        self.nparticipants, str(self.participantvars),
        self.nchecks, self.ndirect, self.nmention,
        self.translation_xml,
        self.translation_ttl,
    )
    tposts = (
        "\n\nThe dataset consists of {} shout messages with metadata {} "
        "{:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {} "
        "{:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {} "
        "{:.3f} sentences in average (std: {:.3f}) and total sentences in snapshot: {}"
    ).format(
        self.nmessages, str(self.messagevars),
        self.mcharsmessages, self.dcharsmessages, self.totalchars,
        self.mtokensmessages, self.dtokensmessages, self.totaltokens,
        self.msentencesmessages, self.dsentencesmessages, self.totalsentences,
    )

    # The closing brace of the outer group was missing, which made the query
    # syntactically invalid.
    self.dates = P.get(
        r"SELECT ?date WHERE { GRAPH <%s> { ?fooshout po:createdAt ?date } }"
        % (self.translation_graph,))
    self.dates = [i.isoformat() for i in self.dates]
    date1 = min(self.dates)
    date2 = max(self.dates)

    readme = (
        "::: Open Linked Social Data publication "
        "\nThis repository is a RDF data expression of the IRC snapshot {snapid} "
        "with tweets from {date1} to {date2} (total of {ntrip} triples).{tinteraction}{tposts} "
        "\nMetadata for discovery in the RDF/XML file: {mrdf} "
        "\nor in the Turtle file:\n{mttl} "
        "\nEgo network: {ise} Group network: {isg} Friendship network: {isf} "
        "Interaction network: {isi} Has text/posts: {ist} "
        "\nAll files should be available at the git repository: {ava} "
        "\n{desc} The script that rendered this data publication is on the script/ directory.\n:::"
    ).format(
        snapid=self.snapshotid, date1=date1, date2=date2, ntrip=self.ntriples,
        # Previously ``tposts`` was passed here too, duplicating that paragraph.
        tinteraction=tinteraction,
        tposts=tposts,
        mrdf=self.translation_xml,
        mttl=self.translation_ttl,
        ise=self.isego,
        isg=self.isgroup,
        isf=self.isfriendship,
        isi=self.isinteraction,
        ist=self.hastext,
        ava=self.online_prefix,
        desc=self.desc,
    )
    with open(self.final_path_ + "README", "w") as f:
        f.write(readme)
def makeMetadata(self):
    """Collect the metadata triples of a facebook snapshot in the meta graph.

    Every triple found in the ``"social_facebook"`` context about the
    snapshot and about its raw files is copied to ``self.meta_graph``.  Then
    one (predicate, value) pair is assembled for each fact about the
    friendship, interaction and posts structures that are present, the
    human-readable description ``self.desc`` is built, and everything is
    attached to the snapshot URI through ``P.rdf.triplesScaffolding``.

    Sets ``self.ffile/frdf/fttl``, ``self.ifile/irdf/ittl``,
    ``self.pfile/prdf/pttl`` (for the structures present), ``self.isego``,
    ``self.isgroup``, ``self.mrdf``, ``self.mttl`` and ``self.desc``.

    Predicates and values are kept side by side as pairs: the former parallel
    lists had drifted apart for the interaction network (original/online file
    name and nInteracted/nInteractions were swapped), and the posts
    description formatted seven values into six placeholders.

    Raises:
        ValueError: if it is a friendship snapshot with two different,
            non-empty group ids.
    """
    if self.isfriendship and self.groupid and self.groupid2 and (
            self.groupid != self.groupid2):
        raise ValueError("Group IDS are different")

    # Carry over what is already stated about the snapshot and its raw files.
    triples = P.get(self.snapshoturi, None, None, "social_facebook")
    for rawfile in P.get(self.snapshoturi, po.rawFile, None,
                         "social_facebook", strict=True, minimized=True):
        triples += P.get(rawfile, None, None, "social_facebook")
    P.add(triples, context=self.meta_graph)

    pairs = []  # (predicate, value), in emission order
    if self.isfriendship:
        self.ffile = "base/" + self.filename_friendships
        self.frdf = self.snapshotid + "Friendship.rdf"
        self.fttl = self.snapshotid + "Friendship.ttl"
        pairs += [
            (po.onlineOriginalFriendshipFile, self.online_prefix + self.ffile),
            (po.originalFriendshipFileName, self.ffile),
            (po.onlineFriendshipXMLFile, self.online_prefix + self.frdf),
            (po.onlineFriendshipTTLFile, self.online_prefix + self.fttl),
            (po.friendshipXMLFileName, self.frdf),
            (po.friendshipTTLFileName, self.fttl),
            (po.nFriends, self.nfriends),
            (po.nFriendships, self.nfriendships),
            (po.friendshipsAnonymized, self.friendships_anonymized),
        ]
        pairs += [(po.frienshipParticipantAttribute, var)
                  for var in self.friendsvars]
    if self.isinteraction:
        self.ifile = "base/" + self.filename_interactions
        self.irdf = self.snapshotid + "Interaction.rdf"
        self.ittl = self.snapshotid + "Interaction.ttl"
        pairs += [
            # online location first, bare file name second (was swapped)
            (po.onlineOriginalInteractionFile, self.online_prefix + self.ifile),
            (po.originalInteractionFileName, self.ifile),
            (po.onlineInteractionXMLFile, self.online_prefix + self.irdf),
            (po.onlineInteractionTTLFile, self.online_prefix + self.ittl),
            (po.interactionXMLFileName, self.irdf),
            (po.interactionTTLFileName, self.ittl),
            # counts now match their predicates (were swapped)
            (po.nInteracted, self.ninteracted),
            (po.nInteractions, self.ninteractions),
            (po.interactionsAnonymized, self.interactions_anonymized),
        ]
        pairs += [(po.interactionParticipantAttribute, var)
                  for var in self.interactionsvars]
    if self.hastext:
        self.pfile = "base/" + self.filename_posts
        self.prdf = self.snapshotid + "Post.rdf"
        self.pttl = self.snapshotid + "Post.ttl"
        pairs += [
            (po.onlineOriginalPostsFile, self.online_prefix + self.pfile),
            (po.originalPostsFileName, self.pfile),
            (po.onlinePostsXMLFile, self.online_prefix + self.prdf),
            (po.onlinePostsTTLFile, self.online_prefix + self.pttl),
            (po.postsXMLFileName, self.prdf),
            (po.postsTTLFileName, self.pttl),
            (po.nPosts, self.nposts),
            (po.nCharsOverall, int(self.totalchars)),
            (po.mCharsOverall, self.mcharsposts),
            (po.dCharsOverall, self.dcharsposts),
            (po.nTokensOverall, int(self.totaltokens)),
            (po.mTokensOverall, self.mtokensposts),
            (po.dTokensOverall, self.dtokensposts),
        ]
        pairs += [(po.postAttribute, var) for var in self.postsvars]

    self.isego = bool(P.get(r.URIRef(self.snapshoturi), a, po.EgoSnapshot))
    self.isgroup = bool(
        P.get(r.URIRef(self.snapshoturi), a, po.GroupSnapshot))
    pairs += [
        (po.isGroup, self.isgroup),
        (po.isEgo, self.isego),
        (po.isFriendship, self.isfriendship),
        (po.isInteraction, self.isinteraction),
        (po.hasText, self.hastext),
        (po.isPost, self.hastext),
    ]

    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"
    self.desc = "facebook network with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
        self.snapshotid, self.snapshoturi, self.isego, self.isgroup)
    self.desc += "\nisFriendship: {}".format(self.isfriendship)
    if self.isfriendship:
        self.desc += "; nFriends: {}; nFrienships: {}.".format(
            self.nfriends, self.nfriendships)
    self.desc += "\nisInteraction: {}".format(self.isinteraction)
    if self.isinteraction:
        self.desc += "; nInteracted: {}; nInteractions: {}.".format(
            self.ninteracted, self.ninteractions)
    self.desc += "\nisPost: {} (alias hasText: {})".format(
        self.hastext, self.hastext)
    if self.hastext:
        # A placeholder for nposts was missing: str.format silently ignores
        # surplus arguments, so every number appeared under the wrong label.
        self.desc += (
            ";\nnPosts: {}; mCharsPostsOverall: {}; dCharsPostsOverall: {}; totalCharsOverall: {}; "
            "\nmTokensPostsOverall: {}; dTokensPostsOverall: {}; totalTokensOverall: {}"
        ).format(
            self.nposts,
            self.mcharsposts, self.dcharsposts, self.totalchars,
            self.mtokensposts, self.dtokensposts, self.totaltokens,
        )

    provenance = [
        (po.triplifiedIn, datetime.datetime.now()),
        (po.triplifiedBy, "scripts/"),
        (po.donatedBy, self.snapshotid[:-4]),
        (po.availableAt, self.online_prefix),
        (po.onlineMetaXMLFile, self.online_prefix + self.mrdf),
        (po.onlineMetaTTLFile, self.online_prefix + self.mttl),
        (po.metaXMLFileName, self.mrdf),
        (po.metaTTLFileName, self.mttl),
        (po.acquiredThrough, "Netvizz"),
        (po.socialProtocolTag, "Facebook"),
        (po.socialProtocol,
         P.rdf.ic(po.Platform, "Facebook", self.meta_graph, self.snapshoturi)),
        (NS.rdfs.comment, self.desc),
    ]
    everything = provenance + pairs
    P.rdf.triplesScaffolding(
        self.snapshoturi,
        [predicate for predicate, _ in everything],
        [value for _, value in everything],
        self.meta_graph)
def makeMetadata(self):
    """Write the metadata triples of a friendship-only facebook snapshot.

    Sets ``self.ffile``, ``self.frdf``, ``self.fttl``, ``self.mrdf``,
    ``self.mttl`` and the textual description ``self.desc``; adds the
    anonymization flag and then the snapshot description triples to
    ``self.meta_graph``.  The numeric id, string id and url of the snapshot
    are copied from ``self.social_graph`` only when they are found there.

    In this revision the file-name / online-location predicates, the friend
    and friendship counts, the participant attributes and the copy of the
    social-graph triples are not emitted.
    """
    snapshot = self.snapshoturi
    self.ffile = "base/" + self.filename_friendships
    self.frdf = self.snapshotid + "Friendship.rdf"
    self.fttl = self.snapshotid + "Friendship.ttl"
    P.add(
        [(snapshot, po.friendshipsAnonymized, self.friendships_anonymized)],
        context=self.meta_graph,
    )

    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"
    description = [
        "facebook network with snapshotID: {}\nsnapshotURI: {} \n"
        "isEgo: {}. isGroup: {}.".format(
            self.snapshotid, self.snapshoturi, self.isego, self.isgroup),
        "\nisFriendship: {}".format(self.isfriendship),
        "\nisInteraction: {}".format(self.isinteraction),
        "\nisPost: {} (hasText)".format(self.hastext),
    ]
    self.desc = "".join(description)

    subject = r.URIRef(snapshot)
    date_obtained = P.get(subject, po.dateObtained)[2].toPython()
    assert isinstance(date_obtained, datetime.date)
    name = P.get(subject, po.name, None, context=self.social_graph)[2]

    # The snapshot kind flags are fixed values here, not read from self.
    triples = [
        (snapshot, po.triplifiedIn, datetime.datetime.now()),
        (snapshot, a, po.Snapshot),
        (snapshot, po.snapshotID, self.snapshotid),
        (snapshot, po.isEgo, True),
        (snapshot, po.isGroup, False),
        (snapshot, po.isFriendship, True),
        (snapshot, po.isInteraction, False),
        (snapshot, po.isPost, False),
        (snapshot, po.dateObtained, date_obtained),
        (snapshot, po.name, name),
        (snapshot, po.acquiredThrough, "Netvizz"),
        (snapshot, po.socialProtocol, "Facebook"),
        (snapshot, po.comment, self.desc),
    ]
    # Optional identifiers: copied only when the social graph has them.
    for predicate in (po.numericID, po.stringID, po.url):
        found = P.get(subject, predicate, None, context=self.social_graph)
        if found:
            triples.append((snapshot, predicate, found[2]))
    P.add(triples, self.meta_graph)
def writeAllFB(self):
    """Render the publication of a friendship-only facebook snapshot.

    Creates ``<final_path><snapshotid>/`` and, inside it:

    * the friendship graph as ``<snapshotid>Friendship.ttl`` / ``.rdf``;
    * the meta graph as ``<snapshotid>Meta.ttl`` / ``.rdf``, after adding
      the friendship file sizes (MB), the number of friendship triples and
      the number of meta triples to it;
    * ``scripts/triplify.py``, a copy of the triplification script;
    * ``base/<filename_friendships>``, a copy of the original data file;
    * a ``README`` describing the publication.

    Sets ``self.final_path_``.
    """
    c("started rendering of the snapshot publication. snapshotID:",
      self.snapshotid)
    self.final_path_ = "{}{}/".format(self.final_path, self.snapshotid)
    os.makedirs(self.final_path_, exist_ok=True)
    prefix = self.final_path_ + self.snapshotid

    friendship = P.context(self.friendship_graph)
    friendship.namespace_manager.bind("po", po)
    friendship.serialize(prefix + "Friendship.ttl", "turtle")
    c("ttl")
    friendship.serialize(prefix + "Friendship.rdf", "xml")
    c("serialized friendships")

    # File sizes in MB and number of triples of the friendship graph.
    filesizerdf = os.path.getsize(prefix + "Friendship.rdf") / (10**6)
    filesizettl = os.path.getsize(prefix + "Friendship.ttl") / (10**6)
    P.add([
        (self.snapshoturi, po.friendshipXMLFileSizeMB, filesizerdf),
        (self.snapshoturi, po.friendshipTTLFileSizeMB, filesizettl),
        (self.snapshoturi, po.nFriendshipTriples, len(friendship)),
    ], context=self.meta_graph)

    meta = P.context(self.meta_graph)
    # +1 accounts for the nMetaTriples triple itself.  Only this new triple
    # is added; the three above are already in the graph.
    P.add([(self.snapshoturi, po.nMetaTriples, len(meta) + 1)],
          context=self.meta_graph)
    meta.namespace_manager.bind("po", po)
    meta.serialize(prefix + "Meta.ttl", "turtle")
    c("ttl")
    meta.serialize(prefix + "Meta.rdf", "xml")
    c("serialized meta")

    os.makedirs(self.final_path_ + "scripts", exist_ok=True)
    shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                self.final_path_ + "scripts/triplify.py")
    # Copy of the base (original) data.
    os.makedirs(self.final_path_ + "base", exist_ok=True)
    shutil.copy(self.data_path + self.filename_friendships,
                self.final_path_ + "base/")
    originals = "base/{}".format(self.filename_friendships)

    # "\in" was an invalid escape sequence that put a literal backslash in
    # the README; "\nor in" is what the sibling README template uses.
    tfriendship = (
        "\n\n{nf} individuals with metadata {fvars} and {nfs} friendships "
        "constitute the friendship network in the RDF/XML file: {frdf} "
        "\nor in the Turtle file: \n{fttl} (anonymized {fan})."
    ).format(
        nf=self.nfriends, fvars=str(self.friendsvars),
        nfs=self.nfriendships,
        frdf=self.frdf, fttl=self.fttl,
        fan=self.friendships_anonymized,
    )
    datetime_string = P.get(self.snapshoturi, po.dateObtained, None,
                            context="social_facebook")[2]
    readme = (
        "::: Open Linked Social Data publication "
        "\nThis repository is a RDF data expression of the facebook snapshot {snapid} "
        "collected around {date}.{tfriendship} "
        "\nMetadata for discovery in the RDF/XML file: {mrdf} "
        "\nor in the Turtle file:\n{mttl} "
        "\nOriginal file(s): {origs} "
        "\nEgo network: {ise} Group network: {isg} Friendship network: {isf} "
        "Interaction network: {isi} Has text/posts: {ist} "
        "\nAll files should be available at the git repository: {ava} "
        "\n{desc} The script that rendered this data publication is on the script/ directory.\n:::"
    ).format(
        snapid=self.snapshotid, date=datetime_string,
        tfriendship=tfriendship,
        mrdf=self.mrdf,
        mttl=self.mttl,
        origs=originals,
        ise=self.isego,
        isg=self.isgroup,
        isf=self.isfriendship,
        isi=self.isinteraction,
        ist=self.hastext,
        ava=self.online_prefix,
        desc=self.desc,
    )
    with open(self.final_path_ + "README", "w") as f:
        f.write(readme)
def makeMetadata(self):
    """Add the metadata triples of a twitter snapshot to the meta graph.

    Copies what ``self.social_graph`` states about the snapshot and its raw
    files, computes total / mean / standard deviation of characters and
    tokens over ``self.nchars_all`` and ``self.ntokens_all``, registers the
    participant attributes and the translation file names (local and
    prefixed with ``self.online_prefix``), builds the description
    ``self.desc`` and finally adds the provenance triples.

    Sets ``self.totalchars``, ``self.mcharstweets``, ``self.dcharstweets``,
    ``self.totaltokens``, ``self.mtokenstweets``, ``self.dtokenstweets``,
    ``self.mrdf``, ``self.mttl`` and ``self.desc``.
    """
    snapshot = self.snapshoturi
    meta = self.meta_graph

    inherited = P.get(snapshot, None, None, self.social_graph)
    rawfiles = P.get(snapshot, po.rawFile, None, self.social_graph,
                     strict=True, minimized=True)
    for rawfile in rawfiles:
        inherited += P.get(rawfile, None, None, self.social_graph)

    self.totalchars = sum(self.nchars_all)
    self.mcharstweets = n.mean(self.nchars_all)
    self.dcharstweets = n.std(self.nchars_all)
    self.totaltokens = sum(self.ntokens_all)
    self.mtokenstweets = n.mean(self.ntokens_all)
    self.dtokenstweets = n.std(self.ntokens_all)
    P.add(inherited, context=meta)

    counts = (
        (po.nParticipants, self.nparticipants),
        (po.nTweets, self.ntweets),
        (po.nReplies, self.nreplies),
        (po.nRetweets, self.nretweets),
        (po.nCharsOverall, self.totalchars),
        (po.mCharsOverall, self.mcharstweets),
        (po.dCharsOverall, self.dcharstweets),
        (po.nTokensOverall, self.totaltokens),
        (po.mTokensOverall, self.mtokenstweets),
        (po.dTokensOverall, self.dtokenstweets),
    )
    P.add([(snapshot, predicate, value) for predicate, value in counts],
          context=meta)

    P.rdf.triplesScaffolding(
        snapshot,
        [po.tweetParticipantAttribute] * len(self.participantvars),
        self.participantvars,
        context=meta)
    # XML file names first, Turtle file names after them.
    n_rdf = len(self.tweet_rdf)
    n_ttl = len(self.tweet_ttl)
    P.rdf.triplesScaffolding(
        snapshot,
        [po.tweetXMLFilename] * n_rdf + [po.tweetTTLFilename] * n_ttl,
        self.tweet_rdf + self.tweet_ttl,
        context=meta)
    P.rdf.triplesScaffolding(
        snapshot,
        [po.onlineTweetXMLFile] * n_rdf + [po.onlineTweetTTLFile] * n_ttl,
        [self.online_prefix + fname
         for fname in self.tweet_rdf + self.tweet_ttl],
        context=meta)

    self.mrdf = self.snapshotid + "Meta.rdf"
    self.mttl = self.snapshotid + "Meta.ttl"
    ninteractions = self.nreplies + self.nretweets + self.nuser_mentions
    self.desc = "".join([
        "twitter dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
            self.snapshotid, self.snapshoturi, self.isego, self.isgroup),
        "\nisFriendship: {}; ".format(self.isfriendship),
        "isInteraction: {}.".format(self.isinteraction),
        "\nnParticipants: {}; nInteractions: {} (replies+retweets+user mentions).".format(
            self.nparticipants, ninteractions),
        "\nisPost: {} (alias hasText: {})".format(self.hastext, self.hastext),
        "\nnTweets: {}; ".format(self.ntweets),
        "nReplies: {}; nRetweets: {}; nUserMentions: {}.".format(
            self.nreplies, self.nretweets, self.nuser_mentions),
        "\nnTokens: {}; mTokens: {}; dTokens: {};".format(
            self.totaltokens, self.mtokenstweets, self.dtokenstweets),
        "\nnChars: {}; mChars: {}; dChars: {}.".format(
            self.totalchars, self.mcharstweets, self.dcharstweets),
        "\nnHashtags: {}; nMedia: {}; nLinks: {}.".format(
            self.nhashtags, self.nmedia, self.nlinks),
    ])

    provenance = (
        (po.triplifiedIn, datetime.datetime.now()),
        (po.triplifiedBy, "scripts/"),
        (po.donatedBy, self.snapshotid[:-4]),
        (po.availableAt, self.online_prefix),
        (po.onlineMetaXMLFile, self.online_prefix + self.mrdf),
        (po.onlineMetaTTLFile, self.online_prefix + self.mttl),
        (po.metaXMLFileName, self.mrdf),
        (po.metaTTLFileName, self.mttl),
        (po.totalXMLFileSizeMB, sum(self.size_rdf)),
        (po.totalTTLFileSizeMB, sum(self.size_ttl)),
        (po.acquiredThrough, "Twitter APIs"),
        (po.socialProtocolTag, "Twitter"),
        (po.socialProtocol, P.rdf.ic(po.Platform, "Twitter", meta, snapshot)),
        (po.nTriples, self.ntriples),
        (NS.rdfs.comment, self.desc),
    )
    P.add([(snapshot, predicate, value) for predicate, value in provenance],
          meta)
def rdfMbox(self):
    """Translate the mbox files of the snapshot into triples.

    For each file in ``self.files`` the first message of the mailbox is read
    and described in ``self.translation_graph``: id, author, subject, reply
    target, creation date, references, raw and cleaned text with size
    statistics, URLs, content type, organization, cc / to peers and the
    mailing list.  Files whose mailbox is empty are only counted in
    ``self.nempty``.

    Counters and accumulators updated on ``self``: ``nempty``, ``nmessages``,
    ``nparticipants``, ``nreplies``, ``nlost_messages``, ``dates``,
    ``nreferences``, ``nchars_all``, ``ntokens_all``, ``nsentences_all``,
    ``nremoved_lines``, ``nlines``, ``nchars_clean_all``,
    ``ntokens_clean_all``, ``nsentences_clean_all``, ``nurls``, ``ncc`` and
    ``nto``.

    Raises:
        ValueError: for a message without (usable) id, without author,
            without content type, with a ``Date`` header of an unexpected
            type or with a ``list-id`` of an unexpected format.
    """
    for filecount, file_ in enumerate(self.files):
        if filecount % 100 == 0:
            c(self.snapshoturi, filecount)
        mbox = mailbox.mbox(self.data_path + self.directory + "/" + file_)
        try:
            # The message object is fully loaded by the lookup, so the
            # mailbox can be closed right away -- also when parsing below
            # raises, which used to leave the file open.
            message = mbox[0] if mbox.keys() else None
        finally:
            mbox.close()
        if message is None:
            self.nempty += 1
            continue
        if not message["Message-Id"]:
            raise ValueError(
                "What to do with nonempy messages without id?")
        gmaneid = self.makeId(message["Message-Id"])
        if not gmaneid:
            raise ValueError("Message without id")
        messageuri = P.rdf.ic(po.EmailMessage, gmaneid,
                              self.translation_graph, self.snapshoturi)
        self.nmessages += 1
        triples = [
            (messageuri, po.gmaneID, gmaneid),
        ]

        # --- author ---
        address, name = self.parseParticipant(message["From"])
        if not address:
            raise ValueError("message without author")
        participanturi = P.rdf.ic(po.GmaneParticipant, address,
                                  self.translation_graph, self.snapshoturi)
        # A participant without e-mail triple has not been seen before.
        if not P.get(participanturi, po.emailAddress, None,
                     self.translation_graph):
            self.nparticipants += 1
        triples += [
            (messageuri, po.author, participanturi),
            (participanturi, po.emailAddress, address),
        ]
        if name:
            triples.append((participanturi, po.name, name))

        # --- subject ---
        subject = message["Subject"]
        if subject:
            subject = decodeHeader(subject)
            assert isinstance(subject, str)
            triples.append((messageuri, po.subject, subject))

        # --- reply target ---
        replyid_ = message["In-Reply-To"]
        saneid = self.makeId(replyid_)
        if replyid_ and not saneid:
            # There is a reply header but no id could be extracted from it:
            # link to a placeholder "lost" message.
            self.nreplies += 1
            replyid = self.snapshotid + "-" + str(self.nlost_messages)
            self.nlost_messages += 1
            replymessageuri = P.rdf.ic(po.LostEmailMessage, replyid,
                                       self.translation_graph,
                                       self.snapshoturi)
            triples += [
                (replymessageuri, a, po.EmailMessage),
                (replymessageuri, NS.rdfs.comment,
                 "This message registered as having a reply, but the field might be ill-formed: "
                 + replyid_),
                (messageuri, po.replyTo, replymessageuri),
            ]
        elif saneid:
            self.nreplies += 1
            replymessageuri = P.rdf.ic(po.EmailMessage, saneid,
                                       self.translation_graph,
                                       self.snapshoturi)
            triples += [
                (replymessageuri, po.gmaneID, saneid),
                (messageuri, po.replyTo, replymessageuri),
            ]

        # --- creation date ---
        raw_date = message["Date"]
        if isinstance(raw_date, str):
            created_at = parseDate(raw_date)
        elif isinstance(raw_date, mailbox.email.header.Header):
            datetimestring = decodeHeader(raw_date)
            if not all(ch in string.printable for ch in datetimestring):
                created_at = None
                triples.append((messageuri, po.lostCreatedAt, True))
            else:
                # Keep everything up to the hh:mm:ss part.
                datetime_ = re.findall(r"(.*\d\d:\d\d:\d\d).*",
                                       datetimestring)[0]
                created_at = parseDate(datetime_)
        else:
            raise ValueError("datetime not understood")
        if created_at:
            self.dates += [created_at]
            triples.append((messageuri, po.createdAt, created_at))

        # --- references ---
        raw_references = message["References"]
        if raw_references:
            references = raw_references.replace("\n", "").replace(
                "\t", "").replace(" ", "")
            if not re.findall(r"\A<(.*?)>\Z", references):
                c("::: ::: ::: references field not understood",
                  raw_references)
                triples += [
                    (messageuri, po.comment,
                     "the references are not understood (<.*> ids are added anyway): "
                     + raw_references),
                    (messageuri, po.referencesLost, True),
                ]
            for reference in re.findall(r"<(.*?)>", references):
                self.nreferences += 1
                referenceuri = P.rdf.ic(po.EmailMessage, reference,
                                        self.translation_graph,
                                        self.snapshoturi)
                triples += [
                    (referenceuri, po.gmaneID, reference),
                    (messageuri, po.hasReference, referenceuri),
                ]
            # Ids that look like e-mail addresses (whitespace separated).
            for part in raw_references.replace("\n", "").replace(
                    "\t", "").split():
                if validate_email(part):
                    self.nreferences += 1
                    referenceuri = P.rdf.ic(po.EmailMessage, part,
                                            self.translation_graph,
                                            self.snapshoturi)
                    triples += [
                        # The id is `part`; the loop variable of the
                        # previous loop used to be stored here by mistake.
                        (referenceuri, po.gmaneID, part),
                        (messageuri, po.hasReference, referenceuri),
                    ]

        # --- text, raw and cleaned ---
        text = getText(message)
        if text:
            nchars = len(text)
            ntokens = len(k.wordpunct_tokenize(text))
            nsentences = len(k.sent_tokenize(text))
            triples += [
                (messageuri, po.messageText, text),
                (messageuri, po.nChars, nchars),
                (messageuri, po.nTokens, ntokens),
                (messageuri, po.nSentences, nsentences),
            ]
            self.nchars_all += [nchars]
            self.ntokens_all += [ntokens]
            self.nsentences_all += [nsentences]

            clean_text = cleanEmailBody(text)
            self.nremoved_lines += text.count("\n") - clean_text.count("\n")
            self.nlines += text.count("\n")
            nchars_clean = len(clean_text)
            ntokens_clean = len(k.wordpunct_tokenize(clean_text))
            nsentences_clean = len(k.sent_tokenize(clean_text))
            triples += [
                (messageuri, po.messageTextClean, clean_text),
                (messageuri, po.nCharsClean, nchars_clean),
                (messageuri, po.nTokensClean, ntokens_clean),
                (messageuri, po.nSentencesClean, nsentences_clean),
            ]
            self.nchars_clean_all += [nchars_clean]
            self.ntokens_clean_all += [ntokens_clean]
            self.nsentences_clean_all += [nsentences_clean]

            for url in re.findall(
                    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                    clean_text):
                self.nurls += 1
                triples.append((messageuri, po.hasUrl, url))

        # --- content type and organization ---
        content_type = message.get_content_type()
        if content_type:
            triples.append((messageuri, po.contentType, content_type))
        else:
            # Raw string: same text as before, without invalid escapes.
            raise ValueError(r"/\/\/\/\/\ message without content type")
        organization = message["Organization"]
        if organization:
            if not isinstance(organization, str):
                organization = "".join(
                    i for i in str(organization) if i in string.printable)
            triples.append((messageuri, po.organization, organization))

        # --- cc and to peers (same treatment for both headers) ---
        for header, link, unparsed_link, counter in (
                ("cc", po.cc, po.unparsedCC, "ncc"),
                ("to", po.to, po.unparsedTo, "nto")):
            raw_peers = message[header]
            if not raw_peers:
                continue
            peers, unparsed = parseAddresses(raw_peers)
            if unparsed:
                triples.append((messageuri, unparsed_link, unparsed))
            for peeraddress, peername in peers:
                peeraddress = peeraddress.strip()
                assert bool(peeraddress)
                peeruri = P.rdf.ic(po.EmailPeer, peeraddress,
                                   self.translation_graph, self.snapshoturi)
                triples += [
                    (messageuri, link, peeruri),
                    (peeruri, po.emailAddress, peeraddress),
                ]
                setattr(self, counter, getattr(self, counter) + 1)
                if peername:
                    triples.append((peeruri, po.name, peername.strip()))

        # --- mailing list ---
        listid = message["list-id"]
        if listid:
            assert isinstance(listid, str)
            listid = listid.replace("\n", "").replace("\t", "")
            n_open = listid.count("<")
            n_close = listid.count(">")
            if n_open == n_close == listid.count(" ") == 0:
                # A bare id.
                listname = ""
                listid_ = listid
            elif n_open == n_close == 0:
                # Words without brackets: the (first) longest word is taken
                # as the id, words of any other length form the name.
                parts = listid.split()
                listid_ = max(parts, key=len)
                listname = " ".join(
                    i for i in parts if len(i) != len(listid_))
            elif n_open == n_close == 1:
                listname, listid_ = re.findall(r"(.*) {0,1}<(.*)>",
                                               listid)[0]
            else:
                raise ValueError("Unexpected listid string format")
            listuri = P.rdf.ic(po.EmailList, listid_,
                               self.translation_graph, self.snapshoturi)
            triples += [
                (messageuri, po.emailList, listuri),
                (listuri, po.listID, listid_),
            ]
            if listname:
                triples.append((listuri, po.name, listname.strip()))

        P.add(triples, self.translation_graph)