Example #1
0
 def makePostsTriples(self):
     """Compute overall text-size statistics and add them to the meta graph.

     Returns immediately when ``self.hastext`` is falsy.  Otherwise the sum,
     mean and standard deviation of the per-message character, token and
     sentence sizes are stored on the instance, the translation graph is
     queried for the number of messages, participants and URLs, and the
     snapshot-level triples are added to ``self.meta_graph``.
     """
     if not self.hastext:
         return

     # Character statistics.
     char_sizes = self.size_chars_overall
     self.totalchars = sum(char_sizes)
     self.mchars_messages = n.mean(char_sizes)
     self.dchars_messages = n.std(char_sizes)

     # Token statistics.
     token_sizes = self.size_tokens_overall
     self.totaltokens = sum(token_sizes)
     self.mtokens_messages = n.mean(token_sizes)
     self.dtokens_messages = n.std(token_sizes)

     # Sentence statistics.
     sentence_sizes = self.size_sentences_overall
     self.totalsentences = sum(sentence_sizes)
     self.msentences_messages = n.mean(sentence_sizes)
     self.dsentences_messages = n.std(sentence_sizes)

     # Counts taken from the translation graph.
     graph = self.translation_graph
     self.nmessages = P.get(
         "SELECT (COUNT(?s) as ?s) WHERE { ?s a po:Message }",
         context=graph)
     self.nparticipants = P.get(
         "SELECT (COUNT(?s) as ?s) WHERE { ?s a po:Participant }",
         context=graph)
     # The URL count is only kept on the instance; no triple is written.
     self.nurls = P.get(
         "SELECT (COUNT(?s) as ?s) WHERE { ?s po:hasUrl ?o }",
         context=graph)

     snapshot = self.snapshoturi
     facts = (
         (po.nParticipants, self.nparticipants),
         (po.nMessages, self.nmessages),
         (po.nCharsOverall, self.totalchars),
         (po.mCharsOverall, self.mchars_messages),
         (po.dCharsOverall, self.dchars_messages),
         (po.nTokensOverall, self.totaltokens),
         (po.mTokensOverall, self.mtokens_messages),
         (po.dTokensOverall, self.dtokens_messages),
         (po.nSentencesOverall, self.totalsentences),
         (po.mSentencesOverall, self.msentences_messages),
         (po.dSentencesOverall, self.dsentences_messages),
     )
     P.add([(snapshot, predicate, value) for predicate, value in facts],
           context=self.meta_graph)
Example #2
0
def publishAny(snapshoturi):
    """Pick the publishing class that matches a facebook snapshot's file format.

    Queries the file format and name of the friendship raw file together
    with the snapshot id, then the interaction and post file names from the
    ``social_facebook_inferred`` context.

    Returns a ``GdfRdfPublishing`` when the format contains "gdf", a
    ``GmlRdfPublishing`` when it equals "gml", and ``None`` otherwise.
    """
    friendship_query = [
        (snapshoturi, po.rawFile, "?fileurifoo"),
        (snapshoturi, po.snapshotID, "?snapshotid"),
        ("?fileurifoo", po.expressedClass, po.Friendship),
        ("?fileurifoo", po.fileFormat, "?fileformat"),
        ("?fileurifoo", po.fileName, "?filename"),
    ]
    fileformat, friendship_filename, snapshotid = P.get(friendship_query)

    def filename_for(expressed_class):
        # File name of the raw file that expresses the given class.
        pattern = [
            (snapshoturi, NS.po.rawFile, "?fileurifoo"),
            ("?fileurifoo", po.expressedClass, expressed_class),
            ("?fileurifoo", NS.po.fileName, "?filename"),
        ]
        return P.get(pattern, context=social_facebook_inferred)

    interaction_filename = filename_for(po.Interaction)
    posts_filename = filename_for(po.Post)

    c(fileformat)
    if "gdf" in fileformat:
        c("publish gdf", snapshoturi)
        return GdfRdfPublishing(snapshoturi, snapshotid, friendship_filename,
                                interaction_filename, posts_filename)
    if fileformat == "gml":
        c("publish gml", snapshoturi)
        return GmlRdfPublishing(snapshoturi, snapshotid, friendship_filename)
    return None
Example #3
0
 def replyTriples(self,tweet,tweeturi):
     """Build the triples linking a tweet to the tweet and user it replies to.

     Returns an empty list when the tweet has neither
     ``in_reply_to_user_id_str`` nor ``in_reply_to_status_id_str``.
     Otherwise ``self.nreplies`` is incremented and a participant and a
     tweet resource are created for the reply target.  A missing user id
     yields a numbered "anonymous" participant and a missing status id a
     numbered "noidmsg" tweet.  ``self.nparticipants`` / ``self.ntweets``
     are incremented when the target has no ``po.numericID`` stored yet.
     """
     reply_user=tweet["in_reply_to_user_id_str"]
     reply_status=tweet["in_reply_to_status_id_str"]
     triples=[]
     if not (reply_user or reply_status):
         return triples
     self.nreplies+=1

     # The user branch has to test the *user* id.  It used to test the
     # status id, so a reply to a known user without a status id was filed
     # as anonymous, and a status id without a user id raised a TypeError
     # on the string concatenation.
     if reply_user:
         userid_reply=self.snapshotid+"-"+reply_user
         useruri_reply=P.rdf.ic(po.Participant,userid_reply,self.tweet_graph,self.snapshoturi)
         if not P.get(useruri_reply,po.numericID,None): # new user
             self.nparticipants+=1
             triples.append((useruri_reply,po.numericID,userid_reply))
     else:
         userid_reply=self.snapshotid+"-anonymous-"+str(self.anonymous_user_count)
         useruri_reply=P.rdf.ic(po.Participant,userid_reply,self.tweet_graph,self.snapshoturi)
         self.anonymous_user_count+=1
         triples.append((useruri_reply,po.anonymous,True))

     if reply_status:
         tweetid_reply=userid_reply+"-"+reply_status
         tweeturi_reply=P.rdf.ic(po.Tweet,tweetid_reply,self.tweet_graph,self.snapshoturi)
         if not P.get(tweeturi_reply,po.numericID,None): # new message
             self.ntweets+=1
             triples.append((tweeturi_reply,po.numericID,tweetid_reply))
     else:
         tweetid_reply=self.snapshotid+"-noidmsg-"+str(self.anonymous_tweet_count)
         tweeturi_reply=P.rdf.ic(po.Tweet,tweetid_reply,self.tweet_graph,self.snapshoturi)
         self.anonymous_tweet_count+=1
         triples.append((tweeturi_reply,po.noid,True))

     triples+=[
              (tweeturi,po.inReplyToTweet,tweeturi_reply),
              (tweeturi_reply,po.author,useruri_reply),
              ]
     return triples
Example #4
0
 def countNew(self, tweetid, userid):
     """Increment the tweet / participant counters for ids not yet stored.

     A tweet is looked up by ``po.stringID`` and a participant by
     ``po.numericID``; each counter grows by one when its lookup comes
     back empty.
     """
     tweet_pattern = [
         ("?uri", a, po.Tweet),
         ("?uri", po.stringID, tweetid),
     ]
     participant_pattern = [
         ("?uri", a, po.Participant),
         ("?uri", po.numericID, userid),
     ]
     # Both lookups run before any counter is touched.
     is_new_tweet = not P.get(tweet_pattern)
     is_new_participant = not P.get(participant_pattern)
     if is_new_tweet:
         self.ntweets += 1
     if is_new_participant:
         self.nparticipants += 1
Example #5
0
def publishAll(snapshoturis=None):
    """express irc logs as RDF for publishing

    When no snapshot URIs are given, every ``po.IRCSnapshot`` is looked up
    and the snapshots are processed in ascending order of the summed size
    of their raw files.

    Returns whatever ``publishAny`` returned for the last snapshot, or
    ``None`` when there was nothing to publish.
    """
    if not snapshoturis:
        c("getting irc snapshots, implementation needs verification TTM")
        uridict={}
        for snapshoturi in P.get(None,a,NS.po.IRCSnapshot,minimized=True):
            uridict[snapshoturi]=sum(
                P.get(rawFile,NS.po.fileSize,minimized=True).toPython()
                for rawFile in P.get(snapshoturi,NS.po.rawFile,strict=True,minimized=True))
        # The discovered URIs were never assigned, so the old code called
        # .sort() on None (or on the empty argument) at this point.
        snapshoturis=sorted(uridict,key=uridict.get)
    # Bound up front so an empty list returns None instead of raising.
    triplification_class=None
    for snapshoturi in snapshoturis:
        triplification_class=publishAny(snapshoturi)
    return triplification_class
Example #6
0
    def makeMetadata(self):
        """Write the snapshot's metadata triples to ``self.meta_graph``.

        Copies the triples about the snapshot and its raw files from the
        "social_facebook" context, then adds the friendship file names and
        counts, the friendship participant attributes, the meta file names,
        a textual description and provenance information.  File names and
        the description are also kept on the instance (``ffile``, ``frdf``,
        ``fttl``, ``mrdf``, ``mttl``, ``desc``).
        """
        snapshot=self.snapshoturi

        # Carry over what is already known about the snapshot and its files.
        known=P.get(snapshot,None,None,"social_facebook")
        for rawfile in P.get(snapshot,po.rawFile,None,"social_facebook",strict=True,minimized=True):
            known+=P.get(rawfile,None,None,"social_facebook")
        P.add(known,context=self.meta_graph)

        # Friendship files and counts.
        self.ffile="base/"+self.filename_friendships
        self.frdf=self.snapshotid+"Friendship.rdf"
        self.fttl=self.snapshotid+"Friendship.ttl"
        friendship_facts=(
            (po.onlineOriginalFriendshipFile,self.online_prefix+self.ffile),
            (po.originalFriendshipFileName,self.ffile),
            (po.onlineFriendshipXMLFile,self.online_prefix+self.frdf),
            (po.onlineFriendshipTTLFile,self.online_prefix+self.fttl),
            (po.friendshipXMLFileName,self.frdf),
            (po.friendshipTTLFileName,self.fttl),
            (po.nFriends,self.nfriends),
            (po.nFriendships,self.nfriendships),
            (po.friendshipsAnonymized,self.friendships_anonymized),
        )
        P.add([(snapshot,predicate,value) for predicate,value in friendship_facts],
              context=self.meta_graph)
        # One attribute predicate per friend variable.
        attribute_predicates=[po.frienshipParticipantAttribute]*len(self.friendsvars)
        P.rdf.triplesScaffolding(snapshot,attribute_predicates,
                self.friendsvars,context=self.meta_graph)

        # Meta files and human-readable description.
        self.mrdf=self.snapshotid+"Meta.rdf"
        self.mttl=self.snapshotid+"Meta.ttl"
        description_parts=[
            "facebook network with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
                self.snapshotid,self.snapshoturi,self.isego,self.isgroup),
            "\nisFriendship: {}".format(self.isfriendship),
            "; nFriends: {}; nFrienships: {}.".format(self.nfriends,self.nfriendships),
            "\nisInteraction: {}".format(self.isinteraction),
            "\nisPost: {} (alias hasText: {})".format(self.hastext,self.hastext),
        ]
        self.desc="".join(description_parts)

        # Provenance.
        provenance_facts=(
            (po.triplifiedIn,datetime.datetime.now()),
            (po.triplifiedBy,"scripts/"),
            (po.donatedBy,self.snapshotid[:-4]),
            (po.availableAt,self.online_prefix),
            (po.onlineMetaXMLFile,self.online_prefix+self.mrdf),
            (po.onlineMetaTTLFile,self.online_prefix+self.mttl),
            (po.metaXMLFileName,self.mrdf),
            (po.metaTTLFileName,self.mttl),
            (po.acquiredThrough,"Netvizz"),
            (po.socialProtocolTag,"Facebook"),
            (po.socialProtocol,P.rdf.ic(po.Platform,"Facebook",self.meta_graph,self.snapshoturi)),
            (NS.rdfs.comment,self.desc),
        )
        P.add([(snapshot,predicate,value) for predicate,value in provenance_facts],
              self.meta_graph)
Example #7
0
def publishAny(snapshoturi):
    """Create the pickle publisher for a snapshot, if it has raw files.

    Fetches the sorted file names of the snapshot's raw files and the
    snapshot id.  Returns a ``PicklePublishing`` built from them, or
    ``None`` when no file name was found.
    """
    filename_query=[
        (snapshoturi,"?fileurifoo"[:0] or po.rawFile,"?fileurifoo"),
        ("?fileurifoo",po.fileName,"?filename"),
    ] if False else [
        (snapshoturi,po.rawFile,"?fileurifoo"),
        ("?fileurifoo",po.fileName,"?filename"),
    ]
    filenames=P.get(filename_query,join_queries="list",strict=True)
    filenames.sort()
    snapshotid=P.get([(snapshoturi,po.snapshotID,"?snapshotid")])
    if not filenames:
        return None
    return PicklePublishing(snapshoturi,snapshotid,filenames)
Example #8
0
def publishAll(snapshoturis=None):
    """express tweets as RDF for publishing

    When no snapshot URIs are given, every ``po.TwitterSnapshot`` is looked
    up, only URIs ending in ".gml" are kept, and they are processed in
    ascending order of the summed size of their raw files.

    Returns whatever ``publishAny`` returned for the last snapshot, or
    ``None`` when there was nothing to publish.
    """
    if not snapshoturis:
        c("getting twitter snapshots, implementation needs verification TTM")
        uridict={}
        for snapshoturi in P.get(None,a,NS.po.TwitterSnapshot,minimized=True):
            uridict[snapshoturi]=sum(
                P.get(rawFile,NS.po.fileSize,minimized=True).toPython()
                for rawFile in P.get(snapshoturi,NS.po.rawFile,strict=True,minimized=True))
        # NOTE(review): confirm the ".gml" suffix filter is intended for
        # twitter snapshot URIs.
        snapshoturis=sorted((uri for uri in uridict if uri.endswith(".gml")),
                            key=uridict.get)
    # Bound up front so an empty list returns None instead of raising
    # UnboundLocalError at the return statement.
    triplification_class=None
    for snapshoturi in snapshoturis:
        triplification_class=publishAny(snapshoturi)
    return triplification_class
Example #9
0
 def countNew(self,tweetid,userid):
     """Count a tweet and a participant as new when the store lacks them.

     The tweet is searched by ``po.stringID`` and the participant by
     ``po.numericID``; ``self.ntweets`` / ``self.nparticipants`` are
     incremented for each search that finds nothing.
     """
     known_tweet=P.get([("?uri",a,po.Tweet),("?uri",po.stringID,tweetid)])
     known_participant=P.get([("?uri",a,po.Participant),("?uri",po.numericID,userid)])
     if not known_tweet:
         self.ntweets+=1
     if not known_participant:
         self.nparticipants+=1
Example #10
0
def publishAny(snapshoturi):
    """Build a ``PicklePublishing`` for the snapshot's raw files.

    The file names of all raw files are fetched as a list and sorted, and
    the snapshot id is fetched separately.  ``None`` is returned when the
    snapshot has no file names.
    """
    files_pattern = [
        (snapshoturi, po.rawFile, "?fileurifoo"),
        ("?fileurifoo", po.fileName, "?filename"),
    ]
    id_pattern = [(snapshoturi, po.snapshotID, "?snapshotid")]

    filenames = P.get(files_pattern, join_queries="list", strict=True)
    filenames.sort()
    snapshotid = P.get(id_pattern)
    return (PicklePublishing(snapshoturi, snapshotid, filenames)
            if filenames else None)
Example #11
0
def startSession(context="session"):
    """Open a new session and record it, with its user, in `context`.

    The current user is looked up through ``NS.per.currentUser``.  When
    there is none, a default participant with a random nick is registered
    and the caller is asked (through ``c``) to create a proper user.  The
    new session and current-status triples are written to `context`; the
    minimum test ontology, the minimal test data and RDFS inference are
    loaded afterwards.
    """
    current_user_uri=P.get(NS.per.currentUser) # from rdf.rdflib OK
    now=datetime.now()
    known_nick=None
    if current_user_uri:
        # The session URI below is built from a nick.  It was left unbound
        # when a current user already existed (UnboundLocalError), so the
        # stored nick is read here, before the "session" context is removed.
        # NOTE(review): assumes P.get(subject, predicate, minimized=True)
        # returns the single stored value or something falsy - confirm.
        known_nick=P.get(current_user_uri,NS.per.nick,minimized=True)
    # NOTE(review): the literal "session" is used here rather than the
    # `context` argument - confirm that is intended.
    P.context("session","remove")
    if current_user_uri:
        # Fall back to a random nick only to label the session URI.
        nick=known_nick or randomNick()
        triples=[]
    else:
        nick=randomNick() # OK
        current_user_uri=P.rdf.timestampedURI(NS.per.Participant,nick,now) # rdf.rdflib OK
        triples=[
                (current_user_uri, a, NS.per.DefaultParticipant),
                (current_user_uri, NS.per.nick, nick),
                (current_user_uri, NS.per.registered, now),
                ]
        c("Please create a user with P.utils.createUser() ASAP. Registered for now as {} with URI: {}".format(nick,current_user_uri))
    session_uri=P.rdf.timestampedURI(NS.per.Session,nick,now) # from rdf.rdflib OK
    current_status_uri=NS.per.CurrentStatus # class in per: ontology OK
    triples+=[
             (current_status_uri,NS.per.currentSession,session_uri),
             (session_uri,NS.per.started,now),
             (session_uri,NS.per.user,current_user_uri),
             (current_status_uri,NS.per.currentUser,current_user_uri),
             ]
    P.set_(triples,context=context) # from rdf.rdflib OK
    P.rdf.ontology.minimumTestOntology() # from rdf.ontology
    P.legacy.triples.datasets.minimalTestData() # from legacy.triples
    P.rdf.inference.performRdfsInference("void","minimum_ontology","session_legacy_metadata") # from rdf.inference
Example #12
0
File: render.py Project: ttm/gmane
def publishAll(snapshoturis=None):
    """express emails as RDF for publishing

    When no snapshot URIs are given, every ``po.Snapshot`` is looked up and
    the snapshots are ordered by the summed ``po.directorySize`` of their
    raw files, smallest first.  At most the first ten snapshots are
    published.

    Returns the list of objects returned by ``publishAny``.
    """
    if not snapshoturis:
        c("getting email snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.Snapshot, minimized=True):
            uridict[snapshoturi] = sum(
                P.get(rawFile, NS.po.directorySize, minimized=True).toPython()
                for rawFile in P.get(snapshoturi, NS.po.rawFile,
                                     strict=True, minimized=True))
        # The discovered URIs were never assigned, so the old code called
        # .sort() on None (or on the empty argument) at this point.
        snapshoturis = sorted(uridict, key=uridict.get)
    c("on triplification")
    return [publishAny(snapshoturi) for snapshoturi in list(snapshoturis)[:10]]
Example #13
0
def publishAny(snapshoturi):
    """Create the ``LogPublishing`` object for a snapshot.

    A single query retrieves the raw file's name and the snapshot id, which
    are handed to ``LogPublishing`` together with the snapshot URI.
    """
    pattern = [
        (snapshoturi, po.rawFile, "?fileurifoo"),
        ("?fileurifoo", po.fileName, "?filename"),
        (snapshoturi, po.snapshotID, "?snapshotid"),
    ]
    found = P.get(pattern)
    filename, snapshotid = found
    publisher = LogPublishing(snapshoturi, snapshotid, filename)
    return publisher
Example #14
0
def publishAll(snapshoturis=None):
    """express irc logs as RDF for publishing

    When no snapshot URIs are given, every ``po.IRCSnapshot`` is looked up
    and the snapshots are processed in ascending order of the summed size
    of their raw files.

    Returns whatever ``publishAny`` returned for the last snapshot, or
    ``None`` when there was nothing to publish.
    """
    if not snapshoturis:
        c("getting irc snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.IRCSnapshot, minimized=True):
            uridict[snapshoturi] = sum(
                P.get(rawFile, NS.po.fileSize, minimized=True).toPython()
                for rawFile in P.get(snapshoturi, NS.po.rawFile,
                                     strict=True, minimized=True))
        # The discovered URIs were never assigned, so the old code called
        # .sort() on None (or on the empty argument) at this point.
        snapshoturis = sorted(uridict, key=uridict.get)
    # Bound up front so an empty list returns None instead of raising.
    triplification_class = None
    for snapshoturi in snapshoturis:
        triplification_class = publishAny(snapshoturi)
    return triplification_class
Example #15
0
def publishAny(snapshoturi):
    """Create the ``MboxPublishing`` object for a snapshot.

    One query retrieves the data directory, the raw directory's name and
    the snapshot id; they are passed on together with the snapshot URI.
    """
    pattern = [
        (snapshoturi, po.dataDir, "?datadir"),
        (snapshoturi, po.snapshotID, "?snapshotid"),
        (snapshoturi, po.rawDirectory, "?directoryurifoo"),
        ("?directoryurifoo", po.directoryName, "?directoryname"),
    ]
    found = P.get(pattern)
    data_dir, directory, snapshotid = found
    publisher = MboxPublishing(snapshoturi, snapshotid, directory, data_dir)
    return publisher
Example #16
0
def publishAny(snapshoturi):
    """Return a ``LogPublishing`` for the snapshot's raw log file.

    The raw file's name and the snapshot id come from a single query.
    """
    query=[
          (snapshoturi,po.rawFile,"?fileurifoo"),
          ("?fileurifoo",po.fileName,"?filename"),
          (snapshoturi,po.snapshotID,"?snapshotid"),
          ]
    result=P.get(query)
    filename,snapshotid=result
    return LogPublishing(snapshoturi,snapshotid,filename)
Example #17
0
File: render.py Project: ttm/gmane
def publishAny(snapshoturi):
    """Return a ``MboxPublishing`` for the snapshot's mbox directory.

    The data directory, the raw directory's name and the snapshot id come
    from a single query.
    """
    query = [
        (snapshoturi, po.dataDir, "?datadir"),
        (snapshoturi, po.snapshotID, "?snapshotid"),
        (snapshoturi, po.rawDirectory, "?directoryurifoo"),
        ("?directoryurifoo", po.directoryName, "?directoryname"),
    ]
    result = P.get(query)
    data_dir, directory, snapshotid = result
    return MboxPublishing(snapshoturi, snapshotid, directory, data_dir)
Example #18
0
 def replyTriples(self, tweet, tweeturi):
     """Build the triples linking a tweet to the tweet and user it replies to.

     Returns an empty list when the tweet has neither
     ``in_reply_to_user_id_str`` nor ``in_reply_to_status_id_str``.
     Otherwise ``self.nreplies`` is incremented and a participant and a
     tweet resource are created for the reply target.  A missing user id
     yields a numbered "anonymous" participant and a missing status id a
     numbered "noidmsg" tweet.  ``self.nparticipants`` / ``self.ntweets``
     are incremented when the target has no ``po.numericID`` stored yet.
     """
     reply_user = tweet["in_reply_to_user_id_str"]
     reply_status = tweet["in_reply_to_status_id_str"]
     triples = []
     if not (reply_user or reply_status):
         return triples
     self.nreplies += 1

     # The user branch has to test the *user* id.  It used to test the
     # status id, so a reply to a known user without a status id was filed
     # as anonymous, and a status id without a user id raised a TypeError
     # on the string concatenation.
     if reply_user:
         userid_reply = self.snapshotid + "-" + reply_user
         useruri_reply = P.rdf.ic(po.Participant, userid_reply,
                                  self.tweet_graph, self.snapshoturi)
         if not P.get(useruri_reply, po.numericID, None):  # new user
             self.nparticipants += 1
             triples.append((useruri_reply, po.numericID, userid_reply))
     else:
         userid_reply = self.snapshotid + "-anonymous-" + str(
             self.anonymous_user_count)
         useruri_reply = P.rdf.ic(po.Participant, userid_reply,
                                  self.tweet_graph, self.snapshoturi)
         self.anonymous_user_count += 1
         triples.append((useruri_reply, po.anonymous, True))

     if reply_status:
         tweetid_reply = userid_reply + "-" + reply_status
         tweeturi_reply = P.rdf.ic(po.Tweet, tweetid_reply,
                                   self.tweet_graph, self.snapshoturi)
         if not P.get(tweeturi_reply, po.numericID, None):  # new message
             self.ntweets += 1
             triples.append((tweeturi_reply, po.numericID, tweetid_reply))
     else:
         tweetid_reply = self.snapshotid + "-noidmsg-" + str(
             self.anonymous_tweet_count)
         tweeturi_reply = P.rdf.ic(po.Tweet, tweetid_reply,
                                   self.tweet_graph, self.snapshoturi)
         self.anonymous_tweet_count += 1
         triples.append((tweeturi_reply, po.noid, True))

     triples += [
         (tweeturi, po.inReplyToTweet, tweeturi_reply),
         (tweeturi_reply, po.author, useruri_reply),
     ]
     return triples
Example #19
0
def publishAll(snapshoturis=None):
    """express emails as RDF for publishing

    When no snapshot URIs are given, every ``po.GmaneSnapshot`` is looked up
    and the snapshots are ordered by the summed ``po.directorySize`` of
    their raw files, smallest first.  At most the first ten snapshots are
    published.

    Returns the list of objects returned by ``publishAny``.
    """
    if not snapshoturis:
        c("getting email snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.GmaneSnapshot, minimized=True):
            uridict[snapshoturi] = sum(
                P.get(rawFile, NS.po.directorySize, minimized=True).toPython()
                for rawFile in P.get(snapshoturi, NS.po.rawFile,
                                     strict=True, minimized=True))
        # The discovered URIs were never assigned, so the old code called
        # .sort() on None (or on the empty argument) at this point.
        snapshoturis = sorted(uridict, key=uridict.get)
    c("on triplification")
    return [publishAny(snapshoturi) for snapshoturi in list(snapshoturis)[:10]]
Example #20
0
def publishAll(snapshoturis=None):
    """Publish facebook snapshots as RDF.

    When no snapshot URIs are given, every ``po.FacebookSnapshot`` is looked
    up, only URIs ending in ".gml" are kept, and they are processed in
    ascending order of the summed size of their raw files.

    Returns whatever ``publishAny`` returned for the last snapshot, or
    ``None`` when there was nothing to publish.
    """
    # RDFS inference over the facebook snapshots ontology used to be run
    # here; that step is currently disabled.
    if not snapshoturis:
        c("getting facebook snapshots, implementation needs verification TTM")
        uridict={}
        for snapshoturi in P.get(None,a,NS.po.FacebookSnapshot,minimized=True):
            uridict[snapshoturi]=sum(
                P.get(rawFile,NS.po.fileSize,minimized=True).toPython()
                for rawFile in P.get(snapshoturi,NS.po.rawFile,strict=True,minimized=True))
        snapshoturis=sorted((uri for uri in uridict if uri.endswith(".gml")),
                            key=uridict.get)
    c("snapuris:",snapshoturis)
    # Bound up front so an empty list returns None instead of raising
    # UnboundLocalError at the return statement.
    triplification_class=None
    for snapshoturi in snapshoturis:
        triplification_class=publishAny(snapshoturi)
    return triplification_class
Example #21
0
def publishAll(snapshoturis=None):
    """express tweets as RDF for publishing

    When no snapshot URIs are given, every ``po.TwitterSnapshot`` is looked
    up, only URIs ending in ".gml" are kept, and they are processed in
    ascending order of the summed size of their raw files.

    Returns whatever ``publishAny`` returned for the last snapshot, or
    ``None`` when there was nothing to publish.
    """
    if not snapshoturis:
        c("getting twitter snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.TwitterSnapshot,
                                 minimized=True):
            uridict[snapshoturi] = sum(
                P.get(rawFile, NS.po.fileSize, minimized=True).toPython()
                for rawFile in P.get(snapshoturi, NS.po.rawFile,
                                     strict=True, minimized=True))
        # NOTE(review): confirm the ".gml" suffix filter is intended for
        # twitter snapshot URIs.
        snapshoturis = sorted(
            (uri for uri in uridict if uri.endswith(".gml")),
            key=uridict.get)
    # Bound up front so an empty list returns None instead of raising
    # UnboundLocalError at the return statement.
    triplification_class = None
    for snapshoturi in snapshoturis:
        triplification_class = publishAny(snapshoturi)
    return triplification_class
Example #22
0
def publishAll(snapshoturis=None):
    """Publish facebook snapshots as RDF.

    When no snapshot URIs are given, every ``po.FacebookSnapshot`` is looked
    up, only URIs ending in ".gml" are kept, and they are processed in
    ascending order of the summed size of their raw files.

    Returns whatever ``publishAny`` returned for the last snapshot, or
    ``None`` when there was nothing to publish.
    """
    if not snapshoturis:
        c("getting facebook snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.FacebookSnapshot,
                                 minimized=True):
            uridict[snapshoturi] = sum(
                P.get(rawFile, NS.po.fileSize, minimized=True).toPython()
                for rawFile in P.get(snapshoturi, NS.po.rawFile,
                                     strict=True, minimized=True))
        snapshoturis = sorted(
            (uri for uri in uridict if uri.endswith(".gml")),
            key=uridict.get)
    c("snapuris:", snapshoturis)
    # Bound up front so an empty list returns None instead of raising
    # UnboundLocalError at the return statement.
    triplification_class = None
    for snapshoturi in snapshoturis:
        triplification_class = publishAny(snapshoturi)
    return triplification_class
Example #23
0
 def entityTriples(self, tweet, tweeturi):
     """Build the triples for a tweet's hashtags, mentions, links and media.

     Reads ``tweet["entities"]`` and returns a list of triples about
     `tweeturi`.  The counters ``nhashtags``, ``nuser_mentions``,
     ``nlinks`` and ``nmedia`` grow by one per entity, and
     ``nparticipants`` grows for each mentioned user that has no
     ``po.numericID`` stored yet.  The "media" key is optional.
     """
     entities = tweet["entities"]
     triples = []

     for hashtag in entities["hashtags"]:
         self.nhashtags += 1
         triples.append((tweeturi, po.hashtag, hashtag["text"]))

     for user_mention in entities["user_mentions"]:
         self.nuser_mentions += 1
         raw_id = user_mention["id_str"]
         name_mention = user_mention["name"]
         screen_name_mention = user_mention["screen_name"]
         userid_mention = self.snapshotid + "-" + raw_id
         useruri_mention = P.rdf.ic(po.Participant, userid_mention,
                                    self.tweet_graph, self.snapshoturi)
         triples += [
             (tweeturi, po.userMention, useruri_mention),
             (useruri_mention, po.name, name_mention),
             (useruri_mention, po.screenName, screen_name_mention),
             (useruri_mention, po.stringID, userid_mention),
         ]
         if not P.get(useruri_mention, po.numericID, None):  # new user
             self.nparticipants += 1
             triples.append((useruri_mention, po.numericID, userid_mention))

     # Only the expanded URL is recorded for each link.
     for link in entities["urls"]:
         self.nlinks += 1
         triples.append((tweeturi, po.expandedURL, link["expanded_url"]))

     for media in entities.get("media", ()):
         self.nmedia += 1
         # NOTE(review): this id is built from snapshoturi, while the other
         # ids in this method use snapshotid - confirm which is intended.
         mediaid = self.snapshoturi + "-" + str(self.nmedia)
         mediauri = P.rdf.ic(po.Media, mediaid, self.tweet_graph,
                             self.snapshoturi)
         triples += [
             (tweeturi, po.media, mediauri),
             (mediauri, po.type, media["type"]),
             (mediauri, po.expandedURL, media["expanded_url"]),
         ]
     return triples
Example #24
0
 def entityTriples(self,tweet,tweeturi):
     """Build the triples for a tweet's hashtags, mentions, links and media.

     Reads ``tweet["entities"]`` and returns a list of triples about
     `tweeturi`.  The counters ``nhashtags``, ``nuser_mentions``,
     ``nlinks`` and ``nmedia`` grow by one per entity, and
     ``nparticipants`` grows for each mentioned user that has no
     ``po.numericID`` stored yet.  The "media" key is optional.
     """
     entities=tweet["entities"]
     triples=[]

     for hashtag in entities["hashtags"]:
         self.nhashtags+=1
         triples.append((tweeturi,po.hashtag,hashtag["text"]))

     for user_mention in entities["user_mentions"]:
         self.nuser_mentions+=1
         raw_id=user_mention["id_str"]
         name_mention=user_mention["name"]
         screen_name_mention=user_mention["screen_name"]
         userid_mention=self.snapshotid+"-"+raw_id
         useruri_mention=P.rdf.ic(po.Participant,userid_mention,self.tweet_graph,self.snapshoturi)
         triples+=[
                 (tweeturi,po.userMention,useruri_mention),
                 (useruri_mention,po.name,name_mention),
                 (useruri_mention,po.screenName,screen_name_mention),
                 (useruri_mention,po.stringID,userid_mention),
                 ]
         if not P.get(useruri_mention,po.numericID,None): # new user
             self.nparticipants+=1
             triples.append((useruri_mention,po.numericID,userid_mention))

     # Only the expanded URL is recorded for each link.
     for link in entities["urls"]:
         self.nlinks+=1
         triples.append((tweeturi,po.expandedURL,link["expanded_url"]))

     for media in entities.get("media",()):
         self.nmedia+=1
         # NOTE(review): this id is built from snapshoturi, while the other
         # ids in this method use snapshotid - confirm which is intended.
         mediaid=self.snapshoturi+"-"+str(self.nmedia)
         mediauri=P.rdf.ic(po.Media,mediaid,self.tweet_graph,self.snapshoturi)
         triples+=[
                 (tweeturi,po.media,mediauri),
                 (mediauri,po.type,media["type"]),
                 (mediauri,po.expandedURL,media["expanded_url"]),
                 ]
     return triples
Example #25
0
    def rdfMbox(self):
        for filecount,file_ in enumerate(self.files):
            if filecount%100==0:
                c(self.snapshoturi,filecount)
            mbox = mailbox.mbox(self.data_path+self.directory+"/"+file_)
            if not mbox.keys():
                self.nempty+=1
                mbox.close()
#                c("||||||||||| EMPTY MESSAGE |||||||||||||||||||||",self.snapshotid,file_,"(",filecount,")")
                continue
            if not mbox[0]["Message-Id"]:
                raise ValueError("What to do with nonempy messages without id?")
            message=mbox[0]
            gmaneid=self.makeId(message["Message-Id"])
            #c("gmaneid",gmaneid)
            if not gmaneid:
                raise ValueError("Message without id")
            messageuri=P.rdf.ic(po.EmailMessage,gmaneid,self.translation_graph,self.snapshoturi)
            self.nmessages+=1
            triples=[
                     (messageuri,po.gmaneID,gmaneid),
                     ]
            email,name=self.parseParticipant(message["From"])
            if not email:
                raise ValueError("message without author")
            participanturi=P.rdf.ic(po.GmaneParticipant,email,self.translation_graph,self.snapshoturi)
            if not P.get(participanturi,po.emailAddress,None,self.translation_graph):
                self.nparticipants+=1
                if self.nparticipants==100:
                    pass
            triples+=[
                     (messageuri,po.author,participanturi),
                     (participanturi,po.emailAddress,email),
                     ]
            if name:
                triples+=[
                         (participanturi,po.name,name),
                         ]
            subject=message["Subject"]
            if subject:
                subject=decodeHeader(subject)
                assert isinstance(subject,str)
                triples+=[
                         (messageuri,po.subject,subject),
                         ]
            replyid_=message["In-Reply-To"]
            saneid=self.makeId(replyid_)
            if bool(replyid_) and not bool(saneid):
                self.nreplies+=1
                replyid=self.snapshotid+"-"+str(self.nlost_messages)
                self.nlost_messages+=1
                replymessageuri=P.rdf.ic(po.LostEmailMessage,replyid,self.translation_graph,self.snapshoturi)
                triples+=[
                         (replymessageuri,a,po.EmailMessage),
                         (replymessageuri,NS.rdfs.comment,"This message registered as having a reply, but the field might be ill-formed: "+replyid_),
                         (messageuri,po.replyTo,replymessageuri),
                         ]
            elif saneid:
                self.nreplies+=1
                replymessageuri=P.rdf.ic(po.EmailMessage,saneid,self.translation_graph,self.snapshoturi)
                triples+=[
                         (replymessageuri,po.gmaneID,saneid),
                         (messageuri,po.replyTo,replymessageuri),
                         ]
            if isinstance(message["Date"],str):
                datetime=parseDate(message["Date"])
            elif isinstance(message["Date"],mailbox.email.header.Header):
                datetimestring=decodeHeader(message["Date"])
                if False in [i in string.printable for i in datetimestring]:
                    datetime=None
                    triples+=[
                             (messageuri,po.lostCreatedAt,True),
                             ]
                else:
                    datetime_=re.findall(r"(.*\d\d:\d\d:\d\d).*",datetimestring)[0]
                    datetime=parseDate(datetime_)
            else:
                raise ValueError("datetime not understood")
            if datetime:
                self.dates+=[datetime]
                triples+=[
                         (messageuri,po.createdAt,datetime),
                         ]
            if message["References"]:
                references=message["References"].replace("\n","").replace("\t","").replace(" ","")
                if not re.findall(r"\A<(.*?)>\Z",references):
                    c("::: ::: ::: references field not understood", message["References"])
                    triples+=[
                             (messageuri,po.comment,"the references are not understood (<.*> ids are added anyway): "+message["References"]),
                             (messageuri,po.referencesLost,True),
                             ]
                for reference in re.findall(r"<(.*?)>",references):
                    self.nreferences+=1
                    referenceuri=P.rdf.ic(po.EmailMessage,reference,self.translation_graph,self.snapshoturi)
                    triples+=[
                             (referenceuri,po.gmaneID,reference),
                             (messageuri,po.hasReference,referenceuri),
                             ]
                for part in message["References"].replace("\n","").replace("\t","").split():
                    if validate_email(part):
                        self.nreferences+=1
                        referenceuri=P.rdf.ic(po.EmailMessage,part,self.translation_graph,self.snapshoturi)
                        triples+=[
                                 (referenceuri,po.gmaneID,reference),
                                 (messageuri,po.hasReference,referenceuri),
                                 ]
            text=getText(message)
            if text:
                nchars=len(text)
                ntokens=len(k.wordpunct_tokenize(text))
                nsentences=len(k.sent_tokenize(text))
                triples+=[
                         (messageuri,po.messageText,text),
                         (messageuri,po.nChars,nchars),
                         (messageuri,po.nTokens,ntokens),
                         (messageuri,po.nSentences,nsentences),
                         ]
                self.nchars_all+=[nchars]
                self.ntokens_all+=[ntokens]
                self.nsentences_all+=[nsentences]

                clean_text=cleanEmailBody(text)
                self.nremoved_lines+=text.count("\n")-clean_text.count("\n")
                self.nlines+=text.count("\n")
                nchars_clean=len(clean_text)
                ntokens_clean=len(k.wordpunct_tokenize(clean_text))
                nsentences_clean=len(k.sent_tokenize(clean_text))
                triples+=[
                         (messageuri,po.messageTextClean,clean_text),
                         (messageuri,po.nCharsClean,nchars_clean),
                         (messageuri,po.nTokensClean,ntokens_clean),
                         (messageuri,po.nSentencesClean,nsentences_clean),
                         ]
                self.nchars_clean_all+=[nchars_clean]
                self.ntokens_clean_all+=[ntokens_clean]
                self.nsentences_clean_all+=[nsentences_clean]

                for url in re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',clean_text):
                    self.nurls+=1
                    triples+=[
                             (messageuri,po.hasUrl,url),
                             ]

            content_type=message.get_content_type()
            if content_type:
                triples+=[
                         (messageuri,po.contentType,content_type)
                         ]
            else:
                raise ValueError("/\/\/\/\/\ message without content type")
            organization=message["Organization"]
            if organization:
                if not isinstance(organization,str):
                    organization="".join(i for i in str(organization) if i in string.printable)
                triples+=[
                         (messageuri,po.organization,organization),
                         ]
            if message["cc"]:
                cc,unparsed=parseAddresses(message["cc"])
                if unparsed:
                    triples+=[
                             (messageuri,po.unparsedCC,unparsed),
                             ]
                for peeraddress,peername in cc:
                    peeraddress=peeraddress.strip()
                    assert bool(peeraddress)
                    peeruri=P.rdf.ic(po.EmailPeer,peeraddress,self.translation_graph,self.snapshoturi)
                    triples+=[
                             (messageuri,po.cc,peeruri),
                             (peeruri,po.emailAddress,peeraddress),
                             ]
                    self.ncc+=1
                    if peername:
                        triples+=[
                                 (peeruri,po.name,peername.strip()),
                                 ]
            if message["to"]:
                to,unparsed=parseAddresses(message["to"])
                if unparsed:
                    triples+=[
                             (messageuri,po.unparsedTo,unparsed),
                             ]
                for peeraddress,peername in to:
                    peeraddress=peeraddress.strip()
                    assert bool(peeraddress)
                    peeruri=P.rdf.ic(po.EmailPeer,peeraddress,self.translation_graph,self.snapshoturi)
                    triples+=[
                             (messageuri,po.to,peeruri),
                             (peeruri,po.emailAddress,peeraddress),
                             ]
                    self.nto+=1
                    if peername:
                        triples+=[
                                 (peeruri,po.name,peername.strip()),
                                 ]
            listid=message["list-id"]
            if listid:
                assert isinstance(listid,str)
                listid=listid.replace("\n","").replace("\t","")
                if listid.count("<")==listid.count(">")==listid.count(" ")==0:
                    listname=""
                    listid_=listid
                elif listid.count("<")==listid.count(">")==0:
                    parts=listid.split()
                    lens=[len(i) for i in parts]
                    listid_=[i for i in parts if len(i)==max(lens)][0]
                    listname=" ".join(i for i in parts if len(i)!=max(lens))
                elif listid.count("<")==listid.count(">")==1:
                    listname,listid_=re.findall(r"(.*) {0,1}<(.*)>",listid)[0]
                else:
                    raise ValueError("Unexpected listid string format")
                listuri=P.rdf.ic(po.EmailList,listid_,self.translation_graph,self.snapshoturi)
                triples+=[
                         (messageuri,po.emailList,listuri),
                         (listuri,po.listID,listid_),
                         ]
                if listname:
                    triples+=[
                             (listuri,po.name,listname.strip()),
                             ]
            P.add(triples,self.translation_graph)
            mbox.close()
Example #26
0
    def writeAllFB(self):
        c("started rendering of the snapshot publication. snapshotID:",
          self.snapshotid)
        self.final_path_ = "{}{}/".format(self.final_path, self.snapshotid)
        if not os.path.isdir(self.final_path_):
            os.mkdir(self.final_path_)
        g = P.context(self.friendship_graph)
        g.namespace_manager.bind("po", po)
        g.serialize(self.final_path_ + self.snapshotid + "Friendship.ttl",
                    "turtle")
        c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Friendship.rdf",
                    "xml")
        c("serialized friendships")
        # get filesize and ntriples
        # filesizerdf = os.path.getsize(self.final_path_+self.snapshotid +
        #                               "Friendship.rdf")/(10**6)
        # filesizettl = os.path.getsize(self.final_path_+self.snapshotid +
        #                               "Friendship.ttl")/(10**6)
        # ntriples = len(g)
        # triples = [
        #          (self.snapshoturi, po.friendshipXMLFileSizeMB, filesizerdf),
        #          (self.snapshoturi, po.friendshipTTLFileSizeMB, filesizettl),
        #          (self.snapshoturi, po.nFriendshipTriples, ntriples),
        #          ]
        g = P.context(self.meta_graph)
        # ntriples = len(g)
        # triples.append(
        #          (self.snapshoturi, po.nMetaTriples, ntriples+1),
        # )
        # P.add(triples, context=self.meta_graph)
        g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
        c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
        c("serialized meta")

        if not os.path.isdir(self.final_path_ + "scripts"):
            os.mkdir(self.final_path_ + "scripts")
        shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                    self.final_path_ + "scripts/triplify.py")
        # copia do base data
        if not os.path.isdir(self.final_path_ + "base"):
            os.mkdir(self.final_path_ + "base")
        shutil.copy(self.data_path + self.filename_friendships,
                    self.final_path_ + "base/")

        originals = "base/{}".format(self.filename_friendships)
        tfriendship = """\n\n{nf} individuals with metadata {fvars}
and {nfs} friendships constitute the friendship network in the RDF/XML file:
{frdf} \in the Turtle file: \n{fttl}
(anonymized {fan}).""".format(
            nf=self.nfriends,
            fvars=str(self.friendsvars),
            nfs=self.nfriendships,
            frdf=self.frdf,
            fttl=self.fttl,
            fan=self.friendships_anonymized,
        )
        datetime_string = P.get(self.snapshoturi,
                                po.dateObtained,
                                None,
                                context=self.social_graph)[2]

        with open(self.final_path_ + "README", "w") as f:
            f.write("""::: Open Linked Social Data publication
\nThis repository is a RDF data expression of the facebook
snapshot {snapid} collected around {date}.{tfriendship}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nOriginal file(s):
{origs}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is on the script/ \
                    directory.\n:::""".format(snapid=self.snapshotid,
                                              date=datetime_string,
                                              tfriendship=tfriendship,
                                              mrdf=self.mrdf,
                                              mttl=self.mttl,
                                              origs=originals,
                                              ise=self.isego,
                                              isg=self.isgroup,
                                              isf=self.isfriendship,
                                              isi=self.isinteraction,
                                              ist=self.hastext,
                                              ava=self.online_prefix,
                                              desc=self.desc))
Example #27
0
    def makeMetadata(self):
        """Record the snapshot-level metadata triples in the meta graph.

        Sets on the instance the file-name attributes for each network kind
        that is present (``ffile``/``frdf``/``fttl``, ``ifile``/``irdf``/
        ``ittl``, ``pfile``/``prdf``/``pttl``) plus ``isego``, ``isgroup``,
        ``mrdf``, ``mttl`` and ``desc``, then passes the collected predicates
        and values to ``P.rdf.triplesScaffolding`` together with
        ``self.snapshoturi`` and ``self.meta_graph``.

        Raises:
            ValueError: if this is a friendship snapshot whose two group ids
                are both set and differ.
        """
        if self.isfriendship and self.groupid and self.groupid2 and \
                (self.groupid != self.groupid2):
            raise ValueError("Group IDS are different")

        snapshot = r.URIRef(self.snapshoturi)
        # Each predicate is stored next to its value so the two sequences
        # handed to triplesScaffolding cannot drift out of step.
        pairs = []

        if self.isfriendship:
            self.ffile = "base/"+self.filename_friendships
            self.frdf = self.snapshotid+"Friendship.rdf"
            self.fttl = self.snapshotid+"Friendship.ttl"
            pairs.append(
                (po.friendshipsAnonymized, self.friendships_anonymized))

        if self.isinteraction:
            self.ifile = "base/"+self.filename_interactions
            self.irdf = self.snapshotid+"Interaction.rdf"
            self.ittl = self.snapshotid+"Interaction.ttl"
            pairs.append(
                (po.interactionsAnonymized, self.interactions_anonymized))

        if self.hastext:
            # Only the file names are set; no post-specific triples are added.
            self.pfile = "base/"+self.filename_posts
            self.prdf = self.snapshotid+"Post.rdf"
            self.pttl = self.snapshotid+"Post.ttl"

        self.isego = P.get(snapshot, po.isEgo)[2].toPython()
        self.isgroup = P.get(snapshot, po.isGroup)[2].toPython()
        date_obtained = P.get(snapshot, po.dateObtained)[2].toPython()
        assert isinstance(date_obtained, datetime.date)
        name = P.get(snapshot, po.name, None, context=self.social_graph)[2]
        pairs.extend([
            (a, po.Snapshot),
            (po.snapshotID, self.snapshotid),
            (po.isGroup, self.isgroup),
            (po.isEgo, self.isego),
            (po.isFriendship, self.isfriendship),
            (po.isInteraction, self.isinteraction),
            (po.isPost, self.hastext),
            (po.dateObtained, date_obtained),
            (po.name, name),
        ])

        # Optional identifiers: copied only when the social graph has them.
        for predicate in (po.numericID, po.stringID, po.url):
            found = P.get(snapshot, predicate, None,
                          context=self.social_graph)
            if found:
                pairs.append((predicate, found[2]))

        self.mrdf = self.snapshotid+"Meta.rdf"
        self.mttl = self.snapshotid+"Meta.ttl"

        # Built from adjacent literals rather than a backslash continuation
        # inside the string, which would embed the source indentation after
        # "snapshotURI: ".
        self.desc = (
            "facebook network with snapshotID: {}\nsnapshotURI: {} \n"
            "isEgo: {}. isGroup: {}."
        ).format(self.snapshotid, self.snapshoturi, self.isego, self.isgroup)
        self.desc += "\nisFriendship: {}".format(self.isfriendship)
        self.desc += "\nisInteraction: {}".format(self.isinteraction)
        self.desc += "\nisPost: {} (has text)".format(self.hastext)

        leading = [
            (po.triplifiedIn, datetime.datetime.now()),
            (po.acquiredThrough, "Netvizz"),
            (po.socialProtocol, "Facebook"),
            (po.comment, self.desc),
        ]
        all_pairs = leading + pairs
        P.rdf.triplesScaffolding(
            self.snapshoturi,
            [predicate for predicate, _ in all_pairs],
            [value for _, value in all_pairs],
            self.meta_graph)
Example #28
0
    def makeMetadata(self):
        """Copy the snapshot's source triples and add its metadata triples.

        Every triple about ``self.snapshoturi`` and about each of its
        ``po.rawFile`` objects in the ``"social_facebook"`` context is added
        to ``self.meta_graph``.  File-name attributes for each network kind
        that is present, ``isego``, ``isgroup``, ``mrdf``, ``mttl`` and
        ``desc`` are set on the instance, and the collected predicates and
        values are passed to ``P.rdf.triplesScaffolding``.

        Raises:
            ValueError: if this is a friendship snapshot whose two group ids
                are both set and differ.
        """
        if self.isfriendship and self.groupid and self.groupid2 and \
                (self.groupid != self.groupid2):
            raise ValueError("Group IDS are different")

        # put all triples from social_facebook into self.meta_graph
        triples = P.get(self.snapshoturi, None, None, "social_facebook")
        for rawfile in P.get(self.snapshoturi, po.rawFile, None,
                             "social_facebook", strict=True, minimized=True):
            triples += P.get(rawfile, None, None, "social_facebook")
        P.add(triples, context=self.meta_graph)

        # Each predicate is written beside its value.  With two parallel
        # lists the interaction file names and counts had ended up attached
        # to the wrong predicates.
        pairs = []
        if self.isfriendship:
            self.ffile = "base/"+self.filename_friendships
            self.frdf = self.snapshotid+"Friendship.rdf"
            self.fttl = self.snapshotid+"Friendship.ttl"
            pairs += [
                (po.onlineOriginalFriendshipFile,
                 self.online_prefix+self.ffile),
                (po.originalFriendshipFileName, self.ffile),
                (po.onlineFriendshipXMLFile, self.online_prefix+self.frdf),
                (po.onlineFriendshipTTLFile, self.online_prefix+self.fttl),
                (po.friendshipXMLFileName, self.frdf),
                (po.friendshipTTLFileName, self.fttl),
                (po.nFriends, self.nfriends),
                (po.nFriendships, self.nfriendships),
                (po.friendshipsAnonymized, self.friendships_anonymized),
            ]
            pairs += [(po.frienshipParticipantAttribute, var)
                      for var in self.friendsvars]

        if self.isinteraction:
            self.ifile = "base/"+self.filename_interactions
            self.irdf = self.snapshotid+"Interaction.rdf"
            self.ittl = self.snapshotid+"Interaction.ttl"
            pairs += [
                (po.onlineOriginalInteractionFile,
                 self.online_prefix+self.ifile),
                (po.originalInteractionFileName, self.ifile),
                (po.onlineInteractionXMLFile, self.online_prefix+self.irdf),
                (po.onlineInteractionTTLFile, self.online_prefix+self.ittl),
                (po.interactionXMLFileName, self.irdf),
                (po.interactionTTLFileName, self.ittl),
                (po.nInteracted, self.ninteracted),
                (po.nInteractions, self.ninteractions),
                (po.interactionsAnonymized, self.interactions_anonymized),
            ]
            pairs += [(po.interactionParticipantAttribute, var)
                      for var in self.interactionsvars]

        if self.hastext:
            self.pfile = "base/"+self.filename_posts
            self.prdf = self.snapshotid+"Post.rdf"
            self.pttl = self.snapshotid+"Post.ttl"
            pairs += [
                (po.onlineOriginalPostsFile, self.online_prefix+self.pfile),
                (po.originalPostsFileName, self.pfile),
                (po.onlinePostsXMLFile, self.online_prefix+self.prdf),
                (po.onlinePostsTTLFile, self.online_prefix+self.pttl),
                (po.postsXMLFileName, self.prdf),
                (po.postsTTLFileName, self.pttl),
                (po.nPosts, self.nposts),
                (po.nCharsOverall, int(self.totalchars)),
                (po.mCharsOverall, self.mcharsposts),
                (po.dCharsOverall, self.dcharsposts),
                (po.nTokensOverall, int(self.totaltokens)),
                (po.mTokensOverall, self.mtokensposts),
                (po.dTokensOverall, self.dtokensposts),
            ]
            pairs += [(po.postAttribute, var) for var in self.postsvars]

        self.isego = bool(
            P.get(r.URIRef(self.snapshoturi), a, po.EgoSnapshot))
        self.isgroup = bool(
            P.get(r.URIRef(self.snapshoturi), a, po.GroupSnapshot))
        pairs += [
            (po.isGroup, self.isgroup),
            (po.isEgo, self.isego),
            (po.isFriendship, self.isfriendship),
            (po.isInteraction, self.isinteraction),
            (po.hasText, self.hastext),
            (po.isPost, self.hastext),
        ]

        self.mrdf = self.snapshotid+"Meta.rdf"
        self.mttl = self.snapshotid+"Meta.ttl"

        self.desc = (
            "facebook network with snapshotID: {}\nsnapshotURI: {} \n"
            "isEgo: {}. isGroup: {}."
        ).format(self.snapshotid, self.snapshoturi, self.isego, self.isgroup)
        self.desc += "\nisFriendship: {}".format(self.isfriendship)
        if self.isfriendship:
            self.desc += "; nFriends: {}; nFrienships: {}.".format(
                self.nfriends, self.nfriendships)
        self.desc += "\nisInteraction: {}".format(self.isinteraction)
        if self.isinteraction:
            self.desc += "; nInteracted: {}; nInteractions: {}.".format(
                self.ninteracted, self.ninteractions)
        self.desc += "\nisPost: {} (alias hasText: {})".format(
            self.hastext, self.hastext)
        if self.hastext:
            # One placeholder per argument: nposts used to be passed without
            # a label of its own, shifting every later value by one and
            # silently dropping totaltokens.
            self.desc += (
                ";\nnPosts: {}; mCharsPostsOverall: {}; "
                "dCharsPostsOverall: {}; totalCharsOverall: {}; "
                "\nmTokensPostsOverall: {}; dTokensPostsOverall: {}; "
                "totalTokensOverall: {}"
            ).format(
                self.nposts,
                self.mcharsposts, self.dcharsposts, self.totalchars,
                self.mtokensposts, self.dtokensposts, self.totaltokens,
            )

        leading = [
            (po.triplifiedIn, datetime.datetime.now()),
            (po.triplifiedBy, "scripts/"),
            (po.donatedBy, self.snapshotid[:-4]),
            (po.availableAt, self.online_prefix),
            (po.onlineMetaXMLFile, self.online_prefix+self.mrdf),
            (po.onlineMetaTTLFile, self.online_prefix+self.mttl),
            (po.metaXMLFileName, self.mrdf),
            (po.metaTTLFileName, self.mttl),
            (po.acquiredThrough, "Netvizz"),
            (po.socialProtocolTag, "Facebook"),
            (po.socialProtocol,
             P.rdf.ic(po.Platform, "Facebook", self.meta_graph,
                      self.snapshoturi)),
            (NS.rdfs.comment, self.desc),
        ]
        all_pairs = leading + pairs
        P.rdf.triplesScaffolding(
            self.snapshoturi,
            [predicate for predicate, _ in all_pairs],
            [value for _, value in all_pairs],
            self.meta_graph)
Example #29
0
    def makeMetadata(self):
        """Fill the meta graph with the IRC snapshot's metadata.

        Copies the triples about the snapshot and its raw files from the
        social graph, computes overall character/token/sentence statistics,
        records the log file names, and stores a textual description in
        ``self.desc`` (also added as an ``rdfs:comment``).
        """
        copied = P.get(self.snapshoturi, None, None, self.social_graph)
        for rawfile in P.get(self.snapshoturi, po.rawFile, None,
                             self.social_graph, strict=True, minimized=True):
            copied += P.get(rawfile, None, None, self.social_graph)
        P.add(copied, context=self.meta_graph)

        # total / mean / standard deviation for each per-message size list
        chars = self.nchars_all
        self.totalchars, self.mcharsmessages, self.dcharsmessages = (
            sum(chars), n.mean(chars), n.std(chars))
        tokens = self.ntokens_all
        self.totaltokens, self.mtokensmessages, self.dtokensmessages = (
            sum(tokens), n.mean(tokens), n.std(tokens))
        sentences = self.nsentences_all
        (self.totalsentences,
         self.msentencesmessages,
         self.dsentencesmessages) = (
            sum(sentences), n.mean(sentences), n.std(sentences))

        self.nparticipants = len(self.NICKS)
        self.nmessages = len(self.messageids)
        self.ntriples = len(P.context(self.irc_graph))

        overall = (
            (po.nParticipants, self.nparticipants),
            (po.nMessages, self.nmessages),
            (po.nDirectMessages, self.ndirect),
            (po.nUserMentions, self.nmention),
            (po.nCharsOverall, self.totalchars),
            (po.mCharsOverall, self.mcharsmessages),
            (po.dCharsOverall, self.dcharsmessages),
            (po.nTokensOverall, self.totaltokens),
            (po.mTokensOverall, self.mtokensmessages),
            (po.dTokensOverall, self.dtokensmessages),
            (po.nSentencesOverall, self.totalsentences),
            (po.mSentencesOverall, self.msentencesmessages),
            (po.dSentencesOverall, self.dsentencesmessages),
        )
        P.add([(self.snapshoturi, predicate, value)
               for predicate, value in overall],
              context=self.meta_graph)

        n_xml = len(self.log_xml)
        n_ttl = len(self.log_ttl)
        P.rdf.triplesScaffolding(
            self.snapshoturi,
            [po.ircParticipantAttribute] * len(self.participantvars),
            self.participantvars,
            context=self.meta_graph)
        P.rdf.triplesScaffolding(
            self.snapshoturi,
            [po.logXMLFilename] * n_xml + [po.logTTLFilename] * n_ttl,
            self.log_xml + self.log_ttl,
            context=self.meta_graph)
        P.rdf.triplesScaffolding(
            self.snapshoturi,
            [po.onlineLogXMLFile] * n_xml + [po.onlineLogTTLFile] * n_ttl,
            [self.online_prefix + fname for fname in self.log_xml] +
            [self.online_prefix + fname for fname in self.log_ttl],
            context=self.meta_graph)

        self.mrdf = self.snapshotid + "Meta.rdf"
        self.mttl = self.snapshotid + "Meta.ttl"

        # The description is assembled from its fragments in reading order.
        fragments = [
            "irc dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
                self.snapshotid, self.snapshoturi, self.isego, self.isgroup),
            "\nisFriendship: {}; ".format(self.isfriendship),
            "isInteraction: {}.".format(self.isinteraction),
            "\nnParticipants: {}; nInteractions: {} (directed messages+user mentions).".format(
                self.nparticipants, self.ndirect + self.nmention),
            "\nisPost: {} (alias hasText: {})".format(
                self.hastext, self.hastext),
            "\nnMessages: {}; ".format(self.nmessages),
            "nDirectedMessages: {}; nUserMentions: {};".format(
                self.ndirect, self.nmention),
            "\nnCharsOverall: {}; mCharsOverall: {}; dCharsOverall: {}.".format(
                self.totalchars, self.mcharsmessages, self.dcharsmessages),
            "\nnTokensOverall: {}; mTokensOverall: {}; dTokensOverall: {};".format(
                self.totaltokens, self.mtokensmessages, self.dtokensmessages),
            "\nnSentencesOverall: {}; mSentencesOverall: {}; dSentencesOverall: {};".format(
                self.totalsentences, self.msentencesmessages,
                self.dsentencesmessages),
            "\nnURLs: {}; nAAMessages {}.".format(
                self.nurls, self.naamessages),
        ]
        self.desc = "".join(fragments)

        publication = (
            (po.triplifiedIn, datetime.datetime.now()),
            (po.triplifiedBy, "scripts/"),
            (po.donatedBy, self.snapshotid[:-4]),
            (po.availableAt, self.online_prefix),
            (po.onlineMetaXMLFile, self.online_prefix + self.mrdf),
            (po.onlineMetaTTLFile, self.online_prefix + self.mttl),
            (po.metaXMLFileName, self.mrdf),
            (po.metaTTLFileName, self.mttl),
            (po.totalXMLFileSizeMB, sum(self.size_xml)),
            (po.totalTTLFileSizeMB, sum(self.size_ttl)),
            (po.acquiredThrough, "channel text log"),
            (po.socialProtocolTag, "IRC"),
            (po.socialProtocol,
             P.rdf.ic(po.Platform, "IRC", self.meta_graph, self.snapshoturi)),
            (po.nTriples, self.ntriples),
            (NS.rdfs.comment, self.desc),
        )
        P.add([(self.snapshoturi, predicate, value)
               for predicate, value in publication],
              self.meta_graph)
Example #30
0
    def makeMetadata(self):
        """Write the Twitter snapshot's descriptive metadata to the meta graph.

        Copies the statements about the snapshot and its raw files from the
        social graph, stores character/token statistics on the instance,
        registers attribute and file-name triples, and assembles the
        human-readable ``self.desc`` that is attached as ``rdfs:comment``.
        """
        uri = self.snapshoturi
        meta = self.meta_graph
        social = self.social_graph

        # Statements about the snapshot itself, then about each raw file.
        carried = P.get(uri, None, None, social)
        rawfiles = P.get(uri, po.rawFile, None, social,
                         strict=True, minimized=True)
        for rawfile in rawfiles:
            carried += P.get(rawfile, None, None, social)

        # Overall size statistics (n = total, m = mean, d = std deviation).
        chars = self.nchars_all
        tokens = self.ntokens_all
        self.totalchars = sum(chars)
        self.mcharstweets = n.mean(chars)
        self.dcharstweets = n.std(chars)
        self.totaltokens = sum(tokens)
        self.mtokenstweets = n.mean(tokens)
        self.dtokenstweets = n.std(tokens)
        P.add(carried, context=meta)

        counts = (
            (po.nParticipants, self.nparticipants),
            (po.nTweets, self.ntweets),
            (po.nReplies, self.nreplies),
            (po.nRetweets, self.nretweets),
            (po.nCharsOverall, self.totalchars),
            (po.mCharsOverall, self.mcharstweets),
            (po.dCharsOverall, self.dcharstweets),
            (po.nTokensOverall, self.totaltokens),
            (po.mTokensOverall, self.mtokenstweets),
            (po.dTokensOverall, self.dtokenstweets),
        )
        P.add([(uri, predicate, value) for predicate, value in counts],
              context=meta)

        # One triple per participant attribute and per produced file.
        P.rdf.triplesScaffolding(
            uri,
            [po.tweetParticipantAttribute] * len(self.participantvars),
            self.participantvars,
            context=meta)
        n_rdf = len(self.tweet_rdf)
        n_ttl = len(self.tweet_ttl)
        P.rdf.triplesScaffolding(
            uri,
            [po.tweetXMLFilename] * n_rdf + [po.tweetTTLFilename] * n_ttl,
            self.tweet_rdf + self.tweet_ttl,
            context=meta)
        P.rdf.triplesScaffolding(
            uri,
            [po.onlineTweetXMLFile] * n_rdf + [po.onlineTweetTTLFile] * n_ttl,
            [self.online_prefix + name
             for name in self.tweet_rdf + self.tweet_ttl],
            context=meta)

        self.mrdf = self.snapshotid + "Meta.rdf"
        self.mttl = self.snapshotid + "Meta.ttl"

        # Human-readable summary, assembled piece by piece.
        ninteractions = self.nreplies + self.nretweets + self.nuser_mentions
        pieces = (
            "twitter dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
                self.snapshotid, uri, self.isego, self.isgroup),
            "\nisFriendship: {}; ".format(self.isfriendship),
            "isInteraction: {}.".format(self.isinteraction),
            "\nnParticipants: {}; nInteractions: {} (replies+retweets+user mentions).".format(
                self.nparticipants, ninteractions),
            "\nisPost: {} (alias hasText: {})".format(
                self.hastext, self.hastext),
            "\nnTweets: {}; ".format(self.ntweets),
            "nReplies: {}; nRetweets: {}; nUserMentions: {}.".format(
                self.nreplies, self.nretweets, self.nuser_mentions),
            "\nnTokens: {}; mTokens: {}; dTokens: {};".format(
                self.totaltokens, self.mtokenstweets, self.dtokenstweets),
            "\nnChars: {}; mChars: {}; dChars: {}.".format(
                self.totalchars, self.mcharstweets, self.dcharstweets),
            "\nnHashtags: {}; nMedia: {}; nLinks: {}.".format(
                self.nhashtags, self.nmedia, self.nlinks),
        )
        self.desc = "".join(pieces)

        provenance = [
            (uri, po.triplifiedIn, datetime.datetime.now()),
            (uri, po.triplifiedBy, "scripts/"),
            (uri, po.donatedBy, self.snapshotid[:-4]),
            (uri, po.availableAt, self.online_prefix),
            (uri, po.onlineMetaXMLFile, self.online_prefix + self.mrdf),
            (uri, po.onlineMetaTTLFile, self.online_prefix + self.mttl),
            (uri, po.metaXMLFileName, self.mrdf),
            (uri, po.metaTTLFileName, self.mttl),
            (uri, po.totalXMLFileSizeMB, sum(self.size_rdf)),
            (uri, po.totalTTLFileSizeMB, sum(self.size_ttl)),
            (uri, po.acquiredThrough, "Twitter APIs"),
            (uri, po.socialProtocolTag, "Twitter"),
            (uri, po.socialProtocol,
             P.rdf.ic(po.Platform, "Twitter", meta, uri)),
            (uri, po.nTriples, self.ntriples),
            (uri, NS.rdfs.comment, self.desc),
        ]
        P.add(provenance, meta)
Example #31
0
    def writeAllFB(self):
        c("started rendering of the snapshot publication. snapshotID:",self.snapshotid)
        self.final_path_="{}{}/".format(self.final_path,self.snapshotid)
        if not os.path.isdir(self.final_path_):
            os.mkdir(self.final_path_)
        #fnet,inet,mnet
        triples=[]
        if self.isfriendship:
            g=P.context(self.friendship_graph)
            g.namespace_manager.bind("po",po)
            g.serialize(self.final_path_+self.snapshotid+"Friendship.ttl","turtle"); c("ttl")
            g.serialize(self.final_path_+self.snapshotid+"Friendship.rdf","xml")
            c("serialized friendships")
            # get filesize and ntriples
            filesizerdf=os.path.getsize(self.final_path_+self.snapshotid+"Friendship.rdf")/(10**6)
            filesizettl=os.path.getsize(self.final_path_+self.snapshotid+"Friendship.ttl")/(10**6)
            ntriples=len(g)
            triples+=[
                     (self.snapshoturi,po.friendshipXMLFileSizeMB,filesizerdf),
                     (self.snapshoturi,po.friendshipTTLFileSizeMB,filesizettl),
                     (self.snapshoturi,po.nFriendshipTriples,ntriples),
                     ]
        if self.isinteraction:
            g=P.context(self.interaction_graph)
            g.namespace_manager.bind("po",po)
            g.serialize(self.final_path_+self.snapshotid+"Interaction.ttl","turtle"); c("ttl")
            g.serialize(self.final_path_+self.snapshotid+"Interaction.rdf","xml")
            c("serialized interaction")
            filesizerdf=os.path.getsize(self.final_path_+self.snapshotid+"Interaction.rdf")/(10**6)
            filesizettl=os.path.getsize(self.final_path_+self.snapshotid+"Interaction.ttl")/(10**6)
            ntriples=len(g)
            triples+=[
                     (self.snapshoturi,po.interactionXMLFileSizeMB,filesizerdf),
                     (self.snapshoturi,po.interactionTTLFileSizeMB,filesizettl),
                     (self.snapshoturi,po.nInteractionTriples,ntriples),
                     ]
        if self.hastext:
            g=P.context(self.posts_graph)
            g.namespace_manager.bind("po",po)
            g.serialize(self.final_path_+self.snapshotid+"Posts.ttl","turtle"); c("ttl")
            g.serialize(self.final_path_+self.snapshotid+"Posts.rdf","xml")
            c("serialized posts")
            filesizerdf=os.path.getsize(self.final_path_+self.snapshotid+"Posts.rdf")/(10**6)
            filesizettl=os.path.getsize(self.final_path_+self.snapshotid+"Posts.ttl")/(10**6)
            ntriples=len(g)
            triples+=[
                     (self.snapshoturi,po.postsXMLFileSizeMB,filesizerdf),
                     (self.snapshoturi,po.postsTTLFileSizeMB,filesizettl),
                     (self.snapshoturi,po.nPostsTriples,ntriples)      ,
                     ]
        g=P.context(self.meta_graph)
        ntriples=len(g)
        triples+=[
                 (self.snapshoturi,po.nMetaTriples,ntriples)      ,
                 ]
        P.add(triples,context=self.meta_graph)
        g.namespace_manager.bind("po",po)
        g.serialize(self.final_path_+self.snapshotid+"Meta.ttl","turtle"); c("ttl")
        g.serialize(self.final_path_+self.snapshotid+"Meta.rdf","xml")
        c("serialized meta")
        # copia o script que gera este codigo
        if not os.path.isdir(self.final_path_+"scripts"):
            os.mkdir(self.final_path_+"scripts")
        shutil.copy(S.PACKAGEDIR+"/../tests/triplify.py",self.final_path_+"scripts/triplify.py")
        # copia do base data
        if not os.path.isdir(self.final_path_+"base"):
            os.mkdir(self.final_path_+"base")
        originals=""
        if self.isfriendship:
            shutil.copy(self.data_path+self.filename_friendships,self.final_path_+"base/")
            originals+="base/{}".format(self.filename_friendships)
            tfriendship="""\n\n{nf} individuals with metadata {fvars}
and {nfs} friendships constitute the friendship network in the RDF/XML file:
{frdf} \nor in the Turtle file: \n{fttl}
(anonymized: {fan}).""".format(
                            nf=self.nfriends,fvars=str(self.friendsvars),
                            nfs=self.nfriendships,
                            frdf=self.frdf,fttl=self.fttl,
                            fan=self.friendships_anonymized,
                        )
        else:
            tfriendship=""
        if self.isinteraction:
            shutil.copy(self.data_path+self.filename_interactions,self.final_path_+"base/")
            tinteraction="""\n\n{} individuals with metadata {}
and {} interactions with metadata {} constitute the interaction 
network in the RDF/XML file:
{}
or in the Turtle file:
{}
(anonymized: {}).""".format( self.ninteracted,str(self.varsfriendsinteraction),
                        self.ninteractions,str(self.interactionsvars),
                        self.irdf,
                        self.ittl,
                        self.interactions_anonymized)
            originals+="\nbase/{}".format(self.filename_interactions)
        else:
            tinteraction=""
        if self.hastext:
            shutil.copy(self.data_path+self.filename_posts,self.final_path_+"base/")
            tposts="""\n\n{} posts with {:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {}
{:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {}
posts data in the RDF/XML file:
{}
or in the Turtle file:
{}""".format( self.nposts,self.mcharsposts,self.dcharsposts,self.totalchars,
                        self.mtokensposts,self.dtokensposts,self.totaltokens,
                        self.prdf,
                        self.pttl)
            originals+="\nbase/{}".format(self.filename_posts)
        else:
            tposts=""


#        P.rdf.writeAll(mnet,aname+"Meta",fpath_,1)
        # faz um README
        datetime_string=P.get(r.URIRef(self.snapshoturi),po.dateObtained,None,context="social_facebook")[2]
#        if not os.path.isdir(self.final_path+"base"):
#            os.mkdir(self.final_path+"base")
        with open(self.final_path_+"README","w") as f:
            f.write("""::: Open Linked Social Data publication
\nThis repository is a RDF data expression of the facebook
snapshot {snapid} collected around {date}.{tfriendship}{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nOriginal file(s):
{origs}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is on the script/ directory.\n:::""".format(
                snapid=self.snapshotid,date=datetime_string,
                        tfriendship=tfriendship,
                        tinteraction=tinteraction,
                        tposts=tposts,
                        mrdf=self.mrdf,
                        mttl=self.mttl,
                        origs=originals,
                        ise=self.isego,
                        isg=self.isgroup,
                        isf=self.isfriendship,
                        isi=self.isinteraction,
                        ist=self.hastext,
                        ava=self.online_prefix,
                        desc=self.desc
                        ))
Example #32
0
    def makeMetadata(self):
        self.makePostsTriples()
        # get participant and message vars from snapshot through queries
        self.participantvars = P.get("""SELECT DISTINCT ?p WHERE { GRAPH <%s> {
                                  ?fooparticipant po:snapshot <%s> .
                                  ?fooparticipant a po:Participant .
                                  ?fooparticipant ?p ?fooobject . } } """ % (
                                self.translation_graph, self.snapshoturi))
        P.rdf.triplesScaffolding(
            self.snapshoturi,
            [po.ParticipantAttribute]*len(self.participantvars),
            self.participantvars, context=self.meta_graph)
        self.messagevars = P.get("""SELECT DISTINCT ?p WHERE { GRAPH <%s> {
                               ?foomessage po:snapshot <%s> .
                               ?foomessage a po:Message .
                               ?foomessage ?p ?fooobject . } } """ % (
                                   self.translation_graph, self.snapshoturi))
        P.rdf.triplesScaffolding(
                self.snapshoturi,
                [po.MessageAttribute]*len(self.messagevars),
                self.messagevars, context=self.meta_graph)

        self.mrdf = self.snapshotid+"Meta.rdf"
        self.mttl = self.snapshotid+"Meta.ttl"
        self.desc = "dataset with snapshotID:\
            {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
            self.snapshotid, self.snapshoturi, self.isego, self.isgroup)
        self.desc += "\nisFriendship: {}; ".format(self.isfriendship)
        self.desc += "isInteraction: {}.".format(self.isinteraction)
        self.desc += "\nhasText: {}".format(self.hastext)
        self.nchecks = P.get(r"SELECT (COUNT(?checker) as ?cs) WHERE { \
                             ?foosession po:checkParticipant ?checker}",
                             context=self.translation_graph)
        self.desc += "\nnParticipants: {}; nInteractions: {} \
            (only session checks in first aa).".format(
                self.nparticipants, self.nchecks)
        self.desc += "\nnMessages: {}; ".format(self.nmessages)
        self.desc += "\nnCharsOverall: {}; mCharsOverall: {};\
            dCharsOverall: {}.".format(self.totalchars, self.mchars_messages,
                                       self.dchars_messages)
        self.desc += "\nnTokensOverall: {}; mTokensOverall: {};\
            dTokensOverall: {};".format(self.totaltokens, self.mtokens_messages,
                                        self.dtokens_messages)
        self.desc += "\nnSentencesOverall: {}; mSentencesOverall: {};\
            dSentencesOverall: {};".format(
                self.totalsentences, self.msentences_messages,
                self.dsentences_messages)
        self.desc += "\nnURLs: {}; nAAMessages {}.".format(
            self.nurls, self.nmessages)
        self.dates = P.get(r"SELECT ?date WHERE { GRAPH <%s> {\
                           ?fooshout po:createdAt ?date } " % (
                               self.translation_graph,))
        self.desc += "\nReference timespan: {} to {}".format(
            min(dates), max(dates))
        self.desc += """\nRDF expression in the XML file(s):
{}
and the Turtle file(s):
{}
(anonymized: {}).""".format(self.translation_xml, self.translation_ttl,
                            self.anonymized)
        self.desc += """\nMetadata of this snapshot in the XML file(s):
{}
and the Turtle file(s):
{}.""".format(self.meta_xml, self.meta_ttl)
        self.desc += """\nFiles should be available in: \n{}""".format()

        self.desc += "\n\nNote: numeric variables starting with n area \
            countings, with m are means and d are standard deviations."
        if isinstance(self.translation_xml, list):
            P.rdf.triplesScaffolding(
                self.snapshoturi,
                [po.translationXMLFilename]*len(self.translation_xml) +
                [po.translationTTLFilename]*len(self.translation_ttl),
                self.translation_xml+self.translation_ttl,
                context=self.meta_graph)
            P.rdf.triplesScaffolding(
                self.snapshoturi,
                [po.onlineTranslationXMLFileURI]*len(self.translation_xml) +
                [po.onlineTranslationTTLFileURI]*len(self.translation_ttl),
                [self.online_prefix+i for i in
                 self.translation_xml+self.translation_ttl],
                context=self.meta_graph)
            triples = [
                (self.snapshoturi, po.translationXMLFilesize,
                 self.translation_size_xml),
                (self.snapshoturi, po.translationTTLFilesize,
                 self.translation_size_ttl),
                      ]
        else:
            triples = [
                      (self.snapshoturi, po.translationXMLFilename,
                       self.translation_xml),
                      (self.snapshoturi, po.translationXMLFilesize,
                       self.translation_size_xml),
                      (self.snapshoturi, po.translationTTLFilename,
                       self.translation_ttl),
                      (self.snapshoturi, po.translationTTLFilesize,
                       self.translation_size_ttl),
                      ]
        P.add(triples,self.meta_graph)
        triples=[
                (self.snapshoturi, po.triplifiedIn,      datetime.datetime.now()),
                (self.snapshoturi, po.triplifiedBy,      "scripts/"),
#                (self.snapshoturi, po.donatedBy,         self.snapshotid[:-4]),
                (self.snapshoturi, po.availableAt,       self.online_prefix),
                (self.snapshoturi, po.onlineMetaXMLFile, self.online_prefix+self.mrdf),
                (self.snapshoturi, po.onlineMetaTTLFile, self.online_prefix+self.mttl),
                (self.snapshoturi, po.metaXMLFileName,   self.mrdf),
                (self.snapshoturi, po.metaTTLFileName,   self.mttl),
#                (self.snapshoturi, po.acquiredThrough,   "aa shouts in "+self.snapshotid),
                (self.snapshoturi, po.socialProtocolTag, self.social_protocol), # AA, fb, etc
                (self.snapshoturi, po.socialProtocol,    P.rdf.ic(po.Platform,self.social_protocol,self.meta_graph,self.snapshoturi)),
                (self.snapshoturi, po.nTriples,         self.ntranslation_triples),
                (self.snapshoturi, NS.rdfs.comment,         self.desc),
                ]
        P.add(triples,self.meta_graph)
Example #33
0
    def writeAllFB(self):
        c("started rendering of the snapshot publication. snapshotID:",
          self.snapshotid)
        self.final_path_ = "{}{}/".format(self.final_path, self.snapshotid)
        if not os.path.isdir(self.final_path_):
            os.mkdir(self.final_path_)
        #fnet,inet,mnet
        triples = []
        if self.isfriendship:
            g = P.context(self.friendship_graph)
            g.namespace_manager.bind("po", po)
            g.serialize(self.final_path_ + self.snapshotid + "Friendship.ttl",
                        "turtle")
            c("ttl")
            g.serialize(self.final_path_ + self.snapshotid + "Friendship.rdf",
                        "xml")
            c("serialized friendships")
            # get filesize and ntriples
            filesizerdf = os.path.getsize(self.final_path_ + self.snapshotid +
                                          "Friendship.rdf") / (10**6)
            filesizettl = os.path.getsize(self.final_path_ + self.snapshotid +
                                          "Friendship.ttl") / (10**6)
            ntriples = len(g)
            triples += [
                (self.snapshoturi, po.friendshipXMLFileSizeMB, filesizerdf),
                (self.snapshoturi, po.friendshipTTLFileSizeMB, filesizettl),
                (self.snapshoturi, po.nFriendshipTriples, ntriples),
            ]
        if self.isinteraction:
            g = P.context(self.interaction_graph)
            g.namespace_manager.bind("po", po)
            g.serialize(self.final_path_ + self.snapshotid + "Interaction.ttl",
                        "turtle")
            c("ttl")
            g.serialize(self.final_path_ + self.snapshotid + "Interaction.rdf",
                        "xml")
            c("serialized interaction")
            filesizerdf = os.path.getsize(self.final_path_ + self.snapshotid +
                                          "Interaction.rdf") / (10**6)
            filesizettl = os.path.getsize(self.final_path_ + self.snapshotid +
                                          "Interaction.ttl") / (10**6)
            ntriples = len(g)
            triples += [
                (self.snapshoturi, po.interactionXMLFileSizeMB, filesizerdf),
                (self.snapshoturi, po.interactionTTLFileSizeMB, filesizettl),
                (self.snapshoturi, po.nInteractionTriples, ntriples),
            ]
        if self.hastext:
            g = P.context(self.posts_graph)
            g.namespace_manager.bind("po", po)
            g.serialize(self.final_path_ + self.snapshotid + "Posts.ttl",
                        "turtle")
            c("ttl")
            g.serialize(self.final_path_ + self.snapshotid + "Posts.rdf",
                        "xml")
            c("serialized posts")
            filesizerdf = os.path.getsize(self.final_path_ + self.snapshotid +
                                          "Posts.rdf") / (10**6)
            filesizettl = os.path.getsize(self.final_path_ + self.snapshotid +
                                          "Posts.ttl") / (10**6)
            ntriples = len(g)
            triples += [
                (self.snapshoturi, po.postsXMLFileSizeMB, filesizerdf),
                (self.snapshoturi, po.postsTTLFileSizeMB, filesizettl),
                (self.snapshoturi, po.nPostsTriples, ntriples),
            ]
        g = P.context(self.meta_graph)
        ntriples = len(g)
        triples += [
            (self.snapshoturi, po.nMetaTriples, ntriples),
        ]
        P.add(triples, context=self.meta_graph)
        g.namespace_manager.bind("po", po)
        g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
        c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
        c("serialized meta")
        # copia o script que gera este codigo
        if not os.path.isdir(self.final_path_ + "scripts"):
            os.mkdir(self.final_path_ + "scripts")
        shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                    self.final_path_ + "scripts/triplify.py")
        # copia do base data
        if not os.path.isdir(self.final_path_ + "base"):
            os.mkdir(self.final_path_ + "base")
        originals = ""
        if self.isfriendship:
            shutil.copy(self.data_path + self.filename_friendships,
                        self.final_path_ + "base/")
            originals += "base/{}".format(self.filename_friendships)
            tfriendship = """\n\n{nf} individuals with metadata {fvars}
and {nfs} friendships constitute the friendship network in the RDF/XML file:
{frdf} \nor in the Turtle file: \n{fttl}
(anonymized: {fan}).""".format(
                nf=self.nfriends,
                fvars=str(self.friendsvars),
                nfs=self.nfriendships,
                frdf=self.frdf,
                fttl=self.fttl,
                fan=self.friendships_anonymized,
            )
        else:
            tfriendship = ""
        if self.isinteraction:
            shutil.copy(self.data_path + self.filename_interactions,
                        self.final_path_ + "base/")
            tinteraction = """\n\n{} individuals with metadata {}
and {} interactions with metadata {} constitute the interaction 
network in the RDF/XML file:
{}
or in the Turtle file:
{}
(anonymized: {}).""".format(self.ninteracted, str(self.varsfriendsinteraction),
                            self.ninteractions, str(self.interactionsvars),
                            self.irdf, self.ittl, self.interactions_anonymized)
            originals += "\nbase/{}".format(self.filename_interactions)
        else:
            tinteraction = ""
        if self.hastext:
            shutil.copy(self.data_path + self.filename_posts,
                        self.final_path_ + "base/")
            tposts = """\n\n{} posts with {:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {}
{:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {}
posts data in the RDF/XML file:
{}
or in the Turtle file:
{}""".format(self.nposts, self.mcharsposts, self.dcharsposts, self.totalchars,
             self.mtokensposts, self.dtokensposts, self.totaltokens, self.prdf,
             self.pttl)
            originals += "\nbase/{}".format(self.filename_posts)
        else:
            tposts = ""


#        P.rdf.writeAll(mnet,aname+"Meta",fpath_,1)
# faz um README
        datetime_string = P.get(r.URIRef(self.snapshoturi),
                                po.dateObtained,
                                None,
                                context="social_facebook")[2]
        #        if not os.path.isdir(self.final_path+"base"):
        #            os.mkdir(self.final_path+"base")
        with open(self.final_path_ + "README", "w") as f:
            f.write("""::: Open Linked Social Data publication
\nThis repository is a RDF data expression of the facebook
snapshot {snapid} collected around {date}.{tfriendship}{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nOriginal file(s):
{origs}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is on the script/ directory.\n:::"""
                    .format(snapid=self.snapshotid,
                            date=datetime_string,
                            tfriendship=tfriendship,
                            tinteraction=tinteraction,
                            tposts=tposts,
                            mrdf=self.mrdf,
                            mttl=self.mttl,
                            origs=originals,
                            ise=self.isego,
                            isg=self.isgroup,
                            isf=self.isfriendship,
                            isi=self.isinteraction,
                            ist=self.hastext,
                            ava=self.online_prefix,
                            desc=self.desc))
Example #34
0
    def makeMetadata(self):
        """Currently a no-op: the method returns on its first statement.

        Everything after the ``return`` is unreachable.  The unreachable body
        computes message statistics, queries counts from the translation
        graph, and adds metadata triples about ``self.snapshoturi`` to
        ``self.meta_graph``.
        """
        # NOTE(review): unconditional early return disables the whole method;
        # confirm whether the body below should be removed or re-enabled.
        return
        # Assigned but not used anywhere in this method.
        qtriples=[
                 ("?fooshout",po.shoutText,"?text"),
                 ]
        # Totals, means and standard deviations of the per-message sizes.
        self.totalchars=sum(                self.size_chars_overall)
        self.mchars_messages=n.mean(        self.size_chars_overall)
        self.dchars_messages=n.std(         self.size_chars_overall)
        self.totaltokens=sum(              self.size_tokens_overall)
        self.mtokens_messages=n.mean(      self.size_tokens_overall)
        self.dtokens_messages=n.std(       self.size_tokens_overall)
        self.totalsentences=sum(        self.size_sentences_overall)
        self.msentences_messages=n.mean(self.size_sentences_overall)
        self.dsentences_messages=n.std( self.size_sentences_overall)
        # Counts queried from the translation graph.
        self.nmessages=P.get("SELECT (COUNT(?s) as ?s) WHERE { ?s a po:Shout }",context=self.translation_graph)
        self.nparticipants=P.get("SELECT (COUNT(?s) as ?s) WHERE { ?s a po:Participant }",context=self.translation_graph)
        self.nurls=P.get("SELECT (COUNT(?s) as ?s) WHERE { ?s po:hasUrl ?o }",context=self.translation_graph)
        triples=[
                (self.snapshoturi, po.nParticipants,     self.nparticipants),
                (self.snapshoturi, po.nMessages,         self.nmessages),
                (self.snapshoturi, po.nCharsOverall,     self.totalchars),
                (self.snapshoturi, po.mCharsOverall,     self.mchars_messages),
                (self.snapshoturi, po.dCharsOverall,     self.dchars_messages),
                (self.snapshoturi, po.nTokensOverall,    self.totaltokens),
                (self.snapshoturi, po.mTokensOverall,    self.mtokens_messages),
                (self.snapshoturi, po.dTokensOverall,    self.dtokens_messages),
                (self.snapshoturi, po.nSentencesOverall, self.totalsentences),
                (self.snapshoturi, po.mSentencesOverall, self.msentences_messages),
                (self.snapshoturi, po.dSentencesOverall, self.dsentences_messages),
                ]
        P.add(triples,context=self.meta_graph)
        # NOTE(review): participantvars and messagevars are not assigned in
        # this method; presumably set elsewhere -- verify before re-enabling.
        P.rdf.triplesScaffolding(self.snapshoturi,
                [po.ParticipantAttribute]*len(self.participantvars),
                self.participantvars,context=self.meta_graph)
        P.rdf.triplesScaffolding(self.snapshoturi,
                [po.MessageAttribute]*len(self.messagevars),
                self.messagevars,context=self.meta_graph)
        # File names of the translation, then the same names prefixed with
        # online_prefix.
        P.rdf.triplesScaffolding(self.snapshoturi,
                [po.shoutXMLFilename]*len(self.translation_xml)+[po.shoutTTLFilename]*len(self.translation_ttl),
                self.translation_xml+self.translation_ttl,context=self.meta_graph)
        P.rdf.triplesScaffolding(self.snapshoturi,
                [po.onlineShoutXMLFile]*len(self.translation_xml)+[po.onlineShoutTTLFile]*len(self.translation_ttl),
                [self.online_prefix+i for i in self.translation_xml+self.translation_ttl],context=self.meta_graph)

        self.mrdf=self.snapshotid+"Meta.rdf"
        self.mttl=self.snapshotid+"Meta.ttl"
        # Human-readable summary, later attached as rdfs:comment.
        self.desc="irc dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
                                                self.snapshotid,self.snapshoturi,self.isego,self.isgroup,)
        self.desc+="\nisFriendship: {}; ".format(self.isfriendship)
        self.desc+="isInteraction: {}.".format(self.isinteraction)
        self.nchecks=P.get(r"SELECT (COUNT(?checker) as ?cs) WHERE { ?foosession po:checkParticipant ?checker}",context=self.translation_graph)
        self.desc+="\nnParticipants: {}; nInteractions: {} (only session checks in first aa).".format(self.nparticipants,self.nchecks)
        self.desc+="\nisPost: {} (alias hasText: {})".format(self.hastext,self.hastext)
        self.desc+="\nnMessages: {}; ".format(self.nmessages)

        self.desc+="\nnCharsOverall: {}; mCharsOverall: {}; dCharsOverall: {}.".format(self.totalchars,                    self.mchars_messages,     self.dchars_messages)
        self.desc+="\nnTokensOverall: {}; mTokensOverall: {}; dTokensOverall: {};".format(self.totaltokens,               self.mtokens_messages,    self.dtokens_messages)
        self.desc+="\nnSentencesOverall: {}; mSentencesOverall: {}; dSentencesOverall: {};".format(self.totalsentences,self.msentences_messages, self.dsentences_messages)
        # nAAMessages is filled with the same value as nMessages.
        self.desc+="\nnURLs: {}; nAAMessages {}.".format(self.nurls,self.nmessages)
        # NOTE(review): the protocol tag is "AA" while the Platform individual
        # and the description say "IRC"/"irc" -- confirm this is intended.
        triples=[
                (self.snapshoturi, po.triplifiedIn,      datetime.datetime.now()),
                (self.snapshoturi, po.triplifiedBy,      "scripts/"),
                (self.snapshoturi, po.donatedBy,         self.snapshotid[:-4]),
                (self.snapshoturi, po.availableAt,       self.online_prefix),
                (self.snapshoturi, po.onlineMetaXMLFile, self.online_prefix+self.mrdf),
                (self.snapshoturi, po.onlineMetaTTLFile, self.online_prefix+self.mttl),
                (self.snapshoturi, po.metaXMLFileName,   self.mrdf),
                (self.snapshoturi, po.metaTTLFileName,   self.mttl),
                (self.snapshoturi, po.totalXMLFileSizeMB, self.size_xml),
                (self.snapshoturi, po.totalTTLFileSizeMB, self.size_ttl),
                (self.snapshoturi, po.acquiredThrough,   "aa shouts in "+self.snapshotid),
                (self.snapshoturi, po.socialProtocolTag, "AA"),
                (self.snapshoturi, po.socialProtocol,    P.rdf.ic(po.Platform,"IRC",self.meta_graph,self.snapshoturi)),
                (self.snapshoturi, po.nTriples,         self.ntranslation_triples),
                (self.snapshoturi, NS.rdfs.comment,         self.desc),
                ]
        P.add(triples,self.meta_graph)
Example #35
0
    def writeAll(self):
        g=P.context(self.meta_graph)
        ntriples=len(g)
        triples=[
                 (self.snapshoturi,po.nMetaTriples,ntriples)      ,
                 ]
        P.add(triples,context=self.meta_graph)
        g.namespace_manager.bind("po",po)
        g.serialize(self.final_path_+self.snapshotid+"Meta.ttl","turtle"); c("ttl")
        g.serialize(self.final_path_+self.snapshotid+"Meta.rdf","xml")
        c("serialized meta")
        if not os.path.isdir(self.final_path_+"scripts"):
            os.mkdir(self.final_path_+"scripts")
        shutil.copy(PACKAGEDIR+"/../tests/triplify.py",self.final_path_+"scripts/triplify.py")
        # copia do base data
        text="""structure in the RDF/XML file(s):
{}
and the Turtle file(s):
{}
(anonymized: False "nicks inteface").""".format( self.nparticipants,str(self.participantvars),
                    self.nchecks,self.ndirect,self.nmention,
                    self.translation_xml,
                    self.translation_ttl)
        tposts="""\n\nThe dataset consists of {} shout messages with metadata {}
{:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {}
{:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {}
{:.3f} sentences in average (std: {:.3f}) and total sentences in snapshot: {}""".format(
                        self.nmessages,str(self.messagevars),
                        self.mcharsmessages, self.dcharsmessages,self.totalchars,
                        self.mtokensmessages,self.dtokensmessages,self.totaltokens,
                        self.msentencesmessages,self.dsentencesmessages,self.totalsentences,
                        )
        self.dates=P.get(r"SELECT ?date WHERE { GRAPH <%s> { ?fooshout po:createdAt ?date } "%(self.translation_graph,))
        self.dates=[i.isoformat() for i in self.dates]
        date1=min(self.dates)
        date2=max(self.dates)
        with open(self.final_path_+"README","w") as f:
            f.write("""::: Open Linked Social Data publication
\nThis repository is a RDF data expression of the IRC
snapshot {snapid} with tweets from {date1} to {date2}
(total of {ntrip} triples).{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is on the script/ directory.\n:::""".format(
                snapid=self.snapshotid,date1=date1,date2=date2,ntrip=self.ntriples,
                        tinteraction=tposts,
                        tposts=tposts,
                        mrdf=self.translation_xml,
                        mttl=self.translation_ttl,
                        ise=self.isego,
                        isg=self.isgroup,
                        isf=self.isfriendship,
                        isi=self.isinteraction,
                        ist=self.hastext,
                        ava=self.online_prefix,
                        desc=self.desc
                        ))
Example #36
0
    def makeMetadata(self):
        """Collect the snapshot's metadata triples into ``self.meta_graph``.

        Copies every triple about the snapshot and about its raw files from
        the ``"social_facebook"`` context, then adds file names, counts,
        boolean flags and a textual description depending on
        ``self.isfriendship``, ``self.isinteraction`` and ``self.hastext``.

        Side effects: sets ``self.isego``, ``self.isgroup``, ``self.mrdf``,
        ``self.mttl``, ``self.desc`` and, for each enabled network, the
        ``ffile/frdf/fttl``, ``ifile/irdf/ittl`` and ``pfile/prdf/pttl``
        attributes.

        Raises:
            ValueError: if a friendship snapshot has two different group ids.
        """
        if self.isfriendship and self.groupid and self.groupid2 and (
                self.groupid != self.groupid2):
            raise ValueError("Group IDS are different")
        # Bring all triples about the snapshot (and its raw files) from
        # "social_facebook" into the meta graph.
        triples = P.get(self.snapshoturi, None, None, "social_facebook")
        for rawfile in P.get(self.snapshoturi,
                             po.rawFile,
                             None,
                             "social_facebook",
                             strict=True,
                             minimized=True):
            triples += P.get(rawfile, None, None, "social_facebook")
        P.add(triples, context=self.meta_graph)

        # Parallel lists: foo["uris"][i] is the predicate for foo["vals"][i].
        # Every branch below must append to both in the same order.
        foo = {"uris": [], "vals": []}
        if self.isfriendship:
            foo["uris"] += [
                po.onlineOriginalFriendshipFile,
                po.originalFriendshipFileName,
                po.onlineFriendshipXMLFile,
                po.onlineFriendshipTTLFile,
                po.friendshipXMLFileName,
                po.friendshipTTLFileName,
                po.nFriends,
                po.nFriendships,
                po.friendshipsAnonymized,
            ] + [po.frienshipParticipantAttribute] * len(self.friendsvars)
            self.ffile = "base/" + self.filename_friendships
            self.frdf = self.snapshotid + "Friendship.rdf"
            self.fttl = self.snapshotid + "Friendship.ttl"
            foo["vals"] += [
                self.online_prefix + self.ffile,
                self.ffile,
                self.online_prefix + self.frdf,
                self.online_prefix + self.fttl,
                self.frdf,
                self.fttl,
                self.nfriends,
                self.nfriendships,
                self.friendships_anonymized,
            ] + list(self.friendsvars)

        if self.isinteraction:
            foo["uris"] += [
                po.onlineOriginalInteractionFile,
                po.originalInteractionFileName,
                po.onlineInteractionXMLFile,
                po.onlineInteractionTTLFile,
                po.interactionXMLFileName,
                po.interactionTTLFileName,
                po.nInteracted,
                po.nInteractions,
                po.interactionsAnonymized,
            ] + [po.interactionParticipantAttribute] * len(
                self.interactionsvars)
            self.ifile = "base/" + self.filename_interactions
            self.irdf = irdf = self.snapshotid + "Interaction.rdf"
            self.ittl = ittl = self.snapshotid + "Interaction.ttl"
            # Values follow the predicate order above: online location
            # before local name, nInteracted before nInteractions (both
            # pairs used to be swapped relative to their predicates).
            foo["vals"] += [
                self.online_prefix + self.ifile,
                self.ifile,
                self.online_prefix + irdf,
                self.online_prefix + ittl,
                irdf,
                ittl,
                self.ninteracted,
                self.ninteractions,
                self.interactions_anonymized,
            ] + list(self.interactionsvars)

        if self.hastext:
            foo["uris"] += [
                po.onlineOriginalPostsFile,
                po.originalPostsFileName,
                po.onlinePostsXMLFile,
                po.onlinePostsTTLFile,
                po.postsXMLFileName,
                po.postsTTLFileName,
                po.nPosts,
                po.nCharsOverall,
                po.mCharsOverall,
                po.dCharsOverall,
                po.nTokensOverall,
                po.mTokensOverall,
                po.dTokensOverall,
            ] + [po.postAttribute] * len(self.postsvars)
            self.pfile = "base/" + self.filename_posts
            self.prdf = self.snapshotid + "Post.rdf"
            self.pttl = self.snapshotid + "Post.ttl"
            foo["vals"] += [
                self.online_prefix + self.pfile,
                self.pfile,
                self.online_prefix + self.prdf,
                self.online_prefix + self.pttl,
                self.prdf,
                self.pttl,
                self.nposts,
                int(self.totalchars),
                self.mcharsposts,
                self.dcharsposts,
                int(self.totaltokens),
                self.mtokensposts,
                self.dtokensposts,
            ] + list(self.postsvars)

        foo["uris"] += [
            po.isGroup,
            po.isEgo,
            po.isFriendship,
            po.isInteraction,
            po.hasText,
            po.isPost,
        ]
        self.isego = bool(P.get(r.URIRef(self.snapshoturi), a, po.EgoSnapshot))
        self.isgroup = bool(
            P.get(r.URIRef(self.snapshoturi), a, po.GroupSnapshot))
        foo["vals"] += [
            self.isgroup, self.isego, self.isfriendship, self.isinteraction,
            self.hastext, self.hastext
        ]

        self.mrdf = self.snapshotid + "Meta.rdf"
        self.mttl = self.snapshotid + "Meta.ttl"

        self.desc = "facebook network with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
            self.snapshotid,
            self.snapshoturi,
            self.isego,
            self.isgroup,
        )
        self.desc += "\nisFriendship: {}".format(self.isfriendship)
        if self.isfriendship:
            self.desc += "; nFriends: {}; nFrienships: {}.".format(
                self.nfriends,
                self.nfriendships,
            )
        self.desc += "\nisInteraction: {}".format(self.isinteraction)
        if self.isinteraction:
            self.desc += "; nInteracted: {}; nInteractions: {}.".format(
                self.ninteracted,
                self.ninteractions,
            )
        self.desc += "\nisPost: {} (alias hasText: {})".format(
            self.hastext, self.hastext)
        if self.hastext:
            # One placeholder per argument: the template used to lack a slot
            # for nposts, which shifted every value onto the wrong label and
            # dropped totaltokens.  The line continuation also leaked source
            # indentation into the text; adjacent literals avoid that.
            self.desc += (
                ";\nnPosts: {}; mCharsPostsOverall: {}; "
                "dCharsPostsOverall: {}; totalCharsOverall: {};"
                "\nmTokensPostsOverall: {}; dTokensPostsOverall: {}; "
                "totalTokensOverall: {}").format(
                self.nposts,
                self.mcharsposts,
                self.dcharsposts,
                self.totalchars,
                self.mtokensposts,
                self.dtokensposts,
                self.totaltokens,
            )

        P.rdf.triplesScaffolding(self.snapshoturi, [
            po.triplifiedIn,
            po.triplifiedBy,
            po.donatedBy,
            po.availableAt,
            po.onlineMetaXMLFile,
            po.onlineMetaTTLFile,
            po.metaXMLFileName,
            po.metaTTLFileName,
            po.acquiredThrough,
            po.socialProtocolTag,
            po.socialProtocol,
            NS.rdfs.comment,
        ] + foo["uris"], [
            datetime.datetime.now(),
            "scripts/",
            self.snapshotid[:-4],
            self.online_prefix,
            self.online_prefix + self.mrdf,
            self.online_prefix + self.mttl,
            self.mrdf,
            self.mttl,
            "Netvizz",
            "Facebook",
            P.rdf.ic(po.Platform, "Facebook", self.meta_graph,
                     self.snapshoturi),
            self.desc,
        ] + foo["vals"], self.meta_graph)
Example #37
0
    def makeMetadata(self):
        """Add the snapshot-level metadata triples to ``self.meta_graph``.

        Sets the friendship and meta file-name attributes (``ffile``,
        ``frdf``, ``fttl``, ``mrdf``, ``mttl``), builds ``self.desc`` and
        emits the snapshot triples.  ``po.dateObtained`` and ``po.name`` are
        read back through ``P.get``; ``po.numericID``, ``po.stringID`` and
        ``po.url`` are copied from ``self.social_graph`` only when the lookup
        returns something truthy.

        The file-name, count and provenance predicates (e.g.
        ``po.onlineOriginalFriendshipFile``, ``po.numberOfFriends``,
        ``po.triplifiedBy``, ``po.donatedBy``, ``po.availableAt``, the meta
        file names, ``po.socialProtocolTag``) and the copy of the raw
        social-graph triples are disabled in this variant.
        """
        subject = self.snapshoturi

        self.ffile = "base/" + self.filename_friendships
        self.frdf = self.snapshotid + "Friendship.rdf"
        self.fttl = self.snapshotid + "Friendship.ttl"
        P.add(
            [(subject, po.friendshipsAnonymized, self.friendships_anonymized)],
            context=self.meta_graph,
        )

        self.mrdf = self.snapshotid + "Meta.rdf"
        self.mttl = self.snapshotid + "Meta.ttl"

        # The twelve spaces before "isEgo" are part of the emitted text.
        header = ("facebook network with snapshotID: {}\nsnapshotURI: {} \n"
                  "            isEgo: {}. isGroup: {}.")
        pieces = [
            header.format(self.snapshotid, self.snapshoturi, self.isego,
                          self.isgroup),
            "\nisFriendship: {}".format(self.isfriendship),
            "\nisInteraction: {}".format(self.isinteraction),
            "\nisPost: {} (hasText)".format(self.hastext),
        ]
        self.desc = "".join(pieces)

        snapshot_ref = r.URIRef(self.snapshoturi)
        obtained = P.get(snapshot_ref, po.dateObtained)
        date_obtained = obtained[2].toPython()
        assert isinstance(date_obtained, datetime.date)
        named = P.get(snapshot_ref, po.name, None, context=self.social_graph)
        name = named[2]

        statements = [
            (po.triplifiedIn, datetime.datetime.now()),
            (a, po.Snapshot),
            (po.snapshotID, self.snapshotid),
            (po.isEgo, True),
            (po.isGroup, False),
            (po.isFriendship, True),
            (po.isInteraction, False),
            (po.isPost, False),
            (po.dateObtained, date_obtained),
            (po.name, name),
            (po.acquiredThrough, "Netvizz"),
            (po.socialProtocol, "Facebook"),
            (po.comment, self.desc),
        ]
        triples = [(subject, predicate, value)
                   for predicate, value in statements]

        # Optional identifiers: copied only when the lookup finds a triple.
        for predicate in (po.numericID, po.stringID, po.url):
            found = P.get(snapshot_ref, predicate, None,
                          context=self.social_graph)
            if found:
                triples.append((subject, predicate, found[2]))

        P.add(triples, self.meta_graph)
Example #38
0
    def writeAllFB(self):
        c("started rendering of the snapshot publication. snapshotID:",self.snapshotid)
        self.final_path_="{}{}/".format(self.final_path,self.snapshotid)
        if not os.path.isdir(self.final_path_):
            os.mkdir(self.final_path_)
        #fnet,inet,mnet
        g=P.context(self.friendship_graph)
        g.namespace_manager.bind("po",po)
        g.serialize(self.final_path_+self.snapshotid+"Friendship.ttl","turtle"); c("ttl")
        g.serialize(self.final_path_+self.snapshotid+"Friendship.rdf","xml")
        c("serialized friendships")
        # get filesize and ntriples
        filesizerdf=os.path.getsize(self.final_path_+self.snapshotid+"Friendship.rdf")/(10**6)
        filesizettl=os.path.getsize(self.final_path_+self.snapshotid+"Friendship.ttl")/(10**6)
        ntriples=len(g)
        triples=[
                 (self.snapshoturi,po.friendshipXMLFileSizeMB,filesizerdf),
                 (self.snapshoturi,po.friendshipTTLFileSizeMB,filesizettl),
                 (self.snapshoturi,po.nFriendshipTriples,ntriples),
                 ]
        P.add(triples,context=self.meta_graph)
        g=P.context(self.meta_graph)
        ntriples=len(g)
        triples+=[
                 (self.snapshoturi,po.nMetaTriples,ntriples+1)      ,
                 ]
        P.add(triples,context=self.meta_graph)
        g.namespace_manager.bind("po",po)
        g.serialize(self.final_path_+self.snapshotid+"Meta.ttl","turtle"); c("ttl")
        g.serialize(self.final_path_+self.snapshotid+"Meta.rdf","xml")
        c("serialized meta")

        if not os.path.isdir(self.final_path_+"scripts"):
            os.mkdir(self.final_path_+"scripts")
        shutil.copy(S.PACKAGEDIR+"/../tests/triplify.py",self.final_path_+"scripts/triplify.py")
        # copia do base data
        if not os.path.isdir(self.final_path_+"base"):
            os.mkdir(self.final_path_+"base")
        shutil.copy(self.data_path+self.filename_friendships,self.final_path_+"base/")

        originals="base/{}".format(self.filename_friendships)
        tfriendship="""\n\n{nf} individuals with metadata {fvars}
and {nfs} friendships constitute the friendship network in the RDF/XML file:
{frdf} \in the Turtle file: \n{fttl}
(anonymized {fan}).""".format(
                        nf=self.nfriends,fvars=str(self.friendsvars),
                        nfs=self.nfriendships,
                        frdf=self.frdf,fttl=self.fttl,
                        fan=self.friendships_anonymized,
                    )
        datetime_string=P.get(self.snapshoturi,po.dateObtained,None,context="social_facebook")[2]

        with open(self.final_path_+"README","w") as f:
            f.write("""::: Open Linked Social Data publication
\nThis repository is a RDF data expression of the facebook
snapshot {snapid} collected around {date}.{tfriendship}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nOriginal file(s):
{origs}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is on the script/ directory.\n:::""".format(
                snapid=self.snapshotid,date=datetime_string,
                        tfriendship=tfriendship,
                        mrdf=self.mrdf,
                        mttl=self.mttl,
                        origs=originals,
                        ise=self.isego,
                        isg=self.isgroup,
                        isf=self.isfriendship,
                        isi=self.isinteraction,
                        ist=self.hastext,
                        ava=self.online_prefix,
                        desc=self.desc
                        ))
Example #39
0
    def makeMetadata(self):
        """Write the twitter snapshot's metadata into ``self.meta_graph``.

        Copies the triples about the snapshot and its raw files from
        ``self.social_graph``, derives the overall character/token statistics
        from ``self.nchars_all`` and ``self.ntokens_all``, and adds counts,
        tweet file names, provenance triples and a textual description.

        Side effects: sets ``totalchars``, ``mcharstweets``, ``dcharstweets``,
        ``totaltokens``, ``mtokenstweets``, ``dtokenstweets``, ``mrdf``,
        ``mttl`` and ``desc`` on ``self``.
        """
        snapshot=self.snapshoturi

        # Snapshot triples plus everything stated about each raw file.
        copied=P.get(snapshot,None,None,self.social_graph)
        rawfiles=P.get(snapshot,po.rawFile,None,self.social_graph,strict=True,minimized=True)
        for rawfile in rawfiles:
            copied+=P.get(rawfile,None,None,self.social_graph)

        # Total, mean and standard deviation per tweet.
        self.totalchars=sum(self.nchars_all)
        self.mcharstweets=n.mean(self.nchars_all)
        self.dcharstweets=n.std(self.nchars_all)
        self.totaltokens=sum(self.ntokens_all)
        self.mtokenstweets=n.mean(self.ntokens_all)
        self.dtokenstweets=n.std(self.ntokens_all)
        P.add(copied,context=self.meta_graph)

        counts=(
            (po.nParticipants,  self.nparticipants),
            (po.nTweets,        self.ntweets),
            (po.nReplies,       self.nreplies),
            (po.nRetweets,      self.nretweets),
            (po.nCharsOverall,  self.totalchars),
            (po.mCharsOverall,  self.mcharstweets),
            (po.dCharsOverall,  self.dcharstweets),
            (po.nTokensOverall, self.totaltokens),
            (po.mTokensOverall, self.mtokenstweets),
            (po.dTokensOverall, self.dtokenstweets),
        )
        P.add([(snapshot,predicate,value) for predicate,value in counts],
              context=self.meta_graph)

        n_rdf=len(self.tweet_rdf)
        n_ttl=len(self.tweet_ttl)
        P.rdf.triplesScaffolding(
            snapshot,
            [po.tweetParticipantAttribute]*len(self.participantvars),
            self.participantvars,
            context=self.meta_graph)
        P.rdf.triplesScaffolding(
            snapshot,
            [po.tweetXMLFilename]*n_rdf+[po.tweetTTLFilename]*n_ttl,
            self.tweet_rdf+self.tweet_ttl,
            context=self.meta_graph)
        P.rdf.triplesScaffolding(
            snapshot,
            [po.onlineTweetXMLFile]*n_rdf+[po.onlineTweetTTLFile]*n_ttl,
            [self.online_prefix+fname for fname in self.tweet_rdf+self.tweet_ttl],
            context=self.meta_graph)

        self.mrdf=self.snapshotid+"Meta.rdf"
        self.mttl=self.snapshotid+"Meta.ttl"

        # The description is assembled from fragments and joined once.
        ninteractions=self.nreplies+self.nretweets+self.nuser_mentions
        fragments=[
            "twitter dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
                self.snapshotid,self.snapshoturi,self.isego,self.isgroup),
            "\nisFriendship: {}; ".format(self.isfriendship),
            "isInteraction: {}.".format(self.isinteraction),
            "\nnParticipants: {}; nInteractions: {} (replies+retweets+user mentions).".format(
                self.nparticipants,ninteractions),
            "\nisPost: {} (alias hasText: {})".format(self.hastext,self.hastext),
            "\nnTweets: {}; ".format(self.ntweets),
            "nReplies: {}; nRetweets: {}; nUserMentions: {}.".format(
                self.nreplies,self.nretweets,self.nuser_mentions),
            "\nnTokens: {}; mTokens: {}; dTokens: {};".format(
                self.totaltokens,self.mtokenstweets,self.dtokenstweets),
            "\nnChars: {}; mChars: {}; dChars: {}.".format(
                self.totalchars,self.mcharstweets,self.dcharstweets),
            "\nnHashtags: {}; nMedia: {}; nLinks: {}.".format(
                self.nhashtags,self.nmedia,self.nlinks),
        ]
        self.desc="".join(fragments)

        provenance=[
            (snapshot, po.triplifiedIn,       datetime.datetime.now()),
            (snapshot, po.triplifiedBy,       "scripts/"),
            (snapshot, po.donatedBy,          self.snapshotid[:-4]),
            (snapshot, po.availableAt,        self.online_prefix),
            (snapshot, po.onlineMetaXMLFile,  self.online_prefix+self.mrdf),
            (snapshot, po.onlineMetaTTLFile,  self.online_prefix+self.mttl),
            (snapshot, po.metaXMLFileName,    self.mrdf),
            (snapshot, po.metaTTLFileName,    self.mttl),
            (snapshot, po.totalXMLFileSizeMB, sum(self.size_rdf)),
            (snapshot, po.totalTTLFileSizeMB, sum(self.size_ttl)),
            (snapshot, po.acquiredThrough,    "Twitter APIs"),
            (snapshot, po.socialProtocolTag,  "Twitter"),
            (snapshot, po.socialProtocol,
             P.rdf.ic(po.Platform,"Twitter",self.meta_graph,self.snapshoturi)),
            (snapshot, po.nTriples,           self.ntriples),
            (snapshot, NS.rdfs.comment,       self.desc),
        ]
        P.add(provenance,self.meta_graph)
Example #40
0
    def rdfMbox(self):
        for filecount, file_ in enumerate(self.files):
            if filecount % 100 == 0:
                c(self.snapshoturi, filecount)
            mbox = mailbox.mbox(self.data_path + self.directory + "/" + file_)
            if not mbox.keys():
                self.nempty += 1
                mbox.close()
                #                c("||||||||||| EMPTY MESSAGE |||||||||||||||||||||",self.snapshotid,file_,"(",filecount,")")
                continue
            if not mbox[0]["Message-Id"]:
                raise ValueError(
                    "What to do with nonempy messages without id?")
            message = mbox[0]
            gmaneid = self.makeId(message["Message-Id"])
            #c("gmaneid",gmaneid)
            if not gmaneid:
                raise ValueError("Message without id")
            messageuri = P.rdf.ic(po.EmailMessage, gmaneid,
                                  self.translation_graph, self.snapshoturi)
            self.nmessages += 1
            triples = [
                (messageuri, po.gmaneID, gmaneid),
            ]
            email, name = self.parseParticipant(message["From"])
            if not email:
                raise ValueError("message without author")
            participanturi = P.rdf.ic(po.GmaneParticipant, email,
                                      self.translation_graph, self.snapshoturi)
            if not P.get(participanturi, po.emailAddress, None,
                         self.translation_graph):
                self.nparticipants += 1
                if self.nparticipants == 100:
                    pass
            triples += [
                (messageuri, po.author, participanturi),
                (participanturi, po.emailAddress, email),
            ]
            if name:
                triples += [
                    (participanturi, po.name, name),
                ]
            subject = message["Subject"]
            if subject:
                subject = decodeHeader(subject)
                assert isinstance(subject, str)
                triples += [
                    (messageuri, po.subject, subject),
                ]
            replyid_ = message["In-Reply-To"]
            saneid = self.makeId(replyid_)
            if bool(replyid_) and not bool(saneid):
                self.nreplies += 1
                replyid = self.snapshotid + "-" + str(self.nlost_messages)
                self.nlost_messages += 1
                replymessageuri = P.rdf.ic(po.LostEmailMessage, replyid,
                                           self.translation_graph,
                                           self.snapshoturi)
                triples += [
                    (replymessageuri, a, po.EmailMessage),
                    (replymessageuri, NS.rdfs.comment,
                     "This message registered as having a reply, but the field might be ill-formed: "
                     + replyid_),
                    (messageuri, po.replyTo, replymessageuri),
                ]
            elif saneid:
                self.nreplies += 1
                replymessageuri = P.rdf.ic(po.EmailMessage, saneid,
                                           self.translation_graph,
                                           self.snapshoturi)
                triples += [
                    (replymessageuri, po.gmaneID, saneid),
                    (messageuri, po.replyTo, replymessageuri),
                ]
            if isinstance(message["Date"], str):
                datetime = parseDate(message["Date"])
            elif isinstance(message["Date"], mailbox.email.header.Header):
                datetimestring = decodeHeader(message["Date"])
                if False in [i in string.printable for i in datetimestring]:
                    datetime = None
                    triples += [
                        (messageuri, po.lostCreatedAt, True),
                    ]
                else:
                    datetime_ = re.findall(r"(.*\d\d:\d\d:\d\d).*",
                                           datetimestring)[0]
                    datetime = parseDate(datetime_)
            else:
                raise ValueError("datetime not understood")
            if datetime:
                self.dates += [datetime]
                triples += [
                    (messageuri, po.createdAt, datetime),
                ]
            if message["References"]:
                references = message["References"].replace("\n", "").replace(
                    "\t", "").replace(" ", "")
                if not re.findall(r"\A<(.*?)>\Z", references):
                    c("::: ::: ::: references field not understood",
                      message["References"])
                    triples += [
                        (messageuri, po.comment,
                         "the references are not understood (<.*> ids are added anyway): "
                         + message["References"]),
                        (messageuri, po.referencesLost, True),
                    ]
                for reference in re.findall(r"<(.*?)>", references):
                    self.nreferences += 1
                    referenceuri = P.rdf.ic(po.EmailMessage, reference,
                                            self.translation_graph,
                                            self.snapshoturi)
                    triples += [
                        (referenceuri, po.gmaneID, reference),
                        (messageuri, po.hasReference, referenceuri),
                    ]
                for part in message["References"].replace("\n", "").replace(
                        "\t", "").split():
                    if validate_email(part):
                        self.nreferences += 1
                        referenceuri = P.rdf.ic(po.EmailMessage, part,
                                                self.translation_graph,
                                                self.snapshoturi)
                        triples += [
                            (referenceuri, po.gmaneID, reference),
                            (messageuri, po.hasReference, referenceuri),
                        ]
            text = getText(message)
            if text:
                nchars = len(text)
                ntokens = len(k.wordpunct_tokenize(text))
                nsentences = len(k.sent_tokenize(text))
                triples += [
                    (messageuri, po.messageText, text),
                    (messageuri, po.nChars, nchars),
                    (messageuri, po.nTokens, ntokens),
                    (messageuri, po.nSentences, nsentences),
                ]
                self.nchars_all += [nchars]
                self.ntokens_all += [ntokens]
                self.nsentences_all += [nsentences]

                clean_text = cleanEmailBody(text)
                self.nremoved_lines += text.count("\n") - clean_text.count(
                    "\n")
                self.nlines += text.count("\n")
                nchars_clean = len(clean_text)
                ntokens_clean = len(k.wordpunct_tokenize(clean_text))
                nsentences_clean = len(k.sent_tokenize(clean_text))
                triples += [
                    (messageuri, po.messageTextClean, clean_text),
                    (messageuri, po.nCharsClean, nchars_clean),
                    (messageuri, po.nTokensClean, ntokens_clean),
                    (messageuri, po.nSentencesClean, nsentences_clean),
                ]
                self.nchars_clean_all += [nchars_clean]
                self.ntokens_clean_all += [ntokens_clean]
                self.nsentences_clean_all += [nsentences_clean]

                for url in re.findall(
                        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                        clean_text):
                    self.nurls += 1
                    triples += [
                        (messageuri, po.hasUrl, url),
                    ]

            content_type = message.get_content_type()
            if content_type:
                triples += [(messageuri, po.contentType, content_type)]
            else:
                raise ValueError("/\/\/\/\/\ message without content type")
            organization = message["Organization"]
            if organization:
                if not isinstance(organization, str):
                    organization = "".join(i for i in str(organization)
                                           if i in string.printable)
                triples += [
                    (messageuri, po.organization, organization),
                ]
            if message["cc"]:
                cc, unparsed = parseAddresses(message["cc"])
                if unparsed:
                    triples += [
                        (messageuri, po.unparsedCC, unparsed),
                    ]
                for peeraddress, peername in cc:
                    peeraddress = peeraddress.strip()
                    assert bool(peeraddress)
                    peeruri = P.rdf.ic(po.EmailPeer, peeraddress,
                                       self.translation_graph,
                                       self.snapshoturi)
                    triples += [
                        (messageuri, po.cc, peeruri),
                        (peeruri, po.emailAddress, peeraddress),
                    ]
                    self.ncc += 1
                    if peername:
                        triples += [
                            (peeruri, po.name, peername.strip()),
                        ]
            if message["to"]:
                to, unparsed = parseAddresses(message["to"])
                if unparsed:
                    triples += [
                        (messageuri, po.unparsedTo, unparsed),
                    ]
                for peeraddress, peername in to:
                    peeraddress = peeraddress.strip()
                    assert bool(peeraddress)
                    peeruri = P.rdf.ic(po.EmailPeer, peeraddress,
                                       self.translation_graph,
                                       self.snapshoturi)
                    triples += [
                        (messageuri, po.to, peeruri),
                        (peeruri, po.emailAddress, peeraddress),
                    ]
                    self.nto += 1
                    if peername:
                        triples += [
                            (peeruri, po.name, peername.strip()),
                        ]
            listid = message["list-id"]
            if listid:
                assert isinstance(listid, str)
                listid = listid.replace("\n", "").replace("\t", "")
                if listid.count("<") == listid.count(">") == listid.count(
                        " ") == 0:
                    listname = ""
                    listid_ = listid
                elif listid.count("<") == listid.count(">") == 0:
                    parts = listid.split()
                    lens = [len(i) for i in parts]
                    listid_ = [i for i in parts if len(i) == max(lens)][0]
                    listname = " ".join(i for i in parts
                                        if len(i) != max(lens))
                elif listid.count("<") == listid.count(">") == 1:
                    listname, listid_ = re.findall(r"(.*) {0,1}<(.*)>",
                                                   listid)[0]
                else:
                    raise ValueError("Unexpected listid string format")
                listuri = P.rdf.ic(po.EmailList, listid_,
                                   self.translation_graph, self.snapshoturi)
                triples += [
                    (messageuri, po.emailList, listuri),
                    (listuri, po.listID, listid_),
                ]
                if listname:
                    triples += [
                        (listuri, po.name, listname.strip()),
                    ]
            P.add(triples, self.translation_graph)
            mbox.close()