Example #1
 def writeRdf(self):
     pub_dir = './participabr_snapshot/'
     if not os.path.isdir(pub_dir):
         os.mkdir(pub_dir)
     g = P.context(self.translation_graph)
     g.serialize(pub_dir+'participabr.ttl', 'turtle')
     c('participation ttl serialized')
     g.serialize(pub_dir+'participabr.rdf', 'xml')
     c('participation xml serialized')
     # metadata: group, platform
     triples = [
              (self.snapshoturi, a, po.Snapshot),
              # (self.snapshoturi, a, po.ParticipabrSnapshot),
              (self.snapshoturi, po.snapshotID, self.snapshotid),
              (self.snapshoturi, po.isEgo, False),
              (self.snapshoturi, po.isGroup, True),
              (self.snapshoturi, po.isFriendship, True),
              (self.snapshoturi, po.isInteraction, True),
              (self.snapshoturi, po.isPost, True),
              (self.snapshoturi, po.socialProtocol, 'ParticipaBR'),
              (self.snapshoturi, po.dateObtained, datetime.date(2012, 6, 28)),
              ]
     P.add(triples, self.meta_graph)
     g = P.context(self.meta_graph)
     g.serialize(pub_dir+'participabrMeta.ttl', 'turtle')
     c('participation meta ttl serialized')
     g.serialize(pub_dir+'participabrMeta.rdf', 'xml')
     c('participation meta xml serialized')
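P.context(name) evidently returns an rdflib graph for a named context, which the method serializes twice. A minimal sketch of the same flow with plain rdflib (the store layout and the context identifier are assumptions):

import os
from rdflib import ConjunctiveGraph, URIRef

cg = ConjunctiveGraph()
g = cg.get_context(URIRef("urn:participabr"))  # named context; identifier assumed
# ... triples would be added to g here ...
pub_dir = './participabr_snapshot/'
if not os.path.isdir(pub_dir):
    os.mkdir(pub_dir)
g.serialize(pub_dir + 'participabr.ttl', 'turtle')
g.serialize(pub_dir + 'participabr.rdf', 'xml')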
Example #2
 def rdfTweets(self):
     tweets=[]
     if self.pickle_filename1:
         tweets+=readPickleTweetFile( self.data_path+self.pickle_filename1)[0]
     if self.pickle_filename2:
         tweets,fopen=readPickleTweetChunk(self.data_path+self.pickle_filename2,tweets,None,10000) # read the first chunk (up to 10k tweets)
     chunk_count=0
     self.tweets=tweets # for probing only, remove to release memory
     while tweets:
         c("rendering tweets, chunk:",chunk_count,"ntweets:",len(tweets),"snapshotid",self.snapshotid)
         for tweet in tweets:
             tweeturi,triples=self.tweetTriples(tweet)
             if "retweeted_status" in tweet.keys():
                 self.nretweets+=1
                 # render the embedded retweeted status, not the retweet again
                 tweeturi0,triples0=self.tweetTriples(tweet["retweeted_status"])
                 triples+=triples0
                 triples+=[(tweeturi,po.retweetOf,tweeturi0)]
             self.ntriples+=len(triples)
             P.set_(triples,context=self.tweet_graph)
             c("rendered",self.ntweets,"tweets")
         c("end of chunk:",chunk_count,"ntriples:",self.ntriples)
         self.writeTweets(chunk_count)
         c("chunk has been written")
         chunk_count+=1
         if chunk_count==2:
             break # probing limit: render only the first two chunks
         if self.pickle_filename2:
             tweets,fopen=readPickleTweetChunk(None,[],fopen,10000)
         else:
             tweets=[]
     for i in range(chunk_count): # free memory
         P.context(self.tweet_graph[:-1]+str(i),"remove")
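readPickleTweetFile and readPickleTweetChunk are project helpers not shown in these examples. A minimal sketch of the chunked-reading contract the loop above relies on (the name, signature and behavior are assumptions inferred from the call sites):

import pickle

def read_pickle_tweet_chunk(filename=None, tweets=None, fopen=None, chunk_size=10000):
    # The first call passes a filename; later calls pass the open handle back in.
    if fopen is None:
        fopen = open(filename, "rb")
    tweets = list(tweets or [])
    try:
        while len(tweets) < chunk_size:
            tweets.append(pickle.load(fopen))  # one pickled tweet per load
    except EOFError:
        fopen.close()
        fopen = None  # file exhausted; the caller's while-loop then terminates
    return tweets, fopen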
Example #3
    def __init__(self,snapshoturi,snapshotid,filename_friendships="foo.gml",\
            data_path="../data/facebook/",final_path="./facebook_snapshots/",umbrella_dir="facebook_snapshots/"):
        self.friendship_graph="social_facebook_friendships"
        self.meta_graph="social_facebook_meta"
        self.social_graph="social_facebook"
        P.context(self.friendship_graph,"remove")
        P.context(self.meta_graph,"remove")
        self.snapshotid=snapshotid
        self.snapshoturi=snapshoturi
        self.online_prefix="https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(umbrella_dir,self.snapshotid)
        self.isego=True
        self.isgroup=False
        self.isfriendship=True
        self.isinteraction=False
        self.hastext=False
        self.friendships_anonymized=True

        #friendship_network=x.read_gml(data_path+filename_friendships)
        with open(data_path+filename_friendships) as f:
            lines=f.readlines()
        friendship_network=x.readwrite.gml.parse_gml_lines(lines,"id",None)
        locals_=locals().copy()
        # bulk-copy the constructor locals onto the instance
        for i in locals_:
            if i !="self":
                exec("self.{}={}".format(i,i))
        self.rdfFriendshipNetwork(friendship_network)
        self.makeMetadata()
        self.writeAllFB()
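x.readwrite.gml.parse_gml_lines is a private networkx helper. The public API reaches the same result; a sketch, assuming x is networkx and that nodes should be relabeled by the GML id attribute (which is what the "id" argument above requests):

import networkx as nx

with open(data_path + filename_friendships) as f:
    friendship_network = nx.parse_gml(f.read(), label="id")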
Example #4
def startSession(context="session"):
    current_user_uri=P.get(NS.per.currentUser) # from rdf.rdflib OK
    now=datetime.now()
    P.context("session","remove")
    if not current_user_uri:
        nick=randomNick() # OK
        current_user_uri=P.rdf.timestampedURI(NS.per.Participant,nick,now) # rdf.rdflib OK
        triples=[
                (current_user_uri, a, NS.per.DefaultParticipant),
                (current_user_uri, NS.per.nick, nick),
                (current_user_uri, NS.per.registered, now),
                ]
        c("Please create a user with P.utils.createUser() ASAP. Registered for now as {} with URI: {}".format(nick,current_user_uri))
    else:
        triples=[]
        # assumption: fetch the stored user's nick, needed below to mint the session URI
        nick=P.get(current_user_uri,NS.per.nick,None)[2]
    session_uri=P.rdf.timestampedURI(NS.per.Session,nick,now) # from rdf.rdflib OK
    current_status_uri=NS.per.CurrentStatus # class in per: ontology OK
    triples+=[
             (current_status_uri,NS.per.currentSession,session_uri),
             (session_uri,NS.per.started,now),
             (session_uri,NS.per.user,current_user_uri),
             (current_status_uri,NS.per.currentUser,current_user_uri),
             ]
    P.set_(triples,context=context) # from rdf.rdflib OK
    #P.rdf.minimumOntology() # from rdf.ontology
    P.rdf.ontology.minimumTestOntology() # from rdf.ontology
    #P.legacy.triples.datasets.datasets() # from legacy.triples
    P.legacy.triples.datasets.minimalTestData() # from legacy.triples
    P.rdf.inference.performRdfsInference("void","minimum_ontology","session_legacy_metadata") # from rdf.inference
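P.rdf.timestampedURI is internal to the project. A plausible minimal implementation, shown only to make the URI-minting step concrete (the URI shape is an assumption):

from rdflib import URIRef

def timestamped_uri(class_uri, label, when):
    # e.g. .../Session#alice-2016-01-01T12:00:00 -- unique per (label, time)
    return URIRef("{}#{}-{}".format(class_uri, label, when.isoformat()))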
Example #5
def parseLegacyFiles(data_dir=DATADIR+"twitter/"):
    """Parse legacy pickle files with Twitter tweets"""
    filenames=os.listdir(data_dir)
    filenames=[i for i in filenames if i!="ipython_log.py" and not i.endswith(".swp")]

    snapshots=set()
    triples=[]
    for filename in filenames:
        snapshotid="twitter-legacy-"+filename.replace("_","")
        snapshoturi=po.TwitterSnapshot+"#"+snapshotid
        expressed_classes=[po.Participant,po.Tweet]
        expressed_reference=filename.replace("_","").replace(".pickle","")
        name_humanized="Twitter"+expressed_reference
        filesize=os.path.getsize(data_dir+filename)/10**6
        fileformat="pickle"
        fileuri=po.File+"#twitter-file-"+filename
        triples+=[
                 (snapshoturi,a,po.Snapshot),
                 (snapshoturi,a,po.TwitterSnapshot),
                 (snapshoturi,po.snapshotID,snapshotid),
                 (snapshoturi, po.isEgo, False),
                 (snapshoturi, po.isGroup, True),
                 (snapshoturi, po.isFriendship, False),
                 (snapshoturi, po.isInteraction, True),
                 (snapshoturi, po.isPost, True),
                 (snapshoturi, po.humanizedName, name_humanized),
                 (snapshoturi, po.expressedReference, expressed_reference),
                 (snapshoturi, po.rawFile, fileuri),
                 (fileuri,     po.fileSize, filesize),
                 (fileuri,     po.fileName, filename),
                 (fileuri,     po.fileFormat, fileformat),
                 ]+[
                 (fileuri,    po.expressedClass, expressed_class) for expressed_class in expressed_classes
                 ]
        snapshots.add(snapshoturi)
    nfiles=len(filenames)
    nsnapshots=len(snapshots)
    P.context("social_twitter","remove")
    platformuri=P.rdf.ic(po.Platform,"Twitter",context="social_twitter")
    triples+=[
             (NS.social.Session,NS.social.nTwitterParsedFiles,nfiles),
             (NS.social.Session,NS.social.nTwitterSnapshots,nsnapshots),
             (platformuri, po.dataDir,data_dir),
             ]
    P.add(triples,context="social_twitter")
    c("parsed {} twitter files ({} snapshots) are in percolation graph and 'social_twitter' context".format(nfiles,nsnapshots))
    c("percolation graph have {} triples ({} in social_twitter context)".format(len(P.percolation_graph),len(P.context("social_twitter"))))
    negos=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE         { GRAPH <social_twitter> { ?s po:isEgo true         } } ")
    ngroups=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE       { GRAPH <social_twitter> { ?s po:isGroup true       } } ")
    nfriendships=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE  { GRAPH <social_twitter> { ?s po:isFriendship true  } } ")
    ninteractions=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isInteraction true } } ")
    nposts=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE        { GRAPH <social_twitter> { ?s po:isPost true        } } ")
    totalsize=sum(P.query(r" SELECT ?size WHERE              { GRAPH <social_twitter> { ?s po:fileSize ?size     } } "))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship network. {} have an interaction network. {} have post texts and reaction counts
Total raw data size is {:.2f}MB""".format(negos,ngroups,nfriendships,ninteractions,nposts,totalsize))

    return snapshots
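P.query evidently runs SPARQL against the percolation store. Counting snapshots by a boolean flag with plain rdflib would look like the following (the graph name and the po prefix binding are assumptions):

q = """SELECT (COUNT(?s) AS ?cs) WHERE {
         GRAPH <social_twitter> { ?s po:isEgo true }
       }"""
row = next(iter(P.percolation_graph.query(q, initNs={"po": po})))
negos = row[0].toPython()  # the COUNT comes back as an xsd:integer literal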
Example #6
    def __init__(self,snapshoturi,snapshotid,directory="somedir/",\
            data_path="../data/",final_path="./gmane_snapshots/",umbrella_dir="gmane_snapshotsX/"):
        c(snapshoturi, snapshotid, directory)
        isego = False
        isgroup = True
        isfriendship = False
        isinteraction = True
        hastext = True
        interactions_anonymized = False

        translation_graph = "translation"
        meta_graph = "translation_meta"
        gmane_graph = "gmane"
        P.context(translation_graph, "remove")
        P.context(meta_graph, "remove")
        final_path_ = "{}{}/".format(final_path, snapshotid)
        online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(
            umbrella_dir, snapshotid)
        ncc = nto = nlines = nremoved_lines = nurls = nlost_messages = nparticipants = nreferences = totalchars = nreplies = nmessages = nempty = 0
        dates = []
        participantvars = ["emailAddress", "name"]
        messagevars = [
            "author", "createdAt", "replyTo", "messageText",
            "cleanMessageText", "nCharsClean", "nTokensClean",
            "nSentencesClean", "hasUrl", "nChars", "nTokens", "nSentences",
            "emptyMessage", "gmaneID", "subject", "cc", "to", "hasReference",
            "contentType", "organization", "unparsedCC", "unparsedTo",
            "emailList"
        ]
        messagevars.sort()
        files = os.listdir(data_path + directory)
        if not files:
            self.comment = "no files on the snapshot id"
            return
        files.sort()
        nchars_all = []
        ntokens_all = []
        nsentences_all = []
        nchars_clean_all = []
        ntokens_clean_all = []
        nsentences_clean_all = []
        locals_ = locals().copy()
        del locals_["self"]
        for i in locals_:
            exec("self.{}={}".format(i, i))
        self.rdfMbox()
        if len(self.files) > self.nempty:
            if not os.path.isdir(final_path_):
                os.mkdir(final_path_)
            self.email_xml, self.size_xml, self.email_ttl, self.size_ttl = P.rdf.writeByChunks(
                self.final_path_ + self.snapshotid + "Email",
                context=self.translation_graph,
                ntriples=100000)
            self.makeMetadata()
            self.writeAllGmane()
Example #7
def parseLegacyFiles(data_dir=DATADIR+"irc/"):
    """Parse legacy txt files with irc logs"""
    filenames=os.listdir(data_dir)
    filenames=[i for i in filenames if i!="ipython_log.py" and not i.endswith(".swp")]

    snapshots=set()
    triples=[]
    for filename in filenames:
        snapshotid="irc-legacy-"+filename.replace("#","")
        snapshoturi=po.IRCSnapshot+"#"+snapshotid
        expressed_classes=[po.Participant,po.IRCMessage]
        expressed_reference=filename.replace("#","").replace(".txt","").replace(".log","")
        name_humanized="IRC log of channel "+expressed_reference
        filesize=os.path.getsize(data_dir+filename)/10**6
        fileformat="txt"
        fileuri=po.File+"#Irc-log-"+filename.replace("#","")
        triples+=[
                 (snapshoturi,a,po.Snapshot),
                 (snapshoturi,a,po.IRCSnapshot),
                 (snapshoturi,po.snapshotID,snapshotid),
                 (snapshoturi, po.isEgo, False),
                 (snapshoturi, po.isGroup, True),
                 (snapshoturi, po.isFriendship, False),
                 (snapshoturi, po.isInteraction, True),
                 (snapshoturi, po.isPost, True),
                 (snapshoturi, po.humanizedName, name_humanized),
                 (snapshoturi, po.expressedReference, expressed_reference),
                 (snapshoturi, po.rawFile, fileuri),
                 (fileuri,     po.fileSize, filesize),
                 (fileuri,     po.fileName, filename),
                 (fileuri,     po.fileFormat, fileformat),
                 ]+[
                 (fileuri,    po.expressedClass, expressed_class) for expressed_class in expressed_classes
                 ]
        snapshots.add(snapshoturi)
    nfiles=len(filenames)
    nsnapshots=len(snapshots)
    P.context("social_irc","remove")
    platformuri=P.rdf.ic(po.Platform,"IRC",context="social_irc")
    triples+=[
             (NS.social.Session,NS.social.nIRCParsedFiles,nfiles),
             (NS.social.Session,NS.social.nIRCSnapshots,nsnapshots),
             (platformuri, po.dataDir,data_dir),
             ]
    P.add(triples,context="social_irc")
    c("parsed {} irc logs files ({} snapshots) are in percolation graph and 'irc_twitter' context".format(nfiles,nsnapshots))
    c("percolation graph have {} triples ({} in social_irc context)".format(len(P.percolation_graph),len(P.context("social_irc"))))
    negos=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE         { GRAPH <social_irc> { ?s po:isEgo true         } } ")
    ngroups=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE       { GRAPH <social_irc> { ?s po:isGroup true       } } ")
    nfriendships=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE  { GRAPH <social_irc> { ?s po:isFriendship true  } } ")
    ninteractions=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isInteraction true } } ")
    nposts=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE        { GRAPH <social_irc> { ?s po:isPost true        } } ")
    totalsize=sum(P.query(r" SELECT ?size WHERE              { GRAPH <social_irc> { ?s po:fileSize ?size     } } "))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship structures. {} have an interaction structures. {} have texts 
Total raw data size is {:.2f}MB""".format(negos,ngroups,nfriendships,ninteractions,nposts,totalsize))
    return snapshots
Example #8
    def __init__(self, snapshoturi, snapshotid, directory="somedir/",
                 data_path="../data/", final_path="./gmane_snapshots/",
                 umbrella_dir="gmane_snapshotsX/"):
        c(snapshoturi, snapshotid, directory)
        isego = False
        isgroup = True
        isfriendship = False
        isinteraction = True
        hastext = True
        interactions_anonymized = False

        translation_graph = "translation"
        meta_graph = "translation_meta"
        gmane_graph = "gmane"
        P.context(translation_graph, "remove")
        P.context(meta_graph, "remove")
        final_path_ = "{}{}/".format(final_path, snapshotid)
        online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(umbrella_dir, snapshotid)
        ncc = nto = nlines = nremoved_lines = nurls = nlost_messages = nparticipants = nreferences = totalchars = nreplies = nmessages = nempty = 0
        dates = []
        participantvars = ["emailAddress", "name"]
        messagevars = ["author", "createdAt", "replyTo", "messageText",
                       "cleanMessageText", "nCharsClean", "nTokensClean",
                       "nSentencesClean", "hasUrl", "nChars", "nTokens",
                       "nSentences", "emptyMessage", "gmaneID", "subject",
                       "cc", "to", "hasReference", "contentType", "organization",
                       "unparsedCC", "unparsedTo", "emailList"]
        messagevars.sort()
        files = os.listdir(data_path+directory)
        if not files:
            self.comment = "no files on the snapshot id"
            return
        files.sort()
        nchars_all = []
        ntokens_all = []
        nsentences_all = []
        nchars_clean_all = []
        ntokens_clean_all = []
        nsentences_clean_all = []
        locals_ = locals().copy()
        del locals_["self"]
        for i in locals_:
            exec("self.{}={}".format(i, i))
        self.rdfMbox()
        if len(self.files) > self.nempty:
            if not os.path.isdir(final_path_):
                os.mkdir(final_path_)
            self.email_xml, self.size_xml, self.email_ttl, self.size_ttl = P.rdf.writeByChunks(
                self.final_path_+self.snapshotid+"Email", context=self.translation_graph, ntriples=100000)
            self.makeMetadata()
            self.writeAllGmane()
Example #9
    def __init__(self,snapshoturi,snapshotid,filenames=("foo.pickle",),\
            data_path="../data/twitter/",final_path="./twitter_snapshots/",umbrella_dir="twitter_snapshots/"):
        if len(filenames) == 2:
            pickle_filename1 = filenames[0]
            pickle_filename2 = filenames[1]
        elif filenames[0].count("_") == 1:
            pickle_filename1 = filenames[0]
            pickle_filename2 = ""
        elif filenames[0].count("_") == 2:
            pickle_filename1 = ""
            pickle_filename2 = filenames[0]
        else:
            raise ValueError("Filenames not understood")
        participantvars=["stringID","numericID","screenName","favouritesCount","followersCount","friendsCount",\
                "language","listedCount","name","statusesCount","createdAt","utfOffset","snapshot"]
        participantvars.sort()
        tweetvars = [
            "author", "nChars", "nTokens", "stringID", "createdAt", "message",
            "retweetCount", "language", "inReplyToTweet", "retweetOf",
            "expandedURL", "hashtag", "snapshot", "userMention", "media"
        ]
        isego = False
        isgroup = True
        isfriendship = False
        isinteraction = True
        hastext = True
        interactions_anonymized = False

        tweet_graph = "social_tweets0"
        meta_graph = "social_twitter_meta"
        social_graph = "social_twitter"
        P.context(tweet_graph, "remove")
        P.context(meta_graph, "remove")

        final_path_ = "{}{}/".format(final_path, snapshotid)
        online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(
            umbrella_dir, snapshotid)
        dates = []
        size_rdf = []
        size_ttl = []
        tweet_rdf = []
        tweet_ttl = []
        nchars_all = []
        ntokens_all = []
        ntriples = nhashtags = nmedia = nlinks = nuser_mentions = nparticipants = nretweets = ntweets = nreplies = anonymous_user_count = anonymous_tweet_count = 0
        locals_ = locals().copy()
        del locals_["self"]
        for i in locals_:
            exec("self.{}={}".format(i, i))
        self.rdfTweets()
        self.makeMetadata()
        self.writeAllTW()
Example #10
 def __init__(self,
              snapshoturi,
              snapshotid,
              filename_friendships=None,
              filename_interactions=None,
              filename_posts=None,
              data_path="../data/facebook/",
              final_path="./facebook_snapshots/",
              umbrella_dir="facebook_snapshots/"):
     self.friendship_graph = "social_facebook_friendships"
     self.interaction_graph = "social_facebook_interactions"
     self.meta_graph = "social_facebook_meta"
     self.posts_graph = "social_facebook_posts"
     self.social_graph = "social_facebook"
     P.context(self.friendship_graph, "remove")
     P.context(self.interaction_graph, "remove")
     P.context(self.meta_graph, "remove")
     P.context(self.posts_graph, "remove")
     self.snapshotid = snapshotid
     self.snapshoturi = snapshoturi
      self.online_prefix = (
          "https://raw.githubusercontent.com/"
          "OpenLinkedSocialData/{}master/{}/"
      ).format(umbrella_dir, self.snapshotid)
     self.isfriendship = bool(filename_friendships)
     self.isinteraction = bool(filename_interactions)
     self.hastext = bool(filename_posts)
     self.nfriends = self.nfriendships = self.ninteracted = \
         self.ninteractions = self.nposts = 0
     if self.isfriendship:
         # return networkx graph
         fnet = readGDF(data_path + filename_friendships)
         # writes to self.friendship_graph
         fnet_ = self.rdfFriendshipNetwork(fnet)
     if self.isinteraction:
         inet = readGDF(data_path + filename_interactions)  # to networkx
         self.rdfInteractionNetwork(inet)  # to self.interaction_graph
     else:
         self.groupid2 = 0
     if self.hastext:
         self.rdfGroupPosts(data_path +
                            filename_posts)  # to self.posts_graph
     self.observation_count = 0
      locals_ = locals().copy()
      for i in locals_:
          if i != "self":
              # setattr avoids the quoting pitfalls of the exec-built
              # assignments (a string value containing a quote breaks them)
              setattr(self, i, locals_[i])
     self.makeMetadata()  # rdflib graph with metadata
     self.writeAllFB()  # write linked data tree
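Why setattr instead of exec in the loop above: the exec form interpolates the value into source text, so a string value containing a quote produces invalid code. A tiny demonstration:

class Box(object):
    pass

b = Box()
value = "it's got a quote"
setattr(b, "note", value)            # fine
assert b.note == value
# exec("b.note='{}'".format(value))  # SyntaxError: the quote inside the
                                     # value closes the string literal early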
Example #11
 def __init__(self,
              snapshoturi,
              snapshotid,
              filename="foo.txt",
              data_path="../data/irc/",
              final_path="./irc_snapshots/",
              umbrella_dir="irc_snapshots/"):
     c(snapshoturi, snapshotid, filename)
     isego = False
     isgroup = True
     isfriendship = False
     isinteraction = True
     hastext = True
     interactions_anonymized = False
     irc_graph = "social_log"
     meta_graph = "social_irc_meta"
     social_graph = "social_irc"
     P.context(irc_graph, "remove")
     P.context(meta_graph, "remove")
     final_path_ = "{}{}/".format(final_path, snapshotid)
     online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(
         umbrella_dir, snapshotid)
     naamessages = nurls = ndirect = nmention = 0
     dates = []
     nchars_all = []
     ntokens_all = []
     nsentences_all = []
     participantvars = ["nick"]
     messagevars = [
         "author", "createdAt", "mentions", "directedTo", "systemMessage",
         "text", "cleanMessageText", "nChars", "nTokens", "nSentences",
         "url", "emptyMessage"
     ]
     messagevars.sort()
     locals_ = locals().copy()
     del locals_["self"]
     for i in locals_:
         exec("self.{}={}".format(i, i))
     self.rdfLog()
     self.makeMetadata()
     self.writeAllIRC()
Example #12
 def writeTweets(self, chunk_count):
     if not os.path.isdir(self.final_path):
         os.mkdir(self.final_path)
     if not os.path.isdir(self.final_path_):
         os.mkdir(self.final_path_)
     filename = self.snapshotid + "Tweet{:05d}".format(chunk_count)
     g = P.context(self.tweet_graph)
     g.namespace_manager.bind("po", po)
     tttl = filename + ".ttl"
     trdf = filename + ".rdf"
     g.serialize(self.final_path_ + tttl, "turtle")
     c("ttl")
     g.serialize(self.final_path_ + trdf, "xml")
     filesizettl = os.path.getsize(self.final_path_ + tttl) / (10**6)
     filesizerdf = os.path.getsize(self.final_path_ + trdf) / (10**6)
     self.tweet_ttl += [tttl]
     self.size_ttl += [filesizettl]
     self.tweet_rdf += [trdf]
     self.size_rdf += [filesizerdf]
     # self.tweet_graph = self.tweet_graph[:-1]+str(chunk_count+1)
     P.context(self.tweet_graph, 'remove')
Example #13
    def __init__(self,snapshoturi,snapshotid,filenames=("foo.pickle",),\
            data_path="../data/twitter/",final_path="./twitter_snapshots/",umbrella_dir="twitter_snapshots/"):
        if len(filenames)==2:
            pickle_filename1=filenames[0]
            pickle_filename2=filenames[1]
        elif filenames[0].count("_")==1:
            pickle_filename1=filenames[0]
            pickle_filename2=""
        elif filenames[0].count("_")==2:
            pickle_filename1=""
            pickle_filename2=filenames[0]
        else:
            raise ValueError("Filenames not understood")
        participantvars=["stringID","numericID","screenName","favouritesCount","followersCount","friendsCount",\
                "language","listedCount","name","statusesCount","createdAt","utfOffset","snapshot"]
        participantvars.sort()
        tweetvars=["author","nChars","nTokens","stringID","createdAt","message","retweetCount","language","inReplyToTweet","retweetOf","expandedURL","hashtag","snapshot","stringID","retweetOf","userMention","media"]
        isego=False
        isgroup=True
        isfriendship=False
        isinteraction=True
        hastext=True
        interactions_anonymized=False

        tweet_graph="social_tweets0"
        meta_graph="social_twitter_meta"
        social_graph="social_twitter"
        P.context(tweet_graph,"remove")
        P.context(meta_graph,"remove")

        final_path_="{}{}/".format(final_path,snapshotid)
        online_prefix="https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(umbrella_dir,snapshotid)
        dates=[]; size_rdf=[]; size_ttl=[]; tweet_rdf=[]; tweet_ttl=[]; nchars_all=[]; ntokens_all=[]
        ntriples=nhashtags=nmedia=nlinks=nuser_mentions=nparticipants=nretweets=ntweets=nreplies=anonymous_user_count=anonymous_tweet_count=0
        locals_=locals().copy(); del locals_["self"]
        for i in locals_:
            exec("self.{}={}".format(i,i))
        self.rdfTweets()
        self.makeMetadata()
        self.writeAllTW()
Example #14
 def writeAllTW(self):
     # write meta and readme with self.desc, then all is finished.
     g = P.context(self.meta_graph)
     # ntriples = len(g)
     # triples = [
     #          (self.snapshoturi, po.nMetaTriples, ntriples),
     #          ]
     # P.add(triples, context=self.meta_graph)
     g.namespace_manager.bind("po", po)
     g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
     c("ttl")
     g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
     c("serialized meta")
Example #15
 def writeAllGmane(self):
     g = P.context(self.meta_graph)
     g.namespace_manager.bind("po", po)
     # ntriples = len(g)
     # triples = [
     #          (self.snapshoturi, po.nMetaTriples, ntriples),
     #          ]
     # P.add(triples, context=self.meta_graph)
     g.serialize(self.final_path_+self.snapshotid+"Meta.ttl", "turtle")
     c("ttl")
     g.serialize(self.final_path_+self.snapshotid+"Meta.rdf", "xml")
     c("serialized meta")
Example #16
 def writeAllIRC(self):
     # g = P.context(self.meta_graph)
     # ntriples = len(g)
     # triples = [
     #          (self.snapshoturi, po.nMetaTriples, ntriples+1),
     #          ]
     # P.add(triples, context=self.meta_graph)
     g = P.context(self.meta_graph)
     g.namespace_manager.bind("po", po)
     g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
     c("ttl")
     g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
     c("serialized meta")
Example #17
    def writeAll(self):
        g=P.context(self.meta_graph)
        ntriples=len(g)
        triples=[
                 (self.snapshoturi,po.nMetaTriples,ntriples)      ,
                 ]
        P.add(triples,context=self.meta_graph)
        g.namespace_manager.bind("po",po)
        g.serialize(self.final_path_+self.snapshotid+"Meta.ttl","turtle"); c("ttl")
        g.serialize(self.final_path_+self.snapshotid+"Meta.rdf","xml")
        c("serialized meta")
        if not os.path.isdir(self.final_path_+"scripts"):
            os.mkdir(self.final_path_+"scripts")
        shutil.copy(PACKAGEDIR+"/../tests/triplify.py",self.final_path_+"scripts/triplify.py")
        # copy of the base data

        self.dates=[i.isoformat() for i in self.dates]
        date1=min(self.dates)
        date2=max(self.dates)
        with open(self.final_path_+"README","w") as f:
            f.write("""::: Open Linked Social Data publication
\nThis repository is a RDF data expression of the IRC
snapshot {snapid} with messages from {date1} to {date2}
(total of {ntrip} triples).{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is on the script/ directory.\n:::""".format(
                snapid=self.snapshotid,date1=date1,date2=date2,ntrip=self.ntriples,
                        # assumes the tinteraction/tposts summary strings are
                        # built earlier in this method, as in writeAllTW (Example #28)
                        tinteraction=tinteraction,
                        tposts=tposts,
                        mrdf=self.translation_xml,
                        mttl=self.translation_ttl,
                        ise=self.isego,
                        isg=self.isgroup,
                        isf=self.isfriendship,
                        isi=self.isinteraction,
                        ist=self.hastext,
                        ava=self.online_prefix,
                        desc=self.desc
                        ))
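Taking min and max over the isoformat strings above is safe because ISO-8601 timestamps sort lexicographically in chronological order:

import datetime

dates = [datetime.datetime(2014, 5, 1), datetime.datetime(2013, 1, 15)]
iso = [d.isoformat() for d in dates]
assert min(iso) == "2013-01-15T00:00:00"  # lexicographic == chronological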
Example #18
 def __init__(self,
              snapshoturi,
              snapshotid,
              filename_friendships="foo.gml",
              data_path="../data/facebook/",
              final_path="./facebook_snapshots/",
              umbrella_dir="facebook_snapshots/"):
     self.friendship_graph = "social_facebook_friendships"
     self.meta_graph = "social_facebook_meta"
     self.social_graph = "social_facebook"
     P.context(self.friendship_graph, "remove")
     P.context(self.meta_graph, "remove")
     self.snapshotid = snapshotid
     self.snapshoturi = snapshoturi
      self.online_prefix = (
          "https://raw.githubusercontent.com/"
          "OpenLinkedSocialData/{}master/{}/"
      ).format(umbrella_dir, self.snapshotid)
     # participant_uri = P.rdf.ic(po.Snapshot, self.snapshotid,
     #                            self.friendship_graph)
     self.isego = True
     self.isgroup = False
     self.isfriendship = True
     self.isinteraction = False
     self.hastext = False
     self.friendships_anonymized = True
     # friendship_network=x.read_gml(data_path+filename_friendships)
     with open(data_path + filename_friendships) as f:
         lines = f.readlines()
     friendship_network = x.readwrite.gml.parse_gml_lines(lines, "id", None)
      locals_ = locals().copy()
      # bulk-copy the constructor locals onto the instance
      for i in locals_:
          if i != "self":
              exec("self.{}={}".format(i, i))
     self.rdfFriendshipNetwork(friendship_network)
     self.makeMetadata()
     self.writeAllFB()
Example #19
 def rdfTweets(self):
     tweets = []
     if self.pickle_filename1:
         tweets += readPickleTweetFile(self.data_path +
                                       self.pickle_filename1)[0]
     if self.pickle_filename2:
         tweets, fopen = readPickleTweetChunk(
             self.data_path + self.pickle_filename2, tweets, None,
             10000)  # read the first chunk (up to 10k tweets)
     chunk_count = 0
     self.tweets = tweets  # for probing only, remove to release memory
     while tweets:
         c("rendering tweets, chunk:", chunk_count, "ntweets:", len(tweets),
           "snapshotid", self.snapshotid)
         for tweet in tweets:
             tweeturi, triples = self.tweetTriples(tweet)
             if "retweeted_status" in tweet.keys():
                 self.nretweets += 1
                 tweeturi0, triples0 = self.tweetTriples(tweet)
                 triples += triples0
                 triples += [(tweeturi, po.retweetOf, tweeturi0)]
             self.ntriples += len(triples)
             P.set_(triples, context=self.tweet_graph)
             c("rendered", self.ntweets, "tweets")
         c("end of chunk:", chunk_count, "ntriples:", self.ntriples)
         self.writeTweets(chunk_count)
         c("chunk has been written")
         chunk_count += 1
         if chunk_count == 2:
             break  # probing limit: render only the first two chunks
         if self.pickle_filename2:
             tweets, fopen = readPickleTweetChunk(None, [], fopen, 10000)
         else:
             tweets = []
     for i in range(chunk_count):  # free memory
         P.context(self.tweet_graph[:-1] + str(i), "remove")
Example #20
 def writeTranslates(self,mode="full"):
     c("mode full or chunk or multigraph write:",mode)
     if mode=="full":
         g=P.context(self.translation_graph)
         self.translation_ttl=self.snapshotid+"Translation.ttl"
         self.translation_xml=self.snapshotid+"Translation.rdf"
         g.serialize(self.final_path_+self.translation_ttl,"turtle"); c("ttl")
         g.serialize(self.final_path_+self.translation_xml,"xml")
         self.translation_size_ttl=os.path.getsize(self.final_path_+self.translation_ttl)/10**6
         self.translation_size_xml=os.path.getsize(self.final_path_+self.translation_xml)/10**6
         self.ntranslation_triples=len(g)
     elif mode=="chunk":
         # writeByChunks
         raise NotImplementedError("Perform P.utils.writeByChunks on self.translation_graph")
     elif mode=="multigraph":
         raise NotImplementedError("Perform serialize(write) on each of the self.translation_graphs")
Example #21
 def writeTranslates(self,mode="full"):
     c("mode full or chunk or multigraph write:",mode)
     if mode=="full":
         g=P.context(self.translation_graph)
         self.translation_ttl=self.snapshotid+"Translation.ttl"
         self.translation_xml=self.snapshotid+"Translation.rdf"
         g.serialize(self.final_path_+self.translation_ttl,"turtle"); c("ttl")
         g.serialize(self.final_path_+self.translation_xml,"xml")
         self.size_ttl=os.path.getsize(self.final_path_+self.translation_ttl)/10**6
         self.size_xml=os.path.getsize(self.final_path_+self.translation_xml)/10**6
         self.ntranslation_triples=len(g)
     elif mode=="chunk":
         # writeByChunks
         raise NotImplementedError("Perform P.utils.writeByChunks on self.translation_graph")
     elif mode=="multigraph":
         raise NotImplementedError("Perform serialize(write) on each of the self.translation_graphs")
Example #22
def performRdfsInference(data_context=None,ontology_context=None,inferred_context=None,clean_inferred_context=True):
    # optionally clear the inferred context before re-deriving
    if clean_inferred_context:
        P.context(inferred_context,"remove")
    previous_count=len(P.context(inferred_context))
    rdfsInferenceIterate(data_context,ontology_context,inferred_context)
    new_count=len(P.context(inferred_context))
    while previous_count != new_count:
        previous_count=len(P.context(inferred_context))
        rdfsInferenceIterate(inferred_context,ontology_context,inferred_context)
        new_count=len(P.context(inferred_context))
    c("should have all triples resulting from a rdfs subclass subproperty range and domain assertions")
Example #23
 def writeRdf(self):
     pub_dir = './cidadedemocratica_snapshot/'
     if not os.path.isdir(pub_dir):
         os.mkdir(pub_dir)
     # g = P.context(self.translation_graph)
     # g.serialize(pub_dir+'cidadedemocratica.ttl', 'turtle')
     # c('participation ttl serialized')
     # g.serialize(pub_dir+'cidadedemocratica.rdf', 'xml')
     # c('participation xml serialized')
     P.rdf.writeByChunks(pub_dir+'cidadedemocratica',
                         context=self.translation_graph,
                         ntriples=100000)
     # metadata: group, platform
     g = P.context(self.meta_graph)
     g.serialize(pub_dir+'cidadedemocraticaMeta.ttl', 'turtle')
     c('participation meta ttl serialized')
     g.serialize(pub_dir+'cidadedemocraticaMeta.rdf', 'xml')
     c('participation meta xml serialized')
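P.rdf.writeByChunks is not shown in these examples. A minimal sketch of a chunked serializer with the same return contract as the call in Example #8 (all names are assumptions; the project version takes a context name rather than a graph object):

import os
from rdflib import Graph

def write_by_chunks(prefix, graph, ntriples=100000):
    """Serialize graph into numbered .rdf/.ttl files of at most ntriples
    triples each; return (xml_names, xml_sizes_MB, ttl_names, ttl_sizes_MB)."""
    xml_names, xml_sizes, ttl_names, ttl_sizes = [], [], [], []
    triples = list(graph)
    for count, start in enumerate(range(0, len(triples), ntriples)):
        chunk = Graph()
        for t in triples[start:start + ntriples]:
            chunk.add(t)
        ttl = "{}{:05d}.ttl".format(prefix, count)
        rdf = "{}{:05d}.rdf".format(prefix, count)
        chunk.serialize(ttl, "turtle")
        chunk.serialize(rdf, "xml")
        ttl_names.append(ttl)
        ttl_sizes.append(os.path.getsize(ttl) / 10**6)
        xml_names.append(rdf)
        xml_sizes.append(os.path.getsize(rdf) / 10**6)
    return xml_names, xml_sizes, ttl_names, ttl_sizes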
Example #24
 def writeTweets(self,chunk_count):
     if not os.path.isdir(self.final_path):
         os.mkdir(self.final_path)
     if not os.path.isdir(self.final_path_):
         os.mkdir(self.final_path_)
     filename=self.snapshotid+"Tweet{:05d}".format(chunk_count)
     g=P.context(self.tweet_graph)
     g.namespace_manager.bind("po",po)
     tttl=filename+".ttl"
     trdf=filename+".rdf"
     g.serialize(self.final_path_+tttl,"turtle"); c("ttl")
     g.serialize(self.final_path_+trdf,"xml")
     filesizettl=os.path.getsize(self.final_path_+tttl)/(10**6)
     filesizerdf=os.path.getsize(self.final_path_+trdf)/(10**6)
     self.tweet_ttl+=[tttl]
     self.size_ttl+=[filesizettl]
     self.tweet_rdf+=[trdf]
     self.size_rdf+=[filesizerdf]
     self.tweet_graph=self.tweet_graph[:-1]+str(chunk_count+1)  # point tweet_graph at the next chunk's context
Example #25
    def __init__(self,snapshoturi,snapshotid,filename_friendships=None,\
            filename_interactions=None,filename_posts=None,\
            data_path="../data/facebook/",final_path="./facebook_snapshots/",umbrella_dir="facebook_snapshots/"):

        self.friendship_graph="social_facebook_friendships"
        self.interaction_graph="social_facebook_interactions"
        self.meta_graph="social_facebook_meta"
        self.posts_graph="social_facebook_posts"
        self.social_graph="social_facebook"
        P.context(self.friendship_graph,"remove")
        P.context(self.interaction_graph,"remove")
        P.context(self.meta_graph,"remove")
        P.context(self.posts_graph,"remove")
        self.snapshotid=snapshotid
        self.snapshoturi=snapshoturi
        self.online_prefix="https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(umbrella_dir,self.snapshotid)
        self.isfriendship=bool(filename_friendships)
        self.isinteraction=bool(filename_interactions)
        self.hastext=bool(filename_posts)
        self.nfriends=self.nfriendships=self.ninteracted=self.ninteractions=self.nposts=0
        if self.isfriendship:
            fnet=readGDF(data_path+filename_friendships)     # return networkx graph
            fnet_=self.rdfFriendshipNetwork(fnet)   # writes to self.friendship_graph
        if self.isinteraction:
            inet=readGDF(data_path+filename_interactions)    # return networkx graph
            self.rdfInteractionNetwork(inet)      # writes to self.interaction_graph
        else:
            self.groupid2=0
        if self.hastext:
            self.rdfGroupPosts(data_path+filename_posts)      # writes to self.posts_graph

        locals_=locals().copy()
        for i in locals_:
            if i !="self":
                # setattr avoids the quoting pitfalls of the exec-built
                # assignments (a string value containing a quote breaks them)
                setattr(self,i,locals_[i])
        self.makeMetadata()     # return rdflib graph with metadata about the structure
        self.writeAllFB()  # write linked data tree
Example #26
def publishAll(mysqldb=None, mongoshouts=None, irclogs=None, oreshouts=None):
    """express aa shouts as RDF for publishing"""
    pub_dir='./aa_snapshots/'
    if not os.path.isdir(pub_dir):
        os.mkdir(pub_dir)
    if mysqldb:
        c("before mysql publishing")
        mysqldb = MysqlPublishing(mysqldb)
        g = P.context(mysqldb.translation_graph)
        g.serialize(pub_dir+"aamysql.ttl", "turtle")
        c("mysql ttl ok")
        g.serialize(pub_dir+"aamysql.rdf", "xml")
        c("mysql ok")
        g = P.context(mysqldb.meta_graph)
        g.serialize(pub_dir+"aamysqlMeta.ttl", "turtle")
        c("mysql ttl ok")
        g.serialize(pub_dir+"aamysqlMeta.rdf", "xml")
        c("mysql ok")
    if mongoshouts:
        mongoshouts = MongoPublishing(mongoshouts)
        g = P.context(mongoshouts.translation_graph)
        g.serialize(pub_dir+"aamongo.ttl", "turtle")
        c("mongo ttl ok")
        g.serialize(pub_dir+"aamongo.rdf", "xml")
        c("mongo ok")
        g = P.context(mongoshouts.meta_graph)
        g.serialize(pub_dir+"aamongoMeta.ttl", "turtle")
        c("mongo ttl ok")
        g.serialize(pub_dir+"aamongoMeta.rdf", "xml")
        c("mongo ok")
    if irclogs:
        g = r.Graph()
        gm = r.Graph()
        for irclog in irclogs:  # filenames
            irclog = LogPublishing(irclog)
            g += P.context(irclog.translation_graph)
            gm += P.context(irclog.meta_graph)
        g.serialize(pub_dir+"aairc.ttl", "turtle")
        c("irc ttl ok")
        g.serialize(pub_dir+"aairc.rdf", "xml")
        c("irc ok")
        gm.serialize(pub_dir+"aaircMeta.ttl", "turtle")
        c("irc ttl ok")
        gm.serialize(pub_dir+"aaircMeta.rdf", "xml")
        c("irc ok")
    if oreshouts:
        oreshouts = OrePublishing(oreshouts)
        c("ore ok")
    return mysqldb, mongoshouts, irclogs, oreshouts
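The irclogs branch accumulates several translation graphs with g += ...; rdflib's in-place addition unions the triple sets, which is what lets a single combined file be serialized above:

import rdflib as r

ex = r.Namespace("http://example.org/")
g1, g2 = r.Graph(), r.Graph()
g1.add((ex.a, ex.p, ex.b))
g2.add((ex.b, ex.p, ex.c))
g1 += g2  # in-place union of the two triple sets
assert len(g1) == 2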
Example #27
def rdfsInferenceIterate(data_context=None,ontology_context=None,inferred_context=None):
    contexts=[i.identifier.lower() for i in P.context()]
    if data_context not in contexts:
        c("no data context")
    if ontology_context not in contexts:
        c("no ontology context")
    if inferred_context not in contexts:
        c("inferred context to be created context:",inferred_context)
    for subject, foo, object_ in P.percolation_graph.triples(\
            (None,NS.rdfs.subClassOf,None),context=ontology_context):
        for individual, footype, foosubject in P.percolation_graph.triples(\
                (None,a,subject),context=data_context):
            P.add((individual,a,object_),context=inferred_context)
        for foosubject, fooproperty, fooobject in P.percolation_graph.triples(\
                (None,None,subject),context=data_context):
            P.add((foosubject,fooproperty,object_),context=inferred_context)

    c("finished subclass reasoning")
    for subject, foo, object_ in P.percolation_graph.triples(\
            (None,NS.rdfs.subPropertyOf,None),context=ontology_context):
        c(subject,foo,object_)
        for subject2,propertyfoo,object2 in P.percolation_graph.triples(\
                (None,subject,None),context=data_context):
            c(subject2,propertyfoo,object2)
            P.add((subject2,object_,object2),context=inferred_context)
    c("finished subproperty reasoning")
    for subject, foo, object_ in P.percolation_graph.triples(\
            (None,NS.rdfs.domain,None),context=ontology_context):
        for subject2,predicatefoo,objectfoo in P.percolation_graph.triples(\
                (None,subject,None),context=data_context):
            P.add((subject2,a,object_),context=inferred_context)
    c("finished domain reasoning")
    for subject, foo, object_ in P.percolation_graph.triples(\
            (None,NS.rdfs.range,None),context=ontology_context):
        for subjectfoo,predicatefoo,object2 in P.percolation_graph.triples(\
                (None,subject,None),context=data_context):
            P.add((object2,a,object_),context=inferred_context)
    c("finished range reasoning")
Example #28
    def writeAllTW(self):
        # write meta and readme with self.desc, finished.
        g = P.context(self.meta_graph)
        ntriples = len(g)
        triples = [
            (self.snapshoturi, po.nMetaTriples, ntriples),
        ]
        P.add(triples, context=self.meta_graph)
        g.namespace_manager.bind("po", po)
        g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
        c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
        c("serialized meta")
        # copy the script that generated this code
        if not os.path.isdir(self.final_path_ + "scripts"):
            os.mkdir(self.final_path_ + "scripts")
        shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                    self.final_path_ + "scripts/triplify.py")
        # copy of the base data
        tinteraction = """\n\n{} individuals with metadata {}
and {} interactions (retweets: {}, replies: {}, user_mentions: {}) 
constitute the interaction 
network in the RDF/XML file(s):
{}
and the Turtle file(s):
{}
(anonymized: {}).""".format(
            self.nparticipants, str(self.participantvars),
            self.nretweets + self.nreplies + self.nuser_mentions,
            self.nretweets, self.nreplies, self.nuser_mentions, self.tweet_rdf,
            self.tweet_ttl, self.interactions_anonymized)
        tposts = """\n\nThe dataset consists of {} tweets with metadata {}
{:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {}
{:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {}""".format(
            self.ntweets,
            str(self.tweetvars),
            self.mcharstweets,
            self.dcharstweets,
            self.totalchars,
            self.mtokenstweets,
            self.dtokenstweets,
            self.totaltokens,
        )
        self.dates = [i.isoformat() for i in self.dates]
        date1 = min(self.dates)
        date2 = max(self.dates)
        with open(self.final_path_ + "README", "w") as f:
            f.write("""::: Open Linked Social Data publication
\nThis repository is a RDF data expression of the twitter
snapshot {snapid} with tweets from {date1} to {date2}
(total of {ntrip} triples).{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is on the script/ directory.\n:::"""
                    .format(snapid=self.snapshotid,
                            date1=date1,
                            date2=date2,
                            ntrip=self.ntriples,
                            tinteraction=tinteraction,
                            tposts=tposts,
                            mrdf=self.mrdf,
                            mttl=self.mttl,
                            ise=self.isego,
                            isg=self.isgroup,
                            isf=self.isfriendship,
                            isi=self.isinteraction,
                            ist=self.hastext,
                            ava=self.online_prefix,
                            desc=self.desc))
Example #29
    def writeAllFB(self):
        c("started rendering of the snapshot publication. snapshotID:",self.snapshotid)
        self.final_path_="{}{}/".format(self.final_path,self.snapshotid)
        if not os.path.isdir(self.final_path_):
            os.mkdir(self.final_path_)
        #fnet,inet,mnet
        triples=[]
        if self.isfriendship:
            g=P.context(self.friendship_graph)
            g.namespace_manager.bind("po",po)
            g.serialize(self.final_path_+self.snapshotid+"Friendship.ttl","turtle"); c("ttl")
            g.serialize(self.final_path_+self.snapshotid+"Friendship.rdf","xml")
            c("serialized friendships")
            # get filesize and ntriples
            filesizerdf=os.path.getsize(self.final_path_+self.snapshotid+"Friendship.rdf")/(10**6)
            filesizettl=os.path.getsize(self.final_path_+self.snapshotid+"Friendship.ttl")/(10**6)
            ntriples=len(g)
            triples+=[
                     (self.snapshoturi,po.friendshipXMLFileSizeMB,filesizerdf),
                     (self.snapshoturi,po.friendshipTTLFileSizeMB,filesizettl),
                     (self.snapshoturi,po.nFriendshipTriples,ntriples),
                     ]
        if self.isinteraction:
            g=P.context(self.interaction_graph)
            g.namespace_manager.bind("po",po)
            g.serialize(self.final_path_+self.snapshotid+"Interaction.ttl","turtle"); c("ttl")
            g.serialize(self.final_path_+self.snapshotid+"Interaction.rdf","xml")
            c("serialized interaction")
            filesizerdf=os.path.getsize(self.final_path_+self.snapshotid+"Interaction.rdf")/(10**6)
            filesizettl=os.path.getsize(self.final_path_+self.snapshotid+"Interaction.ttl")/(10**6)
            ntriples=len(g)
            triples+=[
                     (self.snapshoturi,po.interactionXMLFileSizeMB,filesizerdf),
                     (self.snapshoturi,po.interactionTTLFileSizeMB,filesizettl),
                     (self.snapshoturi,po.nInteractionTriples,ntriples),
                     ]
        if self.hastext:
            g=P.context(self.posts_graph)
            g.namespace_manager.bind("po",po)
            g.serialize(self.final_path_+self.snapshotid+"Posts.ttl","turtle"); c("ttl")
            g.serialize(self.final_path_+self.snapshotid+"Posts.rdf","xml")
            c("serialized posts")
            filesizerdf=os.path.getsize(self.final_path_+self.snapshotid+"Posts.rdf")/(10**6)
            filesizettl=os.path.getsize(self.final_path_+self.snapshotid+"Posts.ttl")/(10**6)
            ntriples=len(g)
            triples+=[
                     (self.snapshoturi,po.postsXMLFileSizeMB,filesizerdf),
                     (self.snapshoturi,po.postsTTLFileSizeMB,filesizettl),
                     (self.snapshoturi,po.nPostsTriples,ntriples)      ,
                     ]
        g=P.context(self.meta_graph)
        ntriples=len(g)
        triples+=[
                 (self.snapshoturi,po.nMetaTriples,ntriples)      ,
                 ]
        P.add(triples,context=self.meta_graph)
        g.namespace_manager.bind("po",po)
        g.serialize(self.final_path_+self.snapshotid+"Meta.ttl","turtle"); c("ttl")
        g.serialize(self.final_path_+self.snapshotid+"Meta.rdf","xml")
        c("serialized meta")
        # copy the script that generated this code
        if not os.path.isdir(self.final_path_+"scripts"):
            os.mkdir(self.final_path_+"scripts")
        shutil.copy(S.PACKAGEDIR+"/../tests/triplify.py",self.final_path_+"scripts/triplify.py")
        # copy of the base data
        if not os.path.isdir(self.final_path_+"base"):
            os.mkdir(self.final_path_+"base")
        originals=""
        if self.isfriendship:
            shutil.copy(self.data_path+self.filename_friendships,self.final_path_+"base/")
            originals+="base/{}".format(self.filename_friendships)
            tfriendship="""\n\n{nf} individuals with metadata {fvars}
and {nfs} friendships constitute the friendship network in the RDF/XML file:
{frdf} \nor in the Turtle file: \n{fttl}
(anonymized: {fan}).""".format(
                            nf=self.nfriends,fvars=str(self.friendsvars),
                            nfs=self.nfriendships,
                            frdf=self.frdf,fttl=self.fttl,
                            fan=self.friendships_anonymized,
                        )
        else:
            tfriendship=""
        if self.isinteraction:
            shutil.copy(self.data_path+self.filename_interactions,self.final_path_+"base/")
            tinteraction="""\n\n{} individuals with metadata {}
and {} interactions with metadata {} constitute the interaction 
network in the RDF/XML file:
{}
or in the Turtle file:
{}
(anonymized: {}).""".format( self.ninteracted,str(self.varsfriendsinteraction),
                        self.ninteractions,str(self.interactionsvars),
                        self.irdf,
                        self.ittl,
                        self.interactions_anonymized)
            originals+="\nbase/{}".format(self.filename_interactions)
        else:
            tinteraction=""
        if self.hastext:
            shutil.copy(self.data_path+self.filename_posts,self.final_path_+"base/")
            tposts="""\n\n{} posts with {:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {}
{:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {}
posts data in the RDF/XML file:
{}
or in the Turtle file:
{}""".format( self.nposts,self.mcharsposts,self.dcharsposts,self.totalchars,
                        self.mtokensposts,self.dtokensposts,self.totaltokens,
                        self.prdf,
                        self.pttl)
            originals+="\nbase/{}".format(self.filename_posts)
        else:
            tposts=""


#        P.rdf.writeAll(mnet,aname+"Meta",fpath_,1)
        # write a README
        datetime_string=P.get(r.URIRef(self.snapshoturi),po.dateObtained,None,context="social_facebook")[2]
#        if not os.path.isdir(self.final_path+"base"):
#            os.mkdir(self.final_path+"base")
        with open(self.final_path_+"README","w") as f:
            f.write("""::: Open Linked Social Data publication
\nThis repository is a RDF data expression of the facebook
snapshot {snapid} collected around {date}.{tfriendship}{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nOriginal file(s):
{origs}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is on the script/ directory.\n:::""".format(
                snapid=self.snapshotid,date=datetime_string,
                        tfriendship=tfriendship,
                        tinteraction=tinteraction,
                        tposts=tposts,
                        mrdf=self.mrdf,
                        mttl=self.mttl,
                        origs=originals,
                        ise=self.isego,
                        isg=self.isgroup,
                        isf=self.isfriendship,
                        isi=self.isinteraction,
                        ist=self.hastext,
                        ava=self.online_prefix,
                        desc=self.desc
                        ))
Example #30
def parseLegacyFiles(data_dir=DATADIR + "twitter/"):
    """Parse legacy pickle files with Twitter tweets"""
    filenames = os.listdir(data_dir)
    filenames = [
        i for i in filenames
        if i != "ipython_log.py" and not i.endswith(".swp")
    ]
    snapshots = set()
    triples = []
    for filename in filenames:
        snapshotid = "twitter-legacy-" + filename.replace("_", "").replace(
            'tw.pickle', '')
        snapshoturi = po.Snapshot + "#" + snapshotid
        expressed_classes = [po.Participant, po.Tweet]
        expressed_reference = filename.replace("_", "").replace(".pickle", "")
        name_humanized = "Twitter " + expressed_reference
        filesize = os.path.getsize(data_dir + filename) / 10**6
        fileformat = "pickle"
        fileuri = po.File + "#twitter-file-" + filename
        triples += [
            (snapshoturi, a, po.Snapshot),
            # (snapshoturi, a, po.TwitterSnapshot),
            (snapshoturi, po.snapshotID, snapshotid),
            (snapshoturi, po.isEgo, False),
            (snapshoturi, po.isGroup, True),
            (snapshoturi, po.isFriendship, False),
            (snapshoturi, po.isInteraction, True),
            (snapshoturi, po.isPost, True),
            # (snapshoturi, po.humanizedName, name_humanized),
            # (snapshoturi, po.expressedReference, expressed_reference),
            (snapshoturi, po.rawFile, fileuri),
            # (fileuri,     po.fileSize, filesize),
            (fileuri, po.fileName, filename),
            # (fileuri,     po.fileFormat, fileformat),
        ] + [
            # (fileuri,    po.expressedClass, expressed_class) for
            # expressed_class in expressed_classes
        ]
        snapshots.add(snapshoturi)
    nfiles = len(filenames)
    nsnapshots = len(snapshots)
    P.context("social_twitter", "remove")
    platformuri = P.rdf.ic(po.Platform, "Twitter", context="social_twitter")
    triples += [
        (NS.social.Session, NS.social.nTwitterParsedFiles, nfiles),
        (NS.social.Session, NS.social.nTwitterSnapshots, nsnapshots),
        (platformuri, po.dataDir, data_dir),
    ]
    P.add(triples, context="social_twitter")
    c("parsed {} twitter files ({} snapshots) are in percolation graph \
      and 'social_twitter' context".format(nfiles, nsnapshots))
    c("percolation graph have {} triples ({} in social_twitter context)".
      format(len(P.percolation_graph), len(P.context("social_twitter"))))
    negos = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE         { GRAPH <social_twitter> { ?s po:isEgo true         } } "
    )
    ngroups = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE       { GRAPH <social_twitter> { ?s po:isGroup true       } } "
    )
    nfriendships = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE  { GRAPH <social_twitter> { ?s po:isFriendship true  } } "
    )
    ninteractions = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isInteraction true } } "
    )
    nposts = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE        { GRAPH <social_twitter> { ?s po:isPost true        } } "
    )
    totalsize = sum(
        P.query(
            r" SELECT ?size WHERE              { GRAPH <social_twitter> { ?s po:fileSize ?size     } } "
        ))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship network. {} have an interaction network. \
      {} have post texts and reaction counts. Total raw data size is {:.2f}MB"""
      .format(negos, ngroups, nfriendships, ninteractions, nposts, totalsize))
    return snapshots
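For reference, the snapshot identifier above is derived purely from the pickle filename. A minimal standalone sketch of that derivation (the filename is illustrative):

filename = "bond_tw.pickle"  # hypothetical legacy pickle filename
snapshotid = "twitter-legacy-" + filename.replace("_", "").replace("tw.pickle", "")
print(snapshotid)  # -> twitter-legacy-bond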
Example #31
0
 def rdfLog(self):
     try:
         with codecs.open(self.data_path + self.filename, "rb",
                          "iso-8859-1") as f:
             logtext = textFix(f.read())
         c('opened log {} as iso-8859-1'.format(self.snapshotid))
     except (OSError, UnicodeDecodeError):  # fall back if the iso-8859-1 read fails
         with open(self.data_path + self.filename, "r") as f:
             logtext = textFix(f.read())
         c('opened log {} as utf8'.format(self.snapshotid))
     # legacy regexes kept for reference:
     # msgregex = r"\[(\d{2}):(\d{2}):(\d{2})\] \* ([^ ?]*)[ ]*(.*)"
     # rmessage = r"\[(\d{2}):(\d{2}):(\d{2})\] \<(.*?)\>[ ]*(.*)"  # user message
     # rdate = r"(\d{4})(\d{2})(\d{2})"  # date
     # system message:
     rsysmsg = r"(\d{4})\-(\d{2})\-(\d{2})T(\d{2}):(\d{2}):(\d{2})  \*\*\* (\S+) (.*)"
     # user message:
     rmsg = r"(\d{4})\-(\d{2})\-(\d{2})T(\d{2}):(\d{2}):(\d{2})  \<(.*?)\> (.*)"
     rurl = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
     messages = re.findall(rmsg, logtext)
     system_messages = re.findall(rsysmsg, logtext)
     self.NICKS = set([Q(i[-2]) for i in messages] +
                      [Q(i[-2]) for i in system_messages])
     triples = []
     for nick in self.NICKS:
         useruri = P.rdf.ic(po.Participant,
                            "{}-{}".format(self.provenance_prefix, nick),
                            self.irc_graph, self.snapshoturi)
         obs = P.rdf.ic(po.Observation,
                        "{}-{}".format(self.snapshotid, nick),
                        self.irc_graph, self.snapshoturi)
         triples.extend([
             (useruri, po.observation, obs),
             (obs, po.nick, nick),
         ])
     messageids = set()
     msgcount = 0
     c("starting translation of log with",
       len(messages) + len(system_messages), "messages")
     for message in messages:
         year, month, day, hour, minute, second, nick, text = message
         nick = Q(nick)
         datetime_ = datetime.datetime(
             *[int(i) for i in (year, month, day, hour, minute, second)])
         self.dates += [datetime_]
         timestamp = datetime_.isoformat()
         messageid = "{}-{}-{}".format(self.snapshotid, nick, timestamp)
         while messageid in messageids:
             messageid += '_r_%05x' % random.randrange(16**5)
         messageids.add(messageid)
         messageuri = P.rdf.ic(po.IRCMessage, messageid, self.irc_graph,
                               self.snapshoturi)
         tokens = k.word_tokenize(text)
         tokens = [i for i in tokens if i not in set(string.punctuation)]
         direct_nicks = []  # for directed messages at
         mention_nicks = []  # for mentioned fellows
         direct = 1
         for token in tokens:
             if token not in self.NICKS:
                 direct = 0
             else:
                 if direct:
                     direct_nicks.append(token)
                 else:
                     mention_nicks.append(token)
         for nick2 in direct_nicks:  # nick2 avoids clobbering the author's nick
             useruri2 = po.Participant + "#{}-{}".format(
                 self.snapshotid, nick2)
             triples.append((messageuri, po.directedTo, useruri2))
         if direct_nicks:
             self.ndirect += 1
             text_ = text[text.index(direct_nicks[-1]) +
                          len(direct_nicks[-1]) + 1:].lstrip()
         else:
             text_ = text
         for nick2 in mention_nicks:
             useruri2 = po.Participant + "#{}-{}".format(
                 self.snapshotid, nick2)
             triples.append((messageuri, po.mentions, useruri2))
         self.nmention += len(mention_nicks)
         useruri = po.Participant + "#{}-{}".format(self.snapshotid, nick)
         triples.extend((
             (messageuri, po.author, useruri),
             (messageuri, po.systemMessage, False),
             (messageuri, po.createdAt, datetime_),
         ))
         if text:
             triples.append((messageuri, po.text, text))
         if text_:
             nchars = len(text_)
             ntokens = len(k.word_tokenize(text_))
             nsentences = len(k.sent_tokenize(text_))
             triples += [
                 (messageuri, po.cleanText, text_),
                 # (messageuri, po.nChars, nchars),
                 # (messageuri, po.nTokens, ntokens),
                 # (messageuri, po.nSentences, nsentences),
             ]
             urls = re.findall(rurl, text_)
             for url in urls:
                 triples += [
                     (messageuri, po.hasUrl, url),
                 ]
             self.nchars_all += [nchars]
             self.ntokens_all += [ntokens]
             self.nsentences_all += [nsentences]
             self.nurls += len(urls)
         else:
             triples += [
                 (messageuri, po.emptyMessage, True),
             ]
         if text.startswith(";aa ") or text.startswith(
                 "lalenia, aa ") or text.startswith("lalenia: aa "):
             self.naamessages += 1
             # triples.append((messageuri, a, po.AAIRCMessage))
             triples.append((messageuri, po.aaMessage, True))
         msgcount += 1
         if msgcount % 1000 == 0:
             c("finished user message", msgcount)
     msgcount = 0
     for message in system_messages:
         year, month, day, hour, minute, second, nick, text = message
         nick = Q(nick)
         useruri = po.Participant + "#{}-{}".format(self.provenance_prefix,
                                                    nick)
         datetime_ = datetime.datetime(
             *[int(i) for i in (year, month, day, hour, minute, second)])
         self.dates += [datetime_]
         timestamp = datetime_.isoformat()
         messageid = "{}-{}".format(self.snapshotid, timestamp)
         while messageid in messageids:
             messageid += '_r_%05x' % random.randrange(16**5)
         messageids.add(messageid)
         messageuri = P.rdf.ic(po.IRCMessage, messageid, self.irc_graph,
                               self.snapshoturi)
         triples += [(messageuri, po.impliedUser, useruri),
                     (messageuri, po.createdAt, datetime_),
                     (messageuri, po.systemMessage, True)]
         if text:
             triples += [(messageuri, po.text, text)]
         msgcount += 1
         if msgcount % 1000 == 0:
             c("Total system messages:", msgcount)
     self.messageids = messageids
     if not os.path.isdir(self.final_path):
         os.mkdir(self.final_path)
     if not os.path.isdir(self.final_path_):
         os.mkdir(self.final_path_)
     g = P.context(self.irc_graph)
     triples_ = [tr for tr in g]
     triples.extend(triples_)
     self.log_xml, self.size_xml, self.log_ttl, self.size_ttl = P.rdf.writeByChunks(
         self.final_path_ + self.snapshotid + "Log",
         ntriples=100000,
         triples=triples,
         bind=[('po', po)])
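The direct/mention split in rdfLog relies on a simple heuristic: nicks at the start of the message are addressees, and nicks appearing after any non-nick token are mentions. A standalone toy version of that loop (tokens and nicks are illustrative; the real code tokenizes with nltk and uses self.NICKS):

def classify_nicks(tokens, nicks):
    # leading tokens that are known nicks mark addressees;
    # nicks appearing after any other token count as mentions
    direct_nicks, mention_nicks = [], []
    direct = True
    for token in tokens:
        if token not in nicks:
            direct = False
        elif direct:
            direct_nicks.append(token)
        else:
            mention_nicks.append(token)
    return direct_nicks, mention_nicks

print(classify_nicks(["alice", "see", "bob"], {"alice", "bob"}))
# -> (['alice'], ['bob'])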
Example #32
0
File: access.py Project: ttm/gmane
def parseLegacyFiles(data_dir=DATADIR):
    """Parse legacy mbox files with emails from the Gmane database"""
    data_dir = os.path.expanduser(data_dir)
    directories = os.listdir(data_dir)
    directories = [i for i in directories if os.path.isdir(data_dir+i)]
    snapshots = set()
    triples = []
    for directory in directories:
        all_files = [i for i in os.listdir(data_dir+directory) if i.isdigit()]
        if all_files:
            all_files.sort()
            foo = all_files[0].lstrip("0")
            if not foo:
                foo = "0"
            snapshotid = re.sub(r'^gmane\.', 'email-legacy-', directory.replace('+', 'p'))+foo+"-"+all_files[-1].lstrip("0")
            snapshoturi = po.Snapshot+"#"+snapshotid
            expressed_classes = [po.GmaneParticipant, po.EmailPeer, po.EmailMessage]
            expressed_reference = directory
            name_humanized = "Gmane email list with id "+expressed_reference
            directorysize = sum(os.path.getsize(data_dir+directory+"/"+filename) for filename in os.listdir(data_dir+directory))/10**6
            fileformat = "mbox"
            directoryuri = po.Directory+"#gmane-"+directory
            triples.extend([
                     (snapshoturi, a, po.Snapshot),
                     # (snapshoturi, a, po.GmaneSnapshot),
                     (snapshoturi, po.dataDir, data_dir),
                     (snapshoturi, po.snapshotID, snapshotid),
                     (snapshoturi, po.isEgo, False),
                     (snapshoturi, po.isGroup, True),
                     (snapshoturi, po.isFriendship, False),
                     (snapshoturi, po.isInteraction, True),
                     (snapshoturi, po.isPost, True),
                     # (snapshoturi, po.humanizedName, name_humanized),
                     # (snapshoturi, po.expressedReference, expressed_reference),
                     (snapshoturi, po.rawDirectory, directoryuri),
                     # (directoryuri,     po.directorySize, directorysize),
                     (directoryuri,     po.directoryName, directory),
                     # (directoryuri,     po.fileFormat, fileformat),
                     ]+[
                     # (directoryuri,    po.expressedClass, expressed_class) for expressed_class in expressed_classes
                     ])
            snapshots.add(snapshoturi)
    nsnapshots = ndirectories = len(directories)
    P.context("gmane", "remove")
    platformuri = P.rdf.ic(po.Platform, "Gmane", context="gmane")
    triples.extend([
             (NS.social.Session, NS.social.nGmaneParsedDirectories, ndirectories),
             (NS.social.Session, NS.social.nGmaneSnapshots, nsnapshots),
             (NS.social.Session, po.platform, platformuri),
    ])
    P.add(triples, context="gmane")
    c("parsed {} gmane data directories (=={} snapshots) are in percolation graph and 'gmane' context".format(ndirectories, nsnapshots))
    c("percolation graph have {} triples ({} in gmane context)".format(len(P.percolation_graph), len(P.context("gmane"))))
    negos = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE         { GRAPH <gmane> { ?s po:isEgo true         } } ")
    ngroups = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE       { GRAPH <gmane> { ?s po:isGroup true       } } ")
    nfriendships = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE  { GRAPH <gmane> { ?s po:isFriendship true  } } ")
    ninteractions = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isInteraction true } } ")
    nposts = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE        { GRAPH <gmane> { ?s po:isPost true        } } ")
    totalsize = sum(P.query(r" SELECT ?size WHERE              { GRAPH <gmane> { ?s po:directorySize ?size     } } "))
    c("""{} are ego snapshots, {} are group snapshots
{} have friendship structures. {} have interaction structures. {} have texts.
Total raw data size is {:.2f}MB""".format(negos, ngroups, nfriendships, ninteractions, nposts, totalsize))
    return snapshots
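The snapshot identifier above encodes the list directory and the numeric range of its mbox files. A minimal sketch of the derivation (directory and file names are illustrative):

import re

directory = "gmane.comp.python.general"   # hypothetical gmane list directory
first, last = "00001", "00432"            # hypothetical lowest/highest mbox file names
foo = first.lstrip("0") or "0"
snapshotid = re.sub(r'^gmane\.', 'email-legacy-',
                    directory.replace('+', 'p')) + foo + "-" + last.lstrip("0")
print(snapshotid)  # -> email-legacy-comp.python.general1-432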
Example #33
0
    def writeAll(self):
        g=P.context(self.meta_graph)
        ntriples=len(g)
        triples=[
                 (self.snapshoturi,po.nMetaTriples,ntriples)      ,
                 ]
        P.add(triples,context=self.meta_graph)
        g.namespace_manager.bind("po",po)
        g.serialize(self.final_path_+self.snapshotid+"Meta.ttl","turtle"); c("ttl")
        g.serialize(self.final_path_+self.snapshotid+"Meta.rdf","xml")
        c("serialized meta")
        if not os.path.isdir(self.final_path_+"scripts"):
            os.mkdir(self.final_path_+"scripts")
        shutil.copy(PACKAGEDIR+"/../tests/triplify.py",self.final_path_+"scripts/triplify.py")
        # copy the base data
        text="""structure in the RDF/XML file(s):
{}
and the Turtle file(s):
{}
(anonymized: False "nicks inteface").""".format( self.nparticipants,str(self.participantvars),
                    self.nchecks,self.ndirect,self.nmention,
                    self.translation_xml,
                    self.translation_ttl)
        tposts="""\n\nThe dataset consists of {} shout messages with metadata {}
{:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {}
{:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {}
{:.3f} sentences in average (std: {:.3f}) and total sentences in snapshot: {}""".format(
                        self.nmessages,str(self.messagevars),
                        self.mcharsmessages, self.dcharsmessages,self.totalchars,
                        self.mtokensmessages,self.dtokensmessages,self.totaltokens,
                        self.msentencesmessages,self.dsentencesmessages,self.totalsentences,
                        )
        self.dates=P.get(r"SELECT ?date WHERE { GRAPH <%s> { ?fooshout po:createdAt ?date } } "%(self.translation_graph,))
        self.dates=[i.isoformat() for i in self.dates]
        date1=min(self.dates)
        date2=max(self.dates)
        with open(self.final_path_+"README","w") as f:
            f.write("""::: Open Linked Social Data publication
\nThis repository is an RDF data expression of the IRC
snapshot {snapid} with messages from {date1} to {date2}
(total of {ntrip} triples).{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is in the scripts/ directory.\n:::""".format(
                snapid=self.snapshotid,date1=date1,date2=date2,ntrip=self.ntriples,
                        tinteraction=tinteraction,
                        tposts=tposts,
                        mrdf=self.translation_xml,
                        mttl=self.translation_ttl,
                        ise=self.isego,
                        isg=self.isgroup,
                        isf=self.isfriendship,
                        isi=self.isinteraction,
                        ist=self.hastext,
                        ava=self.online_prefix,
                        desc=self.desc
                        ))
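A note on the date-range extraction above: ISO-8601 strings sort lexicographically in chronological order, so min() and max() over the isoformat() values yield the correct first and last dates. A minimal check:

import datetime

dates = [datetime.datetime(2014, 3, 20, 12), datetime.datetime(2013, 1, 5, 9)]
iso = [d.isoformat() for d in dates]
print(min(iso), max(iso))  # -> 2013-01-05T09:00:00 2014-03-20T12:00:00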
Example #34
0
    def writeAllFB(self):
        c("started rendering of the snapshot publication. snapshotID:",self.snapshotid)
        self.final_path_="{}{}/".format(self.final_path,self.snapshotid)
        if not os.path.isdir(self.final_path_):
            os.mkdir(self.final_path_)
        #fnet,inet,mnet
        g=P.context(self.friendship_graph)
        g.namespace_manager.bind("po",po)
        g.serialize(self.final_path_+self.snapshotid+"Friendship.ttl","turtle"); c("ttl")
        g.serialize(self.final_path_+self.snapshotid+"Friendship.rdf","xml")
        c("serialized friendships")
        # get filesize and ntriples
        filesizerdf=os.path.getsize(self.final_path_+self.snapshotid+"Friendship.rdf")/(10**6)
        filesizettl=os.path.getsize(self.final_path_+self.snapshotid+"Friendship.ttl")/(10**6)
        ntriples=len(g)
        triples=[
                 (self.snapshoturi,po.friendshipXMLFileSizeMB,filesizerdf),
                 (self.snapshoturi,po.friendshipTTLFileSizeMB,filesizettl),
                 (self.snapshoturi,po.nFriendshipTriples,ntriples),
                 ]
        P.add(triples,context=self.meta_graph)
        g=P.context(self.meta_graph)
        ntriples=len(g)
        triples+=[
                 (self.snapshoturi,po.nMetaTriples,ntriples+1)      ,
                 ]
        P.add(triples,context=self.meta_graph)
        g.namespace_manager.bind("po",po)
        g.serialize(self.final_path_+self.snapshotid+"Meta.ttl","turtle"); c("ttl")
        g.serialize(self.final_path_+self.snapshotid+"Meta.rdf","xml")
        c("serialized meta")

        if not os.path.isdir(self.final_path_+"scripts"):
            os.mkdir(self.final_path_+"scripts")
        shutil.copy(S.PACKAGEDIR+"/../tests/triplify.py",self.final_path_+"scripts/triplify.py")
        # copy the base data
        if not os.path.isdir(self.final_path_+"base"):
            os.mkdir(self.final_path_+"base")
        shutil.copy(self.data_path+self.filename_friendships,self.final_path_+"base/")

        originals="base/{}".format(self.filename_friendships)
        tfriendship="""\n\n{nf} individuals with metadata {fvars}
and {nfs} friendships constitute the friendship network in the RDF/XML file:
{frdf} \nor in the Turtle file: \n{fttl}
(anonymized: {fan}).""".format(
                        nf=self.nfriends,fvars=str(self.friendsvars),
                        nfs=self.nfriendships,
                        frdf=self.frdf,fttl=self.fttl,
                        fan=self.friendships_anonymized,
                    )
        datetime_string=P.get(self.snapshoturi,po.dateObtained,None,context="social_facebook")[2]

        with open(self.final_path_+"README","w") as f:
            f.write("""::: Open Linked Social Data publication
\nThis repository is an RDF data expression of the Facebook
snapshot {snapid} collected around {date}.{tfriendship}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nOriginal file(s):
{origs}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is in the scripts/ directory.\n:::""".format(
                snapid=self.snapshotid,date=datetime_string,
                        tfriendship=tfriendship,
                        mrdf=self.mrdf,
                        mttl=self.mttl,
                        origs=originals,
                        ise=self.isego,
                        isg=self.isgroup,
                        isf=self.isfriendship,
                        isi=self.isinteraction,
                        ist=self.hastext,
                        ava=self.online_prefix,
                        desc=self.desc
                        ))
Example #35
0
def parseLegacyFiles(datadir=DATADIR + "facebook/"):
    """Parse legacy gdf, gml and tab files of facebook structures

    Syntax of facebook filenames is:
    <prefix><name><date><suffix><extension> where:

    <prefix> used are:
        *) avlab_ for files obtained with participants at AVLAB
        *) posavlab_ for files obtained from participants
        *) page_ for files about facebook pages
        *) ego_ for ego networks
    omitted for gml files and gdf group files.

    <name> is any string name associated with the user or
    group delimiting the structure in the file, e.g. FernandValfro.
    It gets split with spaces before uppercase letter chunks for
    po:humanizedName: REM splits to REM; RFabbri to R Fabbri.

    <date> daymonthyear in 2/2/4 digits, e.g. 20032014 for 20/March/2014.

    <suffix> is omitted for friendship .gml and .gdf networks;
    .tab files hold text and activity data.
    _interactions is used for interaction networks.

    <extension> is either
    .gml for gml files, all are ego friendship network data
    .gdf for gdf files with group and ego,
                            interaction and friendship network data
    .tab for tab files with post data, such as text

    These render snapshots of two classes:
    po:FacebookEgoFriendshipSnapshot from .gml files and
        gdf files with prefix avlab_ posavlab_ or ego_
    po:FacebookGroupFriendshipInteractionSnapshot from
        .gdf files without prefix with and without _interaction suffix
        and the .tab files. They form sets of files, all with friendship
        and interaction networks and some with a .tab file.

    ToDo:
       *) Implement parsing of page files.
       *) Implement parsing of new group files."""
    platformuri = P.rdf.ic(po.Platform, "Facebook", context="social_facebook")
    triples = [
        (platformuri, po.dataDir, datadir),
    ]
    filenames = os.listdir(datadir)
    filenames = [
        i for i in filenames if not i.endswith("swp") and "ipython_log.py" != i
    ]
    snapshots = set()
    regex = re.compile(
        r"^(avlab_|ego_|posavlab_|page_)*(.*?)(\d{8})(_interactions|_comments){0,1}\.(gdf|tab|gml)$"
    )
    regex2 = re.compile(r'([A-Z]{2,}(?=[A-Z]|$)|[A-Z][a-z]*)')
    for filename in filenames:
        prefix, name, date, suffix, format_ = regex.findall(filename)[0]
        if prefix == "page_":
            c("page data currently not supported. Jumping", filename)
            continue
        # size in megabytes
        filesize = os.path.getsize(datadir + filename) / (10**6)
        snapshotid = 'facebook-legacy-' + filename.replace(
            "_interactions.gdf", "").replace(".tab", "").replace(
                '.gml', '').replace('.gdf', '')
        snapshoturi = po.Snapshot + "#" + snapshotid
        date_obtained = datetime.date(int(date[4:]), int(date[2:4]),
                                      int(date[:2]))
        name_humanized = " ".join(regex2.findall(name))
        metadata = S.legacy.facebook.files.files_dict[filename.replace(
            "_interactions.gdf", ".gdf").replace(".tab", ".gdf")]
        if metadata[0]:
            triples += [(snapshoturi, po.numericID, metadata[0])]
        if metadata[1]:
            triples += [(snapshoturi, po.stringID, metadata[1])]
        if len(metadata) == 3:
            if not metadata[2]:
                c("group data without a publishing link: ", filename)
            else:
                triples += [(snapshoturi, po.url, metadata[2])]
        if filename.endswith(".gml") or any(
                filename.startswith(i)
                for i in ("ego_", "avlab_", "posavlab_")):
            isego = True
            isgroup = False
            isfriendship = True
            isinteraction = False
            isposts = False
            expressed_classes = (po.Friendship, po.Participant)

            if metadata[0]:
                expressed_reference = po.Participant+"#" + \
                    snapshotid+"-"+metadata[0]
            else:
                if "Mirtes" in filename:
                    expressed_reference = po.Participant+"#" + \
                        snapshotid+"-anon_mirtes"
                else:
                    raise ValueError(
                        "Numeric ID is needed for friendship networks")
            triples += [(expressed_reference, a, po.FacebookParticipant)]
        else:  # group snapshot
            isego = False
            isgroup = True
            ffilename = prefix + name + date + ".gdf"
            ifilename = prefix + name + date + "_interactions.gdf"
            tfilename = prefix + name + date + ".tab"
            isfriendship = ffilename in filenames
            isinteraction = ifilename in filenames
            isposts = tfilename in filenames
            if metadata[0]:
                expressed_reference = po.FacebookGroup+"#" +\
                    snapshotid+"-"+metadata[0]
            else:
                if metadata[1]:
                    expressed_reference = po.FacebookGroup+"#" +\
                        snapshotid+"-"+metadata[1]
                else:
                    raise ValueError("Numeric or string ID is needed\
                                     for group networks")
            triples += [(expressed_reference, a, po.FacebookGroup)]
            if filename == ffilename:
                expressed_classes = (po.Friendship, po.Participant)
            elif filename == ifilename:
                expressed_classes = (po.Interaction, po.Participant)
            elif format_ == "tab":
                expressed_classes = (po.Post, )
            else:
                raise NameError("filename structure not understood")

        fileuri = NS.po.File + "#" + snapshotid + "-_file_-" + filename
        triples += [
            (snapshoturi, a, po.Snapshot),
            # (snapshoturi, a, po.FacebookSnapshot),
            (snapshoturi, po.snapshotID, snapshotid),
            (snapshoturi, po.isEgo, isego),
            (snapshoturi, po.isGroup, isgroup),
            (snapshoturi, po.isFriendship, isfriendship),
            (snapshoturi, po.isInteraction, isinteraction),
            (snapshoturi, po.isPost, isposts),
            (snapshoturi, po.name, name_humanized),
            (snapshoturi, po.dateObtained, date_obtained),
            # (snapshoturi, po.expressedReference, expressed_reference),
            (snapshoturi, po.rawFile, fileuri),
            # (fileuri,     po.fileSize, filesize),
            (fileuri, po.fileName, filename),
            (fileuri, po.fileFormat, format_),
        ]
        triples += [(fileuri, po.expressedClass, expressed_class)
                    for expressed_class in expressed_classes]
        note = theNote(filename)  # for avlab and posavlab
        if note:
            triples += [
                (snapshoturi, NS.rdfs.comment, note),
            ]
        snapshots.add(snapshoturi)
    # data about the overall data in percolation graph
    nfiles = len(filenames)
    nsnapshots = len(snapshots)
    triples += [
        (NS.social.Session, NS.social.nFacebookParsedFiles, nfiles),
        (NS.social.Session, NS.social.nFacebookSnapshots, nsnapshots),
    ]
    P.context("social_facebook", "remove")
    P.add(triples, context="social_facebook")
    c("parsed {} facebook files ({} snapshots) are in percolation \
      graph and 'social_facebook' context".format(nfiles, nsnapshots))
    c("percolation graph have {} triples ({} in social_facebook context\
      )".format(len(P.percolation_graph), len(P.context("social_facebook"))))
    negos = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isEgo true } ")
    ngroups = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isGroup true } ")
    nfriendships = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isFriendship true } ")
    ninteractions = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isInteraction true } ")
    nposts = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isPost true } ")
    totalsize = sum(P.query(r" SELECT ?size WHERE { ?s po:fileSize ?size } "))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship network. {} have an interaction network. \
      {} have post texts and reaction counts
Total raw data size is {:.2f}MB""".format(negos, ngroups, nfriendships,
                                          ninteractions, nposts, totalsize))

    return snapshots
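The two regexes above do all the filename dissection the docstring describes. A short demonstration on an illustrative filename:

import re

regex = re.compile(
    r"^(avlab_|ego_|posavlab_|page_)*(.*?)(\d{8})(_interactions|_comments){0,1}\.(gdf|tab|gml)$")
regex2 = re.compile(r'([A-Z]{2,}(?=[A-Z]|$)|[A-Z][a-z]*)')

prefix, name, date, suffix, format_ = regex.findall("avlab_RFabbri20032014.gdf")[0]
print(prefix, name, date, format_)     # -> avlab_ RFabbri 20032014 gdf
print(" ".join(regex2.findall(name)))  # -> R Fabbri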
Example #36
0
def probeOntology(endpoint_url, graph_urns, final_dir, one_datatype=True):
    if not os.path.isdir(final_dir):
        os.makedirs(final_dir)

    client = P.rdf.sparql.classes.LegacyClient(endpoint_url)
    from_ = ''
    for graph_urn in graph_urns:
        from_ += '\nFROM <%s>' % (graph_urn, )

    def mkQuery(query, plain=True):
        query_ = query.split('WHERE')
        query__ = (query_[0], from_, '\nWHERE ' + query_[1])
        query___ = ''.join(query__)
        result = client.retrieveQuery(query___)
        if plain:
            return pl(result)
        else:
            return result['results']['bindings']

    c('find all classes')
    q = "SELECT DISTINCT ?class WHERE { ?s a ?class . }"
    # classes = pl(client.retrieveQuery(prefix+q))
    classes = mkQuery(q)

    c('antecedents, consequents and restrictions of each class')
    neighbors = {}
    triples = []
    existential_restrictions = {}
    universal_restrictions = {}
    for aclass in classes:
        q = "SELECT DISTINCT ?cs ?p WHERE { ?i a <%s> . ?s ?p ?i . OPTIONAL { ?s a ?cs . } }" % (
            aclass, )
        antecedent_property = mkQuery(q)
        # q = "SELECT DISTINCT ?ap (datatype(?o) as ?do) WHERE { ?i a <%s> . ?i ?ap ?o . filter (datatype(?o) != '') }" % (aclass,)
        # consequent_property = mkQuery(q)
        # q = "SELECT DISTINCT ?ap ?co WHERE { ?i a <%s> . ?i ?ap ?o . ?o a ?co . }" % (aclass,)
        # consequent_property_ = mkQuery(q)
        q = "SELECT DISTINCT ?ap ?co (datatype(?o) as ?do) WHERE { ?i a <%s> . ?i ?ap ?o . OPTIONAL { ?o a ?co . } }" % (
            aclass, )
        consequent_property__ = mkQuery(q, 0)
        consequent_property = [[i['ap']['value'], i['do']['value']]
                               for i in consequent_property__ if 'do' in i]
        consequent_property_ = [[i['ap']['value'], i['co']['value']]
                                for i in consequent_property__ if 'co' in i]
        neighbors[aclass] = (antecedent_property,
                             consequent_property + consequent_property_)
        # neighbors[aclass] = (antecedent_property, dict(consequent_property, **consequent_property_))

        # class restrictions
        q = "SELECT DISTINCT ?p WHERE {?s a <%s>. ?s ?p ?o .}" % (aclass, )
        props_c = mkQuery(q)
        # q = "SELECT DISTINCT ?s WHERE {?s a <%s>}" % (aclass,)
        # inds = mkQuery(q)
        q = "SELECT (COUNT(DISTINCT ?s) as ?cs) WHERE {?s a <%s>}" % (aclass, )
        ninds = pl(client.retrieveQuery(q))[0]
        for pc in props_c:
            if '22-rdf-syntax' in pc:
                continue
            # q = "SELECT DISTINCT ?s ?co  (datatype(?o) as ?do) WHERE {?s a <%s>. ?s <%s> ?o . OPTIONAL {?o a ?co . }}" % (aclass, pc)
            q = "SELECT DISTINCT ?co (datatype(?o) as ?do) WHERE {?s a <%s>. ?s <%s> ?o . OPTIONAL {?o a ?co . }}" % (
                aclass, pc)
            inds2 = mkQuery(q, 0)
            # inds2_ = set([i["s"]["value"] for i in inds2])
            objs = set([i["co"]["value"] for i in inds2 if "co" in i.keys()])
            vals = set([i["do"]["value"] for i in inds2 if "do" in i.keys()])
            q = "SELECT (COUNT(DISTINCT ?s) as ?cs) WHERE {?s a <%s>. ?s <%s> ?o . }" % (
                aclass, pc)
            ninds2 = pl(client.retrieveQuery(q))[0]
            # if len(inds) == len(inds2_):  # existential
            if ninds == ninds2:  # existential
                if len(vals):
                    ob = list(vals)[0]
                else:
                    if len(objs):
                        ob = list(objs)[0]
                    else:
                        ob = 0
                if ob:
                    B = r.BNode()
                    triples += [(aclass, rdfs.subClassOf, B),
                                (B, a, owl.Restriction),
                                (B, owl.onProperty, pc),
                                (B, owl.someValuesFrom, ob)]
                    if aclass in existential_restrictions.keys():
                        existential_restrictions[aclass].append((pc, ob))
                    else:
                        existential_restrictions[aclass] = [(pc, ob)]
            q = "SELECT (COUNT(DISTINCT ?s) as ?cs) WHERE { ?s <%s> ?o . ?s a ?ca . FILTER(str(?ca) != '%s') }" % (
                pc, aclass)
            ninds3 = pl(client.retrieveQuery(q))[0]
            # q = "SELECT DISTINCT ?s WHERE { ?s <%s> ?o .}" % (pc,)
            # inds3 = mkQuery(q)
            # if set(inds) == set(inds3):  # universal
            # if all([i in set(inds) for i in inds3]):  # universal
            # if ninds == ninds3:  # universal
            if ninds3 == 0:  # universal
                if len(vals):
                    ob = list(vals)[0]
                else:
                    if len(objs):
                        ob = list(objs)[0]
                    else:
                        ob = 0
                if ob:
                    B = r.BNode()
                    triples += [(aclass, rdfs.subClassOf, B),
                                (B, a, owl.Restriction),
                                (B, owl.onProperty, pc),
                                (B, owl.allValuesFrom, ob)]
                    if aclass in universal_restrictions.keys():
                        universal_restrictions[aclass].append((pc, ob))
                    else:
                        universal_restrictions[aclass] = [(pc, ob)]
    del q, aclass, antecedent_property, consequent_property
    c('find properties')
    q = "SELECT DISTINCT ?p WHERE {?s ?p ?o}"
    # properties = pl(client.retrieveQuery(prefix+q))
    properties = mkQuery(q)
    # properties_ = [i.split("/")[-1] for i in properties]

    c('check if property is functional and get range and domain')
    functional_properties = set()
    for prop in properties:
        # check if property is functional
        q = 'SELECT DISTINCT (COUNT(?o) as ?co) WHERE { ?s <%s> ?o } GROUP BY ?s' % (
            prop, )
        is_functional = mkQuery(q)
        if len(is_functional) == 1 and is_functional[0] == 1:
            triples.append((prop, a, owl.FunctionalProperty))
            functional_properties.add(prop)

        # datatype or object properties
        suj = mkQuery("SELECT DISTINCT ?cs WHERE { ?s <%s> ?o . ?s a ?cs . }" %
                      (prop, ))
        # obj = mkQuery("SELECT DISTINCT ?co (datatype(?o) as ?do) WHERE { ?s <%s> ?o . OPTIONAL { ?o a ?co . } }" % (prop,))
        obj1 = mkQuery(
            "SELECT DISTINCT ?co WHERE { ?s <%s> ?o . ?o a ?co . }" % (prop, ))
        obj2 = mkQuery(
            "SELECT DISTINCT (datatype(?o) as ?do) WHERE { ?s <%s> ?o . }" %
            (prop, ))
        obj = obj1 + obj2
        if len(obj) and ("XMLS" in obj[0]):
            triples.append((prop, a, owl.DatatypeProperty))
        else:
            triples.append((prop, a, owl.ObjectProperty))
        if len(suj) > 1:
            B = r.BNode()
            triples.append((prop, rdfs.domain, B))
            for ss in suj:
                triples.append((B, owl.unionOf, ss))
        elif suj:
            triples.append((prop, rdfs.domain, suj[0]))
        if len(obj) > 1:
            B = r.BNode()
            triples.append((prop, rdfs.range, B))
            for ob in obj:  # the range union is over object classes, not subjects
                triples.append((B, owl.unionOf, ob))
        elif obj:
            triples.append((prop, rdfs.range, obj[0]))

        # for drawing
        # prop_ = prop.split("/")[-1]
        # suj_ = [i.split('/')[-1] for i in suj]
        # obj_ = [i.split('/')[-1] for i in obj]
    # Drawing
    c('started drawing')
    A = gv.AGraph(directed=True, strict=False)
    q = """PREFIX po: <http://purl.org/socialparticipation/po/>
    SELECT DISTINCT ?snap WHERE { { ?i po:snapshot ?snap } UNION { ?snap po:snapshotID ?idfoo } }"""
    # SELECT DISTINCT ?snap WHERE { ?i po:snapshot ?snap }"""
    snap = mkQuery(q)[0]
    q = """PREFIX po: <http://purl.org/socialparticipation/po/>
    SELECT ?provenance
    WHERE { <%s> po:socialProtocol ?provenance }""" % (snap)
    # WHERE { { <%s> po:socialProtocolTag ?provenance } UNION
    #         { <%s> po:humanizedName ?provenance } }""" % (snap, snap)
    provenance = pl(client.retrieveQuery(q))[0]
    # A.graph_attr["label"] = r"General diagram of ontological structure from %s in the http://purl.org/socialparticipation/participationontology/ namespace.\nGreen edge denotes existential restriction;\ninverted edge nip denotes universal restriction;\nfull edge (non-dashed) denotes functional property." % (provenance,)
    edge_counter = 1
    node_counter = 1
    data_nodes = {}
    for aclass in classes:
        aclass_ = aclass.split('/')[-1]
        if aclass_ not in A.nodes():
            A.add_node(aclass_, style="filled")
            n = A.get_node(aclass_)
            n.attr['color'] = "#A2F3D1"
        neigh = neighbors[aclass]
        # for i in range(len(neigh[0])):  # antecendents
        #     label = neigh[0][i][0].split("/")[-1]
        #     elabel = neigh[0][i][1]
        #     elabel_ = elabel.split("/")[-1]
        #     if label not in A.nodes():
        #         A.add_node(label, style="filled")
        #         n = A.get_node(label)
        #         n.attr['color'] = "#A2F3D1"
        #     ekey = '{}-{}-{}'.format(label, aclass_, edge_counter)
        #     edge_counter += 1
        #     A.add_edge(label, aclass_, ekey)
        #     e = A.get_edge(label, aclass_, key=ekey)
        #     e.attr["label"] = elabel_
        #     e.attr["penwidth"] = 2.
        #     e.attr["arrowsize"] = 2.
        #     if elabel not in functional_properties:
        #         e.attr["style"] = "dashed"
        #     if neigh[0][i][0] in existential_restrictions.keys():
        #         restriction = existential_restrictions[neigh[0][i][0]]
        #         prop = [iii[0] for iii in restriction]
        #         obj = [iii[1] for iii in restriction]
        #         if (elabel in prop) and (obj[prop.index(elabel)] == aclass):
        #             e.attr["color"] = "#A0E0A0"
        #     if neigh[0][i][0] in universal_restrictions.keys():
        #         restriction = universal_restrictions[neigh[0][i][0]]
        #         prop = [iii[0] for iii in restriction]
        #         obj = [iii[1] for iii in restriction]
        #         if (elabel in prop) and (obj[prop.index(elabel)] == aclass):
        #             e.attr["color"] = "inv"
        for i in range(len(neigh[1])):  # consequents
            label = neigh[1][i][1].split("/")[-1]
            elabel = neigh[1][i][0]
            elabel_ = elabel.split('/')[-1]
            if "XMLS" in label:
                color = "#FFE4AA"
                if one_datatype:
                    if label in data_nodes:
                        label_ = data_nodes[label]
                    else:
                        label_ = node_counter
                        node_counter += 1
                        data_nodes[label] = label_
                else:
                    label_ = node_counter
                    node_counter += 1
            else:
                label_ = label
                color = "#A2F3D1"
            if label_ not in A.nodes():
                A.add_node(label_, style="filled")
                n = A.get_node(label_)
                n.attr['label'] = label.split("#")[-1]
                n.attr['color'] = color
            ekey = '{}-{}-{}'.format(aclass_, label_, edge_counter)
            edge_counter += 1
            A.add_edge(aclass_, label_, ekey)
            e = A.get_edge(aclass_, label_, key=ekey)
            e.attr["label"] = elabel_
            e.attr["color"] = color
            e.attr["penwidth"] = 2
            if r.URIRef(elabel) not in functional_properties:
                e.attr["style"] = "dashed"
            if aclass in existential_restrictions.keys():
                restrictions = existential_restrictions[aclass]
                prop = [iii[0] for iii in restrictions]
                if r.URIRef(elabel) in prop:
                    e.attr["color"] = "#A0E0A0"
            if aclass in universal_restrictions.keys():
                restrictions = universal_restrictions[aclass]
                prop = [iii[0] for iii in restrictions]
                if r.URIRef(elabel) in prop:
                    e.attr["arrowhead"] = "inv"
                    e.attr["arrowsize"] = 2.

    # A.draw(os.path.join(final_dir, "{}.png".format(final_dir)), prog="dot")
    # try:
    #     A.draw(os.path.join(final_dir, "{}_circo.png".format(final_dir)), prog="circo")
    # except:
    #     pass
    # A.draw(os.path.join(final_dir, "{}_twopi.png".format(final_dir)), prog="twopi", args="-Granksep=4")
    # A.write(os.path.join(final_dir, "{}.dot".format(final_dir)))
    A.draw(os.path.join(final_dir, "draw.png"), prog="dot")
    try:
        A.draw(os.path.join(final_dir, "draw_circo.png"), prog="circo")
    except Exception:  # the circo layout can fail on some graphs; skip it
        pass
    A.draw(os.path.join(final_dir, "draw_twopi.png"),
           prog="twopi",
           args="-Granksep=4")
    A.write(os.path.join(final_dir, "draw.dot"))
    # for triple in triples:
    #     g.add(triple)
    P.start(False)
    P.context('ontology', 'remove')
    P.add(triples, 'ontology')
    g = P.context('ontology')
    g.serialize(os.path.join(final_dir, 'ontology.owl'))
    g.serialize(os.path.join(final_dir, 'ontology.ttl'), 'turtle')
    return locals()
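The functional-property test in probeOntology hinges on a grouped count: a property is functional when every subject carries exactly one object, i.e. the set of distinct per-subject counts is {1}. The same decision in plain Python over toy triples (data is illustrative):

triples = [("s1", "p", "a"), ("s2", "p", "b"), ("s2", "p", "c")]  # toy data

counts = {}
for s, p, o in triples:
    counts[s] = counts.get(s, 0) + 1           # objects per subject
is_functional = set(counts.values()) == {1}    # mirrors the SPARQL grouped COUNT
print(is_functional)  # -> False: s2 has two objects for p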
Example #37
0
    def writeAllFB(self):
        c("started rendering of the snapshot publication. snapshotID:",
          self.snapshotid)
        self.final_path_ = "{}{}/".format(self.final_path, self.snapshotid)
        if not os.path.isdir(self.final_path_):
            os.mkdir(self.final_path_)
        #fnet,inet,mnet
        triples = []
        if self.isfriendship:
            g = P.context(self.friendship_graph)
            g.namespace_manager.bind("po", po)
            g.serialize(self.final_path_ + self.snapshotid + "Friendship.ttl",
                        "turtle")
            c("ttl")
            g.serialize(self.final_path_ + self.snapshotid + "Friendship.rdf",
                        "xml")
            c("serialized friendships")
            # get filesize and ntriples
            filesizerdf = os.path.getsize(self.final_path_ + self.snapshotid +
                                          "Friendship.rdf") / (10**6)
            filesizettl = os.path.getsize(self.final_path_ + self.snapshotid +
                                          "Friendship.ttl") / (10**6)
            ntriples = len(g)
            triples += [
                (self.snapshoturi, po.friendshipXMLFileSizeMB, filesizerdf),
                (self.snapshoturi, po.friendshipTTLFileSizeMB, filesizettl),
                (self.snapshoturi, po.nFriendshipTriples, ntriples),
            ]
        if self.isinteraction:
            g = P.context(self.interaction_graph)
            g.namespace_manager.bind("po", po)
            g.serialize(self.final_path_ + self.snapshotid + "Interaction.ttl",
                        "turtle")
            c("ttl")
            g.serialize(self.final_path_ + self.snapshotid + "Interaction.rdf",
                        "xml")
            c("serialized interaction")
            filesizerdf = os.path.getsize(self.final_path_ + self.snapshotid +
                                          "Interaction.rdf") / (10**6)
            filesizettl = os.path.getsize(self.final_path_ + self.snapshotid +
                                          "Interaction.ttl") / (10**6)
            ntriples = len(g)
            triples += [
                (self.snapshoturi, po.interactionXMLFileSizeMB, filesizerdf),
                (self.snapshoturi, po.interactionTTLFileSizeMB, filesizettl),
                (self.snapshoturi, po.nInteractionTriples, ntriples),
            ]
        if self.hastext:
            g = P.context(self.posts_graph)
            g.namespace_manager.bind("po", po)
            g.serialize(self.final_path_ + self.snapshotid + "Posts.ttl",
                        "turtle")
            c("ttl")
            g.serialize(self.final_path_ + self.snapshotid + "Posts.rdf",
                        "xml")
            c("serialized posts")
            filesizerdf = os.path.getsize(self.final_path_ + self.snapshotid +
                                          "Posts.rdf") / (10**6)
            filesizettl = os.path.getsize(self.final_path_ + self.snapshotid +
                                          "Posts.ttl") / (10**6)
            ntriples = len(g)
            triples += [
                (self.snapshoturi, po.postsXMLFileSizeMB, filesizerdf),
                (self.snapshoturi, po.postsTTLFileSizeMB, filesizettl),
                (self.snapshoturi, po.nPostsTriples, ntriples),
            ]
        g = P.context(self.meta_graph)
        ntriples = len(g)
        triples += [
            (self.snapshoturi, po.nMetaTriples, ntriples),
        ]
        P.add(triples, context=self.meta_graph)
        g.namespace_manager.bind("po", po)
        g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
        c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
        c("serialized meta")
        # copy the script that generates this code
        if not os.path.isdir(self.final_path_ + "scripts"):
            os.mkdir(self.final_path_ + "scripts")
        shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                    self.final_path_ + "scripts/triplify.py")
        # copy the base data
        if not os.path.isdir(self.final_path_ + "base"):
            os.mkdir(self.final_path_ + "base")
        originals = ""
        if self.isfriendship:
            shutil.copy(self.data_path + self.filename_friendships,
                        self.final_path_ + "base/")
            originals += "base/{}".format(self.filename_friendships)
            tfriendship = """\n\n{nf} individuals with metadata {fvars}
and {nfs} friendships constitute the friendship network in the RDF/XML file:
{frdf} \nor in the Turtle file: \n{fttl}
(anonymized: {fan}).""".format(
                nf=self.nfriends,
                fvars=str(self.friendsvars),
                nfs=self.nfriendships,
                frdf=self.frdf,
                fttl=self.fttl,
                fan=self.friendships_anonymized,
            )
        else:
            tfriendship = ""
        if self.isinteraction:
            shutil.copy(self.data_path + self.filename_interactions,
                        self.final_path_ + "base/")
            tinteraction = """\n\n{} individuals with metadata {}
and {} interactions with metadata {} constitute the interaction 
network in the RDF/XML file:
{}
or in the Turtle file:
{}
(anonymized: {}).""".format(self.ninteracted, str(self.varsfriendsinteraction),
                            self.ninteractions, str(self.interactionsvars),
                            self.irdf, self.ittl, self.interactions_anonymized)
            originals += "\nbase/{}".format(self.filename_interactions)
        else:
            tinteraction = ""
        if self.hastext:
            shutil.copy(self.data_path + self.filename_posts,
                        self.final_path_ + "base/")
            tposts = """\n\n{} posts with {:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {}
{:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {}
posts data in the RDF/XML file:
{}
or in the Turtle file:
{}""".format(self.nposts, self.mcharsposts, self.dcharsposts, self.totalchars,
             self.mtokensposts, self.dtokensposts, self.totaltokens, self.prdf,
             self.pttl)
            originals += "\nbase/{}".format(self.filename_posts)
        else:
            tposts = ""


        # P.rdf.writeAll(mnet,aname+"Meta",fpath_,1)
        # write a README
        datetime_string = P.get(r.URIRef(self.snapshoturi),
                                po.dateObtained,
                                None,
                                context="social_facebook")[2]
        #        if not os.path.isdir(self.final_path+"base"):
        #            os.mkdir(self.final_path+"base")
        with open(self.final_path_ + "README", "w") as f:
            f.write("""::: Open Linked Social Data publication
\nThis repository is an RDF data expression of the Facebook
snapshot {snapid} collected around {date}.{tfriendship}{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nOriginal file(s):
{origs}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is in the scripts/ directory.\n:::"""
                    .format(snapid=self.snapshotid,
                            date=datetime_string,
                            tfriendship=tfriendship,
                            tinteraction=tinteraction,
                            tposts=tposts,
                            mrdf=self.mrdf,
                            mttl=self.mttl,
                            origs=originals,
                            ise=self.isego,
                            isg=self.isgroup,
                            isf=self.isfriendship,
                            isi=self.isinteraction,
                            ist=self.hastext,
                            ava=self.online_prefix,
                            desc=self.desc))
Example #38
0
    def makeMetadata(self):
        triples = P.get(self.snapshoturi, None, None, self.social_graph)
        for rawfile in P.get(self.snapshoturi,
                             po.rawFile,
                             None,
                             self.social_graph,
                             strict=True,
                             minimized=True):
            triples += P.get(rawfile, None, None, self.social_graph)
        P.add(triples, context=self.meta_graph)
        self.totalchars = sum(self.nchars_all)
        self.mcharsmessages = n.mean(self.nchars_all)
        self.dcharsmessages = n.std(self.nchars_all)
        self.totaltokens = sum(self.ntokens_all)
        self.mtokensmessages = n.mean(self.ntokens_all)
        self.dtokensmessages = n.std(self.ntokens_all)
        self.totalsentences = sum(self.nsentences_all)
        self.msentencesmessages = n.mean(self.nsentences_all)
        self.dsentencesmessages = n.std(self.nsentences_all)
        self.nparticipants = len(self.NICKS)
        self.nmessages = len(self.messageids)
        self.ntriples = len(P.context(self.irc_graph))
        triples = [
            (self.snapshoturi, po.nParticipants, self.nparticipants),
            (self.snapshoturi, po.nMessages, self.nmessages),
            (self.snapshoturi, po.nDirectMessages, self.ndirect),
            (self.snapshoturi, po.nUserMentions, self.nmention),
            (self.snapshoturi, po.nCharsOverall, self.totalchars),
            (self.snapshoturi, po.mCharsOverall, self.mcharsmessages),
            (self.snapshoturi, po.dCharsOverall, self.dcharsmessages),
            (self.snapshoturi, po.nTokensOverall, self.totaltokens),
            (self.snapshoturi, po.mTokensOverall, self.mtokensmessages),
            (self.snapshoturi, po.dTokensOverall, self.dtokensmessages),
            (self.snapshoturi, po.nSentencesOverall, self.totalsentences),
            (self.snapshoturi, po.mSentencesOverall, self.msentencesmessages),
            (self.snapshoturi, po.dSentencesOverall, self.dsentencesmessages),
        ]
        P.add(triples, context=self.meta_graph)
        P.rdf.triplesScaffolding(self.snapshoturi,
                                 [po.ircParticipantAttribute] *
                                 len(self.participantvars),
                                 self.participantvars,
                                 context=self.meta_graph)
        P.rdf.triplesScaffolding(self.snapshoturi,
                                 [po.logXMLFilename] * len(self.log_xml) +
                                 [po.logTTLFilename] * len(self.log_ttl),
                                 self.log_xml + self.log_ttl,
                                 context=self.meta_graph)
        P.rdf.triplesScaffolding(
            self.snapshoturi, [po.onlineLogXMLFile] * len(self.log_xml) +
            [po.onlineLogTTLFile] * len(self.log_ttl),
            [self.online_prefix + i for i in self.log_xml + self.log_ttl],
            context=self.meta_graph)

        self.mrdf = self.snapshotid + "Meta.rdf"
        self.mttl = self.snapshotid + "Meta.ttl"
        self.desc = "irc dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
            self.snapshotid,
            self.snapshoturi,
            self.isego,
            self.isgroup,
        )
        self.desc += "\nisFriendship: {}; ".format(self.isfriendship)
        self.desc += "isInteraction: {}.".format(self.isinteraction)
        self.desc += "\nnParticipants: {}; nInteractions: {} (directed messages+user mentions).".format(
            self.nparticipants, self.ndirect + self.nmention)
        self.desc += "\nisPost: {} (alias hasText: {})".format(
            self.hastext, self.hastext)
        self.desc += "\nnMessages: {}; ".format(self.nmessages)
        self.desc += "nDirectedMessages: {}; nUserMentions: {};".format(
            self.ndirect, self.nmention)
        self.desc += "\nnCharsOverall: {}; mCharsOverall: {}; dCharsOverall: {}.".format(
            self.totalchars, self.mcharsmessages, self.dcharsmessages)
        self.desc += "\nnTokensOverall: {}; mTokensOverall: {}; dTokensOverall: {};".format(
            self.totaltokens, self.mtokensmessages, self.dtokensmessages)
        self.desc += "\nnSentencesOverall: {}; mSentencesOverall: {}; dSentencesOverall: {};".format(
            self.totalsentences, self.msentencesmessages,
            self.dsentencesmessages)
        self.desc += "\nnURLs: {}; nAAMessages {}.".format(
            self.nurls, self.naamessages)
        triples = [
            (self.snapshoturi, po.triplifiedIn, datetime.datetime.now()),
            (self.snapshoturi, po.triplifiedBy, "scripts/"),
            (self.snapshoturi, po.donatedBy, self.snapshotid[:-4]),
            (self.snapshoturi, po.availableAt, self.online_prefix),
            (self.snapshoturi, po.onlineMetaXMLFile,
             self.online_prefix + self.mrdf),
            (self.snapshoturi, po.onlineMetaTTLFile,
             self.online_prefix + self.mttl),
            (self.snapshoturi, po.metaXMLFileName, self.mrdf),
            (self.snapshoturi, po.metaTTLFileName, self.mttl),
            (self.snapshoturi, po.totalXMLFileSizeMB, sum(self.size_xml)),
            (self.snapshoturi, po.totalTTLFileSizeMB, sum(self.size_ttl)),
            (self.snapshoturi, po.acquiredThrough, "channel text log"),
            (self.snapshoturi, po.socialProtocolTag, "IRC"),
            (self.snapshoturi, po.socialProtocol,
             P.rdf.ic(po.Platform, "IRC", self.meta_graph, self.snapshoturi)),
            (self.snapshoturi, po.nTriples, self.ntriples),
            (self.snapshoturi, NS.rdfs.comment, self.desc),
        ]
        P.add(triples, self.meta_graph)
Example #39
0
    def writeAllTW(self):
        # write meta and readme with self.desc, finished.
        g=P.context(self.meta_graph)
        ntriples=len(g)
        triples=[
                 (self.snapshoturi,po.nMetaTriples,ntriples)      ,
                 ]
        P.add(triples,context=self.meta_graph)
        g.namespace_manager.bind("po",po)
        g.serialize(self.final_path_+self.snapshotid+"Meta.ttl","turtle"); c("ttl")
        g.serialize(self.final_path_+self.snapshotid+"Meta.rdf","xml")
        c("serialized meta")
        # copy the script that generates this code
        if not os.path.isdir(self.final_path_+"scripts"):
            os.mkdir(self.final_path_+"scripts")
        shutil.copy(S.PACKAGEDIR+"/../tests/triplify.py",self.final_path_+"scripts/triplify.py")
        # copy the base data
        tinteraction="""\n\n{} individuals with metadata {}
and {} interactions (retweets: {}, replies: {}, user_mentions: {}) 
constitute the interaction 
network in the RDF/XML file(s):
{}
and the Turtle file(s):
{}
(anonymized: {}).""".format( self.nparticipants,str(self.participantvars),
                    self.nretweets+self.nreplies+self.nuser_mentions,self.nretweets,self.nreplies,self.nuser_mentions,
                    self.tweet_rdf,
                    self.tweet_ttl,
                    self.interactions_anonymized)
        tposts="""\n\nThe dataset consists of {} tweets with metadata {}
{:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {}
{:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {}""".format(
                        self.ntweets,str(self.tweetvars),
                        self.mcharstweets,self.dcharstweets,self.totalchars,
                        self.mtokenstweets,self.dtokenstweets,self.totaltokens,
                        )
        self.dates=[i.isoformat() for i in self.dates]
        date1=min(self.dates)
        date2=max(self.dates)
        with open(self.final_path_+"README","w") as f:
            f.write("""::: Open Linked Social Data publication
\nThis repository is an RDF data expression of the Twitter
snapshot {snapid} with tweets from {date1} to {date2}
(total of {ntrip} triples).{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is in the scripts/ directory.\n:::""".format(
                snapid=self.snapshotid,date1=date1,date2=date2,ntrip=self.ntriples,
                        tinteraction=tinteraction,
                        tposts=tposts,
                        mrdf=self.mrdf,
                        mttl=self.mttl,
                        ise=self.isego,
                        isg=self.isgroup,
                        isf=self.isfriendship,
                        isi=self.isinteraction,
                        ist=self.hastext,
                        ava=self.online_prefix,
                        desc=self.desc
                        ))
Example No. 40
def parseLegacyFiles(data_dir=DATADIR):
    """Parse legacy txt files with irc logs"""
    data_dir = os.path.expanduser(data_dir)
    directories = os.listdir(data_dir)
    directories = [i for i in directories if os.path.isdir(data_dir + i)]

    snapshots = set()
    triples = []
    for directory in directories:
        all_files = [
            i for i in os.listdir(data_dir + directory) if i.isdigit()
        ]
        if all_files:
            all_files.sort()
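            # build a snapshot id from the directory name and the numeric
            # range of its files with leading zeros stripped,
            # e.g. "legacy-<dir>-<first>-<last>"; only the first file number
            # is guarded against becoming empty after lstrip("0")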
            foo = all_files[0].lstrip("0")
            if not foo:
                foo = "0"
            snapshotid = "legacy-" + directory + "-" + foo + "-" + all_files[
                -1].lstrip("0")
            snapshoturi = po.GmaneSnapshot + "#" + snapshotid
            expressed_classes = [
                po.GmaneParticipant, po.EmailPeer, po.EmailMessage
            ]
            expressed_reference = directory
            name_humanized = "Gmane email list with id " + expressed_reference
            # get size for all files in dir
            directorysize = sum(
                os.path.getsize(data_dir + directory + "/" + filename)
                for filename in os.listdir(data_dir + directory)) / 10**6
            nfiles = len(all_files)
            fileformat = "mbox"
            directoryuri = po.Directory + "#gmane-" + directory
            triples += [
                (snapshoturi, a, po.Snapshot),
                (snapshoturi, po.dataDir, data_dir),
                (snapshoturi, a, po.GmaneSnapshot),
                (snapshoturi, po.snapshotID, snapshotid),
                (snapshoturi, po.isEgo, False),
                (snapshoturi, po.isGroup, True),
                (snapshoturi, po.isFriendship, False),
                (snapshoturi, po.isInteraction, True),
                (snapshoturi, po.isPost, True),
                (snapshoturi, po.humanizedName, name_humanized),
                (snapshoturi, po.expressedReference, expressed_reference),
                (snapshoturi, po.rawDirectory, directoryuri),
                (directoryuri, po.directorySize, directorysize),
                (directoryuri, po.directoryName, directory),
                (directoryuri, po.fileFormat, fileformat),
            ] + [(directoryuri, po.expressedClass, expressed_class)
                 for expressed_class in expressed_classes]
            snapshots.add(snapshoturi)
    ndirectories = len(directories)
    nsnapshots = len(snapshots)
    #P.context("gmane","remove")
    platformuri = P.rdf.ic(po.Platform, "Gmane", context="gmane")
    triples += [
        (NS.social.Session, NS.social.nGmaneParsedDirectories, ndirectories),
        (NS.social.Session, NS.social.nGmaneSnapshots, nsnapshots),
        (NS.social.Session, po.platform, platformuri),
    ]
    P.add(triples, context="gmane")
    c("parsed {} gmane data directories (=={} snapshots) are in percolation graph and 'gmane' context"
      .format(ndirectories, nsnapshots))
    c("percolation graph have {} triples ({} in gmane context)".format(
        len(P.percolation_graph), len(P.context("gmane"))))
    negos = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE         { GRAPH <gmane> { ?s po:isEgo true         } } "
    )
    ngroups = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE       { GRAPH <gmane> { ?s po:isGroup true       } } "
    )
    nfriendships = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE  { GRAPH <gmane> { ?s po:isFriendship true  } } "
    )
    ninteractions = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isInteraction true } } "
    )
    nposts = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE        { GRAPH <gmane> { ?s po:isPost true        } } "
    )
    totalsize = sum(
        P.query(
            r" SELECT ?size WHERE              { GRAPH <gmane> { ?s po:directorySize ?size     } } "
        ))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship structures. {} have an interaction structures. {} have texts 
Total raw data size is {:.2f}MB""".format(negos, ngroups, nfriendships,
                                          ninteractions, nposts, totalsize))
    return snapshots
Example No. 41
def parseLegacyFiles(datadir=DATADIR+"facebook/"):
    """Parse legacy gdf, gml and tab files of facebook structures
    
    Syntax of facebook filenames is:
    <prefix><name><date><suffix><extension> where:

    <prefix> used are:
        *) avlab_ for files obtained with participants at AVLAB
        *) posavlab_ for files obtained from participants
        *) page_ for files about facebook pages
        *) ego_ for ego networks
    omitted for gml files and gdf group files.

    <name> is any string name associated with the user or group delimiting the structure in the file, e.g. FernandValfro.
    It gets split with spaces before uppercase-letter chunks for po:humanizedName: REM splits to REM; RFabbri to R Fabbri.

    <date> daymonthyear in 2/2/4 digits, e.g. 20032014 for 20/March/2014.

    <suffix> is omitted for friendship .gml .gdf networks and .tab text/activity files.
    _interactions is used for interaction networks.

    <extension> is either .gml for gml files, all are ego friendship network data
                          .gdf for gdf files with group and ego, interaction and friendship network data
                          .tab for tab files with post data, such as text

    These render snapshots of two classes:
    po:FacebookEgoFriendshipSnapshot from .gml files and .gdf files with prefix avlab_, posavlab_ or ego_;
    po:FacebookGroupFriendshipInteractionSnapshot from .gdf files without prefix (with and without the _interactions suffix) and the .tab files. They form sets of files, all with friendship and interaction networks, some with a .tab file.

    ToDo:
       *) Implement parsing of page files.
       *) Implement parsing of new group files."""
    platformuri=P.rdf.ic(po.Platform,"Facebook",context="social_facebook")
    triples=[
            (platformuri, po.dataDir,datadir),
            ]
    filenames=os.listdir(datadir)
    filenames=[i for i in filenames if not i.endswith("swp")]
    # clean filenames: if they are equal except for extension, keep gml file
    snapshots=set()
    regex=re.compile(r"^(avlab_|ego_|posavlab_|page_)*(.*?)(\d{8})(_interactions|_comments){0,1}\.(gdf|tab|gml)$")
    regex2=re.compile(r'([A-Z]{2,}(?=[A-Z]|$)|[A-Z][a-z]*)')
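    # Hypothetical decompositions of the two regexes above ("GroupTest" is an
    # illustrative name, not one of the actual files):
    #   regex.findall("GroupTest20032014_interactions.gdf")[0]
    #   -> ("", "GroupTest", "20032014", "_interactions", "gdf")
    #   regex2.findall("GroupTest") -> ["Group", "Test"],
    #   joined below into the po:humanizedName "Group Test"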
    for filename in filenames:
        prefix,name,date,suffix,format_=regex.findall(filename)[0]
        if prefix=="page_":
            c("page data currently not supported. Jumping", filename)
            continue
        filesize=os.path.getsize(datadir+filename)/(10**6) # size in megabytes
        snapshotid=filename.replace("_interactions.gdf",".gdf").replace(".tab",".gdf")+"_fb"
        snapshoturi=po.FacebookSnapshot+"#"+snapshotid # put on ontology as subclass of po:Snapshot
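        # <date> is ddmmyyyy: day in the first two digits, month in the
        # next two, year in the last four (cf. the docstring above)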
        date_obtained=datetime.date(int(date[4:]),int(date[2:4]),int(date[:2]))
        name_humanized=" ".join(regex2.findall(name))
        metadata=S.legacy.facebook.files.files_dict[filename.replace("_interactions.gdf",".gdf").replace(".tab",".gdf")]
        if metadata[0]:
            triples+=[(snapshoturi,po.numericID,metadata[0])]
        if metadata[1]:
            triples+=[(snapshoturi,po.stringID,metadata[1])]
        if len(metadata)==3:
            if not metadata[2]:
                c("group data without a publishing link: ",filename)
            else:
                triples+=[(snapshoturi,po.publishedURL,metadata[2])]
        if filename.endswith(".gml") or any(filename.startswith(i) for i in ("ego_","avlab_","posavlab_")):
            isego=True
            isgroup=False
            isfriendship=True
            isinteraction=False
            isposts=False
            expressed_classes=(po.Friendship,po.Participant)

            if metadata[0]:
                expressed_reference=po.FacebookParticipant+"#"+snapshotid+"-"+metadata[0]
            else:
                if "Mirtes" in filename:
                    expressed_reference=po.FacebookParticipant+"#"+"anon_mirtes"
                else:
                    raise ValueError("Numeric ID is needed for friendship networks")
            triples+=[(expressed_reference,a,po.FacebookParticipant),]
        else: # group snapshot
            isego=False
            isgroup=True
            ffilename=prefix+name+date+".gdf"
            ifilename=prefix+name+date+"_interactions.gdf"
            tfilename=prefix+name+date+".tab"
            isfriendship=ffilename in filenames
            isinteraction=ifilename in filenames
            isposts=tfilename in filenames
            if metadata[0]:
                expressed_reference=po.FacebookGroup+"#"+metadata[0]
            else:
                if metadata[1]:
                    expressed_reference=po.FacebookGroup+"#"+metadata[1]
                else:
                    raise ValueError("Numeric or string ID is needed for group networks")
            triples+=[(expressed_reference,a,po.FacebookGroup)]
            if filename==ffilename:
                expressed_classes=(po.Friendship,po.Participant)
            elif filename==ifilename:
                expressed_classes=(po.Interaction,po.Participant)
            elif format_=="tab":
                expressed_classes=(po.Post,)
            else:
                raise NameError("filename structure not understood")

        fileuri=NS.po.File+"#"+snapshotid+"-_file_-"+filename
        triples+=[
                 (snapshoturi, a, po.Snapshot),
                 (snapshoturi, a, po.FacebookSnapshot),
                 (snapshoturi, po.snapshotID, snapshotid),
                 (snapshoturi, po.isEgo, isego),
                 (snapshoturi, po.isGroup, isgroup),
                 (snapshoturi, po.isFriendship, isfriendship),
                 (snapshoturi, po.isInteraction, isinteraction),
                 (snapshoturi, po.isPost, isposts),
                 (snapshoturi, po.humanizedName, name_humanized),
                 (snapshoturi, po.dateObtained, date_obtained),
                 (snapshoturi, po.expressedReference, expressed_reference),
                 (snapshoturi, po.rawFile, fileuri),
                 (fileuri,     po.fileSize, filesize),
                 (fileuri,     po.fileName, filename),
                 (fileuri,     po.fileFormat, format_),
                 ]+[
                 (fileuri,    po.expressedClass, expressed_class) for expressed_class in expressed_classes
                 ]
        note=theNote(filename) # for avlab and posavlab
        if note:
            triples+=[
                     (snapshoturi,NS.rdfs.comment,note),
                     ]
        snapshots.add(snapshoturi)
    # data about the overall data in percolation graph
    nfiles=len(filenames)
    nsnapshots=len(snapshots)
    triples+=[
             (NS.social.Session,NS.social.nFacebookParsedFiles,nfiles),
             (NS.social.Session,NS.social.nFacebookSnapshots,nsnapshots),
             ]
    P.context("social_facebook","remove")
    P.add(triples,context="social_facebook")
    c("parsed {} facebook files ({} snapshots) are in percolation graph and 'social_facebook' context".format(nfiles,nsnapshots))
    c("percolation graph have {} triples ({} in social_facebook context)".format(len(P.percolation_graph),len(P.context("social_facebook"))))
    negos=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isEgo true } ")
    ngroups=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isGroup true } ")
    nfriendships=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isFriendship true } ")
    ninteractions=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isInteraction true } ")
    nposts=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isPost true } ")
    totalsize=sum(P.query(r" SELECT ?size WHERE { ?s po:fileSize ?size } "))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship network. {} have an interaction network. {} have post texts and reaction counts
Total raw data size is {:.2f}MB""".format(negos,ngroups,nfriendships,ninteractions,nposts,totalsize))

    return snapshots
Example No. 42
    def makeMetadata(self):
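        # aggregate per-message statistics; n is assumed to be numpy
        # (imported as n), so n.mean/n.std are the arithmetic mean and
        # standard deviation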
        self.totalchars=sum(self.nchars_all)
        self.mchars_messages=n.mean(self.nchars_all)
        self.dchars_messages=n.std(self.nchars_all)
        self.totaltokens=sum(self.ntokens_all)
        self.mtokens_messages=n.mean(self.ntokens_all)
        self.dtokens_messages=n.std(self.ntokens_all)
        self.totalsentences=sum(self.nsentences_all)
        self.msentences_messages=n.mean(self.nsentences_all)
        self.dsentences_messages=n.std(self.nsentences_all)

        self.totalchars_clean=sum(self.nchars_clean_all)
        self.mchars_messages_clean=n.mean(self.nchars_clean_all)
        self.dchars_messages_clean=n.std(self.nchars_clean_all)
        self.totaltokens_clean=sum(self.ntokens_clean_all)
        self.mtokens_messages_clean=n.mean(self.ntokens_clean_all)
        self.dtokens_messages_clean=n.std(self.ntokens_clean_all)
        self.totalsentences_clean=sum(self.nsentences_clean_all)
        self.msentences_messages_clean=n.mean(self.nsentences_clean_all)
        self.dsentences_messages_clean=n.std(self.nsentences_clean_all)
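        # fraction of lines discarded by cleaning, published as po:fRemovedLines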
        fremoved_lines=self.nremoved_lines/self.nlines

        triples=[
                (self.snapshoturi, po.nParticipants, self.nparticipants),
                (self.snapshoturi, po.nMessages, self.nmessages),
                (self.snapshoturi, po.nEmptyMessages, self.nempty),
                (self.snapshoturi, po.nReplies, self.nreplies),
                (self.snapshoturi, po.nCC, self.ncc),
                (self.snapshoturi, po.nTo, self.nto),
                (self.snapshoturi, po.nReferences, self.nreferences),
                (self.snapshoturi, po.nUrls, self.nurls),
                (self.snapshoturi, po.nCharsOverall, self.totalchars),
                (self.snapshoturi, po.mCharsOverall, self.mchars_messages),
                (self.snapshoturi, po.dCharsOverall, self.dchars_messages),
                (self.snapshoturi, po.nTokensOverall, self.totaltokens),
                (self.snapshoturi, po.mTokensOverall, self.mtokens_messages),
                (self.snapshoturi, po.dTokensOverall, self.dtokens_messages),
                (self.snapshoturi, po.nSentencesOverall, self.totalsentences),
                (self.snapshoturi, po.mSentencesOverall, self.msentences_messages),
                (self.snapshoturi, po.dSentencesOverall, self.dsentences_messages),

                (self.snapshoturi, po.nCharsOverallClean, self.totalchars_clean),
                (self.snapshoturi, po.mCharsOverallClean, self.mchars_messages_clean),
                (self.snapshoturi, po.dCharsOverallClean, self.dchars_messages_clean),
                (self.snapshoturi, po.nTokensOverallClean, self.totaltokens_clean),
                (self.snapshoturi, po.mTokensOverallClean, self.mtokens_messages_clean),
                (self.snapshoturi, po.dTokensOverallClean, self.dtokens_messages_clean),
                (self.snapshoturi, po.nSentencesOverallClean, self.totalsentences_clean),
                (self.snapshoturi, po.mSentencesOverallClean, self.msentences_messages_clean),
                (self.snapshoturi, po.dSentencesOverallClean, self.dsentences_messages_clean),
                (self.snapshoturi, po.fRemovedLines, fremoved_lines),
                ]
        P.add(triples,context=self.meta_graph)
        P.rdf.triplesScaffolding(self.snapshoturi,
                [po.gmaneParticipantAttribute]*len(self.participantvars),
                self.participantvars,context=self.meta_graph)
        P.rdf.triplesScaffolding(self.snapshoturi,
                [po.gmaneMessageAttribute]*len(self.messagevars),
                self.messagevars,context=self.meta_graph)
        P.rdf.triplesScaffolding(self.snapshoturi,
                [po.emailXMLFilename]*len(self.email_xml)+[po.emailTTLFilename]*len(self.email_ttl),
                self.email_xml+self.email_ttl,context=self.meta_graph)
        P.rdf.triplesScaffolding(self.snapshoturi,
                [po.onlineEmailXMLFile]*len(self.email_xml)+[po.onlineEmailTTLFile]*len(self.email_ttl),
                [self.online_prefix+i for i in self.email_xml+self.email_ttl],context=self.meta_graph)
        self.mrdf=self.snapshotid+"Meta.rdf"
        self.mttl=self.snapshotid+"Meta.ttl"
        self.desc="gmane public email list dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
                                                self.snapshotid,self.snapshoturi,self.isego,self.isgroup,)
        self.desc+="\nisFriendship: {}; ".format(self.isfriendship)
        self.desc+="isInteraction: {}.".format(self.isinteraction)
        self.desc+="\nnParticipants: {}; nInteractions: {} (replies+references+cc+to).".format(self.nparticipants,self.nreplies+self.nreferences+self.ncc+self.nto)
        self.desc+="\nisPost: {} (alias hasText: {})".format(self.hastext,self.hastext)
        self.desc+="\nnMessages: {} (+ empty: {}); ".format(self.nmessages,self.nempty)
        self.desc+="nReplies: {}; nReferences: {}; nTo {}; nCC: {}.".format(self.nreplies,self.nreferences,self.ncc,self.nto)
        self.desc+="\nnChars: {}; mChars: {}; dChars: {}.".format(self.totalchars,self.mchars_messages,self.dchars_messages)
        self.desc+="\nnTokens: {}; mTokens: {}; dTokens: {};".format(self.totaltokens,self.mtokens_messages,self.dtokens_messages)
        self.desc+="\nnSentences: {}; mSentences: {}; dSentences: {}.".format(self.totalsentences,self.msentences_messages,self.dsentences_messages)
        self.desc+="\nnCharsClean: {}; mCharsClean: {}; dCharsClean: {}.".format(self.totalchars_clean,self.mchars_messages_clean,self.dchars_messages_clean)
        self.desc+="\nnTokensClean: {}; mTokensClean: {}; dTokensClean: {};".format(self.totaltokens_clean,self.mtokens_messages_clean,self.dtokens_messages_clean)
        self.desc+="\nnSentencesClean: {}; mSentencesClean: {}; dSentencesClean: {}.".format(self.totalsentences_clean,self.msentences_messages_clean,self.dsentences_messages_clean)
        self.desc+="\nnUrls: {};  fRemovedLines {};.".format(self.nurls,fremoved_lines)
        self.ntriples=len(P.context(self.translation_graph))
        triples=[
                (self.snapshoturi, po.triplifiedIn,      datetime.datetime.now()),
                (self.snapshoturi, po.triplifiedBy,      "scripts/"),
                (self.snapshoturi, po.donatedBy,         self.snapshotid),
                (self.snapshoturi, po.availableAt,       self.online_prefix),
                (self.snapshoturi, po.onlineMetaXMLFile, self.online_prefix+self.mrdf),
                (self.snapshoturi, po.onlineMetaTTLFile, self.online_prefix+self.mttl),
                (self.snapshoturi, po.metaXMLFileName,   self.mrdf),
                (self.snapshoturi, po.metaTTLFileName,   self.mttl),
                (self.snapshoturi, po.totalXMLFileSizeMB, sum(self.size_xml)),
                (self.snapshoturi, po.totalTTLFileSizeMB, sum(self.size_ttl)),
                (self.snapshoturi, po.acquiredThrough,   "Gmane public mailing list archive RSS feed"),
                (self.snapshoturi, po.socialProtocolTag, "Gmane"),
                (self.snapshoturi, po.socialProtocol,    P.rdf.ic(po.Platform,"Gmane",self.meta_graph,self.snapshoturi)),
                (self.snapshoturi, po.nTriples,         self.ntriples),
                (self.snapshoturi, NS.rdfs.comment,         self.desc),
                (self.snapshoturi, po.gmaneID, self.directory),
                ]
        P.add(triples,context=self.meta_graph)
Example No. 43
    def makeMetadata(self):
        self.totalchars = sum(self.nchars_all)
        self.mchars_messages = n.mean(self.nchars_all)
        self.dchars_messages = n.std(self.nchars_all)
        self.totaltokens = sum(self.ntokens_all)
        self.mtokens_messages = n.mean(self.ntokens_all)
        self.dtokens_messages = n.std(self.ntokens_all)
        self.totalsentences = sum(self.nsentences_all)
        self.msentences_messages = n.mean(self.nsentences_all)
        self.dsentences_messages = n.std(self.nsentences_all)

        self.totalchars_clean = sum(self.nchars_clean_all)
        self.mchars_messages_clean = n.mean(self.nchars_clean_all)
        self.dchars_messages_clean = n.std(self.nchars_clean_all)
        self.totaltokens_clean = sum(self.ntokens_clean_all)
        self.mtokens_messages_clean = n.mean(self.ntokens_clean_all)
        self.dtokens_messages_clean = n.std(self.ntokens_clean_all)
        self.totalsentences_clean = sum(self.nsentences_clean_all)
        self.msentences_messages_clean = n.mean(self.nsentences_clean_all)
        self.dsentences_messages_clean = n.std(self.nsentences_clean_all)
        fremoved_lines = self.nremoved_lines / self.nlines

        triples = [
            (self.snapshoturi, po.nParticipants, self.nparticipants),
            (self.snapshoturi, po.nMessages, self.nmessages),
            (self.snapshoturi, po.nEmptyMessages, self.nempty),
            (self.snapshoturi, po.nReplies, self.nreplies),
            (self.snapshoturi, po.nCC, self.ncc),
            (self.snapshoturi, po.nTo, self.nto),
            (self.snapshoturi, po.nReferences, self.nreferences),
            (self.snapshoturi, po.nUrls, self.nurls),
            (self.snapshoturi, po.nCharsOverall, self.totalchars),
            (self.snapshoturi, po.mCharsOverall, self.mchars_messages),
            (self.snapshoturi, po.dCharsOverall, self.dchars_messages),
            (self.snapshoturi, po.nTokensOverall, self.totaltokens),
            (self.snapshoturi, po.mTokensOverall, self.mtokens_messages),
            (self.snapshoturi, po.dTokensOverall, self.dtokens_messages),
            (self.snapshoturi, po.nSentencesOverall, self.totalsentences),
            (self.snapshoturi, po.mSentencesOverall, self.msentences_messages),
            (self.snapshoturi, po.dSentencesOverall, self.dsentences_messages),
            (self.snapshoturi, po.nCharsOverallClean, self.totalchars_clean),
            (self.snapshoturi, po.mCharsOverallClean,
             self.mchars_messages_clean),
            (self.snapshoturi, po.dCharsOverallClean,
             self.dchars_messages_clean),
            (self.snapshoturi, po.nTokensOverallClean, self.totaltokens_clean),
            (self.snapshoturi, po.mTokensOverallClean,
             self.mtokens_messages_clean),
            (self.snapshoturi, po.dTokensOverallClean,
             self.dtokens_messages_clean),
            (self.snapshoturi, po.nSentencesOverallClean,
             self.totalsentences_clean),
            (self.snapshoturi, po.mSentencesOverallClean,
             self.msentences_messages_clean),
            (self.snapshoturi, po.dSentencesOverallClean,
             self.dsentences_messages_clean),
            (self.snapshoturi, po.fRemovedLines, fremoved_lines),
        ]
        P.add(triples, context=self.meta_graph)
        P.rdf.triplesScaffolding(self.snapshoturi,
                                 [po.gmaneParticipantAttribute] *
                                 len(self.participantvars),
                                 self.participantvars,
                                 context=self.meta_graph)
        P.rdf.triplesScaffolding(self.snapshoturi, [po.gmaneMessageAttribute] *
                                 len(self.messagevars),
                                 self.messagevars,
                                 context=self.meta_graph)
        P.rdf.triplesScaffolding(self.snapshoturi,
                                 [po.emailXMLFilename] * len(self.email_xml) +
                                 [po.emailTTLFilename] * len(self.email_ttl),
                                 self.email_xml + self.email_ttl,
                                 context=self.meta_graph)
        P.rdf.triplesScaffolding(
            self.snapshoturi, [po.onlineEmailXMLFile] * len(self.email_xml) +
            [po.onlineEmailTTLFile] * len(self.email_ttl),
            [self.online_prefix + i for i in self.email_xml + self.email_ttl],
            context=self.meta_graph)
        self.mrdf = self.snapshotid + "Meta.rdf"
        self.mttl = self.snapshotid + "Meta.ttl"
        self.desc = "gmane public email list dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
            self.snapshotid,
            self.snapshoturi,
            self.isego,
            self.isgroup,
        )
        self.desc += "\nisFriendship: {}; ".format(self.isfriendship)
        self.desc += "isInteraction: {}.".format(self.isinteraction)
        self.desc += "\nnParticipants: {}; nInteractions: {} (replies+references+cc+to).".format(
            self.nparticipants,
            self.nreplies + self.nreferences + self.ncc + self.nto)
        self.desc += "\nisPost: {} (alias hasText: {})".format(
            self.hastext, self.hastext)
        self.desc += "\nnMessages: {} (+ empty: {}); ".format(
            self.nmessages, self.nempty)
        self.desc += "nReplies: {}; nReferences: {}; nTo {}; nCC: {}.".format(
            self.nreplies, self.nreferences, self.ncc, self.nto)
        self.desc += "\nnChars: {}; mChars: {}; dChars: {}.".format(
            self.totalchars, self.mchars_messages, self.dchars_messages)
        self.desc += "\nnTokens: {}; mTokens: {}; dTokens: {};".format(
            self.totaltokens, self.mtokens_messages, self.dtokens_messages)
        self.desc += "\nnSentences: {}; mSentences: {}; dSentences: {}.".format(
            self.totalsentences, self.msentences_messages,
            self.dsentences_messages)
        self.desc += "\nnCharsClean: {}; mCharsClean: {}; dCharsClean: {}.".format(
            self.totalchars_clean, self.mchars_messages_clean,
            self.dchars_messages_clean)
        self.desc += "\nnTokensClean: {}; mTokensClean: {}; dTokensClean: {};".format(
            self.totaltokens_clean, self.mtokens_messages_clean,
            self.dtokens_messages_clean)
        self.desc += "\nnSentencesClean: {}; mSentencesClean: {}; dSentencesClean: {}.".format(
            self.totalsentences_clean, self.msentences_messages_clean,
            self.dsentences_messages_clean)
        self.desc += "\nnUrls: {};  fRemovedLines {};.".format(
            self.nurls, fremoved_lines)
        self.ntriples = len(P.context(self.translation_graph))
        triples = [
            (self.snapshoturi, po.triplifiedIn, datetime.datetime.now()),
            (self.snapshoturi, po.triplifiedBy, "scripts/"),
            (self.snapshoturi, po.donatedBy, self.snapshotid),
            (self.snapshoturi, po.availableAt, self.online_prefix),
            (self.snapshoturi, po.onlineMetaXMLFile,
             self.online_prefix + self.mrdf),
            (self.snapshoturi, po.onlineMetaTTLFile,
             self.online_prefix + self.mttl),
            (self.snapshoturi, po.metaXMLFileName, self.mrdf),
            (self.snapshoturi, po.metaTTLFileName, self.mttl),
            (self.snapshoturi, po.totalXMLFileSizeMB, sum(self.size_xml)),
            (self.snapshoturi, po.totalTTLFileSizeMB, sum(self.size_ttl)),
            (self.snapshoturi, po.acquiredThrough,
             "Gmane public mailing list archive RSS feed"),
            (self.snapshoturi, po.socialProtocolTag, "Gmane"),
            (self.snapshoturi, po.socialProtocol,
             P.rdf.ic(po.Platform, "Gmane", self.meta_graph,
                      self.snapshoturi)),
            (self.snapshoturi, po.nTriples, self.ntriples),
            (self.snapshoturi, NS.rdfs.comment, self.desc),
            (self.snapshoturi, po.gmaneID, self.directory),
        ]
        P.add(triples, context=self.meta_graph)
Example No. 44
    def writeAllFB(self):
        c("started rendering of the snapshot publication. snapshotID:",
          self.snapshotid)
        self.final_path_ = "{}{}/".format(self.final_path, self.snapshotid)
        if not os.path.isdir(self.final_path_):
            os.mkdir(self.final_path_)
        g = P.context(self.friendship_graph)
        g.namespace_manager.bind("po", po)
        g.serialize(self.final_path_ + self.snapshotid + "Friendship.ttl",
                    "turtle")
        c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Friendship.rdf",
                    "xml")
        c("serialized friendships")
        # get filesize and ntriples
        # filesizerdf = os.path.getsize(self.final_path_+self.snapshotid +
        #                               "Friendship.rdf")/(10**6)
        # filesizettl = os.path.getsize(self.final_path_+self.snapshotid +
        #                               "Friendship.ttl")/(10**6)
        # ntriples = len(g)
        # triples = [
        #          (self.snapshoturi, po.friendshipXMLFileSizeMB, filesizerdf),
        #          (self.snapshoturi, po.friendshipTTLFileSizeMB, filesizettl),
        #          (self.snapshoturi, po.nFriendshipTriples, ntriples),
        #          ]
        g = P.context(self.meta_graph)
        # ntriples = len(g)
        # triples.append(
        #          (self.snapshoturi, po.nMetaTriples, ntriples+1),
        # )
        # P.add(triples, context=self.meta_graph)
        g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
        c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
        c("serialized meta")

        if not os.path.isdir(self.final_path_ + "scripts"):
            os.mkdir(self.final_path_ + "scripts")
        shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                    self.final_path_ + "scripts/triplify.py")
        # copy of the base data
        if not os.path.isdir(self.final_path_ + "base"):
            os.mkdir(self.final_path_ + "base")
        shutil.copy(self.data_path + self.filename_friendships,
                    self.final_path_ + "base/")

        originals = "base/{}".format(self.filename_friendships)
        tfriendship = """\n\n{nf} individuals with metadata {fvars}
and {nfs} friendships constitute the friendship network in the RDF/XML file:
{frdf} \nand in the Turtle file: \n{fttl}
(anonymized {fan}).""".format(
            nf=self.nfriends,
            fvars=str(self.friendsvars),
            nfs=self.nfriendships,
            frdf=self.frdf,
            fttl=self.fttl,
            fan=self.friendships_anonymized,
        )
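        # P.get presumably returns the matching (subject, predicate, object)
        # triple, so index [2] picks the po:dateObtained literal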
        datetime_string = P.get(self.snapshoturi,
                                po.dateObtained,
                                None,
                                context=self.social_graph)[2]

        with open(self.final_path_ + "README", "w") as f:
            f.write("""::: Open Linked Social Data publication
\nThis repository is an RDF data expression of the facebook
snapshot {snapid} collected around {date}.{tfriendship}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nOriginal file(s):
{origs}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is in the scripts/ \
                    directory.\n:::""".format(snapid=self.snapshotid,
                                              date=datetime_string,
                                              tfriendship=tfriendship,
                                              mrdf=self.mrdf,
                                              mttl=self.mttl,
                                              origs=originals,
                                              ise=self.isego,
                                              isg=self.isgroup,
                                              isf=self.isfriendship,
                                              isi=self.isinteraction,
                                              ist=self.hastext,
                                              ava=self.online_prefix,
                                              desc=self.desc))