Ejemplo n.º 1
0
 def makePostsTriples(self):
     """Store overall text-size statistics and publish them to the meta graph.

     Returns immediately when ``self.hastext`` is falsy.  Otherwise the sum,
     mean and standard deviation of ``size_chars_overall``,
     ``size_tokens_overall`` and ``size_sentences_overall`` are stored on the
     instance, the message, participant and URL counts are fetched from the
     translation graph, and the participant/message counts together with the
     size statistics are added to ``self.meta_graph`` as triples whose
     subject is ``self.snapshoturi``.  The URL count is only kept on the
     instance (``self.nurls``); no triple is emitted for it.
     """
     if not self.hastext:
         return

     # Character statistics.
     char_sizes = self.size_chars_overall
     self.totalchars = sum(char_sizes)
     self.mchars_messages = n.mean(char_sizes)
     self.dchars_messages = n.std(char_sizes)

     # Token statistics.
     token_sizes = self.size_tokens_overall
     self.totaltokens = sum(token_sizes)
     self.mtokens_messages = n.mean(token_sizes)
     self.dtokens_messages = n.std(token_sizes)

     # Sentence statistics.
     sentence_sizes = self.size_sentences_overall
     self.totalsentences = sum(sentence_sizes)
     self.msentences_messages = n.mean(sentence_sizes)
     self.dsentences_messages = n.std(sentence_sizes)

     # Entity counts taken from the already translated data.
     source_graph = self.translation_graph
     self.nmessages = P.get(
         "SELECT (COUNT(?s) as ?s) WHERE { ?s a po:Message }",
         context=source_graph)
     self.nparticipants = P.get(
         "SELECT (COUNT(?s) as ?s) WHERE { ?s a po:Participant }",
         context=source_graph)
     self.nurls = P.get(
         "SELECT (COUNT(?s) as ?s) WHERE { ?s po:hasUrl ?o }",
         context=source_graph)

     subject = self.snapshoturi
     statements = (
         (po.nParticipants, self.nparticipants),
         (po.nMessages, self.nmessages),
         (po.nCharsOverall, self.totalchars),
         (po.mCharsOverall, self.mchars_messages),
         (po.dCharsOverall, self.dchars_messages),
         (po.nTokensOverall, self.totaltokens),
         (po.mTokensOverall, self.mtokens_messages),
         (po.dTokensOverall, self.dtokens_messages),
         (po.nSentencesOverall, self.totalsentences),
         (po.mSentencesOverall, self.msentences_messages),
         (po.dSentencesOverall, self.dsentences_messages),
     )
     P.add([(subject, predicate, value) for predicate, value in statements],
           context=self.meta_graph)
Ejemplo n.º 2
0
 def translateStates(self):
     """Translate the rows of ``self.data["estados"]`` into state triples.

     Each row is read positionally (id, name, abbreviation, created,
     updated, relevance).  A ``po.State`` individual is obtained through
     ``P.rdf.ic`` and its name, abbreviation, creation time and relevance
     are recorded; ``po.updatedAt`` is only recorded when it differs from
     the creation time.  Triples are flushed to the translation graph every
     60 rows, and any remainder is flushed after the loop.
     """
     batch = []
     for seen, row in enumerate(self.data["estados"], 1):
         state_id = row[0]
         name = row[1]
         abbreviation = row[2]
         created_at = row[3]
         updated_at = row[4]
         relevance = row[5]
         state_uri = P.rdf.ic(po.State,
                              self.snapshotid+"-"+str(state_id),
                              self.translation_graph, self.snapshoturi)
         batch.extend([
             (state_uri, po.name, name),
             (state_uri, po.abbreviation, abbreviation),
             (state_uri, po.createdAt, created_at),
             (state_uri, po.relevance, relevance),
         ])
         if updated_at != created_at:
             batch.append((state_uri, po.updatedAt, updated_at))
         if seen % 60 == 0:
             # Periodic flush keeps each insert small.
             c("finished states entries:", seen, "ntriples:", len(batch))
             P.add(batch, self.translation_graph)
             c("finished add of states entries")
             batch = []
     if batch:
         P.add(batch, self.translation_graph)
     c("finished add of states entries")
Ejemplo n.º 3
0
 def translateSupporters(self):
     """Translate the rows of ``self.data["adesoes"]`` into support triples.

     Each row is read positionally (topic id, user id, created, updated,
     support id).  A ``po.Support`` individual is obtained through
     ``P.rdf.ic`` and linked to the participant and topic URIs built from
     the snapshot id; ``po.updatedAt`` is only recorded when it differs
     from the creation time.  Triples are flushed to the translation graph
     every 60 rows, and any remainder is flushed after the loop.
     """
     batch = []
     for seen, row in enumerate(self.data["adesoes"], 1):
         topic_id = row[0]
         user_id = row[1]
         created_at = row[2]
         updated_at = row[3]
         support_id = row[4]
         support_uri = P.rdf.ic(po.Support,
                                self.snapshotid+"-"+str(support_id),
                                self.translation_graph, self.snapshoturi)
         participant_uri = po.Participant+'#'+self.snapshotid+'-'+str(user_id)
         topic_uri = po.Topic+'#'+self.snapshotid+'-'+str(topic_id)
         batch.extend([
             (support_uri, po.participant, participant_uri),
             (support_uri, po.topic, topic_uri),
             (support_uri, po.createdAt, created_at),
         ])
         if updated_at != created_at:
             batch.append((support_uri, po.updatedAt, updated_at))
         if seen % 60 == 0:
             # Periodic flush keeps each insert small.
             c("finished supporters entries:", seen, "ntriples:", len(batch))
             P.add(batch, self.translation_graph)
             c("finished add of supporters entries")
             batch = []
     if batch:
         P.add(batch, self.translation_graph)
     c("finished add of supporters entries")
Ejemplo n.º 4
0
 def translateObservatories(self):
     """Translate the rows of ``self.data["observatorios"]`` into triples.

     Each row is read positionally (columns 0, 1, 4 and 5: observatory id,
     user id, created, updated).  A ``po.Observatory`` individual is
     obtained through ``P.rdf.ic`` and linked to its participant;
     ``po.updatedAt`` is only recorded when it differs from the creation
     time.  Triples are flushed to the translation graph every 60 rows and
     any remainder is flushed after the loop.
     """
     count = 0
     triples = []
     for observatorio in self.data["observatorios"]:
         oid = observatorio[0]
         uid = observatorio[1]
         created = observatorio[4]
         updated = observatorio[5]
         uri = P.rdf.ic(po.Observatory,
                        self.snapshotid+"-"+str(oid),
                        self.translation_graph, self.snapshoturi)
         # The participant URI is keyed by the snapshot *id*, exactly as in
         # the other translate* methods; using the snapshot URI here produced
         # an identifier that matched no participant.
         participant_uri = po.Participant+'#'+self.snapshotid+'-'+str(uid)
         triples += [
             (uri, po.participant, participant_uri),
             (uri, po.createdAt, created),
         ]
         if updated != created:
             triples.append((uri, po.updatedAt, updated))
         count += 1
         if count % 60 == 0:
             # Periodic flush keeps each insert small.
             c("finished observatory  entries:", count, "ntriples:", len(triples))
             P.add(triples, self.translation_graph)
             c("finished add of observatory entries")
             triples = []
     if triples:
         P.add(triples, self.translation_graph)
     c("finished add of observatory entries")
Ejemplo n.º 5
0
def minimalTestData():
    """Add three sample statements about ``SnapshotFoo#1`` to the "void" context.

    The statements use the ``facebook`` namespace: an ego flag, a user id
    string and a link to a participant URI.
    """
    snapshot = NS.po.SnapshotFoo+"#1"
    facebook = NS.facebook
    sample = [
        (snapshot, facebook.ego, True),
        (snapshot, facebook.userID, "1039203918"),
        (snapshot, facebook.user, facebook.Participant+"Foop"),
    ]
    P.add(sample, context="void")
Ejemplo n.º 6
0
 def translateNeighborhoods(self):
     """Translate the rows of ``self.data["bairros"]`` into triples.

     Each row is read positionally (id, name, city id, created, updated,
     relevance).  A ``po.Neighborhood`` individual is obtained through
     ``P.rdf.ic``, named, linked to its city URI and given its creation
     time and relevance; ``po.updatedAt`` is only recorded when it differs
     from the creation time.  Triples are flushed to the translation graph
     every 60 rows, and any remainder is flushed after the loop.
     """
     batch = []
     for seen, row in enumerate(self.data["bairros"], 1):
         neighborhood_id = row[0]
         name = row[1]
         city_id = row[2]
         created_at = row[3]
         updated_at = row[4]
         relevance = row[5]
         neighborhood_uri = P.rdf.ic(po.Neighborhood,
                                     self.snapshotid+"-"+str(neighborhood_id),
                                     self.translation_graph,
                                     self.snapshoturi)
         city_uri = po.City+'#'+self.snapshotid+'-'+str(city_id)
         batch.extend([
             (neighborhood_uri, po.name, name),
             (neighborhood_uri, po.city, city_uri),
             (neighborhood_uri, po.createdAt, created_at),
             (neighborhood_uri, po.relevance, relevance),
         ])
         if updated_at != created_at:
             batch.append((neighborhood_uri, po.updatedAt, updated_at))
         if seen % 60 == 0:
             # Periodic flush keeps each insert small.
             c("finished neighborhood entries:", seen, "ntriples:", len(batch))
             P.add(batch, self.translation_graph)
             c("finished add of neighborhood entries")
             batch = []
     if batch:
         P.add(batch, self.translation_graph)
     c("finished add of neighborhood entries")
Ejemplo n.º 7
0
 def translateCities(self):
     """Translate the rows of ``self.data["cidades"]`` into city triples.

     Each row is read positionally (id, name, state id, slug, created,
     updated, relevance).  A ``po.City`` individual is obtained through
     ``P.rdf.ic``, named, linked to its state URI and given its slug,
     creation time and relevance; ``po.updatedAt`` is only recorded when it
     differs from the creation time.  Triples are flushed to the
     translation graph every 60 rows and any remainder is flushed after the
     loop.
     """
     count = 0
     triples = []
     for cidade in self.data["cidades"]:
         cid = cidade[0]
         nome = cidade[1]
         eid = cidade[2]
         slug = cidade[3]
         created = cidade[4]
         updated = cidade[5]
         relevance = cidade[6]
         uri = P.rdf.ic(po.City,
                        self.snapshotid+"-"+str(cid),
                        self.translation_graph, self.snapshoturi)
         # State identifiers are "<snapshotid>-<id>" (see translateStates);
         # the separator was missing here, so cities pointed at state URIs
         # that were never created.
         state_uri = po.State+'#'+self.snapshotid+'-'+str(eid)
         triples += [
             (uri, po.name, nome),
             (uri, po.state, state_uri),
             (uri, po.slug, slug),
             (uri, po.createdAt, created),
             (uri, po.relevance, relevance),
         ]
         if updated != created:
             triples.append((uri, po.updatedAt, updated))
         count += 1
         if count % 60 == 0:
             # Periodic flush keeps each insert small.
             c("finished cities k entries:", count, "ntriples:", len(triples))
             P.add(triples, self.translation_graph)
             c("finished add of cities entries")
             triples = []
     if triples:
         P.add(triples, self.translation_graph)
     c("finished add of cities entries")
Ejemplo n.º 8
0
 def addArticleBody(self, body, articleuri):
     """Attach the text of an article body to ``articleuri``.

     When ``body`` contains something shaped like markup (the tag regex
     matches), the raw body is stored as ``po.htmlBodyText`` and the text
     extracted by BeautifulSoup, if non-empty, as ``po.cleanBodyText``.
     Whenever an insert raises ``QueryBadFormed`` the URL-quoted text is
     stored under the corresponding ``quoted*`` predicate instead.  A body
     without markup is stored directly as ``po.cleanBodyText``.  In every
     case ``body`` is appended to ``self.bodies``.
     """
     has_markup = re.search(r"<(.*)>(.*)<(.*)>", body, re.S) is not None
     if not has_markup:
         P.add([(articleuri, po.cleanBodyText, body)],
               context=self.translation_graph)
     else:
         try:
             P.add((articleuri, po.htmlBodyText, body),
                   context=self.translation_graph)
         except QueryBadFormed:
             c("QUOTING HTML BODY")
             quoted_body = urllib.parse.quote(body)
             P.add((articleuri, po.quotedHtmlBodyText, quoted_body),
                   context=self.translation_graph)
         plain_text = BeautifulSoup(body, 'html.parser').get_text()
         if plain_text:
             try:
                 P.add((articleuri, po.cleanBodyText, plain_text),
                       context=self.translation_graph)
             except QueryBadFormed:
                 c("QUOTING HTML CLEAN BODY")
                 quoted_text = urllib.parse.quote(plain_text)
                 P.add((articleuri, po.quotedCleanBodyText, quoted_text),
                       context=self.translation_graph)
     self.bodies += [body]
Ejemplo n.º 9
0
 def writeRdf(self):
     """Serialize the translation and meta graphs into ./participabr_snapshot/.

     The output directory is created when missing.  The translation graph
     is written as Turtle and RDF/XML, then the snapshot metadata triples
     are added to the meta graph, which is written in the same two formats.
     """
     pub_dir = './participabr_snapshot/'
     if not os.path.isdir(pub_dir):
         os.mkdir(pub_dir)

     content_graph = P.context(self.translation_graph)
     for target, fmt, notice in (
             ('participabr.ttl', 'turtle', 'participation ttl serialized'),
             ('participabr.rdf', 'xml', 'participation xml serialized')):
         content_graph.serialize(pub_dir+target, fmt)
         c(notice)

     # Snapshot-level metadata (group, platform, ...).
     snapshot = self.snapshoturi
     metadata = [
         (snapshot, a, po.Snapshot),
         (snapshot, po.snapshotID, self.snapshotid),
         (snapshot, po.isEgo, False),
         (snapshot, po.isGroup, True),
         (snapshot, po.isFriendship, True),
         (snapshot, po.isInteraction, True),
         (snapshot, po.isPost, True),
         (snapshot, po.socialProtocol, 'ParticipaBR'),
         (snapshot, po.dateObtained, datetime.date(2012, 6, 28)),
     ]
     P.add(metadata, self.meta_graph)

     meta_graph = P.context(self.meta_graph)
     for target, fmt, notice in (
             ('participabrMeta.ttl', 'turtle',
              'participation meta ttl serialized'),
             ('participabrMeta.rdf', 'xml',
              'participation meta xml serialized')):
         meta_graph.serialize(pub_dir+target, fmt)
         c(notice)
Ejemplo n.º 10
0
 def translateLinks(self):
     """Translate the rows of ``self.data['links']`` into link triples.

     Each row is read positionally (columns 0, 1, 2, 4, 5 and 6: id, name,
     url, topic id, created, updated).  A ``po.Link`` individual is
     obtained through ``P.rdf.ic``, given its name and url and linked to
     its topic URI; ``po.updatedAt`` is only recorded when it differs from
     the creation time.  Triples are flushed to the translation graph every
     60 rows, and any remainder is flushed after the loop.
     """
     batch = []
     for seen, row in enumerate(self.data['links'], 1):
         link_id = row[0]
         name = row[1]
         address = row[2]
         topic_id = row[4]
         created_at = row[5]
         updated_at = row[6]
         link_uri = P.rdf.ic(po.Link,
                             self.snapshotid+"-"+str(link_id),
                             self.translation_graph, self.snapshoturi)
         topic_uri = po.Topic+'#'+self.snapshotid+'-'+str(topic_id)
         batch.extend([
             (link_uri, po.name, name),
             (link_uri, po.url, address),
             (link_uri, po.topic, topic_uri),
             (link_uri, po.createdAt, created_at),
         ])
         if updated_at != created_at:
             batch.append((link_uri, po.updatedAt, updated_at))
         if seen % 60 == 0:
             # Periodic flush keeps each insert small.
             c("finished links entries:", seen, "ntriples:", len(batch))
             P.add(batch, self.translation_graph)
             c("finished add of links entries")
             batch = []
     if batch:
         P.add(batch, self.translation_graph)
     c("finished add of links entries")
Ejemplo n.º 11
0
    def rdfGroupPosts(self, filename_posts_):
        """Translate a tab-separated posts file into triples in the posts graph.

        The file is split on newlines (the piece after the last newline is
        discarded) and each line on tabs.  The first line must match, column
        by column, the leading entries of the expected header; every
        following line becomes a ``po.Post`` individual carrying its id,
        type, text, creation time, comment and like counts and its size in
        characters and tokens.

        Side effects: sets ``self.nposts``, ``self.postsvars`` and the
        mean / standard deviation / total attributes for characters and
        tokens, and adds all post triples to ``self.posts_graph``.

        Args:
            filename_posts_: path of the tab-separated posts file.

        Raises:
            ValueError: if the file has no complete line or its header does
                not match the expected columns.
        """
        # The context manager closes the handle (it used to be leaked).
        with open(filename_posts_, "r") as posts_file:
            lines = posts_file.read().split("\n")[:-1]
        data = [line.split("\t") for line in lines]
        if not data:
            # Previously surfaced as a bare IndexError on data[0].
            raise ValueError("the tab file format was not understood")
        tvars = data[0]
        standard_vars = [
            'id', 'type', 'message', 'created_time', 'comments', 'likes',
            'commentsandlikes'
        ]
        if len(tvars) != sum(i == j for i, j in zip(tvars, standard_vars)):
            raise ValueError("the tab file format was not understood")
        data = data[1:]
        triples = []
        self.nposts = 0
        nchars_all = []
        ntokens_all = []
        for post in data:
            ind = P.rdf.ic(po.Post, post[0], self.posts_graph,
                           self.snapshoturi)
            # Underscores in the message column are turned into line breaks.
            ptext = post[2].replace("_", "\n")
            nchars = len(ptext)
            nchars_all.append(nchars)
            ntokens = len(k.tokenize.wordpunct_tokenize(ptext))
            ntokens_all.append(ntokens)
            triples += [
                (ind, po.snapshot, self.snapshoturi),
                (ind, po.postID, post[0]),
                (ind, po.postType, post[1]),
                (ind, po.postText, ptext),
                (ind, po.createdAt, dateutil.parser.parse(post[3])),
                (ind, po.nComments, int(post[4])),
                (ind, po.nLikes, int(post[5])),
                (ind, po.nChars, nchars),
                (ind, po.nTokens, ntokens),
            ]
            # Progress report every 200 posts (including the very first).
            if self.nposts % 200 == 0:
                c("posts: ", self.nposts)
            self.nposts += 1
        self.postsvars = [
            "postID", "postType", "postText", "createdAt", "nComments",
            "nLikes", "nChars", "nTokens"
        ]
        self.mcharsposts = n.mean(nchars_all)
        self.dcharsposts = n.std(nchars_all)
        self.totalchars = n.sum(nchars_all)
        self.mtokensposts = n.mean(ntokens_all)
        self.dtokensposts = n.std(ntokens_all)
        self.totaltokens = n.sum(ntokens_all)
        # The aggregate statistics above are kept on the instance only; an
        # earlier note here said their triples "went to meta file".
        P.add(triples, context=self.posts_graph)
Ejemplo n.º 12
0
def triplesScaffolding(subjects, predicates, objects, context=None):
    """Link subject(s) through predicate(s) to object(s).

    Each argument is either a loose URI (``r.URIRef`` / ``r.Namespace``) or
    a sequence; all sequences must have the same length N and every loose
    URI is repeated N times.  A plain ``str`` given as ``subjects`` is
    converted to a ``URIRef`` first.  Accepted combinations, eg:
      triplesScaffolding(participants, NS.po.name, names)   # N 1 N
      triplesScaffolding(participants, name_props, name)    # N N 1
      triplesScaffolding(participant, name_props, names)    # 1 N N

      triplesScaffolding(participant, name_props, name)     # 1 N 1
      triplesScaffolding(participant, NS.po.name, names)    # 1 1 N
      triplesScaffolding(participants, NS.po.name, name)    # N 1 1

    Might be useful for rearranging lists into triples:
      triplesScaffolding(participants, name_props, names)   # N N N
      triplesScaffolding(participant, NS.po.name, name)     # 1 1 1

    Args:
        subjects, predicates, objects: loose URI or equally sized sequences.
        context: graph context handed to ``P.add``; the special value
            ``"return_triples"`` returns the triples instead of adding them.

    Returns:
        The list of triples when ``context == "return_triples"``, else None.

    Raises:
        ValueError: if the sequences do not all have the same length.
    """
    if isinstance(subjects, str):
        subjects = r.URIRef(subjects)

    loose_types = (r.URIRef, r.Namespace)
    terms = (subjects, predicates, objects)
    lengths = [len(term) for term in terms
               if not isinstance(term, loose_types)]
    # With three loose URIs there is exactly one triple to build; the
    # previous computation yielded N == 0 there and silently added nothing.
    N = max(lengths) if lengths else 1
    if any(length != N for length in lengths):
        raise ValueError(
            "input should be a combination of loose URIs and lists of same size "
        )
    # Broadcast loose URIs into fresh lists; caller sequences are used as is.
    subjects, predicates, objects = (
        [term] * N if isinstance(term, loose_types) else term
        for term in terms
    )
    triples = list(zip(subjects, predicates, objects))
    if context == "return_triples":
        return triples
    P.add(triples, context=context)
Ejemplo n.º 13
0
def void():
    """Add sample file-location statements about ``SnapshotFoo#1`` to "void".

    The snapshot is typed as a ``FacebookSnapshot`` and given placeholder
    raw, rdf and void file paths.
    """
    snapshot = NS.po.SnapshotFoo+"#1"
    po_ns = NS.po
    statements = [
        (snapshot, a, po_ns.FacebookSnapshot),
        (snapshot, po_ns.rawFile, "~/.percolation/data/somedirs/something.raw"),
        (snapshot, po_ns.rdfFile, "~/.percolation/data/somedirs/something.rdf"),
        (snapshot, po_ns.voidFile, "~/.percolation/data/somedirs/void.raw"),
    ]
    P.add(statements, context="void")
Ejemplo n.º 14
0
def minimumTestOntology(context="minimum_ontology"):
    """Add a four-statement test ontology to ``context``.

    The statements declare one subclass, one range, one domain and one
    subproperty relation between ``po`` and ``facebook`` terms.
    """
    po_ns = NS.po
    facebook = NS.facebook
    rdfs = NS.rdfs
    ontology = [
        (po_ns.FacebookSnapshot, rdfs.subClassOf, po_ns.Snapshot),
        (facebook.user, rdfs.range, po_ns.Participant),
        (facebook.ego, rdfs.domain, po_ns.FacebookSnapshot),
        (facebook.userID, rdfs.subPropertyOf, po_ns.userID),
    ]
    P.add(ontology, context=context)
Ejemplo n.º 15
0
def minimalTestData():
    """Populate the "void" context with three facts about ``SnapshotFoo#1``.

    The facts are an ego flag, a user id string and a participant link, all
    expressed with ``facebook`` namespace predicates.
    """
    subject = NS.po.SnapshotFoo + "#1"
    facts = (
        (NS.facebook.ego, True),
        (NS.facebook.userID, "1039203918"),
        (NS.facebook.user, NS.facebook.Participant + "Foop"),
    )
    P.add([(subject, predicate, value) for predicate, value in facts],
          context="void")
Ejemplo n.º 16
0
def parseLegacyFiles(data_dir=DATADIR+"twitter/"):
    """Parse legacy pickle files with Twitter tweets.

    Every entry of ``data_dir`` except ``ipython_log.py`` and ``*.swp``
    files is registered as one snapshot: triples describing the snapshot
    and its raw file (size in units of 10**6 bytes, name, format, expressed
    classes) are collected.  The "social_twitter" context is reset, the
    collected triples are added to it and summary counts are logged.

    Args:
        data_dir: directory holding the legacy pickle files.

    Returns:
        The set of snapshot URIs that were registered.
    """
    filenames = [i for i in os.listdir(data_dir)
                 if i != "ipython_log.py" and not i.endswith(".swp")]
    expressed_classes = [po.Participant, po.Tweet]

    snapshots = set()
    triples = []
    for filename in filenames:
        snapshotid = "twitter-legacy-"+filename.replace("_", "")
        snapshoturi = po.TwitterSnapshot+"#"+snapshotid
        expressed_reference = filename.replace("_", "").replace(".pickle", "")
        name_humanized = "Twitter"+expressed_reference
        # os.path.join also works when data_dir lacks a trailing separator.
        filesize = os.path.getsize(os.path.join(data_dir, filename))/10**6
        fileformat = "pickle"
        fileuri = po.File+"#twitter-file-"+filename
        triples += [
            (snapshoturi, a, po.Snapshot),
            (snapshoturi, a, po.TwitterSnapshot),
            (snapshoturi, po.snapshotID, snapshotid),
            (snapshoturi, po.isEgo, False),
            (snapshoturi, po.isGroup, True),
            (snapshoturi, po.isFriendship, False),
            (snapshoturi, po.isInteraction, True),
            (snapshoturi, po.isPost, True),
            (snapshoturi, po.humanizedName, name_humanized),
            (snapshoturi, po.expressedReference, expressed_reference),
            (snapshoturi, po.rawFile, fileuri),
            (fileuri, po.fileSize, filesize),
            (fileuri, po.fileName, filename),
            (fileuri, po.fileFormat, fileformat),
        ]
        triples += [(fileuri, po.expressedClass, expressed_class)
                    for expressed_class in expressed_classes]
        snapshots.add(snapshoturi)
    nfiles = len(filenames)
    nsnapshots = len(snapshots)
    P.context("social_twitter", "remove")
    platformuri = P.rdf.ic(po.Platform, "Twitter", context="social_twitter")
    # NOTE(review): these session predicates are named nIRC* although this is
    # the Twitter parser; left untouched because consumers may rely on them.
    triples += [
        (NS.social.Session, NS.social.nIRCParsedFiles, nfiles),
        (NS.social.Session, NS.social.nIRCSnapshots, nsnapshots),
        (platformuri, po.dataDir, data_dir),
    ]
    P.add(triples, context="social_twitter")
    c("parsed {} twitter files ({} snapshots) are in percolation graph and 'social_twitter' context".format(nfiles, nsnapshots))
    c("percolation graph have {} triples ({} in social_twitter context)".format(len(P.percolation_graph), len(P.context("social_twitter"))))
    negos = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE         { GRAPH <social_twitter> { ?s po:isEgo true         } } ")
    ngroups = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE       { GRAPH <social_twitter> { ?s po:isGroup true       } } ")
    nfriendships = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE  { GRAPH <social_twitter> { ?s po:isFriendship true  } } ")
    ninteractions = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isInteraction true } } ")
    nposts = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE        { GRAPH <social_twitter> { ?s po:isPost true        } } ")
    totalsize = sum(P.query(r" SELECT ?size WHERE              { GRAPH <social_twitter> { ?s po:fileSize ?size     } } "))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship network. {} have an interaction network. {} have post texts and reaction counts
Total raw data size is {:.2f}MB""".format(negos, ngroups, nfriendships, ninteractions, nposts, totalsize))

    return snapshots
Ejemplo n.º 17
0
 def translateObservatoryTags(self):
     """Link observatories to their tags.

     Every row of ``self.data["observatorios_tem_tags"]`` holds an
     observatory id and a tag id (columns 0 and 1); one ``po.hasTag``
     triple per row is added to the translation graph in a single insert.
     """
     links = [
         (po.Observatory+'#'+self.snapshotid+'-'+str(pair[0]),
          po.hasTag,
          po.Tag+'#'+self.snapshotid+'-'+str(pair[1]))
         for pair in self.data["observatorios_tem_tags"]
     ]
     P.add(links, self.translation_graph)
     c("finished add of observatory tag entries")
Ejemplo n.º 18
0
def parseLegacyFiles(data_dir=DATADIR+"irc/"):
    """Parse legacy txt files with irc logs.

    Every entry of ``data_dir`` except ``ipython_log.py`` and ``*.swp``
    files is registered as one snapshot: triples describing the snapshot
    and its raw file (size in units of 10**6 bytes, name, format, expressed
    classes) are collected.  The "social_irc" context is reset, the
    collected triples are added to it and summary counts are logged.

    Args:
        data_dir: directory holding the legacy log files.

    Returns:
        The set of snapshot URIs that were registered.
    """
    filenames = [i for i in os.listdir(data_dir)
                 if i != "ipython_log.py" and not i.endswith(".swp")]
    expressed_classes = [po.Participant, po.IRCMessage]

    snapshots = set()
    triples = []
    for filename in filenames:
        snapshotid = "irc-legacy-"+filename.replace("#", "")
        # Mint the URI under the IRC snapshot class, matching the rdf:type
        # asserted below (it used to be minted under po.TwitterSnapshot).
        snapshoturi = po.IRCSnapshot+"#"+snapshotid
        expressed_reference = filename.replace("#", "").replace(".txt", "").replace(".log", "")
        name_humanized = "IRC log of channel "+expressed_reference
        # os.path.join also works when data_dir lacks a trailing separator.
        filesize = os.path.getsize(os.path.join(data_dir, filename))/10**6
        fileformat = "txt"
        fileuri = po.File+"#Irc-log-"+filename.replace("#", "")
        triples += [
            (snapshoturi, a, po.Snapshot),
            (snapshoturi, a, po.IRCSnapshot),
            (snapshoturi, po.snapshotID, snapshotid),
            (snapshoturi, po.isEgo, False),
            (snapshoturi, po.isGroup, True),
            (snapshoturi, po.isFriendship, False),
            (snapshoturi, po.isInteraction, True),
            (snapshoturi, po.isPost, True),
            (snapshoturi, po.humanizedName, name_humanized),
            (snapshoturi, po.expressedReference, expressed_reference),
            (snapshoturi, po.rawFile, fileuri),
            (fileuri, po.fileSize, filesize),
            (fileuri, po.fileName, filename),
            (fileuri, po.fileFormat, fileformat),
        ]
        triples += [(fileuri, po.expressedClass, expressed_class)
                    for expressed_class in expressed_classes]
        snapshots.add(snapshoturi)
    nfiles = len(filenames)
    nsnapshots = len(snapshots)
    P.context("social_irc", "remove")
    platformuri = P.rdf.ic(po.Platform, "IRC", context="social_irc")
    triples += [
        (NS.social.Session, NS.social.nIRCParsedFiles, nfiles),
        (NS.social.Session, NS.social.nIRCSnapshots, nsnapshots),
        (platformuri, po.dataDir, data_dir),
    ]
    P.add(triples, context="social_irc")
    # The message now names the context the triples are actually added to.
    c("parsed {} irc logs files ({} snapshots) are in percolation graph and 'social_irc' context".format(nfiles, nsnapshots))
    c("percolation graph have {} triples ({} in social_irc context)".format(len(P.percolation_graph), len(P.context("social_irc"))))
    negos = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE         { GRAPH <social_irc> { ?s po:isEgo true         } } ")
    ngroups = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE       { GRAPH <social_irc> { ?s po:isGroup true       } } ")
    nfriendships = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE  { GRAPH <social_irc> { ?s po:isFriendship true  } } ")
    ninteractions = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isInteraction true } } ")
    nposts = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE        { GRAPH <social_irc> { ?s po:isPost true        } } ")
    totalsize = sum(P.query(r" SELECT ?size WHERE              { GRAPH <social_irc> { ?s po:fileSize ?size     } } "))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship structures. {} have an interaction structures. {} have texts 
Total raw data size is {:.2f}MB""".format(negos, ngroups, nfriendships, ninteractions, nposts, totalsize))
    return snapshots
Ejemplo n.º 19
0
def void():
    """Populate the "void" context with sample statements about ``SnapshotFoo#1``.

    The snapshot is typed as a ``FacebookSnapshot`` and associated with
    placeholder raw, rdf and void file paths.
    """
    subject = NS.po.SnapshotFoo + "#1"
    facts = (
        (a, NS.po.FacebookSnapshot),
        (NS.po.rawFile, "~/.percolation/data/somedirs/something.raw"),
        (NS.po.rdfFile, "~/.percolation/data/somedirs/something.rdf"),
        (NS.po.voidFile, "~/.percolation/data/somedirs/void.raw"),
    )
    P.add([(subject, predicate, value) for predicate, value in facts],
          context="void")
Ejemplo n.º 20
0
    def translateComments(self):
        """Translate the rows of ``self.data['comments']`` into comment triples.

        Rows are read positionally (columns 0, 1, 3, 4, 8, 9 and 10: id,
        topic id, body, user id, type, created, updated); rows with an
        empty body are skipped.  A ``po.Comment`` individual is obtained
        through ``P.rdf.ic`` and linked to its author and topic, and its
        text, translated type and creation time are recorded;
        ``po.updatedAt`` is only recorded when it differs from the creation
        time.  Triples are flushed to the translation graph every 60
        comments and any remainder is flushed after the loop.

        Raises:
            AssertionError: if a column does not have the expected type.
            KeyError: if the comment type is not one of the known types.
        """
        # Portuguese comment types mapped to the English labels stored.
        trans = {'resposta': 'answer',
                 'pergunta': 'question',
                 'comentario': 'comment',
                 'ideia': 'idea'}
        triples = []
        count = 0
        for comment in self.data['comments']:
            cid = comment[0]
            tid = comment[1]  # topic id
            body = comment[3]
            if not body:
                continue
            # NOTE(review): as written this replaces '' with '' and changes
            # nothing; possibly a control character was lost from the first
            # literal -- confirm against the upstream source.
            body = body.replace('', '')
            uid = comment[4]
            ctype = comment[8]
            created = comment[9]
            updated = comment[10]

            assert isinstance(cid, int)
            assert isinstance(tid, int)
            assert isinstance(body, str)
            assert isinstance(uid, int)
            assert isinstance(ctype, str)
            assert isinstance(created, datetime.datetime)
            assert isinstance(updated, datetime.datetime)
            commenturi = P.rdf.ic(po.Comment,
                                  self.snapshotid+"-"+str(cid),
                                  self.translation_graph, self.snapshoturi)
            participanturi = po.Participant+'#'+self.snapshotid+"-"+str(uid)
            topicuri = po.Topic+'#'+self.snapshotid+'-'+str(tid)
            # The timestamps belong to the comment itself; they used to be
            # attached to the topic URI, giving each topic the dates of all
            # of its comments.
            triples += [
                (commenturi, po.author, participanturi),
                (commenturi, po.topic, topicuri),
                (commenturi, po.text, body),
                (commenturi, po.type, trans[ctype]),
                (commenturi, po.createdAt, created),
            ]
            if updated != created:
                triples.append((commenturi, po.updatedAt, updated))
            count += 1
            if count % 60 == 0:
                # Periodic flush keeps each insert small.
                c("finished comment entries:", count, "ntriples:", len(triples))
                P.add(triples, self.translation_graph)
                c("finished add of comment entries")
                triples = []
        if triples:
            P.add(triples, self.translation_graph)
        c("finished add of comment entries")
Ejemplo n.º 21
0
 def translatePlaces(self):
     """Translate the rows of ``self.data["locais"]`` into place triples.

     Rows are read positionally (columns 0-4 and 7-10: id, responsible id,
     responsible type, neighborhood id, city id, created, updated, cep,
     state id).  A ``po.Place`` individual is obtained through
     ``P.rdf.ic``; neighborhood, city, state and cep are only recorded when
     truthy, ``po.updatedAt`` only when it differs from the creation time,
     and ``po.accountable`` only when the responsible type is one of the
     known types.  Triples are flushed to the translation graph every 60
     rows and any remainder is flushed after the loop.
     """
     # Responsible-entity type -> class whose URI prefix identifies it.
     accountable_classes = {
         "Topico": po.Topic,
         "User": po.User,
         "Competition": po.Competition,
         "Observatorio": po.Observatory,
     }
     count = 0
     triples = []
     for local in self.data["locais"]:
         lid = local[0]
         rid = local[1]
         rtype = local[2]
         bid = local[3]
         cid = local[4]
         created = local[7]
         updated = local[8]
         cep = local[9]
         eid = local[10]
         uri = P.rdf.ic(po.Place,
                        self.snapshotid+"-"+str(lid),
                        self.translation_graph, self.snapshoturi)
         triples.append((uri, po.createdAt, created))
         if bid:
             triples.append((uri, po.neighborhood,
                             po.Neighborhood+'#'+self.snapshotid+'-'+str(bid)))
         if cid:
             triples.append((uri, po.city,
                             po.City+'#'+self.snapshotid+'-'+str(cid)))
         if eid:
             triples.append((uri, po.state,
                             po.State+'#'+self.snapshotid+'-'+str(eid)))
         if cep:
             triples.append((uri, po.cep, cep))
         if updated != created:
             triples.append((uri, po.updatedAt, updated))
         if rtype:
             accountable_class = accountable_classes.get(rtype)
             if accountable_class is None:
                 # An unknown type used to reuse the URI of a previous row
                 # (or raise UnboundLocalError on the first row); skip it.
                 c("unknown accountable type in places entry:", lid, rtype)
             else:
                 triples.append(
                     (uri, po.accountable,
                      accountable_class+'#'+self.snapshotid+'-'+str(rid)))
         count += 1
         if count % 60 == 0:
             # Periodic flush keeps each insert small.
             c("finished places entries:", count, "ntriples:", len(triples))
             P.add(triples, self.translation_graph)
             c("finished add of places entries")
             triples = []
     if triples:
         P.add(triples, self.translation_graph)
     c("finished add of places entries")
Ejemplo n.º 22
0
    def writeAll(self):
        """Serialize the meta graph to disk and write the publication README.

        Steps, in order:
          1. Record the current size of the meta graph as a
             ``po.nMetaTriples`` triple on the snapshot.
          2. Bind the ``po`` prefix and serialize the graph as Turtle
             (``<snapshotid>Meta.ttl``) and RDF/XML (``<snapshotid>Meta.rdf``)
             under ``self.final_path_``.
          3. Create ``<final_path_>scripts`` if missing and copy
             ``triplify.py`` into it.
          4. Rebind ``self.dates`` to ISO-formatted strings and write
             ``<final_path_>README`` describing the snapshot.
        """
        g=P.context(self.meta_graph)
        # Measured before the nMetaTriples triple itself is added below.
        ntriples=len(g)
        triples=[
                 (self.snapshoturi,po.nMetaTriples,ntriples)      ,
                 ]
        P.add(triples,context=self.meta_graph)
        g.namespace_manager.bind("po",po)
        g.serialize(self.final_path_+self.snapshotid+"Meta.ttl","turtle"); c("ttl")
        g.serialize(self.final_path_+self.snapshotid+"Meta.rdf","xml")
        c("serialized meta")
        if not os.path.isdir(self.final_path_+"scripts"):
            os.mkdir(self.final_path_+"scripts")
        shutil.copy(PACKAGEDIR+"/../tests/triplify.py",self.final_path_+"scripts/triplify.py")
        # copy of the base data

        # NOTE(review): this rebinds self.dates to a list of strings, so the
        # min/max below compare ISO strings, and any later code reading
        # self.dates sees strings rather than the original date objects.
        self.dates=[i.isoformat() for i in self.dates]
        date1=min(self.dates)
        date2=max(self.dates)
        # NOTE(review): the template text mentions both "IRC" and "tweets";
        # confirm which platform this writer is meant for.
        with open(self.final_path_+"README","w") as f:
            f.write("""::: Open Linked Social Data publication
\nThis repository is a RDF data expression of the IRC
snapshot {snapid} with tweets from {date1} to {date2}
(total of {ntrip} triples).{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is on the script/ directory.\n:::""".format(
                # NOTE(review): ntrip uses self.ntriples, not the local
                # `ntriples` computed above from the meta graph.
                snapid=self.snapshotid,date1=date1,date2=date2,ntrip=self.ntriples,
                        # NOTE(review): `tposts` is not assigned anywhere in
                        # this method; unless it is a module-level name this
                        # raises NameError. It is also passed for both
                        # placeholders, so `tinteraction` repeats the posts text.
                        tinteraction=tposts,
                        tposts=tposts,
                        mrdf=self.translation_xml,
                        mttl=self.translation_ttl,
                        ise=self.isego,
                        isg=self.isgroup,
                        isf=self.isfriendship,
                        isi=self.isinteraction,
                        ist=self.hastext,
                        ava=self.online_prefix,
                        desc=self.desc
                        ))
Ejemplo n.º 23
0
    def rdfGroupPosts(self,filename_posts_):
        """Triplify a tab-separated posts file into ``self.posts_graph``.

        The first line of the file is a header that must match (position by
        position) the expected column names; the remaining lines are posts.
        For every post a ``po.Post`` individual is created and its id, type,
        text, creation time, comment/like counts and character/token counts
        are added as triples.

        Side effects: sets ``self.nposts``, ``self.postsvars`` and the
        mean / standard deviation / total of characters and tokens
        (``self.mcharsposts``, ``self.dcharsposts``, ``self.totalchars``,
        ``self.mtokensposts``, ``self.dtokensposts``, ``self.totaltokens``).

        Raises:
            ValueError: if the header does not match the expected columns.
        """
        # Read inside a context manager so the file handle is always closed
        # (the previous version left it open until garbage collection).
        with open(filename_posts_,"r") as posts_file:
            raw=posts_file.read()
        # The text after the last newline is dropped.
        data=[line.split("\t") for line in raw.split("\n")[:-1]]
        tvars=data[0]
        standard_vars=['id','type','message','created_time','comments','likes','commentsandlikes']
        if len(tvars)!=sum(i==j for i,j in zip(tvars,standard_vars)):
            raise ValueError("the tab file format was not understood")
        data=data[1:]
        triples=[]
        self.nposts=0
        nchars_all=[]
        ntokens_all=[]
        for post in data:
            ind=P.rdf.ic(po.Post,post[0],self.posts_graph,self.snapshoturi)
            # Underscores in the message column are turned into newlines.
            ptext=post[2].replace("_","\n")
            nchars=len(ptext)
            nchars_all.append(nchars)
            ntokens=len(k.tokenize.wordpunct_tokenize(ptext))
            ntokens_all.append(ntokens)
            triples.extend([
                     (ind,po.snapshot,self.snapshoturi),
                     (ind,po.postID,post[0]),
                     (ind,po.postType,post[1]),
                     (ind,po.postText,ptext),
                     (ind,po.createdAt,dateutil.parser.parse(post[3])),
                     (ind,po.nComments,int(post[4])),
                     (ind,po.nLikes,int(post[5])),
                     (ind,po.nChars,nchars),
                     (ind,po.nTokens,ntokens),
                     ])
            if self.nposts%200==0:
                c("posts: ",self.nposts)
            self.nposts+=1
        self.postsvars=["postID","postType","postText","createdAt","nComments","nLikes","nChars","nTokens"]
        # Summary statistics are only stored as attributes here; the triples
        # for them are not added in this method (original note: "went to
        # meta file").
        self.mcharsposts=n.mean(nchars_all)
        self.dcharsposts=n.std(nchars_all)
        self.totalchars=n.sum(nchars_all)
        self.mtokensposts=n.mean(ntokens_all)
        self.dtokensposts=n.std(ntokens_all)
        self.totaltokens=n.sum(ntokens_all)
        P.add(triples,context=self.posts_graph)
Ejemplo n.º 24
0
 def translateVotes(self):
     """Translate every row of the votes table into ``po.Vote`` triples.

     Each vote gets its creation time, vote value and a reference to the
     voted article or comment; when a voter id is present the authoring
     participant is linked too. Triples are flushed to
     ``self.translation_graph`` every 100 votes and once more at the end.

     Raises:
         ValueError: if a vote targets something other than an Article or
             a Comment.
     """
     columns = ("id", "vote", "voteable_id",
                "voteable_type", "voter_id", "voter_type", "created_at")
     pending = []
     known_comments = set(self.comments_table.get("id"))
     done = 0
     for row in self.votes_table.getMany(columns):
         (id_, vote, voteable_id, voteable_type,
          voter_id, voter_type, created_at) = row
         assert isinstance(id_, int)
         assert isinstance(voteable_id, int)
         assert isinstance(created_at, datetime.datetime)
         voteuri = P.rdf.ic(po.Vote, self.snapshotid+"-"+str(id_),
                            self.translation_graph, self.snapshoturi)
         if voteable_type not in ("Article", "Comment"):
             raise ValueError("unexpected voteable type")
         if voteable_type == "Article":
             # The article subtype is looked up but not used for the URI;
             # the lookup is kept because it fails for unknown article ids.
             _article_kind = self.articletypes[voteable_id].split("::")[-1]
             target_class = po.Article
         else:
             assert voteable_id in known_comments
             target_class = po.Comment
         referenceuri = \
             target_class+"#"+self.snapshotid+"-"+str(voteable_id)
         pending.extend((
             (voteuri, po.createdAt, created_at),
             (voteuri, po.vote, vote),
             (voteuri, po.reference, referenceuri),
         ))
         if voter_id:
             assert voter_type == "Profile"
             assert isinstance(voter_id, int)
             participanturi = po.Participant + '#' + \
                 self.snapshotid+"-"+self.profileids[voter_id]
             pending.append((voteuri, po.author, participanturi))
         done += 1
         if done % 100 == 0:
             # Periodic flush keeps the in-memory batch small.
             c("votes done:", done)
             c("ntriples:", len(pending))
             P.add(pending, self.translation_graph)
             c("finished add of votes")
             pending = []
     if pending:
         c("ntriples:", len(pending))
         P.add(pending, self.translation_graph)
Ejemplo n.º 25
0
 def makeMeta(self):
     """Add the fixed snapshot-level description to the meta graph.

     Types the snapshot as ``po.Snapshot`` and records its id, the
     ego/group/friendship/interaction/post flags, the social protocol
     name and the date the data was obtained.
     """
     snapshot = self.snapshoturi
     # Additional typings (po.AASnapshot, po.AAIRCSnapshot) were commented
     # out in the original and stay disabled.
     description = (
         (a, po.Snapshot),
         (po.snapshotID, self.snapshotid),
         (po.isEgo, False),
         (po.isGroup, True),
         (po.isFriendship, False),
         (po.isInteraction, False),
         (po.isPost, True),
         (po.socialProtocol, 'Algorithmic Autoregulation'),
         (po.dateObtained, datetime.date(2015, 7, 15)),
     )
     triples = [(snapshot, predicate, value)
                for predicate, value in description]
     P.add(triples, self.meta_graph)
Ejemplo n.º 26
0
 def makeMeta(self):
     """Add the fixed snapshot-level description to the meta graph.

     Types the snapshot as ``po.Snapshot`` and records its id, the
     ego/group/friendship/interaction/post flags, the social protocol
     name and the date the data was obtained.
     """
     snapshot = self.snapshoturi
     # Additional typings (po.AASnapshot, po.AAIRCSnapshot) were commented
     # out in the original and stay disabled.
     description = (
         (a, po.Snapshot),
         (po.snapshotID, self.snapshotid),
         (po.isEgo, False),
         (po.isGroup, True),
         (po.isFriendship, False),
         (po.isInteraction, False),
         (po.isPost, True),
         (po.socialProtocol, 'Cidade Democrática'),
         (po.dateObtained, datetime.date(2014, 3, 19)),
     )
     triples = [(snapshot, predicate, value)
                for predicate, value in description]
     P.add(triples, self.meta_graph)
Ejemplo n.º 27
0
 def translateImages(self):
     """Translate the rows of ``self.data["imagens"]`` into image triples.

     Each row yields a ``po.Image`` individual with its creation time and,
     when the corresponding field is truthy, its accountable user or
     topic, size, content type, file name, height, width and caption.
     ``po.updatedAt`` is only emitted when it differs from the creation
     time. Triples are flushed to ``self.translation_graph`` every 60
     images and once more at the end.
     """
     triples = []
     count = 0
     for imagem in self.data["imagens"]:
         iid = imagem[0]
         rid = imagem[1]
         rtype = imagem[2]
         size = imagem[3]
         ctype = imagem[4]
         fname = imagem[5]
         height = imagem[6]
         width = imagem[7]
         legenda = imagem[11]
         created = imagem[12]
         updated = imagem[13]
         uri = P.rdf.ic(po.Image,
                        self.snapshotid+"-"+str(iid),
                        self.translation_graph, self.snapshoturi)
         triples.append((uri, po.createdAt, created))
         # The owner of the image is either a participant or a topic.
         if rtype == "User":
             triples.append((uri, po.accountable,
                             po.Participant+"#"+self.snapshotid+'-'+str(rid)))
         elif rtype == "Topico":
             triples.append((uri, po.accountable,
                             po.Topic+"#"+self.snapshotid+'-'+str(rid)))
         if size:
             triples.append((uri, po.size, int(size)))
         if ctype:
             triples.append((uri, po.contentType, ctype))
         if fname:
             triples.append((uri, po.filename, fname))
         if height:
             triples.append((uri, po.height, int(height)))
         if width:
             triples.append((uri, po.width, int(width)))
         if legenda:
             triples.append((uri, po.caption, legenda))
         if updated != created:
             triples.append((uri, po.updatedAt, updated))
         count += 1
         if count % 60 == 0:
             c("finished image  entries:", count, "ntriples:", len(triples))
             P.add(triples, self.translation_graph)
             c("finished add of image entries")
             triples = []
     if triples:
         P.add(triples, self.translation_graph)
     # Previously logged "prizes entries", a copy-paste slip.
     c("finished add of image entries")
Ejemplo n.º 28
0
 def translateLoginHistory(self):
     """Translate ``self.data["historico_de_logins"]`` into login triples.

     Every row becomes a ``po.Login`` individual linked to its participant,
     with the creation time and IP. All triples are added to
     ``self.translation_graph`` in a single call after the loop.
     """
     collected = []
     for row in self.data["historico_de_logins"]:
         lid, uid, created, ip = row[0], row[1], row[2], row[3]
         login_uri = P.rdf.ic(po.Login,
                              self.snapshotid+"-"+str(lid),
                              self.translation_graph, self.snapshoturi)
         participant_uri = po.Participant+'#'+self.snapshotid+'-'+str(uid)
         collected.extend((
             (login_uri, po.participant, participant_uri),
             (login_uri, po.createdAt, created),
             (login_uri, po.ip, ip),
         ))
     P.add(collected, self.translation_graph)
     c("finished add of login entries")
Ejemplo n.º 29
0
 def translateMacrotags(self):
     """Translate ``self.data["macro_tags"]`` into macrotag triples.

     Every row becomes a ``po.Macrotag`` individual with its creation
     time; ``po.updatedAt`` is added only when it differs from the
     creation time, and ``po.title`` only when the title is truthy. All
     triples are added to ``self.translation_graph`` after the loop.
     """
     triples = []
     for mt in self.data["macro_tags"]:
         mtid = mt[0]
         title = mt[1]
         created = mt[2]
         updated = mt[3]
         uri = P.rdf.ic(po.Macrotag,
                        self.snapshotid+"-"+str(mtid),
                        self.translation_graph, self.snapshoturi)
         triples.append((uri, po.createdAt, created))
         if updated != created:
             triples.append((uri, po.updatedAt, updated))
         if title:
             triples.append((uri, po.title, title))
     P.add(triples, self.translation_graph)
     # Previously logged "microtag", which does not match this method's data.
     c("finished add of macrotag entries")
Ejemplo n.º 30
0
 def translateFriendships(self):
     """Translate the friendships table into ``po.Friendship`` triples.

     Every row yields one friendship individual (named after both profile
     ids, in row order) with its two members and creation time, plus the
     social circle when ``group`` is truthy. A ``po.knows`` triple from
     the first to the second participant is added when the reversed pair
     is not found in the table. Triples are flushed to
     ``self.translation_graph`` every 100 rows and once more at the end.
     """
     triples = []
     fids = self.friendships_table.getMany(("person_id", "friend_id"))
     # The former ``added_friendships`` list was dropped: it was appended
     # to and scanned on every row (quadratic) but never influenced which
     # triples were produced.
     # NOTE(review): if de-duplicating reversed pairs was the intent, that
     # was never implemented; a friendship stored in both directions still
     # yields two po.Friendship individuals.
     count = 0
     for person_id, friend_id, created_at, group in \
             self.friendships_table.getMany(
                 ('person_id', 'friend_id', 'created_at', 'group')):
         id0 = self.profileids[person_id]
         id1 = self.profileids[friend_id]
         friendshipuri = P.rdf.ic(po.Friendship,
                                  self.snapshotid+'-'+id0+'-'+id1,
                                  self.translation_graph, self.snapshoturi)
         participanturi0 = po.Participant+"#"+self.snapshotid+"-"+id0
         participanturi1 = po.Participant+"#"+self.snapshotid+"-"+id1
         assert isinstance(created_at, datetime.datetime)
         triples += [
                    (friendshipuri, po.member, participanturi0),
                    (friendshipuri, po.member, participanturi1),
                    (friendshipuri, po.createdAt, created_at),
                    ]
         # NOTE(review): this compares a list against the rows returned by
         # getMany; if those rows are tuples the test is always true.
         # Confirm the row type before relying on it.
         if [friend_id, person_id] not in fids:
             triples += [
                        (participanturi0, po.knows, participanturi1),
                        ]
         if group:
             triples += [
                        (friendshipuri, po.socialCircle, group),
                        ]
         count += 1
         if count % 100 == 0:
             c("done friendships:", count)
             c("ntriples:", len(triples))
             P.add(triples, self.translation_graph)
             c("finished add of friendships")
             triples = []
     if triples:
         c("ntriples:", len(triples))
         P.add(triples, self.translation_graph)
Ejemplo n.º 31
0
def ic(uriref, string, context=None, snapshoturi=None):
    """Create an individual of class ``uriref`` and return its URI.

    The URI is ``uriref`` + "#" + the percent-quoted ``string`` (no
    characters are treated as safe). A typing triple is added to
    ``context``, plus a ``po.snapshot`` link when ``snapshoturi`` is
    truthy.

    Returns:
        The newly built URI.
    """
    new_uri = uriref + "#" + urllib.parse.quote(string, safe="")
    # rfc3986.normalize_uri is also available if normalization is wanted.
    assert rfc3986.is_valid_uri(new_uri)
    statements = [(new_uri, a, uriref)]
    if snapshoturi:
        statements.append((new_uri, NS.po.snapshot, snapshoturi))
    # An earlier experiment appended to the caller's local ``triples`` via
    # frame inspection; the triples are always added directly instead.
    P.add(statements, context=context)
    return new_uri
Ejemplo n.º 32
0
 def translateCompetitions(self):
     """Translate ``self.data['competitions']`` into competition triples.

     Every row becomes a ``po.Competition`` individual with its short,
     long and author descriptions, creation and start times, title,
     regulations, awards and partners. ``po.updatedAt`` is only emitted
     when it differs from the creation time. Triples are flushed to
     ``self.translation_graph`` every 60 rows and once more at the end.
     """
     count = 0
     triples = []
     for competition in self.data['competitions']:
         coid = competition[0]
         sdesc = competition[1]
         created = competition[3]
         updated = competition[4]
         start = competition[5]
         title = competition[11]
         ldesc = competition[14]
         adesc = competition[15]
         reg = competition[16]
         aw = competition[17]
         part = competition[18]
         competitionuri = P.rdf.ic(po.Competition,
                                   self.snapshotid+"-"+str(coid),
                                   self.translation_graph, self.snapshoturi)
         triples += [
                 (competitionuri, po.shortDescription, sdesc),
                 (competitionuri, po.description, ldesc),
                 (competitionuri, po.authorDescription, adesc),
                 (competitionuri, po.createdAt, created),
                 (competitionuri, po.startAt, start),
                 (competitionuri, po.title, title),
                 (competitionuri, po.regulations, reg),
                 (competitionuri, po.awards, aw),
                 (competitionuri, po.partners, part),
         ]
         if updated != created:
             triples.append((competitionuri, po.updatedAt, updated))
         count += 1
         if count % 60 == 0:
             c("finished competition entries:", count, "ntriples:", len(triples))
             P.add(triples, self.translation_graph)
             c("finished add of competition entries")
             triples = []
     if triples:
         P.add(triples, self.translation_graph)
     # Fixed the garbled message "finisheg add of competitiok entries".
     c("finished add of competition entries")
Ejemplo n.º 33
0
    def translatePrizes(self):
        """Translate ``self.data["competition_prizes"]`` into prize triples.

        Every row becomes a ``po.Prize`` individual with its name,
        description, competition, offerer, topic and creation time.
        ``po.updatedAt`` is only emitted when it differs from the creation
        time. Triples are flushed to ``self.translation_graph`` every 60
        rows and once more at the end.
        """
        count = 0
        triples = []
        for prize in self.data["competition_prizes"]:
            pid = prize[0]
            name = prize[1]
            description = prize[2]
            competition_id = prize[3]
            offerer_id = prize[4]
            tid = prize[5]
            created = prize[6]
            updated = prize[7]
            prizeuri = P.rdf.ic(po.Prize,
                                self.snapshotid+"-"+str(pid),
                                self.translation_graph, self.snapshoturi)

            # The description triple used to be listed twice; it is now
            # emitted once per prize.
            triples += [
                    (prizeuri, po.name, name),
                    (prizeuri, po.description, description),
                    (prizeuri, po.competition,
                        po.Competition+"#"+self.snapshotid+'-'+str(competition_id)),
                    (prizeuri, po.offerer,
                        po.Participant+"#"+self.snapshotid+'-'+str(offerer_id)),
                    (prizeuri, po.topic,
                     po.Topic+"#"+self.snapshotid+'-'+str(tid)),
                    (prizeuri, po.createdAt, created)
            ]
            if updated != created:
                triples.append((prizeuri, po.updatedAt, updated))
            count += 1
            if count % 60 == 0:
                c("finished prizes entries:", count, "ntriples:", len(triples))
                P.add(triples, self.translation_graph)
                c("finished add of prizes entries")
                triples = []
        if triples:
            P.add(triples, self.translation_graph)
        c("finished add of prizes entries")
Ejemplo n.º 34
0
    def rdfTweets(self):
        """Triplify the pickled tweets chunk by chunk and write each chunk.

        Tweets come from ``self.pickle_filename1`` (read whole) and/or
        ``self.pickle_filename2`` (read in chunks of 10000). For every
        tweet the triples from ``self.tweetTriples`` are added to
        ``self.tweet_graph``; a retweet also contributes the triples of
        the retweeted status plus a ``po.retweetOf`` link. After each
        chunk ``self.writeTweets`` is called with the chunk index.
        ``self.ntriples`` accumulates the number of triples produced.
        """
        chunk_size = 10000  # upper bound on tweets read per chunk
        tweets = []
        if self.pickle_filename1:
            tweets.extend(readPickleTweetFile(
                self.data_path + self.pickle_filename1)[0])
        if self.pickle_filename2:
            tweets, fopen = readPickleTweetChunk(
                self.data_path + self.pickle_filename2, tweets, None,
                chunk_size)
        chunk_count = 0
        while tweets:
            c("rendering tweets, chunk:", chunk_count, "ntweets:", len(tweets),
              "snapshotid", self.snapshotid)
            for count, tweet in enumerate(tweets, 1):
                tweeturi, triples = self.tweetTriples(tweet)
                if "retweeted_status" in tweet.keys():
                    source_uri, source_triples = self.tweetTriples(
                        tweet['retweeted_status'])
                    triples.extend(source_triples)
                    triples.append((tweeturi, po.retweetOf, source_uri))
                self.ntriples += len(triples)
                P.add(triples, context=self.tweet_graph)
                if count % 1000 == 0:
                    c("triplified", count, "tweets")
            c("end of chunk:", chunk_count, "ntriples:", self.ntriples)
            self.writeTweets(chunk_count)
            c("chunk has been written")
            chunk_count += 1
            if not self.pickle_filename2:
                # Nothing left to stream: a single pass was enough.
                tweets = []
            else:
                # Continue reading from the already opened chunk source.
                tweets, fopen = readPickleTweetChunk(
                    None, [], fopen, chunk_size)
Ejemplo n.º 35
0
 def translateInspirations(self):
     """Translate ``self.data["inspirations"]`` into inspiration triples.

     Every row becomes a ``po.Inspiration`` individual linked to its
     competition and participant, with description, creation time, title
     and image file name. ``po.updatedAt`` is only emitted when it differs
     from the creation time. Triples are flushed to
     ``self.translation_graph`` every 60 rows and once more at the end.
     """
     batch = []
     done = 0
     for row in self.data["inspirations"]:
         iid, cid, desc, created = row[0], row[1], row[2], row[3]
         updated, image, uid, title = row[4], row[5], row[6], row[7]
         uri = P.rdf.ic(po.Inspiration,
                        self.snapshotid+"-"+str(iid),
                        self.translation_graph, self.snapshoturi)
         competition_uri = po.Competition+'#'+self.snapshotid+'-'+str(cid)
         participant_uri = po.Participant+'#'+self.snapshotid+'-'+str(uid)
         batch.extend((
             (uri, po.competition, competition_uri),
             (uri, po.description, desc),
             (uri, po.createdAt, created),
             (uri, po.participant, participant_uri),
             (uri, po.title, title),
             (uri, po.filename, image),
         ))
         if updated != created:
             batch.append((uri, po.updatedAt, updated))
         done += 1
         if done % 60 == 0:
             c("finished inspiration entries:", done, "ntriples:", len(batch))
             P.add(batch, self.translation_graph)
             c("finished add of inspiration entries")
             batch = []
     if batch:
         P.add(batch, self.translation_graph)
     c("finished add of inspiration entries")
Ejemplo n.º 36
0
def rdfsInferenceIterate(data_context=None,ontology_context=None,inferred_context=None):
    """Materialize RDFS-style entailments into ``inferred_context``.

    Walks the ontology context for ``rdfs:subClassOf``,
    ``rdfs:subPropertyOf``, ``rdfs:domain`` and ``rdfs:range`` statements
    and, for each one, scans the data context for matching triples and adds
    the derived triple to the inferred context. Each rule is applied in a
    single pass over the data context.

    Missing contexts are only logged; the function continues regardless.
    """
    # NOTE(review): identifiers are lower-cased here but the arguments are
    # compared as given, so a context name containing upper-case characters
    # is always reported as missing.
    contexts=[i.identifier.lower() for i in P.context()]
    if data_context not in contexts:
        c("no data context")
    if ontology_context not in contexts:
        c("no ontology context")
    if inferred_context not in contexts:
        c("inferred context to be created context:",inferred_context)
    # Subclass rule: for every (subject subClassOf object_) in the ontology...
    for subject, foo, object_ in P.percolation_graph.triples(\
            (None,NS.rdfs.subClassOf,None),context=ontology_context):
        # ...each individual typed as the subclass is also typed as the superclass,
        for individual, footype, foosubject in P.percolation_graph.triples(\
                (None,a,subject),context=data_context):
            P.add((individual,a,object_),context=inferred_context)
        # ...and every triple whose object is the subclass is repeated with
        # the superclass as object (this rebinding of `subject` keeps the
        # same value, since the pattern fixes the object position).
        for foosubject, fooproperty, subject in P.percolation_graph.triples(\
                (None,None,subject),context=data_context):
            P.add((foosubject,fooproperty,object_),context=inferred_context)

    c("finished subclass reasoning")
    # Subproperty rule: statements using the subproperty are repeated with
    # the superproperty as predicate.
    for subject, foo, object_ in P.percolation_graph.triples(\
            (None,NS.rdfs.subPropertyOf,None),context=ontology_context):
        c(subject,foo,object_)
        for subject2,propertyfoo,object2 in P.percolation_graph.triples(\
                (None,subject,None),context=data_context):
            c(subject2,propertyfoo,object2)
            P.add((subject2,object_,object2),context=inferred_context)
    c("finished subproperty reasoning")
    # Domain rule: the subject of any statement using the property is typed
    # with the property's domain.
    for subject, foo, object_ in P.percolation_graph.triples(\
            (None,NS.rdfs.domain,None),context=ontology_context):
        for subject2,predicatefoo,objectfoo in P.percolation_graph.triples(\
                (None,subject,None),context=data_context):
            P.add((subject2,a,object_),context=inferred_context)
    c("finished domain reasoning")
    # Range rule: the object of any statement using the property is typed
    # with the property's range.
    for subject, foo, object_ in P.percolation_graph.triples(\
            (None,NS.rdfs.range,None),context=ontology_context):
        for subjectfoo,predicatefoo,object2 in P.percolation_graph.triples(\
                (None,subject,None),context=data_context):
                P.add((object2,a,object_),context=inferred_context)
    c("finished range reasoning")
Ejemplo n.º 37
0
    def translateTags(self):
        """Translate ``self.data["tags"]`` into tag triples.

        Every row becomes a ``po.Tag`` individual with its text and
        relevance. Triples are flushed to ``self.translation_graph`` every
        160 rows and once more at the end.
        """
        batch = []
        done = 0
        for row in self.data["tags"]:
            tid, text, relevance = row[0], row[1], row[2]
            uri = P.rdf.ic(po.Tag,
                           self.snapshotid+"-"+str(tid),
                           self.translation_graph, self.snapshoturi)
            batch.append((uri, po.text, text))
            batch.append((uri, po.relevance, relevance))
            done += 1
            if done % 160 == 0:
                c("finished tag  entries:", done, "ntriples:", len(batch))
                P.add(batch, self.translation_graph)
                c("finished add of tag  entries")
                batch = []
        if batch:
            P.add(batch, self.translation_graph)
        c("finished add of tag entries")
Ejemplo n.º 38
0
    def translateTaggings(self):
        """Translate ``self.data["taggings"]`` into tagging triples.

        Every row becomes a ``po.Tagging`` individual linked to its tag,
        its tagger and its creation time. The tagged resource is a
        ``po.Topic`` when the row's type is ``"Topico"`` and a
        ``po.Macrotag`` otherwise. Triples are flushed to
        ``self.translation_graph`` every 160 rows and once more at the end.
        """
        count = 0
        triples = []
        for tagging in self.data["taggings"]:
            tid_ = tagging[0]
            tid = tagging[1]
            toid = tagging[2]
            uid = tagging[3]
            ttype = tagging[5]
            created = tagging[7]

            # A hand-built URI used to be assigned here and immediately
            # overwritten by the P.rdf.ic result; that dead store is gone.
            uri = P.rdf.ic(po.Tagging,
                           self.snapshotid+"-"+str(tid_),
                           self.translation_graph, self.snapshoturi)
            triples += [
                (uri, po.tag, po.Tag+"#"+self.snapshotid+'-'+str(tid)),
                (uri, po.tagger, po.Participant+"#"+self.snapshotid+'-'+str(uid)),
                (uri, po.createdAt, created)
            ]
            if ttype == "Topico":
                # tagging -> topic
                triples.append((uri, po.tagged,
                                po.Topic+'#'+self.snapshotid+'-'+str(toid)))
            else:
                triples.append((uri, po.tagged,
                                po.Macrotag+"#"+self.snapshotid+'-'+str(toid)))
            count += 1
            if count % 160 == 0:
                c("finished tagging  entries:", count, "ntriples:", len(triples))
                P.add(triples, self.translation_graph)
                c("finished add of tagging  entries")
                triples = []
        if triples:
            P.add(triples, self.translation_graph)
        c("finished add of tagging entries")
Ejemplo n.º 39
0
    def makeMetadata(self):
        """Populate the meta graph for a Facebook friendship snapshot.

        Copies the snapshot's triples (and those of its raw files) from the
        "social_facebook" context, records the friendship file names, online
        locations and counts, registers the participant attributes through
        ``P.rdf.triplesScaffolding``, builds the textual description in
        ``self.desc`` and finally adds provenance triples (time, origin,
        meta file names, platform, comment).

        Sets ``self.ffile``, ``self.frdf``, ``self.fttl``, ``self.mrdf``,
        ``self.mttl`` and ``self.desc``.
        """
        source_context="social_facebook"
        copied=P.get(self.snapshoturi,None,None,source_context)
        raw_files=P.get(self.snapshoturi,po.rawFile,None,source_context,strict=True,minimized=True)
        for raw_file in raw_files:
            copied+=P.get(raw_file,None,None,source_context)
        P.add(copied,context=self.meta_graph)

        # Friendship files: original data plus its two RDF renderings.
        self.ffile="base/"+self.filename_friendships
        self.frdf=self.snapshotid+"Friendship.rdf"
        self.fttl=self.snapshotid+"Friendship.ttl"
        friendship_facts=[
            (po.onlineOriginalFriendshipFile,self.online_prefix+self.ffile),
            (po.originalFriendshipFileName,self.ffile),
            (po.onlineFriendshipXMLFile,self.online_prefix+self.frdf),
            (po.onlineFriendshipTTLFile,self.online_prefix+self.fttl),
            (po.friendshipXMLFileName,self.frdf),
            (po.friendshipTTLFileName,self.fttl),
            (po.nFriends,self.nfriends),
            (po.nFriendships,self.nfriendships),
            (po.friendshipsAnonymized,self.friendships_anonymized),
        ]
        P.add([(self.snapshoturi,predicate,value) for predicate,value in friendship_facts],
              context=self.meta_graph)
        attribute_predicates=[po.frienshipParticipantAttribute]*len(self.friendsvars)
        P.rdf.triplesScaffolding(self.snapshoturi,attribute_predicates,
                                 self.friendsvars,context=self.meta_graph)

        self.mrdf=self.snapshotid+"Meta.rdf"
        self.mttl=self.snapshotid+"Meta.ttl"

        # Human-readable summary, assembled piece by piece.
        desc_parts=[
            "facebook network with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
                self.snapshotid,self.snapshoturi,self.isego,self.isgroup),
            "\nisFriendship: {}".format(self.isfriendship),
            "; nFriends: {}; nFrienships: {}.".format(self.nfriends,self.nfriendships),
            "\nisInteraction: {}".format(self.isinteraction),
            "\nisPost: {} (alias hasText: {})".format(self.hastext,self.hastext),
        ]
        self.desc="".join(desc_parts)
        provenance=[
            (po.triplifiedIn,datetime.datetime.now()),
            (po.triplifiedBy,"scripts/"),
            (po.donatedBy,self.snapshotid[:-4]),
            (po.availableAt,self.online_prefix),
            (po.onlineMetaXMLFile,self.online_prefix+self.mrdf),
            (po.onlineMetaTTLFile,self.online_prefix+self.mttl),
            (po.metaXMLFileName,self.mrdf),
            (po.metaTTLFileName,self.mttl),
            (po.acquiredThrough,"Netvizz"),
            (po.socialProtocolTag,"Facebook"),
            # Creating the platform individual also adds its own triples.
            (po.socialProtocol,P.rdf.ic(po.Platform,"Facebook",self.meta_graph,self.snapshoturi)),
            (NS.rdfs.comment,self.desc),
        ]
        P.add([(self.snapshoturi,predicate,value) for predicate,value in provenance],
              self.meta_graph)
Ejemplo n.º 40
0
    def makeMetadata(self):
        """Populate the meta graph for an IRC log snapshot.

        Sets ``self.mrdf`` / ``self.mttl`` (meta file names) and
        ``self.desc`` (a textual summary), then adds the snapshot typing,
        id, structural flags, channel name, acquisition method, protocol
        and the summary to ``self.meta_graph``.

        NOTE: the original carried large commented-out sections (message /
        token / sentence statistics, participant-attribute and log-file
        scaffolding, online file locations, triple counts). They were
        inactive and are summarized here instead of being kept verbatim.
        """
        self.mrdf = self.snapshotid + "Meta.rdf"
        self.mttl = self.snapshotid + "Meta.ttl"

        summary = [
            "irc dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
                self.snapshotid,
                self.snapshoturi,
                self.isego,
                self.isgroup,
            ),
            "\nisFriendship: {}; ".format(self.isfriendship),
            "isInteraction: {}.".format(self.isinteraction),
            "\nisPost: {} (alias hasText: {})".format(
                self.hastext, self.hastext),
        ]
        self.desc = "".join(summary)

        # The channel name is the snapshot id without its legacy prefix.
        channel = '#' + self.snapshotid.replace('irc-legacy-', '')
        statements = [
            (po.triplifiedIn, datetime.datetime.now()),
            (a, po.Snapshot),
            (po.snapshotID, self.snapshotid),
            (po.isEgo, False),
            (po.isGroup, True),
            (po.isFriendship, False),
            (po.isInteraction, True),
            (po.isPost, True),
            (po.channel, channel),
            (po.acquiredThrough, "channel text log"),
            (po.socialProtocol, "IRC"),
            (po.comment, self.desc),
        ]
        P.add([(self.snapshoturi, predicate, value)
               for predicate, value in statements], self.meta_graph)
Ejemplo n.º 41
0
 def rdfFriendshipNetwork(self, fnet):
     """Triplify the participants and friendships found in ``fnet``.

     ``fnet["individuals"]`` is read as a mapping from attribute name to a
     column of per-participant values; it must contain the ``"name"`` and
     ``"label"`` columns and may contain ``"groupid"``.
     ``fnet["relations"]`` is read as a mapping with the parallel columns
     ``"node1"`` and ``"node2"``.

     Sets ``self.friendships_anonymized``, ``self.groupid``,
     ``self.friendsvars``, ``self.nfriends`` and ``self.nfriendships``,
     advances ``self.observation_count`` for anonymized networks, and adds
     the resulting triples to ``self.friendship_graph``.

     Raises:
         ValueError: if the network is treated as anonymized but a
             non-empty participant label lacks the ``"user"`` marker.
     """
     individuals = fnet["individuals"]
     # Fake names and local ids: the network counts as anonymized only when
     # every label carries the "user" marker.
     self.friendships_anonymized = all(
         "user" in label for label in individuals["label"])
     tkeys = list(individuals.keys())
     if "groupid" in tkeys:
         self.groupid = individuals["groupid"][0]
         tkeys.remove("groupid")
     else:
         self.groupid = None
     if self.friendships_anonymized:
         self.friendsvars = [trans[key] for key in tkeys
                             if key not in ('label', 'name')]
     else:
         self.friendsvars = [trans[key] for key in tkeys]
     # Values for each participant are in the same order as insert['uris'].
     # getattr() replaces the former eval("po." + name): same lookup, without
     # executing data-derived text as code.
     insert = {
         "uris": [getattr(po, trans[key]) for key in tkeys],
         "vals": [individuals[key] for key in tkeys],
     }
     self.nfriends = len(insert["vals"][0])
     iname = tkeys.index("name")
     ilabel = tkeys.index("label")
     for vals_ in zip(*insert["vals"]):
         if self.friendships_anonymized:
             if vals_[ilabel] and ("user" not in vals_[ilabel]):
                 raise ValueError(
                     "Anonymized networks should have no informative name. "
                     "Found: " + vals_[ilabel])
             # Was `self.self.snapshotid`, which raised AttributeError for
             # every anonymized network.  The name must match the
             # "<snapshotid>-<uid>" URIs rebuilt for friendships below.
             name_ = "{}-{}".format(self.snapshotid, vals_[iname])
             # Name and label are withheld from anonymized observations.
             insert_uris_ = [el for i, el in enumerate(insert['uris'])
                             if i not in (ilabel, iname)]
             vals_ = [el for i, el in enumerate(vals_)
                      if i not in (ilabel, iname)]
             obsname = '{}-{}'.format(self.snapshotid, self.observation_count)
             self.observation_count += 1
         else:
             name_ = "{}-{}".format(self.provenance_prefix, vals_[iname])
             insert_uris_ = list(insert['uris'])
             vals_ = list(vals_)
             obsname = '{}-{}'.format(self.snapshotid, vals_[iname])
         ind = P.rdf.ic(po.Participant, name_, self.friendship_graph,
                        self.snapshoturi)
         obs = P.rdf.ic(po.Observation, obsname, self.friendship_graph,
                        self.snapshoturi)
         P.add([(ind, po.observation, obs)], self.friendship_graph)
         P.rdf.triplesScaffolding(obs, insert_uris_, vals_,
                                  self.friendship_graph)
     c("participants written")
     friendships_ = [fnet["relations"][col] for col in ("node1", "node2")]
     # Anonymized participants are keyed by snapshot id, the others by
     # provenance prefix; friendships must use the same key.
     if self.friendships_anonymized:
         prefix = self.snapshotid
     else:
         prefix = self.provenance_prefix
     for count, (uid1, uid2) in enumerate(zip(*friendships_), 1):
         # The label uses the sorted pair so it does not depend on the
         # order in which the two endpoints are listed.
         uids_ = sorted([uid1, uid2])
         flabel = "{}-{}-{}".format(prefix, *uids_)
         uids = [r.URIRef(po.Participant + "#{}-{}".format(prefix, uid))
                 for uid in (uid1, uid2)]
         friendship_uri = P.rdf.ic(po.Friendship, flabel,
                                   self.friendship_graph, self.snapshoturi)
         P.rdf.triplesScaffolding(friendship_uri, [po.member]*2,
                                  uids, self.friendship_graph)
         if (count % 1000) == 0:
             c("friendships", count)
     self.nfriendships = len(friendships_[0])
     c("friendships written")
Ejemplo n.º 42
0
def probeOntology(endpoint_url, graph_urns, final_dir, one_datatype=True):
    """Probe a SPARQL endpoint for its ontological structure and draw it.

    Classes, class restrictions (existential and universal), functional
    properties and property domains/ranges are inferred from the instance
    data.  The inferred triples are put in the percolation ``'ontology'``
    context and serialized to ``ontology.owl`` and ``ontology.ttl``; a
    Graphviz diagram is written as ``draw.png``, ``draw_circo.png`` (best
    effort), ``draw_twopi.png`` and ``draw.dot``.  All files go to
    ``final_dir``, which is created when missing.

    Args:
        endpoint_url: URL handed to ``P.rdf.sparql.classes.LegacyClient``.
        graph_urns: iterable of graph URNs; each becomes a ``FROM <urn>``
            clause in the queries issued through the inner ``mkQuery``.
        final_dir: output directory.
        one_datatype: if true, all diagram edges pointing to the same
            datatype share one node; otherwise each such edge gets a node
            of its own.

    Returns:
        ``locals()`` of this function, i.e. a dict holding every local
        variable.  Local names are therefore part of the return value and
        must not be renamed.
    """
    if not os.path.isdir(final_dir):
        os.makedirs(final_dir)

    client = P.rdf.sparql.classes.LegacyClient(endpoint_url)
    from_ = ''
    for graph_urn in graph_urns:
        from_ += '\nFROM <%s>' % (graph_urn, )

    def mkQuery(query, plain=True):
        """Run ``query`` with the FROM clauses spliced in before WHERE.

        Returns ``pl(result)`` when ``plain`` is true, otherwise the raw
        list of bindings.
        """
        query_ = query.split('WHERE')
        query__ = (query_[0], from_, '\nWHERE ' + query_[1])
        query___ = ''.join(query__)
        result = client.retrieveQuery(query___)
        if plain:
            return pl(result)
        else:
            return result['results']['bindings']

    c('find all classes')
    q = "SELECT DISTINCT ?class WHERE { ?s a ?class . }"
    classes = mkQuery(q)

    c('antecedents, consequents and restrictions of each class')
    neighbors = {}
    triples = []
    existential_restrictions = {}
    universal_restrictions = {}
    for aclass in classes:
        q = "SELECT DISTINCT ?cs ?p WHERE { ?i a <%s> . ?s ?p ?i . OPTIONAL { ?s a ?cs . } }" % (
            aclass, )
        antecedent_property = mkQuery(q)
        q = "SELECT DISTINCT ?ap ?co (datatype(?o) as ?do) WHERE { ?i a <%s> . ?i ?ap ?o . OPTIONAL { ?o a ?co . } }" % (
            aclass, )
        consequent_property__ = mkQuery(q, 0)
        # One binding may carry a datatype (?do), a class (?co) or both;
        # each present value yields its own [property, target] pair.
        consequent_property = [[i['ap']['value'], i['do']['value']]
                               for i in consequent_property__ if 'do' in i]
        consequent_property_ = [[i['ap']['value'], i['co']['value']]
                                for i in consequent_property__ if 'co' in i]
        neighbors[aclass] = (antecedent_property,
                             consequent_property + consequent_property_)

        # class restrictions
        q = "SELECT DISTINCT ?p WHERE {?s a <%s>. ?s ?p ?o .}" % (aclass, )
        props_c = mkQuery(q)
        # NOTE(review): the COUNT queries below go straight to the client,
        # bypassing mkQuery, so they are not limited by the FROM clauses --
        # confirm this is intended.
        q = "SELECT (COUNT(DISTINCT ?s) as ?cs) WHERE {?s a <%s>}" % (aclass, )
        ninds = pl(client.retrieveQuery(q))[0]
        for pc in props_c:
            if '22-rdf-syntax' in pc:
                continue  # skip properties from the RDF syntax namespace
            q = "SELECT DISTINCT ?co (datatype(?o) as ?do) WHERE {?s a <%s>. ?s <%s> ?o . OPTIONAL {?o a ?co . }}" % (
                aclass, pc)
            inds2 = mkQuery(q, 0)
            objs = {i["co"]["value"] for i in inds2 if "co" in i.keys()}
            vals = {i["do"]["value"] for i in inds2 if "do" in i.keys()}
            q = "SELECT (COUNT(DISTINCT ?s) as ?cs) WHERE {?s a <%s>. ?s <%s> ?o . }" % (
                aclass, pc)
            ninds2 = pl(client.retrieveQuery(q))[0]
            # Existential: every individual of the class uses the property.
            if ninds == ninds2:
                # Prefer a datatype over a class as the restriction filler.
                if len(vals):
                    ob = list(vals)[0]
                elif len(objs):
                    ob = list(objs)[0]
                else:
                    ob = 0
                if ob:
                    B = r.BNode()
                    triples += [(aclass, rdfs.subClassOf, B),
                                (B, a, owl.Restriction),
                                (B, owl.onProperty, pc),
                                (B, owl.someValuesFrom, ob)]
                    existential_restrictions.setdefault(aclass, []).append(
                        (pc, ob))
            q = "SELECT (COUNT(DISTINCT ?s) as ?cs) WHERE { ?s <%s> ?o . ?s a ?ca . FILTER(str(?ca) != '%s') }" % (
                pc, aclass)
            ninds3 = pl(client.retrieveQuery(q))[0]
            # Universal: no subject typed with another class uses the
            # property.
            if ninds3 == 0:
                if len(vals):
                    ob = list(vals)[0]
                elif len(objs):
                    ob = list(objs)[0]
                else:
                    ob = 0
                if ob:
                    B = r.BNode()
                    triples += [(aclass, rdfs.subClassOf, B),
                                (B, a, owl.Restriction),
                                (B, owl.onProperty, pc),
                                (B, owl.allValuesFrom, ob)]
                    universal_restrictions.setdefault(aclass, []).append(
                        (pc, ob))
    # Drop per-class scratch names so they do not linger in the returned
    # locals(); `q` and `aclass` are rebound further down.
    del q, aclass, antecedent_property, consequent_property
    c('find properties')
    q = "SELECT DISTINCT ?p WHERE {?s ?p ?o}"
    properties = mkQuery(q)

    c('check if property is functional and get range and domain')
    functional_properties = set()
    for prop in properties:
        # check if property is functional
        q = 'SELECT DISTINCT (COUNT(?o) as ?co) WHERE { ?s <%s> ?o } GROUP BY ?s' % (
            prop, )
        is_functional = mkQuery(q)
        if len(is_functional) == 1 and is_functional[0] == 1:
            triples.append((prop, a, owl.FunctionalProperty))
            functional_properties.add(prop)

        # datatype or object properties
        suj = mkQuery("SELECT DISTINCT ?cs WHERE { ?s <%s> ?o . ?s a ?cs . }" %
                      (prop, ))
        obj1 = mkQuery(
            "SELECT DISTINCT ?co WHERE { ?s <%s> ?o . ?o a ?co . }" % (prop, ))
        obj2 = mkQuery(
            "SELECT DISTINCT (datatype(?o) as ?do) WHERE { ?s <%s> ?o . }" %
            (prop, ))
        obj = obj1 + obj2
        # NOTE(review): the OWL RDF vocabulary names this class
        # owl:DatatypeProperty -- confirm owl.DataProperty is intended.
        if len(obj) and ("XMLS" in obj[0]):
            triples.append((prop, a, owl.DataProperty))
        else:
            triples.append((prop, a, owl.ObjectProperty))
        if len(suj) > 1:
            B = r.BNode()
            triples.append((prop, rdfs.domain, B))
            for ss in suj:
                triples.append((B, owl.unionOf, ss))
        elif suj:
            triples.append((prop, rdfs.domain, suj[0]))
        if len(obj) > 1:
            B = r.BNode()
            triples.append((prop, rdfs.range, B))
            # The range union lists the object classes/datatypes.  This
            # loop used to iterate `suj`, copying the domain into the range.
            for ss in obj:
                triples.append((B, owl.unionOf, ss))
        elif obj:
            triples.append((prop, rdfs.range, obj[0]))

    # Drawing
    c('started drawing')
    A = gv.AGraph(directed=True, strict=False)
    q = """PREFIX po: <http://purl.org/socialparticipation/po/>
    SELECT DISTINCT ?snap WHERE { { ?i po:snapshot ?snap } UNION { ?snap po:snapshotID ?idfoo } }"""
    snap = mkQuery(q)[0]
    q = """PREFIX po: <http://purl.org/socialparticipation/po/>
    SELECT ?provenance
    WHERE { <%s> po:socialProtocol ?provenance }""" % (snap)
    provenance = pl(client.retrieveQuery(q))[0]
    edge_counter = 1
    node_counter = 1
    data_nodes = {}
    for aclass in classes:
        aclass_ = aclass.split('/')[-1]
        if aclass_ not in A.nodes():
            A.add_node(aclass_, style="filled")
            n = A.get_node(aclass_)
            n.attr['color'] = "#A2F3D1"
        neigh = neighbors[aclass]
        for i in range(len(neigh[1])):  # consequents
            label = neigh[1][i][1].split("/")[-1]
            elabel = neigh[1][i][0]
            elabel_ = elabel.split('/')[-1]
            if "XMLS" in label:
                # Datatype target: node ids are counters, so several nodes
                # may display the same datatype name.
                color = "#FFE4AA"
                if one_datatype:
                    if label in data_nodes:
                        label_ = data_nodes[label]
                    else:
                        label_ = node_counter
                        node_counter += 1
                        data_nodes[label] = label_
                else:
                    label_ = node_counter
                    node_counter += 1
            else:
                label_ = label
                color = "#A2F3D1"
            if label_ not in A.nodes():
                A.add_node(label_, style="filled")
                n = A.get_node(label_)
                n.attr['label'] = label.split("#")[-1]
                n.attr['color'] = color
            ekey = '{}-{}-{}'.format(aclass_, label_, edge_counter)
            edge_counter += 1
            A.add_edge(aclass_, label_, ekey)
            e = A.get_edge(aclass_, label_, key=ekey)
            e.attr["label"] = elabel_
            e.attr["color"] = color
            e.attr["penwidth"] = 2
            # Dashed edge: property not found to be functional.
            if r.URIRef(elabel) not in functional_properties:
                e.attr["style"] = "dashed"
            # Green edge: existential restriction on this class.
            if aclass in existential_restrictions.keys():
                restrictions = existential_restrictions[aclass]
                prop = [iii[0] for iii in restrictions]
                if r.URIRef(elabel) in prop:
                    e.attr["color"] = "#A0E0A0"
            # Inverted arrowhead: universal restriction on this class.
            if aclass in universal_restrictions.keys():
                restrictions = universal_restrictions[aclass]
                prop = [iii[0] for iii in restrictions]
                if r.URIRef(elabel) in prop:
                    e.attr["arrowhead"] = "inv"
                    e.attr["arrowsize"] = 2.

    A.draw(os.path.join(final_dir, "draw.png"), prog="dot")
    try:
        A.draw(os.path.join(final_dir, "draw_circo.png"), prog="circo")
    except Exception as exc:
        # The circo rendering is best effort: report the failure and go on,
        # but let KeyboardInterrupt/SystemExit propagate.
        c("circo drawing failed, skipped:", exc)
    A.draw(os.path.join(final_dir, "draw_twopi.png"),
           prog="twopi",
           args="-Granksep=4")
    A.write(os.path.join(final_dir, "draw.dot"))
    P.start(False)
    P.context('ontology', 'remove')
    P.add(triples, 'ontology')
    g = P.context('ontology')
    g.serialize(os.path.join(final_dir, 'ontology.owl'))
    g.serialize(os.path.join(final_dir, 'ontology.ttl'), 'turtle')
    return locals()
Ejemplo n.º 43
0
def parseLegacyFiles(data_dir=DATADIR + "twitter/"):
    """Register the legacy Twitter pickle files of ``data_dir`` as snapshots.

    Every directory entry except ``ipython_log.py`` and ``*.swp`` files is
    taken as one pickle file and described by snapshot triples.  The
    ``social_twitter`` context is emptied and then filled with those
    triples plus session counters and the platform's data directory; a
    summary is reported through ``c``.

    Returns:
        The set of snapshot URIs that were registered.
    """
    filenames = [
        entry for entry in os.listdir(data_dir)
        if not (entry == "ipython_log.py" or entry.endswith(".swp"))
    ]
    snapshots = set()
    triples = []
    for filename in filenames:
        flat_name = filename.replace("_", "")
        snapshotid = "twitter-legacy-" + flat_name.replace('tw.pickle', '')
        snapshoturi = po.Snapshot + "#" + snapshotid
        # The next five values only feed triples that are currently
        # disabled (humanizedName, expressedReference, fileSize, fileFormat,
        # expressedClass); they are still computed as before.
        expressed_classes = [po.Participant, po.Tweet]
        expressed_reference = flat_name.replace(".pickle", "")
        name_humanized = "Twitter " + expressed_reference
        filesize = os.path.getsize(data_dir + filename) / 10**6
        fileformat = "pickle"
        fileuri = po.File + "#twitter-file-" + filename
        triples.extend([
            (snapshoturi, a, po.Snapshot),
            (snapshoturi, po.snapshotID, snapshotid),
            (snapshoturi, po.isEgo, False),
            (snapshoturi, po.isGroup, True),
            (snapshoturi, po.isFriendship, False),
            (snapshoturi, po.isInteraction, True),
            (snapshoturi, po.isPost, True),
            (snapshoturi, po.rawFile, fileuri),
            (fileuri, po.fileName, filename),
        ])
        snapshots.add(snapshoturi)

    nfiles = len(filenames)
    nsnapshots = len(snapshots)
    # Start from an empty context before writing the fresh description.
    P.context("social_twitter", "remove")
    platformuri = P.rdf.ic(po.Platform, "Twitter", context="social_twitter")
    triples.extend([
        (NS.social.Session, NS.social.nTwitterParsedFiles, nfiles),
        (NS.social.Session, NS.social.nTwitterSnapshots, nsnapshots),
        (platformuri, po.dataDir, data_dir),
    ])
    P.add(triples, context="social_twitter")

    parsed_msg = ("parsed {} twitter files ({} snapshots) are in percolation graph "
                  "      and 'social_twitter' context")
    c(parsed_msg.format(nfiles, nsnapshots))
    size_msg = "percolation graph have {} triples ({} in social_twitter context)"
    c(size_msg.format(len(P.percolation_graph),
                      len(P.context("social_twitter"))))

    # Per-flag snapshot counts read back from the context just written.
    negos = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE         { GRAPH <social_twitter> { ?s po:isEgo true         } } "
    )
    ngroups = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE       { GRAPH <social_twitter> { ?s po:isGroup true       } } "
    )
    nfriendships = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE  { GRAPH <social_twitter> { ?s po:isFriendship true  } } "
    )
    ninteractions = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_twitter> { ?s po:isInteraction true } } "
    )
    nposts = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE        { GRAPH <social_twitter> { ?s po:isPost true        } } "
    )
    sizes = P.query(
        r" SELECT ?size WHERE              { GRAPH <social_twitter> { ?s po:fileSize ?size     } } "
    )
    totalsize = sum(sizes)
    summary = """{} are ego snapshots, {} are group snapshots
{} have a friendship network. {} have an interaction network. \
      {} have post texts and reaction counts. Total raw data size is {:.2f}MB"""
    c(summary.format(negos, ngroups, nfriendships, ninteractions, nposts,
                     totalsize))
    return snapshots
Ejemplo n.º 44
0
    def makeMetadata(self):
        """Write the snapshot's metadata triples to ``self.meta_graph``.

        Copies every triple about the snapshot and its raw files from the
        ``social_facebook`` context, then adds file names, counts and flags
        for the friendship, interaction and post parts that are present,
        plus provenance data and a human-readable description.

        Sets the file-name attributes (``ffile``/``frdf``/``fttl``,
        ``ifile``/``irdf``/``ittl``, ``pfile``/``prdf``/``pttl``) of the
        parts that are present, and always ``isego``, ``isgroup``,
        ``mrdf``, ``mttl`` and ``desc``.

        Raises:
            ValueError: if a friendship network is present and
                ``self.groupid`` and ``self.groupid2`` are both set but
                differ.
        """
        if self.isfriendship and self.groupid and self.groupid2 and (
                self.groupid != self.groupid2):
            raise ValueError("Group IDS are different")
        # Put all triples about the snapshot and its raw files from
        # social_facebook into self.meta_graph.
        triples = P.get(self.snapshoturi, None, None, "social_facebook")
        for rawfile in P.get(self.snapshoturi,
                             po.rawFile,
                             None,
                             "social_facebook",
                             strict=True,
                             minimized=True):
            triples += P.get(rawfile, None, None, "social_facebook")
        P.add(triples, context=self.meta_graph)

        # (predicate, value) pairs.  Keeping each predicate next to its
        # value prevents the two from drifting apart, as had happened in
        # the interaction section.
        pairs = []
        if self.isfriendship:
            self.ffile = "base/" + self.filename_friendships
            self.frdf = self.snapshotid + "Friendship.rdf"
            self.fttl = self.snapshotid + "Friendship.ttl"
            pairs += [
                (po.onlineOriginalFriendshipFile,
                 self.online_prefix + self.ffile),
                (po.originalFriendshipFileName, self.ffile),
                (po.onlineFriendshipXMLFile, self.online_prefix + self.frdf),
                (po.onlineFriendshipTTLFile, self.online_prefix + self.fttl),
                (po.friendshipXMLFileName, self.frdf),
                (po.friendshipTTLFileName, self.fttl),
                (po.nFriends, self.nfriends),
                (po.nFriendships, self.nfriendships),
                (po.friendshipsAnonymized, self.friendships_anonymized),
            ]
            # NOTE(review): "frienship" (sic) is the predicate name used so
            # far; confirm against the ontology before correcting it.
            pairs += [(po.frienshipParticipantAttribute, var)
                      for var in self.friendsvars]

        if self.isinteraction:
            self.ifile = "base/" + self.filename_interactions
            self.irdf = self.snapshotid + "Interaction.rdf"
            self.ittl = self.snapshotid + "Interaction.ttl"
            # The online/local file values and the two counts used to be
            # listed in the opposite order of their predicates.
            pairs += [
                (po.onlineOriginalInteractionFile,
                 self.online_prefix + self.ifile),
                (po.originalInteractionFileName, self.ifile),
                (po.onlineInteractionXMLFile, self.online_prefix + self.irdf),
                (po.onlineInteractionTTLFile, self.online_prefix + self.ittl),
                (po.interactionXMLFileName, self.irdf),
                (po.interactionTTLFileName, self.ittl),
                (po.nInteracted, self.ninteracted),
                (po.nInteractions, self.ninteractions),
                (po.interactionsAnonymized, self.interactions_anonymized),
            ]
            pairs += [(po.interactionParticipantAttribute, var)
                      for var in self.interactionsvars]

        if self.hastext:
            self.pfile = "base/" + self.filename_posts
            self.prdf = self.snapshotid + "Post.rdf"
            self.pttl = self.snapshotid + "Post.ttl"
            pairs += [
                (po.onlineOriginalPostsFile, self.online_prefix + self.pfile),
                (po.originalPostsFileName, self.pfile),
                (po.onlinePostsXMLFile, self.online_prefix + self.prdf),
                (po.onlinePostsTTLFile, self.online_prefix + self.pttl),
                (po.postsXMLFileName, self.prdf),
                (po.postsTTLFileName, self.pttl),
                (po.nPosts, self.nposts),
                (po.nCharsOverall, int(self.totalchars)),
                (po.mCharsOverall, self.mcharsposts),
                (po.dCharsOverall, self.dcharsposts),
                (po.nTokensOverall, int(self.totaltokens)),
                (po.mTokensOverall, self.mtokensposts),
                (po.dTokensOverall, self.dtokensposts),
            ]
            pairs += [(po.postAttribute, var) for var in self.postsvars]

        self.isego = bool(P.get(r.URIRef(self.snapshoturi), a, po.EgoSnapshot))
        self.isgroup = bool(
            P.get(r.URIRef(self.snapshoturi), a, po.GroupSnapshot))
        pairs += [
            (po.isGroup, self.isgroup),
            (po.isEgo, self.isego),
            (po.isFriendship, self.isfriendship),
            (po.isInteraction, self.isinteraction),
            (po.hasText, self.hastext),
            (po.isPost, self.hastext),  # isPost is an alias of hasText
        ]

        self.mrdf = self.snapshotid + "Meta.rdf"
        self.mttl = self.snapshotid + "Meta.ttl"

        self.desc = "facebook network with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
            self.snapshotid,
            self.snapshoturi,
            self.isego,
            self.isgroup,
        )
        self.desc += "\nisFriendship: {}".format(self.isfriendship)
        if self.isfriendship:
            self.desc += "; nFriends: {}; nFrienships: {}.".format(
                self.nfriends,
                self.nfriendships,
            )
        self.desc += "\nisInteraction: {}".format(self.isinteraction)
        if self.isinteraction:
            self.desc += "; nInteracted: {}; nInteractions: {}.".format(
                self.ninteracted,
                self.ninteractions,
            )
        self.desc += "\nisPost: {} (alias hasText: {})".format(
            self.hastext, self.hastext)
        if self.hastext:
            # The template used to have six fields for seven arguments, so
            # nposts was printed as the mean and totaltokens was dropped;
            # the added nPosts field realigns every label with its value.
            self.desc += ";\nnPosts: {}; mCharsPostsOverall: {}; dCharsPostsOverall: {}; totalCharsOverall: {}; \
                          \nmTokensPostsOverall: {}; dTokensPostsOverall: {}; totalTokensOverall: {}".format(
                self.nposts,
                self.mcharsposts,
                self.dcharsposts,
                self.totalchars,
                self.mtokensposts,
                self.dtokensposts,
                self.totaltokens,
            )

        uris = [
            po.triplifiedIn,
            po.triplifiedBy,
            po.donatedBy,
            po.availableAt,
            po.onlineMetaXMLFile,
            po.onlineMetaTTLFile,
            po.metaXMLFileName,
            po.metaTTLFileName,
            po.acquiredThrough,
            po.socialProtocolTag,
            po.socialProtocol,
            NS.rdfs.comment,
        ] + [uri for uri, _ in pairs]
        vals = [
            datetime.datetime.now(),
            "scripts/",
            self.snapshotid[:-4],
            self.online_prefix,
            self.online_prefix + self.mrdf,
            self.online_prefix + self.mttl,
            self.mrdf,
            self.mttl,
            "Netvizz",
            "Facebook",
            P.rdf.ic(po.Platform, "Facebook", self.meta_graph,
                     self.snapshoturi),
            self.desc,
        ] + [val for _, val in pairs]
        P.rdf.triplesScaffolding(self.snapshoturi, uris, vals,
                                 self.meta_graph)
Ejemplo n.º 45
0
    def writeAllTW(self):
        # write meta and readme with self.desc, finished.
        g=P.context(self.meta_graph)
        ntriples=len(g)
        triples=[
                 (self.snapshoturi,po.nMetaTriples,ntriples)      ,
                 ]
        P.add(triples,context=self.meta_graph)
        g.namespace_manager.bind("po",po)
        g.serialize(self.final_path_+self.snapshotid+"Meta.ttl","turtle"); c("ttl")
        g.serialize(self.final_path_+self.snapshotid+"Meta.rdf","xml")
        c("serialized meta")
        # copia o script que gera este codigo
        if not os.path.isdir(self.final_path_+"scripts"):
            os.mkdir(self.final_path_+"scripts")
        shutil.copy(S.PACKAGEDIR+"/../tests/triplify.py",self.final_path_+"scripts/triplify.py")
        # copia do base data
        tinteraction="""\n\n{} individuals with metadata {}
and {} interactions (retweets: {}, replies: {}, user_mentions: {}) 
constitute the interaction 
network in the RDF/XML file(s):
{}
and the Turtle file(s):
{}
(anonymized: {}).""".format( self.nparticipants,str(self.participantvars),
                    self.nretweets+self.nreplies+self.nuser_mentions,self.nretweets,self.nreplies,self.nuser_mentions,
                    self.tweet_rdf,
                    self.tweet_ttl,
                    self.interactions_anonymized)
        tposts="""\n\nThe dataset consists of {} tweets with metadata {}
{:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {}
{:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {}""".format(
                        self.ntweets,str(self.tweetvars),
                        self.mcharstweets,self.dcharstweets,self.totalchars,
                        self.mtokenstweets,self.dtokenstweets,self.totaltokens,
                        )
        self.dates=[i.isoformat() for i in self.dates]
        date1=min(self.dates)
        date2=max(self.dates)
        with open(self.final_path_+"README","w") as f:
            f.write("""::: Open Linked Social Data publication
\nThis repository is a RDF data expression of the twitter
snapshot {snapid} with tweets from {date1} to {date2}
(total of {ntrip} triples).{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is on the script/ directory.\n:::""".format(
                snapid=self.snapshotid,date1=date1,date2=date2,ntrip=self.ntriples,
                        tinteraction=tinteraction,
                        tposts=tposts,
                        mrdf=self.mrdf,
                        mttl=self.mttl,
                        ise=self.isego,
                        isg=self.isgroup,
                        isf=self.isfriendship,
                        isi=self.isinteraction,
                        ist=self.hastext,
                        ava=self.online_prefix,
                        desc=self.desc
                        ))
Ejemplo n.º 46
0
def parseLegacyFiles(data_dir=DATADIR):
    """Register legacy Gmane directories as snapshots in the 'gmane' context.

    Each sub-directory of ``data_dir`` that contains at least one file whose
    name is made only of digits yields one snapshot.  The snapshot id is
    ``legacy-<directory>-<first>-<last>``, where ``<first>`` and ``<last>``
    are the first and last such file names in sorted (string) order, with
    leading zeros removed.

    The metadata triples are added with ``P.add(..., context="gmane")`` and a
    short summary is reported through ``c``.

    data_dir: directory holding one sub-directory per list ("~" is expanded).

    Returns the set of snapshot URIs that were registered.
    """
    data_dir = os.path.expanduser(data_dir)
    # os.path.join keeps this working whether or not data_dir ends with a
    # path separator.
    directories = [
        i for i in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, i))
    ]

    snapshots = set()
    triples = []
    for directory in directories:
        directory_path = os.path.join(data_dir, directory)
        entries = os.listdir(directory_path)
        all_files = sorted(i for i in entries if i.isdigit())
        if not all_files:
            # nothing numbered in this directory: no snapshot is registered
            continue
        # A name made only of zeros strips to "", so fall back to "0" for
        # both ends of the range (the last name used to be left empty).
        first_id = all_files[0].lstrip("0") or "0"
        last_id = all_files[-1].lstrip("0") or "0"
        snapshotid = "legacy-" + directory + "-" + first_id + "-" + last_id
        snapshoturi = po.GmaneSnapshot + "#" + snapshotid
        expressed_classes = [
            po.GmaneParticipant, po.EmailPeer, po.EmailMessage
        ]
        expressed_reference = directory
        name_humanized = "Gmane email list with id " + expressed_reference
        # size of every entry in the directory, in units of 10**6 bytes
        directorysize = sum(
            os.path.getsize(os.path.join(directory_path, filename))
            for filename in entries) / 10**6
        fileformat = "mbox"
        directoryuri = po.Directory + "#gmane-" + directory
        triples += [
            (snapshoturi, a, po.Snapshot),
            (snapshoturi, po.dataDir, data_dir),
            (snapshoturi, a, po.GmaneSnapshot),
            (snapshoturi, po.snapshotID, snapshotid),
            (snapshoturi, po.isEgo, False),
            (snapshoturi, po.isGroup, True),
            (snapshoturi, po.isFriendship, False),
            (snapshoturi, po.isInteraction, True),
            (snapshoturi, po.isPost, True),
            (snapshoturi, po.humanizedName, name_humanized),
            (snapshoturi, po.expressedReference, expressed_reference),
            (snapshoturi, po.rawDirectory, directoryuri),
            (directoryuri, po.directorySize, directorysize),
            (directoryuri, po.directoryName, directory),
            (directoryuri, po.fileFormat, fileformat),
        ] + [(directoryuri, po.expressedClass, expressed_class)
             for expressed_class in expressed_classes]
        snapshots.add(snapshoturi)
    # NOTE(review): both counters are the number of directories, although a
    # directory without numbered files adds no snapshot above -- confirm
    # whether nsnapshots should be len(snapshots).
    nsnapshots = ndirectories = len(directories)
    # Clearing the context first is disabled: P.context("gmane", "remove")
    platformuri = P.rdf.ic(po.Platform, "Gmane", context="gmane")
    triples += [
        (NS.social.Session, NS.social.nGmaneParsedDirectories, ndirectories),
        (NS.social.Session, NS.social.nGmaneSnapshots, nsnapshots),
        (NS.social.Session, po.platform, platformuri),
    ]
    P.add(triples, context="gmane")
    c("parsed {} gmane data directories (=={} snapshots) are in percolation graph and 'gmane' context"
      .format(ndirectories, nsnapshots))
    c("percolation graph have {} triples ({} in gmane context)".format(
        len(P.percolation_graph), len(P.context("gmane"))))
    # Summary figures, all restricted to the <gmane> graph.
    negos = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE         { GRAPH <gmane> { ?s po:isEgo true         } } "
    )
    ngroups = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE       { GRAPH <gmane> { ?s po:isGroup true       } } "
    )
    nfriendships = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE  { GRAPH <gmane> { ?s po:isFriendship true  } } "
    )
    ninteractions = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isInteraction true } } "
    )
    nposts = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE        { GRAPH <gmane> { ?s po:isPost true        } } "
    )
    totalsize = sum(
        P.query(
            r" SELECT ?size WHERE              { GRAPH <gmane> { ?s po:directorySize ?size     } } "
        ))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship structures. {} have an interaction structures. {} have texts 
Total raw data size is {:.2f}MB""".format(negos, ngroups, nfriendships,
                                          ninteractions, nposts, totalsize))
    return snapshots
Ejemplo n.º 47
0
    def makeMetadata(self):
        """Write the snapshot-level metadata of a facebook snapshot.

        Sets the friendship/meta file-name attributes and ``self.desc``, then
        adds to ``self.meta_graph`` the anonymization flag, the fixed snapshot
        flags, the name and acquisition date, and whichever of numericID,
        stringID and url are found for the snapshot in ``self.social_graph``.

        NOTE: the file-name, online-location and friend-count triples are
        disabled upstream and are intentionally not emitted here.
        """
        snapshot = self.snapshoturi

        self.ffile = "base/" + self.filename_friendships
        self.frdf = self.snapshotid + "Friendship.rdf"
        self.fttl = self.snapshotid + "Friendship.ttl"
        P.add(
            [(snapshot, po.friendshipsAnonymized,
              self.friendships_anonymized)],
            context=self.meta_graph)

        self.mrdf = self.snapshotid + "Meta.rdf"
        self.mttl = self.snapshotid + "Meta.ttl"

        # One template for the whole description; the run of spaces before
        # "isEgo" is part of the emitted text.
        description = (
            "facebook network with snapshotID: {}\nsnapshotURI: {} \n"
            "            isEgo: {}. isGroup: {}."
            "\nisFriendship: {}"
            "\nisInteraction: {}"
            "\nisPost: {} (hasText)"
        )
        self.desc = description.format(
            self.snapshotid, snapshot, self.isego, self.isgroup,
            self.isfriendship, self.isinteraction, self.hastext)

        snapshot_ref = r.URIRef(snapshot)
        date_obtained = P.get(snapshot_ref, po.dateObtained)[2].toPython()
        assert isinstance(date_obtained, datetime.date)
        name = P.get(snapshot_ref, po.name, None,
                     context=self.social_graph)[2]

        triples = [
            (snapshot, po.triplifiedIn, datetime.datetime.now()),
            (snapshot, a, po.Snapshot),
            (snapshot, po.snapshotID, self.snapshotid),
            (snapshot, po.isEgo, True),
            (snapshot, po.isGroup, False),
            (snapshot, po.isFriendship, True),
            (snapshot, po.isInteraction, False),
            (snapshot, po.isPost, False),
            (snapshot, po.dateObtained, date_obtained),
            (snapshot, po.name, name),
            (snapshot, po.acquiredThrough, "Netvizz"),
            (snapshot, po.socialProtocol, "Facebook"),
            (snapshot, po.comment, self.desc),
        ]

        # Identifiers are optional: copy each one only when the social graph
        # has a value for it.
        for prop in (po.numericID, po.stringID, po.url):
            found = P.get(snapshot_ref, prop, None,
                          context=self.social_graph)
            if found:
                triples.append((snapshot, prop, found[2]))

        P.add(triples, self.meta_graph)
Ejemplo n.º 48
0
    def makeMetadata(self):
        self.makePostsTriples()
        # get participant and message vars from snapshot through queries
        self.participantvars = P.get("""SELECT DISTINCT ?p WHERE { GRAPH <%s> {
                                  ?fooparticipant po:snapshot <%s> .
                                  ?fooparticipant a po:Participant .
                                  ?fooparticipant ?p ?fooobject . } } """ % (
                                self.translation_graph, self.snapshoturi))
        P.rdf.triplesScaffolding(
            self.snapshoturi,
            [po.ParticipantAttribute]*len(self.participantvars),
            self.participantvars, context=self.meta_graph)
        self.messagevars = P.get("""SELECT DISTINCT ?p WHERE { GRAPH <%s> {
                               ?foomessage po:snapshot <%s> .
                               ?foomessage a po:Message .
                               ?foomessage ?p ?fooobject . } } """ % (
                                   self.translation_graph, self.snapshoturi))
        P.rdf.triplesScaffolding(
                self.snapshoturi,
                [po.MessageAttribute]*len(self.messagevars),
                self.messagevars, context=self.meta_graph)

        self.mrdf = self.snapshotid+"Meta.rdf"
        self.mttl = self.snapshotid+"Meta.ttl"
        self.desc = "dataset with snapshotID:\
            {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
            self.snapshotid, self.snapshoturi, self.isego, self.isgroup)
        self.desc += "\nisFriendship: {}; ".format(self.isfriendship)
        self.desc += "isInteraction: {}.".format(self.isinteraction)
        self.desc += "\nhasText: {}".format(self.hastext)
        self.nchecks = P.get(r"SELECT (COUNT(?checker) as ?cs) WHERE { \
                             ?foosession po:checkParticipant ?checker}",
                             context=self.translation_graph)
        self.desc += "\nnParticipants: {}; nInteractions: {} \
            (only session checks in first aa).".format(
                self.nparticipants, self.nchecks)
        self.desc += "\nnMessages: {}; ".format(self.nmessages)
        self.desc += "\nnCharsOverall: {}; mCharsOverall: {};\
            dCharsOverall: {}.".format(self.totalchars, self.mchars_messages,
                                       self.dchars_messages)
        self.desc += "\nnTokensOverall: {}; mTokensOverall: {};\
            dTokensOverall: {};".format(self.totaltokens, self.mtokens_messages,
                                        self.dtokens_messages)
        self.desc += "\nnSentencesOverall: {}; mSentencesOverall: {};\
            dSentencesOverall: {};".format(
                self.totalsentences, self.msentences_messages,
                self.dsentences_messages)
        self.desc += "\nnURLs: {}; nAAMessages {}.".format(
            self.nurls, self.nmessages)
        self.dates = P.get(r"SELECT ?date WHERE { GRAPH <%s> {\
                           ?fooshout po:createdAt ?date } " % (
                               self.translation_graph,))
        self.desc += "\nReference timespan: {} to {}".format(
            min(dates), max(dates))
        self.desc += """\nRDF expression in the XML file(s):
{}
and the Turtle file(s):
{}
(anonymized: {}).""".format(self.translation_xml, self.translation_ttl,
                            self.anonymized)
        self.desc += """\nMetadata of this snapshot in the XML file(s):
{}
and the Turtle file(s):
{}.""".format(self.meta_xml, self.meta_ttl)
        self.desc += """\nFiles should be available in: \n{}""".format()

        self.desc += "\n\nNote: numeric variables starting with n area \
            countings, with m are means and d are standard deviations."
        if isinstance(self.translation_xml, list):
            P.rdf.triplesScaffolding(
                self.snapshoturi,
                [po.translationXMLFilename]*len(self.translation_xml) +
                [po.translationTTLFilename]*len(self.translation_ttl),
                self.translation_xml+self.translation_ttl,
                context=self.meta_graph)
            P.rdf.triplesScaffolding(
                self.snapshoturi,
                [po.onlineTranslationXMLFileURI]*len(self.translation_xml) +
                [po.onlineTranslationTTLFileURI]*len(self.translation_ttl),
                [self.online_prefix+i for i in
                 self.translation_xml+self.translation_ttl],
                context=self.meta_graph)
            triples = [
                (self.snapshoturi, po.translationXMLFilesize,
                 self.translation_size_xml),
                (self.snapshoturi, po.translationTTLFilesize,
                 self.translation_size_ttl),
                      ]
        else:
            triples = [
                      (self.snapshoturi, po.translationXMLFilename,
                       self.translation_xml),
                      (self.snapshoturi, po.translationXMLFilesize,
                       self.translation_size_xml),
                      (self.snapshoturi, po.translationTTLFilename,
                       self.translation_ttl),
                      (self.snapshoturi, po.translationTTLFilesize,
                       self.translation_size_ttl),
                      ]
        P.add(triples,self.meta_graph)
        triples=[
                (self.snapshoturi, po.triplifiedIn,      datetime.datetime.now()),
                (self.snapshoturi, po.triplifiedBy,      "scripts/"),
#                (self.snapshoturi, po.donatedBy,         self.snapshotid[:-4]),
                (self.snapshoturi, po.availableAt,       self.online_prefix),
                (self.snapshoturi, po.onlineMetaXMLFile, self.online_prefix+self.mrdf),
                (self.snapshoturi, po.onlineMetaTTLFile, self.online_prefix+self.mttl),
                (self.snapshoturi, po.metaXMLFileName,   self.mrdf),
                (self.snapshoturi, po.metaTTLFileName,   self.mttl),
#                (self.snapshoturi, po.acquiredThrough,   "aa shouts in "+self.snapshotid),
                (self.snapshoturi, po.socialProtocolTag, self.social_protocol), # AA, fb, etc
                (self.snapshoturi, po.socialProtocol,    P.rdf.ic(po.Platform,self.social_protocol,self.meta_graph,self.snapshoturi)),
                (self.snapshoturi, po.nTriples,         self.ntranslation_triples),
                (self.snapshoturi, NS.rdfs.comment,         self.desc),
                ]
        P.add(triples,self.meta_graph)
Ejemplo n.º 49
0
def parseLegacyFiles(datadir=DATADIR + "facebook/"):
    """Parse legacy gdf, gml and tab files of facebook structures

    Every file in ``datadir`` (except ``*swp`` files and ``ipython_log.py``)
    is described with triples that are added to the ``social_facebook``
    context; a summary is reported through ``c``.

    Syntax of facebook filenames is:
    <prefix><name><date><suffix><extension> where:

    <prefix> used are:
        *) avlab_ for files obtained with participants at AVLAB
        *) posavlab_ for files obtained from participants
        *) page_ for files about facebook pages
        *) ego_ for ego networks
    omitted for gml files and gdf group files.

    <name> is any string name associated with the user or
    group delimiting the structure in the file, e.g. FernandValfro.
    It gets split with spaces before uppercase letter chunks for
    the snapshot name: REM stays REM, RFabbri becomes "R Fabbri".

    <date> daymonthyear in 2/2/4 digits, e.g. 20032014 for 20/March/2014.

    <suffix> is omitted for friendship .gml .gdf networks,
    .tab are text and activity files.
    _interactions is used if interaction network.

    <extension> is either
    .gml for gml files, all are ego friendship network data
    .gdf for gdf files with group and ego,
                            interaction and friendship network data
    .tab for tab files with post data, such as text

    These render snapshots of two classes:
    po:FacebookEgoFriendshipSnapshot from .gml files and
        gdf files with prefix avlab_ posavlab_ or ego_
    po:FacebookGroupFriendshipInteractionSnapshot from
        .gdf files without prefix with and without _interactions suffix
        and the .tab files. They form sets of files, all with friendship
        and interaction networks and some with a .tab file.

    Returns the set of snapshot URIs found.

    Raises:
        IndexError: a filename does not match the pattern above.
        ValueError: an ego file has no numeric ID (unless "Mirtes" is in
            its name), or a group file has neither numeric nor string ID.
        NameError: a group file is neither the friendship .gdf, the
            _interactions .gdf nor a .tab file.

    ToDo:
       *) Implement parsing of page files.
       *) Implement parsing of new group files."""
    platformuri = P.rdf.ic(po.Platform, "Facebook", context="social_facebook")
    triples = [
        (platformuri, po.dataDir, datadir),
    ]
    filenames = os.listdir(datadir)
    # editor swap files and the ipython log are not data files
    filenames = [
        i for i in filenames if not i.endswith("swp") and "ipython_log.py" != i
    ]
    snapshots = set()
    # groups: prefix, name, 8-digit date, optional suffix, extension
    regex = re.compile(
        r"^(avlab_|ego_|posavlab_|page_)*(.*?)(\d{8})(_interactions|_comments){0,1}\.(gdf|tab|gml)$"
    )
    # chunks of a name: an uppercase run, or one uppercase letter followed by
    # lowercase letters
    regex2 = re.compile(r'([A-Z]{2,}(?=[A-Z]|$)|[A-Z][a-z]*)')
    for filename in filenames:
        # findall returns [] for a non-matching name, so [0] raises IndexError
        prefix, name, date, sufix, format_ = regex.findall(filename)[0]
        if prefix == "page_":
            c("page data currently not supported. Jumping", filename)
            continue
        # size in megabytes
        # NOTE(review): filesize is not used below; the po.fileSize triple is
        # commented out.
        filesize = os.path.getsize(datadir + filename) / (10**6)
        # all files of one set (friendship, interactions, tab) share the id
        snapshotid = 'facebook-legacy-' + filename.replace(
            "_interactions.gdf", "").replace(".tab", "").replace(
                '.gml', '').replace('.gdf', '')
        snapshoturi = po.Snapshot + "#" + snapshotid
        # date is ddmmyyyy
        date_obtained = datetime.date(int(date[4:]), int(date[2:4]),
                                      int(date[:2]))
        name_humanized = " ".join(regex2.findall(name))
        # metadata is looked up under the name of the friendship .gdf file;
        # item 0 is used as numeric ID, item 1 as string ID and an optional
        # item 2 as URL
        metadata = S.legacy.facebook.files.files_dict[filename.replace(
            "_interactions.gdf", ".gdf").replace(".tab", ".gdf")]
        if metadata[0]:
            triples += [(snapshoturi, po.numericID, metadata[0])]
        if metadata[1]:
            triples += [(snapshoturi, po.stringID, metadata[1])]
        if len(metadata) == 3:
            if not metadata[2]:
                c("group data without a publishing link: ", filename)
            else:
                triples += [(snapshoturi, po.url, metadata[2])]
        # ego snapshot: any .gml file, or a file with an ego-type prefix
        if filename.endswith(".gml") or any(
                filename.startswith(i)
                for i in ("ego_", "avlab_", "posavlab_")):
            isego = True
            isgroup = False
            isfriendship = True
            isinteraction = False
            isposts = False
            expressed_classes = (po.Friendship, po.Participant)

            if metadata[0]:
                expressed_reference = po.Participant+"#" + \
                    snapshotid+"-"+metadata[0]
            else:
                # the only ego file accepted without a numeric ID
                if "Mirtes" in filename:
                    expressed_reference = po.Participant+"#" + \
                        snapshotid+"-anon_mirtes"
                else:
                    raise ValueError(
                        "Numeric ID is needed for friendship networks")
            triples += [(expressed_reference, a, po.FacebookParticipant)]
        else:  # group snapshot
            isego = False
            isgroup = True
            # names the sibling files of this set would have; the flags tell
            # which of them are present in datadir
            ffilename = prefix + name + date + ".gdf"
            ifilename = prefix + name + date + "_interactions.gdf"
            tfilename = prefix + name + date + ".tab"
            isfriendship = ffilename in filenames
            isinteraction = ifilename in filenames
            isposts = tfilename in filenames
            if metadata[0]:
                expressed_reference = po.FacebookGroup+"#" +\
                    snapshotid+"-"+metadata[0]
            else:
                if metadata[1]:
                    expressed_reference = po.FacebookGroup+"#" +\
                        snapshotid+"-"+metadata[1]
                else:
                    raise ValueError("Numeric or string ID is needed\
                                     for group networks")
            triples += [(expressed_reference, a, po.FacebookGroup)]
            # what this particular file expresses
            if filename == ffilename:
                expressed_classes = (po.Friendship, po.Participant)
            elif filename == ifilename:
                expressed_classes = (po.Interaction, po.Participant)
            elif format_ == "tab":
                expressed_classes = (po.Post, )
            else:
                raise NameError("filename structure not understood")

        fileuri = NS.po.File + "#" + snapshotid + "-_file_-" + filename
        triples += [
            (snapshoturi, a, po.Snapshot),
            # (snapshoturi, a, po.FacebookSnapshot),
            (snapshoturi, po.snapshotID, snapshotid),
            (snapshoturi, po.isEgo, isego),
            (snapshoturi, po.isGroup, isgroup),
            (snapshoturi, po.isFriendship, isfriendship),
            (snapshoturi, po.isInteraction, isinteraction),
            (snapshoturi, po.isPost, isposts),
            (snapshoturi, po.name, name_humanized),
            (snapshoturi, po.dateObtained, date_obtained),
            # (snapshoturi, po.expressedReference, expressed_reference),
            (snapshoturi, po.rawFile, fileuri),
            # (fileuri,     po.fileSize, filesize),
            (fileuri, po.fileName, filename),
            (fileuri, po.fileFormat, format_),
        ]
        triples += [(fileuri, po.expressedClass, expressed_class)
                    for expressed_class in expressed_classes]
        note = theNote(filename)  # for avlab and posavlab
        if note:
            triples += [
                (snapshoturi, NS.rdfs.comment, note),
            ]
        # a set: the files of one group share a single snapshot URI
        snapshots.add(snapshoturi)
    # data about the overall data in percolation graph
    nfiles = len(filenames)
    nsnapshots = len(snapshots)
    triples += [
        (NS.social.Session, NS.social.nFacebookParsedFiles, nfiles),
        (NS.social.Session, NS.social.nFacebookSnapshots, nsnapshots),
    ]
    # NOTE(review): presumably this clears the context before the new triples
    # are added; it runs after P.rdf.ic(...) was called on the same context
    # at the top of the function -- confirm that is intended.
    P.context("social_facebook", "remove")
    P.add(triples, context="social_facebook")
    c("parsed {} facebook files ({} snapshots) are in percolation \
      graph and 'social_facebook' context".format(nfiles, nsnapshots))
    c("percolation graph have {} triples ({} in social_facebook context\
      )".format(len(P.percolation_graph), len(P.context("social_facebook"))))
    # summary queries; none of them names a graph
    negos = P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isEgo true } ")
    ngroups = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isGroup true } ")
    nfriendships = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isFriendship true } ")
    ninteractions = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isInteraction true } ")
    nposts = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isPost true } ")
    # NOTE(review): this function adds no po:fileSize triple (it is commented
    # out above), so the total depends on triples written elsewhere.
    totalsize = sum(P.query(r" SELECT ?size WHERE { ?s po:fileSize ?size } "))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship network. {} have an interaction network. \
      {} have post texts and reaction counts
Total raw data size is {:.2f}MB""".format(negos, ngroups, nfriendships,
                                          ninteractions, nposts, totalsize))

    return snapshots
Ejemplo n.º 50
0
    def rdfInteractionNetwork(self, fnet):
        """Write the participants and interactions of ``fnet`` as triples.

        ``fnet`` is read as ``fnet["individuals"][column]`` (one sequence per
        participant attribute, including "label" and "name", optionally
        "groupid") and ``fnet["relations"][column]`` for the columns
        "node1", "node2" and "weight".

        For every participant a ``po.Participant`` and a ``po.Observation``
        are created in ``self.interaction_graph`` and the attribute values
        are attached to the observation.  For every relation a
        ``po.Interaction`` linking the two participants and an observation
        holding its integer weight are created.

        The network counts as anonymized when every label contains "user";
        then "label" and "name" are not written as attributes and
        observations are named from ``self.observation_count``.

        Sets ``self.interactions_anonymized``, ``self.groupid2``,
        ``self.varsfriendsinteraction``, ``self.ninteracted``,
        ``self.interactionsvarsfoo``, ``self.ninteractions`` and
        ``self.interactionsvars``.

        Raises ValueError if the individuals have no "name" or "label"
        column and AssertionError if a weight is not a whole number.
        """
        individuals = fnet["individuals"]
        # fake names and local ids: every label contains "user"
        self.interactions_anonymized = all(
            "user" in label for label in individuals["label"])
        anonymized = self.interactions_anonymized

        tkeys = list(individuals.keys())
        if "groupid" in tkeys:
            self.groupid2 = individuals["groupid"][0]
            tkeys.remove("groupid")
        else:
            self.groupid2 = None
        if anonymized:
            self.varsfriendsinteraction = [
                trans[i] for i in tkeys if i not in ('label', 'name')
            ]
        else:
            self.varsfriendsinteraction = [trans[i] for i in tkeys]

        # Attribute lookup on the namespace instead of eval() on built text.
        insert = {
            "uris": [getattr(po, trans[tkey]) for tkey in tkeys],
            "vals": [individuals[tkey] for tkey in tkeys],
        }
        self.ninteracted = len(insert["vals"][0])
        iname = tkeys.index("name")
        ilabel = tkeys.index("label")
        hidden = (ilabel, iname)
        # Participant and interaction ids use the snapshot id when anonymized
        # and the provenance prefix otherwise.
        prefix = self.snapshotid if anonymized else self.provenance_prefix

        for vals_ in zip(*insert["vals"]):
            name_ = "{}-{}".format(prefix, vals_[iname])
            if anonymized:
                insert_uris_ = [
                    el for i, el in enumerate(insert['uris'])
                    if i not in hidden
                ]
                vals__ = [
                    el for i, el in enumerate(vals_) if i not in hidden
                ]
                # This name was previously stored under another variable, so
                # `obsname` below was undefined (or stale) for anonymized
                # networks.
                obsname = '{}-{}'.format(self.snapshotid,
                                         self.observation_count)
                self.observation_count += 1
            else:
                insert_uris_ = list(insert['uris'])
                vals__ = list(vals_)
                obsname = '{}-{}'.format(self.snapshotid, vals_[iname])
            ind = P.rdf.ic(po.Participant, name_, self.interaction_graph,
                           self.snapshoturi)
            obs = P.rdf.ic(po.Observation, obsname, self.interaction_graph,
                           self.snapshoturi)
            P.add([(ind, po.observation, obs)], self.interaction_graph)
            if vals__:
                P.rdf.triplesScaffolding(obs, insert_uris_, vals__,
                                         self.interaction_graph)
            else:
                c(
                    "anonymous participant without attributes (besides local id). \
                  snapshotid:", self.snapshotid, "values:", vals_)

        c("participant written")
        self.interactionsvarsfoo = ["node1", "node2", "weight"]
        interactions_ = [
            fnet["relations"][i] for i in self.interactionsvarsfoo
        ]
        self.ninteractions = len(interactions_[0])
        self.interactionsvars = ["iFrom", "iTo", "weight"]
        for count, (uid1, uid2, weight) in enumerate(zip(*interactions_)):
            weight_ = int(weight)
            assert weight_-weight == 0, \
                "float weights in fb interaction networks?"
            iid = "{}-{}-{}".format(prefix, uid1, uid2)
            uids = [
                r.URIRef(po.Participant + "#{}-{}".format(prefix, uid))
                for uid in (uid1, uid2)
            ]
            ind = P.rdf.ic(po.Interaction, iid, self.interaction_graph,
                           self.snapshoturi)
            P.rdf.triplesScaffolding(ind,
                                     [po.interactionFrom, po.interactionTo],
                                     uids, self.interaction_graph)
            obsname = '{}-{}-{}'.format(self.snapshotid, uid1, uid2)
            obs = P.rdf.ic(po.Observation, obsname, self.interaction_graph,
                           self.snapshoturi)
            P.add([(ind, po.observation, obs), (obs, po.weight, weight_)],
                  self.interaction_graph)
            # progress report every 1000 interactions
            if (count % 1000) == 0:
                c("interactions: ", count)
        c("escritas interações")
Ejemplo n.º 51
0
    def makeMetadata(self):
        """Describe the twitter snapshot and add its metadata triples.

        Sets ``self.mrdf``, ``self.mttl`` and ``self.desc`` and adds the
        snapshot flags, hashtag, protocol and description to
        ``self.meta_graph``.

        NOTE: the text statistics (counts, means, deviations), the
        participant-attribute scaffolding and the file-name / online-location
        triples are disabled upstream and are intentionally not emitted here.
        """
        snapshot = self.snapshoturi
        snapshot_id = self.snapshotid

        self.mrdf = snapshot_id + "Meta.rdf"
        self.mttl = snapshot_id + "Meta.ttl"

        # The description is the concatenation of these pieces, in order.
        desc_parts = [
            "twitter dataset with snapshotID: {}\nsnapshotURI: {} \n"
            "isEgo: {}. isGroup: {}.".format(
                snapshot_id, snapshot, self.isego, self.isgroup),
            "\nisFriendship: {}; ".format(self.isfriendship),
            "isInteraction: {}.".format(self.isinteraction),
            "\nisPost: {}".format(self.hastext),
        ]
        self.desc = "".join(desc_parts)

        # The hashtag is the snapshot id without its legacy prefix.
        hashtag = '#' + snapshot_id.replace('twitter-legacy-', '')
        triples = [
            (snapshot, po.triplifiedIn, datetime.datetime.now()),
            (snapshot, a, po.Snapshot),
            (snapshot, po.snapshotID, snapshot_id),
            (snapshot, po.isEgo, False),
            (snapshot, po.isGroup, True),
            (snapshot, po.isFriendship, False),
            (snapshot, po.isInteraction, True),
            (snapshot, po.isPost, True),
            (snapshot, po.hashtag, hashtag),
            (snapshot, po.socialProtocol, "Twitter"),
            (snapshot, po.comment, self.desc),
        ]
        P.add(triples, self.meta_graph)
Ejemplo n.º 52
0
def minimumOntology(context="minimum_ontology"):
    """Build the triples of the minimum ontology and store or return them.

    The triples are produced by ``rdfsTriples()``.  When ``context`` is the
    literal string ``"triples"`` they are handed back to the caller and
    nothing is stored.  For any other value they are added through ``P.add``
    to the context named ``context`` and the function returns ``None``.
    """
    ontology_triples = rdfsTriples()
    if context != "triples":
        P.add(ontology_triples, context=context)
        return None
    return ontology_triples
Ejemplo n.º 53
0
    def writeAllTW(self):
        """Serialize the Twitter meta graph and write the publication README.

        Steps, in order:

        1. Count the triples of the meta graph and record that count in the
           same graph as ``po.nMetaTriples`` (the count is taken before this
           triple itself is added).
        2. Serialize the meta graph to ``<snapshotid>Meta.ttl`` (Turtle) and
           ``<snapshotid>Meta.rdf`` (RDF/XML) under ``self.final_path_``.
        3. Copy ``tests/triplify.py`` into ``<final_path_>scripts/``.
        4. Write ``<final_path_>README`` describing the snapshot.

        Side effect (kept for backward compatibility): ``self.dates`` is
        replaced by the ISO 8601 strings of its entries.  Entries that are
        already strings are left untouched, so a second call does not fail.

        Raises:
            ValueError: if ``self.dates`` is empty (``min``/``max`` of an
                empty sequence).
        """
        # write meta and readme with self.desc, finished.
        g = P.context(self.meta_graph)
        ntriples = len(g)
        P.add([(self.snapshoturi, po.nMetaTriples, ntriples)],
              context=self.meta_graph)
        g.namespace_manager.bind("po", po)
        g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
        c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
        c("serialized meta")

        # Copy the script that generates this publication.  makedirs with
        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(self.final_path_ + "scripts", exist_ok=True)
        shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                    self.final_path_ + "scripts/triplify.py")

        # README sections.  The templates are spelled as explicit literals so
        # that the trailing spaces some lines carry stay visible in the source.
        tinteraction = (
            "\n\n{} individuals with metadata {}\n"
            "and {} interactions (retweets: {}, replies: {}, user_mentions: {}) \n"
            "constitute the interaction \n"
            "network in the RDF/XML file(s):\n"
            "{}\n"
            "and the Turtle file(s):\n"
            "{}\n"
            "(anonymized: {})."
        ).format(
            self.nparticipants, str(self.participantvars),
            self.nretweets + self.nreplies + self.nuser_mentions,
            self.nretweets, self.nreplies, self.nuser_mentions, self.tweet_rdf,
            self.tweet_ttl, self.interactions_anonymized)
        tposts = (
            "\n\nThe dataset consists of {} tweets with metadata {}\n"
            "{:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {}\n"
            "{:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {}"
        ).format(
            self.ntweets,
            str(self.tweetvars),
            self.mcharstweets,
            self.dcharstweets,
            self.totalchars,
            self.mtokenstweets,
            self.dtokenstweets,
            self.totaltokens,
        )

        # Idempotent conversion: entries already converted by an earlier call
        # are strings and have no isoformat(), so they are passed through.
        self.dates = [
            d if isinstance(d, str) else d.isoformat() for d in self.dates
        ]
        date1 = min(self.dates)
        date2 = max(self.dates)

        # The text is rendered before the file is opened so that a formatting
        # error cannot leave a truncated README behind.  The last sentence
        # names "scripts/", the directory created above.
        readme = (
            "::: Open Linked Social Data publication\n"
            "\nThis repository is a RDF data expression of the twitter\n"
            "snapshot {snapid} with tweets from {date1} to {date2}\n"
            "(total of {ntrip} triples).{tinteraction}{tposts}\n"
            "\nMetadata for discovery in the RDF/XML file:\n"
            "{mrdf} \nor in the Turtle file:\n{mttl}\n"
            "\nEgo network: {ise}\n"
            "Group network: {isg}\n"
            "Friendship network: {isf}\n"
            "Interaction network: {isi}\n"
            "Has text/posts: {ist}\n"
            "\nAll files should be available at the git repository:\n"
            "{ava}\n"
            "\n{desc}\n"
            "\n"
            "The script that rendered this data publication is on the scripts/ directory.\n:::"
        ).format(snapid=self.snapshotid,
                 date1=date1,
                 date2=date2,
                 ntrip=self.ntriples,
                 tinteraction=tinteraction,
                 tposts=tposts,
                 mrdf=self.mrdf,
                 mttl=self.mttl,
                 ise=self.isego,
                 isg=self.isgroup,
                 isf=self.isfriendship,
                 isi=self.isinteraction,
                 ist=self.hastext,
                 ava=self.online_prefix,
                 desc=self.desc)
        with open(self.final_path_ + "README", "w") as f:
            f.write(readme)
Ejemplo n.º 54
0
    def makeMetadata(self):
        """Fill the meta graph with the description of the Twitter snapshot.

        In order, the method:

        * copies into the meta graph every triple the social graph holds
          about the snapshot and about each of its ``po.rawFile`` objects;
        * computes total, mean and standard deviation of the per-tweet
          character and token counts and stores them on ``self``;
        * adds the counts, the participant attribute names and the tweet
          file names (local and prefixed with ``self.online_prefix``);
        * sets ``self.mrdf``, ``self.mttl`` and the multi-line ``self.desc``;
        * adds the provenance triples, with ``self.desc`` attached through
          ``NS.rdfs.comment``.
        """
        snapshot = self.snapshoturi
        meta = self.meta_graph

        # Triples already known about the snapshot and its raw files.
        inherited = P.get(snapshot, None, None, self.social_graph)
        rawfiles = P.get(snapshot, po.rawFile, None, self.social_graph,
                         strict=True, minimized=True)
        for rawfile in rawfiles:
            inherited += P.get(rawfile, None, None, self.social_graph)

        # Overall text statistics.
        chars = self.nchars_all
        self.totalchars, self.mcharstweets, self.dcharstweets = (
            sum(chars), n.mean(chars), n.std(chars))
        tokens = self.ntokens_all
        self.totaltokens, self.mtokenstweets, self.dtokenstweets = (
            sum(tokens), n.mean(tokens), n.std(tokens))
        P.add(inherited, context=meta)

        counters = (
            (po.nParticipants, self.nparticipants),
            (po.nTweets, self.ntweets),
            (po.nReplies, self.nreplies),
            (po.nRetweets, self.nretweets),
            (po.nCharsOverall, self.totalchars),
            (po.mCharsOverall, self.mcharstweets),
            (po.dCharsOverall, self.dcharstweets),
            (po.nTokensOverall, self.totaltokens),
            (po.mTokensOverall, self.mtokenstweets),
            (po.dTokensOverall, self.dtokenstweets),
        )
        P.add([(snapshot, prop, value) for prop, value in counters],
              context=meta)

        # One triple per participant attribute and per tweet file.
        attribute_props = ([po.tweetParticipantAttribute] *
                           len(self.participantvars))
        P.rdf.triplesScaffolding(snapshot, attribute_props,
                                 self.participantvars, context=meta)
        n_rdf = len(self.tweet_rdf)
        n_ttl = len(self.tweet_ttl)
        P.rdf.triplesScaffolding(
            snapshot,
            [po.tweetXMLFilename] * n_rdf + [po.tweetTTLFilename] * n_ttl,
            self.tweet_rdf + self.tweet_ttl,
            context=meta)
        online_files = [self.online_prefix + name
                        for name in self.tweet_rdf + self.tweet_ttl]
        P.rdf.triplesScaffolding(
            snapshot,
            [po.onlineTweetXMLFile] * n_rdf + [po.onlineTweetTTLFile] * n_ttl,
            online_files,
            context=meta)

        self.mrdf = self.snapshotid + "Meta.rdf"
        self.mttl = self.snapshotid + "Meta.ttl"

        # Human-readable summary, assembled piece by piece and joined once.
        desc_parts = [
            "twitter dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
                self.snapshotid, self.snapshoturi, self.isego, self.isgroup),
            "\nisFriendship: {}; ".format(self.isfriendship),
            "isInteraction: {}.".format(self.isinteraction),
            "\nnParticipants: {}; nInteractions: {} (replies+retweets+user mentions).".format(
                self.nparticipants,
                self.nreplies + self.nretweets + self.nuser_mentions),
            "\nisPost: {} (alias hasText: {})".format(
                self.hastext, self.hastext),
            "\nnTweets: {}; ".format(self.ntweets),
            "nReplies: {}; nRetweets: {}; nUserMentions: {}.".format(
                self.nreplies, self.nretweets, self.nuser_mentions),
            "\nnTokens: {}; mTokens: {}; dTokens: {};".format(
                self.totaltokens, self.mtokenstweets, self.dtokenstweets),
            "\nnChars: {}; mChars: {}; dChars: {}.".format(
                self.totalchars, self.mcharstweets, self.dcharstweets),
            "\nnHashtags: {}; nMedia: {}; nLinks: {}.".format(
                self.nhashtags, self.nmedia, self.nlinks),
        ]
        self.desc = "".join(desc_parts)

        provenance = (
            (po.triplifiedIn, datetime.datetime.now()),
            (po.triplifiedBy, "scripts/"),
            (po.donatedBy, self.snapshotid[:-4]),
            (po.availableAt, self.online_prefix),
            (po.onlineMetaXMLFile, self.online_prefix + self.mrdf),
            (po.onlineMetaTTLFile, self.online_prefix + self.mttl),
            (po.metaXMLFileName, self.mrdf),
            (po.metaTTLFileName, self.mttl),
            (po.totalXMLFileSizeMB, sum(self.size_rdf)),
            (po.totalTTLFileSizeMB, sum(self.size_ttl)),
            (po.acquiredThrough, "Twitter APIs"),
            (po.socialProtocolTag, "Twitter"),
            (po.socialProtocol,
             P.rdf.ic(po.Platform, "Twitter", meta, snapshot)),
            (po.nTriples, self.ntriples),
            (NS.rdfs.comment, self.desc),
        )
        P.add([(snapshot, prop, value) for prop, value in provenance], meta)
Ejemplo n.º 55
0
    def writeAllFB(self):
        """Render the publication of a Facebook snapshot to disk.

        Steps, in order:

        1. Create ``self.final_path_`` (``<final_path><snapshotid>/``),
           including missing parent directories.
        2. For each enabled part (friendship, interaction, posts) serialize
           its graph as Turtle and RDF/XML and collect file sizes in MB
           (bytes / 10**6) and triple counts.
        3. Add those figures plus ``po.nMetaTriples`` to the meta graph (the
           meta count is taken before these triples are added) and serialize
           the meta graph.
        4. Copy ``tests/triplify.py`` into ``scripts/`` and the original data
           files into ``base/``.
        5. Write the ``README``.

        ``c`` is called with progress messages along the way.
        """
        c("started rendering of the snapshot publication. snapshotID:",
          self.snapshotid)
        self.final_path_ = "{}{}/".format(self.final_path, self.snapshotid)
        # makedirs also creates missing parents and tolerates an existing
        # directory, unlike the isdir() + mkdir() pair.
        os.makedirs(self.final_path_, exist_ok=True)

        triples = []
        if self.isfriendship:
            filesizerdf, filesizettl, ntriples = self._serializeSnapshotGraph(
                self.friendship_graph, "Friendship", "serialized friendships")
            triples += [
                (self.snapshoturi, po.friendshipXMLFileSizeMB, filesizerdf),
                (self.snapshoturi, po.friendshipTTLFileSizeMB, filesizettl),
                (self.snapshoturi, po.nFriendshipTriples, ntriples),
            ]
        if self.isinteraction:
            filesizerdf, filesizettl, ntriples = self._serializeSnapshotGraph(
                self.interaction_graph, "Interaction", "serialized interaction")
            triples += [
                (self.snapshoturi, po.interactionXMLFileSizeMB, filesizerdf),
                (self.snapshoturi, po.interactionTTLFileSizeMB, filesizettl),
                (self.snapshoturi, po.nInteractionTriples, ntriples),
            ]
        if self.hastext:
            filesizerdf, filesizettl, ntriples = self._serializeSnapshotGraph(
                self.posts_graph, "Posts", "serialized posts")
            triples += [
                (self.snapshoturi, po.postsXMLFileSizeMB, filesizerdf),
                (self.snapshoturi, po.postsTTLFileSizeMB, filesizettl),
                (self.snapshoturi, po.nPostsTriples, ntriples),
            ]

        # The meta graph is serialized last, after receiving the figures above.
        g = P.context(self.meta_graph)
        ntriples = len(g)
        triples += [
            (self.snapshoturi, po.nMetaTriples, ntriples),
        ]
        P.add(triples, context=self.meta_graph)
        g.namespace_manager.bind("po", po)
        g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
        c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
        c("serialized meta")

        # Copy the script that generates this publication.
        os.makedirs(self.final_path_ + "scripts", exist_ok=True)
        shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                    self.final_path_ + "scripts/triplify.py")

        # Copy the base (original) data and build the README sections.  The
        # templates are explicit literals so trailing spaces stay visible.
        os.makedirs(self.final_path_ + "base", exist_ok=True)
        originals = []
        if self.isfriendship:
            shutil.copy(self.data_path + self.filename_friendships,
                        self.final_path_ + "base/")
            originals.append("base/{}".format(self.filename_friendships))
            tfriendship = (
                "\n\n{nf} individuals with metadata {fvars}\n"
                "and {nfs} friendships constitute the friendship network in the RDF/XML file:\n"
                "{frdf} \nor in the Turtle file: \n{fttl}\n"
                "(anonymized: {fan})."
            ).format(
                nf=self.nfriends,
                fvars=str(self.friendsvars),
                nfs=self.nfriendships,
                frdf=self.frdf,
                fttl=self.fttl,
                fan=self.friendships_anonymized,
            )
        else:
            tfriendship = ""
        if self.isinteraction:
            shutil.copy(self.data_path + self.filename_interactions,
                        self.final_path_ + "base/")
            tinteraction = (
                "\n\n{} individuals with metadata {}\n"
                "and {} interactions with metadata {} constitute the interaction \n"
                "network in the RDF/XML file:\n"
                "{}\n"
                "or in the Turtle file:\n"
                "{}\n"
                "(anonymized: {})."
            ).format(self.ninteracted, str(self.varsfriendsinteraction),
                     self.ninteractions, str(self.interactionsvars),
                     self.irdf, self.ittl, self.interactions_anonymized)
            originals.append("base/{}".format(self.filename_interactions))
        else:
            tinteraction = ""
        if self.hastext:
            shutil.copy(self.data_path + self.filename_posts,
                        self.final_path_ + "base/")
            tposts = (
                "\n\n{} posts with {:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {}\n"
                "{:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {}\n"
                "posts data in the RDF/XML file:\n"
                "{}\n"
                "or in the Turtle file:\n"
                "{}"
            ).format(self.nposts, self.mcharsposts, self.dcharsposts,
                     self.totalchars, self.mtokensposts, self.dtokensposts,
                     self.totaltokens, self.prdf, self.pttl)
            originals.append("base/{}".format(self.filename_posts))
        else:
            tposts = ""

        # Make the README.  The collection date is the object (index 2) of
        # the po.dateObtained triple returned for the snapshot.
        datetime_string = P.get(r.URIRef(self.snapshoturi),
                                po.dateObtained,
                                None,
                                context="social_facebook")[2]
        # Rendered before the file is opened so a formatting error cannot
        # leave a truncated README.  Joining the original file names avoids
        # the stray leading newline when there is no friendship file, and the
        # last sentence names "scripts/", the directory created above.
        readme = (
            "::: Open Linked Social Data publication\n"
            "\nThis repository is a RDF data expression of the facebook\n"
            "snapshot {snapid} collected around {date}.{tfriendship}{tinteraction}{tposts}\n"
            "\nMetadata for discovery in the RDF/XML file:\n"
            "{mrdf} \nor in the Turtle file:\n{mttl}\n"
            "\nOriginal file(s):\n"
            "{origs}\n"
            "\nEgo network: {ise}\n"
            "Group network: {isg}\n"
            "Friendship network: {isf}\n"
            "Interaction network: {isi}\n"
            "Has text/posts: {ist}\n"
            "\nAll files should be available at the git repository:\n"
            "{ava}\n"
            "\n{desc}\n"
            "\n"
            "The script that rendered this data publication is on the scripts/ directory.\n:::"
        ).format(snapid=self.snapshotid,
                 date=datetime_string,
                 tfriendship=tfriendship,
                 tinteraction=tinteraction,
                 tposts=tposts,
                 mrdf=self.mrdf,
                 mttl=self.mttl,
                 origs="\n".join(originals),
                 ise=self.isego,
                 isg=self.isgroup,
                 isf=self.isfriendship,
                 isi=self.isinteraction,
                 ist=self.hastext,
                 ava=self.online_prefix,
                 desc=self.desc)
        with open(self.final_path_ + "README", "w") as f:
            f.write(readme)

    def _serializeSnapshotGraph(self, graph_id, suffix, done_message):
        """Write one graph as Turtle and RDF/XML and measure the result.

        The files are ``<final_path_><snapshotid><suffix>.ttl`` and ``.rdf``.
        ``c`` is called with ``"ttl"`` after the first file and with
        ``done_message`` after the second.

        Returns:
            tuple: ``(rdf_size_mb, ttl_size_mb, ntriples)`` where the sizes
            are file sizes in bytes divided by 10**6 and ``ntriples`` is
            ``len()`` of the graph.
        """
        g = P.context(graph_id)
        g.namespace_manager.bind("po", po)
        stem = self.final_path_ + self.snapshotid + suffix
        g.serialize(stem + ".ttl", "turtle")
        c("ttl")
        g.serialize(stem + ".rdf", "xml")
        c(done_message)
        filesizerdf = os.path.getsize(stem + ".rdf") / (10**6)
        filesizettl = os.path.getsize(stem + ".ttl") / (10**6)
        return filesizerdf, filesizettl, len(g)
Ejemplo n.º 56
0
    def makeMetadata(self):
        """Fill the meta graph with the description of the IRC snapshot.

        In order, the method:

        * copies into the meta graph every triple the social graph holds
          about the snapshot and about each of its ``po.rawFile`` objects;
        * computes total, mean and standard deviation of the per-message
          character, token and sentence counts, plus participant, message
          and triple counts, and stores them on ``self``;
        * adds the counts, the participant attribute names and the log file
          names (local and prefixed with ``self.online_prefix``);
        * sets ``self.mrdf``, ``self.mttl`` and the multi-line ``self.desc``;
        * adds the provenance triples, with ``self.desc`` attached through
          ``NS.rdfs.comment``.
        """
        snapshot = self.snapshoturi
        meta = self.meta_graph

        # Triples already known about the snapshot and its raw files.
        inherited = P.get(snapshot, None, None, self.social_graph)
        rawfiles = P.get(snapshot, po.rawFile, None, self.social_graph,
                         strict=True, minimized=True)
        for rawfile in rawfiles:
            inherited += P.get(rawfile, None, None, self.social_graph)
        P.add(inherited, context=meta)

        # Overall text statistics and snapshot-wide counts.
        chars = self.nchars_all
        self.totalchars, self.mcharsmessages, self.dcharsmessages = (
            sum(chars), n.mean(chars), n.std(chars))
        tokens = self.ntokens_all
        self.totaltokens, self.mtokensmessages, self.dtokensmessages = (
            sum(tokens), n.mean(tokens), n.std(tokens))
        sentences = self.nsentences_all
        (self.totalsentences, self.msentencesmessages,
         self.dsentencesmessages) = (
            sum(sentences), n.mean(sentences), n.std(sentences))
        self.nparticipants = len(self.NICKS)
        self.nmessages = len(self.messageids)
        self.ntriples = len(P.context(self.irc_graph))

        counters = (
            (po.nParticipants, self.nparticipants),
            (po.nMessages, self.nmessages),
            (po.nDirectMessages, self.ndirect),
            (po.nUserMentions, self.nmention),
            (po.nCharsOverall, self.totalchars),
            (po.mCharsOverall, self.mcharsmessages),
            (po.dCharsOverall, self.dcharsmessages),
            (po.nTokensOverall, self.totaltokens),
            (po.mTokensOverall, self.mtokensmessages),
            (po.dTokensOverall, self.dtokensmessages),
            (po.nSentencesOverall, self.totalsentences),
            (po.mSentencesOverall, self.msentencesmessages),
            (po.dSentencesOverall, self.dsentencesmessages),
        )
        P.add([(snapshot, prop, value) for prop, value in counters],
              context=meta)

        # One triple per participant attribute and per log file.
        attribute_props = ([po.ircParticipantAttribute] *
                           len(self.participantvars))
        P.rdf.triplesScaffolding(snapshot, attribute_props,
                                 self.participantvars, context=meta)
        n_xml = len(self.log_xml)
        n_ttl = len(self.log_ttl)
        P.rdf.triplesScaffolding(
            snapshot,
            [po.logXMLFilename] * n_xml + [po.logTTLFilename] * n_ttl,
            self.log_xml + self.log_ttl,
            context=meta)
        online_files = [self.online_prefix + name
                        for name in self.log_xml + self.log_ttl]
        P.rdf.triplesScaffolding(
            snapshot,
            [po.onlineLogXMLFile] * n_xml + [po.onlineLogTTLFile] * n_ttl,
            online_files,
            context=meta)

        self.mrdf = self.snapshotid + "Meta.rdf"
        self.mttl = self.snapshotid + "Meta.ttl"

        # Human-readable summary, assembled piece by piece and joined once.
        desc_parts = [
            "irc dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
                self.snapshotid, self.snapshoturi, self.isego, self.isgroup),
            "\nisFriendship: {}; ".format(self.isfriendship),
            "isInteraction: {}.".format(self.isinteraction),
            "\nnParticipants: {}; nInteractions: {} (directed messages+user mentions).".format(
                self.nparticipants, self.ndirect + self.nmention),
            "\nisPost: {} (alias hasText: {})".format(
                self.hastext, self.hastext),
            "\nnMessages: {}; ".format(self.nmessages),
            "nDirectedMessages: {}; nUserMentions: {};".format(
                self.ndirect, self.nmention),
            "\nnCharsOverall: {}; mCharsOverall: {}; dCharsOverall: {}.".format(
                self.totalchars, self.mcharsmessages, self.dcharsmessages),
            "\nnTokensOverall: {}; mTokensOverall: {}; dTokensOverall: {};".format(
                self.totaltokens, self.mtokensmessages, self.dtokensmessages),
            "\nnSentencesOverall: {}; mSentencesOverall: {}; dSentencesOverall: {};".format(
                self.totalsentences, self.msentencesmessages,
                self.dsentencesmessages),
            "\nnURLs: {}; nAAMessages {}.".format(
                self.nurls, self.naamessages),
        ]
        self.desc = "".join(desc_parts)

        provenance = (
            (po.triplifiedIn, datetime.datetime.now()),
            (po.triplifiedBy, "scripts/"),
            (po.donatedBy, self.snapshotid[:-4]),
            (po.availableAt, self.online_prefix),
            (po.onlineMetaXMLFile, self.online_prefix + self.mrdf),
            (po.onlineMetaTTLFile, self.online_prefix + self.mttl),
            (po.metaXMLFileName, self.mrdf),
            (po.metaTTLFileName, self.mttl),
            (po.totalXMLFileSizeMB, sum(self.size_xml)),
            (po.totalTTLFileSizeMB, sum(self.size_ttl)),
            (po.acquiredThrough, "channel text log"),
            (po.socialProtocolTag, "IRC"),
            (po.socialProtocol,
             P.rdf.ic(po.Platform, "IRC", meta, snapshot)),
            (po.nTriples, self.ntriples),
            (NS.rdfs.comment, self.desc),
        )
        P.add([(snapshot, prop, value) for prop, value in provenance], meta)
Ejemplo n.º 57
0
    def rdfMbox(self):
        for filecount, file_ in enumerate(self.files):
            if filecount % 100 == 0:
                c(self.snapshoturi, filecount)
            mbox = mailbox.mbox(self.data_path + self.directory + "/" + file_)
            if not mbox.keys():
                self.nempty += 1
                mbox.close()
                #                c("||||||||||| EMPTY MESSAGE |||||||||||||||||||||",self.snapshotid,file_,"(",filecount,")")
                continue
            if not mbox[0]["Message-Id"]:
                raise ValueError(
                    "What to do with nonempy messages without id?")
            message = mbox[0]
            gmaneid = self.makeId(message["Message-Id"])
            #c("gmaneid",gmaneid)
            if not gmaneid:
                raise ValueError("Message without id")
            messageuri = P.rdf.ic(po.EmailMessage, gmaneid,
                                  self.translation_graph, self.snapshoturi)
            self.nmessages += 1
            triples = [
                (messageuri, po.gmaneID, gmaneid),
            ]
            email, name = self.parseParticipant(message["From"])
            if not email:
                raise ValueError("message without author")
            participanturi = P.rdf.ic(po.GmaneParticipant, email,
                                      self.translation_graph, self.snapshoturi)
            if not P.get(participanturi, po.emailAddress, None,
                         self.translation_graph):
                self.nparticipants += 1
                if self.nparticipants == 100:
                    pass
            triples += [
                (messageuri, po.author, participanturi),
                (participanturi, po.emailAddress, email),
            ]
            if name:
                triples += [
                    (participanturi, po.name, name),
                ]
            subject = message["Subject"]
            if subject:
                subject = decodeHeader(subject)
                assert isinstance(subject, str)
                triples += [
                    (messageuri, po.subject, subject),
                ]
            replyid_ = message["In-Reply-To"]
            saneid = self.makeId(replyid_)
            if bool(replyid_) and not bool(saneid):
                self.nreplies += 1
                replyid = self.snapshotid + "-" + str(self.nlost_messages)
                self.nlost_messages += 1
                replymessageuri = P.rdf.ic(po.LostEmailMessage, replyid,
                                           self.translation_graph,
                                           self.snapshoturi)
                triples += [
                    (replymessageuri, a, po.EmailMessage),
                    (replymessageuri, NS.rdfs.comment,
                     "This message registered as having a reply, but the field might be ill-formed: "
                     + replyid_),
                    (messageuri, po.replyTo, replymessageuri),
                ]
            elif saneid:
                self.nreplies += 1
                replymessageuri = P.rdf.ic(po.EmailMessage, saneid,
                                           self.translation_graph,
                                           self.snapshoturi)
                triples += [
                    (replymessageuri, po.gmaneID, saneid),
                    (messageuri, po.replyTo, replymessageuri),
                ]
            if isinstance(message["Date"], str):
                datetime = parseDate(message["Date"])
            elif isinstance(message["Date"], mailbox.email.header.Header):
                datetimestring = decodeHeader(message["Date"])
                if False in [i in string.printable for i in datetimestring]:
                    datetime = None
                    triples += [
                        (messageuri, po.lostCreatedAt, True),
                    ]
                else:
                    datetime_ = re.findall(r"(.*\d\d:\d\d:\d\d).*",
                                           datetimestring)[0]
                    datetime = parseDate(datetime_)
            else:
                raise ValueError("datetime not understood")
            if datetime:
                self.dates += [datetime]
                triples += [
                    (messageuri, po.createdAt, datetime),
                ]
            if message["References"]:
                references = message["References"].replace("\n", "").replace(
                    "\t", "").replace(" ", "")
                if not re.findall(r"\A<(.*?)>\Z", references):
                    c("::: ::: ::: references field not understood",
                      message["References"])
                    triples += [
                        (messageuri, po.comment,
                         "the references are not understood (<.*> ids are added anyway): "
                         + message["References"]),
                        (messageuri, po.referencesLost, True),
                    ]
                for reference in re.findall(r"<(.*?)>", references):
                    self.nreferences += 1
                    referenceuri = P.rdf.ic(po.EmailMessage, reference,
                                            self.translation_graph,
                                            self.snapshoturi)
                    triples += [
                        (referenceuri, po.gmaneID, reference),
                        (messageuri, po.hasReference, referenceuri),
                    ]
                for part in message["References"].replace("\n", "").replace(
                        "\t", "").split():
                    if validate_email(part):
                        self.nreferences += 1
                        referenceuri = P.rdf.ic(po.EmailMessage, part,
                                                self.translation_graph,
                                                self.snapshoturi)
                        triples += [
                            (referenceuri, po.gmaneID, reference),
                            (messageuri, po.hasReference, referenceuri),
                        ]
            text = getText(message)
            if text:
                nchars = len(text)
                ntokens = len(k.wordpunct_tokenize(text))
                nsentences = len(k.sent_tokenize(text))
                triples += [
                    (messageuri, po.messageText, text),
                    (messageuri, po.nChars, nchars),
                    (messageuri, po.nTokens, ntokens),
                    (messageuri, po.nSentences, nsentences),
                ]
                self.nchars_all += [nchars]
                self.ntokens_all += [ntokens]
                self.nsentences_all += [nsentences]

                clean_text = cleanEmailBody(text)
                self.nremoved_lines += text.count("\n") - clean_text.count(
                    "\n")
                self.nlines += text.count("\n")
                nchars_clean = len(clean_text)
                ntokens_clean = len(k.wordpunct_tokenize(clean_text))
                nsentences_clean = len(k.sent_tokenize(clean_text))
                triples += [
                    (messageuri, po.messageTextClean, clean_text),
                    (messageuri, po.nCharsClean, nchars_clean),
                    (messageuri, po.nTokensClean, ntokens_clean),
                    (messageuri, po.nSentencesClean, nsentences_clean),
                ]
                self.nchars_clean_all += [nchars_clean]
                self.ntokens_clean_all += [ntokens_clean]
                self.nsentences_clean_all += [nsentences_clean]

                for url in re.findall(
                        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                        clean_text):
                    self.nurls += 1
                    triples += [
                        (messageuri, po.hasUrl, url),
                    ]

            content_type = message.get_content_type()
            if content_type:
                triples += [(messageuri, po.contentType, content_type)]
            else:
                raise ValueError("/\/\/\/\/\ message without content type")
            organization = message["Organization"]
            if organization:
                if not isinstance(organization, str):
                    organization = "".join(i for i in str(organization)
                                           if i in string.printable)
                triples += [
                    (messageuri, po.organization, organization),
                ]
            if message["cc"]:
                cc, unparsed = parseAddresses(message["cc"])
                if unparsed:
                    triples += [
                        (messageuri, po.unparsedCC, unparsed),
                    ]
                for peeraddress, peername in cc:
                    peeraddress = peeraddress.strip()
                    assert bool(peeraddress)
                    peeruri = P.rdf.ic(po.EmailPeer, peeraddress,
                                       self.translation_graph,
                                       self.snapshoturi)
                    triples += [
                        (messageuri, po.cc, peeruri),
                        (peeruri, po.emailAddress, peeraddress),
                    ]
                    self.ncc += 1
                    if peername:
                        triples += [
                            (peeruri, po.name, peername.strip()),
                        ]
            if message["to"]:
                to, unparsed = parseAddresses(message["to"])
                if unparsed:
                    triples += [
                        (messageuri, po.unparsedTo, unparsed),
                    ]
                for peeraddress, peername in to:
                    peeraddress = peeraddress.strip()
                    assert bool(peeraddress)
                    peeruri = P.rdf.ic(po.EmailPeer, peeraddress,
                                       self.translation_graph,
                                       self.snapshoturi)
                    triples += [
                        (messageuri, po.to, peeruri),
                        (peeruri, po.emailAddress, peeraddress),
                    ]
                    self.nto += 1
                    if peername:
                        triples += [
                            (peeruri, po.name, peername.strip()),
                        ]
            listid = message["list-id"]
            if listid:
                assert isinstance(listid, str)
                listid = listid.replace("\n", "").replace("\t", "")
                if listid.count("<") == listid.count(">") == listid.count(
                        " ") == 0:
                    listname = ""
                    listid_ = listid
                elif listid.count("<") == listid.count(">") == 0:
                    parts = listid.split()
                    lens = [len(i) for i in parts]
                    listid_ = [i for i in parts if len(i) == max(lens)][0]
                    listname = " ".join(i for i in parts
                                        if len(i) != max(lens))
                elif listid.count("<") == listid.count(">") == 1:
                    listname, listid_ = re.findall(r"(.*) {0,1}<(.*)>",
                                                   listid)[0]
                else:
                    raise ValueError("Unexpected listid string format")
                listuri = P.rdf.ic(po.EmailList, listid_,
                                   self.translation_graph, self.snapshoturi)
                triples += [
                    (messageuri, po.emailList, listuri),
                    (listuri, po.listID, listid_),
                ]
                if listname:
                    triples += [
                        (listuri, po.name, listname.strip()),
                    ]
            P.add(triples, self.translation_graph)
            mbox.close()
Ejemplo n.º 58
0
    def makeMetadata(self):
        """Populate the meta graph with the description of this snapshot.

        Steps, in order:
          1. copy every triple about the snapshot and its raw files from
             the social graph into the meta graph;
          2. compute sum / mean / standard deviation of the character and
             token counts and store them on ``self``;
          3. add the count and statistics triples;
          4. register participant attributes and generated file names via
             ``P.rdf.triplesScaffolding``;
          5. build ``self.desc`` and add the provenance triples.
        """
        snapshot = self.snapshoturi
        meta = self.meta_graph

        # 1. triples already known about the snapshot and its raw files
        copied = P.get(snapshot, None, None, self.social_graph)
        rawfiles = P.get(snapshot, po.rawFile, None, self.social_graph,
                         strict=True, minimized=True)
        for rawfile in rawfiles:
            copied += P.get(rawfile, None, None, self.social_graph)

        # 2. overall text statistics
        char_counts = self.nchars_all
        self.totalchars = sum(char_counts)
        self.mcharstweets = n.mean(char_counts)
        self.dcharstweets = n.std(char_counts)
        token_counts = self.ntokens_all
        self.totaltokens = sum(token_counts)
        self.mtokenstweets = n.mean(token_counts)
        self.dtokenstweets = n.std(token_counts)
        P.add(copied, context=meta)

        # 3. counts and statistics, one triple per (predicate, value) pair
        measures = (
            (po.nParticipants, self.nparticipants),
            (po.nTweets, self.ntweets),
            (po.nReplies, self.nreplies),
            (po.nRetweets, self.nretweets),
            (po.nCharsOverall, self.totalchars),
            (po.mCharsOverall, self.mcharstweets),
            (po.dCharsOverall, self.dcharstweets),
            (po.nTokensOverall, self.totaltokens),
            (po.mTokensOverall, self.mtokenstweets),
            (po.dTokensOverall, self.dtokenstweets),
        )
        P.add([(snapshot, predicate, value) for predicate, value in measures],
              context=meta)

        # 4. participant attributes, local file names and online locations
        n_rdf = len(self.tweet_rdf)
        n_ttl = len(self.tweet_ttl)
        P.rdf.triplesScaffolding(
            snapshot,
            [po.tweetParticipantAttribute] * len(self.participantvars),
            self.participantvars,
            context=meta)
        P.rdf.triplesScaffolding(
            snapshot,
            [po.tweetXMLFilename] * n_rdf + [po.tweetTTLFilename] * n_ttl,
            self.tweet_rdf + self.tweet_ttl,
            context=meta)
        P.rdf.triplesScaffolding(
            snapshot,
            [po.onlineTweetXMLFile] * n_rdf + [po.onlineTweetTTLFile] * n_ttl,
            [self.online_prefix + filename
             for filename in self.tweet_rdf + self.tweet_ttl],
            context=meta)

        # 5. human-readable description and provenance
        self.mrdf = self.snapshotid + "Meta.rdf"
        self.mttl = self.snapshotid + "Meta.ttl"
        n_interactions = self.nreplies + self.nretweets + self.nuser_mentions
        pieces = [
            "twitter dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
                self.snapshotid, snapshot, self.isego, self.isgroup),
            "\nisFriendship: {}; ".format(self.isfriendship),
            "isInteraction: {}.".format(self.isinteraction),
            "\nnParticipants: {}; nInteractions: {} (replies+retweets+user mentions).".format(
                self.nparticipants, n_interactions),
            "\nisPost: {} (alias hasText: {})".format(
                self.hastext, self.hastext),
            "\nnTweets: {}; ".format(self.ntweets),
            "nReplies: {}; nRetweets: {}; nUserMentions: {}.".format(
                self.nreplies, self.nretweets, self.nuser_mentions),
            "\nnTokens: {}; mTokens: {}; dTokens: {};".format(
                self.totaltokens, self.mtokenstweets, self.dtokenstweets),
            "\nnChars: {}; mChars: {}; dChars: {}.".format(
                self.totalchars, self.mcharstweets, self.dcharstweets),
            "\nnHashtags: {}; nMedia: {}; nLinks: {}.".format(
                self.nhashtags, self.nmedia, self.nlinks),
        ]
        self.desc = "".join(pieces)

        provenance = (
            (po.triplifiedIn, datetime.datetime.now()),
            (po.triplifiedBy, "scripts/"),
            # donor id is the snapshot id without its last four characters
            (po.donatedBy, self.snapshotid[:-4]),
            (po.availableAt, self.online_prefix),
            (po.onlineMetaXMLFile, self.online_prefix + self.mrdf),
            (po.onlineMetaTTLFile, self.online_prefix + self.mttl),
            (po.metaXMLFileName, self.mrdf),
            (po.metaTTLFileName, self.mttl),
            (po.totalXMLFileSizeMB, sum(self.size_rdf)),
            (po.totalTTLFileSizeMB, sum(self.size_ttl)),
            (po.acquiredThrough, "Twitter APIs"),
            (po.socialProtocolTag, "Twitter"),
            (po.socialProtocol,
             P.rdf.ic(po.Platform, "Twitter", meta, snapshot)),
            (po.nTriples, self.ntriples),
            (NS.rdfs.comment, self.desc),
        )
        P.add([(snapshot, predicate, value)
               for predicate, value in provenance], meta)
Ejemplo n.º 59
0
    def makeMetadata(self):
        """Compute overall statistics and write the snapshot metadata triples.

        Stores on ``self`` the sum, mean and standard deviation of the
        accumulated character, token and sentence counts (raw and "clean"
        variants), then adds to ``self.meta_graph``:

          * the counts and statistics;
          * participant / message attribute names and the generated file
            names, through ``P.rdf.triplesScaffolding``;
          * provenance triples, including the human-readable description
            kept in ``self.desc``.

        Also sets ``self.mrdf``, ``self.mttl`` and ``self.ntriples``.
        """
        self.totalchars = sum(self.nchars_all)
        self.mchars_messages = n.mean(self.nchars_all)
        self.dchars_messages = n.std(self.nchars_all)
        self.totaltokens = sum(self.ntokens_all)
        self.mtokens_messages = n.mean(self.ntokens_all)
        self.dtokens_messages = n.std(self.ntokens_all)
        self.totalsentences = sum(self.nsentences_all)
        self.msentences_messages = n.mean(self.nsentences_all)
        self.dsentences_messages = n.std(self.nsentences_all)

        self.totalchars_clean = sum(self.nchars_clean_all)
        self.mchars_messages_clean = n.mean(self.nchars_clean_all)
        self.dchars_messages_clean = n.std(self.nchars_clean_all)
        self.totaltokens_clean = sum(self.ntokens_clean_all)
        self.mtokens_messages_clean = n.mean(self.ntokens_clean_all)
        self.dtokens_messages_clean = n.std(self.ntokens_clean_all)
        self.totalsentences_clean = sum(self.nsentences_clean_all)
        self.msentences_messages_clean = n.mean(self.nsentences_clean_all)
        self.dsentences_messages_clean = n.std(self.nsentences_clean_all)
        # Fraction of lines removed; when no line was processed at all
        # report 0.0 instead of failing with ZeroDivisionError.
        if self.nlines:
            fremoved_lines = self.nremoved_lines / self.nlines
        else:
            fremoved_lines = 0.0

        triples = [
            (self.snapshoturi, po.nParticipants, self.nparticipants),
            (self.snapshoturi, po.nMessages, self.nmessages),
            (self.snapshoturi, po.nEmptyMessages, self.nempty),
            (self.snapshoturi, po.nReplies, self.nreplies),
            (self.snapshoturi, po.nCC, self.ncc),
            (self.snapshoturi, po.nTo, self.nto),
            (self.snapshoturi, po.nReferences, self.nreferences),
            (self.snapshoturi, po.nUrls, self.nurls),
            (self.snapshoturi, po.nCharsOverall, self.totalchars),
            (self.snapshoturi, po.mCharsOverall, self.mchars_messages),
            (self.snapshoturi, po.dCharsOverall, self.dchars_messages),
            (self.snapshoturi, po.nTokensOverall, self.totaltokens),
            (self.snapshoturi, po.mTokensOverall, self.mtokens_messages),
            (self.snapshoturi, po.dTokensOverall, self.dtokens_messages),
            (self.snapshoturi, po.nSentencesOverall, self.totalsentences),
            (self.snapshoturi, po.mSentencesOverall, self.msentences_messages),
            (self.snapshoturi, po.dSentencesOverall, self.dsentences_messages),
            (self.snapshoturi, po.nCharsOverallClean, self.totalchars_clean),
            (self.snapshoturi, po.mCharsOverallClean,
             self.mchars_messages_clean),
            (self.snapshoturi, po.dCharsOverallClean,
             self.dchars_messages_clean),
            (self.snapshoturi, po.nTokensOverallClean, self.totaltokens_clean),
            (self.snapshoturi, po.mTokensOverallClean,
             self.mtokens_messages_clean),
            (self.snapshoturi, po.dTokensOverallClean,
             self.dtokens_messages_clean),
            (self.snapshoturi, po.nSentencesOverallClean,
             self.totalsentences_clean),
            (self.snapshoturi, po.mSentencesOverallClean,
             self.msentences_messages_clean),
            (self.snapshoturi, po.dSentencesOverallClean,
             self.dsentences_messages_clean),
            (self.snapshoturi, po.fRemovedLines, fremoved_lines),
        ]
        P.add(triples, context=self.meta_graph)

        # One triple per participant / message attribute name.
        P.rdf.triplesScaffolding(self.snapshoturi,
                                 [po.gmaneParticipantAttribute] *
                                 len(self.participantvars),
                                 self.participantvars,
                                 context=self.meta_graph)
        P.rdf.triplesScaffolding(self.snapshoturi, [po.gmaneMessageAttribute] *
                                 len(self.messagevars),
                                 self.messagevars,
                                 context=self.meta_graph)
        # Generated data files: local names first, then their online
        # locations (online_prefix + file name).
        P.rdf.triplesScaffolding(self.snapshoturi,
                                 [po.emailXMLFilename] * len(self.email_xml) +
                                 [po.emailTTLFilename] * len(self.email_ttl),
                                 self.email_xml + self.email_ttl,
                                 context=self.meta_graph)
        P.rdf.triplesScaffolding(
            self.snapshoturi, [po.onlineEmailXMLFile] * len(self.email_xml) +
            [po.onlineEmailTTLFile] * len(self.email_ttl),
            [self.online_prefix + i for i in self.email_xml + self.email_ttl],
            context=self.meta_graph)

        self.mrdf = self.snapshotid + "Meta.rdf"
        self.mttl = self.snapshotid + "Meta.ttl"

        # Human-readable summary, stored below as the rdfs:comment.
        self.desc = "gmane public email list dataset with snapshotID: {}\nsnapshotURI: {} \nisEgo: {}. isGroup: {}.".format(
            self.snapshotid,
            self.snapshoturi,
            self.isego,
            self.isgroup,
        )
        self.desc += "\nisFriendship: {}; ".format(self.isfriendship)
        self.desc += "isInteraction: {}.".format(self.isinteraction)
        self.desc += "\nnParticipants: {}; nInteractions: {} (replies+references+cc+to).".format(
            self.nparticipants,
            self.nreplies + self.nreferences + self.ncc + self.nto)
        self.desc += "\nisPost: {} (alias hasText: {})".format(
            self.hastext, self.hastext)
        self.desc += "\nnMessages: {} (+ empty: {}); ".format(
            self.nmessages, self.nempty)
        # Argument order follows the labels: nTo first, then nCC.
        self.desc += "nReplies: {}; nReferences: {}; nTo {}; nCC: {}.".format(
            self.nreplies, self.nreferences, self.nto, self.ncc)
        self.desc += "\nnChars: {}; mChars: {}; dChars: {}.".format(
            self.totalchars, self.mchars_messages, self.dchars_messages)
        self.desc += "\nnTokens: {}; mTokens: {}; dTokens: {};".format(
            self.totaltokens, self.mtokens_messages, self.dtokens_messages)
        self.desc += "\nnSentences: {}; mSentences: {}; dSentences: {}.".format(
            self.totalsentences, self.msentences_messages,
            self.dsentences_messages)
        self.desc += "\nnCharsClean: {}; mCharsClean: {}; dCharsClean: {}.".format(
            self.totalchars_clean, self.mchars_messages_clean,
            self.dchars_messages_clean)
        self.desc += "\nnTokensClean: {}; mTokensClean: {}; dTokensClean: {};".format(
            self.totaltokens_clean, self.mtokens_messages_clean,
            self.dtokens_messages_clean)
        self.desc += "\nnSentencesClean: {}; mSentencesClean: {}; dSentencesClean: {}.".format(
            self.totalsentences_clean, self.msentences_messages_clean,
            self.dsentences_messages_clean)
        self.desc += "\nnUrls: {};  fRemovedLines {};.".format(
            self.nurls, fremoved_lines)

        # Size of the translation graph at this point.
        self.ntriples = len(P.context(self.translation_graph))
        triples = [
            (self.snapshoturi, po.triplifiedIn, datetime.datetime.now()),
            (self.snapshoturi, po.triplifiedBy, "scripts/"),
            (self.snapshoturi, po.donatedBy, self.snapshotid),
            (self.snapshoturi, po.availableAt, self.online_prefix),
            (self.snapshoturi, po.onlineMetaXMLFile,
             self.online_prefix + self.mrdf),
            (self.snapshoturi, po.onlineMetaTTLFile,
             self.online_prefix + self.mttl),
            (self.snapshoturi, po.metaXMLFileName, self.mrdf),
            (self.snapshoturi, po.metaTTLFileName, self.mttl),
            (self.snapshoturi, po.totalXMLFileSizeMB, sum(self.size_xml)),
            (self.snapshoturi, po.totalTTLFileSizeMB, sum(self.size_ttl)),
            (self.snapshoturi, po.acquiredThrough,
             "Gmane public mailing list archive RSS feed"),
            (self.snapshoturi, po.socialProtocolTag, "Gmane"),
            (self.snapshoturi, po.socialProtocol,
             P.rdf.ic(po.Platform, "Gmane", self.meta_graph,
                      self.snapshoturi)),
            (self.snapshoturi, po.nTriples, self.ntriples),
            (self.snapshoturi, NS.rdfs.comment, self.desc),
            (self.snapshoturi, po.gmaneID, self.directory),
        ]
        P.add(triples, context=self.meta_graph)