def parseParticipant(self, fromstring):
    fromstring = decodeHeader(fromstring)
    # fromstring = "".join(i for i in str(fromstring)
    #                      if i in string.printable)
    fromstring = fromstring.replace("\n", "").replace("\t", "")
    if ">" in fromstring and "<" not in fromstring:
        fromstring = re.sub(r"(.*[ ^]*)(.*>)", r"\1<\2", fromstring)
        c("-|-|-|-| corrected fromstring:", fromstring)
    elif "<" in fromstring and ">" not in fromstring:
        fromstring = re.sub(r"(<.*)([ $]*.*)", r"\1>\2", fromstring)
        c("-|-|-|-| corrected fromstring:", fromstring)
    if fromstring.count(">") == fromstring.count("<") > 0:
        name, email = re.findall(r"(.*) {0,1}<(.*)>", fromstring)[0]
    elif "(" in fromstring:
        email, name = re.findall(r"(.*) {0,1}\((.*)\)", fromstring)[0]
    elif " " in fromstring:
        raise ValueError("new author field pattern")
    else:
        email = fromstring
        name = ""
    email = email.replace("..", ".")
    try:
        assert validate_email(email)
    except AssertionError:
        if "cardecovil.co.kr" in email:
            email = "*****@*****.**"
            name = ""
        elif re.findall(r"(.*):(.*)", email):
            name, email = re.findall(r"(.*):(.*)", email)[0]
        else:
            raise ValueError("bad email")
        assert validate_email(email)
    return email, name.strip().replace("'", "").replace('"', '')
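
# A minimal, runnable sketch of the "Name <email>" split that
# parseParticipant relies on, using only the stdlib re module (the
# package's decodeHeader/validate_email helpers are not assumed here).
import re

fromstring = "Alice Example <alice@example.org>"
if fromstring.count(">") == fromstring.count("<") > 0:
    name, email = re.findall(r"(.*) {0,1}<(.*)>", fromstring)[0]
else:
    name, email = "", fromstring
print(email, "|", name.strip())  # alice@example.org | Alice Example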
def addArticleBody(self, body, articleuri):
    triples = []
    if re.findall(r"<(.*)>(.*)<(.*)>", body, re.S):
        try:
            P.add((articleuri, po.htmlBodyText, body),
                  context=self.translation_graph)
        except QueryBadFormed:
            c("QUOTING HTML BODY")
            P.add((articleuri, po.quotedHtmlBodyText,
                   urllib.parse.quote(body)),
                  context=self.translation_graph)
        cleanbody = BeautifulSoup(body, 'html.parser').get_text()
        if cleanbody:
            try:
                P.add((articleuri, po.cleanBodyText, cleanbody),
                      context=self.translation_graph)
            except QueryBadFormed:
                c("QUOTING HTML CLEAN BODY")
                P.add((articleuri, po.quotedCleanBodyText,
                       urllib.parse.quote(cleanbody)),
                      context=self.translation_graph)
    else:
        triples += [
            (articleuri, po.cleanBodyText, body),
        ]
    P.add(triples, context=self.translation_graph)
    self.bodies += [body]
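
# Standalone sketch of the HTML-vs-plain branching used by addArticleBody:
# a tag-pair regex detects markup and BeautifulSoup extracts the clean
# text (assumes only re and bs4, not the P/po triple store).
import re
from bs4 import BeautifulSoup

body = "<p>Hello <b>world</b></p>"
if re.findall(r"<(.*)>(.*)<(.*)>", body, re.S):
    print("html; clean text:", BeautifulSoup(body, 'html.parser').get_text())
else:
    print("plain text:", body)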
def translateObservatoryTags(self):
    triples = []
    for ot in self.data["observatorios_tem_tags"]:
        oid = ot[0]
        tid = ot[1]
        triples.append((po.Observatory+'#'+self.snapshotid+'-'+str(oid),
                        po.hasTag,
                        po.Tag+'#'+self.snapshotid+'-'+str(tid)))
    P.add(triples, self.translation_graph)
    c("finished add of observatory tag entries")
def parseLegacyFiles(profiles=True, articles=True, comments=True):
    """Parse legacy postgresql data from participabr"""
    # access mysql, access mongo, access irc log from social/
    c("starting participabr access")
    con = psycopg2.connect(database=participabr.postgre_database,
                           user=participabr.postgre_user)
    cur = con.cursor()
    # table data
    return ParticipabrPublishing(cur, profiles, articles, comments)
def __init__(self, n_elements=4, method="dimino"):
    c("started permutations with", n_elements, "elements")
    self.n_elements = n_elements
    self.method = method
    self.getRotations()
    self.getMirrors()
    self.getAlternating()
    self.getFullSymmetry()
    self.getSwaps()
    c("finished permutations with", n_elements, "elements")
def __init__(self, snapshoturi, snapshotid, directory="somedir/", data_path="../data/", final_path="./gmane_snapshots/", umbrella_dir="gmane_snapshotsX/"): c(snapshoturi, snapshotid, directory) isego = False isgroup = True isfriendship = False isinteraction = True hastext = True interactions_anonymized = False translation_graph = "translation" meta_graph = "translation_meta" gmane_graph = "gmane" P.context(translation_graph, "remove") P.context(meta_graph, "remove") final_path_ = "{}{}/".format(final_path, snapshotid) online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(umbrella_dir, snapshotid) ncc = nto = nlines = nremoved_lines = nurls = nlost_messages = nparticipants = nreferences = totalchars = nurls = nreplies = nmessages = nempty = 0 dates = [] nchars_all = [] ntokens_all = [] nsentences_all = [] participantvars = ["emailAddress", "name"] messagevars = ["author", "createdAt", "replyTo", "messageText", "cleanMessageText", "nCharsClean", "nTokensClean", "nSentencesClean", "hasUrl", "nChars", "nTokens", "nSentences", "emptyMessage", "gmaneID", "subject", "cc", "to", "hasReference", "contentType", "organization", "unparsedCC", "unparsedTo", "emailList"] messagevars.sort() files = os.listdir(data_path+directory) if not files: self.comment = "no files on the snapshot id" return files.sort() nchars_all = [] ntokens_all = [] nsentences_all = [] nchars_clean_all = [] ntokens_clean_all = [] nsentences_clean_all = [] locals_ = locals().copy() del locals_["self"] for i in locals_: exec("self.{}={}".format(i, i)) self.rdfMbox() if len(self.files) > self.nempty: if not os.path.isdir(final_path_): os.mkdir(final_path_) self.email_xml, self.size_xml, self.email_ttl, self.size_ttl = P.rdf.writeByChunks( self.final_path_+self.snapshotid+"Email", context=self.translation_graph, ntriples=100000) self.makeMetadata() self.writeAllGmane()
def writeAllIRC(self):
    # g = P.context(self.meta_graph)
    # ntriples = len(g)
    # triples = [
    #     (self.snapshoturi, po.nMetaTriples, ntriples+1),
    # ]
    # P.add(triples, context=self.meta_graph)
    g = P.context(self.meta_graph)
    g.namespace_manager.bind("po", po)
    g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
    c("serialized meta")
def writeAllGmane(self):
    g = P.context(self.meta_graph)
    g.namespace_manager.bind("po", po)
    # ntriples = len(g)
    # triples = [
    #     (self.snapshoturi, po.nMetaTriples, ntriples),
    # ]
    # P.add(triples, context=self.meta_graph)
    g.serialize(self.final_path_+self.snapshotid+"Meta.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_+self.snapshotid+"Meta.rdf", "xml")
    c("serialized meta")
def writeAllTW(self):
    # write meta and readme with self.desc, then all is finished.
    g = P.context(self.meta_graph)
    # ntriples = len(g)
    # triples = [
    #     (self.snapshoturi, po.nMetaTriples, ntriples),
    # ]
    # P.add(triples, context=self.meta_graph)
    g.namespace_manager.bind("po", po)
    g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
    c("serialized meta")
def testTextIO(endpoint_url):
    client = Client(endpoint_url)
    triples = [
        (test.Dummy, test.desc, """áéíóúćçêôãõà"""),
        (test.Dummy, test.desc2,
         "Não concordo com a inclusão da palavra controle, sou a favor "
         "da manutenção do texto 'Política Nacional de Participação "
         "Social'.\n\nA inclusão desta palavra pode ser interpretada "
         "como o poder de controle de determinados atores. O uso de "
         "'Política Nacional de Participação Social'atende mais ao "
         "intuito de promover um ambiente democrático e horizontal nas "
         "relações de participação civil."),
        (test.Dummy, test.desc3,
         "Não concordo com a inclusão da palavra controle, sou a favor "
         "da manutenção do texto 'Política Nacional de Participação "
         "Social'.\n\nA inclusão desta palavra pode ser interpretada "
         "como o poder de controle de determinados atores. O uso de "
         "'Política Nacional de Participação Social'atende mais ao "
         "intuito de promover um ambiente democrático e horizontal nas "
         "relações de participação civil."),
        (test.Dummy, test.desc3, " \\o/".encode("utf8")),
        # (test.Dummy, test.desc, "t:w ex't\n\rte'st\"çóṕxx%@#*%&%)(+_ "),
    ]
    client.insertTriples(triples, "text_graph")
    c("all graphs:", client.getAllGraphs())
    c("triples in text_graph:", client.getAllTriples("text_graph"))
def writeRdf(self):
    pub_dir = './participabr_snapshot/'
    if not os.path.isdir(pub_dir):
        os.mkdir(pub_dir)
    g = P.context(self.translation_graph)
    g.serialize(pub_dir+'participabr.ttl', 'turtle')
    c('participation ttl serialized')
    g.serialize(pub_dir+'participabr.rdf', 'xml')
    c('participation xml serialized')
    # metadata: group, platform
    triples = [
        (self.snapshoturi, a, po.Snapshot),
        # (self.snapshoturi, a, po.ParticipabrSnapshot),
        (self.snapshoturi, po.snapshotID, self.snapshotid),
        (self.snapshoturi, po.isEgo, False),
        (self.snapshoturi, po.isGroup, True),
        (self.snapshoturi, po.isFriendship, True),
        (self.snapshoturi, po.isInteraction, True),
        (self.snapshoturi, po.isPost, True),
        (self.snapshoturi, po.socialProtocol, 'ParticipaBR'),
        (self.snapshoturi, po.dateObtained, datetime.date(2012, 6, 28)),
    ]
    P.add(triples, self.meta_graph)
    g = P.context(self.meta_graph)
    g.serialize(pub_dir+'participabrMeta.ttl', 'turtle')
    c('participation meta ttl serialized')
    g.serialize(pub_dir+'participabrMeta.rdf', 'xml')
    c('participation meta xml serialized')
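
# Minimal sketch of the serialize calls above with plain rdflib
# (assumption: P.context returns an rdflib Graph, as these calls imply;
# the po namespace IRI here is illustrative, not the package's).
from rdflib import Graph, Literal, Namespace

po = Namespace("http://example.org/participationontology#")
g = Graph()
g.bind("po", po)
g.add((po.snapshot1, po.snapshotID, Literal("participabr-2012")))
g.serialize("example.ttl", format="turtle")
g.serialize("example.rdf", format="xml")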
def writeAll(self):
    g = P.context(self.meta_graph)
    ntriples = len(g)
    triples = [
        (self.snapshoturi, po.nMetaTriples, ntriples),
    ]
    P.add(triples, context=self.meta_graph)
    g.namespace_manager.bind("po", po)
    g.serialize(self.final_path_+self.snapshotid+"Meta.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_+self.snapshotid+"Meta.rdf", "xml")
    c("serialized meta")
    if not os.path.isdir(self.final_path_+"scripts"):
        os.mkdir(self.final_path_+"scripts")
    shutil.copy(PACKAGEDIR+"/../tests/triplify.py",
                self.final_path_+"scripts/triplify.py")
    # copy of the base data
    self.dates = [i.isoformat() for i in self.dates]
    date1 = min(self.dates)
    date2 = max(self.dates)
    # summary strings for the README; assumed to be built beforehand as
    # in writeAllTW (empty fallbacks keep the template well-formed)
    tinteraction = ""
    tposts = ""
    with open(self.final_path_+"README", "w") as f:
        f.write("""::: Open Linked Social Data publication
\nThis repository is a RDF data expression of the IRC
snapshot {snapid} with messages from {date1} to {date2}
(total of {ntrip} triples).{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is on the script/ directory.\n:::""".format(
            snapid=self.snapshotid, date1=date1, date2=date2,
            ntrip=self.ntriples, tinteraction=tinteraction, tposts=tposts,
            mrdf=self.translation_xml, mttl=self.translation_ttl,
            ise=self.isego, isg=self.isgroup, isf=self.isfriendship,
            isi=self.isinteraction, ist=self.hastext,
            ava=self.online_prefix, desc=self.desc))
def writeTranslates(self,mode="full"): c("mode full or chunk or multigraph write:",mode) if mode=="full": g=P.context(self.translation_graph) self.translation_ttl=self.snapshotid+"Translation.ttl" self.translation_xml=self.snapshotid+"Translation.rdf" g.serialize(self.final_path_+self.translation_ttl,"turtle"); c("ttl") g.serialize(self.final_path_+self.translation_xml,"xml") self.translation_size_ttl=os.path.getsize(self.final_path_+self.translation_ttl)/10**6 self.translation_size_xml=os.path.getsize(self.final_path_+self.translation_xml)/10**6 self.ntranslation_triples=len(g) elif mode=="chunk": # writeByChunks raise NotImplementedError("Perform P.utils.writeByChunks on self.translation_graph") elif mode=="multigraph": raise NotImplementedError("Perform serialize(write) on each of the self.translation_graphs")
def writeTranslates(self,mode="full"): c("mode full or chunk or multigraph write:",mode) if mode=="full": g=P.context(self.translation_graph) self.translation_ttl=self.snapshotid+"Translation.ttl" self.translation_xml=self.snapshotid+"Translation.rdf" g.serialize(self.final_path_+self.translation_ttl,"turtle"); c("ttl") g.serialize(self.final_path_+self.translation_xml,"xml") self.size_ttl=os.path.getsize(self.final_path_+self.translation_ttl)/10**6 self.size_xml=os.path.getsize(self.final_path_+self.translation_xml)/10**6 self.ntranslation_triples=len(g) elif mode=="chunk": # writeByChunks raise NotImplementedError("Perform P.utils.writeByChunks on self.translation_graph") elif mode=="multigraph": raise NotImplementedError("Perform serialize(write) on each of the self.translation_graphs")
def publishAll(snapshoturis=None):
    """express emails as RDF for publishing"""
    if not snapshoturis:
        c("getting email snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.Snapshot, minimized=True):
            uridict[snapshoturi] = 0
            for rawFile in P.get(snapshoturi, NS.po.rawFile, strict=True,
                                 minimized=True):
                uridict[snapshoturi] += P.get(rawFile, NS.po.directorySize,
                                              minimized=True).toPython()
        snapshoturis = list(uridict.keys())  # was missing; sort needs it
        snapshoturis.sort(key=lambda x: uridict[x])
    c("on triplification")
    triplification_classes = []
    for snapshoturi in list(snapshoturis)[:10]:
        triplification_classes += [publishAny(snapshoturi)]
    # writePublishingReadme()
    return triplification_classes
def writeRdf(self):
    pub_dir = './cidadedemocratica_snapshot/'
    if not os.path.isdir(pub_dir):
        os.mkdir(pub_dir)
    # g = P.context(self.translation_graph)
    # g.serialize(pub_dir+'cidadedemocratica.ttl', 'turtle')
    # c('participation ttl serialized')
    # g.serialize(pub_dir+'cidadedemocratica.rdf', 'xml')
    # c('participation xml serialized')
    P.rdf.writeByChunks(pub_dir+'cidadedemocratica',
                        context=self.translation_graph, ntriples=100000)
    # metadata: group, platform
    g = P.context(self.meta_graph)
    g.serialize(pub_dir+'cidadedemocraticaMeta.ttl', 'turtle')
    c('participation meta ttl serialized')
    g.serialize(pub_dir+'cidadedemocraticaMeta.rdf', 'xml')
    c('participation meta xml serialized')
def parseLegacyFiles(mysqldb=True, mongoshouts=True, irclog=True,
                     oreshouts=True):
    """Parse legacy files with aa shouts and sessions"""
    # access mysql, access mongo, access irc log from social/
    c("starting aa access")
    if mysqldb:
        mysqldb = connectMysql()
        c("mysql ok")
    if mongoshouts:
        mongoshouts = connectMongo()
        c("mongo ok")
    if irclog:
        irclog = accessIrcLog()
        c("irc ok")
    if oreshouts:
        oreshouts = accessOreShouts()
        c("ore ok")
    return mysqldb, mongoshouts, irclog, oreshouts
def translateLoginHistory(self):
    triples = []
    for login in self.data["historico_de_logins"]:
        lid = login[0]
        uid = login[1]
        created = login[2]
        ip = login[3]
        uri = P.rdf.ic(po.Login, self.snapshotid+"-"+str(lid),
                       self.translation_graph, self.snapshoturi)
        triples += [
            (uri, po.participant,
             po.Participant+'#'+self.snapshotid+'-'+str(uid)),
            (uri, po.createdAt, created),
            (uri, po.ip, ip),
        ]
    P.add(triples, self.translation_graph)
    c("finished add of login entries")
def __init__(self):
    self.snapshoturi = P.rdf.ic(po.Snapshot, self.snapshotid,
                                self.meta_graph)
    c("get data")
    self.getData()
    c("start translate")
    self.translateToRdf()
    self.makeMeta()
    c("start render")
    self.writeRdf()
    c("finished render")
def translateMacrotags(self):
    triples = []
    for mt in self.data["macro_tags"]:
        mtid = mt[0]
        title = mt[1]
        created = mt[2]
        updated = mt[3]
        uri = P.rdf.ic(po.Macrotag, self.snapshotid+"-"+str(mtid),
                       self.translation_graph, self.snapshoturi)
        triples.append((uri, po.createdAt, created))
        if updated != created:
            triples += [
                (uri, po.updatedAt, updated),
            ]
        if title:
            triples.append((uri, po.title, title))
    P.add(triples, self.translation_graph)
    c("finished add of macrotag entries")
def __init__(self, postgresql_cursor, profiles=True, articles=True,
             comments=True):
    snapshoturi = P.rdf.ic(po.Snapshot, self.snapshotid, self.meta_graph)
    # P.add((snapshoturi, a, po.Snapshot), context=self.translation_graph)
    cur = postgresql_cursor
    datas2 = []
    datas = []
    bodies = []
    abstracts = []
    # promote locals to instance attributes
    locals_ = locals().copy()
    del locals_["self"]
    for i in locals_:
        exec("self.{}={}".format(i, i))
    c("get data")
    self.getData(cur)
    c("start translate")
    self.translateToRdf()
    self.writeRdf()
def writeTweets(self, chunk_count):
    if not os.path.isdir(self.final_path):
        os.mkdir(self.final_path)
    if not os.path.isdir(self.final_path_):
        os.mkdir(self.final_path_)
    filename = self.snapshotid + "Tweet{:05d}".format(chunk_count)
    g = P.context(self.tweet_graph)
    g.namespace_manager.bind("po", po)
    tttl = filename + ".ttl"
    trdf = filename + ".rdf"
    g.serialize(self.final_path_ + tttl, "turtle")
    c("ttl")
    g.serialize(self.final_path_ + trdf, "xml")
    filesizettl = os.path.getsize(self.final_path_ + tttl) / (10**6)
    filesizerdf = os.path.getsize(self.final_path_ + trdf) / (10**6)
    self.tweet_ttl += [tttl]
    self.size_ttl += [filesizettl]
    self.tweet_rdf += [trdf]
    self.size_rdf += [filesizerdf]
    # advance the graph name to the next chunk; note this strips only the
    # last character, so it assumes single-digit chunk counts
    self.tweet_graph = self.tweet_graph[:-1] + str(chunk_count + 1)
def __init__(self, snapshoturi, snapshotid, filename="foo.txt", data_path="../data/irc/", final_path="./irc_snapshots/", umbrella_dir="irc_snapshots/"): c(snapshoturi, snapshotid, filename) isego = False isgroup = True isfriendship = False isinteraction = True hastext = True interactions_anonymized = False irc_graph = "social_log" meta_graph = "social_irc_meta" social_graph = "social_irc" P.context(irc_graph, "remove") P.context(meta_graph, "remove") final_path_ = "{}{}/".format(final_path, snapshotid) online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format( umbrella_dir, snapshotid) naamessages = nurls = ndirect = nmention = 0 dates = [] nchars_all = [] ntokens_all = [] nsentences_all = [] participantvars = ["nick"] messagevars = [ "author", "createdAt", "mentions", "directedTo", "systemMessage", "text", "cleanMessageText", "nChars", "nTokens", "nSentences", "url", "emptyMessage" ] messagevars.sort() locals_ = locals().copy() del locals_["self"] for i in locals_: exec("self.{}={}".format(i, i)) self.rdfLog() self.makeMetadata() self.writeAllIRC()
def publishAll(snapshoturis=None):
    """express emails as RDF for publishing"""
    if not snapshoturis:
        c("getting email snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.GmaneSnapshot,
                                 minimized=True):
            uridict[snapshoturi] = 0
            for rawFile in P.get(snapshoturi, NS.po.rawFile, strict=True,
                                 minimized=True):
                uridict[snapshoturi] += P.get(rawFile, NS.po.directorySize,
                                              minimized=True).toPython()
        snapshoturis = list(uridict.keys())  # was missing; sort needs it
        snapshoturis.sort(key=lambda x: uridict[x])
    c("on triplification")
    triplification_classes = []
    for snapshoturi in list(snapshoturis)[:10]:
        triplification_classes += [publishAny(snapshoturi)]
    # writePublishingReadme()
    return triplification_classes
def translateObservatories(self):
    count = 0
    triples = []
    for observatorio in self.data["observatorios"]:
        oid = observatorio[0]
        uid = observatorio[1]
        created = observatorio[4]
        updated = observatorio[5]
        uri = P.rdf.ic(po.Observatory, self.snapshotid+"-"+str(oid),
                       self.translation_graph, self.snapshoturi)
        triples += [
            (uri, po.participant,
             po.Participant+'#'+self.snapshotid+'-'+str(uid)),
            (uri, po.createdAt, created),
        ]
        if updated != created:
            triples += [
                (uri, po.updatedAt, updated),
            ]
        count += 1
        if count % 60 == 0:
            c("finished observatory entries:", count,
              "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
    c("finished add of observatory entries")
def translateLinks(self):
    count = 0
    triples = []
    for link in self.data['links']:
        lid = link[0]
        nome = link[1]
        url = link[2]
        tid = link[4]
        created = link[5]
        updated = link[6]
        uri = P.rdf.ic(po.Link, self.snapshotid+"-"+str(lid),
                       self.translation_graph, self.snapshoturi)
        triples += [
            (uri, po.name, nome),
            (uri, po.url, url),
            (uri, po.topic, po.Topic+'#'+self.snapshotid+'-'+str(tid)),
            (uri, po.createdAt, created),
        ]
        if updated != created:
            triples += [
                (uri, po.updatedAt, updated),
            ]
        count += 1
        if count % 60 == 0:
            c("finished links entries:", count, "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
    c("finished add of links entries")
def translateSupporters(self):
    count = 0
    triples = []
    for adesao in self.data["adesoes"]:
        tid = adesao[0]
        uid = adesao[1]
        created = adesao[2]
        updated = adesao[3]
        aid = adesao[4]
        uri = P.rdf.ic(po.Support, self.snapshotid+"-"+str(aid),
                       self.translation_graph, self.snapshoturi)
        triples += [
            (uri, po.participant,
             po.Participant+'#'+self.snapshotid+'-'+str(uid)),
            (uri, po.topic, po.Topic+'#'+self.snapshotid+'-'+str(tid)),
            (uri, po.createdAt, created),
        ]
        if updated != created:
            triples += [
                (uri, po.updatedAt, updated),
            ]
        count += 1
        if count % 60 == 0:
            c("finished supporters entries:", count,
              "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
    c("finished add of supporters entries")
def translateStates(self):
    count = 0
    triples = []
    for estado in self.data["estados"]:
        gid = estado[0]
        nome = estado[1]
        abr = estado[2]
        created = estado[3]
        updated = estado[4]
        relevance = estado[5]
        uri = P.rdf.ic(po.State, self.snapshotid+"-"+str(gid),
                       self.translation_graph, self.snapshoturi)
        triples += [
            (uri, po.name, nome),
            (uri, po.abbreviation, abr),
            (uri, po.createdAt, created),
            (uri, po.relevance, relevance),
        ]
        if updated != created:
            triples += [
                (uri, po.updatedAt, updated),
            ]
        count += 1
        if count % 60 == 0:
            c("finished states entries:", count, "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
    c("finished add of states entries")
def translateCities(self):
    count = 0
    triples = []
    for cidade in self.data["cidades"]:
        cid = cidade[0]
        nome = cidade[1]
        eid = cidade[2]
        slug = cidade[3]
        created = cidade[4]
        updated = cidade[5]
        relevance = cidade[6]
        uri = P.rdf.ic(po.City, self.snapshotid+"-"+str(cid),
                       self.translation_graph, self.snapshoturi)
        triples += [
            (uri, po.name, nome),
            (uri, po.state, po.State+'#'+self.snapshotid+'-'+str(eid)),
            (uri, po.slug, slug),
            (uri, po.createdAt, created),
            (uri, po.relevance, relevance),
        ]
        if updated != created:
            triples += [
                (uri, po.updatedAt, updated),
            ]
        count += 1
        if count % 60 == 0:
            c("finished cities entries:", count, "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
    c("finished add of cities entries")
def translateNeighborhoods(self):
    count = 0
    triples = []
    for bairro in self.data["bairros"]:
        bid = bairro[0]
        nome = bairro[1]
        cid = bairro[2]
        created = bairro[3]
        updated = bairro[4]
        relevance = bairro[5]
        uri = P.rdf.ic(po.Neighborhood, self.snapshotid+"-"+str(bid),
                       self.translation_graph, self.snapshoturi)
        triples += [
            (uri, po.name, nome),
            (uri, po.city, po.City+'#'+self.snapshotid+'-'+str(cid)),
            (uri, po.createdAt, created),
            (uri, po.relevance, relevance),
        ]
        if updated != created:
            triples += [
                (uri, po.updatedAt, updated),
            ]
        count += 1
        if count % 60 == 0:
            c("finished neighborhood entries:", count,
              "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
    c("finished add of neighborhood entries")
def translateVotes(self):
    triples = []
    commentids = set(self.comments_table.get("id"))
    count = 0
    for id_, vote, voteable_id, voteable_type, voter_id, voter_type, \
            created_at in self.votes_table.getMany(
                ("id", "vote", "voteable_id", "voteable_type",
                 "voter_id", "voter_type", "created_at")):
        assert isinstance(id_, int)
        assert isinstance(voteable_id, int)
        assert isinstance(created_at, datetime.datetime)
        voteuri = P.rdf.ic(po.Vote, self.snapshotid+"-"+str(id_),
                           self.translation_graph, self.snapshoturi)
        if voteable_type == "Article":
            type__ = self.articletypes[voteable_id].split("::")[-1]
            # referenceuri = \
            #     eval("po."+type__)+"#"+self.snapshotid+"-"+str(voteable_id)
            referenceuri = \
                po.Article+"#"+self.snapshotid+"-"+str(voteable_id)
        elif voteable_type == "Comment":
            assert voteable_id in commentids
            referenceuri = \
                po.Comment+"#"+self.snapshotid+"-"+str(voteable_id)
        else:
            raise ValueError("unexpected voteable type")
        triples += [
            (voteuri, po.createdAt, created_at),
            (voteuri, po.vote, vote),
            (voteuri, po.reference, referenceuri),
        ]
        if voter_id:
            assert voter_type == "Profile"
            assert isinstance(voter_id, int)
            participanturi = po.Participant + '#' + \
                self.snapshotid+"-"+self.profileids[voter_id]
            triples += [
                (voteuri, po.author, participanturi),
            ]
        count += 1
        if count % 100 == 0:
            c("votes done:", count)
            c("ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            c("finished add of votes")
            triples = []
    if triples:
        c("ntriples:", len(triples))
        P.add(triples, self.translation_graph)
def publishAll(snapshoturis=None):
    """express tweets as RDF for publishing"""
    if not snapshoturis:
        c("getting twitter snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.TwitterSnapshot,
                                 minimized=True):
            uridict[snapshoturi] = 0
            for rawFile in P.get(snapshoturi, NS.po.rawFile, strict=True,
                                 minimized=True):
                uridict[snapshoturi] += P.get(rawFile, NS.po.fileSize,
                                              minimized=True).toPython()
        snapshoturis = [i for i in list(uridict.keys())
                        if i.endswith(".gml")]
        snapshoturis.sort(key=lambda x: uridict[x])
    for snapshoturi in snapshoturis:
        triplification_class = publishAny(snapshoturi)
    # writePublishingReadme()
    return triplification_class
def parseLegacyFiles(data_dir=DATADIR+"irc/"): """Parse legacy txt files with irc logs""" filenames=os.listdir(data_dir) filenames=[i for i in filenames if i!="ipython_log.py" and not i.endswith(".swp")] snapshots=set() triples=[] for filename in filenames: snapshotid="irc-legacy-"+filename.replace("#","") snapshoturi=po.TwitterSnapshot+"#"+snapshotid expressed_classes=[po.Participant,po.IRCMessage] expressed_reference=filename.replace("#","").replace(".txt","").replace(".log","") name_humanized="IRC log of channel "+expressed_reference filesize=os.path.getsize(data_dir+filename)/10**6 fileformat="txt" fileuri=po.File+"#Irc-log-"+filename.replace("#","") triples+=[ (snapshoturi,a,po.Snapshot), (snapshoturi,a,po.IRCSnapshot), (snapshoturi,po.snapshotID,snapshotid), (snapshoturi, po.isEgo, False), (snapshoturi, po.isGroup, True), (snapshoturi, po.isFriendship, False), (snapshoturi, po.isInteraction, True), (snapshoturi, po.isPost, True), (snapshoturi, po.humanizedName, name_humanized), (snapshoturi, po.expressedReference, expressed_reference), (snapshoturi, po.rawFile, fileuri), (fileuri, po.fileSize, filesize), (fileuri, po.fileName, filename), (fileuri, po.fileFormat, fileformat), ]+[ (fileuri, po.expressedClass, expressed_class) for expressed_class in expressed_classes ] snapshots.add(snapshoturi) nfiles=len(filenames) nsnapshots=len(snapshots) P.context("social_irc","remove") platformuri=P.rdf.ic(po.Platform,"IRC",context="social_irc") triples+=[ (NS.social.Session,NS.social.nIRCParsedFiles,nfiles), (NS.social.Session,NS.social.nIRCSnapshots,nsnapshots), (platformuri, po.dataDir,data_dir), ] P.add(triples,context="social_irc") c("parsed {} irc logs files ({} snapshots) are in percolation graph and 'irc_twitter' context".format(nfiles,nsnapshots)) c("percolation graph have {} triples ({} in social_irc context)".format(len(P.percolation_graph),len(P.context("social_irc")))) negos=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isEgo true } } ") ngroups=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isGroup true } } ") nfriendships=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isFriendship true } } ") ninteractions=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isInteraction true } } ") nposts=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isPost true } } ") totalsize=sum(P.query(r" SELECT ?size WHERE { GRAPH <social_irc> { ?s po:fileSize ?size } } ")) c("""{} are ego snapshots, {} are group snapshots {} have a friendship structures. {} have an interaction structures. {} have texts Total raw data size is {:.2f}MB""".format(negos,ngroups,nfriendships,ninteractions,nposts,totalsize)) return snapshots
def translateComments(self):
    trans = {'resposta': 'answer', 'pergunta': 'question',
             'comentario': 'comment', 'ideia': 'idea'}
    triples = []
    count = 0
    for comment in self.data['comments']:
        cid = comment[0]
        tid = comment[1]  # topic id
        body = comment[3]
        if not body:
            continue
        body = body.replace('', '')
        uid = comment[4]
        ctype = comment[8]
        created = comment[9]
        updated = comment[10]
        assert isinstance(cid, int)
        assert isinstance(tid, int)
        assert isinstance(body, str)
        assert isinstance(uid, int)
        assert isinstance(ctype, str)
        assert isinstance(created, datetime.datetime)
        assert isinstance(updated, datetime.datetime)
        commenturi = P.rdf.ic(po.Comment, self.snapshotid+"-"+str(cid),
                              self.translation_graph, self.snapshoturi)
        participanturi = po.Participant+'#'+self.snapshotid+"-"+str(uid)
        # topicuri = self.topicuris[tid]
        topicuri = po.Topic+'#'+self.snapshotid+'-'+str(tid)
        triples += [
            (commenturi, po.author, participanturi),
            (commenturi, po.topic, topicuri),
            (commenturi, po.text, body),
            # (commenturi, po.nChars, len(body)),
            (commenturi, po.type, trans[ctype]),
            (commenturi, po.createdAt, created),
        ]
        if updated != created:
            triples.append((commenturi, po.updatedAt, updated))
        count += 1
        if count % 60 == 0:
            c("finished comment entries:", count,
              "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
    c("finished add of comment entries")
def translateFriendships(self):
    triples = []
    fids = self.friendships_table.getMany(("person_id", "friend_id"))
    added_friendships = []
    count = 0
    for person_id, friend_id, created_at, group in \
            self.friendships_table.getMany(
                ('person_id', 'friend_id', 'created_at', 'group')):
        if [friend_id, person_id] in added_friendships:
            pass
        else:
            added_friendships += [[person_id, friend_id]]
            id0 = self.profileids[person_id]
            id1 = self.profileids[friend_id]
            friendshipuri = P.rdf.ic(po.Friendship,
                                     self.snapshotid+'-'+id0+'-'+id1,
                                     self.translation_graph,
                                     self.snapshoturi)
            participanturi0 = po.Participant+"#"+self.snapshotid+"-"+id0
            participanturi1 = po.Participant+"#"+self.snapshotid+"-"+id1
            assert isinstance(created_at, datetime.datetime)
            triples += [
                (friendshipuri, po.member, participanturi0),
                (friendshipuri, po.member, participanturi1),
                (friendshipuri, po.createdAt, created_at),
            ]
            if [friend_id, person_id] not in fids:
                triples += [
                    (participanturi0, po.knows, participanturi1),
                ]
            if group:
                triples += [
                    (friendshipuri, po.socialCircle, group),
                ]
        count += 1
        if count % 100 == 0:
            c("done friendships:", count)
            c("ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            c("finished add of friendships")
            triples = []
    if triples:
        c("ntriples:", len(triples))
        P.add(triples, self.translation_graph)
def translatePlaces(self):
    count = 0
    triples = []
    for local in self.data["locais"]:
        lid = local[0]
        rid = local[1]
        rtype = local[2]
        bid = local[3]
        cid = local[4]
        created = local[7]
        updated = local[8]
        cep = local[9]
        eid = local[10]
        uri = P.rdf.ic(po.Place, self.snapshotid+"-"+str(lid),
                       self.translation_graph, self.snapshoturi)
        triples += [(uri, po.createdAt, created)]
        if bid:
            triples.append((uri, po.neighborhood,
                            po.Neighborhood+'#'+self.snapshotid+'-'+str(bid)))
        if cid:
            triples.append((uri, po.city,
                            po.City+'#'+self.snapshotid+'-'+str(cid)))
        if eid:
            triples.append((uri, po.state,
                            po.State+'#'+self.snapshotid+'-'+str(eid)))
        if cep:
            triples.append((uri, po.cep, cep))
        if updated != created:
            triples += [
                (uri, po.updatedAt, updated),
            ]
        if rtype == "Topico":
            uri_ = po.Topic+'#'+self.snapshotid+'-'+str(rid)
        elif rtype == "User":
            uri_ = po.User+'#'+self.snapshotid+'-'+str(rid)
        elif rtype == "Competition":
            uri_ = po.Competition+'#'+self.snapshotid+'-'+str(rid)
        elif rtype == "Observatorio":
            uri_ = po.Observatory+'#'+self.snapshotid+'-'+str(rid)
        if rtype:
            triples.append((uri, po.accountable, uri_))
        count += 1
        if count % 60 == 0:
            c("finished places entries:", count, "ntriples:", len(triples))
            P.add(triples, self.translation_graph)
            triples = []
    if triples:
        P.add(triples, self.translation_graph)
    c("finished add of places entries")
def rdfTweets(self):
    tweets = []
    if self.pickle_filename1:
        tweets += readPickleTweetFile(
            self.data_path + self.pickle_filename1)[0]
    if self.pickle_filename2:
        # limit chunk to 10k tweets
        tweets, fopen = readPickleTweetChunk(
            self.data_path + self.pickle_filename2, tweets, None, 10000)
    chunk_count = 0
    # self.tweets = tweets  # for debugging only, remove to release memory
    while tweets:
        c("rendering tweets, chunk:", chunk_count, "ntweets:", len(tweets),
          "snapshotid", self.snapshotid)
        count = 0
        for tweet in tweets:
            tweeturi, triples = self.tweetTriples(tweet)
            if "retweeted_status" in tweet.keys():
                # self.nretweets += 1
                tweeturi0, triples0 = self.tweetTriples(
                    tweet['retweeted_status'])
                triples.extend(triples0)
                triples.append((tweeturi, po.retweetOf, tweeturi0))
            self.ntriples += len(triples)
            P.add(triples, context=self.tweet_graph)
            count += 1
            if count % 1000 == 0:
                c("triplified", count, "tweets")
        c("end of chunk:", chunk_count, "ntriples:", self.ntriples)
        self.writeTweets(chunk_count)
        c("chunk has been written")
        chunk_count += 1
        # if chunk_count == 2:
        #     break
        if self.pickle_filename2:
            tweets, fopen = readPickleTweetChunk(None, [], fopen, 10000)
        else:
            tweets = []
def rdfTweets(self):
    tweets = []
    if self.pickle_filename1:
        tweets += readPickleTweetFile(
            self.data_path + self.pickle_filename1)[0]
    if self.pickle_filename2:
        # limit chunk to 10k tweets
        tweets, fopen = readPickleTweetChunk(
            self.data_path + self.pickle_filename2, tweets, None, 10000)
    chunk_count = 0
    self.tweets = tweets  # for probing only, remove to release memory
    while tweets:
        c("rendering tweets, chunk:", chunk_count, "ntweets:", len(tweets),
          "snapshotid", self.snapshotid)
        for tweet in tweets:
            tweeturi, triples = self.tweetTriples(tweet)
            if "retweeted_status" in tweet.keys():
                self.nretweets += 1
                tweeturi0, triples0 = self.tweetTriples(
                    tweet["retweeted_status"])
                triples += triples0
                triples += [(tweeturi, po.retweetOf, tweeturi0)]
            self.ntriples += len(triples)
            P.set_(triples, context=self.tweet_graph)
        c("rendered", self.ntweets, "tweets")
        c("end of chunk:", chunk_count, "ntriples:", self.ntriples)
        self.writeTweets(chunk_count)
        c("chunk has been written")
        chunk_count += 1
        if chunk_count == 2:  # probing limit; remove to render all chunks
            break
        if self.pickle_filename2:
            tweets, fopen = readPickleTweetChunk(None, [], fopen, 10000)
        else:
            tweets = []
    for i in range(chunk_count):  # free memory
        P.context(self.tweet_graph[:-1] + str(i), "remove")
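
# Sketch of the chunked-pickle protocol that readPickleTweetChunk is
# assumed to follow above: the first call opens the file (path given,
# fopen=None); later calls reuse the handle until EOF. This is a
# hypothetical stand-in, not the package function itself.
import pickle

def read_chunk(path=None, fopen=None, nmax=10000):
    if fopen is None:
        fopen = open(path, "rb")
    chunk = []
    try:
        while len(chunk) < nmax:
            chunk.append(pickle.load(fopen))
    except EOFError:
        fopen.close()
        fopen = None  # signals the caller that the stream is exhausted
    return chunk, fopen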
# Note 60 is C4, 261.63Hz. Convert with e.g.:
# f = M.utils.midi2Hz(scale_grid_[i])

# Pivots to use recurrently:
pivots = [7 * i for i in range(3, 8)]
pivots_m = [scale_grid[i] for i in pivots]
pivots_f = [M.utils.midi2Hz(i) for i in pivots_m]

# Plain changes with 2-7 bells:
peal2 = M.structures.symmetry.PlainChanges(2)
peal3 = M.structures.symmetry.PlainChanges(3)
peal4 = M.structures.symmetry.PlainChanges(4)
peal5 = M.structures.symmetry.PlainChanges(5)
t = time.time()
peal6 = M.structures.symmetry.PlainChanges(6, 4)
c('Finished making peals 1-6')
peal7 = M.structures.symmetry.PlainChanges(7, 5)
c('Finished making peals 7')
# This one takes too long, maybe save as a pickle file:
# peal12 = M.structures.symmetry.PlainChanges(12, 10)

# If only part of the interesting permutations are desired,
# one might also do:
# >>> R = M.structures.permutations.InterestingPermutations
# >>> R.nelements = 12
# >>> R.method = 'dimino'
# >>> R.getRotations(R)
# >>> R.rotations
# which is very fast
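
# PlainChanges itself is not reproduced here; as a standalone illustration
# of the underlying idea (each row differs from the previous one by a
# single adjacent swap, as in change ringing), a Steinhaus-Johnson-Trotter
# sketch:
def plain_changes(n):
    perm = list(range(n))
    dirs = [-1] * n  # direction per element value; -1 means "looking left"
    yield perm[:]
    while True:
        # largest mobile element: its neighbor in its direction is smaller
        mobile, mi = -1, -1
        for i, x in enumerate(perm):
            j = i + dirs[x]
            if 0 <= j < n and perm[j] < x and x > mobile:
                mobile, mi = x, i
        if mobile == -1:
            return
        j = mi + dirs[mobile]
        perm[mi], perm[j] = perm[j], perm[mi]
        for x in range(mobile + 1, n):  # larger elements turn around
            dirs[x] = -dirs[x]
        yield perm[:]

print(list(plain_changes(3)))
# [[0, 1, 2], [0, 2, 1], [2, 0, 1], [2, 1, 0], [1, 2, 0], [1, 0, 2]]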
def getText(message):
    while message.is_multipart():
        message = message.get_payload()[0]
    charsets = message.get_charsets()
    try:
        text = message.get_payload(decode=True)
    except AssertionError:
        text = ""
    if len(charsets) == 1 and text:
        charset = charsets[0]
        if charset:
            try:
                text = text.decode(charset)
            except LookupError:
                c("+++ lookup error in decoding message; charset:", charset)
                try:
                    text = text.decode()
                except UnicodeDecodeError:
                    try:
                        text = text.decode("latin1")
                        c("+++ used latin1 (no errors)", charset)
                    except UnicodeDecodeError:
                        text = text.decode(errors="ignore")
                        c("+-- unicode decode error in decoding message; "
                          "used utf8 but charset:", charset)
            except UnicodeDecodeError:
                # c(text, charset)
                c("--- unicode error:", charset)
                try:
                    text = text.decode("latin1")
                    c("--- used latin1 (no errors)", charset)
                except UnicodeDecodeError:
                    try:
                        text = text.decode(charset, errors="ignore")
                        c("--+ removed errors in decoding message; "
                          "charset:", charset)
                    except LookupError:
                        text = text.decode(errors="ignore")
                        c("-++ lookup error in decoding message; "
                          "used utf8 but charset:", charset)
        else:
            # c("*** charset is empty string or None. "
            #   "Might need encoding.")
            try:
                text = text.decode()
            except UnicodeDecodeError:
                try:
                    text = text.decode("latin1")
                    c("**+ used latin1 (no errors)", charset)
                except UnicodeDecodeError:
                    text = text.decode(errors="ignore")
                    c("*++ decoded with utf8 and removed errors", charset)
    elif len(charsets) == 0 and text:
        text = text.decode()
    elif text:
        raise ValueError("more than one charset at the lowest payload leaf")
    elif not text:
        text = ""
    assert isinstance(text, str)
    content_type = message.get_content_type()
    if content_type == "text/html":
        text = ''.join(bs4.BeautifulSoup(text, "html.parser")
                       .findAll(text=True))
    elif content_type == "text/plain":
        pass
    # elif "text/plain" in content_type:
    elif "text" in content_type:
        c("WARNING: admitted text without fully understood content type")
    else:
        text = ""
        c("=== Lowest not multipart payload. "
          "Should not be translated to rdf")
        c("content_type", content_type)
    return P.utils.cleanText(text)
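
# Runnable sketch of the multipart descent and charset decode at the core
# of getText, using only the stdlib email package on a synthetic message.
import email

raw = (b"From: alice@example.org\n"
       b"Content-Type: text/plain; charset=utf-8\n\n"
       b"ol\xc3\xa1 mundo\n")
message = email.message_from_bytes(raw)
while message.is_multipart():
    message = message.get_payload()[0]
charset = (message.get_charsets() or [None])[0] or "utf-8"
print(message.get_payload(decode=True).decode(charset))  # olá mundo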
    for sequence, state_var, position in zip(sequences, state_vars,
                                             positions):
        if position not in dir(self):
            self.__dict__[position] = 0
        self.__dict__[state_var] = \
            self.__dict__[sequence][self.__dict__[position]]
        self.__dict__[position] += 1
        self.__dict__[position] %= len(self.__dict__[sequence])

isynth = IteratorSynth()
isynth.fundamental_frequency_sequence = []
isynth.table = isynth.tables.sine
for perm in peal.peal_direct:
    isynth.fundamental_frequency_sequence.extend(perm(notes))
sounds = []
for i in range(36):
    sounds += [isynth.renderIterate(duration=1/3)]
# M.utils.write(M.H(*sounds), "./sandsounds/ra.wav")
c('finished rendering peal')
M.utils.write(M.H(*sounds), "./apeal.wav")

# low and high registers
f0_ = f0/4
notes_ = [f0_, f0_*semi**4, f0_*semi**8]
silence = n.zeros(int(44100*2/3))
bass = []
count = 0
sy = M.synths.CanonicalSynth()
sy.table = sy.tables.saw
for i in range(6):
    asound = [sy.render(fundamental_frequency=notes_[(2+count) % 3],
                        duration=1/3), silence] * 2
    bass.extend(asound)
    count += 1
def rdfMbox(self):
    for filecount, file_ in enumerate(self.files):
        if filecount % 100 == 0:
            c(self.snapshoturi, filecount)
        mbox = mailbox.mbox(self.data_path + self.directory + "/" + file_)
        if not mbox.keys():
            self.nempty += 1
            mbox.close()
            # c("||||||||||| EMPTY MESSAGE |||||||||||||||||||||",
            #   self.snapshotid, file_, "(", filecount, ")")
            continue
        if not mbox[0]["Message-Id"]:
            raise ValueError(
                "What to do with nonempty messages without id?")
        message = mbox[0]
        gmaneid = self.makeId(message["Message-Id"])
        # c("gmaneid", gmaneid)
        if not gmaneid:
            raise ValueError("Message without id")
        messageuri = P.rdf.ic(po.EmailMessage, gmaneid,
                              self.translation_graph, self.snapshoturi)
        self.nmessages += 1
        triples = [
            (messageuri, po.gmaneID, gmaneid),
        ]
        email, name = self.parseParticipant(message["From"])
        if not email:
            raise ValueError("message without author")
        participanturi = P.rdf.ic(po.GmaneParticipant, email,
                                  self.translation_graph, self.snapshoturi)
        if not P.get(participanturi, po.emailAddress, None,
                     self.translation_graph):
            self.nparticipants += 1
            if self.nparticipants == 100:
                pass
        triples += [
            (messageuri, po.author, participanturi),
            (participanturi, po.emailAddress, email),
        ]
        if name:
            triples += [
                (participanturi, po.name, name),
            ]
        subject = message["Subject"]
        if subject:
            subject = decodeHeader(subject)
            assert isinstance(subject, str)
            triples += [
                (messageuri, po.subject, subject),
            ]
        replyid_ = message["In-Reply-To"]
        saneid = self.makeId(replyid_)
        if bool(replyid_) and not bool(saneid):
            self.nreplies += 1
            replyid = self.snapshotid + "-" + str(self.nlost_messages)
            self.nlost_messages += 1
            replymessageuri = P.rdf.ic(po.LostEmailMessage, replyid,
                                       self.translation_graph,
                                       self.snapshoturi)
            triples += [
                (replymessageuri, a, po.EmailMessage),
                (replymessageuri, NS.rdfs.comment,
                 "This message registered as having a reply, but the "
                 "field might be ill-formed: " + replyid_),
                (messageuri, po.replyTo, replymessageuri),
            ]
        elif saneid:
            self.nreplies += 1
            replymessageuri = P.rdf.ic(po.EmailMessage, saneid,
                                       self.translation_graph,
                                       self.snapshoturi)
            triples += [
                (replymessageuri, po.gmaneID, saneid),
                (messageuri, po.replyTo, replymessageuri),
            ]
        if isinstance(message["Date"], str):
            datetime = parseDate(message["Date"])
        elif isinstance(message["Date"], mailbox.email.header.Header):
            datetimestring = decodeHeader(message["Date"])
            if False in [i in string.printable for i in datetimestring]:
                datetime = None
                triples += [
                    (messageuri, po.lostCreatedAt, True),
                ]
            else:
                datetime_ = re.findall(r"(.*\d\d:\d\d:\d\d).*",
                                       datetimestring)[0]
                datetime = parseDate(datetime_)
        else:
            raise ValueError("datetime not understood")
        if datetime:
            self.dates += [datetime]
            triples += [
                (messageuri, po.createdAt, datetime),
            ]
        if message["References"]:
            references = message["References"].replace("\n", "").replace(
                "\t", "").replace(" ", "")
            if not re.findall(r"\A<(.*?)>\Z", references):
                c("::: ::: ::: references field not understood",
                  message["References"])
                triples += [
                    (messageuri, po.comment,
                     "the references are not understood (<.*> ids are "
                     "added anyway): " + message["References"]),
                    (messageuri, po.referencesLost, True),
                ]
            for reference in re.findall(r"<(.*?)>", references):
                self.nreferences += 1
                referenceuri = P.rdf.ic(po.EmailMessage, reference,
                                        self.translation_graph,
                                        self.snapshoturi)
                triples += [
                    (referenceuri, po.gmaneID, reference),
                    (messageuri, po.hasReference, referenceuri),
                ]
            for part in message["References"].replace("\n", "").replace(
                    "\t", "").split():
                if validate_email(part):
                    self.nreferences += 1
                    referenceuri = P.rdf.ic(po.EmailMessage, part,
                                            self.translation_graph,
                                            self.snapshoturi)
                    triples += [
                        (referenceuri, po.gmaneID, part),
                        (messageuri, po.hasReference, referenceuri),
                    ]
        text = getText(message)
        if text:
            nchars = len(text)
            ntokens = len(k.wordpunct_tokenize(text))
            nsentences = len(k.sent_tokenize(text))
            triples += [
                (messageuri, po.messageText, text),
                (messageuri, po.nChars, nchars),
                (messageuri, po.nTokens, ntokens),
                (messageuri, po.nSentences, nsentences),
            ]
            self.nchars_all += [nchars]
            self.ntokens_all += [ntokens]
            self.nsentences_all += [nsentences]
            clean_text = cleanEmailBody(text)
            self.nremoved_lines += text.count("\n") - clean_text.count("\n")
            self.nlines += text.count("\n")
            nchars_clean = len(clean_text)
            ntokens_clean = len(k.wordpunct_tokenize(clean_text))
            nsentences_clean = len(k.sent_tokenize(clean_text))
            triples += [
                (messageuri, po.messageTextClean, clean_text),
                (messageuri, po.nCharsClean, nchars_clean),
                (messageuri, po.nTokensClean, ntokens_clean),
                (messageuri, po.nSentencesClean, nsentences_clean),
            ]
            self.nchars_clean_all += [nchars_clean]
            self.ntokens_clean_all += [ntokens_clean]
            self.nsentences_clean_all += [nsentences_clean]
            for url in re.findall(
                    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]'
                    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))+', clean_text):
                self.nurls += 1
                triples += [
                    (messageuri, po.hasUrl, url),
                ]
        content_type = message.get_content_type()
        if content_type:
            triples += [(messageuri, po.contentType, content_type)]
        else:
            raise ValueError(r"/\/\/\/\ message without content type")
        organization = message["Organization"]
        if organization:
            if not isinstance(organization, str):
                organization = "".join(i for i in str(organization)
                                       if i in string.printable)
            triples += [
                (messageuri, po.organization, organization),
            ]
        if message["cc"]:
            cc, unparsed = parseAddresses(message["cc"])
            if unparsed:
                triples += [
                    (messageuri, po.unparsedCC, unparsed),
                ]
            for peeraddress, peername in cc:
                peeraddress = peeraddress.strip()
                assert bool(peeraddress)
                peeruri = P.rdf.ic(po.EmailPeer, peeraddress,
                                   self.translation_graph, self.snapshoturi)
                triples += [
                    (messageuri, po.cc, peeruri),
                    (peeruri, po.emailAddress, peeraddress),
                ]
                self.ncc += 1
                if peername:
                    triples += [
                        (peeruri, po.name, peername.strip()),
                    ]
        if message["to"]:
            to, unparsed = parseAddresses(message["to"])
            if unparsed:
                triples += [
                    (messageuri, po.unparsedTo, unparsed),
                ]
            for peeraddress, peername in to:
                peeraddress = peeraddress.strip()
                assert bool(peeraddress)
                peeruri = P.rdf.ic(po.EmailPeer, peeraddress,
                                   self.translation_graph, self.snapshoturi)
                triples += [
                    (messageuri, po.to, peeruri),
                    (peeruri, po.emailAddress, peeraddress),
                ]
                self.nto += 1
                if peername:
                    triples += [
                        (peeruri, po.name, peername.strip()),
                    ]
        listid = message["list-id"]
        if listid:
            assert isinstance(listid, str)
            listid = listid.replace("\n", "").replace("\t", "")
            if listid.count("<") == listid.count(">") == \
                    listid.count(" ") == 0:
                listname = ""
                listid_ = listid
            elif listid.count("<") == listid.count(">") == 0:
                parts = listid.split()
                lens = [len(i) for i in parts]
                listid_ = [i for i in parts if len(i) == max(lens)][0]
                listname = " ".join(i for i in parts if len(i) != max(lens))
            elif listid.count("<") == listid.count(">") == 1:
                listname, listid_ = re.findall(r"(.*) {0,1}<(.*)>",
                                               listid)[0]
            else:
                raise ValueError("Unexpected listid string format")
            listuri = P.rdf.ic(po.EmailList, listid_,
                               self.translation_graph, self.snapshoturi)
            triples += [
                (messageuri, po.emailList, listuri),
                (listuri, po.listID, listid_),
            ]
            if listname:
                triples += [
                    (listuri, po.name, listname.strip()),
                ]
        P.add(triples, self.translation_graph)
        mbox.close()
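
# Sketch of the one-message-per-mbox access pattern that rdfMbox assumes;
# "some.mbox" is a hypothetical path standing in for a Gmane dump file.
import mailbox

mb = mailbox.mbox("some.mbox")
if mb.keys():
    message = mb[0]
    print(message["Message-Id"], message["From"], message["Subject"])
mb.close()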
def parseLegacyFiles(data_dir=DATADIR):
    """Parse legacy directories of mbox files with gmane email data"""
    data_dir = os.path.expanduser(data_dir)
    directories = os.listdir(data_dir)
    directories = [i for i in directories if os.path.isdir(data_dir + i)]
    snapshots = set()
    triples = []
    for directory in directories:
        all_files = [i for i in os.listdir(data_dir + directory)
                     if i.isdigit()]
        if all_files:
            all_files.sort()
            foo = all_files[0].lstrip("0")
            if not foo:
                foo = "0"
            snapshotid = ("legacy-" + directory + "-" + foo + "-" +
                          all_files[-1].lstrip("0"))
            snapshoturi = po.GmaneSnapshot + "#" + snapshotid
            expressed_classes = [po.GmaneParticipant, po.EmailPeer,
                                 po.EmailMessage]
            expressed_reference = directory
            name_humanized = ("Gmane email list with id " +
                              expressed_reference)
            # get size for all files in dir
            directorysize = sum(
                os.path.getsize(data_dir + directory + "/" + filename)
                for filename in os.listdir(data_dir + directory)) / 10**6
            nfiles = len(all_files)
            fileformat = "mbox"
            directoryuri = po.Directory + "#gmane-" + directory
            triples += [
                (snapshoturi, a, po.Snapshot),
                (snapshoturi, a, po.GmaneSnapshot),
                (snapshoturi, po.dataDir, data_dir),
                (snapshoturi, po.snapshotID, snapshotid),
                (snapshoturi, po.isEgo, False),
                (snapshoturi, po.isGroup, True),
                (snapshoturi, po.isFriendship, False),
                (snapshoturi, po.isInteraction, True),
                (snapshoturi, po.isPost, True),
                (snapshoturi, po.humanizedName, name_humanized),
                (snapshoturi, po.expressedReference, expressed_reference),
                (snapshoturi, po.rawDirectory, directoryuri),
                (directoryuri, po.directorySize, directorysize),
                (directoryuri, po.directoryName, directory),
                (directoryuri, po.fileFormat, fileformat),
            ] + [(directoryuri, po.expressedClass, expressed_class)
                 for expressed_class in expressed_classes]
            snapshots.add(snapshoturi)
    nsnapshots = ndirectories = len(directories)
    # P.context("gmane", "remove")
    platformuri = P.rdf.ic(po.Platform, "Gmane", context="gmane")
    triples += [
        (NS.social.Session, NS.social.nGmaneParsedDirectories,
         ndirectories),
        (NS.social.Session, NS.social.nGmaneSnapshots, nsnapshots),
        (NS.social.Session, po.platform, platformuri),
    ]
    P.add(triples, context="gmane")
    c("parsed {} gmane data directories (=={} snapshots) are in the "
      "percolation graph and 'gmane' context".format(
          ndirectories, nsnapshots))
    c("percolation graph has {} triples ({} in gmane context)".format(
        len(P.percolation_graph), len(P.context("gmane"))))
    negos = P.query(r"SELECT (COUNT(?s) as ?cs) WHERE "
                    r"{ GRAPH <gmane> { ?s po:isEgo true } }")
    ngroups = P.query(r"SELECT (COUNT(?s) as ?cs) WHERE "
                      r"{ GRAPH <gmane> { ?s po:isGroup true } }")
    nfriendships = P.query(
        r"SELECT (COUNT(?s) as ?cs) WHERE "
        r"{ GRAPH <gmane> { ?s po:isFriendship true } }")
    ninteractions = P.query(
        r"SELECT (COUNT(?s) as ?cs) WHERE "
        r"{ GRAPH <gmane> { ?s po:isInteraction true } }")
    nposts = P.query(r"SELECT (COUNT(?s) as ?cs) WHERE "
                     r"{ GRAPH <gmane> { ?s po:isPost true } }")
    totalsize = sum(P.query(
        r"SELECT ?size WHERE "
        r"{ GRAPH <gmane> { ?s po:directorySize ?size } }"))
    c("""{} are ego snapshots, {} are group snapshots
{} have friendship structures.
{} have interaction structures.
{} have texts
Total raw data size is {:.2f}MB""".format(
        negos, ngroups, nfriendships, ninteractions, nposts, totalsize))
    return snapshots
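
# The po:isGroup-style COUNT queries above, reproduced on an in-memory
# rdflib graph (assumption: P.query runs comparable SPARQL against the
# store; the po IRI here is illustrative, not the package's).
from rdflib import Graph, Literal, Namespace

po = Namespace("http://example.org/participationontology#")
g = Graph()
g.add((po.snap1, po.isGroup, Literal(True)))
res = g.query("SELECT (COUNT(?s) as ?cs) WHERE { ?s po:isGroup true }",
              initNs={"po": po})
print([int(row.cs) for row in res])  # [1]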
client = Client(endpoint_url+"/update") triples = [ (test.Dummy, test.desc, """áéíóúćçêôãõà"""), (test.Dummy, test.desc2, "Não concordo com a inclusão da palavra controle, sou a favor da manutenção do texto 'Política Nacional de Participação Social'.\n\nA inclusão desta palavra pode ser interpretada como o poder de controle de determinados atores. O uso de 'Política Nacional de Participação Social'atende mais ao intuito de promover um ambiente democrático e horizontal nas relações de participação civil.".replace("\\","\\\\")), (test.Dummy, test.desc3, " \\o/".replace("\\","\\\\")), (test.Dummy, test.desc3, ' Denominação "ASd"'.replace('"',"'")), (test.Dummy, test.desc3, ' Denominação "ASd"'), ] querystring = buildQuery(triples, method="insert") client.endpoint.method = "GET" client.endpoint.method = "POST" # client.endpoint.requestMethod = "" client.endpoint.requestMethod = "postdirectly" client.endpoint.requestMethod = "urlencoded" client.endpoint.setQuery(querystring) client.performQuery(querystring) if __name__ == "__main__": endpoint_url = os.getenv("PERCOLATION_ENDPOINT") if not endpoint_url: endpoint_url = input("please enter a sparql endpoint url") c("==> endpoint url:", endpoint_url) c("+++ testing create and delete graphs/contexts and triples:") #triples = testReadWriteDelete(endpoint_url) c("--- testing IO of text:", endpoint_url) #triples = testTextIO(endpoint_url) c("### testing custom server:", endpoint_url) triples = customConnection(endpoint_url) c("end of (remote) sparql endpoint tests", triples)
def testReadWriteDelete(endpoint_url):
    client = Client(endpoint_url)
    triples = [
        (NS.test.Something, a, NS.test.OtherThing),
    ]
    client.insertTriples(triples, "another")
    c("should print a triple: ", client.getAllTriples("another"))
    client.updateQuery("DROP GRAPH <another> ")
    c("should not print a triple: ", client.getAllTriples("another"))
    client.insertTriples(triples, "another")
    c("should print a triple: ", client.getAllTriples("another"))
    client.insertTriples(triples, "even_another")
    query = "SELECT ?g WHERE { GRAPH ?g {} }"
    c("should print all graphs : ", client.retrieveQuery(query))
    client.updateQuery("DROP GRAPH <another> ")
    client.updateQuery("DROP GRAPH <even_another> ")
    c("should have no more graphs : ", client.retrieveQuery(query))
    # add and remove triples
    triples_ = [
        (NS.test.SomethingElse, NS.test.pred, "banana"),
    ]
    client.insertTriples(triples + triples_, "another")
    c("should print two triples: ", client.getAllTriples("another"))
    query = r"DELETE DATA { GRAPH <another> { <%s> <%s> 'banana' . } } " % \
        (NS.test.SomethingElse, NS.test.pred)
    client.updateQuery(query)
    c("should print one triple: ", client.getAllTriples("another"))
    client.updateQuery("DROP GRAPH <another> ")
    query = "SELECT ?g WHERE { GRAPH ?g {} }"
    c("should have no more dummy graphs : ", client.retrieveQuery(query))
def writeAllTW(self):
    # write meta and readme with self.desc, finished.
    g = P.context(self.meta_graph)
    ntriples = len(g)
    triples = [
        (self.snapshoturi, po.nMetaTriples, ntriples),
    ]
    P.add(triples, context=self.meta_graph)
    g.namespace_manager.bind("po", po)
    g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
    c("ttl")
    g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
    c("serialized meta")
    # copy the script that generated this data
    if not os.path.isdir(self.final_path_ + "scripts"):
        os.mkdir(self.final_path_ + "scripts")
    shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                self.final_path_ + "scripts/triplify.py")
    # copy of the base data
    tinteraction = """\n\n{} individuals with metadata {}
and {} interactions (retweets: {}, replies: {}, user_mentions: {})
constitute the interaction network in the RDF/XML file(s):
{} and the Turtle file(s): {}
(anonymized: {}).""".format(
        self.nparticipants, str(self.participantvars),
        self.nretweets + self.nreplies + self.nuser_mentions,
        self.nretweets, self.nreplies, self.nuser_mentions,
        self.tweet_rdf, self.tweet_ttl, self.interactions_anonymized)
    tposts = """\n\nThe dataset consists of {} tweets with metadata {}
{:.3f} characters in average (std: {:.3f}) and total chars in snapshot: {}
{:.3f} tokens in average (std: {:.3f}) and total tokens in snapshot: {}""".format(
        self.ntweets, str(self.tweetvars),
        self.mcharstweets, self.dcharstweets, self.totalchars,
        self.mtokenstweets, self.dtokenstweets, self.totaltokens)
    self.dates = [i.isoformat() for i in self.dates]
    date1 = min(self.dates)
    date2 = max(self.dates)
    with open(self.final_path_ + "README", "w") as f:
        f.write("""::: Open Linked Social Data publication
\nThis repository is a RDF data expression of the twitter
snapshot {snapid} with tweets from {date1} to {date2}
(total of {ntrip} triples).{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is on the script/ directory.\n:::"""
                .format(snapid=self.snapshotid, date1=date1, date2=date2,
                        ntrip=self.ntriples, tinteraction=tinteraction,
                        tposts=tposts, mrdf=self.mrdf, mttl=self.mttl,
                        ise=self.isego, isg=self.isgroup,
                        isf=self.isfriendship, isi=self.isinteraction,
                        ist=self.hastext, ava=self.online_prefix,
                        desc=self.desc))
import sys
keys = tuple(sys.modules.keys())
for key in keys:
    # force a fresh import of the packages being developed
    if "gmane" in key or "percolation" in key:
        del sys.modules[key]
import gmane as G
import percolation as P
from percolation.rdf import NS, a, po, c

# ss=S.facebook.access.parseLegacyFiles()
# ss=[i for i in ss if i.endswith("gdf_fb")]
# last_triplification_class=S.facebook.render.publishAll(ss)
# ss=S.twitter.access.parseLegacyFiles()
# ss=[i for i in ss if i.endswith("gdf_fb")]
# last_triplification_class=S.twitter.render.publishAll(ss)
# ss=G.access.parseLegacyFiles()
ss = G.access.parseLegacyFiles("/home/r/.gmane3/")
c("finished .gmane")
# ss.union(G.access.parseLegacyFiles("/home/r/.gmane2/")); c("finished .gmane2")
# ss.union(G.access.parseLegacyFiles("/home/r/.gmane3/")); c("finished .gmane3")
# ss.union(G.access.parseLegacyFiles("/home/r/.gmane4/")); c("finished .gmane4")
# ss=[i for i in ss if i.endswith("gdf_fb")]
# last_triplification_classes+=G.render.publishAll(ss); c("finished publication of all")
triplification_classes = G.render.publishAll(ss)
c("finished publication of all")
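# The sys.modules purge above forces a deep reimport during interactive
# development; for a single top-level module, importlib.reload is the
# idiomatic equivalent (it does not recurse into submodules, which is why
# the loop above is used instead):
import importlib
import gmane
gmane = importlib.reload(gmane)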
def rdfLog(self):
    with codecs.open(self.data_path + self.filename, "rb", "iso-8859-1") as f:
        logtext = textFix(f.read())
    # msgregex=r"\[(\d{2}):(\d{2}):(\d{2})\] \* ([^ ?]*)[ ]*(.*)"  # DELETE ???
    # rmessage=r"\[(\d{2}):(\d{2}):(\d{2})\] \<(.*?)\>[ ]*(.*)"  # message
    # list files in the dir
    rdate = r"(\d{4})(\d{2})(\d{2})"  # date
    rsysmsg = r"(\d{4})\-(\d{2})\-(\d{2})T(\d{2}):(\d{2}):(\d{2}) \*\*\* (\S+) (.*)"  # system message (?)
    rmsg = r"(\d{4})\-(\d{2})\-(\d{2})T(\d{2}):(\d{2}):(\d{2}) \<(.*?)\> (.*)"  # user message
    rurl = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # note: only the first 10 matches of each kind are kept (debug limit?)
    messages = re.findall(rmsg, logtext)[:10]
    system_messages = re.findall(rsysmsg, logtext)[:10]
    self.NICKS = set([Q(i[-2]) for i in messages] +
                     [Q(i[-2]) for i in system_messages])
    triples = []
    for nick in self.NICKS:
        useruri = P.rdf.ic(po.Participant,
                           "{}-{}".format(self.snapshotid, nick),
                           self.irc_graph, self.snapshoturi)
        triples += [
            (useruri, po.nick, nick),
        ]
    messageids = set()
    msgcount = 0
    c("starting translation of log with",
      len(messages) + len(system_messages), "messages")
    for message in messages:
        year, month, day, hour, minute, second, nick, text = message
        nick = Q(nick)
        datetime_ = datetime.datetime(
            *[int(i) for i in (year, month, day, hour, minute, second)])
        self.dates += [datetime_]
        timestamp = datetime_.isoformat()
        messageid = "{}-{}-{}".format(self.snapshotid, nick, timestamp)
        while messageid in messageids:
            messageid += '_r_%05x' % random.randrange(16**5)
        messageids.add(messageid)
        messageuri = P.rdf.ic(po.IRCMessage, messageid,
                              self.irc_graph, self.snapshoturi)
        # detect direct messages (also with a comma)! TTM
        tokens = k.word_tokenize(text)
        tokens = [i for i in tokens if i not in set(string.punctuation)]
        direct_nicks = []   # nicks the message is directed at
        mention_nicks = []  # nicks mentioned in passing
        direct = 1
        for token in tokens:
            if token not in self.NICKS:
                direct = 0
            else:
                if direct:
                    direct_nicks += [token]
                else:
                    mention_nicks += [token]
        # nick_ in the two loops below so the author's nick is not clobbered
        for nick_ in direct_nicks:
            useruri2 = po.Participant + "#{}-{}".format(self.snapshotid, nick_)
            triples += [
                (messageuri, po.directedTo, useruri2),
            ]
        if direct_nicks:
            self.ndirect += 1
            text_ = text[text.index(direct_nicks[-1]) +
                         len(direct_nicks[-1]) + 1:].lstrip()
        else:
            text_ = text
        for nick_ in mention_nicks:
            useruri2 = po.Participant + "#{}-{}".format(self.snapshotid, nick_)
            triples += [
                (messageuri, po.mentions, useruri2),
            ]
        self.nmention += len(mention_nicks)
        useruri = po.Participant + "#{}-{}".format(self.snapshotid, nick)
        triples += [
            (messageuri, po.author, useruri),
            (messageuri, po.systemMessage, False),
            (messageuri, po.createdAt, datetime_),
        ]
        if text:
            triples += [
                (messageuri, po.messageText, text),
            ]
        if text_:
            nchars = len(text_)
            tokens = k.word_tokenize(text_)
            ntokens = len(tokens)
            nsentences = len(k.sent_tokenize(text_))
            triples += [
                (messageuri, po.cleanMessageText, text_),
                (messageuri, po.nChars, nchars),
                (messageuri, po.nTokens, ntokens),
                (messageuri, po.nSentences, nsentences),
            ]
            urls = re.findall(rurl, text_)
            for url in urls:
                triples += [
                    (messageuri, po.hasUrl, url),
                ]
            self.nchars_all += [nchars]
            self.ntokens_all += [ntokens]
            self.nsentences_all += [nsentences]
            self.nurls += len(urls)
        else:
            triples += [
                (messageuri, po.emptyMessage, True),
            ]
        if text.startswith(";aa ") or text.startswith("lalenia, aa ") \
                or text.startswith("lalenia: aa "):
            self.naamessages += 1
            triples += [
                (messageuri, a, po.AAIRCMessage),
            ]
        msgcount += 1
        if msgcount % 1000 == 0:
            c("finished user message", msgcount)
    msgcount = 0
    for message in system_messages:
        year, month, day, hour, minute, second, nick, text = message
        nick = Q(nick)
        useruri = po.Participant + "#{}-{}".format(self.snapshotid, nick)
        datetime_ = datetime.datetime(
            *[int(i) for i in (year, month, day, hour, minute, second)])
        self.dates += [datetime_]
        timestamp = datetime_.isoformat()
        messageid = "{}-{}".format(self.snapshotid, timestamp)
        while messageid in messageids:
            messageid += '_r_%05x' % random.randrange(16**5)
        messageids.update([messageid])
        messageuri = P.rdf.ic(po.IRCMessage, messageid,
                              self.irc_graph, self.snapshoturi)
        triples += [(messageuri, po.impliedUser, useruri),
                    (messageuri, po.sentAt, datetime_),
                    (messageuri, po.systemMessage, True)]
        if text:
            triples += [(messageuri, po.messageText, text)]
        msgcount += 1
        if msgcount % 1000 == 0:
            c("finished system message. Total messages:", msgcount)
    self.messageids = messageids
    if not os.path.isdir(self.final_path):
        os.mkdir(self.final_path)
    if not os.path.isdir(self.final_path_):
        os.mkdir(self.final_path_)
    # ntriples=5 is a very small chunk size; likely a leftover debug value
    self.log_xml, self.size_xml, self.log_ttl, self.size_ttl = \
        P.rdf.writeByChunks(self.final_path_ + self.snapshotid + "Log",
                            ntriples=5, triples=triples)
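# A quick sanity check of the rmsg pattern used in rdfLog against one
# made-up log line (timestamp, nick and text are hypothetical):
import re
rmsg = r"(\d{4})\-(\d{2})\-(\d{2})T(\d{2}):(\d{2}):(\d{2}) \<(.*?)\> (.*)"
sample = "2014-03-01T12:34:56 <somenick> hello there http://example.com"
print(re.findall(rmsg, sample))
# -> [('2014', '03', '01', '12', '34', '56', 'somenick',
#      'hello there http://example.com')]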
    for syllable in allConsonantSyllables():
        if word[-1] == syllable[-1] == 'j':
            continue
        word_ = word + syllable
        words += [word_]
    # c('finished words with {} syllables'.format(n_ + 1))
    n_ += 1
    words_ = words[:]
    for vowel in vowels:
        words += [vowel + word for word in words_ if len(word) / 2 < n]
    words += [vowel for vowel in vowels]  # assuming one-vowel words
    # c('finished words starting with vowels')
    return words


def allTokiPonaExistentWords():
    from . import makeStatistics as stats
    return stats


if __name__ == '__main__':
    c('vowels:', vowels)
    c('\n', 'consonants:', consonants)
    c('\n', 'invalid_syllables (4):', invalid_syllables)
    all_syllables = allConsonantSyllables()
    c('\n', 'valid syllables ({}):'.format(len(all_syllables)), all_syllables)
    all_possible_words = allTokiPonaPossibleWords()
    c('\n', 'all tokipona possible words with 3 syllables',
      '({})'.format(len(all_possible_words)), all_possible_words)
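# A standalone sketch of the syllable combinatorics assumed above: toki pona
# consonant syllables are consonant+vowel pairs minus the four invalid ones
# (ji, ti, wo, wu), so 9 * 5 - 4 = 41 valid syllables; the variable names
# here are illustrative, not the module's.
vowels_sketch = 'aeiou'
consonants_sketch = 'jklmnpstw'
invalid_sketch = {'ji', 'ti', 'wo', 'wu'}
syllables_sketch = [cons + vow
                    for cons in consonants_sketch
                    for vow in vowels_sketch
                    if cons + vow not in invalid_sketch]
print(len(syllables_sketch))  # 41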
if "music" in key: del sys.modules[key] import music as M from percolation.rdf import c def fact(x): if x == 1: return 1 return x*fact(x-1) nelements = 0 while nelements not in range(3, 13): nelements_maximum = input("make changes until maximum number of elements:\ (min=3,,max=12,default=5) ") try: nelements = int(nelements_maximum) except: pass if not nelements_maximum: nelements_maximum = 5 # generate peals with elements in numbers of 3 to 12 peals = {} for nelements in range(3, int(nelements_maximum)+1): key = "peal_with_" + str(nelements) + "_elements" nhunts=nelements-3 peal = M.structures.symmetry.PlainChanges(nelements,nhunts) peals[key] = peal c(len(peal.peal_direct), fact(nelements)) assert len(peal.peal_direct) == fact(nelements)