Code Example #1
File: mbox2rdf.py Project: land-pack/gmane
 def parseParticipant(self, fromstring):
     fromstring = decodeHeader(fromstring)
     #            fromstring="".join(i for i in str(fromstring) if i in string.printable)
     fromstring = fromstring.replace("\n", "").replace("\t", "")
     if ">" in fromstring and "<" not in fromstring:
         fromstring = re.sub(r"(.*[ ^]*)(.*>)", r"\1<\2", fromstring)
         c("-|-|-|-| corrected fromstring:", fromstring)
     elif "<" in fromstring and ">" not in fromstring:
         fromstring = re.sub(r"(<.*)([ $]*.*)", r"\1>\2", fromstring)
         c("-|-|-|-| corrected fromstring:", fromstring)
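      # two author formats are handled below: "Name <email>" and "email (Name)";
      # any other pattern containing a space is rejected as unknown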
     if fromstring.count(">") == fromstring.count("<") > 0:
         name, email = re.findall(r"(.*) {0,1}<(.*)>", fromstring)[0]
     elif "(" in fromstring:
         email, name = re.findall(r"(.*) {0,1}\((.*)\)", fromstring)[0]
     elif " " in fromstring:
         raise ValueError("new author field pattern")
     else:
         email = fromstring
         name = ""
     email = email.replace("..", ".")
     try:
         assert validate_email(email)
      except Exception:
         if "cardecovil.co.kr" in email:
             email = "*****@*****.**"
             name = ""
         elif re.findall(r"(.*):(.*)", email):
             name, email = re.findall(r"(.*):(.*)", email)[0]
         else:
             raise ValueError("bad email")
     assert validate_email(email)
     return email, name.strip().replace("'", "").replace('"', '')
Code Example #2
File: postgre2rdf.py Project: ttm/participation
 def addArticleBody(self, body, articleuri):
     triples = []
     if re.findall(r"<(.*)>(.*)<(.*)>", body, re.S):
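          # try to store the raw HTML body; if it breaks the SPARQL insert
          # (QueryBadFormed), store a percent-encoded copy instead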
         try:
             P.add((articleuri, po.htmlBodyText, body),
                   context=self.translation_graph)
         except QueryBadFormed:
             c("QUOTING HTML BODY")
             P.add((articleuri, po.quotedHtmlBodyText,
                    urllib.parse.quote(body)),
                   context=self.translation_graph)
         cleanbody = BeautifulSoup(body, 'html.parser').get_text()
         if cleanbody:
             try:
                 P.add((articleuri, po.cleanBodyText, cleanbody),
                       context=self.translation_graph)
             except QueryBadFormed:
                 c("QUOTING HTML CLEAN BODY")
                 P.add((articleuri, po.quotedCleanBodyText,
                        urllib.parse.quote(cleanbody)),
                       context=self.translation_graph)
     else:
         triples += [
                    (articleuri, po.cleanBodyText, body),
                    ]
         P.add(triples, context=self.translation_graph)
     self.bodies += [body]
Code Example #3
File: mbox2rdf.py Project: land-pack/gmane
    def __init__(self,snapshoturi,snapshotid,directory="somedir/",\
            data_path="../data/",final_path="./gmane_snapshots/",umbrella_dir="gmane_snapshotsX/"):
        c(snapshoturi, snapshotid, directory)
        isego = False
        isgroup = True
        isfriendship = False
        isinteraction = True
        hastext = True
        interactions_anonymized = False

        translation_graph = "translation"
        meta_graph = "translation_meta"
        gmane_graph = "gmane"
        P.context(translation_graph, "remove")
        P.context(meta_graph, "remove")
        final_path_ = "{}{}/".format(final_path, snapshotid)
        online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(
            umbrella_dir, snapshotid)
        ncc = nto = nlines = nremoved_lines = nurls = nlost_messages = nparticipants = nreferences = totalchars = nreplies = nmessages = nempty = 0
        dates = []
        nchars_all = []
        ntokens_all = []
        nsentences_all = []
        participantvars = ["emailAddress", "name"]
        messagevars = [
            "author", "createdAt", "replyTo", "messageText",
            "cleanMessageText", "nCharsClean", "nTokensClean",
            "nSentencesClean", "hasUrl", "nChars", "nTokens", "nSentences",
            "emptyMessage", "gmaneID", "subject", "cc", "to", "hasReference",
            "contentType", "organization", "unparsedCC", "unparsedTo",
            "emailList"
        ]
        messagevars.sort()
        files = os.listdir(data_path + directory)
        if not files:
            self.comment = "no files on the snapshot id"
            return
        files.sort()
        nchars_all = []
        ntokens_all = []
        nsentences_all = []
        nchars_clean_all = []
        ntokens_clean_all = []
        nsentences_clean_all = []
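        # shortcut used throughout this codebase: copy every local variable
        # into an instance attribute (self.<name> = <name>)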
        locals_ = locals().copy()
        del locals_["self"]
        for i in locals_:
            exec("self.{}={}".format(i, i))
        self.rdfMbox()
        if len(self.files) > self.nempty:
            if not os.path.isdir(final_path_):
                os.mkdir(final_path_)
            self.email_xml, self.size_xml, self.email_ttl, self.size_ttl = P.rdf.writeByChunks(
                self.final_path_ + self.snapshotid + "Email",
                context=self.translation_graph,
                ntriples=100000)
            self.makeMetadata()
            self.writeAllGmane()
Code Example #4
File: mysql2rdf.py Project: ttm/participation
 def translateObservatoryTags(self):
     triples = []
     for ot in self.data["observatorios_tem_tags"]:
         oid = ot[0]
         tid = ot[1]
         triples.append((po.Observatory+'#'+self.snapshotid+'-'+str(oid),
                         po.hasTag, po.Tag+'#'+self.snapshotid+'-'+str(tid)))
     P.add(triples, self.translation_graph)
     c("finished add of observatory tag entries")
Code Example #5
File: access.py Project: ttm/participation
def parseLegacyFiles(profiles=True, articles=True, comments=True):
    """Parse legacy postgresql data from participabr"""
    # access mysql, access mongo, access irc log from social/
    c("starting participabr access")
    con = psycopg2.connect(
        database=participabr.postgre_database, user=participabr.postgre_user)
    cur = con.cursor()

    # table data
    return ParticipabrPublishing(cur, profiles, articles, comments)
Code Example #6
File: symusic.py Project: ttm/music
 def __init__(self,n_elements=4,method="dimino"):
     c("started permutations with",n_elements,"elements")
     self.n_elements=n_elements
     self.method=method
     self.getRotations()
     self.getMirrors()
     self.getAlternating()
     self.getFullSymmetry()
     self.getSwaps()
     c("finished permutations with",n_elements,"elements")
Code Example #7
File: mbox2rdf.py Project: ttm/gmane
    def __init__(self, snapshoturi, snapshotid, directory="somedir/",
                 data_path="../data/", final_path="./gmane_snapshots/",
                 umbrella_dir="gmane_snapshotsX/"):
        c(snapshoturi, snapshotid, directory)
        isego = False
        isgroup = True
        isfriendship = False
        isinteraction = True
        hastext = True
        interactions_anonymized = False

        translation_graph = "translation"
        meta_graph = "translation_meta"
        gmane_graph = "gmane"
        P.context(translation_graph, "remove")
        P.context(meta_graph, "remove")
        final_path_ = "{}{}/".format(final_path, snapshotid)
        online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(umbrella_dir, snapshotid)
        ncc = nto = nlines = nremoved_lines = nurls = nlost_messages = nparticipants = nreferences = totalchars = nreplies = nmessages = nempty = 0
        dates = []
        nchars_all = []
        ntokens_all = []
        nsentences_all = []
        participantvars = ["emailAddress", "name"]
        messagevars = ["author", "createdAt", "replyTo", "messageText",
                       "cleanMessageText", "nCharsClean", "nTokensClean",
                       "nSentencesClean", "hasUrl", "nChars", "nTokens",
                       "nSentences", "emptyMessage", "gmaneID", "subject",
                       "cc", "to", "hasReference", "contentType", "organization",
                       "unparsedCC", "unparsedTo", "emailList"]
        messagevars.sort()
        files = os.listdir(data_path+directory)
        if not files:
            self.comment = "no files on the snapshot id"
            return
        files.sort()
        nchars_all = []
        ntokens_all = []
        nsentences_all = []
        nchars_clean_all = []
        ntokens_clean_all = []
        nsentences_clean_all = []
        locals_ = locals().copy()
        del locals_["self"]
        for i in locals_:
            exec("self.{}={}".format(i, i))
        self.rdfMbox()
        if len(self.files) > self.nempty:
            if not os.path.isdir(final_path_):
                os.mkdir(final_path_)
            self.email_xml, self.size_xml, self.email_ttl, self.size_ttl = P.rdf.writeByChunks(
                self.final_path_+self.snapshotid+"Email", context=self.translation_graph, ntriples=100000)
            self.makeMetadata()
            self.writeAllGmane()
Code Example #8
 def writeAllIRC(self):
     # g = P.context(self.meta_graph)
     # ntriples = len(g)
     # triples = [
     #          (self.snapshoturi, po.nMetaTriples, ntriples+1),
     #          ]
     # P.add(triples, context=self.meta_graph)
     g = P.context(self.meta_graph)
     g.namespace_manager.bind("po", po)
     g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
     c("ttl")
     g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
     c("serialized meta")
Code Example #9
File: mbox2rdf.py Project: ttm/gmane
 def writeAllGmane(self):
     g = P.context(self.meta_graph)
     g.namespace_manager.bind("po", po)
     # ntriples = len(g)
     # triples = [
     #          (self.snapshoturi, po.nMetaTriples, ntriples),
     #          ]
     # P.add(triples, context=self.meta_graph)
     g.serialize(self.final_path_+self.snapshotid+"Meta.ttl", "turtle")
     c("ttl")
     g.serialize(self.final_path_+self.snapshotid+"Meta.rdf", "xml")
     c("serialized meta")
Code Example #10
 def writeAllTW(self):
     # write meta and readme with self.desc, then all is finished.
     g = P.context(self.meta_graph)
     # ntriples = len(g)
     # triples = [
     #          (self.snapshoturi, po.nMetaTriples, ntriples),
     #          ]
     # P.add(triples, context=self.meta_graph)
     g.namespace_manager.bind("po", po)
     g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
     c("ttl")
     g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
     c("serialized meta")
Code Example #11
File: tests.py Project: Zeigar/percolation-1
def testTextIO(endpoint_url):
    client = Client(endpoint_url)
    triples = [
        (test.Dummy, test.desc,  """áéíóúćçêôãõà"""),
        (test.Dummy, test.desc2, "Não concordo com a inclusão da palavra controle, sou a favor da manutenção do texto 'Política Nacional de Participação Social'.\n\nA inclusão desta palavra pode ser interpretada como o poder de controle de determinados atores. O uso de 'Política Nacional de Participação Social'atende mais ao intuito de promover um ambiente democrático e horizontal nas relações de participação civil."),
        (test.Dummy, test.desc3, "Não concordo com a inclusão da palavra controle, sou a favor da manutenção do texto 'Política Nacional de Participação Social'.\n\nA inclusão desta palavra pode ser interpretada como o poder de controle de determinados atores. O uso de 'Política Nacional de Participação Social'atende mais ao intuito de promover um ambiente democrático e horizontal nas relações de participação civil."),
        (test.Dummy, test.desc3, " \\o/".encode("utf8")),
        # (test.Dummy, test.desc, "t:w\
        # ex't\n\rte'st"çóṕxx%@#*%&%)(+_ ")
              ]
    client.insertTriples(triples, "text_graph")
    c("all graphs:", client.getAllGraphs())
    c("triples in text_graph:", client.getAllTriples("text_graph"))
Code Example #12
File: pickle2rdf.py Project: iSport59/social
 def rdfTweets(self):
     tweets=[]
     if self.pickle_filename1:
         tweets+=readPickleTweetFile( self.data_path+self.pickle_filename1)[0]
     if self.pickle_filename2:
          tweets,fopen=readPickleTweetChunk(self.data_path+self.pickle_filename2,tweets,None,10000) # limit chunk to 10k tweets
     chunk_count=0
     self.tweets=tweets # for probing only, remove to release memory
     while tweets:
         c("rendering tweets, chunk:",chunk_count,"ntweets:",len(tweets),"snapshotid",self.snapshotid)
         for tweet in tweets:
             tweeturi,triples=self.tweetTriples(tweet)
             if "retweeted_status" in tweet.keys():
                 self.nretweets+=1
                  tweeturi0,triples0=self.tweetTriples(tweet["retweeted_status"]) # triples of the original tweet
                 triples+=triples0
                 triples+=[(tweeturi,po.retweetOf,tweeturi0)]
             self.ntriples+=len(triples)
             P.set_(triples,context=self.tweet_graph)
             c("rendered",self.ntweets,"tweets")
         c("end of chunk:",chunk_count,"ntriples:",self.ntriples)
         self.writeTweets(chunk_count)
         c("chunk has been written")
         chunk_count+=1
         if chunk_count==2:
             break
         if self.pickle_filename2:
             tweets,fopen=readPickleTweetChunk(None,[],fopen,10000)
         else:
             tweets=[]
     for i in range(chunk_count): # free memory
         P.context(self.tweet_graph[:-1]+str(i),"remove")
Code Example #13
File: postgre2rdf.py Project: ttm/participation
 def writeRdf(self):
     pub_dir = './participabr_snapshot/'
     if not os.path.isdir(pub_dir):
         os.mkdir(pub_dir)
     g = P.context(self.translation_graph)
     g.serialize(pub_dir+'participabr.ttl', 'turtle')
     c('participation ttl serialized')
     g.serialize(pub_dir+'participabr.rdf', 'xml')
     c('participation xml serialized')
      # metadata: group, platform,
     triples = [
              (self.snapshoturi, a, po.Snapshot),
              # (self.snapshoturi, a, po.ParticipabrSnapshot),
              (self.snapshoturi, po.snapshotID, self.snapshotid),
              (self.snapshoturi, po.isEgo, False),
              (self.snapshoturi, po.isGroup, True),
              (self.snapshoturi, po.isFriendship, True),
              (self.snapshoturi, po.isInteraction, True),
              (self.snapshoturi, po.isPost, True),
              (self.snapshoturi, po.socialProtocol, 'ParticipaBR'),
              (self.snapshoturi, po.dateObtained, datetime.date(2012, 6, 28)),
              ]
     P.add(triples, self.meta_graph)
     g = P.context(self.meta_graph)
     g.serialize(pub_dir+'participabrMeta.ttl', 'turtle')
     c('participation meta ttl serialized')
     g.serialize(pub_dir+'participabrMeta.rdf', 'xml')
     c('participation meta xml serialized')
Code Example #14
File: publishing.py Project: Zeigar/percolation-1
    def writeAll(self):
        g=P.context(self.meta_graph)
        ntriples=len(g)
        triples=[
                 (self.snapshoturi,po.nMetaTriples,ntriples)      ,
                 ]
        P.add(triples,context=self.meta_graph)
        g.namespace_manager.bind("po",po)
        g.serialize(self.final_path_+self.snapshotid+"Meta.ttl","turtle"); c("ttl")
        g.serialize(self.final_path_+self.snapshotid+"Meta.rdf","xml")
        c("serialized meta")
        if not os.path.isdir(self.final_path_+"scripts"):
            os.mkdir(self.final_path_+"scripts")
        shutil.copy(PACKAGEDIR+"/../tests/triplify.py",self.final_path_+"scripts/triplify.py")
        # copy of the base data

        self.dates=[i.isoformat() for i in self.dates]
        date1=min(self.dates)
        date2=max(self.dates)
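        # assumption: the full class builds these summary strings upstream;
        # empty placeholders keep this excerpt self-contained
        tinteraction = tposts = ""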
        with open(self.final_path_+"README","w") as f:
            f.write("""::: Open Linked Social Data publication
\nThis repository is an RDF data expression of the IRC
snapshot {snapid} with messages from {date1} to {date2}
(total of {ntrip} triples).{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is in the scripts/ directory.\n:::""".format(
                snapid=self.snapshotid,date1=date1,date2=date2,ntrip=self.ntriples,
                        tinteraction=tinteraction,
                        tposts=tposts,
                        mrdf=self.translation_xml,
                        mttl=self.translation_ttl,
                        ise=self.isego,
                        isg=self.isgroup,
                        isf=self.isfriendship,
                        isi=self.isinteraction,
                        ist=self.hastext,
                        ava=self.online_prefix,
                        desc=self.desc
                        ))
Code Example #15
File: render.py Project: iSport59/social
def publishAll(snapshoturis=None):
    """express tweets as RDF for publishing"""
    if not snapshoturis:
        c("getting twitter snapshots, implementation needs verification TTM")
        uridict={}
        for snapshoturi in P.get(None,a,NS.po.TwitterSnapshot,minimized=True):
            uridict[snapshoturi]=0
            for rawFile in P.get(snapshoturi,NS.po.rawFile,strict=True,minimized=True):
                uridict[snapshoturi]+=P.get(rawFile,NS.po.fileSize,minimized=True).toPython()
        snapshoturis=[i for i in list(uridict.keys()) if i.endswith(".gml")]
        snapshoturis.sort(key=lambda x: uridict[x])
    triplification_class=None
    for snapshoturi in snapshoturis:
        triplification_class=publishAny(snapshoturi)
    #writePublishingReadme()
    return triplification_class
Code Example #16
File: publishing.py Project: Zeigar/percolation-1
 def writeTranslates(self,mode="full"):
     c("mode full or chunk or multigraph write:",mode)
     if mode=="full":
         g=P.context(self.translation_graph)
         self.translation_ttl=self.snapshotid+"Translation.ttl"
         self.translation_xml=self.snapshotid+"Translation.rdf"
         g.serialize(self.final_path_+self.translation_ttl,"turtle"); c("ttl")
         g.serialize(self.final_path_+self.translation_xml,"xml")
         self.translation_size_ttl=os.path.getsize(self.final_path_+self.translation_ttl)/10**6
         self.translation_size_xml=os.path.getsize(self.final_path_+self.translation_xml)/10**6
         self.ntranslation_triples=len(g)
     elif mode=="chunk":
         # writeByChunks
         raise NotImplementedError("Perform P.utils.writeByChunks on self.translation_graph")
     elif mode=="multigraph":
         raise NotImplementedError("Perform serialize(write) on each of the self.translation_graphs")
Code Example #17
File: general.py Project: ttm/participation
 def writeTranslates(self,mode="full"):
     c("mode full or chunk or multigraph write:",mode)
     if mode=="full":
         g=P.context(self.translation_graph)
         self.translation_ttl=self.snapshotid+"Translation.ttl"
         self.translation_xml=self.snapshotid+"Translation.rdf"
         g.serialize(self.final_path_+self.translation_ttl,"turtle"); c("ttl")
         g.serialize(self.final_path_+self.translation_xml,"xml")
         self.size_ttl=os.path.getsize(self.final_path_+self.translation_ttl)/10**6
         self.size_xml=os.path.getsize(self.final_path_+self.translation_xml)/10**6
         self.ntranslation_triples=len(g)
     elif mode=="chunk":
         # writeByChunks
         raise NotImplementedError("Perform P.utils.writeByChunks on self.translation_graph")
     elif mode=="multigraph":
         raise NotImplementedError("Perform serialize(write) on each of the self.translation_graphs")
Code Example #18
File: render.py Project: ttm/gmane
def publishAll(snapshoturis=None):
    """express emails as RDF for publishing"""
    if not snapshoturis:
        c("getting email snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.Snapshot, minimized=True):
            uridict[snapshoturi] = 0
            for rawFile in P.get(snapshoturi, NS.po.rawFile, strict=True, minimized=True):
                uridict[snapshoturi] += P.get(rawFile, NS.po.directorySize, minimized=True).toPython()
        snapshoturis = list(uridict.keys())  # materialize gathered URIs before sorting
        snapshoturis.sort(key=lambda x: uridict[x])
    c("on triplification")
    triplification_classes = []
    for snapshoturi in list(snapshoturis)[:10]:
        triplification_classes += [publishAny(snapshoturi)]
    # writePublishingReadme()
    return triplification_classes
Code Example #19
File: mysql2rdf.py Project: ttm/participation
 def writeRdf(self):
     pub_dir = './cidadedemocratica_snapshot/'
     if not os.path.isdir(pub_dir):
         os.mkdir(pub_dir)
     # g = P.context(self.translation_graph)
     # g.serialize(pub_dir+'cidadedemocratica.ttl', 'turtle')
     # c('participation ttl serialized')
     # g.serialize(pub_dir+'cidadedemocratica.rdf', 'xml')
     # c('participation xml serialized')
     P.rdf.writeByChunks(pub_dir+'cidadedemocratica',
                         context=self.translation_graph,
                         ntriples=100000)
      # metadata: group, platform,
     g = P.context(self.meta_graph)
     g.serialize(pub_dir+'cidadedemocraticaMeta.ttl', 'turtle')
     c('participation meta ttl serialized')
     g.serialize(pub_dir+'cidadedemocraticaMeta.rdf', 'xml')
     c('participation meta xml serialized')
Code Example #20
File: access.py Project: ttm/participation
def parseLegacyFiles(mysqldb=True, mongoshouts=True, irclog=True,
                     oreshouts=True):
    """Parse legacy files with aa shouts and sessions"""
    # access mysql, access mongo, access irc log from social/
    c("starting aa access")
    if mysqldb:
        mysqldb = connectMysql()
        c("mysql ok")
    if mongoshouts:
        mongoshouts = connectMongo()
        c("mongo ok")
    if irclog:
        irclog = accessIrcLog()
        c("irc ok")
    if oreshouts:
        oreshouts = accessOreShouts()
        c("ore ok")
    return mysqldb, mongoshouts, irclog, oreshouts
Code Example #21
File: mysql2rdf.py Project: ttm/participation
 def translateLoginHistory(self):
     triples = []
     for login in self.data["historico_de_logins"]:
         lid = login[0]
         uid = login[1]
         created = login[2]
         ip = login[3]
         uri = P.rdf.ic(po.Login,
                        self.snapshotid+"-"+str(lid),
                        self.translation_graph, self.snapshoturi)
         triples += [
                 (uri, po.participant,
                     po.Participant+'#'+self.snapshotid+'-'+str(uid)),
                 (uri, po.createdAt, created),
                 (uri, po.ip, ip)
         ]
     P.add(triples, self.translation_graph)
     c("finished add of login entries")
Code Example #22
File: mysql2rdf.py Project: ttm/participation
 def __init__(self):
     self.snapshoturi = P.rdf.ic(po.Snapshot,
                                 self.snapshotid, self.meta_graph)
     c("get data")
     self.getData()
     c("start translate")
     self.translateToRdf()
     self.makeMeta()
     c("start render")
     self.writeRdf()
     c("finished render")
Code Example #23
File: mysql2rdf.py Project: ttm/participation
 def translateMacrotags(self):
     triples = []
     for mt in self.data["macro_tags"]:
         mtid = mt[0]
         title = mt[1]
         created = mt[2]
         updated = mt[3]
         uri = P.rdf.ic(po.Macrotag,
                        self.snapshotid+"-"+str(mtid),
                        self.translation_graph, self.snapshoturi)
         triples.append((uri, po.createdAt, created))
         if updated != created:
             triples += [
                        (uri, po.updatedAt, updated),
                        ]
         if title:
             triples.append((uri, po.title, title))
     P.add(triples, self.translation_graph)
      c("finished add of macrotag entries")
Code Example #24
File: postgre2rdf.py Project: ttm/participation
 def __init__(self, postgresql_cursor, profiles=True, articles=True,
              comments=True):
     snapshoturi = P.rdf.ic(po.Snapshot,
                            self.snapshotid, self.meta_graph)
     # P.add((snapshoturi, a, po.Snapshot), context=self.translation_graph)
     cur = postgresql_cursor
     datas2 = []
     datas = []
     bodies = []
     abstracts = []
     locals_ = locals().copy()
     del locals_["self"]
     for i in locals_:
         exec("self.{}={}".format(i, i))
     c("get data")
     self.getData(cur)
     c("start translate")
     self.translateToRdf()
     self.writeRdf()
Code Example #25
File: pickle2rdf.py Project: iSport59/social
 def writeTweets(self, chunk_count):
     if not os.path.isdir(self.final_path):
         os.mkdir(self.final_path)
     if not os.path.isdir(self.final_path_):
         os.mkdir(self.final_path_)
     filename = self.snapshotid + "Tweet{:05d}".format(chunk_count)
     g = P.context(self.tweet_graph)
     g.namespace_manager.bind("po", po)
     tttl = filename + ".ttl"
     trdf = filename + ".rdf"
     g.serialize(self.final_path_ + tttl, "turtle")
     c("ttl")
     g.serialize(self.final_path_ + trdf, "xml")
     filesizettl = os.path.getsize(self.final_path_ + tttl) / (10**6)
     filesizerdf = os.path.getsize(self.final_path_ + trdf) / (10**6)
     self.tweet_ttl += [tttl]
     self.size_ttl += [filesizettl]
     self.tweet_rdf += [trdf]
     self.size_rdf += [filesizerdf]
     self.tweet_graph = self.tweet_graph[:-1] + str(chunk_count + 1)
Code Example #26
 def __init__(self,
              snapshoturi,
              snapshotid,
              filename="foo.txt",
              data_path="../data/irc/",
              final_path="./irc_snapshots/",
              umbrella_dir="irc_snapshots/"):
     c(snapshoturi, snapshotid, filename)
     isego = False
     isgroup = True
     isfriendship = False
     isinteraction = True
     hastext = True
     interactions_anonymized = False
     irc_graph = "social_log"
     meta_graph = "social_irc_meta"
     social_graph = "social_irc"
     P.context(irc_graph, "remove")
     P.context(meta_graph, "remove")
     final_path_ = "{}{}/".format(final_path, snapshotid)
     online_prefix = "https://raw.githubusercontent.com/OpenLinkedSocialData/{}master/{}/".format(
         umbrella_dir, snapshotid)
     naamessages = nurls = ndirect = nmention = 0
     dates = []
     nchars_all = []
     ntokens_all = []
     nsentences_all = []
     participantvars = ["nick"]
     messagevars = [
         "author", "createdAt", "mentions", "directedTo", "systemMessage",
         "text", "cleanMessageText", "nChars", "nTokens", "nSentences",
         "url", "emptyMessage"
     ]
     messagevars.sort()
     locals_ = locals().copy()
     del locals_["self"]
     for i in locals_:
         exec("self.{}={}".format(i, i))
     self.rdfLog()
     self.makeMetadata()
     self.writeAllIRC()
Code Example #27
File: render.py Project: land-pack/gmane
def publishAll(snapshoturis=None):
    """express emails as RDF for publishing"""
    if not snapshoturis:
        c("getting email snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None, a, NS.po.GmaneSnapshot, minimized=True):
            uridict[snapshoturi] = 0
            for rawFile in P.get(snapshoturi,
                                 NS.po.rawFile,
                                 strict=True,
                                 minimized=True):
                uridict[snapshoturi] += P.get(rawFile,
                                              NS.po.directorySize,
                                              minimized=True).toPython()
        snapshoturis = list(uridict.keys())  # materialize gathered URIs before sorting
        snapshoturis.sort(key=lambda x: uridict[x])
    c("on triplification")
    triplification_classes = []
    for snapshoturi in list(snapshoturis)[:10]:
        triplification_classes += [publishAny(snapshoturi)]
    #writePublishingReadme()
    return triplification_classes
Code Example #28
File: mysql2rdf.py Project: ttm/participation
 def translateObservatories(self):
     count = 0
     triples = []
     for observatorio in self.data["observatorios"]:
         oid = observatorio[0]
         uid = observatorio[1]
         created = observatorio[4]
         updated = observatorio[5]
         uri = P.rdf.ic(po.Observatory,
                        self.snapshotid+"-"+str(oid),
                        self.translation_graph, self.snapshoturi)
         triples += [
                  (uri, po.participant,
                      po.Participant+'#'+self.snapshotid+'-'+str(uid)),
                 (uri, po.createdAt, created),
         ]
         if updated != created:
             triples += [
                        (uri, po.updatedAt, updated),
                        ]
         count += 1
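          # flush accumulated triples every 60 rows to bound memory use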
          if count % 60 == 0:
              c("finished observatory entries:", count, "ntriples:", len(triples))
              P.add(triples, self.translation_graph)
              c("finished add of observatory entries")
              triples = []
      if triples:
          P.add(triples, self.translation_graph)
     c("finished add of observatory entries")
Code Example #29
File: mysql2rdf.py Project: ttm/participation
 def translateLinks(self):
     count = 0
     triples = []
     for link in self.data['links']:
         lid = link[0]
         nome = link[1]
         url = link[2]
         tid = link[4]
         created = link[5]
         updated = link[6]
         uri = P.rdf.ic(po.Link,
                        self.snapshotid+"-"+str(lid),
                        self.translation_graph, self.snapshoturi)
         triples += [
                 (uri, po.name, nome),
                 (uri, po.url, url),
                 (uri, po.topic,
                     po.Topic+'#'+self.snapshotid+'-'+str(tid)),
                 (uri, po.createdAt, created)
         ]
         if updated != created:
             triples += [
                        (uri, po.updatedAt, updated),
                        ]
         count += 1
         if count % 60 == 0:
             c("finished links entries:", count, "ntriples:", len(triples))
             P.add(triples, self.translation_graph)
             c("finished add of links entries")
             triples = []
      if triples:
          P.add(triples, self.translation_graph)
     c("finished add of links entries")
Code Example #30
File: mysql2rdf.py Project: ttm/participation
 def translateSupporters(self):
     count = 0
     triples = []
     for adesao in self.data["adesoes"]:
         tid = adesao[0]
         uid = adesao[1]
         created = adesao[2]
         updated = adesao[3]
         aid = adesao[4]
         uri = P.rdf.ic(po.Support,
                        self.snapshotid+"-"+str(aid),
                        self.translation_graph, self.snapshoturi)
         triples += [
                 (uri, po.participant,
                     po.Participant+'#'+self.snapshotid+'-'+str(uid)),
                 (uri, po.topic,
                     po.Topic+'#'+self.snapshotid+'-'+str(tid)),
                 (uri, po.createdAt, created),
         ]
         if updated != created:
             triples += [
                        (uri, po.updatedAt, updated),
                        ]
         count += 1
         if count % 60 == 0:
             c("finished supporters entries:", count, "ntriples:", len(triples))
             P.add(triples, self.translation_graph)
             c("finished add of supporters entries")
             triples = []
      if triples:
          P.add(triples, self.translation_graph)
     c("finished add of supporters entries")
Code Example #31
File: mysql2rdf.py Project: ttm/participation
 def translateStates(self):
     count = 0
     triples = []
     for estado in self.data["estados"]:
         gid = estado[0]
         nome = estado[1]
         abr = estado[2]
         created = estado[3]
         updated = estado[4]
         relevance = estado[5]
         uri = P.rdf.ic(po.State,
                        self.snapshotid+"-"+str(gid),
                        self.translation_graph, self.snapshoturi)
         triples += [
                 (uri, po.name, nome),
                 (uri, po.abbreviation, abr),
                 (uri, po.createdAt, created),
                 (uri, po.relevance, relevance),
         ]
         if updated != created:
             triples += [
                        (uri, po.updatedAt, updated),
                        ]
         count += 1
         if count % 60 == 0:
             c("finished states entries:", count, "ntriples:", len(triples))
             P.add(triples, self.translation_graph)
             c("finished add of states entries")
             triples = []
      if triples:
          P.add(triples, self.translation_graph)
     c("finished add of states entries")
Code Example #32
File: mysql2rdf.py Project: ttm/participation
 def translateCities(self):
     count = 0
     triples = []
     for cidade in self.data["cidades"]:
         cid = cidade[0]
         nome = cidade[1]
         eid = cidade[2]
         slug = cidade[3]
         created = cidade[4]
         updated = cidade[5]
         relevance = cidade[6]
         uri = P.rdf.ic(po.City,
                        self.snapshotid+"-"+str(cid),
                        self.translation_graph, self.snapshoturi)
         triples += [
                 (uri, po.name, nome),
                  (uri, po.state,
                      po.State+'#'+self.snapshotid+'-'+str(eid)),
                 (uri, po.slug, slug),
                 (uri, po.createdAt, created),
                 (uri, po.relevance, relevance)
         ]
         if updated != created:
             triples += [
                        (uri, po.updatedAt, updated),
                        ]
         count += 1
         if count % 60 == 0:
              c("finished cities entries:", count, "ntriples:", len(triples))
             P.add(triples, self.translation_graph)
             c("finished add of cities entries")
             triples = []
      if triples:
          P.add(triples, self.translation_graph)
     c("finished add of cities entries")
Code Example #33
File: mysql2rdf.py Project: ttm/participation
 def translateNeighborhoods(self):
     count = 0
     triples = []
     for bairro in self.data["bairros"]:
         bid = bairro[0]
         nome = bairro[1]
         cid = bairro[2]
         created = bairro[3]
         updated = bairro[4]
         relevance = bairro[5]
         uri = P.rdf.ic(po.Neighborhood,
                        self.snapshotid+"-"+str(bid),
                        self.translation_graph, self.snapshoturi)
         triples += [
                 (uri, po.name, nome),
                 (uri, po.city,
                     po.City+'#'+self.snapshotid+'-'+str(cid)),
                 (uri, po.createdAt, created),
                 (uri, po.relevance, relevance)
         ]
         if updated != created:
             triples += [
                        (uri, po.updatedAt, updated),
                        ]
         count += 1
         if count % 60 == 0:
             c("finished neighborhood entries:", count, "ntriples:", len(triples))
             P.add(triples, self.translation_graph)
             c("finished add of neighborhood entries")
             triples = []
      if triples:
          P.add(triples, self.translation_graph)
     c("finished add of neighborhood entries")
Code Example #34
File: postgre2rdf.py Project: ttm/participation
 def translateVotes(self):
     triples = []
     commentids = set(self.comments_table.get("id"))
     count = 0
     for id_, vote, voteable_id, voteable_type,\
         voter_id, voter_type, created_at in \
         self.votes_table.getMany(
                 ("id", "vote", "voteable_id",
                  "voteable_type", "voter_id", "voter_type", "created_at")):
         assert isinstance(id_, int)
         assert isinstance(voteable_id, int)
         assert isinstance(created_at, datetime.datetime)
         voteuri = P.rdf.ic(po.Vote, self.snapshotid+"-"+str(id_),
                            self.translation_graph, self.snapshoturi)
         if voteable_type == "Article":
             type__ = self.articletypes[voteable_id].split("::")[-1]
             # referenceuri = \
             #     eval("po."+type__)+"#"+self.snapshotid+"-"+str(voteable_id)
             referenceuri = \
                 po.Article+"#"+self.snapshotid+"-"+str(voteable_id)
         elif voteable_type == "Comment":
             assert voteable_id in commentids
             referenceuri = \
                 po.Comment+"#"+self.snapshotid+"-"+str(voteable_id)
         else:
             raise ValueError("unexpected voteable type")
         triples += [
                    (voteuri, po.createdAt, created_at),
                    (voteuri, po.vote, vote),
                    (voteuri, po.reference, referenceuri),
                    ]
         if voter_id:
             assert voter_type == "Profile"
             assert isinstance(voter_id, int)
             participanturi = po.Participant + '#' + \
                 self.snapshotid+"-"+self.profileids[voter_id]
             triples += [
                        (voteuri, po.author, participanturi),
                        ]
         count += 1
         if count % 100 == 0:
             c("votes done:", count)
             c("ntriples:", len(triples))
             P.add(triples, self.translation_graph)
             c("finished add of votes")
             triples = []
     if triples:
         c("ntriples:", len(triples))
         P.add(triples, self.translation_graph)
Code Example #35
File: render.py Project: shadowridgedev/social
def publishAll(snapshoturis=None):
    """express tweets as RDF for publishing"""
    if not snapshoturis:
        c("getting twitter snapshots, implementation needs verification TTM")
        uridict = {}
        for snapshoturi in P.get(None,
                                 a,
                                 NS.po.TwitterSnapshot,
                                 minimized=True):
            uridict[snapshoturi] = 0
            for rawFile in P.get(snapshoturi,
                                 NS.po.rawFile,
                                 strict=True,
                                 minimized=True):
                uridict[snapshoturi] += P.get(rawFile,
                                              NS.po.fileSize,
                                              minimized=True).toPython()
        snapshoturis = [i for i in list(uridict.keys()) if i.endswith(".gml")]
        snapshoturis.sort(key=lambda x: uridict[x])
    triplification_class = None
    for snapshoturi in snapshoturis:
        triplification_class = publishAny(snapshoturi)
    return triplification_class
Code Example #36
def parseLegacyFiles(data_dir=DATADIR+"irc/"):
    """Parse legacy txt files with irc logs"""
    filenames=os.listdir(data_dir)
    filenames=[i for i in filenames if i!="ipython_log.py" and not i.endswith(".swp")]

    snapshots=set()
    triples=[]
    for filename in filenames:
        snapshotid="irc-legacy-"+filename.replace("#","")
        snapshoturi=po.IRCSnapshot+"#"+snapshotid
        expressed_classes=[po.Participant,po.IRCMessage]
        expressed_reference=filename.replace("#","").replace(".txt","").replace(".log","")
        name_humanized="IRC log of channel "+expressed_reference
        filesize=os.path.getsize(data_dir+filename)/10**6
        fileformat="txt"
        fileuri=po.File+"#Irc-log-"+filename.replace("#","")
        triples+=[
                 (snapshoturi,a,po.Snapshot),
                 (snapshoturi,a,po.IRCSnapshot),
                 (snapshoturi,po.snapshotID,snapshotid),
                 (snapshoturi, po.isEgo, False),
                 (snapshoturi, po.isGroup, True),
                 (snapshoturi, po.isFriendship, False),
                 (snapshoturi, po.isInteraction, True),
                 (snapshoturi, po.isPost, True),
                 (snapshoturi, po.humanizedName, name_humanized),
                 (snapshoturi, po.expressedReference, expressed_reference),
                 (snapshoturi, po.rawFile, fileuri),
                 (fileuri,     po.fileSize, filesize),
                 (fileuri,     po.fileName, filename),
                 (fileuri,     po.fileFormat, fileformat),
                 ]+[
                 (fileuri,    po.expressedClass, expressed_class) for expressed_class in expressed_classes
                 ]
        snapshots.add(snapshoturi)
    nfiles=len(filenames)
    nsnapshots=len(snapshots)
    P.context("social_irc","remove")
    platformuri=P.rdf.ic(po.Platform,"IRC",context="social_irc")
    triples+=[
             (NS.social.Session,NS.social.nIRCParsedFiles,nfiles),
             (NS.social.Session,NS.social.nIRCSnapshots,nsnapshots),
             (platformuri, po.dataDir,data_dir),
             ]
    P.add(triples,context="social_irc")
    c("parsed {} irc log files ({} snapshots) are in percolation graph and 'social_irc' context".format(nfiles,nsnapshots))
    c("percolation graph has {} triples ({} in social_irc context)".format(len(P.percolation_graph),len(P.context("social_irc"))))
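    # tally snapshot flags and total raw size via SPARQL counts over the
    # social_irc graph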
    negos=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE         { GRAPH <social_irc> { ?s po:isEgo true         } } ")
    ngroups=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE       { GRAPH <social_irc> { ?s po:isGroup true       } } ")
    nfriendships=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE  { GRAPH <social_irc> { ?s po:isFriendship true  } } ")
    ninteractions=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <social_irc> { ?s po:isInteraction true } } ")
    nposts=P.query(r" SELECT (COUNT(?s) as ?cs) WHERE        { GRAPH <social_irc> { ?s po:isPost true        } } ")
    totalsize=sum(P.query(r" SELECT ?size WHERE              { GRAPH <social_irc> { ?s po:fileSize ?size     } } "))
    c("""{} are ego snapshots, {} are group snapshots
{} have friendship structures. {} have interaction structures. {} have texts.
Total raw data size is {:.2f}MB""".format(negos,ngroups,nfriendships,ninteractions,nposts,totalsize))
    return snapshots
Code Example #37
File: mysql2rdf.py Project: ttm/participation
    def translateComments(self):
        trans = {'resposta': 'answer',
                 'pergunta': 'question',
                 'comentario': 'comment',
                 'ideia': 'idea'}
        triples = []
        count = 0
        for comment in self.data['comments']:
            cid = comment[0]
            tid = comment[1]  # topic id
            body = comment[3]
            if not body:
                continue
            body = body.replace('', '')  # no-op as rendered; the character being stripped was likely lost in extraction
            uid = comment[4]
            ctype = comment[8]
            created = comment[9]
            updated = comment[10]

            assert isinstance(cid, int)
            assert isinstance(tid, int)
            assert isinstance(body, str)
            assert isinstance(uid, int)
            assert isinstance(ctype, str)
            assert isinstance(created, datetime.datetime)
            assert isinstance(updated, datetime.datetime)
            commenturi = P.rdf.ic(po.Comment,
                                  self.snapshotid+"-"+str(cid),
                                  self.translation_graph, self.snapshoturi)
            participanturi = po.Participant+'#'+self.snapshotid+"-"+str(uid)
            # topicuri = self.topicuris[tid]
            topicuri = po.Topic+'#'+self.snapshotid+'-'+str(tid)
            triples += [
                (commenturi, po.author, participanturi),
                (commenturi, po.topic, topicuri),
                (commenturi, po.text, body),
                # (commenturi, po.nChars, len(body)),
                (commenturi, po.type, trans[ctype]),
                (topicuri, po.createdAt, created),
            ]
            if updated != created:
                triples.append(
                    (topicuri, po.updatedAt, updated),
                )
            count += 1
            if count % 60 == 0:
                c("finished comment entries:", count, "ntriples:", len(triples))
                P.add(triples, self.translation_graph)
                c("finished add of comment entries")
                triples = []
        if triples:
            P.add(triples, self.translation_graph)
        c("finished add of comment entries")
Code Example #38
File: postgre2rdf.py Project: ttm/participation
 def translateFriendships(self):
     triples = []
     fids = self.friendships_table.getMany(("person_id", "friend_id"))
     added_friendships = []
     count = 0
     for person_id, friend_id, created_at, group in \
             self.friendships_table.getMany(
                 ('person_id', 'friend_id', 'created_at', 'group')):
          # record each pair once; note the reverse pair still gets its own
          # friendship triples below
          if [friend_id, person_id] not in added_friendships:
              added_friendships += [[person_id, friend_id]]
         id0 = self.profileids[person_id]
         id1 = self.profileids[friend_id]
         friendshipuri = P.rdf.ic(po.Friendship,
                                  self.snapshotid+'-'+id0+'-'+id1,
                                  self.translation_graph, self.snapshoturi)
         participanturi0 = po.Participant+"#"+self.snapshotid+"-"+id0
         participanturi1 = po.Participant+"#"+self.snapshotid+"-"+id1
         assert isinstance(created_at, datetime.datetime)
         triples += [
                    (friendshipuri, po.member, participanturi0),
                    (friendshipuri, po.member, participanturi1),
                    (friendshipuri, po.createdAt, created_at),
                    ]
         if [friend_id, person_id] not in fids:
             triples += [
                        (participanturi0, po.knows, participanturi1),
                        ]
         if group:
             triples += [
                        (friendshipuri, po.socialCircle, group),
                        ]
         count += 1
         if count % 100 == 0:
             c("done friendships:", count)
             c("ntriples:", len(triples))
             P.add(triples, self.translation_graph)
             c("finished add of friendships")
             triples = []
     if triples:
         c("ntriples:", len(triples))
         P.add(triples, self.translation_graph)
Code Example #39
File: mysql2rdf.py Project: ttm/participation
 def translatePlaces(self):
     count = 0
     triples = []
     for local in self.data["locais"]:
         lid = local[0]
         rid = local[1]
         rtype = local[2]
         bid = local[3]
         cid = local[4]
         created = local[7]
         updated = local[8]
         cep = local[9]
         eid = local[10]
         uri = P.rdf.ic(po.Place,
                        self.snapshotid+"-"+str(lid),
                        self.translation_graph, self.snapshoturi)
         triples += [(uri, po.createdAt, created)]
         if bid:
             triples.append((uri, po.neighborhood,
                             po.Neighborhood+'#'+self.snapshotid+'-'+str(bid)))
         if cid:
             triples.append((uri, po.city,
                             po.City+'#'+self.snapshotid+'-'+str(cid)))
         if eid:
             triples.append((uri, po.state,
                             po.State+'#'+self.snapshotid+'-'+str(eid)))
         if cep:
             triples.append((uri, po.cep, cep))
         if updated != created:
             triples += [
                        (uri, po.updatedAt, updated),
                        ]
         if rtype == "Topico":
             uri_ = po.Topic+'#'+self.snapshotid+'-'+str(rid)
         elif rtype == "User":
             uri_ = po.User+'#'+self.snapshotid+'-'+str(rid)
         elif rtype == "Competition":
             uri_ = po.Competition+'#'+self.snapshotid+'-'+str(rid)
         elif rtype == "Observatorio":
             uri_ = po.Observatory+'#'+self.snapshotid+'-'+str(rid)
         if rtype:
             triples.append((uri, po.accountable, uri_))
         count += 1
         if count % 60 == 0:
             c("finished places entries:", count, "ntriples:", len(triples))
             P.add(triples, self.translation_graph)
             c("finished add of places entries")
             triples = []
      if triples:
          P.add(triples, self.translation_graph)
     c("finished add of places entries")
Code Example #40
    def rdfTweets(self):
        tweets = []
        if self.pickle_filename1:
            tweets += readPickleTweetFile(self.data_path +
                                          self.pickle_filename1)[0]
        if self.pickle_filename2:
            # limit chuck to 10k tweets
            tweets, fopen = readPickleTweetChunk(
                self.data_path + self.pickle_filename2, tweets, None, 10000)
        chunk_count = 0
        # self.tweets = tweets  # for debugging only, remove to release memory
        while tweets:
            c("rendering tweets, chunk:", chunk_count, "ntweets:", len(tweets),
              "snapshotid", self.snapshotid)
            count = 0

            for tweet in tweets:
                tweeturi, triples = self.tweetTriples(tweet)
                if "retweeted_status" in tweet.keys():
                    # self.nretweets += 1
                    tweeturi0, triples0 = self.tweetTriples(
                        tweet['retweeted_status'])
                    triples.extend(triples0)
                    triples.append((tweeturi, po.retweetOf, tweeturi0))
                self.ntriples += len(triples)
                P.add(triples, context=self.tweet_graph)
                count += 1
                if count % 1000 == 0:
                    c("triplified", count, "tweets")
            c("end of chunk:", chunk_count, "ntriples:", self.ntriples)
            self.writeTweets(chunk_count)
            c("chunk has been written")
            chunk_count += 1
            # if chunk_count == 2:
            #     break
            if self.pickle_filename2:
                tweets, fopen = readPickleTweetChunk(None, [], fopen, 10000)
            else:
                tweets = []
Code Example #41
# Note 60 is C4, 261.63Hz. Convert with e.g.:
# f = M.utils.midi2Hz(scale_grid[i])
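# (midi2Hz implements the standard mapping f = 440 * 2**((m - 69) / 12),
# so m = 60 yields 261.63 Hz)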

# Pivots to use recurrently:
pivots = [7 * i for i in range(3, 8)]
pivots_m = [scale_grid[i] for i in pivots]
pivots_f = [M.utils.midi2Hz(i) for i in pivots_m]

# Plain changes with 2-7 bells:
peal2 = M.structures.symmetry.PlainChanges(2)
peal3 = M.structures.symmetry.PlainChanges(3)
peal4 = M.structures.symmetry.PlainChanges(4)
peal5 = M.structures.symmetry.PlainChanges(5)
t = time.time()
peal6 = M.structures.symmetry.PlainChanges(6, 4)
c('Finished making peals 2-6')
peal7 = M.structures.symmetry.PlainChanges(7, 5)
c('Finished making peal 7')

# This one takes too long, maybe save as a pickle file:
# peal12 = M.structures.symmetry.PlainChanges(12,10)
# If only part of interesting permutations are desired,
# one might also do:
# >>> R = M.structures.permutations.InterestingPermutations
# >>> R.nelements=12
# >>> R.method='dimino'
# >>> R.getRotations()
# >>> R.getRotations(R)
# >>> R.rotations
# which is very fast
Code Example #42
File: mbox2rdf.py Project: land-pack/gmane
def getText(message):
    while message.is_multipart():
        message = message.get_payload()[0]
    charsets = message.get_charsets()
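    # decode with the declared charset when possible, falling back through
    # latin1 and error-ignoring utf-8 below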
    try:
        text = message.get_payload(decode=True)
    except AssertionError:
        text = ""
    if len(charsets) == 1 and text:
        charset = charsets[0]
        if charset:
            try:
                text = text.decode(charset)
            except LookupError:
                c("+++ lookup error in decoding message; charset:", charset)
                try:
                    text = text.decode()
                except UnicodeDecodeError:
                    try:
                        text = text.decode("latin1")
                        c("+++ used latin1 (no errors)", charset)
                    except UnicodeDecodeError:
                        text = text.decode(errors="ignore")
                        c(
                            "+-- unicode decode error in decoding message; used utf8 but charset:",
                            charset)
            except UnicodeDecodeError:
                # c(text,charset)
                c("--- unicode error:", charset)
                try:
                    text = text.decode("latin1")
                    c("--- used latin1 (no errors)", charset)
                except UnicodeDecodeError:
                    try:
                        text = text.decode(charset, errors="ignore")
                        c("--+ removed errors in decoding message; charset:",
                          charset)
                    except LookupError:
                        text = text.decode(errors="ignore")
                        c(
                            "-++ lookup error in decoding message; used utf8 but charset:",
                            charset)
        else:
            #            c("*** charset is empty string or None. Might need encoding.")
            try:
                text = text.decode()
            except UnicodeDecodeError:
                try:
                    text = text.decode("latin1")
                    c("**+ used latin1 (no errors)", charset)
                except UnicodeDecodeError:
                    text = text.decode(errors="ignore")
                    c("*++ decoded with utf8 and removed errors", charset)
    elif len(charsets) == 0 and text:
        text = text.decode()
    elif text:
        raise ValueError("more than one charset at the lowest payload leaf")
    elif not text:
        text = ""
    assert isinstance(text, str)
    content_type = message.get_content_type()
    if content_type == "text/html":
        text = ''.join(bs4.BeautifulSoup(text, 'html.parser').findAll(text=True))
    elif content_type == "text/plain":
        pass
    #elif "text/plain" in content_type:
    elif "text" in content_type:
        c("WARNING: admitted text without fully understood content type")
    else:
        text = ""
        c("=== Lowest not multipart payload. Should not be translated to rdf")
        c("content_type", content_type)
    return P.utils.cleanText(text)
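
A usage sketch for getText (the mbox path is hypothetical; it assumes the
module's own imports, e.g. mailbox, bs4 and the P.utils.cleanText helper):

import mailbox

mbox = mailbox.mbox("/tmp/example.mbox")  # hypothetical file
if mbox.keys():
    body = getText(mbox[0])  # always returns a str, possibly empty
    print(len(body))
mbox.close()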
Code example #46
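# excerpt, presumably from an IteratorSynth method: each listed sequence
# advances cyclically, exposing its current element through a state variable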
        for sequence, state_var, position in zip(sequences, state_vars, positions):
            if position not in dir(self):
                self.__dict__[position] = 0
            self.__dict__[state_var] = self.__dict__[sequence][self.__dict__[position]]
            self.__dict__[position] += 1
            self.__dict__[position] %= len(self.__dict__[sequence])
isynth = IteratorSynth()
isynth.fundamental_frequency_sequence = []
isynth.table = isynth.tables.sine
for perm in peal.peal_direct:
    isynth.fundamental_frequency_sequence.extend(perm(notes))
sounds = []
for i in range(36):
    sounds += [isynth.renderIterate(duration=1/3)]
# M.utils.write(M.H(*sounds),"./sandsounds/ra.wav")
c('finished rendering peal')
M.utils.write(M.H(*sounds), "./apeal.wav")

# low and high pitches
f0_ = f0/4
notes_ = [f0_, f0_*semi**4, f0_*semi**8]
silence = n.zeros(int(44100*2/3))
bass = []
count = 0
sy = M.synths.CanonicalSynth()
sy.table = sy.tables.saw
for i in range(6):
    asound = [sy.render(fundamental_frequency=notes_[(2+count)%3], duration=1/3),
              silence] * 2
    bass.extend(asound)
    count += 1
Code example #47
File: mbox2rdf.py Project: land-pack/gmane
    def rdfMbox(self):
        for filecount, file_ in enumerate(self.files):
            if filecount % 100 == 0:
                c(self.snapshoturi, filecount)
            mbox = mailbox.mbox(self.data_path + self.directory + "/" + file_)
            if not mbox.keys():
                self.nempty += 1
                mbox.close()
                #                c("||||||||||| EMPTY MESSAGE |||||||||||||||||||||",self.snapshotid,file_,"(",filecount,")")
                continue
            if not mbox[0]["Message-Id"]:
                raise ValueError(
                    "What to do with nonempty messages without id?")
            message = mbox[0]
            gmaneid = self.makeId(message["Message-Id"])
            #c("gmaneid",gmaneid)
            if not gmaneid:
                raise ValueError("Message without id")
            messageuri = P.rdf.ic(po.EmailMessage, gmaneid,
                                  self.translation_graph, self.snapshoturi)
            self.nmessages += 1
            triples = [
                (messageuri, po.gmaneID, gmaneid),
            ]
            email, name = self.parseParticipant(message["From"])
            if not email:
                raise ValueError("message without author")
            participanturi = P.rdf.ic(po.GmaneParticipant, email,
                                      self.translation_graph, self.snapshoturi)
            if not P.get(participanturi, po.emailAddress, None,
                         self.translation_graph):
                self.nparticipants += 1
                if self.nparticipants == 100:
                    pass
            triples += [
                (messageuri, po.author, participanturi),
                (participanturi, po.emailAddress, email),
            ]
            if name:
                triples += [
                    (participanturi, po.name, name),
                ]
            subject = message["Subject"]
            if subject:
                subject = decodeHeader(subject)
                assert isinstance(subject, str)
                triples += [
                    (messageuri, po.subject, subject),
                ]
            replyid_ = message["In-Reply-To"]
            saneid = self.makeId(replyid_)
            if bool(replyid_) and not bool(saneid):
                self.nreplies += 1
                replyid = self.snapshotid + "-" + str(self.nlost_messages)
                self.nlost_messages += 1
                replymessageuri = P.rdf.ic(po.LostEmailMessage, replyid,
                                           self.translation_graph,
                                           self.snapshoturi)
                triples += [
                    (replymessageuri, a, po.EmailMessage),
                    (replymessageuri, NS.rdfs.comment,
                     "This message registered as having a reply, but the field might be ill-formed: "
                     + replyid_),
                    (messageuri, po.replyTo, replymessageuri),
                ]
            elif saneid:
                self.nreplies += 1
                replymessageuri = P.rdf.ic(po.EmailMessage, saneid,
                                           self.translation_graph,
                                           self.snapshoturi)
                triples += [
                    (replymessageuri, po.gmaneID, saneid),
                    (messageuri, po.replyTo, replymessageuri),
                ]
            if isinstance(message["Date"], str):
                datetime = parseDate(message["Date"])
            elif isinstance(message["Date"], mailbox.email.header.Header):
                datetimestring = decodeHeader(message["Date"])
                if any(i not in string.printable for i in datetimestring):
                    datetime = None
                    triples += [
                        (messageuri, po.lostCreatedAt, True),
                    ]
                else:
                    datetime_ = re.findall(r"(.*\d\d:\d\d:\d\d).*",
                                           datetimestring)[0]
                    datetime = parseDate(datetime_)
            else:
                raise ValueError("datetime not understood")
            if datetime:
                self.dates += [datetime]
                triples += [
                    (messageuri, po.createdAt, datetime),
                ]
            if message["References"]:
                references = message["References"].replace("\n", "").replace(
                    "\t", "").replace(" ", "")
                if not re.findall(r"\A<(.*?)>\Z", references):
                    c("::: ::: ::: references field not understood",
                      message["References"])
                    triples += [
                        (messageuri, po.comment,
                         "the references are not understood (<.*> ids are added anyway): "
                         + message["References"]),
                        (messageuri, po.referencesLost, True),
                    ]
                for reference in re.findall(r"<(.*?)>", references):
                    self.nreferences += 1
                    referenceuri = P.rdf.ic(po.EmailMessage, reference,
                                            self.translation_graph,
                                            self.snapshoturi)
                    triples += [
                        (referenceuri, po.gmaneID, reference),
                        (messageuri, po.hasReference, referenceuri),
                    ]
                for part in message["References"].replace("\n", "").replace(
                        "\t", "").split():
                    if validate_email(part):
                        self.nreferences += 1
                        referenceuri = P.rdf.ic(po.EmailMessage, part,
                                                self.translation_graph,
                                                self.snapshoturi)
                        triples += [
                            (referenceuri, po.gmaneID, part),
                            (messageuri, po.hasReference, referenceuri),
                        ]
            text = getText(message)
            if text:
                nchars = len(text)
                ntokens = len(k.wordpunct_tokenize(text))
                nsentences = len(k.sent_tokenize(text))
                triples += [
                    (messageuri, po.messageText, text),
                    (messageuri, po.nChars, nchars),
                    (messageuri, po.nTokens, ntokens),
                    (messageuri, po.nSentences, nsentences),
                ]
                self.nchars_all += [nchars]
                self.ntokens_all += [ntokens]
                self.nsentences_all += [nsentences]

                clean_text = cleanEmailBody(text)
                self.nremoved_lines += text.count("\n") - clean_text.count(
                    "\n")
                self.nlines += text.count("\n")
                nchars_clean = len(clean_text)
                ntokens_clean = len(k.wordpunct_tokenize(clean_text))
                nsentences_clean = len(k.sent_tokenize(clean_text))
                triples += [
                    (messageuri, po.messageTextClean, clean_text),
                    (messageuri, po.nCharsClean, nchars_clean),
                    (messageuri, po.nTokensClean, ntokens_clean),
                    (messageuri, po.nSentencesClean, nsentences_clean),
                ]
                self.nchars_clean_all += [nchars_clean]
                self.ntokens_clean_all += [ntokens_clean]
                self.nsentences_clean_all += [nsentences_clean]

                for url in re.findall(
                        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                        clean_text):
                    self.nurls += 1
                    triples += [
                        (messageuri, po.hasUrl, url),
                    ]

            content_type = message.get_content_type()
            if content_type:
                triples += [(messageuri, po.contentType, content_type)]
            else:
                raise ValueError("/\/\/\/\/\ message without content type")
            organization = message["Organization"]
            if organization:
                if not isinstance(organization, str):
                    organization = "".join(i for i in str(organization)
                                           if i in string.printable)
                triples += [
                    (messageuri, po.organization, organization),
                ]
            if message["cc"]:
                cc, unparsed = parseAddresses(message["cc"])
                if unparsed:
                    triples += [
                        (messageuri, po.unparsedCC, unparsed),
                    ]
                for peeraddress, peername in cc:
                    peeraddress = peeraddress.strip()
                    assert bool(peeraddress)
                    peeruri = P.rdf.ic(po.EmailPeer, peeraddress,
                                       self.translation_graph,
                                       self.snapshoturi)
                    triples += [
                        (messageuri, po.cc, peeruri),
                        (peeruri, po.emailAddress, peeraddress),
                    ]
                    self.ncc += 1
                    if peername:
                        triples += [
                            (peeruri, po.name, peername.strip()),
                        ]
            if message["to"]:
                to, unparsed = parseAddresses(message["to"])
                if unparsed:
                    triples += [
                        (messageuri, po.unparsedTo, unparsed),
                    ]
                for peeraddress, peername in to:
                    peeraddress = peeraddress.strip()
                    assert bool(peeraddress)
                    peeruri = P.rdf.ic(po.EmailPeer, peeraddress,
                                       self.translation_graph,
                                       self.snapshoturi)
                    triples += [
                        (messageuri, po.to, peeruri),
                        (peeruri, po.emailAddress, peeraddress),
                    ]
                    self.nto += 1
                    if peername:
                        triples += [
                            (peeruri, po.name, peername.strip()),
                        ]
            listid = message["list-id"]
            if listid:
                assert isinstance(listid, str)
                listid = listid.replace("\n", "").replace("\t", "")
                if listid.count("<") == listid.count(">") == listid.count(
                        " ") == 0:
                    listname = ""
                    listid_ = listid
                elif listid.count("<") == listid.count(">") == 0:
                    parts = listid.split()
                    lens = [len(i) for i in parts]
                    listid_ = [i for i in parts if len(i) == max(lens)][0]
                    listname = " ".join(i for i in parts
                                        if len(i) != max(lens))
                elif listid.count("<") == listid.count(">") == 1:
                    listname, listid_ = re.findall(r"(.*) {0,1}<(.*)>",
                                                   listid)[0]
                else:
                    raise ValueError("Unexpected listid string format")
                listuri = P.rdf.ic(po.EmailList, listid_,
                                   self.translation_graph, self.snapshoturi)
                triples += [
                    (messageuri, po.emailList, listuri),
                    (listuri, po.listID, listid_),
                ]
                if listname:
                    triples += [
                        (listuri, po.name, listname.strip()),
                    ]
            P.add(triples, self.translation_graph)
            mbox.close()
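
A self-contained sketch of the References-field normalization used above (the
message ids are made up):

import re

references = "<id1@example.org>\n\t <id2@example.org>"
refs = references.replace("\n", "").replace("\t", "").replace(" ", "")
assert re.findall(r"<(.*?)>", refs) == ["id1@example.org", "id2@example.org"]
# the r"\A<(.*?)>\Z" sanity check only verifies that the normalized field
# starts with "<" and ends with ">"; fields failing it are flagged as not
# understood, though their <...> ids are still extracted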
Code example #48
def parseLegacyFiles(data_dir=DATADIR):
    """Parse legacy txt files with irc logs"""
    data_dir = os.path.expanduser(data_dir)
    directories = os.listdir(data_dir)
    directories = [i for i in directories if os.path.isdir(data_dir + i)]

    snapshots = set()
    triples = []
    for directory in directories:
        all_files = [
            i for i in os.listdir(data_dir + directory) if i.isdigit()
        ]
        if all_files:
            all_files.sort()
            foo = all_files[0].lstrip("0")
            if not foo:
                foo = "0"
            snapshotid = "legacy-" + directory + "-" + foo + "-" + all_files[
                -1].lstrip("0")
            snapshoturi = po.GmaneSnapshot + "#" + snapshotid
            expressed_classes = [
                po.GmaneParticipant, po.EmailPeer, po.EmailMessage
            ]
            expressed_reference = directory
            name_humanized = "Gmane email list with id " + expressed_reference
            # get size for all files in dir
            directorysize = sum(
                os.path.getsize(data_dir + directory + "/" + filename)
                for filename in os.listdir(data_dir + directory)) / 10**6
            nfiles = len(all_files)
            fileformat = "mbox"
            directoryuri = po.Directory + "#gmane-" + directory
            triples += [
                (snapshoturi, a, po.Snapshot),
                (snapshoturi, a, po.GmaneSnapshot),
                (snapshoturi, po.dataDir, data_dir),
                (snapshoturi, po.snapshotID, snapshotid),
                (snapshoturi, po.isEgo, False),
                (snapshoturi, po.isGroup, True),
                (snapshoturi, po.isFriendship, False),
                (snapshoturi, po.isInteraction, True),
                (snapshoturi, po.isPost, True),
                (snapshoturi, po.humanizedName, name_humanized),
                (snapshoturi, po.expressedReference, expressed_reference),
                (snapshoturi, po.rawDirectory, directoryuri),
                (directoryuri, po.directorySize, directorysize),
                (directoryuri, po.directoryName, directory),
                (directoryuri, po.fileFormat, fileformat),
            ] + [(directoryuri, po.expressedClass, expressed_class)
                 for expressed_class in expressed_classes]
            snapshots.add(snapshoturi)
    nsnapshots = ndirectories = len(directories)
    #P.context("gmane","remove")
    platformuri = P.rdf.ic(po.Platform, "Gmane", context="gmane")
    triples += [
        (NS.social.Session, NS.social.nGmaneParsedDirectories, ndirectories),
        (NS.social.Session, NS.social.nGmaneSnapshots, nsnapshots),
        (NS.social.Session, po.platform, platformuri),
    ]
    P.add(triples, context="gmane")
    c("parsed {} gmane data directories (=={} snapshots) are in percolation graph and 'gmane' context"
      .format(ndirectories, nsnapshots))
    c("percolation graph have {} triples ({} in gmane context)".format(
        len(P.percolation_graph), len(P.context("gmane"))))
    negos = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE         { GRAPH <gmane> { ?s po:isEgo true         } } "
    )
    ngroups = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE       { GRAPH <gmane> { ?s po:isGroup true       } } "
    )
    nfriendships = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE  { GRAPH <gmane> { ?s po:isFriendship true  } } "
    )
    ninteractions = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE { GRAPH <gmane> { ?s po:isInteraction true } } "
    )
    nposts = P.query(
        r" SELECT (COUNT(?s) as ?cs) WHERE        { GRAPH <gmane> { ?s po:isPost true        } } "
    )
    totalsize = sum(
        P.query(
            r" SELECT ?size WHERE              { GRAPH <gmane> { ?s po:directorySize ?size     } } "
        ))
    c("""{} are ego snapshots, {} are group snapshots
{} have a friendship structures. {} have an interaction structures. {} have texts 
Total raw data size is {:.2f}MB""".format(negos, ngroups, nfriendships,
                                          ninteractions, nposts, totalsize))
    return snapshots
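
A worked example of the snapshotid built above, for a hypothetical directory
"gmane.comp.python" whose mbox files are named "00001" through "00500":

directory = "gmane.comp.python"
all_files = ["00001", "00500"]
foo = all_files[0].lstrip("0") or "0"
snapshotid = "legacy-" + directory + "-" + foo + "-" + all_files[-1].lstrip("0")
assert snapshotid == "legacy-gmane.comp.python-1-500"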
Code example #49
File: tests.py Project: Zeigar/percolation-1
    client = Client(endpoint_url+"/update")
    triples = [
        (test.Dummy, test.desc,  """áéíóúćçêôãõà"""),
        (test.Dummy, test.desc2, "Não concordo com a inclusão da palavra controle, sou a favor da manutenção do texto 'Política Nacional de Participação Social'.\n\nA inclusão desta palavra pode ser interpretada como o poder de controle de determinados atores. O uso de 'Política Nacional de Participação Social'atende mais ao intuito de promover um ambiente democrático e horizontal nas relações de participação civil.".replace("\\","\\\\")),
        (test.Dummy, test.desc3, " \\o/".replace("\\","\\\\")),
        (test.Dummy, test.desc3, ' Denominação "ASd"'.replace('"',"'")),
        (test.Dummy, test.desc3, ' Denominação "ASd"'),
              ]
    querystring = buildQuery(triples, method="insert")
    client.endpoint.method = "GET"
    client.endpoint.method = "POST"
    # client.endpoint.requestMethod = ""
    client.endpoint.requestMethod = "postdirectly"
    client.endpoint.requestMethod = "urlencoded"
    client.endpoint.setQuery(querystring)
    client.performQuery(querystring)


if __name__ == "__main__":
    endpoint_url = os.getenv("PERCOLATION_ENDPOINT")
    if not endpoint_url:
        endpoint_url = input("please enter a sparql endpoint url")
    c("==> endpoint url:", endpoint_url)
    c("+++ testing create and delete graphs/contexts and triples:")
    #triples = testReadWriteDelete(endpoint_url)
    c("--- testing IO of text:", endpoint_url)
    #triples = testTextIO(endpoint_url)
    c("### testing custom server:", endpoint_url)
    triples = customConnection(endpoint_url)
    c("end of (remote) sparql endpoint tests", triples)
Code example #50
File: tests.py Project: Zeigar/percolation-1
def testReadWriteDelete(endpoint_url):
    client = Client(endpoint_url)
    triples = [
              (NS.test.Something, a, NS.test.OtherThing),
              ]
    client.insertTriples(triples, "another")
    c("should print a triple: ", client.getAllTriples("another"))
    client.updateQuery("DROP GRAPH <another> ")
    c("should not print a triple: ", client.getAllTriples("another"))
    client.insertTriples(triples, "another")
    c("should print a triple: ", client.getAllTriples("another"))
    client.insertTriples(triples, "even_another")
    query = "SELECT ?g WHERE { GRAPH ?g {} }"
    c("should print all graphs : ", client.retrieveQuery(query))
    client.updateQuery("DROP GRAPH <another> ")
    client.updateQuery("DROP GRAPH <even_another> ")
    c("should have no more graphs : ", client.retrieveQuery(query))

    # add and remove triples
    triples_ = [
               (NS.test.SomethingElse, NS.test.pred, "banana"),
               ]
    client.insertTriples(triples+triples_, "another")
    c("should print two triples: ", client.getAllTriples("another"))
    query = r"DELETE DATA { GRAPH <another> { <%s> <%s> 'banana' . } } " % \
            (NS.test.SomethingElse, NS.test.pred)
    client.updateQuery(query)
    c("should print one triple: ", client.getAllTriples("another"))
    client.updateQuery("DROP GRAPH <another> ")
    query = "SELECT ?g WHERE { GRAPH ?g {} }"
    c("should have no more dummy graphs : ", client.retrieveQuery(query))
Code example #51
File: pickle2rdf.py Project: iSport59/social
    def writeAllTW(self):
        # write the meta graph and the README (using self.desc); then we are finished
        g = P.context(self.meta_graph)
        ntriples = len(g)
        triples = [
            (self.snapshoturi, po.nMetaTriples, ntriples),
        ]
        P.add(triples, context=self.meta_graph)
        g.namespace_manager.bind("po", po)
        g.serialize(self.final_path_ + self.snapshotid + "Meta.ttl", "turtle")
        c("ttl")
        g.serialize(self.final_path_ + self.snapshotid + "Meta.rdf", "xml")
        c("serialized meta")
        # copy the script that generates this code
        if not os.path.isdir(self.final_path_ + "scripts"):
            os.mkdir(self.final_path_ + "scripts")
        shutil.copy(S.PACKAGEDIR + "/../tests/triplify.py",
                    self.final_path_ + "scripts/triplify.py")
        # copy of the base data
        tinteraction = """\n\n{} individuals with metadata {}
and {} interactions (retweets: {}, replies: {}, user_mentions: {}) 
constitute the interaction 
network in the RDF/XML file(s):
{}
and the Turtle file(s):
{}
(anonymized: {}).""".format(
            self.nparticipants, str(self.participantvars),
            self.nretweets + self.nreplies + self.nuser_mentions,
            self.nretweets, self.nreplies, self.nuser_mentions, self.tweet_rdf,
            self.tweet_ttl, self.interactions_anonymized)
        tposts = """\n\nThe dataset consists of {} tweets with metadata {}
{:.3f} characters on average (std: {:.3f}) and total chars in snapshot: {}
{:.3f} tokens on average (std: {:.3f}) and total tokens in snapshot: {}""".format(
            self.ntweets,
            str(self.tweetvars),
            self.mcharstweets,
            self.dcharstweets,
            self.totalchars,
            self.mtokenstweets,
            self.dtokenstweets,
            self.totaltokens,
        )
        self.dates = [i.isoformat() for i in self.dates]
        date1 = min(self.dates)
        date2 = max(self.dates)
        with open(self.final_path_ + "README", "w") as f:
            f.write("""::: Open Linked Social Data publication
\nThis repository is an RDF data expression of the twitter
snapshot {snapid} with tweets from {date1} to {date2}
(total of {ntrip} triples).{tinteraction}{tposts}
\nMetadata for discovery in the RDF/XML file:
{mrdf} \nor in the Turtle file:\n{mttl}
\nEgo network: {ise}
Group network: {isg}
Friendship network: {isf}
Interaction network: {isi}
Has text/posts: {ist}
\nAll files should be available at the git repository:
{ava}
\n{desc}

The script that rendered this data publication is in the scripts/ directory.\n:::"""
                    .format(snapid=self.snapshotid,
                            date1=date1,
                            date2=date2,
                            ntrip=self.ntriples,
                            tinteraction=tinteraction,
                            tposts=tposts,
                            mrdf=self.mrdf,
                            mttl=self.mttl,
                            ise=self.isego,
                            isg=self.isgroup,
                            isf=self.isfriendship,
                            isi=self.isinteraction,
                            ist=self.hastext,
                            ava=self.online_prefix,
                            desc=self.desc))
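
One detail worth noting in writeAllTW: min/max over the isoformat strings picks
the earliest and latest timestamps, because ISO-8601 strings sort
lexicographically in chronological order. A tiny check:

dates = ["2014-05-02T10:00:00", "2013-12-31T23:59:59"]
assert min(dates) == "2013-12-31T23:59:59"
assert max(dates) == "2014-05-02T10:00:00"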
Code example #52
import sys
keys = tuple(sys.modules.keys())
for key in keys:
    if "gmane" in key or "percolation" in key:
        del sys.modules[key]
import gmane as G, percolation as P
from percolation.rdf import NS, a, po, c

#ss=S.facebook.access.parseLegacyFiles()
##ss=[i for i in ss if i.endswith("gdf_fb")]
#last_triplification_class=S.facebook.render.publishAll(ss)

#ss=S.twitter.access.parseLegacyFiles()
##ss=[i for i in ss if i.endswith("gdf_fb")]
#last_triplification_class=S.twitter.render.publishAll(ss)

#ss=G.access.parseLegacyFiles()
ss = G.access.parseLegacyFiles("/home/r/.gmane3/")
c("finished .gmane")
#ss.union(G.access.parseLegacyFiles("/home/r/.gmane2/")); c("finished .gmane2")
#ss.union(G.access.parseLegacyFiles("/home/r/.gmane3/")); c("finished .gmane3")
#ss.union(G.access.parseLegacyFiles("/home/r/.gmane4/")); c("finished .gmane4")
#ss=[i for i in ss if i.endswith("gdf_fb")]
#last_triplification_classes+=G.render.publishAll(ss); c("finished publication of all")
triplification_classes = G.render.publishAll(ss)
c("finished publication of all")
Code example #53
File: log2rdf.py Project: iSport59/social
    def rdfLog(self):
        with codecs.open(self.data_path + self.filename, "rb",
                         "iso-8859-1") as f:
            logtext = textFix(f.read())
        # msgregex=r"\[(\d{2}):(\d{2}):(\d{2})\] \* ([^ ?]*)[ ]*(.*)" # DELETE ???
        #rmessage= r"\[(\d{2}):(\d{2}):(\d{2})\] \<(.*?)\>[ ]*(.*)" # message
        # list the files in the dir
        rdate = r"(\d{4})(\d{2})(\d{2})"  # date
        rsysmsg = r"(\d{4})\-(\d{2})\-(\d{2})T(\d{2}):(\d{2}):(\d{2})  \*\*\* (\S+) (.*)"  # system message (?)
        rmsg = r"(\d{4})\-(\d{2})\-(\d{2})T(\d{2}):(\d{2}):(\d{2})  \<(.*?)\> (.*)"  # message
        rurl = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        messages = re.findall(rmsg, logtext)[:10]  # note: keeps only the first 10 matches
        system_messages = re.findall(rsysmsg, logtext)[:10]  # note: keeps only the first 10 matches
        self.NICKS = set([Q(i[-2]) for i in messages] +
                         [Q(i[-2]) for i in system_messages])
        triples = []
        for nick in self.NICKS:
            useruri = P.rdf.ic(po.Participant,
                               "{}-{}".format(self.snapshotid, nick),
                               self.irc_graph, self.snapshoturi)
            triples += [
                (useruri, po.nick, nick),
            ]
        messageids = set()
        msgcount = 0
        c("starting translation of log with",
          len(messages) + len(system_messages), "messages")
        for message in messages:
            year, month, day, hour, minute, second, nick, text = message
            nick = Q(nick)
            datetime_ = datetime.datetime(
                *[int(i) for i in (year, month, day, hour, minute, second)])
            self.dates += [datetime_]
            timestamp = datetime_.isoformat()
            messageid = "{}-{}-{}".format(self.snapshotid, nick, timestamp)
            while messageid in messageids:
                messageid += '_r_%05x' % random.randrange(16**5)
            messageids.add(messageid)
            messageuri = P.rdf.ic(po.IRCMessage, messageid, self.irc_graph,
                                  self.snapshoturi)

            # find direct messages (addressed with a comma)! TTM

            tokens = k.word_tokenize(text)
            tokens = [i for i in tokens if i not in set(string.punctuation)]
            direct_nicks = []  # for directed messages at
            mention_nicks = []  # for mentioned fellows
            direct = 1
            for token in tokens:
                if token not in self.NICKS:
                    direct = 0
                else:
                    if direct:
                        direct_nicks += [token]
                    else:
                        mention_nicks += [token]
            for nick in direct_nicks:
                useruri2 = po.Participant + "#{}-{}".format(
                    self.snapshotid, nick)
                triples += [
                    (messageuri, po.directedTo, useruri2),
                ]
            if direct_nicks:
                self.ndirect += 1
                text_ = text[text.index(direct_nicks[-1]) +
                             len(direct_nicks[-1]) + 1:].lstrip()
            else:
                text_ = text
            for nick in mention_nicks:
                useruri2 = po.Participant + "#{}-{}".format(
                    self.snapshotid, nick)
                triples += [
                    (messageuri, po.mentions, useruri2),
                ]
            self.nmention += len(mention_nicks)

            useruri = po.Participant + "#{}-{}".format(self.snapshotid, nick)
            triples += [
                (messageuri, po.author, useruri),
                (messageuri, po.systemMessage, False),
                (messageuri, po.createdAt, datetime_),
            ]
            if text:
                triples += [
                    (messageuri, po.messageText, text),
                ]
            if text_:
                nchars = len(text_)
                tokens = k.word_tokenize(text_)
                ntokens = len(tokens)
                nsentences = len(k.sent_tokenize(text_))
                triples += [
                    (messageuri, po.cleanMessageText, text_),
                    (messageuri, po.nChars, nchars),
                    (messageuri, po.nTokens, ntokens),
                    (messageuri, po.nSentences, nsentences),
                ]
                urls = re.findall(rurl, text_)
                for url in urls:
                    triples += [
                        (messageuri, po.hasUrl, url),
                    ]
                self.nchars_all += [nchars]
                self.ntokens_all += [ntokens]
                self.nsentences_all += [nsentences]
                self.nurls += len(urls)
            else:
                triples += [
                    (messageuri, po.emptyMessage, True),
                ]
            if text.startswith(";aa ") or text.startswith(
                    "lalenia, aa ") or text.startswith("lalenia: aa "):
                self.naamessages += 1
                triples += [
                    (messageuri, a, po.AAIRCMessage),
                ]
            msgcount += 1
            if msgcount % 1000 == 0:
                c("finished user message", msgcount)
        msgcount = 0
        for message in system_messages:
            year, month, day, hour, minute, second, nick, text = message
            nick = Q(nick)
            useruri = po.Participant + "#{}-{}".format(self.snapshotid, nick)

            datetime_ = datetime.datetime(
                *[int(i) for i in (year, month, day, hour, minute, second)])
            self.dates += [datetime_]
            timestamp = datetime_.isoformat()
            messageid = "{}-{}".format(self.snapshotid, timestamp)
            while messageid in messageids:
                messageid += '_r_%05x' % random.randrange(16**5)
            messageids.update([messageid])
            messageuri = P.rdf.ic(po.IRCMessage, messageid, self.irc_graph,
                                  self.snapshoturi)
            triples += [(messageuri, po.impliedUser, useruri),
                        (messageuri, po.sentAt, datetime_),
                        (messageuri, po.systemMessage, True)]
            if text:
                triples += [(messageuri, po.messageText, text)]
            msgcount += 1
            if msgcount % 1000 == 0:
                c("finished system message. Total messages:", msgcount)
        self.messageids = messageids
        if not os.path.isdir(self.final_path):
            os.mkdir(self.final_path)
        if not os.path.isdir(self.final_path_):
            os.mkdir(self.final_path_)
        self.log_xml, self.size_xml, self.log_ttl, self.size_ttl = P.rdf.writeByChunks(
            self.final_path_ + self.snapshotid + "Log",
            ntriples=5,
            triples=triples)
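
A self-contained sketch of the direct/mention split used in rdfLog above (the
nicks and tokens are made up): known nicks in the leading run of tokens count
as addressees, later nick tokens count as mentions.

NICKS = {"alice", "bob"}
tokens = ["alice", "did", "you", "see", "bob", "today"]
direct_nicks, mention_nicks, direct = [], [], 1
for token in tokens:
    if token not in NICKS:
        direct = 0
    elif direct:
        direct_nicks += [token]
    else:
        mention_nicks += [token]
assert direct_nicks == ["alice"] and mention_nicks == ["bob"]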
Code example #54
File: utils.py Project: eliheuer/tokipona
            for syllable in allConsonantSyllables():
                if word[-1] == syllable[-1] == 'j':
                    continue
                word_ = word + syllable
                words += [word_]
        # c('finished words with {} syllables'.format(n_+1))
        n_ += 1
    words_ = words[:]
    for vowel in vowels:
        words += [vowel + word for word in words_ if len(word) / 2 < n]
    words += list(vowels)  # assuming one-vowel words are valid
    # c('finished words starting with vowels')

    return words


def allTokiPonaExistentWords():
    from . import makeStatistics as stats
    return stats


if __name__ == '__main__':
    c('vowels:', vowels)
    c('\n', 'consonants:', consonants)
    c('\n', 'invalid_syllables (4):', invalid_syllables)
    all_syllables = allConsonantSyllables()
    c('\n', 'valid syllables ({}):'.format(len(all_syllables)), all_syllables)
    all_possible_words = allTokiPonaPossibleWords()
    c('\n', 'all tokipona possible words with 3 syllables',
      '({})'.format(len(all_possible_words)), all_possible_words)
Code example #55
File: makeAllPlainChanges.py Project: petitmi/music
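# excerpt: the loop below presumably runs over tuple(sys.modules.keys()), as
# in code example #52, purging cached "music" modules before the re-import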
    if "music" in key:
        del sys.modules[key]
import music as M
from percolation.rdf import c

def fact(x):
    if x == 1:
        return 1
    return x*fact(x-1)


nelements = 0
while nelements not in range(3, 13):
    nelements_maximum = input("make changes until maximum number of elements "
                              "(min=3, max=12, default=5): ")
    if not nelements_maximum:
        nelements_maximum = 5  # default kicks in on empty input
    try:
        nelements = int(nelements_maximum)
    except ValueError:
        pass
# generate peals with elements in numbers of 3 to 12
peals = {}
for nelements in range(3, int(nelements_maximum)+1):
    key = "peal_with_" + str(nelements) + "_elements"
    nhunts = nelements - 3
    peal = M.structures.symmetry.PlainChanges(nelements, nhunts)
    peals[key] = peal
    c(len(peal.peal_direct), fact(nelements))
    assert len(peal.peal_direct) == fact(nelements)
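
The assert above encodes the change-ringing fact that an extent on n bells
passes through all n! distinct rows, e.g. 120 rows for n=5. A quick
cross-check of the local fact() against the stdlib:

import math
assert fact(5) == math.factorial(5) == 120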