Example #1
 # fields sliced from the Pidgin log filename (assumed layout: "YYYY-MM-DD.HHMMSS-0500EST.txt")
 startmin = txtlog[13:15]
 startsec = txtlog[15:17]
 timezone = txtlog[17:20] + ":" + txtlog[20:22]
 # variables to keep track of the timestamps of the first and last messages
 firsttstamp = '' # this one only updates once
 lasttstamp = ''  # this one keeps updating
 # create an XML soup for the converted log with the header info
 soup = BeautifulStoneSoup('<?xml version="1.0" encoding="UTF-8"?>')
 # Populate the soup
 chatTag = Tag(soup, "chat")
 chatTag['xmlns'] = XMLNS
 chatTag['account'] = account
 chatTag['service'] = SERVICE
 chatTag['adiumversion'] = ADIUMVERSION
 chatTag['buildid'] = BUILDID
 soup.append(chatTag)
 # Parse a pidgin log file and compose the Adium log file
 # Pidgin log format note:
 # 1) Messages start from the 2nd line
 # 2) The first line of a message starts with a timestamp enclosed in parentheses
 #    * (5/6/2015 12:13:14 AM) -- for a msg date different from the one recorded in the filename
 #    * (2:05:59 PM) -- for a msg date the same as the one recorded in the filename
 # 3) If the current line is a continuation of a message started on a previous line, it has no such time header.
 # 4) After the timestamp there is optional sender info before the next ':'; status updates carry no sender info.
 #    * (2:05:59 PM) [email protected]
 # Adium log format note:
 # 1) Sample log format
 #    <chat xmlns="http://purl.org/net/ulf/ns/0.4-02" account="*****@*****.**" service="Jabber" adiumversion="1.5.7" buildid="c72b164f75a7">
 #    <event type="WindowOpened" sender="*****@*****.**" time="2014-12-04T16:14:01+08:00">
 #    </event>
 #    <message sender="*****@*****.**" time="2014-12-04T16:14:01+08:00">
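
The header format described in the notes above can be recognized with a small regex; a minimal sketch (the pattern and helper name are illustrative, not part of the original converter):

import re

# Matches "(5/6/2015 12:13:14 AM) sender:" or "(2:05:59 PM) sender:".
# The sender group is optional, so status updates (no sender) also match.
HEADER_RE = re.compile(
    r'^\((?:(?P<date>\d{1,2}/\d{1,2}/\d{4}) )?'
    r'(?P<time>\d{1,2}:\d{2}:\d{2} [AP]M)\)'
    r'(?: (?P<sender>[^:]+):)?')

def parse_header(line):
    """Return (date, time, sender) for a header line, or None for a continuation line."""
    m = HEADER_RE.match(line)
    if m is None:
        return None  # continuation of the previous message
    return m.group('date'), m.group('time'), m.group('sender')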
Example #2

    def output(self, filename, os=sys.stdout):
        if self._articles is None:
            print 'Please parse file(s) first'
            return
        elif not self._articles:
            print 'No articles detected in parsed file.'
            return
        osoup = BeautifulStoneSoup()
        channel = Tag(osoup, 'channel', [('id', '1')])
        osoup.append(channel)
        # journal meta (extract from first article)
        article = self._articles[0]
        jtitle = Tag(osoup, 'title')
        jtitle.append('Science Magazine')
        jhomepage = Tag(osoup, 'homepage')
        jhomepage.append('http://www.sciencemag.org/content/current')
        pubdate = article.find('pub-date')
        jpubdate = Tag(osoup, 'pubDate')
        jpubdate.append('%(year)s-%(month)s-%(day)s' % 
                        {'year' : pubdate.year.contents[0],
                         'month' : pubdate.month.contents[0],
                         'day' : pubdate.day.contents[0]})
        jvolume = article.find('volume')
        jissue = article.find('issue')
        jcoverimg = Tag(osoup, 'image', [('type', 'cover'),
                                         ('url','http://coverurl')])
        channel.append(jtitle)
        channel.append(jhomepage)
        channel.append(jpubdate)
        channel.append(jvolume)
        channel.append(jissue)
        channel.append(jcoverimg)
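        # At this point the channel head serializes to roughly (values illustrative):
        #   <channel id="1">
        #     <title>Science Magazine</title>
        #     <homepage>http://www.sciencemag.org/content/current</homepage>
        #     <pubDate>2015-05-06</pubDate>
        #     <volume>...</volume>
        #     <issue>...</issue>
        #     <image type="cover" url="http://coverurl"></image>
        #   </channel>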

        # article meta
        sectiondict = {}
        for article in self._articles:
            # item, fpage
            fpagetag = article.find('fpage')
            fpage = fpagetag.contents[0]
            try:
                subpage = fpagetag['seq']
            except KeyError:  # no 'seq' attribute on fpage
                subpage = ''
            item = Tag(osoup, 'item', [('fpage',fpage),
                                        ('subpage',subpage)])
            # title
            atitle = article.find('article-title')
            atitle.name = 'title'
            # pubdate
            pubdate = article.find('pub-date')
            apubdate = Tag(osoup, 'pubDate')
            apubdate.append('%(year)s-%(month)s-%(day)s' % 
                            {'year' : pubdate.year.contents[0],
                             'month' : pubdate.month.contents[0],
                             'day' : pubdate.day.contents[0]})
            # overline
            overline = article.find('subj-group', 
                                   {'subj-group-type' : 'heading' }
                                  ).subject.contents[0]
            aoverline = Tag(osoup, 'overline')
            aoverline.append(overline)
            # authors
            contribs = article.findAll('contrib')
            acontribs = Tag(osoup, 'authors')
            for c in contribs:
                ctype = c['contrib-type']
                fname = c.find('given-names').contents[0]
                lname = c.find('surname').contents[0]
                acontrib = Tag(osoup, ctype)
                afname = Tag(osoup, 'fname')
                afname.append(fname)
                alname = Tag(osoup, 'lname')
                alname.append(lname)
                acontrib.append(alname)
                acontrib.append(afname)
                acontribs.append(acontrib)
            # summary
            teaser = article.find('abstract', {'abstract-type':'teaser'})
            if not teaser:
                teaser = ''
            asummary = Tag(osoup, 'summary')
            asummary.append(teaser)
            # text
            abody = article.find('body')
            try:
                abody.name = 'text'
            except AttributeError:
                abody = Tag(osoup, 'text')
            # images
            aimages = Tag(osoup, 'images')
            aimage = Tag(osoup, 'image')
            aimage.append('test image')
            aimages.append(aimage)
            # links
            alinks = Tag(osoup, 'links')
            alinkabs = Tag(osoup, 'link', [('type','abstract')])
            alinkabs.append('abstract url')
            alinkfull = Tag(osoup, 'link', [('type','full')])
            alinkfull.append('full url')
            alinks.append(alinkabs)
            alinks.append(alinkfull)
            # categories
            acats = Tag(osoup, 'categories')
            cats = article.findAll('subj-group', {'subj-group-type' : 'field'})
            for cat in cats:
                acat = Tag(osoup, 'category')
                acat.append(cat.subject.contents[0])
                acats.append(acat)
            # section
            sectionname = article.find('subj-group', 
                                   {'subj-group-type' : 'article-type' }
                                  ).subject.contents[0]
            if sectionname in sectiondict:
                section = sectiondict[sectionname]
            else:
                section = Tag(osoup, 'section', [('id','#'), 
                                                 ('title', sectionname)])
                sectiondict[sectionname] = section
                channel.append(section)
            # build item
            item.append(atitle)
            item.append(apubdate)
            item.append(aoverline)
            item.append(acontribs)
            item.append(asummary)
            item.append(abody)
            item.append(aimages)
            item.append(alinks)
            item.append(acats)
            section.append(item)
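        # The excerpt ends here without serializing; presumably a final step
        # writes the assembled tree to the output stream, e.g. (a sketch, not
        # from the original):
        os.write(osoup.prettify())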
Example #3

import json
import operator

from BeautifulSoup import BeautifulStoneSoup, Tag, CData, ProcessingInstruction

def format_wp(outfile):
    # extract the list of threads (dumpfile is a path defined at module level)
    threads = json.load(open(dumpfile, 'r'))
    # set up xml output
    f = open(outfile,'w')
    soup = BeautifulStoneSoup()
    soup.append(ProcessingInstruction('xml version="1.0" encoding="UTF-8"'))
    rss = Tag(soup, 'rss',
              [('version','2.0'),
               ('xmlns:excerpt','http://wordpress.org/export/1.0/excerpt/'),
               ('xmlns:content','http://purl.org/rss/1.0/modules/content/'),
               ('xmlns:wfw','http://wellformedweb.org/CommentAPI/'),
               ('xmlns:dc','http://purl.org/dc/elements/1.1/'),
               ('xmlns:wp','http://wordpress.org/export/1.0/')])
    soup.append(rss)
    channel = Tag(soup, 'channel')
    clink = Tag(soup, 'link')
    clink.append('http://news.sciencemag.org/scienceinsider')
    rss.append(channel)
    channel.append(clink)
    print 'Reformatting comments in', len(threads), 'threads from JSON into XML...'
    # emit threads sorted by their uniq key (itemgetter(0) sorts on the dict key)
    threads = sorted(threads.iteritems(), key=operator.itemgetter(0))
    ncom = 0
    for uniq,thread in threads:
        item = Tag(soup,'item')
        channel.append(item)
        title = Tag(soup,'title')
        title.append('Comments for '+thread[0]['url'])
        link = Tag(soup,'link')
        link.append(thread[0]['url'])
        guid = Tag(soup, 'guid', [('isPermaLink','false')])
        guid.append(thread[0]['url'])
        post_id = Tag(soup, 'wp:post_id')
        post_id.append(thread[0]['uniq'])
        item.append(title)
        item.append(link)
        item.append(guid)
        item.append(post_id)
        for comment in thread:
            ctag = Tag(soup, 'wp:comment')
            cid = Tag(soup, 'wp:comment_id')
            cid.append(comment['id'])
            author = Tag(soup,'wp:comment_author')
            author.append(CData(comment['author']))
            email = Tag(soup,'wp:comment_author_email')
            if comment['email']:
                email.append(comment['email'])
            ip = Tag(soup,'wp:comment_author_ip')
            ip.append(comment['ip'])
            date = Tag(soup,'wp:comment_date')
            date.append(comment['created_on'])
            dategmt = Tag(soup, 'wp:comment_date_gmt')
            dategmt.append(comment['created_on'])
            text = Tag(soup,'wp:comment_content')
            text.append(CData(comment['text']))
            status = Tag(soup,'wp:comment_approved')
            status.append('1')
            ctype = Tag(soup, 'wp:comment_type')
            parent = Tag(soup,'wp:comment_parent')
            parent.append('0')
            user = Tag(soup,'wp:comment_user_id')
            user.append('0')

            item.append(ctag)
            ctag.append(cid)
            ctag.append(author)
            ctag.append(email)
            ctag.append(ip)
            ctag.append(date)
            ctag.append(dategmt)
            ctag.append(text)
            ctag.append(status)
            ctag.append(ctype)
            ctag.append(parent)
            ctag.append(user)
            ncom += 1
    print 'Wrote', ncom, 'comments.'
    print_soup(f,soup)
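
print_soup is not defined in this excerpt; a minimal sketch, assuming it only serializes the soup and closes the output file:

def print_soup(f, soup):
    # Assumed behavior: write the assembled XML tree and close the file.
    f.write(soup.prettify())
    f.close()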