コード例 #1
0
ファイル: wiki_parser.py プロジェクト: stickmanlechu/wikidata
 def __init__(self, filename, useFile=False):
     """Open the compressed dump and pick the destination for parsed data.

     filename -- path passed straight to BZ2File.
     useFile  -- when truthy, a UTF-8 text file 'output.dat' is opened for
                 writing and stored as self.output; otherwise a WikiDao is
                 built from '../resources/datasource.properties' and stored
                 as self.dao.
     """
     self.file = BZ2File(filename)
     # Prefix prepended to tag names when looking up elements of the export.
     self.namespace = '{http://www.mediawiki.org/xml/export-0.8/}'
     self.writeToFile = True if useFile else False
     if self.writeToFile:
         self.output = codecs.open('output.dat', 'w', 'utf-8')
         return
     self.dao = WikiDao('../resources/datasource.properties')
コード例 #2
0
ファイル: wiki_parser.py プロジェクト: stickmanlechu/wikidata
class WikiParser:
    """Stream a bzip2-compressed MediaWiki XML export and persist its pages.

    Each <page> element is turned into a Page object plus the User and
    Revision objects found in its <revision> children.  Depending on the
    ``useFile`` constructor flag the result is either written to
    'output.dat' (user ids only) or stored through a WikiDao.
    """

    def __init__(self, filename, useFile=False):
        """Open the dump and choose the output sink.

        filename -- path passed to BZ2File.
        useFile  -- truthy: write user ids to 'output.dat' (UTF-8);
                    falsy: save everything through a WikiDao configured from
                    '../resources/datasource.properties'.
        """
        self.file = BZ2File(filename)
        if useFile:
            self.writeToFile = True
            self.output = codecs.open('output.dat', 'w', 'utf-8')
        else:
            self.writeToFile = False
            self.dao = WikiDao('../resources/datasource.properties')
        # Prefix prepended to every tag name looked up in the export.
        self.namespace = '{http://www.mediawiki.org/xml/export-0.8/}'

    def __del__(self):
        """Release the dump, and the output file or the DAO reference.

        Uses getattr/hasattr guards because __del__ also runs when __init__
        raised part-way through and some attributes were never assigned.
        """
        dump = getattr(self, 'file', None)
        if dump is not None:
            dump.close()
        if getattr(self, 'writeToFile', False):
            sink = getattr(self, 'output', None)
            if sink is not None:
                sink.close()
        elif hasattr(self, 'dao'):
            del self.dao

    def tag_name(self, name):
        """Return ``name`` prefixed with the export namespace."""
        return self.namespace + name

    def cope_with_data(self, data):
        """Persist the data collected for one page.

        data -- dict with key 'users' (file mode) or keys 'page', 'users'
                and 'revisions' (database mode).

        File mode: a single user is written as one line holding its id;
        several users are written as one 'id;id' line per unordered pair, in
        list order.  Nothing is written for an empty list.
        Database mode: page, users and revisions are saved through the DAO
        and the DAO connection is committed once.
        """
        if self.writeToFile:
            from itertools import combinations

            userlist = data['users']
            if len(userlist) == 1:
                # Terminate the record; without the newline the id would be
                # fused with whatever is written next.
                self.output.write('%s\n' % userlist[0].id)
            else:
                for first, second in combinations(userlist, 2):
                    self.output.write('%s;%s\n' % (first.id, second.id))
        else:
            self.dao.savePage(data['page'])
            for user in data['users']:
                self.dao.saveUser(user)
            for revision in data['revisions']:
                self.dao.saveRevision(revision)
            self.dao.connection.commit()

    def revIsValid(self, id, time):
        """Return True when both elements exist and both carry text."""
        return id is not None and id.text is not None and time is not None and time.text is not None

    def process_xml(self):
        """Parse the dump page by page and hand each page to cope_with_data.

        Pages are handled on their "end" event: iterparse only guarantees a
        fully populated element (children and text) at that point, so reading
        the children on "start" can see a partial or empty page.
        """
        page_tag = self.tag_name('page')
        text_tag = self.tag_name('text')
        for _event, elem in etree.iterparse(self.file, events=("end",)):
            if elem.tag == page_tag:
                data = self._parse_page(elem)
                if data is not None:
                    self.cope_with_data(data)
                # The page is done; drop its subtree to keep memory bounded.
                elem.clear()
            elif elem.tag == text_tag:
                # Element content under this tag is never read by the parser,
                # so free it as soon as it is complete instead of holding it
                # until the enclosing page ends.
                elem.clear()

    def _parse_page(self, elem):
        """Build the data dict for one complete <page> element, or None.

        Returns None when the page has no title/id child (silently) or when
        either of them has no text (after printing a message).
        """
        title = elem.find(self.tag_name('title'))
        pgId = elem.find(self.tag_name('id'))
        if title is None or pgId is None:
            return None
        if title.text is None or pgId.text is None:
            print('Page title or id is none')
            return None

        # Strip single quotes and cap the title at 100 characters.
        page = Page(pgId.text, title.text.replace("'", '')[:100])
        revisions = []
        users = []
        for rev_elem in elem.findall(self.tag_name('revision')):
            parsed = self._parse_revision(rev_elem, page)
            if parsed is not None:
                user, revision = parsed
                users.append(user)
                revisions.append(revision)
        return {'page': page, 'revisions': revisions, 'users': users}

    def _parse_revision(self, rev_elem, page):
        """Return a (User, Revision) pair for one <revision>, or None if unusable.

        A revision is skipped with a printed message when its id/timestamp is
        missing or when the contributor's id/username has no text; it is
        skipped silently when there is no contributor element or the
        contributor lacks a username or id child.
        """
        rev_id = rev_elem.find(self.tag_name('id'))
        rev_timestamp = rev_elem.find(self.tag_name('timestamp'))
        if not self.revIsValid(rev_id, rev_timestamp):
            print('Invalid revision')
            return None

        # Keep only the part before 'T', i.e. the date (YYYY-MM-DD).
        timestamp = rev_timestamp.text.split('T')[0]

        rev_comment = rev_elem.find(self.tag_name('comment'))
        if rev_comment is None or rev_comment.text is None:
            comment = ''
        else:
            # Strip single quotes and cap the comment at 200 characters.
            comment = rev_comment.text.replace("'", '')[:200]

        contrib = rev_elem.find(self.tag_name('contributor'))
        if contrib is None:
            return None
        username = contrib.find(self.tag_name('username'))
        user_id = contrib.find(self.tag_name('id'))
        if username is None or user_id is None:
            return None
        if user_id.text is None or username.text is None:
            print('UserId or username not found')
            return None

        user = User(user_id.text, username.text)
        return user, Revision(rev_id.text, page.id, user.id, timestamp, comment)