Ejemplo n.º 1
0
class ArticleParser(xml.sax.ContentHandler):
    """SAX content handler that builds and saves one ``Article`` per ``<Article>`` element.

    An ``Article`` is created when an ``<Article>`` element opens, its text
    attributes are filled in as the matching child elements close, and
    ``save()`` is called on it when the ``<Article>`` element closes.
    """

    # Elements whose cleaned text is stored on the attribute of the same name
    # of the article currently being built.
    _TEXT_FIELDS = ('title', 'author', 'abstract', 'venue', 'url')

    def __init__(self):
        # Initialise the base handler as well, so its own state is set up.
        xml.sax.ContentHandler.__init__(self)
        self.identity = None  # integer value of the most recently closed <id>
        self.content = None   # text collected since the most recent start tag
        self.article = None   # Article being built; None outside <Article>

    def cleaned(self):
        """Return the collected text with newlines replaced by spaces and stripped.

        The text is returned as-is otherwise (it is not XML-escaped). An empty
        string is returned when no text has been collected yet.
        """
        return (self.content or "").replace('\n', ' ').strip()

    def startElement(self, name, attrs):
        """Reset the text buffer and start a new ``Article`` on ``<Article>``."""
        self.content = ""

        if name == 'Article':
            self.article = Article()

    def characters(self, c):
        """Append a chunk of character data to the text buffer."""
        self.content += c

    def endElement(self, name):
        """Store the collected text for ``name`` or save the finished article.

        Field elements that close while no article is being built are
        ignored instead of failing on the missing article.

        Raises:
            ValueError: if the text of an ``<id>`` element is not an integer.
        """
        if name == 'Article':
            if self.article:
                # Prints the author when the last parsed <id> was 420; this
                # looks like leftover debugging -- confirm before removing.
                if self.identity == 420:
                    print(self.article.author)

                self.article.save()
                self.article = None
        elif name == 'id':
            self.identity = int(self.cleaned())
        elif name in self._TEXT_FIELDS and self.article is not None:
            setattr(self.article, name, self.cleaned())
Ejemplo n.º 2
0
class ArticleParser(xml.sax.ContentHandler):
    """SAX content handler that builds and saves one ``Article`` per ``<article>`` element.

    An ``Article`` is created when an ``<article>`` element opens, its
    attributes are filled in as the matching child elements close, and
    ``save()`` is called on it when the ``<article>`` element closes.
    ``count`` holds the number of articles saved so far.
    """

    # Element name -> attribute of the current article that receives the
    # element's cleaned text.
    _TEXT_FIELDS = {
        'title': 'title',
        'author': 'author',
        'abstract': 'abstract',
        'venue': 'venue',
        'url': 'url',
        'id': 'arxivid',
    }

    def __init__(self):
        # Initialise the base handler as well, so its own state is set up.
        xml.sax.ContentHandler.__init__(self)
        self.content = None  # text collected since the most recent start tag
        self.article = None  # Article being built; None outside <article>
        self.count = 0       # number of articles saved so far

    def cleaned(self):
        """Return the collected text with newlines replaced by spaces and stripped.

        The text is returned as-is otherwise (it is not XML-escaped). An empty
        string is returned when no text has been collected yet.
        """
        return (self.content or "").replace('\n', ' ').strip()

    def startElement(self, name, attrs):
        """Reset the text buffer and start a new ``Article`` on ``<article>``."""
        self.content = ""

        if name == 'article':
            self.article = Article()

    def characters(self, c):
        """Append a chunk of character data to the text buffer."""
        self.content += c

    def endElement(self, name):
        """Store the collected text for ``name`` or save the finished article.

        Field elements that close while no article is being built are
        ignored instead of failing on the missing article.

        Raises:
            ValueError: if a ``<created>`` part is not an integer or the
                parts do not form a valid date.
            TypeError: if ``<created>`` does not split into a valid number
                of ``-``-separated parts for ``datetime.date``.
        """
        if name == 'article':
            if self.article:
                self.article.save()
                self.article = None
                self.count += 1

                # Progress report on stderr every 1000 saved articles.
                if self.count % 1000 == 0:
                    stderr.write("read in %d articles\n" % self.count)
            return

        if self.article is None:
            return

        if name in self._TEXT_FIELDS:
            setattr(self.article, self._TEXT_FIELDS[name], self.cleaned())
        elif name == 'created':
            # The text is split on '-' and passed as year, month, day.
            self.article.date = datetime.date(
                *[int(part) for part in self.cleaned().split('-')])
Ejemplo n.º 3
0
    def startElement(self, name, attrs):
        """Clear the text buffer; begin a new ``Article`` when ``<Article>`` opens."""
        self.content = ""

        # Only the 'Article' element starts a new object.
        if name != 'Article':
            return
        self.article = Article()
Ejemplo n.º 4
0
    def startElement(self, name, attrs):
        """Clear the text buffer; begin a new ``Article`` when ``<article>`` opens."""
        self.content = ""

        # Only the 'article' element starts a new object.
        if name != 'article':
            return
        self.article = Article()