Ejemplo n.º 1
0
 def __init__(self, xmlcompatibility='db09'):
     """
     Create the helper objects used by the extractor.

     xmlcompatibility -- compatibility tag (default 'db09'). It is passed
     on to the citation extractor, stored unchanged, and also stored as an
     integer in _xmlvalid after leading 'd'/'b' characters are stripped.
     """
     # Sequence wrapper tuning values, collected in one place.
     wrapper_settings = {
         'childcoef': 7.0,
         'headercoef': 3.0,
         'mintextlen': 30,
     }
     self.seqwrapper = HTMLSequenceWrapper(**wrapper_settings)
     self.citaextractor = CitationEntityExtractor(
         ALL, xmlcompatibility=xmlcompatibility)

     # Stateless helpers.
     self.ee = EntityExtractor()
     self.mime = MIMEhandler()
     self.crawler = Crawler()
     self.bibtex = BibTeXParser()

     # Compatibility tag and its integer form ('db09' -> 9).
     self.xmlcompatibility = xmlcompatibility
     version_digits = xmlcompatibility.lstrip('db')
     self._xmlvalid = int(version_digits)

     # Publications collected so far.
     self._publ_list = list()
 def __init__(self, xmlcompatibility='db09'):
     """
     Create the helper objects used by the extractor.

     xmlcompatibility -- compatibility tag (default 'db09'). It is passed
     on to the citation extractor, stored unchanged, and also stored as an
     integer in _xmlvalid after leading 'd'/'b' characters are stripped.
     """
     # Sequence wrapper tuning values, collected in one place.
     wrapper_settings = {
         'childcoef': 7.0,
         'headercoef': 3.0,
         'mintextlen': 30,
     }
     self.seqwrapper = HTMLSequenceWrapper(**wrapper_settings)
     self.citaextractor = CitationEntityExtractor(
         ALL, xmlcompatibility=xmlcompatibility)

     # Stateless helpers.
     self.ee = EntityExtractor()
     self.mime = MIMEhandler()
     self.crawler = Crawler()
     self.bibtex = BibTeXParser()

     # Compatibility tag and its integer form ('db09' -> 9).
     self.xmlcompatibility = xmlcompatibility
     version_digits = xmlcompatibility.lstrip('db')
     self._xmlvalid = int(version_digits)

     # Publications collected so far.
     self._publ_list = list()
Ejemplo n.º 3
0
class PublicationListExtractor(object):
    """
    PublicationListExtractor harvests metadata from web pages containing
    references (a publication list). For parsing sequences in the HTML DOM
    we use extractors.sequencewrapper.HTMLSequenceWrapper. For parsing
    citations (records in data regions found by the sequence wrapper) we use
    extractors.citationentityextractor.CitationEntityExtractor.

    To improve accuracy, the header of every data region is checked for a
    keyword that helps to determine the correct type of publication.

    Headers that do not name a publication type are harvested as topics.
    """

    # Maps the body of an HTML entity (numeric code or name, without the
    # leading '&'/'&#' and the trailing ';') to a plain replacement string.
    # The original table listed '200', '232' and '248' twice; only the later
    # ("added") values were ever effective, so only those are kept here.
    entitydefstr = {
        '169': 'S', '174': 'Z', '185': 's', '190': 'z',
        '192': 'A', '193': 'A', '194': 'A', '195': 'A',
        '196': 'A', '197': 'A', '198': 'E', '199': 'C',
        '200': 'C', '201': 'E', '202': 'E', '203': 'E',
        '204': 'I', '205': 'I', '206': 'I', '207': 'I',
        '210': 'O', '211': 'O', '212': 'O', '213': 'O',
        '214': 'O', '216': 'O', '217': 'U', '218': 'U',
        '219': 'U', '220': 'U', '221': 'Y',
        '224': 'a', '225': 'a', '226': 'a', '227': 'a',
        '228': 'a', '229': 'a', '230': 'e', '231': 'c',
        '232': 'c', '233': 'e', '234': 'e', '235': 'e',
        '236': 'e', '237': 'i', '238': 'i', '239': 'i',
        '241': 'n', '242': 'o', '243': 'o', '244': 'o',
        '245': 'o', '246': 'o', '248': 'r', '249': 'u',
        '250': 'u', '251': 'u', '252': 'u', '253': 'y',
        '255': 'y',
        'amp': '&',
        'nbsp': ' ',
        'quot': '"',
    }

    def __init__(self, xmlcompatibility='db09'):
        """
        Create the helper objects used during extraction.

        xmlcompatibility -- compatibility tag (default 'db09'); forwarded to
        the citation extractor and stored. Its integer form is kept in
        _xmlvalid.
        """
        self.seqwrapper = HTMLSequenceWrapper(childcoef=7.0,
                                              headercoef=3.0,
                                              mintextlen=30)
        self.citaextractor = CitationEntityExtractor(
            ALL, xmlcompatibility=xmlcompatibility)
        self.ee = EntityExtractor()
        self.mime = MIMEhandler()
        self.crawler = Crawler()
        self.bibtex = BibTeXParser()
        self.xmlcompatibility = xmlcompatibility
        # NOTE: lstrip('db') strips any leading 'd'/'b' characters, not the
        # literal prefix 'db'; 'db09' becomes '09' and then 9.
        self._xmlvalid = int(xmlcompatibility.lstrip('db'))
        self._publ_list = []
        # Text of the citation record currently being processed. Defined
        # here so that _fill_citation never hits a missing attribute.
        self.cita_text = None

    def _set_new_topic(self, publ, kw):
        """
        Add a new topic made from the keyword kw to the publication.

        The keyword is ignored when it has no run of at least four
        lowercase letters, or when it contains "publi" or "paper"
        (case-insensitive). Returns publ in every case.
        """
        if not re.search("[a-z]{4,}", kw):
            return publ
        if re.search("publi|paper", kw, re.I):
            return publ
        publ.set('topic', RRSTopic(title=kw))
        return publ

    def _set_publ_type(self, header, publ):
        """
        Use the header of a data region to set the publication type.

        The header is searched (case-insensitive) for each known publication
        type, then for "dissertation" and "technical report". When nothing
        matches, the header is used as a topic instead. Returns publ.
        """
        if header is None:
            return publ
        # try to set publication type from header
        for _type in RRSPublication.publication_types:
            if re.search(_type, header, re.I):
                if publ.get('type') == _type:
                    # Header agrees with the extracted type: credibility is
                    # stored back, limited to at most 100.
                    # NOTE(review): the value is only capped, never raised
                    # here - confirm that this is the intent.
                    publ.set('credibility',
                             min(publ.get('credibility'), 100))
                else:
                    publ.set('type', _type)
                return publ
        if re.search("dissertation", header, re.I):
            publ.set('type', 'phdthesis')
            return publ
        if re.search('technical report', header, re.I):
            publ.set('type', 'techreport')
            return publ
        # make keyword from header
        return self._set_new_topic(publ, header)

    def translate_html_entities(self, text):
        """
        Replace HTML entities listed in entitydefstr by plain characters.

        Both '&name;' and '&#name;' forms are recognised. Entities missing
        from the table are left untouched. The text is processed in a single
        pass, so the result does not depend on the order in which entities
        are visited and replaced text is never re-interpreted.
        """
        table = self.entitydefstr

        def _replace(match):
            # Unknown entity -> keep the matched text exactly as it was.
            return table.get(match.group(1), match.group(0))

        return re.sub(r'&#?(\w+);', _replace, text)

    def compare_chunks_to_extracted(self, chunks, publ):
        """
        Repair the extracted publication using the text chunks of a record.

        For every chunk: a usable link is stored as the publication url; a
        chunk contained in the title and longer than 40% of it replaces the
        title; a chunk containing any extracted author name is re-parsed by
        the entity extractor and the first element of its result replaces
        the authors. Publications without a title are returned unchanged.
        """
        if not publ.get('title'):
            return publ
        title = self.translate_html_entities(publ.get('title'))
        authors = publ.get('person_author')
        author_names = [a.get('name')[0].get('full_name') for a in authors]
        for chunk in chunks:
            link = chunk.get_link()
            # get chunk text
            chunk_text = self.translate_html_entities(chunk.get_text())
            # add url if available
            if (link is not None and not link.startswith("javascript")
                    and link != "#"):
                publ.set('url', RRSUrl(type='publication',
                                       title=chunk_text, link=link))

            # repair title if needed
            if chunk_text in title:
                if float(len(chunk_text)) / float(len(title)) > 0.4:
                    publ.set('title', chunk_text)
            # repair names if needed; every author name is tried and the
            # search stops at the first one found in the chunk
            for name in author_names:
                if name in chunk_text:
                    authors_extracted = self.ee.find_authors(chunk_text)
                    publ.person_author = authors_extracted[0]
                    break
        return publ

    def _fill_citation(self, publ):
        """
        Build a citation holding the current record text and, when the
        publication has an event, the title of its first event.
        """
        c = RRSCitation()
        c.set('content', self.cita_text)
        if publ.get('event'):
            c.set('event', publ.get('event')[0].get('title'))
        return c

    def _handle_bibtex_pages(self):
        """
        Replace publications by BibTeX data found on their linked pages.

        Links whose content type indicates a web page are downloaded and
        parsed; when the parser returns a result, it replaces the
        publication the link belongs to.
        """
        # link -> index of the owning publication in _publ_list
        urls = {}
        for i, p in enumerate(self._publ_list):
            for u in p.get('url'):
                urls[u.get('link')] = i

        # if link is web page, not pdf
        web_types = ('text/html', 'application/xhtml+xml',
                     'application/x-httpd-php', 'text/javascript')
        content_types = self.mime.start(urls.keys())
        urls_to_download = [k for k in urls if content_types[k] in web_types]
        # download page and try it for bibtex
        pages = self.crawler.start(urls_to_download)

        for u in urls_to_download:
            bibtex = self.bibtex.parse(pages[u])
            # if bibtex on page, set publication
            if bibtex is not None:
                self._publ_list[urls[u]] = bibtex

    def _empty(self):
        """
        Reset the state of the previous run. The list is cleared in place,
        so existing references to it see the change.
        """
        del self._publ_list[:]
        self.cita_text = None

    def _handle_document(self, doc):
        """
        Extract one publication per record of every region in doc and
        return the internal publication list (cleared on the next call).
        """
        self._empty()
        # for all regions which were found
        for reg in doc.get_regions():
            # get their header
            header = reg.get_name()
            # for all records in region
            for rec in reg._manual_process_page():
                # create empty citation object
                c = RRSCitation()
                # harvest citation record text (hopefully a citation)
                self.cita_text = self.translate_html_entities(rec.get_text())
                # set the content of record to citation object
                c.set('content', self.cita_text)
                # fill object with extracted data
                c = self.citaextractor.extract(c)

                # get extracted publication
                publ = c.get('publication_cited')
                # if the sequence wrapper extracted some text chunks, it
                # helps a lot, because we can compare extracted data to the
                # chunks and fix it when it does not match
                publ = self.compare_chunks_to_extracted(rec.get_chunks(), publ)
                # insert citation into publication
                # !!! we are extracting publications, not citations, because
                # we do not want a tree like citation->publication but
                # publication->citation
                publ.set('citation', self._fill_citation(publ))
                # try to find publication type in header of data region
                publ = self._set_publ_type(header, publ)
                # add to publication list
                self._publ_list.append(publ)
        # BibTeX page handling is currently disabled:
        # self._handle_bibtex_pages()
        return self._publ_list

    #---------------------------------------------------------------------------
    # public methods
    #---------------------------------------------------------------------------
    def extract_data(self, tree, url):
        """
        Main method for extracting publication metadata from page.
        """
        # wrap html document
        document = self.seqwrapper.wrap_h(tree, url)
        # handle it and return the result
        return self._handle_document(document)
class PublicationListExtractor(object):
    """
    PublicationListExtractor harvests metadata from web pages containing
    references (a publication list). For parsing sequences in the HTML DOM
    we use extractors.sequencewrapper.HTMLSequenceWrapper. For parsing
    citations (records in data regions found by the sequence wrapper) we use
    extractors.citationentityextractor.CitationEntityExtractor.

    To improve accuracy, the header of every data region is checked for a
    keyword that helps to determine the correct type of publication.

    Headers that do not name a publication type are harvested as topics.
    """

    # Maps the body of an HTML entity (numeric code or name, without the
    # leading '&'/'&#' and the trailing ';') to a plain replacement string.
    # The original table listed '200', '232' and '248' twice; only the later
    # ("added") values were ever effective, so only those are kept here.
    entitydefstr = {
        '169': 'S', '174': 'Z', '185': 's', '190': 'z',
        '192': 'A', '193': 'A', '194': 'A', '195': 'A',
        '196': 'A', '197': 'A', '198': 'E', '199': 'C',
        '200': 'C', '201': 'E', '202': 'E', '203': 'E',
        '204': 'I', '205': 'I', '206': 'I', '207': 'I',
        '210': 'O', '211': 'O', '212': 'O', '213': 'O',
        '214': 'O', '216': 'O', '217': 'U', '218': 'U',
        '219': 'U', '220': 'U', '221': 'Y',
        '224': 'a', '225': 'a', '226': 'a', '227': 'a',
        '228': 'a', '229': 'a', '230': 'e', '231': 'c',
        '232': 'c', '233': 'e', '234': 'e', '235': 'e',
        '236': 'e', '237': 'i', '238': 'i', '239': 'i',
        '241': 'n', '242': 'o', '243': 'o', '244': 'o',
        '245': 'o', '246': 'o', '248': 'r', '249': 'u',
        '250': 'u', '251': 'u', '252': 'u', '253': 'y',
        '255': 'y',
        'amp': '&',
        'nbsp': ' ',
        'quot': '"',
    }

    def __init__(self, xmlcompatibility='db09'):
        """
        Create the helper objects used during extraction.

        xmlcompatibility -- compatibility tag (default 'db09'); forwarded to
        the citation extractor and stored. Its integer form is kept in
        _xmlvalid.
        """
        self.seqwrapper = HTMLSequenceWrapper(childcoef=7.0,
                                              headercoef=3.0,
                                              mintextlen=30)
        self.citaextractor = CitationEntityExtractor(
            ALL, xmlcompatibility=xmlcompatibility)
        self.ee = EntityExtractor()
        self.mime = MIMEhandler()
        self.crawler = Crawler()
        self.bibtex = BibTeXParser()
        self.xmlcompatibility = xmlcompatibility
        # NOTE: lstrip('db') strips any leading 'd'/'b' characters, not the
        # literal prefix 'db'; 'db09' becomes '09' and then 9.
        self._xmlvalid = int(xmlcompatibility.lstrip('db'))
        self._publ_list = []
        # Text of the citation record currently being processed. Defined
        # here so that _fill_citation never hits a missing attribute.
        self.cita_text = None

    def _set_new_topic(self, publ, kw):
        """
        Add a new topic made from the keyword kw to the publication.

        The keyword is ignored when it has no run of at least four
        lowercase letters, or when it contains "publi" or "paper"
        (case-insensitive). Returns publ in every case.
        """
        if not re.search("[a-z]{4,}", kw):
            return publ
        if re.search("publi|paper", kw, re.I):
            return publ
        publ.set('topic', RRSTopic(title=kw))
        return publ

    def _set_publ_type(self, header, publ):
        """
        Use the header of a data region to set the publication type.

        The header is searched (case-insensitive) for each known publication
        type, then for "dissertation" and "technical report". When nothing
        matches, the header is used as a topic instead. Returns publ.
        """
        if header is None:
            return publ
        # try to set publication type from header
        for _type in RRSPublication.publication_types:
            if re.search(_type, header, re.I):
                if publ.get('type') == _type:
                    # Header agrees with the extracted type: credibility is
                    # stored back, limited to at most 100.
                    # NOTE(review): the value is only capped, never raised
                    # here - confirm that this is the intent.
                    publ.set('credibility',
                             min(publ.get('credibility'), 100))
                else:
                    publ.set('type', _type)
                return publ
        if re.search("dissertation", header, re.I):
            publ.set('type', 'phdthesis')
            return publ
        if re.search('technical report', header, re.I):
            publ.set('type', 'techreport')
            return publ
        # make keyword from header
        return self._set_new_topic(publ, header)

    def translate_html_entities(self, text):
        """
        Replace HTML entities listed in entitydefstr by plain characters.

        Both '&name;' and '&#name;' forms are recognised. Entities missing
        from the table are left untouched. The text is processed in a single
        pass, so the result does not depend on the order in which entities
        are visited and replaced text is never re-interpreted.
        """
        table = self.entitydefstr

        def _replace(match):
            # Unknown entity -> keep the matched text exactly as it was.
            return table.get(match.group(1), match.group(0))

        return re.sub(r'&#?(\w+);', _replace, text)

    def compare_chunks_to_extracted(self, chunks, publ):
        """
        Repair the extracted publication using the text chunks of a record.

        For every chunk: a usable link is stored as the publication url; a
        chunk contained in the title and longer than 40% of it replaces the
        title; a chunk containing any extracted author name is re-parsed by
        the entity extractor and the first element of its result replaces
        the authors. Publications without a title are returned unchanged.
        """
        if not publ.get('title'):
            return publ
        title = self.translate_html_entities(publ.get('title'))
        authors = publ.get('person_author')
        author_names = [a.get('name')[0].get('full_name') for a in authors]
        for chunk in chunks:
            link = chunk.get_link()
            # get chunk text
            chunk_text = self.translate_html_entities(chunk.get_text())
            # add url if available
            if (link is not None and not link.startswith("javascript")
                    and link != "#"):
                publ.set('url', RRSUrl(type='publication',
                                       title=chunk_text, link=link))

            # repair title if needed
            if chunk_text in title:
                if float(len(chunk_text)) / float(len(title)) > 0.4:
                    publ.set('title', chunk_text)
            # repair names if needed; every author name is tried and the
            # search stops at the first one found in the chunk
            for name in author_names:
                if name in chunk_text:
                    authors_extracted = self.ee.find_authors(chunk_text)
                    publ.person_author = authors_extracted[0]
                    break
        return publ

    def _fill_citation(self, publ):
        """
        Build a citation holding the current record text and, when the
        publication has an event, the title of its first event.
        """
        c = RRSCitation()
        c.set('content', self.cita_text)
        if publ.get('event'):
            c.set('event', publ.get('event')[0].get('title'))
        return c

    def _handle_bibtex_pages(self):
        """
        Replace publications by BibTeX data found on their linked pages.

        Links whose content type indicates a web page are downloaded and
        parsed; when the parser returns a result, it replaces the
        publication the link belongs to.
        """
        # link -> index of the owning publication in _publ_list
        urls = {}
        for i, p in enumerate(self._publ_list):
            for u in p.get('url'):
                urls[u.get('link')] = i

        # if link is web page, not pdf
        web_types = ('text/html', 'application/xhtml+xml',
                     'application/x-httpd-php', 'text/javascript')
        content_types = self.mime.start(urls.keys())
        urls_to_download = [k for k in urls if content_types[k] in web_types]
        # download page and try it for bibtex
        pages = self.crawler.start(urls_to_download)

        for u in urls_to_download:
            bibtex = self.bibtex.parse(pages[u])
            # if bibtex on page, set publication
            if bibtex is not None:
                self._publ_list[urls[u]] = bibtex

    def _empty(self):
        """
        Reset the state of the previous run. The list is cleared in place,
        so existing references to it see the change.
        """
        del self._publ_list[:]
        self.cita_text = None

    def _handle_document(self, doc):
        """
        Extract one publication per record of every region in doc and
        return the internal publication list (cleared on the next call).
        """
        self._empty()
        # for all regions which were found
        for reg in doc.get_regions():
            # get their header
            header = reg.get_name()
            # for all records in region
            for rec in reg._manual_process_page():
                # create empty citation object
                c = RRSCitation()
                # harvest citation record text (hopefully a citation)
                self.cita_text = self.translate_html_entities(rec.get_text())
                # set the content of record to citation object
                c.set('content', self.cita_text)
                # fill object with extracted data
                c = self.citaextractor.extract(c)

                # get extracted publication
                publ = c.get('publication_cited')
                # if the sequence wrapper extracted some text chunks, it
                # helps a lot, because we can compare extracted data to the
                # chunks and fix it when it does not match
                publ = self.compare_chunks_to_extracted(rec.get_chunks(), publ)
                # insert citation into publication
                # !!! we are extracting publications, not citations, because
                # we do not want a tree like citation->publication but
                # publication->citation
                publ.set('citation', self._fill_citation(publ))
                # try to find publication type in header of data region
                publ = self._set_publ_type(header, publ)
                # add to publication list
                self._publ_list.append(publ)
        # BibTeX page handling is currently disabled:
        # self._handle_bibtex_pages()
        return self._publ_list

    #---------------------------------------------------------------------------
    # public methods
    #---------------------------------------------------------------------------
    def extract_data(self, tree, url):
        """
        Main method for extracting publication metadata from page.
        """
        # wrap html document
        document = self.seqwrapper.wrap_h(tree, url)
        # handle it and return the result
        return self._handle_document(document)