import re

# Only the first two import paths below are documented in the class
# docstring; the remaining collaborators (EntityExtractor, MIMEhandler,
# Crawler, BibTeXParser and the RRS* model classes RRSTopic, RRSPublication,
# RRSCitation, RRSUrl) must be imported from their respective project
# modules as well.
from extractors.sequencewrapper import HTMLSequenceWrapper
from extractors.citationentityextractor import CitationEntityExtractor, ALL


class PublicationListExtractor(object):
    """
    PublicationListExtractor harvests metadata from web pages containing
    references (publication lists).

    Sequences in the HTML DOM are parsed with
    extractors.sequencewrapper.HTMLSequenceWrapper. Citations (records in
    the data regions found by the sequence wrapper) are parsed with
    extractors.citationentityextractor.CitationEntityExtractor.

    To improve accuracy, region headers are checked for keywords that help
    determine the correct publication type; headers are also harvested for
    topic keywords.
    """

    # ASCII replacements for HTML character entities (decimal code points
    # and a few named entities)
    entitydefstr = {
        '216': 'O', '217': 'U', '214': 'O', '197': 'A', '198': 'E',
        '210': 'O', '211': 'O', '195': 'A', '194': 'A', '196': 'A',
        '193': 'A', '192': 'A', '251': 'u', '252': 'u', '238': 'i',
        '239': 'i', '235': 'e', '234': 'e', '212': 'O', '236': 'e',
        '237': 'i', '230': 'e', '231': 'c', '213': 'O', '224': 'a',
        '249': 'u', '253': 'y', '243': 'o', '255': 'y', '250': 'u',
        '233': 'e', '201': 'E', '203': 'E', '202': 'E', '205': 'I',
        '204': 'I', '207': 'I', '206': 'I', '242': 'o', '220': 'U',
        '245': 'o', '244': 'o', '246': 'o', '241': 'n', '218': 'U',
        '229': 'a', '228': 'a', '227': 'a', '226': 'a', '225': 'a',
        '219': 'U', '221': 'Y',
        # these were added later: Czech characters, whose codes (248, 232,
        # 200) deliberately take precedence over the Latin-1 readings
        '248': 'r', '185': 's', '174': 'Z', '232': 'c', '200': 'C',
        '169': 'S', '190': 'z', '199': 'C',
        'amp': '&', 'nbsp': ' ', 'quot': '"'
    }

    def __init__(self, xmlcompatibility='db09'):
        self.seqwrapper = HTMLSequenceWrapper(childcoef=7.0, headercoef=3.0,
                                              mintextlen=30)
        self.citaextractor = CitationEntityExtractor(
            ALL, xmlcompatibility=xmlcompatibility)
        self.ee = EntityExtractor()
        self.mime = MIMEhandler()
        self.crawler = Crawler()
        self.bibtex = BibTeXParser()
        self.xmlcompatibility = xmlcompatibility
        self._xmlvalid = int(xmlcompatibility.lstrip('db'))
        self._publ_list = []

    def _set_new_topic(self, publ, kw):
        """
        Add the keyword as a new topic of the publication.
""" if not re.search("[a-z]{4,}", kw): return publ if re.search("publi|paper", kw, re.I): return publ t = RRSTopic(title=kw) publ.set('topic', t) return publ def _set_publ_type(self, header, publ): def _floor(i): if i > 100: i = 100 return i if header is None: return publ # try to set publication type from header for _type in RRSPublication.publication_types: if re.search(_type, header, re.I): if publ.get('type') == _type: publ.set('credibility', _floor(publ.get('credibility'))) else: publ.set('type', _type) return publ if re.search("dissertation", header, re.I): publ.set('type', 'phdthesis') return publ if re.search('technical report', header, re.I): publ.set('type', 'techreport') return publ # make keyword from header return self._set_new_topic(publ, header) def translate_html_entities(self, text): ents = re.findall(r'&(#?)(x?)(\w+);', text) for ent in set(ents): try: text = re.sub('&(#?)' + re.escape(ent[2]) + ";", self.entitydefstr[ent[2]], text) except: pass return text def compare_chunks_to_extracted(self, chunks, publ): if not publ.get('title'): return publ title = self.translate_html_entities(publ.get('title')) authors = publ.get('person_author') author_names = [a.get('name')[0].get('full_name') for a in authors] for ch in chunks: l = ch.get_link() # get chunk text ch = self.translate_html_entities(ch.get_text()) # add url if available if l is not None and not l.startswith("javascript") and l != "#": u = RRSUrl(type='publication', title=ch, link=l) publ.set('url', u) # repair title if needed if ch in title or ch == title: if float(len(ch)) / float(len(title)) > 0.4: publ.set('title', ch) # repair names if needed for a in author_names: if a in ch: authors_extracted = self.ee.find_authors(ch) publ.person_author = authors_extracted[0] break return publ def _fill_citation(self, publ): c = RRSCitation() c.set('content', self.cita_text) if publ.get('event'): c.set('event', publ.get('event')[0].get('title')) return c def _handle_bibtex_pages(self): urls = {} for i, p in enumerate(self._publ_list): pub_u = p.get('url') for u in pub_u: urls[u.get('link')] = i #if link is web page, not pdf urls_to_download = [] content_types = self.mime.start(urls.keys()) for k in urls.keys(): if content_types[k] in ('text/html', 'application/xhtml+xml', 'application/x-httpd-php', 'text/javascript'): urls_to_download.append(k) # download page a try it for bibtex pages = self.crawler.start(urls_to_download) for u in urls_to_download: bibtex = self.bibtex.parse(pages[u]) # if bibtex on page, set publication if bibtex is not None: self._publ_list[urls[u]] = bibtex def _empty(self): for x in range(len(self._publ_list)): self._publ_list.pop() self.cita_text = None def _handle_document(self, doc): self._empty() # for all regions which were found for reg in doc.get_regions(): # get their header header = reg.get_name() # for all records in region for rec in reg._manual_process_page(): # create empty citation object c = RRSCitation() # harvest citation record text (probably citation we hope) self.cita_text = self.translate_html_entities(rec.get_text()) # set the content of record to citation object c.set('content', self.cita_text) # fill object it wih extracted data c = self.citaextractor.extract(c) # get extracted publication publ = c.get('publication_cited') # if sequencewrapper extracted come text chunks, it helps us a lot, # beacause we can compare extracted data to chunks and if not matched # we can fix it publ = self.compare_chunks_to_extracted(rec.get_chunks(), publ) # insert citation into publication # !!! 
                # we are extracting publications, not citations, so the
                # desired tree is publication->citation rather than
                # citation->publication
                publ.set('citation', self._fill_citation(publ))
                # try to find the publication type in the data region header
                publ = self._set_publ_type(header, publ)
                # add it to the publication list
                self._publ_list.append(publ)
        #self._handle_bibtex_pages()
        return self._publ_list

    #---------------------------------------------------------------------------
    # public methods
    #---------------------------------------------------------------------------
    def extract_data(self, tree, url):
        """
        Main method for extracting publication metadata from a page.
        """
        # wrap the HTML document
        document = self.seqwrapper.wrap_h(tree, url)
        # handle it and return the result
        return self._handle_document(document)
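
# A minimal usage sketch, not part of the original module. It assumes that
# extract_data() accepts an lxml document tree (the exact tree type expected
# by HTMLSequenceWrapper.wrap_h() is not shown in this source) and the URL
# below is purely illustrative.
if __name__ == '__main__':
    import lxml.html

    url = 'http://example.org/~someone/publications.html'
    tree = lxml.html.parse(url)  # assumption: wrap_h() takes an lxml tree
    extractor = PublicationListExtractor(xmlcompatibility='db09')
    for publ in extractor.extract_data(tree, url):
        # each result is an RRSPublication with a nested RRSCitation that
        # carries the raw citation text
        print(publ.get('title'))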