def parse_template_4(self, element):
    """
    A template for joint proceedings of three workshops.

    Examples:
        - http://ceur-ws.org/Vol-981/
        - http://ceur-ws.org/Vol-862/
        - http://ceur-ws.org/Vol-853/
    """
    workshop_1 = {'id': 1}
    workshop_2 = {'id': 2}
    workshop_3 = {'id': 3}
    summary = self.rex(element[1], [
        r'(joint\s+proceedings\s+of\s+[the]*.*workshops:\s*([\s\w]+)\(([a-zA-Z]+)\d+\)'
        r'[and,\s]+([\s\w]+)\(([a-zA-Z]+)\d+\)[and,\s]+([\s\w]+)\(([a-zA-Z]+)\d+\)[,\s]+.*)Edited by.*',
        r"(joint\s+proceedings\s+of\s+([\s\w,]+)\(([a-zA-Z]+)['\s]?\d+\)[and,\s]+([\s\w-]+)\(([a-zA-Z]+)['\s]?\d+\)"
        r"[and,\s]+([\s\w]+)\(([a-zA-Z]+)['\s]?\d+\)[,\s]+.*)Edited by.*"
    ], re.I | re.S)

    if len(summary.groups()) != 7:
        raise DataNotFound()

    title = summary.group(1)
    workshop_1['volume_number'] = workshop_2['volume_number'] = workshop_3['volume_number'] = \
        WorkshopSummaryParser.extract_volume_number(element[0].get('href'))
    workshop_1['url'] = workshop_2['url'] = workshop_3['url'] = element[0].get('href')
    workshop_1['time'] = workshop_2['time'] = workshop_3['time'] = utils.parse_date(title)
    workshop_1['label'] = summary.group(2)
    workshop_1['short_label'] = summary.group(3)
    workshop_2['label'] = summary.group(4)
    workshop_2['short_label'] = summary.group(5)
    workshop_3['label'] = summary.group(6)
    workshop_3['short_label'] = summary.group(7)

    self.add_workshop(workshop_1)
    self.add_workshop(workshop_2)
    self.add_workshop(workshop_3)

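# Illustrative sketch (not part of the original parser): how the second of the
# joint-proceedings patterns used in parse_template_4 is meant to decompose a
# volume summary. The title string and the helper name are hypothetical.
def _demo_parse_joint_proceedings_title():
    import re
    pattern = (r"(joint\s+proceedings\s+of\s+([\s\w,]+)\(([a-zA-Z]+)['\s]?\d+\)"
               r"[and,\s]+([\s\w-]+)\(([a-zA-Z]+)['\s]?\d+\)"
               r"[and,\s]+([\s\w]+)\(([a-zA-Z]+)['\s]?\d+\)[,\s]+.*)Edited by.*")
    title = ("Joint Proceedings of the Workshop on Linked Data Quality (LDQ 2014), "
             "the Workshop on Dataset Profiling (PROF 2014) and "
             "the Workshop on Semantic Publishing (SEPUB 2014), "
             "co-located with ESWC 2014. Edited by John Doe.")
    match = re.match(pattern, title, re.I | re.S)
    # group(1) is the full title up to "Edited by"; groups 2/4/6 are the workshop
    # labels and groups 3/5/7 the acronyms: ('LDQ', 'PROF', 'SEPUB').
    return match.group(3), match.group(5), match.group(7)
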
def parse_template_1(self):
    """
    Examples:
        - http://ceur-ws.org/Vol-981/
    """
    self.begin_template()
    editors_block = u' '.join(
        self.grab.tree.xpath(
            '/html/body//text()[preceding::*[contains(., "Edited by")] and '
            'following::*[contains(.,"Table of Contents") or @class="CEURTOC"]]'
        ))
    editors = self.graph.objects(self.data['proceedings'], SWRC.editor)
    self.data['chairs'] = dict()
    for editor in editors:
        name = self.graph.objects(editor, FOAF.name).next()
        regexp = u'.*' + name + u'[\s~\xc2\xb0@#$%\^&*+-\xc2\xac]*\((\w+?)\d+\).*'
        match = re.match(regexp, editors_block, re.I | re.S)
        if match:
            self.data['chairs'][editor] = match.group(1)
    if len(self.data['chairs']) == 0:
        raise DataNotFound()

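# Illustrative sketch (not part of the original parser): the behaviour expected
# from the per-editor regexp built in parse_template_1. The editor name and the
# editors_block text below are hypothetical.
def _demo_match_chair_acronym():
    import re
    name = u'John Doe'
    editors_block = u'Edited by John Doe (EKAW2014) Jane Roe (LDQ2014)'
    regexp = u'.*' + name + u'[\s~\xc2\xb0@#$%\^&*+-\xc2\xac]*\((\w+?)\d+\).*'
    match = re.match(regexp, editors_block, re.I | re.S)
    # The captured group is the acronym of the workshop this editor chairs,
    # here u'EKAW'.
    return match.group(1)
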
def parse_template_main(self):
    proceedings_list = []
    tr = self.grab.tree.xpath(XPATH_SUMMARY)
    for i in range(0, len(tr), 2):
        href = tr[i].find(self.XPATH_SUMMARY_TITLE)
        try:
            if href.get('href') in config.input_urls or len(config.input_urls) == 1:
                proceedings = dict()
                proceedings['volume_number'] = ProceedingsSummaryParser.extract_volume_number(
                    href.get('href'))
                proceedings['url'] = href.get('href')
                summary_match = rex.rex(
                    tr[i + 1].find('.//td[last()]').text_content(),
                    r'(.*)(\nEdited\s*by\s*:\s*)(.*)(\nSubmitted\s*by\s*:\s*)(.*)(\nPublished\s*on\s*CEUR-WS:\s*)(.*)(\nONLINE)(.*)',
                    re.I | re.M | re.S)
                proceedings['label'] = re.sub(
                    r'\n', '', text.normalize_space(summary_match.group(1), ' \n'))
                proceedings['editors'] = re.split(
                    r",+\s*", text.normalize_space(summary_match.group(3)))
                proceedings['submission_date'] = datetime.strptime(
                    text.normalize_space(summary_match.group(7), ' \n'), '%d-%b-%Y')
                proceedings_list.append(proceedings)
        except:
            print "[WORKSHOP %s: ProceedingsSummaryParser] Summary information not found!" % href.get('href')
            #traceback.print_exc()
    self.data['proceedings_list'] = proceedings_list

    if len(proceedings_list) == 0:
        raise DataNotFound("There is no summary information to parse!")

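# Illustrative sketch (not part of the original parser): the layout of the
# summary cell that parse_template_main matches, and which capture groups the
# label, editors and submission date come from. The cell text is hypothetical,
# and plain re.search stands in for grab's rex.rex helper.
def _demo_parse_summary_cell():
    import re
    from datetime import datetime
    cell = (u'Proceedings of the Sample Workshop 2014'
            u'\nEdited by: John Doe, Jane Roe'
            u'\nSubmitted by: John Doe'
            u'\nPublished on CEUR-WS: 24-Jul-2014'
            u'\nONLINE: ...')
    summary_match = re.search(
        r'(.*)(\nEdited\s*by\s*:\s*)(.*)(\nSubmitted\s*by\s*:\s*)(.*)(\nPublished\s*on\s*CEUR-WS:\s*)(.*)(\nONLINE)(.*)',
        cell, re.I | re.M | re.S)
    label = summary_match.group(1)                         # u'Proceedings of the Sample Workshop 2014'
    editors = re.split(r",+\s*", summary_match.group(3))   # [u'John Doe', u'Jane Roe']
    date = datetime.strptime(summary_match.group(7), '%d-%b-%Y')  # 24 July 2014
    return label, editors, date
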
def check_for_completeness(self):
    if len(self.data['publications']) == 0:
        self.data = {}
        raise DataNotFound()

def parse_template_3(self):
    self.begin_template()
    publications = []
    elements = self.grab.tree.xpath('//li[a[@href] and (i or em or br)]')
    if elements is None or len(elements) == 0:
        elements = self.grab.tree.xpath('//p[a[@href] and (i or em)]')
    for publication in elements:
        try:
            name = clean_string(publication.find('a').text_content())
            if rex.rex(name,
                       r'.*(preface|first\s+pages|author\s+list|foreword).*',
                       re.I, default=None):
                # Examples: 180, 186
                continue
            link = publication.find('a').get('href')
            editors = []
            editors_tag = None
            if publication.find('i') is not None:
                editors_tag = publication.findall('i')[-1]
            elif publication.find('em') is not None:
                editors_tag = publication.find('em')
            if editors_tag is None:
                editors_tag_content = publication.find('br').tail
            else:
                editors_tag_content = editors_tag.text_content()
            editors_tag_content = re.sub(r'\s*[,\s]*and\s+', ',', editors_tag_content,
                                         flags=re.I | re.S).strip()
            if not editors_tag_content:
                # a publication should have non-empty list of authors
                raise DataNotFound(link)
            for publication_editor_name in editors_tag_content.split(","):
                pen = clean_string(publication_editor_name.strip())
                if pen:
                    editors.append(pen)
            file_name = link.rsplit('.pdf')[0].rsplit('/')[-1]
            publication_object = {
                'name': name,
                'file_name': file_name,
                'link': self.task.url + link,
                'editors': editors
            }
            publication_object['is_invited'] = self.is_invited(publication_object)
            if self.check_for_workshop_paper(publication_object):
                publications.append(publication_object)
        except Exception as ex:
            #traceback.print_exc()
            raise DataNotFound(ex)
    self.data['publications'] = publications
    self.end_template()

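# Illustrative sketch (not part of the original parser): the kind of <li> entry
# parse_template_3 is written for -- a PDF link followed by the author list in
# an <i>/<em> tag (or, failing that, in the tail of a <br>). The HTML fragment
# is hypothetical.
def _demo_template_3_item():
    from lxml import html
    tree = html.fromstring(
        '<ul>'
        '<li><a href="paper1.pdf">A Sample Paper</a><br/>'
        '<i>John Doe, Jane Roe and Max Mustermann</i></li>'
        '</ul>')
    item = tree.xpath('//li[a[@href] and (i or em or br)]')[0]
    name = item.find('a').text_content()            # 'A Sample Paper'
    authors = item.findall('i')[-1].text_content()  # 'John Doe, Jane Roe and Max Mustermann'
    return name, authors
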
def parse_template_2(self):
    """
    Examples:
        - http://ceur-ws.org/Vol-1008/
        - http://ceur-ws.org/Vol-1043/
    """
    self.begin_template()
    publications = []
    for element in self.grab.tree.xpath(
            '/html/body//*[@class="CEURTOC"]//*[a and '
            'descendant-or-self::*[@class="CEURAUTHORS"] and '
            'descendant-or-self::*[@class="CEURTITLE"]]'):
        try:
            name_el = element.find_class('CEURTITLE')[0]
            name = clean_string(name_el.text_content()).strip()
            if name is None or not name:
                # In case of unclosed span element with the author list
                # Example: http://ceur-ws.org/Vol-640
                name = clean_string(name_el.tail)
            href = element.find('a').get('href')
            link = href if href.startswith('http://') else self.task.url + href
            editors = []
            editors_list_el = element.find_class('CEURAUTHORS')[0]
            editors_list = clean_string(editors_list_el.text_content())
            if not editors_list:
                # In case of unclosed span element with the author list
                # Example: http://ceur-ws.org/Vol-1043
                editors_list = clean_string(editors_list_el.tail)
            for editor_name in editors_list.split(","):
                editor_name = clean_string(editor_name.strip())
                if editor_name:
                    editors.append(editor_name)
            if not editors:
                # a publication should have non-empty list of authors
                raise DataNotFound(link)
            file_name = link.rsplit('.pdf')[0].rsplit('/')[-1]
            publication_object = {
                'name': name,
                'file_name': file_name,
                'link': link,
                'editors': editors
            }
            publication_object['is_invited'] = self.is_invited(publication_object)
            if len(self.data['workshops']) > 1:
                try:
                    session = self.grab.tree.xpath(
                        '//a[@href="%s"]/preceding::*[@class="CEURSESSION"][1]' % href)[0]
                    publication_object['presentedAt'] = []
                    for w in self.data['workshops']:
                        if w[1] is not None and w[1] in session.text:
                            publication_object['presentedAt'].append(w[0])
                except:
                    # traceback.print_exc()
                    pass
            if self.check_for_workshop_paper(publication_object):
                publications.append(publication_object)
        except Exception as ex:
            raise DataNotFound(ex)
    self.data['publications'] = publications
    self.end_template()

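# Illustrative sketch (not part of the original parser): a minimal CEURTOC
# fragment of the kind parse_template_2 queries, showing what its XPath selects
# and where the title and author list come from. The HTML is hypothetical.
def _demo_ceurtoc_entry():
    from lxml import html
    tree = html.fromstring(
        '<div class="CEURTOC"><ul>'
        '<li id="paper1"><a href="paper1.pdf">'
        '<span class="CEURTITLE">A Sample Paper</span></a> '
        '<span class="CEURAUTHORS">John Doe, Jane Roe</span></li>'
        '</ul></div>')
    element = tree.xpath(
        '//*[@class="CEURTOC"]//*[a and '
        'descendant-or-self::*[@class="CEURAUTHORS"] and '
        'descendant-or-self::*[@class="CEURTITLE"]]')[0]
    name = element.find_class('CEURTITLE')[0].text_content()       # 'A Sample Paper'
    authors = element.find_class('CEURAUTHORS')[0].text_content()  # 'John Doe, Jane Roe'
    return name, authors
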