def book_from_xml(xml):
    """Build a ``Publication`` from an XML record and print it.

    The record is scanned with regular expressions rather than an XML
    parser: every ``<author>`` element is collected, and the first
    ``<title>``, ``<year>`` and ``<ee>`` (link) elements are taken, each
    falling back to an empty string when missing. If a ``<journal>``
    element is present its text is attached through ``add_contributor``.
    The resulting publication is then shown with ``print_info()``.

    Args:
        xml: Text of the XML record to scan.

    Returns:
        None. The publication is only printed, not returned.
    """

    def first_group(pattern, default=""):
        # First capture group of the first match in ``xml``, else ``default``.
        found = re.search(pattern, xml)
        return found.group(1) if found is not None else default

    authors = re.findall(r'<author>(.+?)</author>', xml)
    title = first_group(r'<title>(.+?)</title>')
    year = first_group(r'<year>(.+?)</year>')
    link = first_group(r'<ee>(.+?)</ee>')

    book = Publication(title, authors, year, link)

    # The journal is optional; only attach a contributor when one was found.
    # The closing tag is matched in full (the pattern previously stopped at
    # "</journal" without the final ">").
    journal = first_group(r'<journal>(.+?)</journal>', default=None)
    if journal is not None:
        book.add_contributor(journal)

    book.print_info()
def get_book_from_block(self, block):
    """Build a ``Publication`` from one HTML result block and print it.

    The block is scanned with regular expressions:

    * title  - text of the first anchor following ``<h3>``;
    * authors - every anchor marked ``class="author-name-tooltip"``, with
      any nested tags removed from the captured text;
    * year   - a four-digit number preceded by a space inside the span
      that follows ``class="conference"``;
    * link   - the ``href`` of the anchor following ``<h3>``, prefixed
      with ``self.base_href``.

    Title, year and link default to an empty string when not found. A
    description (span under ``class="abstract"``) and a publisher (anchor
    right after the conference span) are attached only when present. The
    publication is then shown with ``print_info()``.

    Args:
        block: HTML text of a single search-result entry.

    Returns:
        None. The publication is only printed, not returned.
    """

    def first_group(pattern, default=""):
        # First capture group of the first match in ``block``, else ``default``.
        found = re.search(pattern, block)
        return found.group(1) if found is not None else default

    title = first_group(r'<h3>.*?<a.*?>(.*?)</a>')

    # Author names may contain inline markup; strip every tag from them.
    raw_authors = re.findall(r'class="author-name-tooltip".*?>(.*?)</a>', block)
    authors = [re.sub(r'<.*?>', '', name) for name in raw_authors]

    # Raw string is required here: "\d" in a plain literal is an invalid
    # escape sequence and triggers a warning on modern Python versions.
    year = first_group(r'class="conference">.*?<span.*? (\d{4}).*?</span>')

    # The captured href is joined onto the instance's base_href.
    href = first_group(r'<h3>.*?<a.*?href="(.*?)">', default=None)
    link = self.base_href + href if href is not None else ""

    book = Publication(title, authors, year, link)

    # Description and publisher are optional parts of a result block.
    desc = first_group(
        r'class="abstract">.*?<span.*?>(.*?)</span>.*?class="conference">.*?<span',
        default=None,
    )
    if desc is not None:
        book.add_description(desc)

    publisher = first_group(
        r'class="conference">.*?</span><a.*?>(.*?)</a>',
        default=None,
    )
    if publisher is not None:
        book.add_publisher(publisher)

    book.print_info()