from StringIO import StringIO

from lxml import etree
from celery.task import task
from celery.task.sets import subtask  # Celery 2.x layout; on 3.x: from celery import subtask

# Python 2 module: `unicode`, and list-returning `filter`/`map`, are relied on
# throughout. `perform` and the a_*/m_* pipeline helpers (a_join, a_find,
# a_trim, a_int, a_date, a_split_semicolon, a_split_komma, m_trim) are
# project-local and assumed to be imported from elsewhere in the package.


@task  # .get_logger() below requires a registered Celery task
def parse_wok_page(page, qobj, callback=None):
    logger = parse_wok_page.get_logger()
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(page), parser)
    # each ISI/WoK result row lives in a td.summary_data cell
    elements = tree.xpath('//td[@class="summary_data"]')

    # to be returned
    records = []
    for element in elements:
        record = {}
        # the title text is nested in <value> elements inside the link
        record['title'] = perform(element.xpath('a/value//text()'),
                                  a_join, unicode)
        record['source'] = perform(
            element.xpath(
                'span[contains(text(),"Source")]/following-sibling::text()')[0],
            unicode, unicode.strip)
        record['authors'] = perform(
            element.xpath(
                'span[contains(text(),"Author")]/following-sibling::text()')[0],
            unicode, a_split_semicolon, m_trim)
        record['publish_date'] = perform(
            element.xpath(
                'span[contains(text(),"Published")]/following::text()')[1],
            lambda x: a_find(x, r'(\d{4})'), a_int, a_date)
        record['times_cited'] = perform(
            element.xpath(
                'span[contains(text(),"Times Cited")]/following::text()')[1],
            a_trim, lambda s: s.replace(',', ''), a_int)

        # remove "et al" pseudo-authors
        record['authors'] = filter(
            lambda author: not author.startswith('et al'),
            record['authors'])
        # convert name from "Doe, J" to "J Doe"; split on the comma so it is
        # not carried along into the reordered name
        record['authors'] = map(
            lambda author: ' '.join(
                reversed(map(unicode.strip, author.split(',')))),
            record['authors'])

        records.append(record)

    logger.warning("Got %d results for the query '%s' from isi/wok"
                   % (len(records), qobj.query))
    if callback:
        return subtask(callback).delay(records=records, qobj=qobj)
    else:
        return records
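# For reference, a minimal sketch of the contract `perform` is assumed to
# satisfy, inferred purely from the call sites above (the project's actual
# implementation lives elsewhere): the first argument is threaded through each
# remaining callable in order.
#
#     def perform(value, *steps):
#         for step in steps:
#             value = step(value)
#         return value
#
# By the same reading, a_* helpers transform one value (a_join concatenates
# text nodes, a_find returns the first regex group, a_int/a_date coerce types)
# while m_* helpers map a transformation over a list (m_trim strips each item).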
@task
def parse_scholar_page(url, page, qobj, callback=None):
    logger = parse_scholar_page.get_logger()
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(page), parser)
    # each Google Scholar result lives in a div.gs_r directly under <body>
    elements = tree.xpath("//body/div[@class='gs_r']")

    # to be returned
    records = []
    for element in elements:
        record = {}
        record['title'] = perform(
            element.xpath('h3[@class="gs_rt"]/a//text()'), a_join, unicode)
        record['url'] = perform(
            element.xpath('h3[@class="gs_rt"]/a/@href'), a_join, unicode)
        record['snippet'] = perform(
            element.xpath('div[@class="gs_rs"]//text()'), a_join, unicode)
        # the div.gs_a byline reads "Authors - Source, 2012 - domain";
        # [,-] matches the comma or hyphen just before the year
        record['source'] = perform(
            element.xpath('div[@class="gs_a"]//text()'), a_join,
            lambda x: a_find(x, r'-\s+(.+)[,-]\s+\d{4}'), unicode)
        record['authors'] = perform(
            element.xpath('div[@class="gs_a"]//text()'), a_join,
            lambda x: a_find(x, r'\A(.+?)\s+-\s+'), unicode,
            a_split_komma, m_trim)
        record['publish_date'] = perform(
            element.xpath('div[@class="gs_a"]//text()'), a_join,
            lambda x: a_find(x, r'\s+(\d{4})\s+-'), a_int, a_date)
        records.append(record)

    logger.warning("Got %d results for the query '%s' from scholar"
                   % (len(records), qobj.query))
    if callback:
        return subtask(callback).delay(records, qobj)
    else:
        return url, records
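# Usage sketch, under stated assumptions: `html` is a page already fetched by
# the caller, `qobj` exposes a `query` attribute, and `store_records` is a
# hypothetical downstream Celery task.
#
#     url, records = parse_scholar_page(url, html, qobj)   # synchronous call
#
#     # asynchronously, forwarding the parsed records to a callback subtask
#     # instead of returning them to the caller:
#     parse_wok_page.delay(html, qobj, callback=store_records.subtask())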