Ejemplo n.º 1
0
def parse_wok_page(page, qobj, callback=None):
    """Parse an ISI/Web of Knowledge result page into record dicts.

    Every ``td.summary_data`` cell becomes one record with title, source,
    authors, publish_date and times_cited.  When *callback* is given, the
    records are forwarded to that subtask; otherwise they are returned.
    """
    logger = parse_wok_page.get_logger()

    doc = etree.parse(StringIO(page), etree.HTMLParser())

    records = []
    for cell in doc.xpath('//td[@class="summary_data"]'):
        rec = {
            'title': perform(cell.xpath('a/value//text()'), a_join, unicode),
            'source': perform(
                cell.xpath('span[contains(text(),"Source")]/following-sibling::text()')[0],
                unicode, unicode.strip),
            'authors': perform(
                cell.xpath('span[contains(text(),"Author")]/following-sibling::text()')[0],
                unicode, a_split_semicolon, m_trim),
            'publish_date': perform(
                cell.xpath('span[contains(text(),"Published")]/following::text()')[1],
                lambda x: a_find(x, r'(\d{4})'), a_int, a_date),
            'times_cited': perform(
                cell.xpath('span[contains(text(),"Times Cited")]/following::text()')[1],
                a_trim, lambda s: s.replace(',', ''), a_int),
        }

        # Drop "et al" placeholders from the author list.
        rec['authors'] = [name for name in rec['authors']
                          if not name.startswith('et al')]

        # Reorder "Doe, J" style names into "J Doe" order.
        rec['authors'] = [
            ' '.join(reversed([unicode.strip(part) for part in name.split(' ')]))
            for name in rec['authors']
        ]

        records.append(rec)

    logger.warning("Got %d results for the query '%s' from isi/wok" % (len(records), qobj.query))

    if not callback:
        return records
    return subtask(callback).delay(records=records, qobj=qobj)
Ejemplo n.º 2
0
def parse_wok_page(page, qobj, callback=None):
    """Extract publication records from an ISI/Web of Knowledge page.

    Scrapes each ``td.summary_data`` cell into a dict (title, source,
    authors, publish_date, times_cited).  With *callback* set, the records
    are dispatched to that subtask instead of being returned directly.
    """
    logger = parse_wok_page.get_logger()

    tree = etree.parse(StringIO(page), etree.HTMLParser())
    summary_cells = tree.xpath('//td[@class="summary_data"]')

    results = []
    for cell in summary_cells:
        title = perform(cell.xpath('a/value//text()'), a_join, unicode)
        source = perform(
            cell.xpath(
                'span[contains(text(),"Source")]/following-sibling::text()')[0],
            unicode, unicode.strip)
        authors = perform(
            cell.xpath(
                'span[contains(text(),"Author")]/following-sibling::text()')[0],
            unicode, a_split_semicolon, m_trim)
        published = perform(
            cell.xpath(
                'span[contains(text(),"Published")]/following::text()')[1],
            lambda t: a_find(t, r'(\d{4})'), a_int, a_date)
        cited = perform(
            cell.xpath(
                'span[contains(text(),"Times Cited")]/following::text()')[1],
            a_trim, lambda t: t.replace(',', ''), a_int)

        # Discard "et al" markers, then flip "Doe, J" into "J Doe" order.
        authors = [name for name in authors if not name.startswith('et al')]
        authors = [' '.join(reversed([unicode.strip(w) for w in name.split(' ')]))
                   for name in authors]

        results.append({
            'title': title,
            'source': source,
            'authors': authors,
            'publish_date': published,
            'times_cited': cited,
        })

    logger.warning("Got %d results for the query '%s' from isi/wok" %
                   (len(results), qobj.query))

    if callback:
        return subtask(callback).delay(records=results, qobj=qobj)
    return results
Ejemplo n.º 3
0
def parse_scholar_page(url, page, qobj, callback=None):
    """Parse a Google Scholar results page into a list of record dicts.

    Each ``div.gs_r`` result becomes a record with title, url, snippet,
    source, authors and publish_date scraped out of the markup.  When
    *callback* is given the records are dispatched to that subtask;
    otherwise ``(url, records)`` is returned.
    """
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(page), parser)

    elements = tree.xpath("//body/div[@class='gs_r']")

    # to be returned
    records = []

    for element in elements:
        record = {}
        record['title'] = perform(element.xpath('h3[@class="gs_rt"]/a//text()'), a_join, unicode)
        record['url'] = perform(element.xpath('h3[@class="gs_rt"]/a/@href'), a_join, unicode)
        record['snippet'] = perform(element.xpath('div[@class="gs_rs"]//text()'), a_join, unicode)
        # The "gs_a" line packs authors, source and year together; the
        # regexes below slice out each piece of that line.
        record['source'] = perform(element.xpath('div[@class="gs_a"]//text()'), a_join, lambda x: a_find(x, r'-\s+(.+)[,|-]\s+\d{4}'),  unicode)
        record['authors'] = perform(element.xpath('div[@class="gs_a"]//text()'), a_join, lambda x: a_find(x, r'\A(.+?)\s+-\s+'), unicode, a_split_komma, m_trim)
        record['publish_date'] = perform(element.xpath('div[@class="gs_a"]//text()'), a_join, lambda x: a_find(x, r'\s+(\d{4})\s+\-'),a_int, a_date)

        records.append(record)

    # FIX: the logger was previously fetched from parse_wok_page (copy/paste
    # leftover), mis-attributing these log records to the wrong task.
    logger = parse_scholar_page.get_logger()
    logger.warning("Got %d results for the query '%s' from scholar" % (len(records), qobj.query))

    if callback:
        return subtask(callback).delay(records, qobj)
    else:
        return url, records
Ejemplo n.º 4
0
def parse_scholar_page(url, page, qobj, callback=None):
    """Parse a Google Scholar results page into a list of record dicts.

    Each ``div.gs_r`` result becomes a record with title, url, snippet,
    source, authors and publish_date scraped out of the markup.  When
    *callback* is given the records are dispatched to that subtask;
    otherwise ``(url, records)`` is returned.
    """
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(page), parser)

    elements = tree.xpath("//body/div[@class='gs_r']")

    # to be returned
    records = []

    for element in elements:
        record = {}
        record['title'] = perform(
            element.xpath('h3[@class="gs_rt"]/a//text()'), a_join, unicode)
        record['url'] = perform(element.xpath('h3[@class="gs_rt"]/a/@href'),
                                a_join, unicode)
        record['snippet'] = perform(
            element.xpath('div[@class="gs_rs"]//text()'), a_join, unicode)
        # The "gs_a" line packs authors, source and year together; the
        # regexes below slice out each piece of that line.
        record['source'] = perform(
            element.xpath('div[@class="gs_a"]//text()'), a_join,
            lambda x: a_find(x, r'-\s+(.+)[,|-]\s+\d{4}'), unicode)
        record['authors'] = perform(
            element.xpath('div[@class="gs_a"]//text()'), a_join,
            lambda x: a_find(x, r'\A(.+?)\s+-\s+'), unicode, a_split_komma,
            m_trim)
        record['publish_date'] = perform(
            element.xpath('div[@class="gs_a"]//text()'), a_join,
            lambda x: a_find(x, r'\s+(\d{4})\s+\-'), a_int, a_date)

        records.append(record)

    # FIX: the logger was previously fetched from parse_wok_page (copy/paste
    # leftover), mis-attributing these log records to the wrong task.
    logger = parse_scholar_page.get_logger()
    logger.warning("Got %d results for the query '%s' from scholar" %
                   (len(records), qobj.query))

    if callback:
        return subtask(callback).delay(records, qobj)
    else:
        return url, records