def extract_half_infobox(sideid):
    """Yield ``(title, content)`` pairs from one ``<dl>`` of the basic-info box.

    Rows are probed one at a time: the n-th ``<dt>`` and n-th ``<dd>`` under
    the ``sideid``-th ``<dl>`` of any element with ``class="basic-info"``.
    Iteration stops at the first index where neither a ``<dt>`` nor a
    ``<dd>`` is found.

    ``root`` and the helpers ``get_inner_text_with_hrefs``, ``remove_links``
    and ``cleanup_verb`` are not defined in this block; they come from the
    surrounding scope.  NOTE(review): ``root`` is presumably an lxml element
    (it only needs an ``xpath`` method here) -- confirm against the caller.

    Args:
        sideid: 1-based index of the ``<dl>`` to read; substituted into the
            XPath with ``%d``.

    Yields:
        Tuples ``(title, content)`` where the title has been passed through
        ``cleanup_verb`` and ``remove_links`` and the content is the result
        of ``get_inner_text_with_hrefs``.

    Raises:
        ValueError: if a row index matches anything other than exactly one
            ``<dt>`` and one ``<dd>`` (and not zero of both).
    """
    title_xpath = '//*[@class="basic-info"]/dl[%d]/dt[%%d]' % sideid
    content_xpath = '//*[@class="basic-info"]/dl[%d]/dd[%%d]' % sideid

    row = 0
    while True:
        row += 1
        title_nodes = root.xpath(title_xpath % row)
        content_nodes = root.xpath(content_xpath % row)
        n_titles = len(title_nodes)
        n_contents = len(content_nodes)

        # Nothing at this index on either side: the column is exhausted.
        if n_titles == 0 and n_contents == 0:
            return

        # Any other combination than exactly one of each is unexpected.
        if not (n_titles == 1 and n_contents == 1):
            error = 'extractor2: should exit with (len(bititle)==0 and len(bicontent)==0), but '\
                    'got (len(bititle)=%d, len(bicontent)=%d)' % (n_titles, n_contents)
            raise ValueError(error)

        # A <dd> that itself contains a <dl> holds an expandable value; in
        # that case the text is read from its first descendant <dd> instead.
        has_expanded_value = len(content_nodes[0].xpath('.//dl')) > 0
        if has_expanded_value:
            content_nodes = content_nodes[0].xpath('.//dd')

        title = get_inner_text_with_hrefs(title_nodes[0])
        content = get_inner_text_with_hrefs(content_nodes[0])

        if has_expanded_value:
            # Drop the trailing two-character toggle label ("collapse")
            # that is part of the expanded markup's text.
            content = content.strip()
            if content[-2:] == u'收起':
                content = content[:-2]

        yield remove_links(cleanup_verb(title)), content
def extract_half_infobox(sideid):
    """Yield ``(title, content)`` pairs from one ``<dl>`` of the basic-info box.

    Rows are probed one at a time: the n-th ``<dt>`` and n-th ``<dd>`` under
    the ``sideid``-th ``<dl>`` of any element with ``class="basic-info"``.
    Iteration stops at the first index where neither a ``<dt>`` nor a
    ``<dd>`` is found.

    ``root`` and the helpers ``get_inner_text_with_hrefs``, ``remove_links``
    and ``cleanup_verb`` are not defined in this block; they come from the
    surrounding scope.  NOTE(review): ``root`` is presumably an lxml element
    (it only needs an ``xpath`` method here) -- confirm against the caller.

    Args:
        sideid: 1-based index of the ``<dl>`` to read; substituted into the
            XPath with ``%d``.

    Yields:
        Tuples ``(title, content)`` where the title has been passed through
        ``cleanup_verb`` and ``remove_links`` and the content is the result
        of ``get_inner_text_with_hrefs``.

    Raises:
        ValueError: if a row index matches anything other than exactly one
            ``<dt>`` and one ``<dd>`` (and not zero of both).
    """
    title_xpath = '//*[@class="basic-info"]/dl[%d]/dt[%%d]' % sideid
    content_xpath = '//*[@class="basic-info"]/dl[%d]/dd[%%d]' % sideid

    row = 0
    while True:
        row += 1
        title_nodes = root.xpath(title_xpath % row)
        content_nodes = root.xpath(content_xpath % row)
        n_titles = len(title_nodes)
        n_contents = len(content_nodes)

        # Nothing at this index on either side: the column is exhausted.
        if n_titles == 0 and n_contents == 0:
            return

        # Any other combination than exactly one of each is unexpected.
        if not (n_titles == 1 and n_contents == 1):
            error = 'extractor2: should exit with (len(bititle)==0 and len(bicontent)==0), but '\
                    'got (len(bititle)=%d, len(bicontent)=%d)' % (n_titles, n_contents)
            raise ValueError(error)

        # A <dd> that itself contains a <dl> holds an expandable value; in
        # that case the text is read from its first descendant <dd> instead.
        has_expanded_value = len(content_nodes[0].xpath('.//dl')) > 0
        if has_expanded_value:
            content_nodes = content_nodes[0].xpath('.//dd')

        title = get_inner_text_with_hrefs(title_nodes[0])
        content = get_inner_text_with_hrefs(content_nodes[0])

        if has_expanded_value:
            # Drop the trailing two-character toggle label ("collapse")
            # that is part of the expanded markup's text.
            content = content.strip()
            if content[-2:] == u'收起':
                content = content[:-2]

        yield remove_links(cleanup_verb(title)), content
def extract_half_infobox(sideid):
    """Yield ``(title, content)`` pairs from one column of ``baseInfoWrapDom``.

    Rows are probed one at a time under the ``sideid``-th ``<div>`` child of
    the element with ``id="baseInfoWrapDom"``: the title is the ``div/span``
    of the n-th row and the content is its ``div/div``.  Iteration stops at
    the first row index where neither is found.

    When a row has one title but several content ``<div>`` matches, the
    content is re-queried with a stricter path
    (``biOpenItem`` / ``biOpenItemCon`` / ``biOpenContent``), which must
    match exactly one element.

    ``root`` and the helpers ``get_inner_text_with_hrefs``, ``remove_links``
    and ``cleanup_verb`` are not defined in this block; they come from the
    surrounding scope.  NOTE(review): ``root`` is presumably an lxml element
    (it only needs an ``xpath`` method here) -- confirm against the caller.

    Args:
        sideid: 1-based index of the column ``<div>``; substituted into the
            XPath with ``%d``.

    Yields:
        Tuples ``(title, content)`` where the title has been passed through
        ``cleanup_verb`` and ``remove_links`` and the content is the result
        of ``get_inner_text_with_hrefs``.

    Raises:
        ValueError: if the multi-line fallback does not match exactly one
            element, or if the title/content counts for a row are any other
            unexpected combination.
    """
    bititle_xpath_template = '//*[@id="baseInfoWrapDom"]/div[%d]/div[%%d]/div/span' % sideid
    bicontent_xpath_template = '//*[@id="baseInfoWrapDom"]/div[%d]/div[%%d]/div/div' % sideid
    bicontent_multiline_xpath_template = (
        '//*[@id="baseInfoWrapDom"]/div[%d]/div[%%d]/div[@class="biOpenItem"]/div[@class="biOpenItemCon"]/div[@class="biOpenContent"]'
        % sideid
    )

    counter = 1
    while True:
        bititle_elements = root.xpath(bititle_xpath_template % counter)
        bicontent_elements = root.xpath(bicontent_xpath_template % counter)
        title_count = len(bititle_elements)
        content_count = len(bicontent_elements)

        # Nothing at this index on either side: the column is exhausted.
        if title_count == 0 and content_count == 0:
            break

        if title_count == 1 and content_count > 1:
            # Multi-line value: the generic div/div path matched several
            # nodes, so narrow down to the single "biOpenContent" element.
            bicontent_elements = root.xpath(bicontent_multiline_xpath_template % counter)
            if len(bicontent_elements) != 1:
                # The original message was built from 'but' + 'got' with no
                # separating space and rendered as "butgot"; fixed here.
                error = 'extractor1: should find len(bicontent_elements_multiline)==1, but '\
                        'got %d' % len(bicontent_elements)
                raise ValueError(error)
        elif title_count != 1 or content_count != 1:
            # Any remaining combination is unexpected page structure.
            error = 'extractor1: should exit with (len(bititle)==0 and len(bicontent)==0), but '\
                    'got (len(bititle)=%d, len(bicontent)=%d)' % (title_count, content_count)
            raise ValueError(error)

        bititle = get_inner_text_with_hrefs(bititle_elements[0])
        bicontent = get_inner_text_with_hrefs(bicontent_elements[0])
        yield remove_links(cleanup_verb(bititle)), bicontent
        counter += 1
def extract_half_infobox(sideid):
    """Yield ``(title, content)`` pairs from one column of ``baseInfoWrapDom``.

    Rows are probed one at a time under the ``sideid``-th ``<div>`` child of
    the element with ``id="baseInfoWrapDom"``: the title is the ``div/span``
    of the n-th row and the content is its ``div/div``.  Iteration stops at
    the first row index where neither is found.

    When a row has one title but several content ``<div>`` matches, the
    content is re-queried with a stricter path
    (``biOpenItem`` / ``biOpenItemCon`` / ``biOpenContent``), which must
    match exactly one element.

    ``root`` and the helpers ``get_inner_text_with_hrefs``, ``remove_links``
    and ``cleanup_verb`` are not defined in this block; they come from the
    surrounding scope.  NOTE(review): ``root`` is presumably an lxml element
    (it only needs an ``xpath`` method here) -- confirm against the caller.

    Args:
        sideid: 1-based index of the column ``<div>``; substituted into the
            XPath with ``%d``.

    Yields:
        Tuples ``(title, content)`` where the title has been passed through
        ``cleanup_verb`` and ``remove_links`` and the content is the result
        of ``get_inner_text_with_hrefs``.

    Raises:
        ValueError: if the multi-line fallback does not match exactly one
            element, or if the title/content counts for a row are any other
            unexpected combination.
    """
    bititle_xpath_template = '//*[@id="baseInfoWrapDom"]/div[%d]/div[%%d]/div/span' % sideid
    bicontent_xpath_template = '//*[@id="baseInfoWrapDom"]/div[%d]/div[%%d]/div/div' % sideid
    bicontent_multiline_xpath_template = (
        '//*[@id="baseInfoWrapDom"]/div[%d]/div[%%d]/div[@class="biOpenItem"]/div[@class="biOpenItemCon"]/div[@class="biOpenContent"]'
        % sideid
    )

    counter = 1
    while True:
        bititle_elements = root.xpath(bititle_xpath_template % counter)
        bicontent_elements = root.xpath(bicontent_xpath_template % counter)
        title_count = len(bititle_elements)
        content_count = len(bicontent_elements)

        # Nothing at this index on either side: the column is exhausted.
        if title_count == 0 and content_count == 0:
            break

        if title_count == 1 and content_count > 1:
            # Multi-line value: the generic div/div path matched several
            # nodes, so narrow down to the single "biOpenContent" element.
            bicontent_elements = root.xpath(bicontent_multiline_xpath_template % counter)
            if len(bicontent_elements) != 1:
                # The original message was built from 'but' + 'got' with no
                # separating space and rendered as "butgot"; fixed here.
                error = 'extractor1: should find len(bicontent_elements_multiline)==1, but '\
                        'got %d' % len(bicontent_elements)
                raise ValueError(error)
        elif title_count != 1 or content_count != 1:
            # Any remaining combination is unexpected page structure.
            error = 'extractor1: should exit with (len(bititle)==0 and len(bicontent)==0), but '\
                    'got (len(bititle)=%d, len(bicontent)=%d)' % (title_count, content_count)
            raise ValueError(error)

        bititle = get_inner_text_with_hrefs(bititle_elements[0])
        bicontent = get_inner_text_with_hrefs(bicontent_elements[0])
        yield remove_links(cleanup_verb(bititle)), bicontent
        counter += 1