def parse_url(self, response):
    """Yield one ``UrlextractItem`` for a familydoctor page.

    Responses whose URL is 100 characters or longer are ignored. For the
    rest, the URL is searched for ``www.familydoctor.com.cn/`` followed by
    at least one more ``/``; the match (from ``www.`` to the end of the
    URL) is stored in ``item['url']`` and ``item['site']`` is set to
    ``'familydoctor'``.

    URLs that do not match the pattern are skipped instead of raising
    ``AttributeError`` from calling ``.group()`` on a missing match.
    """
    if len(response.url) >= 100:
        return

    # Raw string: ``\.`` and ``\/`` are regex escapes, not string escapes.
    match = re.search(r'www\.familydoctor\.com\.cn\/.*?\/.*', response.url)
    if match is None:
        # Nothing recognisable to record for this response.
        return

    item = UrlextractItem()
    item['site'] = 'familydoctor'
    item['url'] = match.group()
    yield item
def parse_url(self, response):
    """Yield fx120 items for the page URL and for its leading prefix.

    Responses whose URL is 100 characters or longer produce nothing.
    Otherwise the full URL is yielded first as an item with
    ``site == 'fx120'``. A second item is then yielded with the first
    prefix of the URL matched by one of two patterns, tried in order;
    when neither pattern matches, no second item is produced.
    """
    page_url = response.url
    if len(page_url) >= 100:
        return

    page_item = UrlextractItem()
    page_item['site'] = 'fx120'
    page_item['url'] = page_url
    yield page_item

    # NOTE(review): the prefix extraction is kept under the length guard
    # above - confirm this nesting is the intended one.
    found = re.search('http.*?www.*?fx120.*?/.*?/', page_url)
    if found is None:
        # Fall back to the shorter pattern when the first does not match.
        found = re.search('.*?fx120.net/', page_url)
    if found is None:
        return

    prefix_item = UrlextractItem()
    prefix_item['site'] = 'fx120'
    prefix_item['url'] = found.group()
    yield prefix_item
def parse_url(self, response):
    """Yield one ``UrlextractItem`` holding the haodf host prefix of the URL.

    Responses whose URL is 100 characters or longer are ignored. For the
    rest, the part of the URL from its start through ``.haodf.com/`` is
    stored in ``item['url']`` and ``item['site']`` is set to ``'haodf'``.
    URLs that do not contain ``.haodf.com/`` yield nothing.
    """
    if len(response.url) >= 100:
        return

    # Raw string so that ``\.`` reaches the regex engine as an escaped dot
    # without relying on an invalid string escape sequence.
    match = re.search(r'.*?\.haodf\.com/', response.url)
    if match is None:
        # Explicit check instead of catching AttributeError from .group().
        return

    item = UrlextractItem()
    item['site'] = 'haodf'
    item['url'] = match.group()
    yield item
def parse_url(self, response):
    """Yield one ``UrlextractItem`` holding the health.sina section prefix.

    Responses whose URL is 100 characters or longer are ignored. For the
    rest, the part of the URL from its start through
    ``health.sina.com.cn/`` and the next ``/`` is stored in
    ``item['url']`` and ``item['site']`` is set to ``'healthSina'``.
    URLs that do not match the pattern yield nothing.
    """
    if len(response.url) >= 100:
        return

    # Raw string so that ``\.`` reaches the regex engine as an escaped dot
    # without relying on an invalid string escape sequence.
    match = re.search(r'.*?health\.sina\.com\.cn/.*?/', response.url)
    if match is None:
        # Explicit check instead of catching AttributeError from .group().
        return

    item = UrlextractItem()
    item['site'] = 'healthSina'
    item['url'] = match.group()
    yield item
def parse_url(self, response):
    """Yield one ``UrlextractItem`` carrying the response URL unchanged.

    Responses whose URL is 100 characters or longer produce nothing;
    otherwise a single item is yielded with ``site`` set to ``'net39'``
    and ``url`` set to the full response URL.
    """
    if len(response.url) >= 100:
        return

    record = UrlextractItem()
    record['site'] = 'net39'
    record['url'] = response.url
    yield record