def parse(self, response):
    titles_raw = response.xpath(
        '//*[@id="main_content"]/div[2]/ul/li/dl/dt/a/text()').extract()
    writers = response.css('.writing::text').extract()
    previews = response.css('.lede::text').extract()

    # Keep only non-empty, stripped titles
    titles = []
    for title in titles_raw:
        if title.strip():
            titles.append(title.strip())

    # Build one item per article and route it to the list for its page
    for idx in range(len(titles)):
        item = MyscraperItem()
        item['title'] = titles[idx]
        item['writer'] = writers[idx]
        item['preview'] = previews[idx]
        if response.url == "https://news.naver.com/main/list.nhn?mode=LS2D&sid2=263&sid1=101&mid=shm&page=1":
            list1.append(item)
        elif response.url == "https://news.naver.com/main/list.nhn?mode=LS2D&sid2=263&sid1=101&mid=shm&page=2":
            list2.append(item)
        elif response.url == "https://news.naver.com/main/list.nhn?mode=LS2D&sid2=263&sid1=101&mid=shm&page=3":
            list3.append(item)
        elif response.url == "https://news.naver.com/main/list.nhn?mode=LS2D&sid2=263&sid1=101&mid=shm&page=4":
            list4.append(item)
        elif response.url == "https://news.naver.com/main/list.nhn?mode=LS2D&sid2=263&sid1=101&mid=shm&page=5":
            list5.append(item)

    # Once all five pages have been collected, return the items in page order
    items = []
    for page_list in [list1, list2, list3, list4, list5]:
        for i in page_list:
            items.append(i)
    if len(items) == 100:
        return items
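# Note (editor): the five-way if/elif on response.url above can be collapsed by keying
# on the "page" query parameter. This is a minimal sketch, not the original spider's
# code; page_buckets and bucket_for are hypothetical names standing in for list1..list5.
from urllib.parse import urlparse, parse_qs

page_buckets = {1: [], 2: [], 3: [], 4: [], 5: []}  # hypothetical replacement for list1..list5

def bucket_for(url):
    # Extract the "page" query parameter, e.g. "...&page=3" -> 3
    page = int(parse_qs(urlparse(url).query).get('page', ['0'])[0])
    return page_buckets.get(page)

# Inside parse(), each item could then be routed with a single lookup:
#     bucket = bucket_for(response.url)
#     if bucket is not None:
#         bucket.append(item)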
def parse_items(self, response):
    # The list of items that are found on the particular page
    items = []
    # Only extract canonicalized and unique links (with respect to the current page)
    links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
    # Now go through all the found links
    for link in links:
        # Check whether the domain of the link URL is one of the allowed domains
        is_allowed = False
        for allowed_domain in self.allowed_domains:
            if allowed_domain in link.url:
                is_allowed = True
        # If it is allowed, create a new item and add it to the list of found items
        if is_allowed:
            item = MyscraperItem()
            item['link'] = link.url
            items.append(item)
        # Route the link to the CSV file for its news category
        patterns = [
            "kalerkantho.com/online/national/",
            "kalerkantho.com/online/Politics/",
            "kalerkantho.com/online/Court/",
            "kalerkantho.com/online/world/",
            "kalerkantho.com/online/business/",
            "kalerkantho.com/online/sahitya/",
            "kalerkantho.com/online/sport/",
            "kalerkantho.com/online/entertainment/",
            "kalerkantho.com/online/info-tech/",
            "kalerkantho.com/online/prescription/"
        ]
        file = None
        if patterns[0] in link.url:
            file = open('../../data/national.csv', 'a')
        if patterns[1] in link.url:
            file = open('../../data/politics.csv', 'a')
        if patterns[2] in link.url:
            file = open('../../data/court.csv', 'a')
        if patterns[3] in link.url:
            file = open('../../data/world.csv', 'a')
        if patterns[4] in link.url:
            file = open('../../data/business.csv', 'a')
        if patterns[5] in link.url:
            file = open('../../data/literature.csv', 'a')
        if patterns[6] in link.url:
            file = open('../../data/sports.csv', 'a')
        if patterns[7] in link.url:
            file = open('../../data/entertainment.csv', 'a')
        if patterns[8] in link.url:
            file = open('../../data/tech.csv', 'a')
        if patterns[9] in link.url:
            file = open('../../data/medical.csv', 'a')
        if file is not None:
            file.write(urlShortener(link.url) + "\n")
            file.close()
    # Return all the found items
    return items
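# Note (editor): the pattern/file ladder above can be expressed as a single mapping.
# This is an illustrative sketch, not part of the original spider; CATEGORY_FILES and
# write_link are hypothetical names. urlShortener is the helper already used above.
CATEGORY_FILES = {
    "kalerkantho.com/online/national/": '../../data/national.csv',
    "kalerkantho.com/online/Politics/": '../../data/politics.csv',
    "kalerkantho.com/online/Court/": '../../data/court.csv',
    "kalerkantho.com/online/world/": '../../data/world.csv',
    "kalerkantho.com/online/business/": '../../data/business.csv',
    "kalerkantho.com/online/sahitya/": '../../data/literature.csv',
    "kalerkantho.com/online/sport/": '../../data/sports.csv',
    "kalerkantho.com/online/entertainment/": '../../data/entertainment.csv',
    "kalerkantho.com/online/info-tech/": '../../data/tech.csv',
    "kalerkantho.com/online/prescription/": '../../data/medical.csv',
}

def write_link(url):
    # Append the shortened URL to the CSV of the first matching category, if any
    for pattern, path in CATEGORY_FILES.items():
        if pattern in url:
            with open(path, 'a') as f:
                f.write(urlShortener(url) + "\n")
            break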
def parse(self, response):
    # dt[2] is a fixed position in the markup, so the [2] index is kept in the XPath
    titles = response.xpath('//*[@id="main_content"]/div[2]/ul/li/dl/dt[2]/a/text()').extract()
    authors = response.css('.writing::text').extract()
    previews = response.css('.lede::text').extract()

    items = []
    for idx in range(len(titles)):
        item = MyscraperItem()
        item['title'] = titles[idx]
        item['author'] = authors[idx]
        item['preview'] = previews[idx]
        items.append(item)
    return items
def parse_items(self, response):
    # The list of items that are found on the particular page
    items = []
    # Only extract canonicalized and unique links (with respect to the current page)
    links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
    # Now go through all the found links
    for link in links:
        # Check whether the domain of the link URL is one of the allowed domains
        is_allowed = False
        for allowed_domain in self.allowed_domains:
            if allowed_domain in link.url:
                is_allowed = True
        # If it is allowed, create a new item and add it to the list of found items
        if is_allowed:
            item = MyscraperItem()
            item['link'] = link.url
            items.append(item)
        # Route the link to the CSV file for its news category
        patterns = [
            "banglatribune.com/sport/news/",
            "banglatribune.com/business/news/",
            "banglatribune.com/entertainment/news/",
            "banglatribune.com/country/news/",
            "banglatribune.com/foreign/news/",
            "banglatribune.com/tech-and-gadget/news/",
            "banglatribune.com/literature/news/"
        ]
        file = None
        if patterns[0] in link.url:
            file = open('../../data/sports.csv', 'a')
        if patterns[1] in link.url:
            file = open('../../data/economy.csv', 'a')
        if patterns[2] in link.url:
            file = open('../../data/entertainment.csv', 'a')
        if patterns[3] in link.url:
            file = open('../../data/bangladesh.csv', 'a')
        if patterns[4] in link.url:
            file = open('../../data/international.csv', 'a')
        if patterns[5] in link.url:
            file = open('../../data/technology.csv', 'a')
        if patterns[6] in link.url:
            file = open('../../data/literature.csv', 'a')
        if file is not None:
            file.write(urlShortener(link.url) + "\n")
            file.close()
    # Return all the found items
    return items
def parse(self, response):
    titles = response.xpath('//*[@id="main_content"]/div[2]/ul/li/dl/dt[2]/a/text()').extract()
    writers = response.css('.writing::text').extract()
    previews = response.css('.lede::text').extract()
    #zip(titles, writers, previews)

    # Store the data extracted via the XPath and CSS selectors in items
    items = []
    for idx in range(len(titles)):
        item = MyscraperItem()
        item['title'] = titles[idx]
        item['writer'] = writers[idx]
        item['preview'] = previews[idx]
        items.append(item)
    return items
def parse_items(self, response):
    # The list of items that are found on the particular page
    items = []
    # Only extract canonicalized and unique links (with respect to the current page)
    links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
    # Now go through all the found links
    for link in links:
        # Check whether the domain of the URL of the link is allowed; so whether it is in one of the allowed domains
        is_allowed = False
        for allowed_domain in self.allowed_domains:
            if allowed_domain in link.url:
                is_allowed = True
        # If it is allowed, create a new item and add it to the list of found items
        if is_allowed:
            item = MyscraperItem()
            item['url_from'] = response.url
            item['url_to'] = link.url
            items.append(item)
    # Return all the found items
    return items
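# Note (editor): for context, a parse_items callback like the one above is normally
# driven by a CrawlSpider rule that feeds it every fetched page. Minimal sketch only;
# the spider name, domain, and start URL below are placeholders, not from the project.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class LinkGraphSpider(CrawlSpider):
    name = 'linkgraph'                     # hypothetical
    allowed_domains = ['example.com']      # hypothetical
    start_urls = ['https://example.com/']  # hypothetical

    # Follow every extracted link and hand each response to parse_items
    rules = (
        Rule(LinkExtractor(canonicalize=True, unique=True),
             callback='parse_items', follow=True),
    )

    # parse_items (as defined above) would be a method of this class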