import json
from os.path import join as path_join  # assumed alias for os.path.join

import scrapy
from scrapy.spiders import CrawlSpider


class MySpider(CrawlSpider):
    input_dir = '../input'
    encoding = 'utf8'

    # the passing file holds the key of the site to scrape
    passing_file = path_join(input_dir, 'site_to_scrap.txt')
    with open(passing_file, 'r', encoding=encoding) as f:
        input_file = f.readlines()[0].replace('\n', '')

    # load the per-site configuration written for that key
    with open(path_join(input_dir, input_file + '.json'), 'r') as json_file:
        data = json.load(json_file)

    name = data['name']
    allowed_domains = data['allowed_domains']
    start_urls = data['start_urls']

    def start_requests(self):
        """Fetch a response from each of the original start urls."""
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse_page)

    def parse_page(self, response):
        for a in response.css('a'):
            yield {
                'span': a.css('a::text').get(),
            }
        for small in response.css('small'):
            yield {
                'p': small.css('small::text').get(),
            }
        for div in response.xpath('//div[@class = "media-body"]'):
            yield {
                'div': div.css('div::text').extract()[1],
            }

        print("URL: " + response.url)  # print visited urls to console

        # write visited urls into a file
        output_dir = '../output'
        file = 'visited_urls.txt'
        with open(path_join(output_dir, file), 'a') as outfile:
            print(response.url, file=outfile)
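# For reference, a minimal sketch of the input files the spider above expects.
# The site key 'example_site' and the field values are hypothetical; only the
# file locations and field names follow the code above.
import json
import os
from os.path import join as path_join

input_dir = '../input'
os.makedirs(input_dir, exist_ok=True)

# '../input/site_to_scrap.txt' contains the key of the site to scrape
with open(path_join(input_dir, 'site_to_scrap.txt'), 'w') as f:
    f.write('example_site\n')

# '../input/example_site.json' contains the per-site spider configuration
config = {
    'name': 'example_site',
    'allowed_domains': ['example.com'],
    'start_urls': ['https://example.com/'],
}
with open(path_join(input_dir, 'example_site.json'), 'w') as json_file:
    json.dump(config, json_file, indent=2)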
def compare_audiences(self):
    """
    Compare if audiences have similar content.

    Returns:
        joint_keywords for two audiences
        joint_ratio
    """
    project = 'renault'
    _, output_dir = dir_from_project(project)
    selected_audiences = 'Car enthusiasts'
    input_dir = '../kwords_db'
    sys.path.append(input_dir)

    # collect the name of every scraped sub-site tagged with the audience
    sub_site = []
    for f in os.listdir(input_dir):
        if os.path.isfile(os.path.join(input_dir, f)) and 'json' in f:
            with open(path_join(input_dir, f), 'r') as json_file:
                data = json.load(json_file)
            if data['audience'] == selected_audiences:
                name = data['name']
                if name not in sub_site:
                    sub_site.append(name)
    print(sub_site)

    percentage_key = 0.25
    col_min = 1
    finder = Keyword_finder(sub_site, output_dir, n_keyword=n_keyword,
                            useless_w_list=useless_w_list)
    all_kwords = finder.all_keyword()
    joint_kwords, support_dict = finder.common_keyword(col_min, percentage_key)
    joint_ratio = round(len(joint_kwords) / len(all_kwords) * 100, 2)

    print('joint =', sorted(joint_kwords))
    # printing the ratio to judge the model configuration
    print('joint_ratio:', joint_ratio, '%')
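# Illustration only: a simplified, hypothetical stand-in for the joint-keyword
# logic above, using plain sets. Keyword_finder.common_keyword() additionally
# applies the col_min / percentage_key thresholds, which are omitted here.
from functools import reduce


def joint_keywords_sketch(keywords_per_site):
    """keywords_per_site: one keyword list per sub-site."""
    sets = [set(kw) for kw in keywords_per_site]
    all_kwords = set().union(*sets)
    joint_kwords = reduce(set.intersection, sets)
    joint_ratio = round(len(joint_kwords) / len(all_kwords) * 100, 2)
    return sorted(joint_kwords), joint_ratio


# two made-up sub-sites sharing one keyword out of three distinct ones
print(joint_keywords_sketch([['engine', 'design'], ['engine', 'price']]))
# -> (['engine'], 33.33)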
def top_keyword_per_audience(self):
    """
    Returns:
        top keywords per given audience sorted by number of appearances
    """
    selected_audience = 'Car enthusiasts'
    input_dir = '../kwords_db'

    # gather the joint keywords of every sub-site tagged with the audience
    all_key = []
    for f in os.listdir(input_dir):
        if os.path.isfile(os.path.join(input_dir, f)) and 'json' in f:
            with open(path_join(input_dir, f), 'r') as json_file:
                data = json.load(json_file)
            if 'joint_kwords' in data and \
                    data['audience'] == selected_audience:
                all_key = sorted(all_key + list(data['joint_kwords']))

    counts = Counter(all_key)
    print('Top %s keywords for %s with number of appearances:'
          % (len(all_key), selected_audience), '\n')
    for value, count in counts.most_common():
        print(value, count)
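# Quick illustration of the Counter ranking used above, with made-up keywords;
# most_common() yields (keyword, appearances) pairs in descending order.
from collections import Counter

all_key = ['engine', 'design', 'engine', 'price', 'engine', 'design']
for value, count in Counter(all_key).most_common():
    print(value, count)
# engine 3
# design 2
# price 1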
import json
from os.path import join as path_join  # assumed alias for os.path.join

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# dir_from_project is a project-specific helper, assumed to be imported from
# the project's own utilities elsewhere in this module.


class MySpider(CrawlSpider):
    # print('USING ITERATIVE')
    input_dir, _ = dir_from_project('iterative')
    encoding = 'utf8'

    # read the key of the site to scrape from the passing file
    passing_file = path_join('../input', 'site_to_scrap.txt')
    with open(passing_file, 'r', encoding=encoding) as f:
        input_file = f.readlines()[0].replace('\n', '')

    # load the per-site configuration written for that key
    with open(path_join(input_dir, input_file + '.json'), 'r') as json_file:
        data = json.load(json_file)

    name = data['name']
    allowed_domains = data['allowed_domains']
    start_urls = data['start_urls']

    rules = (Rule(LinkExtractor(allow=data['allow'], deny=data['deny']),
                  callback='parse_page', follow=True), )

    def parse_page(self, response):
        # following sections are combined in 'p' output:
        # p, b, ul, li, strong
        for p in response.css('p'):
            yield {
                'p': p.css('p::text').get(),
            }
        for b in response.css('b'):
            yield {
                'p': b.css('b::text').get(),
            }
        for ul in response.css('ul'):
            yield {
                'p': ul.css('ul::text').get(),
            }
        for li in response.css('li'):
            yield {
                'p': li.css('li::text').get(),
            }
        for strong in response.css('strong'):
            yield {
                'p': strong.css('strong::text').get(),
            }

        for span in response.css('span'):
            yield {
                'span': span.css('span::text').get(),
            }
        for title in response.css('div.title'):
            yield {
                'header': title.css('div.title::text').get(),
            }

        # following sections are combined in 'header' output:
        # head, h1, h2, h3, h4, h5, h6
        for head in response.css('head'):
            yield {
                'header': head.css('head::text').get(),
            }
        for h1 in response.css('h1'):
            yield {
                'header': h1.css('h1::text').get(),
            }
        for h2 in response.css('h2'):
            yield {
                'header': h2.css('h2::text').get(),
            }
        for h3 in response.css('h3'):
            yield {
                'header': h3.css('h3::text').get(),
            }
        for h4 in response.css('h4'):
            yield {
                'header': h4.css('h4::text').get(),
            }
        for h5 in response.css('h5'):
            yield {
                'header': h5.css('h5::text').get(),
            }
        for h6 in response.css('h6'):
            yield {
                'header': h6.css('h6::text').get(),
            }

        # write visited urls into a file
        file = 'visited_urls.txt'
        with open(path_join('../output/%s' % MySpider.input_file, file),
                  'a') as outfile:
            print(response.url, file=outfile)

    # runs once when the class is defined, not per scraped page
    print('scrape for site: %s - in progress' % input_file)
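# One possible way to drive the spider above outside the `scrapy crawl` CLI,
# as a hedged sketch: the FEEDS path simply mirrors the '../output/<site>'
# convention used in parse_page and is an assumption, not project config.
from scrapy.crawler import CrawlerProcess

if __name__ == '__main__':
    process = CrawlerProcess(settings={
        'FEEDS': {
            '../output/%s/items.json' % MySpider.input_file: {
                'format': 'json',
            },
        },
    })
    process.crawl(MySpider)
    process.start()  # blocks until the crawl finishes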