Example #1
import json

import scrapy
from os.path import join as path_join  # assumed: path_join is an alias for os.path.join
from scrapy.spiders import CrawlSpider


class MySpider(CrawlSpider):
    input_dir = '../input'
    encoding = 'utf8'
    passing_file = path_join(input_dir, 'site_to_scrap.txt')
    # the passing file holds the name of the site to scrape on its first line
    with open(passing_file, 'r', encoding=encoding) as f:
        input_file = f.readline().rstrip('\n')

    # load the per-site spider configuration
    with open(path_join(input_dir, input_file + '.json'), 'r') as json_file:
        data = json.load(json_file)
    name = data['name']
    allowed_domains = data['allowed_domains']
    start_urls = data['start_urls']

    def start_requests(self):
        '''
        Issue an initial request for each of the configured start URLs.
        '''
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse_page)

    def parse_page(self, response):
        for a in response.css('a'):
            yield {
                'span': a.css('a::text').get(),
            }
        for small in response.css('small'):
            yield {
                'p': small.css('small::text').get(),
            }
        for div in response.xpath('//div[@class = "media-body"]'):
            yield {
                # keep the second text node inside the media-body div
                'div': div.css('div::text').getall()[1],
            }

        print("URL: " + response.url)  # print visited urls to console

        # write visited urls into a file
        output_dir = '../output'
        file = 'visited_urls.txt'
        with open(path_join(output_dir, file), 'a') as outfile:
            print(response.url, file=outfile)
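The class body above reads two files at import time: site_to_scrap.txt, whose first line names the site, and a matching <site>.json configuration. The exact schema is not shown in these examples; the sketch below is only an assumption based on the keys the spiders access (name, allowed_domains, start_urls, plus allow/deny used by the rule-based variant in Example #6), with placeholder values.

# Hypothetical helper that writes the two input files MySpider expects.
# File names and keys come from the examples; all values are placeholders.
import json
from os.path import join as path_join


def write_spider_input(input_dir='../input', site='example_site'):
    # the first (and only) line of site_to_scrap.txt selects the JSON config to load
    with open(path_join(input_dir, 'site_to_scrap.txt'), 'w', encoding='utf8') as f:
        f.write(site + '\n')

    config = {
        'name': site,
        'allowed_domains': ['example.com'],
        'start_urls': ['https://example.com/'],
        'allow': [],   # LinkExtractor allow patterns (Example #6)
        'deny': [],    # LinkExtractor deny patterns (Example #6)
    }
    with open(path_join(input_dir, site + '.json'), 'w', encoding='utf8') as f:
        json.dump(config, f, indent=2)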
Example #2
    def parse_page(self, response):
        for a in response.css('a'):
            yield {
                'span': a.css('a::text').get(),
            }
        for small in response.css('small'):
            yield {
                'p': small.css('small::text').get(),
            }
        for div in response.xpath('//div[@class = "media-body"]'):
            yield {
                # keep the second text node inside the media-body div
                'div': div.css('div::text').getall()[1],
            }

        print("URL: " + response.url)  # print visited urls to console

        # write visited urls into a file
        output_dir = '../output'
        file = 'visited_urls.txt'
        with open(path_join(output_dir, file), 'a') as outfile:
            print(response.url, file=outfile)
Example #3
    def compare_audiences(self):
        """
        Compare if audiences have similar content.

        Returns :

        joint_keywords for two audiences
        joint_ratio

        """
        project = 'renault'
        _, output_dir = dir_from_project(project)
        selected_audiences = 'Car enthusiasts'
        input_dir = '../kwords_db'
        sys.path.append(input_dir)
        sub_site = []
        for f in os.listdir(input_dir):
            if os.path.isfile(os.path.join(input_dir, f)) and 'json' in f:
                with open(path_join(input_dir, f), 'r') as json_file:
                    data = json.load(json_file)
                    if data['audience'] == selected_audiences:
                        # collect each matching sub-site name once
                        name = data['name']
                        if name not in sub_site:
                            sub_site.append(name)

        print(sub_site)
        percentage_key = 0.25
        col_min = 1

        # Keyword_finder, n_keyword and useless_w_list are project-specific and
        # assumed to be defined elsewhere in the module
        finder = Keyword_finder(sub_site, output_dir, n_keyword=n_keyword,
                                useless_w_list=useless_w_list)

        all_kwords = finder.all_keyword()
        joint_kwords, support_dict = finder.common_keyword(col_min, percentage_key)
        joint_ratio = (round(len(joint_kwords) / len(all_kwords) * 100, 2))
        print('joint =', sorted(joint_kwords))
        # printing the ratio to judge the model configuration
        print('joint_ratio:', joint_ratio, '%')
Example #4
    def top_keyword_per_audience(self):
        """
        Returns :

        top keywords per given audience sorted by number of appearances
        """
        selected_audience = 'Car enthusiasts'
        input_dir = '../kwords_db'
        all_key = []
        for f in os.listdir(input_dir):
            if os.path.isfile(os.path.join(input_dir, f)) and 'json' in f:
                with open(path_join(input_dir, f), 'r') as json_file:
                    data = json.load(json_file)
                    for key in data:
                        if key == 'joint_kwords' and \
                                data['audience'] == selected_audience:
                            site_kw = list(data[key])
                            all_key = sorted(all_key + site_kw)

        # Counter comes from the standard library's collections module
        keyword_counts = Counter(all_key)
        print('Top %s keywords for %s with number of appearances:'
              % (len(keyword_counts), selected_audience), '\n')
        for keyword, n_appearances in keyword_counts.most_common():
            print(keyword, n_appearances)
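Examples #3 and #4 both iterate over the JSON files in ../kwords_db and only read the keys name, audience, and joint_kwords. The record below merely illustrates that assumed structure; the field values are placeholders, not data from the project.

# Hypothetical record in a ../kwords_db/<site>.json file, inferred from the
# keys accessed in Examples #3 and #4; values are placeholders.
example_record = {
    'name': 'example_site',        # sub-site name collected in Example #3
    'audience': 'Car enthusiasts',  # audience filter used in both examples
    'joint_kwords': ['electric', 'suv', 'hybrid'],  # keywords counted in Example #4
}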
Example #5
    def parse_page(self, response):
        # the following tags are all combined under the 'p' output key:
        # p, b, ul, li, strong
        for p in response.css('p'):
            yield {
                'p': p.css('p::text').get(),
            }
        for b in response.css('b'):
            yield {
                'p': b.css('b::text').get(),
            }
        for ul in response.css('ul'):
            yield {
                'p': ul.css('ul::text').get(),
            }
        for li in response.css('li'):
            yield {
                'p': li.css('li::text').get(),
            }
        for strong in response.css('strong'):
            yield {
                'p': strong.css('strong::text').get(),
            }
        for span in response.css('span'):
            yield {
                'span': span.css('span::text').get(),
            }
        for title in response.css('div.title'):
            yield {
                'header': title.css('div.title::text').get(),
            }
        # the following tags (plus div.title above) are combined under the
        # 'header' output key: head, h1, h2, h3, h4, h5, h6
        for head in response.css('head'):
            yield {
                'header': head.css('head::text').get(),
            }
        for h1 in response.css('h1'):
            yield {
                'header': h1.css('h1::text').get(),
            }
        for h2 in response.css('h2'):
            yield {
                'header': h2.css('h2::text').get(),
            }
        for h3 in response.css('h3'):
            yield {
                'header': h3.css('h3::text').get(),
            }
        for h4 in response.css('h4'):
            yield {
                'header': h4.css('h4::text').get(),
            }
        for h5 in response.css('h5'):
            yield {
                'header': h5.css('h5::text').get(),
            }
        for h6 in response.css('h6'):
            yield {
                'header': h6.css('h6::text').get(),
            }

        # write visited urls into a file
        file = 'visited_urls.txt'
        output_dir = '../output/%s' % MySpider.input_file
        with open(path_join(output_dir, file), 'a') as outfile:
            print(response.url, file=outfile)
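The append-mode open above assumes the per-site output directory ../output/<input_file> already exists; if it does not, open raises FileNotFoundError. A minimal sketch of creating it up front (the placement is an assumption, not shown in these examples) could be:

import os

# ensure the per-site output directory exists before parse_page appends to it;
# the directory name mirrors the path used in Examples #5 and #6
os.makedirs('../output/%s' % MySpider.input_file, exist_ok=True)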
Example #6
import json

from os.path import join as path_join  # assumed: path_join is an alias for os.path.join
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class MySpider(CrawlSpider):
    #    print('USING ITERATIVE')
    # dir_from_project is a project-specific helper returning (input_dir, output_dir)
    input_dir, _ = dir_from_project('iterative')
    encoding = 'utf8'
    passing_file = path_join('../input', 'site_to_scrap.txt')

    # the passing file holds the name of the site to scrape on its first line
    with open(passing_file, 'r', encoding=encoding) as f:
        input_file = f.readline().rstrip('\n')

    # load the per-site configuration (name, domains, start URLs, link rules)
    with open(path_join(input_dir, input_file + '.json'), 'r') as json_file:
        data = json.load(json_file)

    name = data['name']
    allowed_domains = data['allowed_domains']
    start_urls = data['start_urls']

    rules = (Rule(LinkExtractor(allow=data['allow'], deny=data['deny']),
                  callback='parse_page',
                  follow=True), )

    def parse_page(self, response):
        # the following tags are all combined under the 'p' output key:
        # p, b, ul, li, strong
        for p in response.css('p'):
            yield {
                'p': p.css('p::text').get(),
            }
        for b in response.css('b'):
            yield {
                'p': b.css('b::text').get(),
            }
        for ul in response.css('ul'):
            yield {
                'p': ul.css('ul::text').get(),
            }
        for li in response.css('li'):
            yield {
                'p': li.css('li::text').get(),
            }
        for strong in response.css('strong'):
            yield {
                'p': strong.css('strong::text').get(),
            }
        for span in response.css('span'):
            yield {
                'span': span.css('span::text').get(),
            }
        for title in response.css('div.title'):
            yield {
                'header': title.css('div.title::text').get(),
            }
        # the following tags (plus div.title above) are combined under the
        # 'header' output key: head, h1, h2, h3, h4, h5, h6
        for head in response.css('head'):
            yield {
                'header': head.css('head::text').get(),
            }
        for h1 in response.css('h1'):
            yield {
                'header': h1.css('h1::text').get(),
            }
        for h2 in response.css('h2'):
            yield {
                'header': h2.css('h2::text').get(),
            }
        for h3 in response.css('h3'):
            yield {
                'header': h3.css('h3::text').get(),
            }
        for h4 in response.css('h4'):
            yield {
                'header': h4.css('h4::text').get(),
            }
        for h5 in response.css('h5'):
            yield {
                'header': h5.css('h5::text').get(),
            }
        for h6 in response.css('h6'):
            yield {
                'header': h6.css('h6::text').get(),
            }

        # write visited urls into a file
        file = 'visited_urls.txt'
        output_dir = '../output/%s' % MySpider.input_file
        with open(path_join(output_dir, file), 'a') as outfile:
            print(response.url, file=outfile)

    print('scrape for site: %s - in progress' % input_file)
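None of the examples show how the spider is launched. A minimal sketch using Scrapy's standard CrawlerProcess API, assuming MySpider from the module above is importable, could look like this; the feed path is a placeholder, not taken from the project.

# Minimal sketch for running MySpider outside a full Scrapy project.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # hypothetical output feed; adjust to the project's ../output layout
    'FEEDS': {'../output/items.json': {'format': 'json'}},
})
process.crawl(MySpider)
process.start()  # blocks until the crawl finishes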