Example no. 1
    def compare_domains(self):
        """
        Compare if domains have similar content.

        Returns :

        joint_keywords for two domains
        joint_ratio

        """
        project = 'renault'
        _, output_dir = dir_from_project(project)

        selected_domains = ['valeo', 'whatcar']

        percentage_key = 0.25
        col_min = 1

        finder = Keyword_finder(selected_domains, output_dir, n_keyword=n_keyword,
                                useless_w_list=useless_w_list)

        all_kwords = finder.all_keyword()
        joint_kwords, support_dict = finder.common_keyword(col_min, percentage_key)
        joint_ratio = round(len(joint_kwords) / len(all_kwords) * 100, 2)
        print('Joint keywords for %s and %s are:' % (selected_domains[0], selected_domains[1]), sorted(joint_kwords))
        # printing the ratio to judge the model configuration
        print('joint_ratio:', joint_ratio, '%')
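
A minimal sketch, under the assumption that the surrounding module defines them: compare_domains relies on the names n_keyword and useless_w_list, which are not shown in this excerpt. They might be configured roughly like this (both values are illustrative only).

n_keyword = 50                                # hypothetical number of keywords kept per domain
useless_w_list = ['cookie', 'login', 'menu']  # hypothetical words excluded from keyword extraction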
Example no. 2
def keywords_per_site(site_list):
    """
    Calculate keywords appearing on given site.
    Save results in kwords_db folder per site.
    """
    _, output_dir = dir_from_project('iterative')
    for site in site_list:
        finder = Site_keyword(site, output_dir)
        all_kwords = finder.all_keyword()
        kwords_dir = '../kwords_db'
        file = '%s_kw.json' % site
        data = {}
        data['language'] = ''  # pass from lang detect
        data['name'] = '%s' % site
        data['len_all_kwords'] = len(all_kwords)
        data['all_kwords'] = sorted(str(el) for el in all_kwords)

        with open(path_join(kwords_dir, file), 'w') as outfile:
            json.dump(data, outfile, indent=4, sort_keys=False)
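
A possible invocation, not shown in the source: calling keywords_per_site with a list of site names writes one <site>_kw.json file per site into ../kwords_db. The site names below are illustrative only (borrowed from Example no. 1).

keywords_per_site(['valeo', 'whatcar'])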
Example no. 3
    def compare_audiences(self):
        """
        Compare if audiences have similar content.

        Returns :

        joint_keywords for two audiences
        joint_ratio

        """
        project = 'renault'
        _, output_dir = dir_from_project(project)
        selected_audiences = 'Car enthusiasts'
        input_dir = '../kwords_db'
        sys.path.append(input_dir)
        sub_site = []
        for f in os.listdir(input_dir):
            if os.path.isfile(path_join(input_dir, f)) and 'json' in f:
                with open(path_join(input_dir, f), 'r') as json_file:
                    data = json.load(json_file)
                    # collect sites whose keyword file matches the selected audience
                    if data['audience'] == selected_audiences:
                        name = data['name']
                        if name not in sub_site:
                            sub_site.append(name)

        print(sub_site)
        percentage_key = 0.25
        col_min = 1

        finder = Keyword_finder(sub_site, output_dir, n_keyword=n_keyword,
                                useless_w_list=useless_w_list)

        all_kwords = finder.all_keyword()
        joint_kwords, support_dict = finder.common_keyword(col_min, percentage_key)
        joint_ratio = round(len(joint_kwords) / len(all_kwords) * 100, 2)
        print('joint =', sorted(joint_kwords))
        # printing the ratio to judge the model configuration
        print('joint_ratio:', joint_ratio, '%')
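
For reference, a sketch of the JSON layout that compare_audiences expects to find in ../kwords_db, inferred from the keys it reads above. The 'audience' field is assumed to be added to the files produced in Example no. 2, and every value shown is illustrative.

{
    "language": "en",
    "name": "whatcar",
    "audience": "Car enthusiasts",
    "len_all_kwords": 123,
    "all_kwords": ["..."]
}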
Example no. 4
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-f",
                        help="file to be parsed",
                        type=str,
                        default='ces2020')
    parser.add_argument("-p",
                        help="project",
                        type=str,
                        default='renault',
                        choices=project_list)
    args = parser.parse_args()
    site = str(args.f).replace("\n", "")
    project = str(args.p).replace("\n", "")

    input_dir, output_dir = dir_from_project(project)
    #    print(project, input_dir, output_dir)
    encoding = 'utf8'
    passing_file = path_join(input_dir, 'site_to_scrap.txt')
    with open(passing_file, 'w', encoding=encoding) as filehandle:
        filehandle.write('%s' % site)
    out_file = path_join(output_dir, site, 'store.json')
    # remove any existing output file to avoid appending to the end of it
    try:
        os.remove(out_file)
    except FileNotFoundError:
        pass
#        print('nothing to remove')
    item_list = spider_results(site, project)
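
A sketch of what the spider_results helper called above might look like; this is an assumption, since its definition is not part of the excerpt. It runs the project's spider with Scrapy's CrawlerProcess, exports the items to the store.json path prepared above, and returns them as a list.

def spider_results(site, project):
    # hypothetical helper: the real implementation is not shown in the source
    from scrapy.crawler import CrawlerProcess

    _, output_dir = dir_from_project(project)
    out_file = path_join(output_dir, site, 'store.json')

    process = CrawlerProcess(settings={
        'FEEDS': {out_file: {'format': 'json'}},
    })
    process.crawl(MySpider)  # MySpider reads site_to_scrap.txt (see Example no. 7)
    process.start()          # blocks until the crawl finishes

    with open(out_file, 'r') as f:
        return json.load(f)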
Example no. 5
    def __init__(self, google_name, google_url, top_level_domain):
        """
        Save the rules into an input json file.

        Parameters :

        common_deny : list
            Common nodes not to follow.
        languages_deny: list
            Language codes to omit during scrape.
        specific_deny : list
            Insert particular HTML nodes below to exclude them from broad crawl
            Precise use.
        name : str
            Name of the file.
        allowed_domains : url
            Website which we want to scrape. ("example.com")
        start_urls : url
            Initial links for broad crawl.
        restrict_text : list
            Allow only the content which contains given text. Applied within
            items (eg. a paragraph). For precise use.

        Returns :

        input file : json
            Contains all the rules obtained from iterative search and specified
            in this module.
        """
        self.google_name = google_name
        self.google_url = google_url
        self.top_level_domain = top_level_domain

        input_dir, _ = dir_from_project('iterative')
        file = '%s.json' % self.google_name

        common_deny = [
            'contact', 'newsletter', 'special-offers', 'financial-services',
            'find-a-dealer', 'site-map', 'forum', 'privacy', 'terms', 'faq',
            'finance', 'careers', 'contact-us', 'gallery', 'healthcare',
            'health-care', '/events/'
        ]
        languages_deny = [
            '/es/', '/de/', '/at/', '/be/', '/cz/', '/ie/', '/it/', '/no/',
            '/pl/', '/se/', '/sk/', '/ja/', '/ko/', '/bs/', '/pt/', '/bg/',
            '/zh/', '/cs/', '/da/', '/ro/', '/ru/', '/sq/', '/sr/', '/nl/',
            '/nb/', '/sk/', '/sl/', '/sv/', '/tr/', '/hu/', '/hr/', '/mk/',
            '/ua/'
        ]  #'/fr/', '/en/'
        specific_deny = []

        data = {}
        data['language'] = ''  # pass from langdetect
        data['name'] = '%s' % self.google_name
        data['allowed_domains'] = ['%s' % self.top_level_domain]
        data['start_urls'] = ['%s' % self.google_url]
        '''
        common_allow = ['auto-industry', 'auto-manufacturer', 'auto-suppliers',
                         'automobile', 'automotive', 'autonomous', 'car',
                         'car-makers', 'carbon-offset', 'carmakers',
                         'charging infrastructure', 'decarbonization',
                         'emerging-trends-real-estate', 'energy-demand',
                         'energy-insights', 'ev adoption',
                         'innovation', 'low carbon', 'market-intelligence',
                         'mobility', 'mobility ecosystem', 'mobilitys',
                         'oil-and-gas', 'resource-revolution',
                         'sectors-without-borders', 'self-driving',
                         'sustainability', 'sustainable', 'transport',
                         'vehicle']
        specific_allow = []
        data['allow'] = common_allow + specific_allow
        '''
        data['allow'] = []
        data['deny'] = languages_deny + specific_deny  # + common_deny
        # data['deny_domains'] = ()
        # data['restrict_text'] = []

        with open(path_join(input_dir, file), 'w') as outfile:
            json.dump(data, outfile, indent=4, sort_keys=False)
        """
Example no. 6
                    type=str,
                    default='renault',
                    choices=project_list)

args = parser.parse_args()
site_list = list(args.f)
w_list = list(args.w)
project = str(args.p).replace("\n", "")
"""
INITIALISATION

"""

percentage_n = 0.01
k_cluster = 2

_, output_dir = dir_from_project()
for word in w_list:
    print('word:', word)
    for site in site_list:
        print('site:', site)
        # quantiles are printed from semantic_search.py
        sem_src = Semantic_search(output_dir,
                                  site,
                                  percentage=percentage_n,
                                  language="english",
                                  normalizer="spacy")
        sem_src.is_keyword(word)
        syn_used, similar = sem_src.sem_sphere(word)
        syn_unique = []
        # skip printing the keyword in synonyms
        for syn_used in [x for x in syn_used if x != word]:
Example no. 7
class MySpider(CrawlSpider):
    #    print('USING ITERATIVE')
    input_dir, _ = dir_from_project('iterative')
    encoding = 'utf8'
    passing_file = path_join('../input', 'site_to_scrap.txt')

    with open(passing_file, 'r', encoding=encoding) as f:
        input_file = str(f.readlines()[0]).replace('\n', '')

    with open(path_join(input_dir, input_file + '.json'), 'r') as json_file:
        data = json.load(json_file)

    name = data['name']
    allowed_domains = data['allowed_domains']
    start_urls = data['start_urls']

    rules = (Rule(LinkExtractor(allow=data['allow'], deny=data['deny']),
                  callback='parse_page',
                  follow=True), )

    def parse_page(self, response):
        # the following tags are combined into the 'p' output:
        # p, b, ul, li, strong
        for tag in ('p', 'b', 'ul', 'li', 'strong'):
            for node in response.css(tag):
                yield {
                    'p': node.css('%s::text' % tag).get(),
                }
        for span in response.css('span'):
            yield {
                'span': span.css('span::text').get(),
            }
        # the following selectors are combined into the 'header' output:
        # div.title, head, h1, h2, h3, h4, h5, h6
        for sel in ('div.title', 'head', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            for node in response.css(sel):
                yield {
                    'header': node.css('%s::text' % sel).get(),
                }

        # write visited urls into a file
        file = 'visited_urls.txt'
        with open(path_join('../output/%s' % MySpider.input_file, file), 'a')\
                as outfile:
            print(response.url, file=outfile)

    print('scrape for site: %s - in progress' % input_file)
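
For reference, each item yielded by parse_page (and collected into store.json) takes one of the following shapes, inferred directly from the yield statements above:

{"p": "... paragraph, bold, list or strong text ..."}
{"span": "... span text ..."}
{"header": "... title or heading text ..."}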