import re


def get_name_urls(url, html):
    """
    get name urls from a strictHub page
    :param url: url of this page
    :param html: html string
    :return: a list of name urls
    """
    head = get_domain_url(url)
    # normalize whitespace so every <a href=...>...</a> block sits on one line
    html_clean = ' '.join(html.split())
    astring = re.findall(r'<a href=.*?>.*?</a>', html_clean)  # a href blocks
    url_list = []
    for ahref in astring:
        # anchor text is everything between '>' and '<'
        textlist = re.findall(r'>.+?<', ahref)
        text = ''.join(textlist)
        Chinese_list = findChinese(text)
        if Chinese_list and any(isname(chinese) for chinese in Chinese_list):
            try:
                # strip `href="` and the closing quote to get the raw url text
                url_text = re.findall(r'<a href=.+?>', ahref)[0].split()[1][6:-1]
                name_url = legalurl(url_text, head)
                print(name_url)
                url_list.append(name_url)
            except IndexError:  # malformed anchor tag, skip it
                continue
    return url_list
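# get_name_urls leans on helpers defined elsewhere in the repo (findChinese,
# isname, legalurl, get_domain_url). As a reading aid, below is a minimal
# sketch of what legalurl and findChinese are assumed to do: resolving a
# possibly relative href against the page's domain root, and extracting runs
# of CJK characters. These are sketches under assumptions, not the repo's
# actual implementations.
from urllib.parse import urljoin


def legalurl(url_text, head):
    """Sketch: resolve a possibly relative href against the domain url `head`."""
    url_text = url_text.strip().strip('"').strip("'")
    return urljoin(head, url_text)


def findChinese(text):
    """Sketch: return all contiguous runs of CJK characters in `text`."""
    return re.findall(r'[\u4e00-\u9fff]+', text)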
def __init__(self,
             url,
             content=None,
             html=None,
             url_list=None,
             parse_time=None,
             score_time=None,
             generation=None):
    super(Domain, self).__init__(url, content, html, url_list, parse_time,
                                 score_time, generation)
    self.visited_urls = set()  # all urls under this Domain Node
    self.Authority = set()  # all Authority pages (Node) under this Domain Node
    self.Hub = set()  # all Hub pages (Node) under this Domain Node
    self.domain_set = {get_domain_url(url)}  # all domain urls under this domain
    self.Domain_score = None
    self.score = None
    self.gen = None
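# Domain bookkeeping keys everything on get_domain_url, which is imported
# from a utility module elsewhere in the repo. A minimal sketch of the
# assumed behaviour (reduce any url to its scheme-plus-netloc root), for
# reference only:
from urllib.parse import urlparse


def get_domain_url(url):
    """Sketch: 'https://example.com/a/b?q=1' -> 'https://example.com'."""
    parsed = urlparse(url)
    return f'{parsed.scheme}://{parsed.netloc}'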
from random import random, sample, shuffle
from time import sleep


def evolve(self):
    if not self.population:
        print('nothing to evolve')
        return False

    # normalize every node's url to its domain url
    for domain_node in self.population:
        domain_node.url = get_domain_url(domain_node.url)

    # selection
    def __select():
        """
        select some individuals as new parents, popped from the population;
        the remnants are kept
        :return: new parents
        """
        if len(self.population) <= self.sel_quota:
            # the quota covers the whole population
            return self.population
        score_list = [node.score for node in self.population]
        i_list = list(range(len(self.population)))
        parents = []  # new parents
        for _ in range(self.sel_quota):
            try:
                # roulette() returns (position in i_list, item) so the winner
                # can be popped from the aligned lists
                i_to_pop, _i_population = roulette(i_list, score_list,
                                                   return_ind=True)
                i_list.pop(i_to_pop)
                score_list.pop(i_to_pop)
                assert len(i_list) == len(score_list)
                # the chosen individual becomes a parent and leaves the population
                parents.append(self.population.pop(i_to_pop))
            except Exception:
                print('illegal situation in selection')
                break
        with open('evolve_logger.txt', 'a') as f:
            f.write(f'\nafter selection: {len(self.population)}')
            for node in self.population:
                f.write(f'{node.url}\n')
        return parents

    # crossover
    def __crossover(parents):
        """
        get children from parents according to cr_quota
        :param parents: a list of domain_nodes
        :return: a list of children domain_nodes
        """
        if not parents:  # selection failed
            return []
        children_domains = []  # children domains
        children_urls = []  # urls of children domains
        for domain in parents:
            children_urls.extend([
                get_domain_url(url) for url in domain.domain_set
                if url not in self.visited_domain
            ])
        children_urls = list(set(children_urls))
        if len(children_urls) > self.cr_quota:
            # sample urls from the domains under the parents
            children_urls = sample(children_urls, self.cr_quota)
        else:
            shuffle(children_urls)
        for url in children_urls:  # parse children's urls
            domain = parse_domain(url, self.strictHubthres)
            if not domain:
                continue
            domain.get_score()
            self.visited_domain.add(get_domain_url(url))
            children_domains.append(domain)  # add the child to the children list
        with open('evolve_logger.txt', 'a') as f:
            f.write(f'\nafter crossover: {len(self.population)}\n')
            for node in self.population:
                f.write(f'{node.url}\n')
        return children_domains

    # mutation
    def __mutate(domain_list):
        """
        get mutations from random nodes
        :param domain_list: a list of domain nodes
        :return: a list of mutation domain nodes
        """
        mutants = []  # mutants
        mutation_pool = []  # mutants' urls
        domain_list = [domain for domain in domain_list if domain]  # drop False entries
        for domain in domain_list:
            if random() > self.mut_rate:
                # skip mutation for this domain
                continue
            sleep(0.4)
            print(f'mutating {domain}......')
            try:
                _, url_list = mutate(domain.url)
                url_list = [
                    get_domain_url(url) for url in url_list
                    if get_domain_url(url) not in self.visited_domain
                ]
                if url_list:
                    mutation_pool.extend(url_list)  # add mutation urls to the pool
            except Exception:  # requests in mutate() errored
                pass
        mutation_pool = list(set(mutation_pool))  # remove duplicates
        if self.mut_quota < len(mutation_pool):
            # keep the pool within the mutation quota
            mutation_pool = sample(mutation_pool, self.mut_quota)
        else:
            shuffle(mutation_pool)
        for url in mutation_pool:  # parse mutants' urls
            domain = parse_domain(url, self.strictHubthres)
            if not domain:  # parse failed
                continue
            domain.get_score()
            self.visited_domain.add(domain.url)
            mutants.append(domain)  # add the mutant to the mutant list
        with open('evolve_logger.txt', 'a') as f:
            f.write(f'\n\n\n-----------------------\n'
                    f'after mutation: {len(self.population)}')
            for node in self.population:
                f.write(f'{node.url}\n')
        return mutants

    # elimination
    def __eliminate(frac=0.2):
        if len(self.population) <= self.poolsize:
            # drop score 0.01 (failed request, or no Authority page under the domain)
            self.population = [
                node for node in self.population if node.score != 0.01
            ]
            return True
        # sort ascending and cut off the worst `frac` of the population
        self.population = sorted(self.population,
                                 key=lambda domain: domain.score)
        self.population = self.population[round(frac * len(self.population)):]
        # drop score 0.01 (failed request) after the cut
        self.population = [
            node for node in self.population if node.score != 0.01
        ]
        with open('evolve_logger.txt', 'a') as f:
            f.write(f'\nafter elimination: {len(self.population)}')
            for node in self.population:
                f.write(f'{node.url}\n')

    # main events in an evolution
    mutants = __mutate(self.population)  # mutate before parents are picked out
    parents = __select()
    children = __crossover(parents)
    __eliminate(0.2)  # remove the worst 20% if the population is too large
    self.population += mutants + children  # population changed
    # make sure they are all domain_urls

    # generation marker
    for domain_node in self.population:
        if domain_node.gen is None:
            domain_node.gen = self.genmarker
    self.genmarker += 1
    return self.population
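# __select relies on roulette() from a utility module. Below is a minimal
# sketch of fitness-proportionate (roulette-wheel) selection matching the
# contract assumed above, where return_ind=True yields (position, item) so
# the caller can pop the winner. This is a sketch, not the repo's actual
# implementation.
from random import uniform


def roulette(items, weights, return_ind=False):
    """Sketch: spin once over `weights`; heavier items win more often."""
    spin = uniform(0, sum(weights))
    cumulative = 0.0
    for i, (item, weight) in enumerate(zip(items, weights)):
        cumulative += weight
        if spin <= cumulative:
            return (i, item) if return_ind else item
    # floating-point fallthrough: hand back the last item
    return (len(items) - 1, items[-1]) if return_ind else items[-1]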
from copy import deepcopy
from time import sleep


def parse_domain(domain_url, threshold=5):
    """
    input a string representing a domain url, return a node containing
    the information in this domain
    :param domain_url: a string representing a url
    :param threshold: strictHub recognition threshold
    :return: a node representing this domain with its information
    """
    generation = 0
    # get the domain url and build the domain Node
    domain_url = get_domain_url(domain_url)
    domain_urls = [domain_url]  # to fit parse()
    try:
        parsed_urls, content_list, html_list, offspring_list = parse(domain_urls)
        _ = html_list[0]
    except Exception:
        print(f'gen 0 parse error: {domain_url}')
        sleep(10)
        return False
    domain_node = Domain(url=parsed_urls[0],
                         content=content_list[0],
                         html=html_list[0],
                         url_list=list(set(offspring_list[0])),
                         generation=generation)

    # parse the domain and get nodes with depth 1
    generation += 1
    # mark the home page as visited so it will not appear in dep1_nodes
    domain_node.visited_urls.add(domain_node.url)
    dep1_nodes = parse_node(domain_node, domain_node.visited_urls)
    for url in domain_node.url_list:
        # in case a url in url_list differs from the one in parsed_urls
        domain_node.visited_urls.add(url)
    if isStrictHub(domain_node.html, threshold=threshold):
        domain_node.Hub.add(domain_node)
        domain_node.type = 'Hub'
        for node in dep1_nodes:
            node.generation = generation
            node.type = 'Authority'
            domain_node.visited_urls.add(node.url)  # depth 1 authority urls are visited
            # register depth 1 nodes as Authority nodes under this domain node
            domain_node.Authority.add(node)
    else:
        # not strictHub, so the domain node is not considered Authority either
        domain_node.type = 'not_related'
        for node in dep1_nodes:
            node.generation = generation
            node.type = None  # could be strictHub
            domain_node.visited_urls.add(node.url)  # depth 1 urls are visited

    # sink
    next_parents = dep1_nodes  # parents of the next generation
    satisfied = False
    for _loop in range(2):
        generation += 1
        if satisfied and _loop >= 1:
            # a Hub page was found besides the home page; stop sinking
            break
        parents = next_parents  # parents of this generation
        next_parents = []
        for node in parents:
            if not node.type and isStrictHub(node.html, threshold=threshold):
                satisfied = True
                domain_node.Hub.add(node)
                node.type = 'Hub'
                # parse offspring nodes if not visited; the parent is unaffected
                authority = parse_node(node, domain_node.visited_urls)
                for url in node.url_list:
                    # in case a url in url_list differs from the parsed one
                    domain_node.visited_urls.add(url)
                for authority_node in authority:
                    authority_node.type = 'Authority'
                    authority_node.generation = generation
                    domain_node.visited_urls.add(authority_node.url)
                    domain_node.Authority.add(authority_node)
            else:
                # parent nodes from the second loop onward start here
                node.type = 'not_related'  # not Authority and not strictHub
                # get the offspring of this non-strictHub node
                not_authority_list = parse_node(node, domain_node.visited_urls)
                for url in node.url_list:
                    domain_node.visited_urls.add(url)
                for not_authority_node in not_authority_list:
                    # TODO: if an Authority is exposed under a non-strictHub page,
                    # it will be regarded as not-related permanently
                    not_authority_node.generation = generation
                    domain_node.visited_urls.add(not_authority_node.url)
                    if isStrictHub(not_authority_node.html, threshold=threshold):
                        # this node is a strictHub after all
                        domain_node.Hub.add(not_authority_node)
                        not_authority_node.type = 'Hub'
                        # parse the newly found hub's offspring if not visited
                        authority = parse_node(not_authority_node,
                                               domain_node.visited_urls)
                        for url in not_authority_node.url_list:
                            domain_node.visited_urls.add(url)
                        for authority_node in authority:
                            authority_node.type = 'Authority'
                            authority_node.generation = generation + 1
                            domain_node.visited_urls.add(authority_node.url)
                            domain_node.Authority.add(authority_node)
                    else:
                        # not strictHub either; push it to the next generation
                        not_authority_node.type = 'not_related'
                        next_parents.append(not_authority_node)
        next_parents = [
            node for node in next_parents
            if node.url not in domain_node.visited_urls
        ]

    # potential Hub among the Authority nodes
    generation += 1
    for node in deepcopy(domain_node.Authority):
        if isStrictHub(node.html, threshold):
            # this Authority node is also a Hub node
            domain_node.Hub.add(node)
            node.type = 'strictHub and Authority'
            authority_list = parse_node(node, domain_node.visited_urls)
            for url in node.url_list:
                domain_node.visited_urls.add(url)
            if not authority_list:
                continue
            for authority_node in authority_list:
                authority_node.generation = generation
                authority_node.type = 'Authority'
                domain_node.visited_urls.add(authority_node.url)
                domain_node.Authority.add(authority_node)

    # add the outer domains reachable under this domain
    for url in domain_node.visited_urls:
        domain_node.domain_set.add(get_domain_url(url))

    # get the domain score
    domain_node.get_score()
    return domain_node
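# Hypothetical driver showing the intended call pattern of parse_domain;
# the entry url is illustrative, and in the full crawler the evolve() loop
# supplies urls instead.
if __name__ == '__main__':
    node = parse_domain('https://example.com', threshold=5)
    if node:
        print(f'{node.url}: type={node.type}, score={node.score}')
        print(f'  hub pages found:       {len(node.Hub)}')
        print(f'  authority pages found: {len(node.Authority)}')
        print(f'  domains discovered:    {len(node.domain_set)}')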