Esempio n. 1
0
        def __mutate(domain_list):
            """
            Get mutant domain nodes from random members of domain_list.

            Each domain is mutated with probability self.mut_rate; the
            resulting unvisited domain urls are pooled, deduplicated,
            capped at self.mut_quota, then parsed and scored.

            :param domain_list: a list of domain nodes (falsy entries allowed)
            :return: a list of mutant domain nodes
            """

            mutants = []        # parsed mutant domain nodes
            mutation_pool = []  # candidate mutant urls
            domain_list = [domain for domain in domain_list
                           if domain]  # get rid of False (failed parses)
            for domain in domain_list:
                # skip this domain with probability (1 - mut_rate)
                if random() > self.mut_rate:
                    continue

                sleep(0.4)  # throttle outbound requests
                print(f'mutating {domain}......')
                try:
                    _, url_list = mutate(domain.url)
                    url_list = [
                        get_domain_url(url) for url in url_list
                        if get_domain_url(url) not in self.visited_domain
                    ]
                    if url_list:
                        # add mutation urls to mutation_pool
                        mutation_pool.extend(url_list)
                except Exception:  # best-effort: requests in mutate() may fail
                    pass

            mutation_pool = list(set(mutation_pool))  # remove duplication
            if self.mut_quota < len(mutation_pool):
                # control size within set mutation quota
                mutation_pool = sample(mutation_pool, self.mut_quota)
            else:
                shuffle(mutation_pool)

            for url in mutation_pool:  # parse mutants' urls
                domain = parse_domain(url, self.strictHubthres)
                if not domain:
                    # parse_domain returns False on failure; previously this
                    # crashed on domain.get_score()
                    continue
                domain.get_score()
                self.visited_domain.add(domain.url)
                mutants.append(domain)  # add mutants to mutant list

            with open('evolve_logger.txt', 'a') as f:
                f.write(
                    f'\n\n\n-----------------------\nafter mutation: {len(self.population)}'
                )
                for node in self.population:
                    f.write(f'{node.url}\n')

            return mutants
Esempio n. 2
0
def get_name_urls(url, html):
    """
    Get name urls from a strictHub page.

    Scans every ``<a href=...>`` block of the page; when the anchor text
    contains at least one Chinese token recognized as a name, the href is
    normalized against the page's domain and collected.

    :param url: url of this page
    :param html: html string
    :return: a list of name urls
    """

    head = get_domain_url(url)

    # format html string and find all 'a' string
    html_clean = ' '.join(html.split())
    astring = re.findall(r'<a href=.*?>.*?</a>', html_clean)  # a href block
    print(astring)
    # def containers
    url_list = []

    for ahref in astring:
        # anchor text is everything between '>' and '<'
        textlist = re.findall(r'>.+?<', ahref)
        text = ''.join(textlist)
        Chinese_list = findChinese(text)
        # keep this anchor only if at least one Chinese token looks like a name
        if Chinese_list and any(isname(chinese) for chinese in Chinese_list):
            try:
                # extract the href value: strip '<a href="' prefix and '"' suffix
                url_text = re.findall(r'<a href=.+?>',
                                      ahref)[0].split()[1][6:-1]
                name_url = legalurl(url_text, head)
                print(name_url)
                url_list.append(name_url)
            except Exception:  # malformed anchor; skip it
                pass

    # BUG FIX: the original fell off the end and returned None despite the
    # documented contract; return the collected name urls.
    return url_list
Esempio n. 3
0
        def __crossover(parents):
            """
            Produce children domain nodes from the given parents.

            Unvisited domain urls found under every parent are pooled,
            capped at self.cr_quota, then parsed and scored.

            :param parents: a list of domain_nodes
            :return: a list of children domain_nodes, or False if no parents
            """
            if not parents:  # selection failed
                return False

            # gather unvisited domain urls found under every parent
            candidate_urls = set()
            for parent in parents:
                candidate_urls.update(
                    get_domain_url(url) for url in parent.domain_set
                    if url not in self.visited_domain)
            candidate_urls = list(candidate_urls)

            # sample urls from domains under parents, within quota
            if len(candidate_urls) > self.cr_quota:
                candidate_urls = sample(candidate_urls, self.cr_quota)
            else:
                shuffle(candidate_urls)

            offspring = []  # children domains
            for url in candidate_urls:  # parse children's urls
                child = parse_domain(url, self.strictHubthres)
                if not child:
                    continue
                child.get_score()
                self.visited_domain.add(get_domain_url(url))
                offspring.append(child)  # add child to children list

            with open('evolve_logger.txt', 'a') as f:
                f.write(f'\nafter crossover: {len(self.population)}\n')
                for node in self.population:
                    f.write(f'{node.url}\n')

            return offspring
Esempio n. 4
0
 def __init__(self,
              url,
              content=None,
              html=None,
              url_list=None,
              parse_time=None,
              score_time=None,
              generation=None):
     """Initialize a Domain node on top of the base node state."""
     super(Domain, self).__init__(url, content, html, url_list, parse_time,
                                  score_time, generation)
     # all urls visited under this Domain Node
     self.visited_urls = set()
     # all Authority pages (Nodes) under this Domain Node
     self.Authority = set()
     # all Hub pages (Nodes) under this Domain Node
     self.Hub = set()
     # all domain urls discovered under this domain
     self.domain_set = {get_domain_url(url)}
     self.Domain_score = None  # raw domain score
     self.score = None         # fitness score used during evolution
     self.gen = None           # generation marker
Esempio n. 5
0
    def evolve(self):
        """
        Run one generation of evolution over self.population.

        Pipeline: mutate (on the full population), select parents, crossover,
        eliminate the weakest, merge mutants and children back into the
        population, then advance the generation marker.

        :return: the updated population, or False if the population is empty
        """
        if not self.population:
            print('nothing to evolve')
            return False

        # get domain urls: normalize every node's url to its domain url
        for domain_node in self.population:
            domain_node.url = get_domain_url(domain_node.url)

        # selection
        def __select():
            """
            Select some individuals as new parents, popped from population;
            the remnants are kept.

            :return: new parents
            """
            # quota is large enough for the whole population
            if len(self.population) <= self.sel_quota:
                return self.population

            score_list = [node.score for node in self.population]
            i_list = list(range(len(self.population)))

            parents = []  # new parents
            for _ in range(self.sel_quota):
                try:
                    # return (the ith item in i_list, the ith node in population)
                    i_to_pop, i_population = roulette(
                        i_list, score_list, return_ind=True)
                    i_list.pop(i_to_pop)
                    score_list.pop(i_to_pop)
                    assert len(i_list) == len(score_list)
                    # one individual is chosen as a new parent and picked out
                    # of the population
                    parents.append(self.population.pop(i_to_pop))
                except Exception:
                    print('illegal situation in selection')
                    break

            with open('evolve_logger.txt', 'a') as f:
                f.write(f'\nafter selection: {len(self.population)}')
                for node in self.population:
                    f.write(f'{node.url}\n')

            return parents

        # crossover
        def __crossover(parents):
            """
            Get children from parents according to cr_quota.

            :param parents: a list of domain_nodes
            :return: a list of children domain_nodes, or False if no parents
            """
            if not parents:  # selection failed
                return False

            children_domains = []  # children domains
            children_urls = []     # urls of children domains
            for domain in parents:
                children_urls.extend([
                    get_domain_url(url) for url in domain.domain_set
                    if url not in self.visited_domain
                ])

            children_urls = list(set(children_urls))
            # sample urls from domains under parents, within quota
            if len(children_urls) > self.cr_quota:
                children_urls = sample(children_urls, self.cr_quota)
            else:
                shuffle(children_urls)

            for url in children_urls:  # parse children's urls
                domain = parse_domain(url, self.strictHubthres)
                if not domain:  # parse failed
                    continue
                domain.get_score()
                self.visited_domain.add(get_domain_url(url))
                children_domains.append(domain)  # add child to children list

            with open('evolve_logger.txt', 'a') as f:
                f.write(f'\nafter crossover: {len(self.population)}\n')
                for node in self.population:
                    f.write(f'{node.url}\n')

            return children_domains

        # mutation
        def __mutate(domain_list):
            """
            Get mutants from random nodes.

            :param domain_list: a list of domain nodes (falsy entries allowed)
            :return: a list of mutant domain nodes
            """
            mutants = []        # mutants
            mutation_pool = []  # mutants' urls
            domain_list = [domain for domain in domain_list
                           if domain]  # get rid of False
            for domain in domain_list:
                # skip this domain with probability (1 - mut_rate)
                if random() > self.mut_rate:
                    continue

                sleep(0.4)  # throttle outbound requests
                print(f'mutating {domain}......')
                try:
                    _, url_list = mutate(domain.url)
                    url_list = [
                        get_domain_url(url) for url in url_list
                        if get_domain_url(url) not in self.visited_domain
                    ]
                    if url_list:
                        # add mutation urls to mutation_pool
                        mutation_pool.extend(url_list)
                except Exception:  # requests in mutate() may fail; best-effort
                    pass

            mutation_pool = list(set(mutation_pool))  # remove duplication
            if self.mut_quota < len(mutation_pool):
                # control size within set mutation quota
                mutation_pool = sample(mutation_pool, self.mut_quota)
            else:
                shuffle(mutation_pool)

            for url in mutation_pool:  # parse mutants' urls
                domain = parse_domain(url, self.strictHubthres)
                if not domain:
                    # BUG FIX: parse_domain returns False on failure; this
                    # previously crashed on domain.get_score(), unlike the
                    # guarded loop in __crossover
                    continue
                domain.get_score()
                self.visited_domain.add(domain.url)
                mutants.append(domain)  # add mutants to mutant list

            with open('evolve_logger.txt', 'a') as f:
                f.write(
                    f'\n\n\n-----------------------\nafter mutation: {len(self.population)}'
                )
                for node in self.population:
                    f.write(f'{node.url}\n')

            return mutants

        def __eliminate(frac=0.2):
            """Drop the worst frac of an oversized population and purge 0.01-score nodes."""
            if len(self.population) <= self.poolsize:
                # eliminate 0.01 (false requested domain or no Authority page
                # under this domain)
                self.population = [
                    node for node in self.population if node.score != 0.01
                ]
                return True
            # ascending sort: the worst individuals come first
            self.population = sorted(self.population,
                                     key=lambda domain: domain.score,
                                     reverse=False)
            self.population = self.population[round(frac *
                                                    len(self.population)):]
            # eliminate 0.01 (false requested domain)
            self.population = [
                node for node in self.population if node.score != 0.01
            ]  # filter wrong parsed node after elimination

            with open('evolve_logger.txt', 'a') as f:
                f.write(f'\nafter elimination: {len(self.population)}')
                for node in self.population:
                    f.write(f'{node.url}\n')

        # main events in an evolution:
        # mutate before parent domain nodes are picked out
        mutants = __mutate(self.population)
        parents = __select()
        children = __crossover(parents)

        # remove the worst 20% of the population if it's too large
        __eliminate(0.2)
        self.population += mutants + children  # population changed
        # make sure they are all domain_urls

        # generation marker: stamp newcomers with the current generation
        for domain_node in self.population:
            if domain_node.gen is None:
                domain_node.gen = self.genmarker
        self.genmarker += 1
        return self.population
Esempio n. 6
0
def parse_domain(domain_url, threshold=5):
    """
    Parse a domain url into a Domain node holding the information found there.

    Crawls the home page, then a bounded number of descendant generations,
    classifying pages as 'Hub' (a strictHub page), 'Authority' (linked from a
    Hub) or 'not_related', and finally scores the domain.

    :param domain_url: a string representing a url
    :param threshold: strictHub recognition threshold
    :return: a Domain node with its information, or False on parse failure
    """
    generation = 0

    # get domain url
    domain_url = get_domain_url(domain_url)

    # build domain Node
    domain_url = [domain_url]  # to fit parse()
    try:
        parsed_urls, content_list, html_list, offspring_list = parse(
            domain_url)
        _ = html_list[0]  # raises if nothing was parsed
    except Exception:
        # BUG FIX: was print('gen 0 parse error: domain_url') — a literal,
        # not the url; the missing f-prefix hid which domain failed
        print(f'gen 0 parse error: {domain_url}')
        sleep(10)
        return False

    domain_node = Domain(url=parsed_urls[0],
                         content=content_list[0],
                         html=html_list[0],
                         url_list=list(set(offspring_list[0])),
                         generation=generation)

    # parse domain, get nodes with depth 1
    generation += 1
    # mark as visited url so the home page will not be in dep1_nodes
    domain_node.visited_urls.add(domain_node.url)
    dep1_nodes = parse_node(domain_node, domain_node.visited_urls)
    # in case that url in url_list differs from that in parsed_list
    for url in domain_node.url_list:
        domain_node.visited_urls.add(url)
    if isStrictHub(domain_node.html, threshold=threshold):
        domain_node.Hub.add(domain_node)
        domain_node.type = 'Hub'
        for node in dep1_nodes:
            node.generation = generation
            node.type = 'Authority'
            # mark depth 1 authority urls as visited
            domain_node.visited_urls.add(node.url)
            # mark depth 1 authority nodes as authority nodes under this
            # domain node
            domain_node.Authority.add(node)
    else:
        # not strictHub and domain_node is not considered to be Authority
        domain_node.type = 'not_related'
        for node in dep1_nodes:
            node.generation = generation
            node.type = None  # could be strictHub
            # mark depth 1 urls as visited
            domain_node.visited_urls.add(node.url)

    # sink: explore up to two more generations looking for Hub pages
    next_parents = dep1_nodes  # parent of next generation
    satisfied = False
    for _loop in range(2):
        generation += 1
        # there's a Hub page found besides the home page
        if satisfied and _loop > 1:
            break
        parents = next_parents  # parent of this generation
        next_parents = []
        ul = [node.url for node in parents]
        for node in parents:
            if not node.type and isStrictHub(node.html, threshold=threshold):
                satisfied = True
                domain_node.Hub.add(node)
                node.type = 'Hub'
                # parse offspring nodes if not visited, not affecting parent
                authority = parse_node(node, domain_node.visited_urls)
                # in case that url in url_list differs from parsed_list
                for url in node.url_list:
                    domain_node.visited_urls.add(url)
                for authority_node in authority:
                    authority_node.type = 'Authority'
                    authority_node.generation = generation
                    domain_node.visited_urls.add(authority_node.url)
                    domain_node.Authority.add(authority_node)
            else:  # parent nodes from second loop or after start here
                node.type = 'not_related'  # not Authority and not strictHub
                # get offsprings of this not-strictHub
                not_authority_list = parse_node(node,
                                                domain_node.visited_urls)
                # in case that url in url_list differs from parsed_list
                for url in node.url_list:
                    domain_node.visited_urls.add(url)
                for not_authority_node in not_authority_list:
                    # TODO: if an Authority is exposed under a non-StrictHub
                    # page, it will be regarded as not-related permanently
                    not_authority_node.generation = generation
                    domain_node.visited_urls.add(not_authority_node.url)
                    # if this not-authority node is a strictHub
                    if isStrictHub(not_authority_node.html,
                                   threshold=threshold):
                        domain_node.Hub.add(not_authority_node)
                        not_authority_node.type = 'Hub'
                        # NOTE(review): this re-parses `node` (the parent),
                        # not `not_authority_node` (the hub just found) —
                        # likely a bug, but kept as-is to preserve behavior;
                        # confirm intent before changing
                        authority = parse_node(node, domain_node.visited_urls)
                        for url in node.url_list:
                            domain_node.visited_urls.add(url)
                        for authority_node in authority:
                            authority_node.type = 'Authority'
                            authority_node.generation = generation + 1
                            domain_node.visited_urls.add(authority_node.url)
                            domain_node.Authority.add(authority_node)
                    else:
                        # not strictHub either: push it to next parents
                        not_authority_node.type = 'not_related'
                        next_parents.append(not_authority_node)
        next_parents = [
            node for node in next_parents
            if node.url not in domain_node.visited_urls
        ]

    # potential Hub in Authority: one extra pass over a snapshot, since
    # domain_node.Authority is mutated inside the loop
    for _ in range(1):
        generation = generation + 1
        for node in deepcopy(domain_node.Authority):
            # this Authority node is also a Hub node
            if isStrictHub(node.html, threshold):
                domain_node.Hub.add(node)
                node.type = 'strictHub and Authority'
                # parse nodes if not visited
                authority_list = parse_node(node, domain_node.visited_urls)
                # in case that url in url_list differs from parsed_list
                for url in node.url_list:
                    domain_node.visited_urls.add(url)
                if not authority_list:
                    continue
                for authority_node in authority_list:
                    authority_node.generation = generation
                    authority_node.type = 'Authority'
                    domain_node.visited_urls.add(authority_node.url)
                    domain_node.Authority.add(authority_node)

    # add outer domains found under this domain
    for url in domain_node.visited_urls:
        domain_node.domain_set.add(get_domain_url(url))

    # get domain score
    domain_node.get_score()

    return domain_node