Esempio n. 1
0
    def newmreq(self):
        """ Build a brand-new MassRequest on every call.

        Workaround for the iFuzzer class using self.mreq.
        """
        # Reuse the stored configuration so every MassRequest is set up
        #  identically to self.mreq.
        config = self.mreq_config_dict
        return MassRequest(**config)
Esempio n. 2
0
    def __init__(self, targets=None, payloads=None, num_threads=10,
                 time_per_url=10, request_timeout=10, proxy_list=None,
                 hadoop_reporting=False, depreciated=None):
        """ Initialize this WebFuzzer object.

        targets             list of Target objects. Default [].
        payloads            list of Payload objects. Default [].
        num_threads         Number of threads/processes to launch as an int.
                                Default 10.
        time_per_url        Time in seconds to spend on each Target.
                                Default 10.
        request_timeout     Time in seconds to wait for a connection before
                                giving up. Default 10.
        proxy_list          list of proxies specified as dicts. Default empty.
        hadoop_reporting    Output info for hadoop if True. Default False.
        depreciated         UNUSED. Never read in this method; presumably
                                kept so older call sites keep working --
                                confirm against callers before removing.
        """
        super(WebFuzzer, self).__init__()
        # do this because we may need to create more MassRequest objects in
        #  checks (like bsqli), needs to be configured the same
        self.mreq_config_dict = {"num_threads": num_threads,
                                 "time_per_url": time_per_url,
                                 # a single empty dict means "no proxy"
                                 "proxy_list": proxy_list or [{}],
                                 "request_timeout": request_timeout,
                                 "hadoop_reporting": hadoop_reporting}
        self.mreq = MassRequest(**self.mreq_config_dict)
        self.targets = targets or []
        self.payloads = payloads or []
        # one checker instance per supported vulnerability class
        self.mxi_check = MXICheck()
        self.osci_check = OSCICheck()
        self.sqli_check = SQLICheck()
        self.trav_check = TravCheck()
        self.xpathi_check = XPathICheck()
        self.xss_check = XSSCheck()
        self.hadoop_reporting = hadoop_reporting
        if self.hadoop_reporting:
            logger.info("Hadoop reporting set in fuzzer")
        self.fuzzy_targets = []
Esempio n. 3
0
 def fetch(self, num_threads=10, time_per_url=10, request_timeout=10,
           proxy_list=None):
     """Fetch URLs and append them to the seed list.

     num_threads      Number of request threads. Default 10.
     time_per_url     Seconds to spend on each target. Default 10.
     request_timeout  Seconds to wait for a connection. Default 10.
     proxy_list       list of proxy dicts. Default empty (no proxy).
     """
     # BUGFIX: avoid the mutable default argument [{}] for proxy_list.
     self.mreq = MassRequest(num_threads=num_threads,
                             time_per_url=time_per_url,
                             request_timeout=request_timeout,
                             proxy_list=proxy_list or [{}],
                             hadoop_reporting=True)
     # Log the still-unfetched targets directly; no need to build an
     #  intermediate list first.
     for target in self.targets:
         if target.status == "unfetched":
             logger.info("Fetching %s", target)
     # NB: this only fetches via GET, doesn't submit forms for more links
     self.mreq.get_targets(self.targets)
     self.results = self.mreq.results
     for target in self.targets:
         target.status = "fetched"
Esempio n. 4
0
 def fetch(self, num_threads=10, time_per_url=10, request_timeout=10,
           proxy_list=None):
     """Fetch URLs and append them to the seed list.

     num_threads      Number of request threads. Default 10.
     time_per_url     Seconds to spend on each target. Default 10.
     request_timeout  Seconds to wait for a connection. Default 10.
     proxy_list       list of proxy dicts. Default empty (no proxy).
     """
     # BUGFIX: avoid the mutable default argument [{}] for proxy_list.
     self.mreq = MassRequest(num_threads=num_threads,
                             time_per_url=time_per_url,
                             request_timeout=request_timeout,
                             proxy_list=proxy_list or [{}],
                             hadoop_reporting=True)
     # Log the still-unfetched targets directly; no need to build an
     #  intermediate list first.
     for target in self.targets:
         if target.status == "unfetched":
             logger.info("Fetching %s", target)
     # NB: this only fetches via GET, doesn't submit forms for more links
     self.mreq.get_targets(self.targets)
     self.results = self.mreq.results
     for target in self.targets:
         target.status = "fetched"
Esempio n. 5
0
    >>> targets = [target_1, target_2, target_3]
    >>> mr = MassRequest()
    >>> mr.request_targets(targets)
    >>> for r in mr.results:
    ...     print r
    ... 
    (<massweb.targets.target.Target object at 0x15496d0>, <Response [200]>)
    (<massweb.targets.target.Target object at 0x1549650>, <Response [200]>)
    (<massweb.targets.target.Target object at 0x1549490>, <Response [200]>)
    >>> for target, response in mr.results:
    ...     print target, response.status_code
    ... 
    http://course.hyperiongray.com/vuln2/898538a7335fd8e6bac310f079ba3fd1/ 200
    http://www.hyperiongray.com/ 200
    http://course.hyperiongray.com/vuln1 200
"""

from massweb.mass_requests.mass_request import MassRequest
from massweb.targets.target import Target
target_1 = Target(url=u"http://course.hyperiongray.com/vuln1", data={"password": "******"}, ttype="post")
target_2 = Target(url=u"http://course.hyperiongray.com/vuln2/898538a7335fd8e6bac310f079ba3fd1/", data={"how": "I'm good thx"}, ttype="post")
target_3 = Target(url=u"http://www.hyperiongray.com/", ttype="get")
targets = [target_1, target_2, target_3]
mr = MassRequest()
mr.request_targets(targets)
for result in mr.results:
    print result
for target, response in mr.results:
    print target, response.status_code
 
Esempio n. 6
0
class MassCrawl(object):
    """ Crawl from a set of seed URLs, collecting links and POST targets. """

    def __init__(self, seeds=None, add_seeds_to_scope=True):
        """ Initialize this MassCrawl object.

        seeds               list of seed URLs to start crawling from.
                                Default [].
        add_seeds_to_scope  If True, add each seed's domain to the crawl
                                scope. Default True.
        """
        logger.info("Instantiating MassCrawl object")
        # BUGFIX: avoid a shared mutable default argument.
        seeds = seeds if seeds is not None else []
        self.seeds = seeds
        self.domains = []
        self.posts_identified = []
        self.targets = []
        self.results = []
        self.mreq = None
        # BUGFIX: the add_seeds_to_scope flag was previously ignored.
        if add_seeds_to_scope:
            self.add_seeds_to_scope(seeds)
        self.add_seeds_to_targets(seeds)

    def add_seeds_to_scope(self, seeds):
        """ Add the domain of each seed URL to the crawl scope. """
        for seed in seeds:
            self.add_to_scope_from_url(seed)

    def add_seeds_to_targets(self, seeds):
        """ Create a CrawlTarget for each seed URL. """
        for seed in seeds:
            self.add_target(CrawlTarget(seed))

    def get_domain_from_url(self, url):
        """ Return the hostname of url, with any port stripped. """
        # netloc may carry a port, e.g. "host:8080"
        return urlparse(url).netloc.split(":")[0]

    def add_to_scope_from_url(self, url):
        """ Add the domain of url to the crawl scope. """
        self.add_to_scope(self.get_domain_from_url(url))

    def add_to_scope(self, domain):
        """ Add domain to the crawl scope if not already present. """
        if domain not in self.domains:
            self.domains.append(domain)

    def in_scope(self, url):
        """ Return True if url's domain is within the crawl scope. """
        return self.get_domain_from_url(url) in self.domains

    def add_target(self, target):
        """ Append target to the target list if not already present. """
        if target not in self.targets:
            self.targets.append(target)

    def parse_response(self, response, stay_in_scope=True, max_links=10):
        """ Extract up to max_links links from response.

        response       requests.Response object.
        stay_in_scope  If True, keep only links whose domain is in scope.
        max_links      Maximum number of links to return.
        returns        list of normalized link URLs.
        """
        links = []
        tags = BeautifulSoup(response.text, 'html.parser',
                             parse_only=SoupStrainer(['a', 'img', 'script',
                                                      'link']))
        for tag in tags:
            # BUGFIX: cap at exactly max_links (was an off-by-one) and stop
            #  iterating instead of scanning every remaining tag.
            if len(links) >= max_links:
                break
            link = self.parse_tag(tag, response, stay_in_scope)
            if link:
                links.append(link)
        return links

    def parse_tag(self, tag, response, stay_in_scope):
        """ Extract and normalize a link from one HTML tag.

        Returns the normalized URL, or None when the tag has no usable
        href/src, is a mailto: link, or falls out of scope while
        stay_in_scope is set.
        """
        href = tag.get('href') or tag.get('src')
        if not href or href.startswith("mailto:"):
            return None
        link_normed = normalize_link(href, response.url)["norm_url"]
        if stay_in_scope and not self.in_scope(link_normed):
            return None
        return link_normed

    def dedupe_targets(self):
        """ Remove duplicate targets, keeping first occurrences. """
        # BUGFIX: the old implementation popped items out of the list it was
        #  iterating over, which can skip elements; rebuild instead.
        seen_hashes = set()
        deduped = []
        for target in self.targets:
            target_hash = hash(target)
            if target_hash in seen_hashes:
                logger.warning("Found duplicate target: %s", target)
            else:
                seen_hashes.add(target_hash)
                deduped.append(target)
        self.targets[:] = deduped

    def filter_targets_by_scope(self):
        """ Drop every target whose domain is not in the crawl scope. """
        #FIXME: !in large-scale crawls, there's some out of scope posts,
        #   this is a hack to stop that, real issue should be found
        #   and resolved
        logger.info("Filtering targets by scope")
        # BUGFIX: rebuild the list rather than popping while iterating.
        kept = []
        for target in self.targets:
            if self.in_scope(target.url):
                kept.append(target)
            else:
                logger.warning("Target filtered out that was not in scope: %s",
                               target.url)
        self.targets[:] = kept

    def fetch(self, num_threads=10, time_per_url=10, request_timeout=10,
              proxy_list=None):
        """Fetch URLs and append them to the seed list"""
        # BUGFIX: avoid the mutable default argument [{}] for proxy_list.
        self.mreq = MassRequest(num_threads=num_threads,
                                time_per_url=time_per_url,
                                request_timeout=request_timeout,
                                proxy_list=proxy_list or [{}],
                                hadoop_reporting=True)
        for target in self.targets:
            if target.status == "unfetched":
                logger.info("Fetching %s", target)
        # NB: this only fetches via GET, doesn't submit forms for more links
        self.mreq.get_targets(self.targets)
        self.results = self.mreq.results
        for target in self.targets:
            target.status = "fetched"

    def parse(self, stay_in_scope=True, max_links=10):
        """ Parse the fetched responses for new links and POST targets. """
        for target, response in self.results:
            # skip 40X replies and strings (i.e. failed requests)
            logger.info("Attempting to parse %s", target)
            try:
                response.raise_for_status()
            except (HTTPError, AttributeError):
                # only exception types we care about from requests.Response
                logger.debug("Failed request.", exc_info=True)
                continue
            if parse_worthy(response, content_type_match="text/html",
                            hadoop_reporting=True):
                logger.info("parse_worthy function tells us to parse")
            else:
                logger.info("parse_worthy function tells us not to try"
                            " parsing")
                continue
            logger.info("Finding post requests on page %s", response.url)
            #FIXME: !this doesn't stay in scope?
            post_request_targets = find_post_requests(
                target=response.url, response_text=response.text)
            for target_post in post_request_targets:
                ct_post = CrawlTarget(target_post.url)
                ct_post.__dict__ = target_post.__dict__
                ct_post.status = "unfetched"
                self.add_target(ct_post)
            links = self.parse_response(response, stay_in_scope=stay_in_scope,
                                        max_links=max_links)
            for link in links:
                self.add_target(CrawlTarget(unicode(link)))
            if stay_in_scope:
                self.filter_targets_by_scope()
            logger.info("Finished attempted parsing for %s", target)

    def crawl(self, depth=3, num_threads=10, time_per_url=10,
              request_timeout=10, proxy_list=None, stay_in_scope=True,
              max_links=20, dedupe=True):
        """ Run fetch/parse cycles down to the requested depth. """
        for level in range(depth):
            # BUGFIX: log the current level, not the constant total depth.
            logger.info("Entering the fetch phase at depth %d", level)
            self.fetch(num_threads=num_threads, time_per_url=time_per_url,
                       request_timeout=request_timeout,
                       proxy_list=proxy_list)
            logger.info("Entering the parse phase at depth %d", level)
            self.parse(max_links=max_links, stay_in_scope=stay_in_scope)
            if dedupe:
                self.dedupe_targets()
            if stay_in_scope:
                self.filter_targets_by_scope()
Esempio n. 7
0
"""
    >>> from massweb.mass_requests.mass_request import MassRequest
    >>> urls_to_fetch = [u"http://www.hyperiongray.com", u"http://course.hyperiongray.com/vuln1/", u"http://course.hyperiongray.com/vuln2/898538a7335fd8e6bac310f079ba3fd1/"]
    >>> mr = MassRequest()
    >>> mr.get_urls(urls_to_fetch)
    >>> for r in mr.results:
    ...     print r
    ... 
    ('http://www.hyperiongray.com', <Response [200]>)
    ('http://course.hyperiongray.com/vuln2/898538a7335fd8e6bac310f079ba3fd1/', <Response [200]>)
    ('http://course.hyperiongray.com/vuln1/', <Response [200]>)
"""

from massweb.mass_requests.mass_request import MassRequest
urls_to_fetch = [
    u"http://www.hyperiongray.com", u"http://course.hyperiongray.com/vuln1/",
    u"http://course.hyperiongray.com/vuln2/898538a7335fd8e6bac310f079ba3fd1/"
]
mr = MassRequest()
mr.get_urls(urls_to_fetch)
for target, response in mr.results:
    print target, response
Esempio n. 8
0
    >>> urls_file = "urls.txt"
    >>> proxies = [{"http": "user:password@http://proxy.example.com:1234/some/path"}, {"http": "otheruser:otherpassword@http://proxy.example.net:6789/someother/path"}]
    >>> from massweb.mass_requests.mass_request import MassRequest
    >>> mr = MassRequest(num_threads=20, time_per_url=2, proxy_list=proxies)
    >>> mr.get_urls_from_file(urls_file)
    >>> len(mr.results)
    1000
    >>> for target, response in mr.results[:10]:
    ...     print target, response
    ... 
    ('http://www.abcselfstorage.co.uk/', '__PNK_REQ_FAILED')
    ('http://www.abcskiphirews32.co.uk/', '__PNK_REQ_FAILED')
    ('http://abcskateboarding.co.uk/', <Response [404]>)
    ('http://www.abcsalestraining.co.uk/', <Response [200]>)
    ('http://www.abcservice.co.uk/', <Response [200]>)
    ('http://www.abcseaangling.co.uk/', <Response [200]>)
    ('http://www.abcselfdrive.co.uk/', <Response [404]>)
    ('http://www.abcselfstore.co.uk/storage-blogwp-login.php?redirect_to=http%3A%2F%2Fwww.abcselfstore.co.uk%2Fstorage-blog%2Fwp-admin%2F&amp;reauth=1', <Response [404]>)
    ('http://www.abcselfstore.co.uk/abc24-hour-access.html', <Response [200]>)
"""

urls_file = "example/urls.txt"
proxies = [{"http": "user:password@http://proxy.example.com:1234/some/path"}, {"http": "otheruser:otherpassword@http://proxy.example.net:6789/someother/path"}]
from massweb.mass_requests.mass_request import MassRequest
mr = MassRequest(num_threads=20, time_per_url=2, proxy_list=proxies)
mr.get_urls_from_file(urls_file)
len(mr.results)
for target, response in mr.results[:10]:
    print target, response

Esempio n. 9
0
class WebFuzzer(iFuzzer):
    """ Fuzz a generated list of Targets.

    Generates lists of targets with associated payloads and runs them against
    the target systems.
    """

    def __init__(self, targets=None, payloads=None, num_threads=10,
                 time_per_url=10, request_timeout=10, proxy_list=None,
                 hadoop_reporting=False, depreciated=None):
        """ Initialize this WebFuzzer object.

        targets             list of Target objects. Default [].
        payloads            list of Payload objects. Default [].
        num_threads         Number of threads/processes to launch as an int.
                                Default 10.
        time_per_url        Time in seconds to spend on each Target.
                                Default 10.
        request_timeout     Time in seconds to wait for a connection before
                                giving up. Default 10.
        proxy_list          list of proxies specified as dicts. Default empty.
        hadoop_reporting    Output info for hadoop if True. Default False.
        depreciated         UNUSED. Kept so older call sites keep working.
        """
        super(WebFuzzer, self).__init__()
        # do this because we may need to create more MassRequest objects in
        #  checks (like bsqli), needs to be configured the same
        self.mreq_config_dict = {"num_threads": num_threads,
                                 "time_per_url": time_per_url,
                                 "request_timeout": request_timeout,
                                 "proxy_list": proxy_list or [{}],
                                 "hadoop_reporting": hadoop_reporting}
        self.mreq = MassRequest(**self.mreq_config_dict)
        self.targets = targets or []
        self.payloads = payloads or []
        # one checker instance per supported vulnerability class
        self.mxi_check = MXICheck()
        self.osci_check = OSCICheck()
        self.sqli_check = SQLICheck()
        self.trav_check = TravCheck()
        self.xpathi_check = XPathICheck()
        self.xss_check = XSSCheck()
        self.hadoop_reporting = hadoop_reporting
        if self.hadoop_reporting:
            logger.info("Hadoop reporting set in fuzzer")
        self.fuzzy_targets = []

    def __generate_fuzzy_target_get(self, target):
        """ Associate fuzzing data for GET requests with the target.

        target  Target object.
        returns list of FuzzyTargets, one per (query param, payload) pair.
        """
        url = target.url
        url_q_dic = parse_qs(urlparse(url).query)
        fuzzy_targets = []
        # Iterate keys directly (works on both Python 2 and 3 dicts,
        #  unlike the previous iteritems()).
        for query_param in url_q_dic:
            for payload in self.payloads:
                fuzzy_url = self.replace_param_value(url, query_param,
                                                     str(payload))
                fuzzy_target = FuzzyTarget(fuzzy_url, url, query_param, GET,
                                           payload=payload)
                logger.debug("GET fuzzy_target type: %s", type(fuzzy_target))
                fuzzy_targets.append(fuzzy_target)
        return fuzzy_targets

    def __generate_fuzzy_target_post(self, target):
        """ Associate fuzzing data for POST requests with the target.

        target  Target object.
        returns list of FuzzyTargets, one per (form key, payload) pair.
        """
        url = target.url
        fuzzy_targets = []
        for key in target.data.keys():
            data_copy = target.data.copy()
            for payload in self.payloads:
                data_copy[key] = str(payload)
                fuzzy_target = FuzzyTarget(url, url, key, POST,
                                           data=data_copy.copy(),
                                           payload=payload,
                                           unfuzzed_data=target.data)
                logger.debug("POST fuzzy_target type: %s", type(fuzzy_target))
                fuzzy_targets.append(fuzzy_target)
        return fuzzy_targets

    def generate_fuzzy_targets(self):
        """ Associate fuzzing data with the targets.

        returns list of FuzzyTarget objects (also kept on
                self.fuzzy_targets).
        raises  ValueError if self.targets is empty or no fuzzy targets
                could be generated.
        """
        if self.hadoop_reporting:
            logger.info("Generating fuzzy targets")
        if not self.targets:
            raise ValueError("Targets list must not be empty!")
        self.fuzzy_targets = []
        for target in self.targets:
            logger.debug("input target type: %s", type(target))
            if target.ttype == "get":
                self.fuzzy_targets += self.__generate_fuzzy_target_get(target)
            if target.ttype == "post":
                self.fuzzy_targets += self.__generate_fuzzy_target_post(target)
        if not self.fuzzy_targets:
            # BUGFIX: ValueError does not %-interpolate its arguments the
            #  way loggers do, so format the message explicitly.
            raise ValueError("fuzzy_targets is empty. No targets generated"
                             " from: %s"
                             % ','.join([str(x) for x in self.targets]))
        return self.fuzzy_targets

    def fuzz(self):
        """ Fuzz all the targets and return the results.

        returns     list of Result objects.
        """
        self.mreq.request_targets(self.fuzzy_targets)
        results = []
        for target, response in self.mreq.results:
            #FIXME: Clarify with alex: !not yet multithreaded, should it be?
            logger.debug("target type: %s", type(target))
            try:
                result = self.analyze_response(target, response)
            except (TypeError, AttributeError) as err:
                # If request failed and str is returned instead of Response obj
                #  could save some cycles here not analyzing response
                if self.hadoop_reporting:
                    logger.info("Marking target as failed due to exception: ",
                                exc_info=True)
                logger.debug(err)
                try:
                    result = self._make_failed_result(target,
                                                      "__PNK_FAILED_RESPONSE")
                except TypeError as err:
                    logger.debug("Failed to make a failed result for %s.",
                                 target)
                    # BUGFIX: err.message is deprecated (gone in Python 3);
                    #  also use warning() instead of the deprecated warn().
                    logger.warning(str(err), exc_info=True)
                    continue
            results.append(result)
        return results

    def _make_failed_result(self, target, result_dic=None):
        """ Macro to make a failed Result.

        When no result_dic is supplied, every check type of the target's
        payload is marked False.
        """
        if not result_dic:
            result_dic = {}
            for check_type in target.payload.check_type_list:
                result_dic[check_type] = False
        return Result(target, result_dic)

    def analyze_response(self, ftarget, response):
        """ Analyze the results of the request and return the info gathered.

        ftarget     FuzzyTarget object.
        response    requests.Response object.

        returns     Result object.
        raises      TypeError or AttributeError when a non requests.Response
                    is given as response.
        """
        result_dic = {}
        check_type_list = ftarget.payload.check_type_list
        if self.hadoop_reporting:
            logger.info("Response is of type %s for target %s.",
                        response.__class__.__name__, ftarget)
        worthy = parse_worthy(response,
                              hadoop_reporting=self.hadoop_reporting)
        if not worthy:
            logger.info("Response deemed non-parse-worthy. Setting all checks "
                        "in result_dic to False for %s", ftarget)
            return self._make_failed_result(ftarget)
        logger.info("FuzzyTarget %s looks worth checking for vulnerabilities.",
                    ftarget)
        result_dic = self._run_checks(response, result_dic, check_type_list)
        return Result(ftarget, result_dic)

    def _run_checks(self, response, result_dic, check_type_list):
        """ Check response output with the specified checkers.

        response        requests.Response object.
        result_dic      dict with checker names as keys; updated in place.
        check_type_list list of names of checkers to check with.
        returns         result_dic with one entry per requested check.
        """
        # Dispatch through a dict of checker IDs instead of the previous
        #  if-statement cascade (resolves the old FIXME).
        checkers = {"mxi": self.mxi_check,
                    "sqli": self.sqli_check,
                    "xpathi": self.xpathi_check,
                    "trav": self.trav_check,
                    "osci": self.osci_check,
                    "xss": self.xss_check}
        for check_name, checker in checkers.items():
            if check_name in check_type_list:
                result_dic[check_name] = checker.check(response.text)
        return result_dic
Esempio n. 10
0
"""
    >>> from massweb.mass_requests.mass_request import MassRequest
    >>> urls_to_fetch = [u"http://www.hyperiongray.com", u"http://course.hyperiongray.com/vuln1/", u"http://course.hyperiongray.com/vuln2/898538a7335fd8e6bac310f079ba3fd1/"]
    >>> mr = MassRequest()
    >>> mr.get_urls(urls_to_fetch)
    >>> for r in mr.results:
    ...     print r
    ... 
    ('http://www.hyperiongray.com', <Response [200]>)
    ('http://course.hyperiongray.com/vuln2/898538a7335fd8e6bac310f079ba3fd1/', <Response [200]>)
    ('http://course.hyperiongray.com/vuln1/', <Response [200]>)
"""
 
from massweb.mass_requests.mass_request import MassRequest
urls_to_fetch = [u"http://www.hyperiongray.com", u"http://course.hyperiongray.com/vuln1/", u"http://course.hyperiongray.com/vuln2/898538a7335fd8e6bac310f079ba3fd1/"]
mr = MassRequest()
mr.get_urls(urls_to_fetch)
for target, response in mr.results:
    print target, response
  
Esempio n. 11
0
class MassCrawl(object):
    """ Crawl from a set of seed URLs, collecting links and POST targets. """

    def __init__(self, seeds=None, add_seeds_to_scope=True):
        """ Initialize this MassCrawl object.

        seeds               list of seed URLs to start crawling from.
                                Default [].
        add_seeds_to_scope  If True, add each seed's domain to the crawl
                                scope. Default True.
        """
        logger.info("Instantiating MassCrawl object")
        # BUGFIX: avoid a shared mutable default argument.
        seeds = seeds if seeds is not None else []
        self.seeds = seeds
        self.domains = []
        self.posts_identified = []
        self.targets = []
        self.results = []
        self.mreq = None
        # BUGFIX: the add_seeds_to_scope flag was previously ignored.
        if add_seeds_to_scope:
            self.add_seeds_to_scope(seeds)
        self.add_seeds_to_targets(seeds)

    def add_seeds_to_scope(self, seeds):
        """ Add the domain of each seed URL to the crawl scope. """
        for seed in seeds:
            self.add_to_scope_from_url(seed)

    def add_seeds_to_targets(self, seeds):
        """ Create a CrawlTarget for each seed URL. """
        for seed in seeds:
            self.add_target(CrawlTarget(seed))

    def get_domain_from_url(self, url):
        """ Return the hostname of url, with any port stripped. """
        # netloc may carry a port, e.g. "host:8080"
        return urlparse(url).netloc.split(":")[0]

    def add_to_scope_from_url(self, url):
        """ Add the domain of url to the crawl scope. """
        self.add_to_scope(self.get_domain_from_url(url))

    def add_to_scope(self, domain):
        """ Add domain to the crawl scope if not already present. """
        if domain not in self.domains:
            self.domains.append(domain)

    def in_scope(self, url):
        """ Return True if url's domain is within the crawl scope. """
        return self.get_domain_from_url(url) in self.domains

    def add_target(self, target):
        """ Append target to the target list if not already present. """
        if target not in self.targets:
            self.targets.append(target)

    def parse_response(self, response, stay_in_scope=True, max_links=10):
        """ Extract up to max_links links from response.

        response       requests.Response object.
        stay_in_scope  If True, keep only links whose domain is in scope.
        max_links      Maximum number of links to return.
        returns        list of normalized link URLs.
        """
        links = []
        tags = BeautifulSoup(response.text, 'html.parser',
                             parse_only=SoupStrainer(['a', 'img', 'script',
                                                      'link']))
        for tag in tags:
            # BUGFIX: cap at exactly max_links (was an off-by-one) and stop
            #  iterating instead of scanning every remaining tag.
            if len(links) >= max_links:
                break
            link = self.parse_tag(tag, response, stay_in_scope)
            if link:
                links.append(link)
        return links

    def parse_tag(self, tag, response, stay_in_scope):
        """ Extract and normalize a link from one HTML tag.

        Returns the normalized URL, or None when the tag has no usable
        href/src, is a mailto: link, or falls out of scope while
        stay_in_scope is set.
        """
        href = tag.get('href') or tag.get('src')
        if not href or href.startswith("mailto:"):
            return None
        link_normed = normalize_link(href, response.url)["norm_url"]
        if stay_in_scope and not self.in_scope(link_normed):
            return None
        return link_normed

    def dedupe_targets(self):
        """ Remove duplicate targets, keeping first occurrences. """
        # BUGFIX: the old implementation popped items out of the list it was
        #  iterating over, which can skip elements; rebuild instead.
        seen_hashes = set()
        deduped = []
        for target in self.targets:
            target_hash = hash(target)
            if target_hash in seen_hashes:
                logger.warning("Found duplicate target: %s", target)
            else:
                seen_hashes.add(target_hash)
                deduped.append(target)
        self.targets[:] = deduped

    def filter_targets_by_scope(self):
        """ Drop every target whose domain is not in the crawl scope. """
        #FIXME: !in large-scale crawls, there's some out of scope posts,
        #   this is a hack to stop that, real issue should be found
        #   and resolved
        logger.info("Filtering targets by scope")
        # BUGFIX: rebuild the list rather than popping while iterating.
        kept = []
        for target in self.targets:
            if self.in_scope(target.url):
                kept.append(target)
            else:
                logger.warning("Target filtered out that was not in scope: %s",
                               target.url)
        self.targets[:] = kept

    def fetch(self, num_threads=10, time_per_url=10, request_timeout=10,
              proxy_list=None):
        """Fetch URLs and append them to the seed list"""
        # BUGFIX: avoid the mutable default argument [{}] for proxy_list.
        self.mreq = MassRequest(num_threads=num_threads,
                                time_per_url=time_per_url,
                                request_timeout=request_timeout,
                                proxy_list=proxy_list or [{}],
                                hadoop_reporting=True)
        for target in self.targets:
            if target.status == "unfetched":
                logger.info("Fetching %s", target)
        # NB: this only fetches via GET, doesn't submit forms for more links
        self.mreq.get_targets(self.targets)
        self.results = self.mreq.results
        for target in self.targets:
            target.status = "fetched"

    def parse(self, stay_in_scope=True, max_links=10):
        """ Parse the fetched responses for new links and POST targets. """
        for target, response in self.results:
            # skip 40X replies and strings (i.e. failed requests)
            logger.info("Attempting to parse %s", target)
            try:
                response.raise_for_status()
            except (HTTPError, AttributeError):
                # only exception types we care about from requests.Response
                logger.debug("Failed request.", exc_info=True)
                continue
            if parse_worthy(response, content_type_match="text/html",
                            hadoop_reporting=True):
                logger.info("parse_worthy function tells us to parse")
            else:
                logger.info("parse_worthy function tells us not to try"
                            " parsing")
                continue
            logger.info("Finding post requests on page %s", response.url)
            #FIXME: !this doesn't stay in scope?
            post_request_targets = find_post_requests(
                target=response.url, response_text=response.text)
            for target_post in post_request_targets:
                ct_post = CrawlTarget(target_post.url)
                ct_post.__dict__ = target_post.__dict__
                ct_post.status = "unfetched"
                self.add_target(ct_post)
            links = self.parse_response(response, stay_in_scope=stay_in_scope,
                                        max_links=max_links)
            for link in links:
                self.add_target(CrawlTarget(unicode(link)))
            if stay_in_scope:
                self.filter_targets_by_scope()
            logger.info("Finished attempted parsing for %s", target)

    def crawl(self, depth=3, num_threads=10, time_per_url=10,
              request_timeout=10, proxy_list=None, stay_in_scope=True,
              max_links=20, dedupe=True):
        """ Run fetch/parse cycles down to the requested depth. """
        for level in range(depth):
            # BUGFIX: log the current level, not the constant total depth.
            logger.info("Entering the fetch phase at depth %d", level)
            self.fetch(num_threads=num_threads, time_per_url=time_per_url,
                       request_timeout=request_timeout,
                       proxy_list=proxy_list)
            logger.info("Entering the parse phase at depth %d", level)
            self.parse(max_links=max_links, stay_in_scope=stay_in_scope)
            if dedupe:
                self.dedupe_targets()
            if stay_in_scope:
                self.filter_targets_by_scope()