Example #1
0
 def __init__(self):
     """Set up the page requestor and the e-mail matching patterns."""
     self.sreq = SoupRequestor()
     # Uppercase classes are fine: callers compile these with re.I.
     self.email_regex = r'\b([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})\b'
     # The link pattern is the same address pattern behind a mailto: scheme.
     self.email_link_regex = 'mailto:' + self.email_regex
Example #2
0
class EmailScraper(object):
    """Scrape a contact e-mail address from a web site.

    ``scrape_email`` tries, in order: a ``mailto:`` link on the landing
    page, a plain-text address on the landing page, the same two checks
    on a linked "contact" page, and finally the ``?_escaped_fragment_=``
    (AJAX-snapshot) version of the landing page.
    """

    def __init__(self):
        # Uppercase character classes are fine: the patterns are always
        # compiled with re.I below.
        self.email_regex = r'\b([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})\b'
        self.email_link_regex = r'mailto:\b([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})\b'
        self.sreq = SoupRequestor()

    def follow_link(self, base_url, pattern):
        # Not implemented; kept for interface compatibility.
        pass

    def get_email_link_from_page(self, soup):
        """Return the address from the first mailto: anchor on soup, or None."""
        r = re.compile(self.email_link_regex, re.I)
        # Pass the compiled pattern directly; the original re-wrapped it in
        # re.compile(), which is a no-op for an already-compiled pattern.
        a = soup.find('a', href=r)

        if a:
            m = re.search(r, a.get('href'))
            # Guard against a failed search instead of assuming a match.
            if m and m.groups():
                return m.group(1)

        return None

    def get_email_text_from_page(self, soup):
        """Return the first plain-text e-mail address on soup, or None."""
        r = re.compile(self.email_regex, re.I)
        # Same cleanup as above: no need to re-compile a compiled pattern.
        t = soup.find(text=r)

        if t:
            m = re.search(r, t)
            if m and m.groups():
                return m.group(1)

        return None

    def _email_from_page(self, soup):
        # A mailto: link is the most reliable signal, so try it first.
        return (self.get_email_link_from_page(soup) or
                self.get_email_text_from_page(soup))

    def scrape_email(self, base_url):
        """Return an e-mail address found starting from base_url, or None."""
        (r, s) = self.sreq.get(base_url)
        if r is None:
            return None

        # Skip Verizon's DNS-hijack "search assist" placeholder pages.
        if s.meta and 'searchassist.verizon.com' in s.meta.get('content', ''):
            return None

        # First try the landing page itself.
        e = self._email_from_page(s)
        if e:
            return e

        # See if there's a "contact us" page to follow.
        a = s.find('a', text=re.compile(r'contact', re.I))
        if a:
            u = urlparse.urljoin(base_url, a.get('href'))
            (r, s) = self.sreq.get(u)

            if r is not None:
                e = self._email_from_page(s)
                if e:
                    return e

        # Last resort: the escaped-fragment (AJAX snapshot) landing page.
        u = urlparse.urljoin(base_url, '?_escaped_fragment_=')
        (r, s) = self.sreq.get(u)

        if r is not None:
            e = self._email_from_page(s)
            if e:
                return e

        # Explicit (the original fell off the end and returned None implicitly).
        return None
Example #3
0
 def __init__(self):
     """Prepare the affiliate-info URL template and the helper objects."""
     # {} is filled with the affiliate id via str.format().
     self.url = "http://map.crossfit.com/affinfo.php?a={}&t=0"
     self.sreq = SoupRequestor()
     self.email_scraper = EmailScraper()
Example #4
0
class CrossfitScraper(object):
    def __init__(self):
        self.url = "http://map.crossfit.com/affinfo.php?a={}&t=0"
        self.email_scraper = EmailScraper()
        self.sreq = SoupRequestor()

    def get_gym(self, affid):
        u = self.url.format(affid)
        r, s = self.sreq.get(u)
        return (r, s)

    def set_gym_from_response(self, affid, r, s):
        if s.b is None:
            return

        b = s.b.extract()
        p = s.contents[-1].extract()

        if b.a is None:
            return

        addr = ' '.join(['%s' % x for x in s.findAll(text=True)])
        addr = ' '.join(addr.split())

        gym = CrossfitGym()
        gym.name = b.a.text
        gym.link = b.a['href']
        gym.addr = addr
        gym.affid = affid
        gym.phone = p
        gym.save()

    def get_gym_list(self):
        for i in xrange(1, 3500):
            if CrossfitGym.objects.filter(affid=i).exists():
                continue

            print 'Getting info for %d' % i
            r, s = self.get_gym(i)
            if r is None:
                continue

            self.set_gym_from_response(i, r, s)
            sleep(0.75)

    def get_gym_email(self, gym):
        if not gym.email and gym.checked_email is False:
            print 'Getting email for %s' % gym
            e = self.email_scraper.scrape_email(gym.link)
            if e:
                gym.email = e
                gym.save()

            gym.checked_email = True
            gym.save()

    def get_gym_emails(self):
        for gym in CrossfitGym.objects.all():
            self.get_gym_email(gym)

    def scrape(self):
        self.get_gym_list()
        self.get_gym_emails()
Example #5
0
 def __init__(self):
     """Set up scraper helpers and the affiliate-info URL template."""
     self.email_scraper = EmailScraper()
     self.sreq = SoupRequestor()
     # {} receives the affiliate id via str.format().
     self.url = "http://map.crossfit.com/affinfo.php?a={}&t=0"
Example #6
0
class CrossfitScraper(object):
    def __init__(self):
        self.url = "http://map.crossfit.com/affinfo.php?a={}&t=0"
        self.email_scraper = EmailScraper()
        self.sreq = SoupRequestor()

    def get_gym(self, affid):
        u = self.url.format(affid)
        r,s = self.sreq.get(u)
        return (r,s)

    def set_gym_from_response(self, affid, r, s):
        if s.b is None:
            return

        b = s.b.extract()
        p = s.contents[-1].extract()

        if b.a is None:
            return

        addr = ' '.join(['%s' % x for x in s.findAll(text=True)])
        addr = ' '.join(addr.split())

        gym = CrossfitGym()
        gym.name = b.a.text
        gym.link = b.a['href']
        gym.addr = addr
        gym.affid = affid
        gym.phone = p
        gym.save()
        
    def get_gym_list(self):
        for i in xrange(1, 3500):
            if CrossfitGym.objects.filter(affid=i).exists():
                continue

            print 'Getting info for %d' % i
            r,s = self.get_gym(i)
            if r is None:
                continue

            self.set_gym_from_response(i, r, s)
            sleep(0.75)

    def get_gym_email(self, gym):
        if not gym.email and gym.checked_email is False:
            print 'Getting email for %s' % gym
            e = self.email_scraper.scrape_email(gym.link)
            if e:
                gym.email = e
                gym.save()

            gym.checked_email = True
            gym.save()

    def get_gym_emails(self):
        for gym in CrossfitGym.objects.all():
            self.get_gym_email(gym)

    def scrape(self):
        self.get_gym_list()
        self.get_gym_emails()