import string
import time
import urllib.error

from random import randint


def fetch(country_code, industry=None, limit=5):
    """Entry point for fetching the contact list."""
    domain = 'http://{country}.linkedin.com'.format(country=country_code)
    url_pattern = '{domain}/directory/people-{letter}-{cat_1}-{cat_2}'
    contact_list = []
    for letter in string.ascii_lowercase:
        # Generate a random directory URL for this letter.
        url = url_pattern.format(domain=domain, letter=letter,
                                 cat_1=randint(1, 100), cat_2=randint(1, 100))
        try:
            soup = connect(url)
        except urllib.error.HTTPError:
            continue  # go to the next page if we got a 404
        ul = soup.find('ul', class_='directory')
        if ul is None:
            continue
        for li in ul.children:
            if len(contact_list) == limit:
                break
            if not istag(li):
                continue
            link = li.a['href']
            profile_url = '{domain}{link}'.format(domain=domain, link=link)
            try:
                data = _fetch_details(profile_url)
            except urllib.error.HTTPError:
                continue
            if data is None or 'company' not in data:  # must be provided
                continue
            contact_list.append(data)
    return contact_list
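# NOTE (sketch): `fetch` and the functions below rely on a few helpers,
# `connect`, `istag`, and `cleanstr`, whose implementations are not shown
# in this section. The definitions below are an assumed stand-in: `connect`
# is taken to return a parsed BeautifulSoup tree and to propagate
# urllib.error.HTTPError so callers can skip missing pages.
import urllib.request

from bs4 import BeautifulSoup
from bs4.element import Tag


def connect(url):
    """Download the page at `url` and return it as a BeautifulSoup tree."""
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response.read(), 'html.parser')


def istag(element):
    """Return True if the parsed node is an HTML tag, not a bare string."""
    return isinstance(element, Tag)


def cleanstr(s):
    """Normalize scraped text: collapse runs of whitespace and strip ends."""
    return ' '.join(s.split())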
def _fetch_details(url):
    """Fetch and parse a company's details."""
    print("Connecting to {}".format(url))
    soup = connect(url)
    data = {}
    cont = soup.find(id='result')
    if cont is not None:
        # Get the company name.
        name_tag = cont.find('h2')
        if name_tag is not None:
            data['name'] = cleanstr(name_tag.string)
        table = cont.find('table')
        if table is not None:
            rows = table.find_all('tr')
            if len(rows) > 1:
                # The second row holds the contact details.
                row = rows[1]
                for col in row.children:
                    if not istag(col):
                        continue
                    # Flatten the cell into a single cleaned string.
                    s = ''
                    for el in col.contents:
                        if el.string is not None:
                            if istag(el):
                                s += cleanstr(el.string)
                            else:
                                s += cleanstr(el)
                    if 'Tel:' in s:
                        # Get the phone number: keep only the digits
                        # that follow the 'Tel: ' prefix.
                        phone = ''.join(d for d in s[5:17] if d.isdigit())
                        if len(phone) == 10:
                            data['phone'] = phone
                    else:
                        # Get the city.
                        for city in CITIES:
                            if city.lower() in s.lower():
                                data['city'] = city
    print("Result: {}".format(data))
    if 'name' in data and 'phone' in data and 'city' in data:
        return data
    return None  # incomplete record
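# Example (hypothetical input): the slice s[5:17] above assumes the digits
# start right after the 'Tel: ' prefix. On a made-up cell string:
#
#     s = 'Tel: 03-1234 5678'
#     ''.join(d for d in s[5:17] if d.isdigit())  # -> '0312345678'
#
# which is 10 digits long, so it would be stored as data['phone'].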
def _parse(industry, delay=False, attempts=10):
    """Parse data for an industry.

    Recurses when the page isn't available or we haven't received
    enough data yet.
    """
    if not attempts:
        # Exceeded the allowed attempts; exit the recursion.
        return None
    if delay:
        # Wait between requests in order to avoid a ban.
        time.sleep(10)
    slug = slugify(industry)
    url_pattern = '{domain}/category/{category}/?p={page}'
    url = url_pattern.format(domain=DOMAIN, category=slug,
                             page=randint(1, 10))
    try:
        soup = connect(url)
    except urllib.error.HTTPError:
        # The page is unavailable; retry with a delay and return that result.
        return _parse(industry, True, attempts - 1)
    list_ = []
    ul = soup.find(id='result')
    if ul is not None:
        for li in ul.find_all(class_='listing'):
            if len(list_) == limit:  # `limit` is a module-level setting
                break
            if not istag(li):
                continue
            link = li.find('a')['href']
            profile_url = '{domain}{link}'.format(domain=DOMAIN, link=link)
            try:
                data = _fetch_details(profile_url)
            except urllib.error.HTTPError:
                continue
            if data is None:  # incomplete information
                continue
            data.update(dict(industry=industry, country='Malaysia'))
            list_.append(data)
    else:
        # Probably we got banned.
        return []
    # Continue searching for records if we haven't got enough.
    if len(list_) < limit:
        more = _parse(industry, True, attempts - 1)
        if more:
            list_.extend(more[:limit - len(list_)])
    return list_
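# NOTE (sketch): _parse and _fetch_details also reference module-level names
# (DOMAIN, CITIES, limit, slugify) that are defined elsewhere in the project.
# The stand-ins below use placeholder values purely for illustration; the
# real domain, city list, and slug rules live in the actual module.
import re

DOMAIN = 'http://www.example-directory.com'  # placeholder, not the real site
CITIES = ['Kuala Lumpur', 'George Town', 'Ipoh', 'Johor Bahru']
limit = 5  # maximum number of records to collect per call


def slugify(text):
    """Turn an industry name into a URL slug: 'Food & Beverage' -> 'food-beverage'."""
    return re.sub(r'[^a-z0-9]+', '-', text.lower()).strip('-')


# With these in place, a call such as _parse('Food & Beverage') collects up
# to `limit` detailed records, each tagged with the industry and country.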