class BingViadeoContacts:
    """Discover Viadeo contacts of a company through Bing searches.

    Only French ("FR") lookups are performed; any other country yields no
    contacts.
    """

    def __init__(self, bing_key):
        """Set up the Bing and Viadeo fetchers and an INFO-level logger.

        Args:
            bing_key: key handed to ``BingAPIFetcher``.
        """
        self.bing = BingAPIFetcher(bing_key)
        self.viadeo = ViadeoFetcher()
        self.logger = logging.getLogger("webmining:viadeo_contact_fetcher")
        self.logger.setLevel(logging.INFO)

    def fetch(self, company_name, country, city=None, pages=1):
        """Return the contacts Bing finds on Viadeo for ``company_name``.

        Args:
            company_name: company to look for; every entry of
                ``company_designations`` is stripped from it first.
            country: country code; anything other than ``"FR"`` returns ``[]``.
            city: optional city, appended to the query as a quoted term.
            pages: number of ``self.bing.fetch`` calls, issued with
                ``start=0 .. pages - 1``.

        Returns:
            A list of ``Contact`` objects, each with the matching result URL
            appended to its ``sources``.
        """
        # Viadeo activated for France only
        if country != "FR":
            return []

        # Pad with spaces so designations written as " XX " also match at the
        # very beginning or end of the name.
        for designation in company_designations:
            company_name = (" " + company_name + " ").replace(designation, "").strip()

        tld = country_to_tld[country]
        query = '(site:%s.viadeo.com/%s/profile) intitle:"%s" %d' % (
            tld, tld, company_name, datetime.datetime.now().year)
        if city is not None:
            # The company name is already constrained through intitle:, so only
            # the city is appended. (Formatting two values into a single
            # placeholder raised TypeError for every call that passed a city.)
            query += ' ("%s")' % (city,)

        results = []
        for page in range(pages):
            results += self.bing.fetch(query, start=page, country=country)

        lowered_company = company_name.lower()
        all_contacts = []
        for res in results:
            # Sometimes, the "©{year}" in the footer is the snippet
            if "©" in res.snippet:
                continue
            # Keep only results whose title mentions the company.
            if lowered_company not in res.title.lower():
                continue
            contact = self.extract(company_name, res.title, res.snippet)
            if contact is None:
                continue
            contact_obj = Contact(*contact)
            contact_obj.sources.append(res.url)
            all_contacts.append(contact_obj)

        self.logger.info("%d contacts found", len(all_contacts))
        return all_contacts

    def extract(self, company_name, title, snippet):
        """Pull a ``(name, job)`` pair out of one search result.

        Args:
            company_name: unused here; kept for interface compatibility.
            title: result title; the text before the first comma is the name.
            snippet: result snippet, searched with ``viadeo_snippet_patterns``
                after every entry of ``months`` has been removed from it.

        Returns:
            ``(name, job)`` for the first pattern that matches, else ``None``.
        """
        name = title.split(",")[0]
        for month in months:
            snippet = snippet.replace(month, "")
        # NOTE(review): whitespace left behind by the month removal is not
        # collapsed. An earlier `snippet.replace(...)` call here discarded its
        # result, so the patterns have always run against the uncollapsed text.

        # Each pattern carries one placeholder that receives the current year.
        year = datetime.datetime.now().year
        for pattern in viadeo_snippet_patterns:
            match = re.search(pattern % year, snippet, re.I)
            if match:
                return name, match.group("job")
        return None
class LinkedinAccountDetector:
    """Locate a company's LinkedIn page through site-restricted Bing searches."""

    def __init__(self, api_key):
        """Create the Bing client used for every lookup."""
        self.bing = BingAPIFetcher(api_key)

    def _fetch(self, query, company):
        """Run ``query`` on Bing and turn the hits into an account (or None)."""
        return self.parse_results(self.bing.fetch(query), company)

    def detect(self, company_name, company_website=None):
        """Return the LinkedIn account found for a company, or ``None``.

        The company name is searched first. When that yields nothing and a
        website is supplied, the website's network location is searched
        instead. A hit is rejected when its URL does not match
        ``LINKEDIN_URL``, or when its non-numeric company identifier scores
        below 0.7 against the company name.
        """
        account = self._fetch(
            'site:linkedin.com/company "%s"' % company_name, company_name)

        if account is None and company_website is not None:
            domain = urlparse(company_website).netloc
            if domain != "":
                account = self._fetch(
                    'site:linkedin.com/company "%s"' % domain, company_name)

        if account is None:
            return None
        if not LINKEDIN_URL.match(account.url):
            return None

        identifier = LINKEDIN_URL.search(account.url).groupdict()["company"]

        # A purely numeric identifier is accepted as-is; a universal name
        # has to look similar enough to the company name.
        try:
            int(identifier)
        except ValueError:
            is_numeric = False
        else:
            is_numeric = True

        if is_numeric:
            return account

        similarity = jaro_winkler(normalize(company_name), normalize(identifier))
        return None if similarity < 0.7 else account

    def parse_results(self, results, company):
        """Wrap the first search hit in a ``LinkedinAccount``; ``None`` if no hits."""
        if len(results) != 0:
            return LinkedinAccount(company, results[0].url)
        return None
class BingLinkedinContacts:
    """Discover LinkedIn contacts of a company through Bing searches."""

    def __init__(self, bing_key):
        """Set up the Bing fetcher and an INFO-level logger.

        Args:
            bing_key: key handed to ``BingAPIFetcher``.
        """
        self.bing = BingAPIFetcher(bing_key)
        self.logger = logging.getLogger("webmining:linkedin_contact_fetcher")
        self.logger.setLevel(logging.INFO)

    def fetch(self, company_name, country, city=None, pages=1):
        """Return the contacts Bing finds on LinkedIn for ``company_name``.

        Args:
            company_name: company to look for, used as a quoted search term.
            country: key into ``country_to_tld``; also forwarded to Bing.
            city: optional city, quoted next to the company name.
            pages: number of ``self.bing.fetch`` calls, issued with
                ``start=0 .. pages - 1``.

        Returns:
            A list of ``Contact`` objects, each with the matching result URL
            appended to its ``sources``.
        """
        tld = country_to_tld[country]
        query = '(site:%s.linkedin.com/pub/ OR site:%s.linkedin.com/in/) ' % (
            tld, tld)
        if city is not None:
            query += '("%s" "%s") ' % (company_name, city)
        else:
            query += '"%s"' % (company_name,)

        results = []
        for page in range(pages):
            results += self.bing.fetch(query, start=page, country=country)

        all_contacts = []
        for res in results:
            # Skip results whose URL contains "/pub/dir".
            if "/pub/dir" in res.url:
                continue
            contact = self.extract(company_name, res.title, res.snippet)
            if contact is None:
                continue
            contact_obj = Contact(*contact)
            contact_obj.sources.append(res.url)
            all_contacts.append(contact_obj)
        return all_contacts

    def extract(self, company, title, snippet):
        """Pull a ``(contact_name, job)`` pair out of one search result.

        Args:
            company: expected company; compared case-insensitively, ignoring
                surrounding whitespace, against the company in the snippet.
            title: result title, expected to look like ``"<name> | LinkedIn"``.
            snippet: result snippet, expected to start with
                ``"<name>. <job> chez|at|@ <company>. "``.

        Returns:
            ``(contact_name, job)`` when the title and snippet have the
            expected shape and the snippet's company contains ``company``;
            otherwise ``None``.
        """
        title_match = re.match(r"^(.*) \| LinkedIn", title)
        if title_match is None:
            return None
        contact_name = title_match.group(1)

        self.logger.debug("Searching data for %s in company %s",
                          contact_name, company)

        # Good snippets come in the form 'Contact Name. Title. Location.', e.g.
        # 'Clément Chastagnol. R&D Engineer chez Data Publica, PhD in Computer
        #  Sciences. Lieu Région de Paris , France Secteur Études/recherche'
        snippet_match = re.match(
            r"%s\. (.+?)(?: chez | at | @ )(.+?)\. " % re.escape(contact_name),
            snippet)
        if snippet_match is None:
            return None

        job = snippet_match.group(1)
        matched_company = snippet_match.group(2)
        if company.strip().lower() in matched_company.strip().lower():
            return (contact_name, job)

        self.logger.warning("Company name mismatch for %s : %s VS %s",
                            contact_name, company, matched_company)
        return None