Ejemplo n.º 1
0
class BingViadeoContacts:
    """Collect Viadeo contacts of a company from Bing search results.

    The class never visits Viadeo profile pages in ``fetch``: contact names
    and job titles are parsed from the title and snippet of each Bing hit.
    """

    def __init__(self, bing_key):
        """Create the Bing and Viadeo fetchers and an INFO-level logger.

        :param bing_key: API key handed to ``BingAPIFetcher``.
        """
        self.bing = BingAPIFetcher(bing_key)
        self.viadeo = ViadeoFetcher()

        self.logger = logging.getLogger("webmining:viadeo_contact_fetcher")
        self.logger.setLevel(logging.INFO)

    def fetch(self, company_name, country, city=None, pages=1):
        """Search Bing for Viadeo profiles mentioning ``company_name``.

        :param company_name: Company name; every entry of
            ``company_designations`` is stripped from it before searching.
        :param country: Country code. Anything other than ``"FR"`` yields
            an empty list without issuing any query.
        :param city: Optional city appended to the query as a quoted term.
        :param pages: Number of Bing fetches to issue.
        :return: List of ``Contact`` objects, each with the result URL
            appended to its ``sources``.
        """
        # Viadeo activated for France only
        if country != "FR":
            return []

        # The name is padded with spaces on every pass, presumably so that
        # designations written with surrounding spaces also match at the
        # start or end of the name -- verify against company_designations.
        for designation in company_designations:
            company_name = " " + company_name + " "
            company_name = company_name.replace(designation, "")
            company_name = company_name.strip()

        tld = country_to_tld[country]
        query = '(site:%s.viadeo.com/%s/profile) intitle:"%s" %d' % (
            tld, tld, company_name, datetime.datetime.now().year)

        if city is not None:
            # The company name is already constrained by intitle:, so only
            # the city is added here (one placeholder, one argument).
            query += ' ("%s")' % city

        results = []
        for page in range(pages):
            # NOTE(review): ``start`` receives the page index; confirm
            # whether BingAPIFetcher expects a page number or an offset.
            results += self.bing.fetch(query, start=page, country=country)

        wanted = company_name.lower()
        all_contacts = []
        for res in results:
            # Sometimes, the "©{year}" in the footer is the snippet
            if "©" in res.snippet:
                continue
            if wanted not in res.title.lower():
                continue

            contact = self.extract(company_name, res.title, res.snippet)
            if contact is not None:
                contact_obj = Contact(*contact)
                contact_obj.sources.append(res.url)
                all_contacts.append(contact_obj)

        self.logger.info("%d contacts found", len(all_contacts))

        return all_contacts

    def extract(self, company_name, title, snippet):
        """Parse a contact name and job title out of one search result.

        :param company_name: Unused by the parsing itself; kept so the
            signature stays the same for callers.
        :param title: Result title; the text before the first comma is
            taken as the contact name.
        :param snippet: Result snippet searched with every entry of
            ``viadeo_snippet_patterns``.
        :return: ``(name, job)`` for the first matching pattern, else
            ``None``.
        """
        name = title.split(",")[0]

        for month in months:
            snippet = snippet.replace(month, "")
            # Removing a month leaves two adjacent spaces; collapse them so
            # the patterns see single-spaced text.
            snippet = snippet.replace("  ", " ")

        # Each pattern carries a placeholder for the current year.
        year = datetime.datetime.now().year
        for pattern in viadeo_snippet_patterns:
            match = re.search(pattern % year, snippet, re.I)
            if match:
                return name, match.group("job")
        return None
class LinkedinAccountDetector:
    """Look up the LinkedIn company page of a company through Bing."""

    def __init__(self, api_key):
        """Build the Bing fetcher used for every query.

        :param api_key: API key handed to ``BingAPIFetcher``.
        """
        self.bing = BingAPIFetcher(api_key)

    def _fetch(self, query, company):
        """Run ``query`` on Bing and convert the hits with ``parse_results``."""
        return self.parse_results(self.bing.fetch(query), company)

    def detect(self, company_name, company_website=None):
        """Return the LinkedIn account found for a company, or ``None``.

        The company name is searched first; when that yields nothing and a
        website is given, the website's network location is searched
        instead. The candidate is rejected when its URL does not match
        ``LINKEDIN_URL``, or when its non-numeric identifier scores below
        0.7 against the company name with ``jaro_winkler``.

        :param company_name: Name of the company to look for.
        :param company_website: Optional website URL used as a fallback.
        :return: The account built by ``parse_results``, or ``None``.
        """
        account = self._fetch(
            'site:linkedin.com/company "%s"' % company_name, company_name)

        if account is None and company_website is not None:
            domain = urlparse(company_website).netloc
            if domain != "":
                account = self._fetch(
                    'site:linkedin.com/company "%s"' % domain, company_name)

        if account is None:
            return None

        # Discard hits whose URL is not a LinkedIn company URL.
        if not LINKEDIN_URL.match(account.url):
            return None

        identifier = LINKEDIN_URL.search(account.url).groupdict()["company"]

        # A numeric identifier is an id and is accepted as is; anything
        # else is a universal name that must resemble the company name.
        try:
            int(identifier)
        except ValueError:
            is_numeric_id = False
        else:
            is_numeric_id = True

        if not is_numeric_id:
            similarity = jaro_winkler(normalize(company_name),
                                      normalize(identifier))
            if similarity < 0.7:
                return None

        return account

    def parse_results(self, results, company):
        """Wrap the first search hit in a ``LinkedinAccount``.

        :param results: Sequence of search results exposing ``url``.
        :param company: Company name stored on the account.
        :return: ``None`` when ``results`` is empty, else the account.
        """
        if len(results) == 0:
            return None
        return LinkedinAccount(company, results[0].url)
Ejemplo n.º 3
0
class BingLinkedinContacts:
    """Collect LinkedIn contacts of a company from Bing search results.

    Contact names and job titles are parsed from the title and snippet of
    each Bing hit; the LinkedIn pages themselves are not fetched here.
    """

    def __init__(self, bing_key):
        """Create the Bing fetcher and an INFO-level logger.

        :param bing_key: API key handed to ``BingAPIFetcher``.
        """
        self.bing = BingAPIFetcher(bing_key)
        self.logger = logging.getLogger("webmining:linkedin_contact_fetcher")
        self.logger.setLevel(logging.INFO)

    def fetch(self, company_name, country, city=None, pages=1):
        """Search Bing for LinkedIn profiles mentioning ``company_name``.

        :param company_name: Company name, searched as a quoted phrase.
        :param country: Key into ``country_to_tld``; an unknown key raises
            ``KeyError``.
        :param city: Optional city, searched as a second quoted phrase.
        :param pages: Number of Bing fetches to issue.
        :return: List of ``Contact`` objects, each with the result URL
            appended to its ``sources``.
        """
        tld = country_to_tld[country]
        query = '(site:%s.linkedin.com/pub/ OR site:%s.linkedin.com/in/) ' % (
            tld, tld)

        if city is not None:
            query += '("%s" "%s") ' % (company_name, city)
        else:
            query += '"%s"' % (company_name, )

        results = []
        for page in range(pages):
            # NOTE(review): ``start`` receives the page index; confirm
            # whether BingAPIFetcher expects a page number or an offset.
            results += self.bing.fetch(query, start=page, country=country)

        all_contacts = []
        for res in results:
            # URLs under /pub/dir are skipped (presumably directory
            # listings rather than single profiles -- TODO confirm).
            if "/pub/dir" in res.url:
                continue

            contact = self.extract(company_name, res.title, res.snippet)
            if contact is not None:
                contact_obj = Contact(*contact)
                contact_obj.sources.append(res.url)
                all_contacts.append(contact_obj)

        return all_contacts

    def extract(self, company, title, snippet):
        """Parse a contact name and job title out of one search result.

        :param company: Expected company; compared case-insensitively, after
            stripping whitespace, as a substring of the company found in the
            snippet.
        :param title: Result title, expected as ``"<name> | LinkedIn"``.
        :param snippet: Result snippet, expected to start with
            ``"<name>. <job> chez|at|@ <company>. "``.
        :return: ``(contact_name, job)`` when both the title and the
            snippet match and the company agrees, else ``None``.
        """
        title_match = re.match(r"^(.*) \| LinkedIn", title)
        if title_match is None:
            return None

        contact_name = title_match.group(1)
        self.logger.debug("Searching data for %s in company %s",
                          contact_name, company)

        # Good snippets come in the form 'Contact Name. Title. Location.',
        # for example:
        # 'Clément Chastagnol. R&D Engineer chez Data Publica, PhD in
        #  Computer Sciences. Lieu Région de Paris , France Secteur
        #  Études/recherche'
        snippet_match = re.match(
            r"%s\. (.+?)(?: chez | at | @ )(.+?)\. " % re.escape(contact_name),
            snippet)
        if snippet_match is None:
            return None

        job = snippet_match.group(1)
        matched_company = snippet_match.group(2)

        if company.strip().lower() in matched_company.strip().lower():
            return (contact_name, job)

        self.logger.warning("Company name mismatch for %s : %s VS %s",
                            contact_name, company, matched_company)
        return None