Example #1
 def test_vine(self):
     res = socialregexes.identify('https://vine.co/LoganPaul')
     self.assertIsNotNone(res)
     self.assertEqual(len(res), 2)
     self.assertEqual(res[0], 'vine')
     self.assertEqual(res[1], 'LoganPaul')
     res = socialregexes.identify('https://vine.co/u/940474327508377600')
     self.assertIsNotNone(res)
     self.assertEqual(len(res), 2)
     self.assertEqual(res[0], 'vine')
     self.assertEqual(res[1], '940474327508377600')
Example #2
 def test_googleplus(self):
     res = socialregexes.identify(
         'https://plus.google.com/u/0/+guillemlefait')
     self.assertIsNotNone(res)
     self.assertEqual(len(res), 2)
     self.assertEqual(res[0], 'googleplus')
     self.assertEqual(res[1], '+guillemlefait')
     res = socialregexes.identify(
         'https://plus.google.com/u/0/116882192944905398376')
     self.assertIsNotNone(res)
     self.assertEqual(len(res), 2)
     self.assertEqual(res[0], 'googleplus')
     self.assertEqual(res[1], '116882192944905398376')
Example #3
 def test_facebook(self):
     for link in ('https://www.facebook.com/guillem.lefait',
                  'https://fr-fr.facebook.com/guillem.lefait',
                  'https://www.facebook.com/guillem.lefait?ref=br_rs'):
         res = socialregexes.identify(link)
         self.assertIsNotNone(res)
         self.assertEqual(len(res), 2)
         self.assertEqual(res[0], 'facebook')
         self.assertEqual(res[1], 'guillem.lefait')
     res = socialregexes.identify(
         'https://www.facebook.com/profile.php?id=654000317')
     self.assertIsNotNone(res)
     self.assertEqual(len(res), 2)
     self.assertEqual(res[0], 'facebook')
     self.assertEqual(res[1], '654000317')
Example #4
 def test_stackoverflow(self):
     for link in ('http://stackoverflow.com/users/3090365/glefait',
                  'http://stackoverflow.com/users/3090365/glefait?tab'):
         res = socialregexes.identify(link)
         self.assertIsNotNone(res)
         self.assertEqual(len(res), 2)
         self.assertEqual(res[0], 'stackoverflow')
         self.assertEqual(res[1], '3090365')
Example #5
 def test_twitter_true_positive(self):
     for link in ('https://twitter.com/guillem_lefait',
                  'https://twitter.com/@guillem_lefait',
                  'https://twitter.com/#!/guillem_lefait'):
         res = socialregexes.identify(link)
         self.assertIsNotNone(res)
         self.assertEqual(len(res), 2)
         self.assertEqual(res[0], 'twitter')
         self.assertEqual(res[1], 'guillem_lefait')
Example #6
 def test_pinterest(self):
     res = socialregexes.identify('https://fr.pinterest.com/diceverywhere/')
     self.assertIsNotNone(res)
     self.assertEqual(len(res), 2)
     self.assertEqual(res[0], 'pinterest')
     self.assertEqual(res[1], 'diceverywhere')
Example #7
 def test_linkedin(self):
     res = socialregexes.identify('https://www.linkedin.com/in/glefait/')
     self.assertIsNotNone(res)
     self.assertEqual(len(res), 2)
     self.assertEqual(res[0], 'linkedin')
     self.assertEqual(res[1], 'glefait')
Example #8
 def test_github_true_negative(self):
     res = socialregexes.identify('https://github.com/search')
     self.assertIsNone(res)
Example #9
 def test_github_true_positive(self):
     res = socialregexes.identify('https://github.com/glefait')
     self.assertIsNotNone(res)
     self.assertEqual(len(res), 2)
     self.assertEqual(res[0], 'github')
     self.assertEqual(res[1], 'glefait')
Example #10
 def test_junk(self):
     res = socialregexes.identify('http://blabla.com/xyz')
     self.assertIsNone(res)
Example #11
 def test_email(self):
     res = socialregexes.identify(
         '*****@*****.**')
     self.assertIsNotNone(res)
     self.assertEqual(len(res), 2)
     self.assertEqual(res[0], 'email')
     self.assertEqual(res[1], '*****@*****.**')
Example #12
 def test_twitter_true_negative(self):
     res = socialregexes.identify('https://twitter.com/tos')
     self.assertIsNone(res)
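The methods above read as part of a unittest.TestCase subclass; a minimal harness along the lines below is enough to run them. The import line and the class name are assumptions for illustration (only socialregexes.identify itself appears in the snippets), and the sample method is copied from Example #10.

import unittest

import socialregexes  # assumed import path; the snippets only use socialregexes.identify


class SocialRegexesTest(unittest.TestCase):
    # The test_* methods from Examples #1-#12 go here, for instance:

    def test_junk(self):
        res = socialregexes.identify('http://blabla.com/xyz')
        self.assertIsNone(res)


if __name__ == '__main__':
    unittest.main()
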
Example #13
    def _get_vcs_links(self, rule, name, data, inputs):
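        """Work through the ``inputs`` queue (UrlSet, Text, Email, Name and
        Url items) and return the best repository URL accepted for ``name``,
        or None if the queue empties without an acceptable match.
        """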
        normalized_name = normalize(name)
        results = []
        previous_results = None
        max_fetch_count = max(rule.fetch_count, self._max_fetch_count or 0)
        fetch_list = []
        seen_list = []
        fetch_count = 0
        queue = inputs[:]

        logger.debug("queue {}".format(queue))

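        # Items are processed FIFO; each handler below may queue follow-up
        # items (extracted URLs, fetched pages), bounded by max_fetch_count.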
        while queue:
            must_accept = False
            item = queue.pop(0)
            try:
                logger.debug("processing {}: {}".format(
                    item.__class__.__name__, item.source or item))
            except UnicodeEncodeError:
                logger.debug("processing {} (item decode problem)".format(
                    item.__class__.__name__))

            result_url = None
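            # UrlSet: normalise the candidate URLs, keep those the rule
            # accepts as results, and queue the undecided ones for fetching.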
            if isinstance(item, UrlSet):
                urls = item.value
                urls = self._exclude_non_urls(urls)
                urls = self._normalise_urls(urls)

                if not urls:
                    continue

                seen_list += list(urls)

                fetch_urls = []
                result_urls = []

                for url in urls:
                    rv = self._accept_url(rule, name, url)
                    if rv:
                        results.append(rv)
                        result_urls.append(rv)
                    elif rv is not False:
                        fetch_urls.append(url)

                if len(result_urls) == 1 and len(fetch_list) <= 1:
                    results.append(result_urls[0])

                if result_urls:
                    logger.debug("--results from {}: {}".format(
                        item.value, result_urls))
                else:
                    logger.debug("--none of this set: {}".format(item.value))

                logger.debug("queuing {}".format(sorted(fetch_urls)))
                for url in self._group_urls(fetch_urls, name):
                    queue.append(Url(url))

                logger.info("{}: from {} added urls {}".format(
                    name, item.source, sorted(result_urls)))

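            # Text: run the rule's link extractor over fetched page text and
            # queue whatever URLs it yields (skipped once the fetch budget is
            # spent).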
            elif isinstance(item, Text):
                if fetch_count > max_fetch_count:
                    logger.info("Not processing text from {}".format(
                        item.source))
                    continue

                text = item.value
                if not text:
                    continue
                try:
                    urls = set(rule.link_extract(text, item.source))
                except Exception:
                    urls = []
                logger.debug("@@ ran {} on text size {} for {} urls !!".format(
                    rule.link_extract, len(text), len(urls)))
                logger.debug("extracted {}".format(sorted(urls)))
                queue.append(UrlSet(urls, item.source))

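            # Email: resolve directly via _find_named_repo; a hit here is
            # accepted regardless of the similarity score (must_accept).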
            elif isinstance(item, Email):
                email = item.value
                result_url = _find_named_repo(name, [email])
                must_accept = True

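            # Name: same idea, resolving the bare name via _find_named_repo.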
            elif isinstance(item, Name):
                item_name = item.value
                result_url = _find_named_repo(item_name)
                must_accept = True

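            # Url: fetch the page unless it was already fetched, exceeds the
            # fetch budget, is a social profile, is rejected by the rule, or
            # is a plain github.com link handled by the github rule; queue the
            # response as a Webpage plus any redirect / Read the Docs URLs.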
            elif isinstance(item, Url):
                url = item.value

                if url in fetch_list:
                    logger.debug(
                        "queue loop skipping already fetched {}".format(url))
                    continue

                if url.startswith("https://github.com") or url.startswith(
                        "http://github.com"):  # see github rule below
                    if ".github.com" not in url:
                        logger.debug(
                            "queue loop skipping github {}".format(url))
                        continue

                if fetch_count > max_fetch_count:
                    logger.debug("queue loop skipping >{}: {}".format(
                        max_fetch_count, url))
                    continue

                user = identify(url)
                if user:
                    logger.info("{} detected as social for {}".format(
                        url, user))
                    continue

                rv = rule.reject_url(name, url.lower())
                logger.debug("reject rule {}: {}".format(url, rv))
                if rv in ["", None]:
                    continue
                if rv is True:
                    continue

                if ("github.com" in url and ".github.com" not in url
                    ):  # yara, pykalman, membrete, zvmcloudconnector
                    logger.debug(
                        "queue loop skipping github v2 {}".format(url))
                    continue

                if self.web_session is None:
                    self.web_session = get_file_cache_session("web")

                if url.startswith("git://"):  # TODO: create tidy phase
                    url = url[6:]

                if not url.startswith("http://") and not url.startswith(
                        "https://"):
                    if "/" not in url:
                        url = "http://" + url + "/"
                    elif "://" not in url:
                        url = "http://" + url
                try:
                    logger.info("r {}".format(url))
                    r = self.web_session.get(url,
                                             headers=self.headers,
                                             timeout=get_timeout(url))
                    logger.debug("r {}.url {} elapsed {}".format(
                        r.__class__.__name__, r.url, r.elapsed))
                    logger.debug("r {} headers: {}".format(r.url, r.headers))
                    r.raise_for_status()
                except Exception as e:
                    logger.warning("{}: {}".format(url, e))
                    continue

                urls = []
                if r.url != url:
                    urls.append(r.url)

                if r.headers.get("X-RTD-Project"):
                    rtd_url = get_rtd_repo(r.headers.get("X-RTD-Project"))
                    if rtd_url:
                        logger.warning("{}: rtd {}".format(url, rtd_url))
                        urls.append(rtd_url)

                if urls:
                    queue.append(UrlSet(set(urls)))

                if not r.text:
                    logger.warning("{}: empty page".format(url))
                else:
                    queue.append(Webpage(r, url))
                    fetch_list.append(url)
                    fetch_count += 1

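            # After each item, see whether the accumulated results already
            # contain a close enough match for the rule.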
            if not result_url and results and results != previous_results:
                result_url = get_best_match(rule.match, results)
                previous_results = results[:]

            if result_url:
                score = _compute_similarity(rule.match, result_url)
                if must_accept or score < self.max_distance:
                    if self._store_fetch_list:
                        _fetch_mapping[normalized_name] = fetch_list

                    return result_url

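            # Last resort: with an empty queue and no results, try the
            # project's pythonhosted.org page.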
            if not queue and not results:
                ph_url = "https://pythonhosted.org/{}/".format(name)
                if ph_url not in seen_list:
                    queue.append(Url(ph_url))

        logger.debug("fetched {}".format(fetch_list))
        if self._store_fetch_list:
            _fetch_mapping[normalized_name] = fetch_list

        if results:
            return get_best_match(rule.match, results)