def test_vine(self):
    """Vine vanity and numeric profile URLs identify as ('vine', <user-id>)."""
    res = socialregexes.identify('https://vine.co/LoganPaul')
    # assertIsNotNone/assertEqual report the offending values on failure,
    # unlike the opaque assertTrue(...) form.
    self.assertIsNotNone(res)
    self.assertEqual(len(res), 2)
    self.assertEqual(res[0], 'vine')
    self.assertEqual(res[1], 'LoganPaul')
    res = socialregexes.identify('https://vine.co/u/940474327508377600')
    self.assertIsNotNone(res)
    self.assertEqual(len(res), 2)
    self.assertEqual(res[0], 'vine')
    self.assertEqual(res[1], '940474327508377600')
def test_googleplus(self):
    """Google+ URLs identify as ('googleplus', <id>) for both +name and numeric ids."""
    res = socialregexes.identify(
        'https://plus.google.com/u/0/+guillemlefait')
    self.assertIsNotNone(res)
    self.assertEqual(len(res), 2)
    self.assertEqual(res[0], 'googleplus')
    self.assertEqual(res[1], '+guillemlefait')
    res = socialregexes.identify(
        'https://plus.google.com/u/0/116882192944905398376')
    self.assertIsNotNone(res)
    self.assertEqual(len(res), 2)
    self.assertEqual(res[0], 'googleplus')
    self.assertEqual(res[1], '116882192944905398376')
def test_facebook(self):
    """Facebook profile URLs identify as ('facebook', <user>).

    Covers vanity URLs (including locale subdomains and query strings)
    and the numeric profile.php?id= form.
    """
    for link in ('https://www.facebook.com/guillem.lefait',
                 'https://fr-fr.facebook.com/guillem.lefait',
                 'https://www.facebook.com/guillem.lefait?ref=br_rs'):
        res = socialregexes.identify(link)
        # msg=link pinpoints which URL in the loop failed.
        self.assertIsNotNone(res, msg=link)
        self.assertEqual(len(res), 2, msg=link)
        self.assertEqual(res[0], 'facebook', msg=link)
        self.assertEqual(res[1], 'guillem.lefait', msg=link)
    res = socialregexes.identify(
        'https://www.facebook.com/profile.php?id=654000317')
    self.assertIsNotNone(res)
    self.assertEqual(len(res), 2)
    self.assertEqual(res[0], 'facebook')
    self.assertEqual(res[1], '654000317')
def test_stackoverflow(self):
    """Stack Overflow user URLs identify as ('stackoverflow', <numeric-id>)."""
    for link in ('http://stackoverflow.com/users/3090365/glefait',
                 'http://stackoverflow.com/users/3090365/glefait?tab'):
        res = socialregexes.identify(link)
        # msg=link pinpoints which URL in the loop failed.
        self.assertIsNotNone(res, msg=link)
        self.assertEqual(len(res), 2, msg=link)
        self.assertEqual(res[0], 'stackoverflow', msg=link)
        # The numeric id, not the display name, is the stable identifier.
        self.assertEqual(res[1], '3090365', msg=link)
def test_twitter_true_positive(self):
    """Twitter profile URLs identify as ('twitter', <handle>).

    Covers the plain, @-prefixed, and legacy hash-bang URL forms.
    """
    for link in ('https://twitter.com/guillem_lefait',
                 'https://twitter.com/@guillem_lefait',
                 'https://twitter.com/#!/guillem_lefait'):
        res = socialregexes.identify(link)
        # msg=link pinpoints which URL in the loop failed.
        self.assertIsNotNone(res, msg=link)
        self.assertEqual(len(res), 2, msg=link)
        self.assertEqual(res[0], 'twitter', msg=link)
        self.assertEqual(res[1], 'guillem_lefait', msg=link)
def test_pinterest(self):
    """Pinterest profile URLs (incl. locale subdomain) identify as ('pinterest', <user>)."""
    res = socialregexes.identify('https://fr.pinterest.com/diceverywhere/')
    self.assertIsNotNone(res)
    self.assertEqual(len(res), 2)
    self.assertEqual(res[0], 'pinterest')
    self.assertEqual(res[1], 'diceverywhere')
def test_linkedin(self):
    """LinkedIn /in/ profile URLs identify as ('linkedin', <user>)."""
    res = socialregexes.identify('https://www.linkedin.com/in/glefait/')
    self.assertIsNotNone(res)
    self.assertEqual(len(res), 2)
    self.assertEqual(res[0], 'linkedin')
    self.assertEqual(res[1], 'glefait')
def test_github_true_negative(self):
    """Non-profile GitHub paths (e.g. /search) must not be identified."""
    res = socialregexes.identify('https://github.com/search')
    # assertIsNone reports the unexpected value on failure.
    self.assertIsNone(res)
def test_github_true_positive(self):
    """GitHub user URLs identify as ('github', <user>)."""
    res = socialregexes.identify('https://github.com/glefait')
    self.assertIsNotNone(res)
    self.assertEqual(len(res), 2)
    self.assertEqual(res[0], 'github')
    self.assertEqual(res[1], 'glefait')
def test_junk(self):
    """URLs on unknown domains must not be identified."""
    res = socialregexes.identify('http://blabla.com/xyz')
    # assertIsNone reports the unexpected value on failure.
    self.assertIsNone(res)
def test_email(self):
    """A bare email address identifies as ('email', <address>)."""
    res = socialregexes.identify(
        '*****@*****.**')
    self.assertIsNotNone(res)
    self.assertEqual(len(res), 2)
    self.assertEqual(res[0], 'email')
    # The address itself is echoed back unchanged.
    self.assertEqual(res[1], '*****@*****.**')
def test_twitter_true_negative(self):
    """Reserved Twitter paths (e.g. /tos) must not be identified as profiles."""
    res = socialregexes.identify('https://twitter.com/tos')
    # assertIsNone reports the unexpected value on failure.
    self.assertIsNone(res)
def _get_vcs_links(self, rule, name, data, inputs):
    """Breadth-first search over URLs/text/emails/names to find a VCS repo URL.

    Processes a work queue seeded from ``inputs``; each item type
    (``UrlSet``, ``Text``, ``Email``, ``Name``, ``Url``) feeds more items
    back into the queue.  Returns the best-matching repository URL as soon
    as one passes the similarity gate, or the best of the accumulated
    ``results`` once the queue is exhausted (``None`` if nothing matched).

    NOTE(review): ``data`` is never read in the visible body — presumably
    kept for interface compatibility; confirm against callers.
    """
    normalized_name = normalize(name)
    results = []                 # accepted candidate URLs, in discovery order
    previous_results = None      # snapshot used to avoid re-ranking unchanged results
    # Per-rule fetch budget, floored by the instance-wide override (if any).
    max_fetch_count = max(rule.fetch_count, self._max_fetch_count or 0)
    fetch_list = []              # URLs actually fetched (also the loop-protection set)
    seen_list = []               # every URL ever seen, fetched or not
    fetch_count = 0
    queue = inputs[:]            # copy so the caller's list is not mutated
    logger.debug("queue {}".format(queue))
    while queue:
        must_accept = False      # set by Email/Name items: bypass the distance gate
        item = queue.pop(0)      # FIFO -> breadth-first traversal
        try:
            logger.debug("processing {}: {}".format(
                item.__class__.__name__, item.source or item))
        except UnicodeEncodeError:
            # Logging only; the item itself is still processed below.
            logger.debug("processing {} (item decode problem)".format(
                item.__class__.__name__))
        result_url = None
        if isinstance(item, UrlSet):
            # A batch of URLs: classify each as accepted result, rejected,
            # or "unknown -> fetch later".
            urls = item.value
            urls = self._exclude_non_urls(urls)
            urls = self._normalise_urls(urls)
            if not urls:
                continue
            seen_list += list(urls)
            fetch_urls = []
            result_urls = []
            for url in urls:
                rv = self._accept_url(rule, name, url)
                if rv:
                    results.append(rv)
                    result_urls.append(rv)
                elif rv is not False:
                    # None (undecided) -> queue for fetching; False -> drop.
                    fetch_urls.append(url)
            if len(result_urls) == 1 and len(fetch_list) <= 1:
                # NOTE(review): appends a second copy of the sole result when
                # little has been fetched yet — presumably to weight an early
                # unambiguous hit in get_best_match; confirm intent.
                results.append(result_urls[0])
            if result_urls:
                logger.debug("--results from {}: {}".format(
                    item.value, result_urls))
            else:
                logger.debug("--none of this set: {}".format(item.value))
            logger.debug("queuing {}".format(sorted(fetch_urls)))
            for url in self._group_urls(fetch_urls, name):
                queue.append(Url(url))
            logger.info("{}: from {} added urls {}".format(
                name, item.source, sorted(result_urls)))
        elif isinstance(item, Text):
            # Raw text: extract links and re-queue them as a UrlSet.
            if fetch_count > max_fetch_count:
                logger.info("Not processing text from {}".format(
                    item.source))
                continue
            text = item.value
            if not text:
                continue
            try:
                urls = set(rule.link_extract(text, item.source))
            except Exception:
                # Extraction is best-effort; a broken extractor must not
                # abort the whole search.
                urls = []
            logger.debug("@@ ran {} on text size {} for {} urls !!".format(
                rule.link_extract, len(text), len(urls)))
            logger.debug("extracted {}".format(sorted(urls)))
            queue.append(UrlSet(urls, item.source))
        elif isinstance(item, Email):
            # Direct lookup by maintainer email; result bypasses the
            # similarity gate below via must_accept.
            email = item.value
            result_url = _find_named_repo(name, [email])
            must_accept = True
        elif isinstance(item, Name):
            item_name = item.value
            result_url = _find_named_repo(item_name)
            must_accept = True
        elif isinstance(item, Url):
            # A single URL to fetch, after a battery of skip checks.
            url = item.value
            if url in fetch_list:
                logger.debug(
                    "queue loop skipping already fetched {}".format(url))
                continue
            if url.startswith("https://github.com") or url.startswith(
                    "http://github.com"):
                # see github rule below
                if ".github.com" not in url:
                    logger.debug(
                        "queue loop skipping github {}".format(url))
                    continue
            if fetch_count > max_fetch_count:
                logger.debug("queue loop skipping >{}: {}".format(
                    max_fetch_count, url))
                continue
            user = identify(url)
            if user:
                # Social-media profile (twitter, github user page, ...) —
                # not a repository; skip.
                logger.info("{} detected as social for {}".format(
                    url, user))
                continue
            rv = rule.reject_url(name, url.lower())
            logger.debug("reject rule {}: {}".format(url, rv))
            # NOTE(review): both ""/None and True from reject_url cause a
            # skip here — only other truthy values fall through; confirm
            # reject_url's contract.
            if rv in ["", None]:
                continue
            if rv is True:
                continue
            if ("github.com" in url and ".github.com" not in url
                    ):  # yara, pykalman, membrete, zvmcloudconnector
                logger.debug(
                    "queue loop skipping github v2 {}".format(url))
                continue
            if self.web_session is None:
                # Lazily created so tests/paths that never fetch pay nothing.
                self.web_session = get_file_cache_session("web")
            if url.startswith("git://"):
                # TODO: create tidy phase
                url = url[6:]
            if not url.startswith("http://") and not url.startswith(
                    "https://"):
                # Scheme-less leftovers get a plain http:// prefix.
                if "/" not in url:
                    url = "http://" + url + "/"
                elif "://" not in url:
                    url = "http://" + url
            try:
                logger.info("r {}".format(url))
                r = self.web_session.get(url,
                                         headers=self.headers,
                                         timeout=get_timeout(url))
                logger.debug("r {}.url {} elapsed {}".format(
                    r.__class__.__name__, r.url, r.elapsed))
                logger.debug("r {} headers: {}".format(r.url, r.headers))
                r.raise_for_status()
            except Exception as e:
                # Network/HTTP failures are logged and skipped, never fatal.
                logger.warning("{}: {}".format(url, e))
                continue
            urls = []
            if r.url != url:
                # Follow where the redirect landed as a fresh candidate.
                urls.append(r.url)
            if r.headers.get("X-RTD-Project"):
                # ReadTheDocs header lets us jump straight to the project repo.
                rtd_url = get_rtd_repo(r.headers.get("X-RTD-Project"))
                if rtd_url:
                    logger.warning("{}: rtd {}".format(url, rtd_url))
                    urls.append(rtd_url)
            if urls:
                queue.append(UrlSet(set(urls)))
            if not r.text:
                logger.warning("{}: empty page".format(url))
            else:
                # Page body is queued for link extraction on a later pass.
                queue.append(Webpage(r, url))
            fetch_list.append(url)
            fetch_count += 1
        # Re-rank only when the result set actually changed this iteration.
        if not result_url and results and results != previous_results:
            result_url = get_best_match(rule.match, results)
            previous_results = results[:]
        if result_url:
            score = _compute_similarity(rule.match, result_url)
            # Email/Name-derived results (must_accept) skip the distance gate.
            if must_accept or score < self.max_distance:
                if self._store_fetch_list:
                    _fetch_mapping[normalized_name] = fetch_list
                return result_url
        if not queue and not results:
            # Last resort before giving up: try the pythonhosted.org page.
            ph_url = "https://pythonhosted.org/{}/".format(name)
            if ph_url not in seen_list:
                queue.append(Url(ph_url))
    logger.debug("fetched {}".format(fetch_list))
    if self._store_fetch_list:
        _fetch_mapping[normalized_name] = fetch_list
    if results:
        return get_best_match(rule.match, results)