Python extract_plaintext_urls_from_text Exemples

Langage de programmation: Python

Espace de noms/Paquet : csxj.datasources.parser_tools.utils

Méthode/Fonction: extract_plaintext_urls_from_text

Exemples sur hotexamples.com : 10

Python extract_plaintext_urls_from_text - 10 exemples trouvés. Ce sont les exemples réels les mieux notés de csxj.datasources.parser_tools.utils.extract_plaintext_urls_from_text extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Associées

LogsModel

create_equity_curve

to_datetime

make_scan_pose

copy

get_database_free_size

JINJA_ENVIRONMENT

get_rt_cutBitmap

MessagePasser

getQuotes

Related in langs

fixUrls (PHP)

Hubzero\Mail\Token (PHP)

TCP_Server (C#)

LevelWiseUserInfoModel (C#)

test_hashtable (C++)

start_benchmark (C++)

Fold (Go)

NewChunkProvider (Go)

IQuestionWidget (Java)

TopMost (Java)

Exemple #1

0

Afficher le fichier

Fichier : test_url_extraction.py Projet : sevas/csxj-crawler

def test_schemeless_subdomain_url(self):
    """Check that a scheme-less URL with a subdomain (e.g. 'blog.foo.net') is extracted."""
    candidate = "blog.foo.net"
    expected = [candidate]
    found = extract_plaintext_urls_from_text(candidate)
    # Build the failure message up front so the assertion line stays short.
    failure_msg = u"Could not extract schemeless url with subdomain (Expected '{0}', got'{1}')".format(
        expected, found
    )
    eq_(expected, found, msg=failure_msg)

Exemple #2

0

Afficher le fichier

Fichier : test_url_extraction.py Projet : sevas/csxj-crawler

def test_schemeless_no_www_url(self):
    """Check that scheme-less URLs without a 'www' prefix (e.g. 'foo.com') are extracted.

    Each input is compared against its lower-cased form, so a mixed-case
    input such as 'Foo.net' is expected to come back as 'foo.net'.
    """
    urls = ["foo.net", "Foo.net"]
    for url in urls:
        # The expectation is the lower-cased URL; reuse the same value in the
        # failure message so it reports what the assertion actually compares.
        expected = [url.lower()]
        extracted_urls = extract_plaintext_urls_from_text(url)
        eq_(
            expected,
            extracted_urls,
            msg=u"Could not extract schemeless url without 'www' prefix (Expected '{0}', got'{1}')".format(
                expected, extracted_urls
            ),
        )

Exemple #3

0

Afficher le fichier

Fichier : test_url_extraction.py Projet : sevas/csxj-crawler

def test_text_with_urls(self):
    """Compare the URLs extracted from ``self.text_with_urls`` with the expected list."""
    extracted = extract_plaintext_urls_from_text(self.text_with_urls)
    expected_urls = [
        "http://www.example.com",
        "http://en.wikipedia.org/wiki/PC_Tools_(Central_Point_Software)",
        "http://msdn.microsoft.com/en-us/library/aa752574(VS.85).aspx",
        "http://www.awesomeexample.com",
        "lastexample.com",
    ]
    eq_(extracted, expected_urls)

Exemple #4

0

Afficher le fichier

Fichier : test_url_extraction.py Projet : sevas/csxj-crawler

def test_discard_enails(self):
    """Check that email addresses are not extracted as URLs."""
    # NOTE(review): the "*****@*****.**" literals look like addresses that were
    # masked before this file was captured -- confirm against the upstream test.
    for candidate in ("*****@*****.**", "@foo.com", "*****@*****.**"):
        found = extract_plaintext_urls_from_text(candidate)
        failure_msg = u"{0} was matched as a url: {1}".format(candidate, found)
        eq_([], found, msg=failure_msg)

Exemple #5

0

Afficher le fichier

Fichier : test_url_extraction.py Projet : sevas/csxj-crawler

def test_tinylinks(self):
    """Check that a short link such as 'bit.ly/foo' is recognised as a URL."""
    short_link = "bit.ly/foo"
    found = extract_plaintext_urls_from_text(short_link)
    expected = [short_link]
    eq_(expected, found)

Exemple #6

0

Afficher le fichier

Fichier : test_url_extraction.py Projet : sevas/csxj-crawler

def test_schemeless_url(self):
    """Check that a URL with no scheme (e.g. 'www.foo.com') is extracted."""
    candidate = "www.foo.com"
    expected = [candidate]
    found = extract_plaintext_urls_from_text(candidate)
    failure_msg = u"(Expected '{0}', got'{1}')".format(expected, found)
    eq_(expected, found, msg=failure_msg)

Exemple #7

0

Afficher le fichier

Fichier : test_url_extraction.py Projet : sevas/csxj-crawler

def test_no_url(self):
    """Check that text containing no URL yields an empty list."""
    # self.text is a format template provided by the test fixture.
    sample = self.text.format("not a url")
    found = extract_plaintext_urls_from_text(sample)
    eq_(found, [])

Exemple #8

0

Afficher le fichier

Fichier : test_url_extraction.py Projet : sevas/csxj-crawler

def test_multiple_urls(self):
    """Check that several URLs embedded in one piece of text are all extracted."""
    template = "this {0} has {1} many {2} links {3}"
    # The same sequence is used both to fill the template and as the expectation.
    expected = [self.simple_url, self.complex_url, self.complex_url, self.simple_url]
    sample = template.format(*expected)
    found = extract_plaintext_urls_from_text(sample)
    eq_(found, expected)

Exemple #9

0

Afficher le fichier

Fichier : test_url_extraction.py Projet : sevas/csxj-crawler

def test_complex_url(self):
    """Check that the fixture's complex URL is extracted from surrounding text.

    The original test describes the complex URL as having parameters, a port,
    spaces and semicolons; its actual value is defined by the fixture.
    """
    target = self.complex_url
    sample = self.text.format(target)
    found = extract_plaintext_urls_from_text(sample)
    eq_(found, [target])

Exemple #10

0

Afficher le fichier

Fichier : test_url_extraction.py Projet : sevas/csxj-crawler

def test_simple_url(self):
    """Check that the fixture's simple URL is extracted from surrounding text."""
    target = self.simple_url
    sample = self.text.format(target)
    found = extract_plaintext_urls_from_text(sample)
    eq_(found, [target])