def test_use_fallback_if_confidence_is_low():
    nlp_without_fallback = spacy.blank("xx")
    nlp_without_fallback.add_pipe(LanguageDetector())
    doc = nlp_without_fallback(poor_quality_text)
    assert doc._.language_score < 0.5

    nlp_with_fallback = spacy.blank("xx")
    nlp_with_fallback.add_pipe(LanguageDetector(threshold=0.5))
    doc = nlp_with_fallback(poor_quality_text)
    assert doc._.language == "xx"
    assert doc._.language_score < 0.5
def test_detect_doc_language():
    nlp = spacy.blank("xx")
    nlp.add_pipe(LanguageDetector())
    doc = nlp(en_text)
    assert doc._.language == "en"
    assert doc._.language_score >= 0.8
def test_use_custom_fallback():
    nlp = spacy.blank("xx")
    nlp.add_pipe(LanguageDetector(threshold=0.99, default_language="fr"))
    doc = nlp(en_text)
    assert doc._.language == "fr"
    assert doc._.language_score >= 0.8
def test_use_fallback_value_if_language_not_supported():
    nlp = spacy.blank("xx")
    nlp.add_pipe(LanguageDetector(supported_languages=["fr"]))
    doc = nlp(en_text)
    assert doc._.language == "xx"
    assert doc._.language_score >= 0.8
def test_use_custom_model():
    nlp = spacy.blank("xx")
    nlp.add_pipe(
        LanguageDetector(
            model_path=os.path.realpath(
                os.path.join(__file__, "..", "..", "spacy_fastlang", "lid.176.ftz")
            )
        )
    )
    doc = nlp(en_text)
    assert doc._.language == "en"
    assert doc._.language_score >= 0.8
def book_scraping(html_source):
    # take the HTML content and return a list with the useful info
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(LanguageDetector())

    # instantiate a BeautifulSoup object for HTML parsing
    soup = BeautifulSoup(html_source, features='lxml')

    # get the book title
    bookTitle = soup.find_all('h1', id='bookTitle')[0].contents[0].strip()

    # if bookSeries is not present, then set it to the empty string
    try:
        bookSeries = soup.find_all('h2', id='bookSeries')[0].contents[1].contents[0].strip()[1:-1]
    except:
        bookSeries = ''

    # if bookAuthors is not present, then set it to the empty string
    try:
        bookAuthors = soup.find_all('span', itemprop='name')[0].contents[0].strip()
    except:
        bookAuthors = ''

    # the plot of the book is essential; if something goes wrong with it, raise an error
    try:
        # get the main tag where the plot is found
        Plot = soup.find_all('div', id='description')[0].contents
        # filter the plot by removing tags that don't contain the description
        filter_plot = list(filter(lambda i: i != '\n', Plot))
        if len(filter_plot) == 1:
            Plot = filter_plot[0].text
        else:
            # get the full plot within the tag
            Plot = filter_plot[1].text
        doc = nlp(Plot)
        if doc._.language != 'en':
            raise Exception  # if the plot is not in English, raise an error
    except:
        raise  # pass the error to the caller

    # if NumberofPages is not present, then set it to the empty string
    try:
        NumberofPages = soup.find_all('span', itemprop='numberOfPages')[0].contents[0].split()[0]
    except:
        NumberofPages = ''

    # if ratingValue is not present, then set it to the empty string
    try:
        ratingValue = soup.find_all('span', itemprop='ratingValue')[0].contents[0].strip()
    except:
        ratingValue = ''

    # if the rating/review counts are not present, leave them as empty strings
    ratingCount = ''
    reviewCount = ''
    try:
        ratings_reviews = soup.find_all('a', href='#other_reviews')
        for i in ratings_reviews:
            if i.find_all('meta', itemprop='ratingCount'):
                ratingCount = i.contents[2].split()[0]
            if i.find_all('meta', itemprop='reviewCount'):
                reviewCount = i.contents[2].split()[0]
    except:
        pass

    # if Published is not present, then set it to the empty string
    try:
        pub = soup.find_all('div', class_='row')[1].contents[0].split()[1:4]
        Published = ' '.join(pub)  # join the parts of the publication date
    except:
        Published = ''

    # if Characters is not present, then set it to the empty string
    try:
        # find the regular expression 'characters' within the href attribute
        char = soup.find_all('a', href=re.compile('characters'))
        if len(char) == 0:
            Characters = ''  # no characters found
        else:
            Characters = ', '.join([i.contents[0] for i in char])
    except:
        Characters = ''  # something went wrong with char

    # if Setting is not present, then set it to the empty string
    try:
        # find the regular expression 'places' within the href attribute
        sett = soup.find_all('a', href=re.compile('places'))
        if len(sett) == 0:
            Setting = ''
        else:
            Setting = ', '.join([i.contents[0] for i in sett])
    except:
        Setting = ''  # something went wrong with Setting

    # get the canonical URL of the page
    Url = soup.find_all('link', rel='canonical')[0].get('href')

    return [
        bookTitle, bookSeries, bookAuthors, ratingValue, ratingCount,
        reviewCount, Plot, NumberofPages, Published, Characters, Setting, Url
    ]
import spacy
from spacy_fastlang import LanguageDetector

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(LanguageDetector())
doc = nlp('Life is like a box of chocolates. You never know what you are gonna get.')
assert doc._.language == 'en'
assert doc._.language_score >= 0.8
def test_batch_predictions():
    nlp = spacy.blank("xx")
    nlp.add_pipe(LanguageDetector())
    for doc in nlp.pipe([en_text, en_text]):
        assert doc._.language == "en"
        assert doc._.language_score >= 0.8
# specify the maximum document length
nlp.max_length = 50000

# expand on spaCy's stopwords
# --+ my stopwords
my_stopwords = [
    '\x1c', 'ft', 'wsj', 'time', 'sec', 'say', 'says', 'said',
    'mr.', 'mister', 'mr', 'miss', 'ms', 'inc'
]
# --+ mark them as stopwords in spaCy's vocab
for stopword in my_stopwords:
    nlp.vocab[stopword].is_stop = True

# filter out non-English pages
language_detector = LanguageDetector()
nlp.add_pipe(language_detector)

# containers
docs_id_tokens = []
docs_ln = []

for _id, doc in zip(df_5_50._id, docs):
    doc = nlp(doc)
    ln = doc._.language
    docs_ln.append([_id, ln])
    if doc._.language == 'en':
        tmp_tokens = [
            token.lemma_ for token in doc
            if not token.is_stop
            and not token.is_punct
            and not token.like_num
            and not token.like_url
            and not token.like_email
        ]
        # keep the filtered lemmas for this document
        docs_id_tokens.append([_id, tmp_tokens])