def langage_detection(self):
    """detect language"""
    self.nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
    doc = self.nlp(self.phrase)
    return doc._.language
def create_spacy_nlp_object(parameters: Dict[str, Any]) -> Language:
    nlp = spacy.load(parameters['spacy_lang'])
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    # Modify tokenizer suffixes: split a trailing dot off as its own suffix token
    suffixes = list(nlp.Defaults.suffixes)
    suffixes.append(r"\.")
    suffix_regex = spacy.util.compile_suffix_regex(suffixes)
    nlp.tokenizer.suffix_search = suffix_regex.search

    # modify tokenizer infix patterns
    infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            # EDIT: Removed hyphen \- : r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[0-9])[+\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ]
    )
    infix_re = spacy.util.compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_re.finditer
    return nlp
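# Hedged usage sketch for create_spacy_nlp_object above -- not from the source.
# Assumes the function and its imports (spacy, LanguageDetector, the spaCy char
# classes) are in scope, that 'en_core_web_sm' is installed, and that
# spacy-langdetect exposes the result as a {'language': ..., 'score': ...} dict.
def _demo_create_spacy_nlp_object():
    nlp = create_spacy_nlp_object({'spacy_lang': 'en_core_web_sm'})
    doc = nlp("Prices rose 3.5% in 2019-2020, e.g. in the U.K. market.")
    # Inspect how the customised suffix/infix rules tokenize the sample
    print([token.text for token in doc])
    # Document-level guess added by the 'language_detector' pipe
    print(doc._.language)  # e.g. {'language': 'en', 'score': 0.99}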
def lang_distribution():
    print("loading dataset..")
    with open('bgg_download/data/boardgames-data/bgg-data-cleaned.json', 'r', encoding="utf-8") as f:
        data = json.load(f)
    print("dataset loaded")

    print("loading spacy en model...")
    nlp = spacy.load("en_core_web_lg")
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
    print("model loaded")

    game_lang_dict = {}
    for item in data["items"]:
        for com in item["comments"]:
            comment = nlp(mt.pre_processing(com["value"].lower()))
            comment_lang, comment_lang_score = comment._.language.values()
            if comment_lang not in game_lang_dict.keys():
                game_lang_dict[comment_lang] = 1
            else:
                game_lang_dict[comment_lang] += 1

    with open("bgg_result/games_lang_distr.json", 'w') as out:
        json.dump(game_lang_dict, out)
def check_language(check_text):
    # load language model
    lang = spacy.load("en")
    lang.add_pipe(LanguageDetector(), name="language_detector", last=True)
    doc_check = lang(check_text)
    if doc_check._.language["language"] == "en":
        return True
    # text is not English
    return False
def spacy_language_detection(row):
    """
    Function utilizes spaCy N.L.P. library, "langdetect" library, and "spacy-langdetect"
    library to determine the language of the Tweet.

    :param row: example in the dataset we are operating on.
    :return: the detected language stored in the new "spaCy_language_detect" column.
    """
    global non_english_count_global

    nlp = spacy.load("en")
    nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)

    document = nlp(row["tweet_full_text"])
    # document level language detection. Think of it like the average language of the document!
    text_language = document._.language
    row["spaCy_language_detect"] = str(text_language["language"])

    print("spaCy language designation:")
    print(str(text_language["language"]))

    if not str(text_language["language"]).startswith('en'):
        non_english_count_global += 1
        log.warning(f"\t\t\tnon-English tweet (will be dropped): "
                    f"\n\t\t\t\tid: {row['tweet_id']}"
                    f"\n\t\t\t\ttweet: {row['text_derived']}"
                    f"\n\t\t\t\tLanguage tags: {row['spaCy_language_detect']}")

    return row["spaCy_language_detect"]
def __init__(self, input_csv, col_names=None, min_review=100,
             remove_non_english=False):
    review_df = pd.read_csv(input_csv)

    # Drop Duplicate Rows
    # Unfortunately, the Indeed web scraper scrapes the same 'top' review
    # for every page, leading to duplicates; we must drop identical rows
    # until this is fixed
    review_df = review_df.drop_duplicates()

    # We have standard column names; if your csv does not match these
    # standard names, you must supply a dictionary that translates them
    if col_names is not None:
        if len(review_df.columns) < len(col_names):
            raise Exception('The number of column names supplied cannot'
                            ' exceed the number of columns in dataframe')
        else:
            review_df.rename(columns=col_names, inplace=True)

    # Remove companies below the 'min_review' threshold
    company_counts = review_df['Company'].value_counts()
    company_thres = company_counts[company_counts >= min_review].index
    temp_boolean = review_df['Company'].apply(lambda x: x in company_thres)
    self.reviews = review_df.loc[temp_boolean, :]

    # If user specifies, detect non-English reviews and remove them
    # Way too slow right now
    if remove_non_english:
        print('Removing non-english reviews')
        # Using spacy and 'langdetect' package
        import spacy
        from spacy_langdetect import LanguageDetector

        # Disable extraneous components of spacy pipeline to speed up
        nlp = spacy.load('en_core_web_md',
                         disable=['ner', 'tagger', 'textcat'])
        # Add language detector
        nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

        # Detect on the already-filtered reviews so the boolean index aligns
        temp_boolean = self.reviews['Review'].apply(self.language_detect,
                                                    nlp_pipe=nlp)
        # Remove non-english reviews
        self.reviews = self.reviews.loc[temp_boolean, :]
def detect_language(uuid):
    languages_detected = []

    # * ---------- PATH --------- *
    # Path of this file
    path_file = os.path.dirname(os.path.realpath(__file__))
    # Path to WALK txt
    path_txt_dir_walk = f'{path_file}/{uuid}/txt/'

    # Load Spacy once and add language detection to it
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)

    # Detect the language of all the txt files and put them in the languages_detected array
    for r, d, f in os.walk(path_txt_dir_walk):
        for txt in f:
            if '.txt' in txt:
                # Read the txt file to get the text
                with codecs.open(path_txt_dir_walk + txt, 'r', 'utf-8') as file:
                    txt_content = file.read()

                # Apply NLP on text
                txt_nlp = nlp(txt_content)
                # Target the language part of the nlp result
                txt_lang = txt_nlp._.language
                # Add the detected language to the languages_detected array
                languages_detected.append(txt_lang['language'])

    if 'fr' in languages_detected:
        is_language_french = True
    else:
        is_language_french = False

    return is_language_french
def prune_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create simple features such as number of words and length of characters
    in order to prune the dataset further
    """
    # create number of words column
    df['num_words'] = df['DESCRIPTION'].apply(lambda x: len(x.split()))

    # from exploratory data analysis, sentences form with 4 or more words,
    # so drop entries with 3 or fewer words
    df = df[df['num_words'] > 3]

    # load in spacy & language detector
    nlp = spacy.load('en')
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    # simple function to return detected language
    def get_lang(desc):
        doc = nlp(desc)
        return doc._.language['language']

    df['lang'] = df['DESCRIPTION'].apply(get_lang)

    # hindi and indonesian languages are most common, remove these for better nlp processing
    df = df[~df['lang'].isin(['hi', 'id'])]

    return df.reset_index(drop=True)
def language_detector():
    """
    Sort .txt files by document average language, by moving them into respective
    subfolders, implemented using SpaCy. Subfolders with the language as the name
    of the folders are created for every unseen language. Supports multiple languages.
    """
    curr_dir = os.getcwd()
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    for filename in os.listdir(curr_dir):
        if filename.endswith(".txt"):
            f = open(filename, "r", encoding="utf8")
            text = f.read()
            f.close()

            doc = nlp(text)
            lang = doc._.language['language']

            # Create the language subfolder if it is not there yet, then move the file
            if not os.path.exists(os.path.join(curr_dir, lang)):
                os.makedirs(os.path.join(curr_dir, lang))
            print(f"Moving {filename} to {lang}")
            os.replace(os.path.join(curr_dir, filename),
                       os.path.join(curr_dir, lang, filename))
        else:
            continue
def preprocessing():
    # Load the German and the French language models
    nlp_de = spacy.load("de_core_news_md")
    nlp_fr = spacy.load("fr_core_news_sm")

    # Load the language detector
    nlp_de.add_pipe(LanguageDetector(), name="language_detector", last=True)

    # Open the text to be analysed
    with open("../data_in/rieger.txt", encoding="utf-8") as file:
        text = file.read()

    # The German language model is assigned to the text
    # --> to be changed shortly: detect the language first, then apply the de or fr model
    doc = nlp_de(text)

    # Determine the language at document level
    print(doc._.language)

    dict_fr = {}
    dict_de = {}
    dict_all = {}
    dict_sonstiges = {}

    for sent in doc.sents:
        dict_all[sent] = sent._.language

    print(dict_all)
    print(type(doc.sents))
    print(dict_all)

    for i in dict_all:
        if "fr" in dict_all[i]['language']:
            dict_fr[i] = dict_all[i]
        elif "de" in dict_all[i]['language']:
            dict_de[i] = dict_all[i]
        else:
            dict_sonstiges[i] = dict_all[i]

    # print(dict_fr)
    # print(dict_de)
    # print(dict_sonstiges)

    ##############################

    text_fr = []
    file = open("../data_out/spacy_lfr.txt", "w", encoding="utf-8")
    for j, k in dict_fr.items():
        print("Text:", j)
        file.write(str(j))
        for m in k:
            print(m + ":", k[m])

    file2 = open("../data_out/spacy_lde.txt", "w", encoding="utf-8")
    for n, o in dict_de.items():
        print("Text:", n)
        file2.write(str(n))
def test_custom_language_detector():
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(
        LanguageDetector(language_detection_function=lambda spacy_object: "from custom function"),
        name="language_detector", last=True)
    text = "This is a test"
    doc = nlp(text)
    assert doc._.language == "from custom function"
    for i, sent in enumerate(doc.sents):
        assert sent._.language == "from custom function"
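# A hedged sketch (not from the source) of a non-trivial replacement for the
# lambda used in the test above. spacy-langdetect passes the Doc or Span to
# language_detection_function and stores whatever it returns on ._.language;
# here langdetect's detect() is called directly and a plain string comes back.
from langdetect import detect

def simple_language_detection_function(spacy_object):
    text = spacy_object.text.strip()
    return detect(text) if text else "unknown"

# Usage, mirroring the test:
#   nlp.add_pipe(LanguageDetector(language_detection_function=simple_language_detection_function),
#                name="language_detector", last=True)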
def test_tokens():
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
    text = "English Hello"
    doc = nlp(text)
    languages = []
    for i, token in enumerate(doc):
        languages.append(token._.language["language"])
    assert len(languages) == 2
def test_language_detector():
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
    text = ("This is English text. Er lebt mit seinen Eltern und seiner Schwester in Berlin. "
            "Yo me divierto todos los días en el parque. Je m'appelle Angélica Summer, "
            "j'ai 12 ans et je suis canadienne.")
    doc = nlp(text)
    doc._.language["language"]
    for i, sent in enumerate(doc.sents):
        sent._.language["language"]
def __init__(self, corpus):
    self.corpus = [{'id': _id, 'doc': doc} for _id, doc in corpus]
    self.nlp = spacy.load('en_core_web_sm')
    self.nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
    nltk.download('wordnet')
    self.lemmatizer = WordNetLemmatizer()
    self.table = str.maketrans("", "", string.punctuation)
def predict(self, context, model_input):
    if self.nlp is None:
        self.nlp = spacy.load('en_core_web_sm')
        self.nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
    return model_input[model_input.columns[0]].apply(
        lambda x: self.nlp(x)._.language)
def what_language(df):
    nlp = spacy.load('en')
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
    df['language'] = df['article'].apply(
        lambda x: nlp(x)._.language['language'])
    # we can really only analyze English articles
    df = df[df['language'] == 'en']
    return df
def determine_language(corpus, spc_obj):
    """Determines the language of the first five lines of the corpus."""
    spc_obj.add_pipe(LanguageDetector(), name='language_detector', last=True)
    doc = ''
    for line in corpus[:5]:
        doc += (line + ' ')
    lang = spc_obj(doc)._.language['language']
    if lang != 'es':
        spc_obj = spacy.load(supported_languages[lang])
    return (spc_obj, lang)
def detect_language(texte_string):
    # Load English in spacy
    nlp = spacy.load("en_core_web_sm")
    # Add the language detection
    nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
    # Apply NLP on the string
    string_with_nlp = nlp(texte_string)
    # Target the language NLP feature
    language_detected = string_with_nlp._.language
    return language_detected['language']
def __init__(self, model_type='', stopwords_file=''):
    self._nlp = spacy.load(model_type)
    self._nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
    # Stored as a dictionary for faster lookups.
    self._stopwords = self._load_stopwords(file=stopwords_file)
def main():
    ######## defining the parameters ##############
    supported_languages = ["English", "German", "Spanish", "Portuguese",
                           "French", "Italian"]
    # this is required, otherwise we get weird languages for long and untidy documents
    default_language = "English"
    # making English the default, which is used when no language is detected
    useful_characters = string.printable + \
        'äöüÄÖÜéÉèÈáÁàÀóÓòÒúÚùÙíÍìÌñÑãÃõÕêÊâÂîÎôÔûÛ'
    # filtering the characters of the texts
    parsable_extensions = ['.csv', '.doc', '.docx', '.eml', '.epub', '.json',
                           '.msg', '.odt', '.ogg', '.pdf', '.pptx', '.rtf',
                           '.xlsx', '.xls']
    """ '.gif', '.jpg', '.mp3', '.tiff', '.wav', '.ps', '.html' """
    # the extensions which we try to parse to text
    doc_maxlength = 2000000
    # default would be 1m which is the maximum length of a document in spacy
    minlength_of_text = 100
    # if textlen is lower, we ignore this text
    POS_blacklist = ["PUNCT", "PART", "SYM", "SPACE", "DET", "CONJ", "CCONJ",
                     "ADP", "INTJ", "X", ""]
    # we filter out these token-types
    parsers = [titlecaps, token_replacement, url_replacement]
    # the parsing functions used
    path = get_path(parsable_extensions)
    # Determining the directory from which to import documents

    ######## initiating the pipelines ##############
    multilanguage, nlp = decide_language_detection(
        path, supported_languages, default_language)
    # let the user determine if they want to use the sentence-wise
    # language detection or the document-wise. The sentence-wise allows
    # ignoring parts of docs that contain text not of interest, such
    # as metadata in English for a German document
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
    # add the language detector to the spacy nlp pipeline
    pdf_to_text(path, parsable_extensions)
    # save all non-text documents with parsable extensions to txt files
    doc_list = documents_dataframe(path, minlength_of_text, doc_maxlength, nlp,
                                   multilanguage, default_language,
                                   supported_languages, parsers,
                                   useful_characters)
    # create a document list with detected language, filename, textname and text
    df_doclist = get_all_text_info(
        doc_list, supported_languages, POS_blacklist, doc_maxlength)
    # use the document list to retrieve various basic information from the text

    print(df_doclist.shape)
    df_doclist.to_pickle(path + "/df_doclist.pkl")
    # saving the data frame to path
    df_doclist = pd.read_pickle("./df_doclist.pkl")
    # and opening it
def check_language(input_text):
    """
    Check the language of an input text
    :param input_text:
    :return: the name of the language
    """
    NLP_dect.add_pipe(LanguageDetector(), name='language_detector', last=True)
    language = NLP_dect(input_text)._.language
    print(f"Text language: {language}")
    return language['language']
def languageDistribution(df):
    nlp = spacy.load("en")
    nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
    df['language'] = ''
    language = []
    for index, row in df.iterrows():
        text = row[str(commentTextColumn)]
        doc = nlp(text)
        language.append(str(doc._.language['language']))
    df['language'] = language
    return df
def spacy_classifier(texts, lowercase=True, langs=['en', 'fr']):
    nlp = spacy.load(corpus_name)
    nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
    if lowercase:
        detected_langs = [langs[0] if nlp(str(text).lower())._.language['language'] == langs[0]
                          else langs[1] for text in texts]
    else:
        detected_langs = [langs[0] if nlp(str(text))._.language['language'] == langs[0]
                          else langs[1] for text in texts]
    return detected_langs
def titles_cleanup(img_entity_pickle, out_pickle=None):
    with open(img_entity_pickle, 'rb') as pf:
        imgs_web_entity = pickle.load(pf)
    title_map = imgs_web_entity['title_map']

    snlp = spacy.load("en_core_web_lg")
    snlp.add_pipe(LanguageDetector(), name="language_detector", last=True)

    all_titles = []
    all_title_idx = []
    all_split_idx = []
    for id, imgs_titles in title_map.items():
        for n, img_titles in imgs_titles.items():
            for t in img_titles:
                all_titles.append(t.lower())
                all_title_idx.append(id)
                all_split_idx.append(n)
    assert len(all_titles) == len(all_title_idx), f"{len(all_titles)} != {len(all_title_idx)}"

    pipe = snlp.pipe(all_titles)

    clean_title_map = defaultdict(lambda: defaultdict(list))
    all_clean_title = []
    drop_by_lanu = 0
    for i, doc in enumerate(pipe):
        if doc._.language['language'] != 'en':
            drop_by_lanu += 1
            continue
        id = all_title_idx[i]
        senten = ''
        for token in doc:
            if token.pos_ != 'NUM' and token.pos_ != 'X':
                senten += token.text.lower() + ' '
        for b in noun_chunk_blist:
            senten = re.sub(b, '', senten)
        for b in entity_black_list:
            senten = re.sub(b, '', senten)
        n_split = all_split_idx[i]
        # if len(clean_title_map[id]) < n_split + 1:
        #     clean_title_map[id] += [[] for _ in range(n_split + 1 - len(clean_title_map[id]))]
        clean_title_map[id][n_split].append(senten)
        all_clean_title.append(senten)

    imgs_web_entity['clean_title_map'] = dict(clean_title_map)
    if out_pickle is None:
        out_pickle = img_entity_pickle
    with open(out_pickle, mode='wb') as pf:
        pickle.dump(imgs_web_entity, pf)
def __init__(self, lang: Languages):
    self.lang = lang
    self.nlp = self.load_spacy_model()

    # Add the language detector. It'll be turned off for normal tokenizing
    self.nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    # Add all the custom exceptions
    exceptions = tokenizer_exceptions.get(self.lang, {})
    for term, exception in exceptions.items():
        self.nlp.tokenizer.add_special_case(term, exception)

    # Add custom function
    self.fn = custom_functions.get(self.lang, fn)
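# Minimal standalone sketch (not from the source) of the add_special_case
# mechanism used in __init__ above, with the documented spaCy 2.x format:
# the ORTH pieces of the exception must concatenate back to the original term.
import spacy
from spacy.symbols import ORTH

_nlp = spacy.blank("en")
_nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
print([t.text for t in _nlp("gimme that")])  # ['gim', 'me', 'that']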
def __post_init__(self):
    self.nlp = spacy.load(self.language_model)
    self.nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    # Add the abbreviation pipe to the spacy pipeline. Only need to run this once.
    abbreviation_pipe = AbbreviationDetector(self.nlp)
    self.nlp.add_pipe(abbreviation_pipe)

    # Our linker will look up named entities/concepts in the UMLS graph and
    # normalize the data for us.
    self.linker = UmlsEntityLinker(resolve_abbreviations=True)
    self.nlp.add_pipe(self.linker)
def what_language(row):
    """
    Function utilizes spaCy N.L.P. library, "langdetect" library, and "spacy-langdetect"
    library to determine the language of the Tweet.

    :param row: example in the dataset we are operating on.
    :return: the detected language stored in the new "spaCy_language_detect" column.
    """
    nlp = spacy.load("en")
    nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)

    document = nlp(row["text_derived"])
    # document level language detection. Think of it like the average language of the document!
    text_language = document._.language
    row["spaCy_language_detect"] = str(text_language["language"])

    return row["spaCy_language_detect"]
def __init__(self, tokenizer_type: str = 'bert-base-uncased',
             do_lower_case: bool = True):
    self.nlp = spacy.load('en')
    self.nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
    self.toke = BertTokenizer.from_pretrained(tokenizer_type,
                                              do_lower_case=do_lower_case)
    # For splitting by \n\n followed by
    # [... , (...) , {... , int... , ...: , or (R/r)epeat...
    self.header_seed = '(\n\n(\[.*|\(.*\)|\{.*|[0-9].*|.*[:|: ]\n|.*(R|r)epeat.*))'
    # For cleaning up any missed characters
    self.clean_seed = '\([^)].*\)|\[.*?\]|\(|\)|\[|\]|:'
def _index_warc(filename, index, counter):
    """
    Index individual WARC file.

    :param filename: WARC file name
    :param index: Elasticsearch index
    :param counter: Spark counter
    """
    try:
        nlp = spacy.load('en_core_web_sm')
        nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
        helpers.bulk(util.get_es_client(),
                     _generate_docs(index, filename, nlp, counter))
    except Exception as e:
        logger.error(e)
def __init__(self, data_dir: str):
    '''Initializes a CORD-19 data preprocessing class

    Args:
        data_dir: Raw data directory
    '''
    self.data_dir = data_dir

    # Initialize NLP model
    self.nlp = en_core_sci_lg.load(disable=["tagger", "ner"])
    self.nlp.max_length = 2000000
    self.nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
    self.nlp_words_to_check = 100