def pos_tag(self, text):
    token_text = text
    text = ' '.join(text)
    if self.lang == 'en':
        return nltk.pos_tag(token_text)
    elif self.lang in ['ro']:
        tags = self.tagger.predict([features(token_text, index)
                                    for index in range(len(token_text))])
        return self.change_ud2penn([(token_text[i], tags[i])
                                    for i in range(len(token_text))])
    elif self.lang == 'ar':
        pipeline = stanfordnlp.Pipeline(models_dir="models/stanford", lang='ar',
                                        use_gpu=False, processors='tokenize,pos')
        doc = pipeline(text)
        tags = [(word.text, word.upos) for sent in doc.sentences for word in sent.words]
        return tags
    elif self.lang == 'es':
        # blob = Text(text, hint_language_code=self.lang)
        # return self.change_ud2penn(blob.pos_tags)
        pipeline = stanfordnlp.Pipeline(models_dir="models/stanford", lang='es',
                                        use_gpu=False, processors='tokenize,pos')
        doc = pipeline(text)
        tags = [(word.text, word.upos) for sent in doc.sentences for word in sent.words]
        return tags
    elif self.lang == 'fr':
        return nltk.pos_tag(token_text)
def preprocess_text(samples, lowercase=False, lemmatize=False, remove_diacritics=False):
    if lemmatize:
        pipeline = stanfordnlp.Pipeline(processors='tokenize,pos,lemma', lang='sk')
    else:
        pipeline = stanfordnlp.Pipeline(processors='tokenize', lang='sk')

    for i, sample in enumerate(samples):
        doc = pipeline(sample.text)
        if lemmatize:
            tokens = [word.lemma for sentence in doc.sentences for word in sentence.words]
        else:
            tokens = [word.text for sentence in doc.sentences for word in sentence.words]
        if lowercase:
            tokens = [token.lower() for token in tokens]
        if remove_diacritics:
            tokens = [unidecode.unidecode(token) for token in tokens]
        samples[i].text = tokens

    return samples
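# A minimal usage sketch for preprocess_text. It assumes the Slovak models are
# available (stanfordnlp.download('sk')) and that samples expose a mutable
# .text attribute; the Sample class below is a hypothetical stand-in:
#
#   class Sample:
#       def __init__(self, text):
#           self.text = text
#
#   samples = preprocess_text([Sample("Toto je testovacia veta.")],
#                             lowercase=True, lemmatize=True)
#   print(samples[0].text)  # a list of lowercased lemmas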
def __init__(self, nlp=None, lang="tr"):
    if nlp:
        self.nlp = nlp
    else:
        try:
            self.nlp = stanfordnlp.Pipeline(lang=lang)
        except Exception:
            # models are missing on first use: download them, then retry
            stanfordnlp.download(lang)
            self.nlp = stanfordnlp.Pipeline(lang=lang)
def __init__(self, lang: str, annotators: Union[str, List[str]], use_corenlp=False,
             models_dir="", pretokenized=False, merge_one_sent=False, sep_syms=""):
    self.lang = lang
    self.corenlp_language_name = LANG_NAME_MAP[lang]
    self.use_corenlp = use_corenlp
    self.models_dir = models_dir
    if isinstance(annotators, str):
        self.annotators = annotators.split(",")
    else:
        self.annotators = annotators
    self.annotators_set = set(self.annotators)
    self.pretokenized = pretokenized
    self.merge_one_sent = merge_one_sent
    self.need_pos, self.need_lemma, self.need_dep = [
        (z in self.annotators_set) for z in ['pos', 'lemma', 'depparse']
    ]
    self.sep_syms_set = set(list(sep_syms))  # symbols to set apart by surrounding them with spaces
    if len(self.sep_syms_set) > 0:
        assert not self.pretokenized, "No processing for sep_syms if pretokenized!!"
    # -----
    # currently only support certain operations
    if use_corenlp:
        assert all((z in {'tokenize', 'ssplit', 'pos', 'lemma', 'depparse'})
                   for z in self.annotators)
        assert "CORENLP_HOME" in os.environ, "CORENLP_HOME not found, please set this for CORENLP!!"
        zlog(f"Start tokenizer with corenlp, home={os.environ['CORENLP_HOME']} and self = {self.__dict__}")
        # self.corenlp = None  # bound at running time
        self.nlp = None
    else:
        # no mwt since that may split words (which we ignore for now)
        # assert all((z in {'tokenize','mwt','pos','lemma','depparse'}) for z in self.annotators)
        assert all((z in {'tokenize', 'pos', 'lemma', 'depparse'})
                   for z in self.annotators)
        zlog(f"Start tokenizer with stanfordnlp, self = {self.__dict__}")
        # self.corenlp = None
        annotators_str = ",".join(self.annotators)
        if len(models_dir) > 0:
            self.nlp = stanfordnlp.Pipeline(processors=annotators_str, lang=self.lang,
                                            tokenize_pretokenized=pretokenized,
                                            models_dir=models_dir)
        else:  # use default dir
            self.nlp = stanfordnlp.Pipeline(processors=annotators_str, lang=self.lang,
                                            tokenize_pretokenized=pretokenized)
def __init__(self, lang="en", models_dir=""): # mostly using stanfordnlp self.lang = lang if len(models_dir) > 0: self.parser = stanfordnlp.Pipeline( processors='tokenize,mwt,pos,lemma,depparse', lang=lang, tokenize_pretokenized=True, models_dir=models_dir) else: self.parser = stanfordnlp.Pipeline( processors='tokenize,mwt,pos,lemma,depparse', lang=lang, tokenize_pretokenized=True)
def tag_relations(text, terms, bags, nlp=None):
    """ Modified version of tag relations that handles the special case of
    making predictions on new data without known relation labels.
    """
    # default to Stanford NLP pipeline wrapped in spaCy
    if nlp is None:
        snlp = stanfordnlp.Pipeline(lang="en")
        nlp = StanfordNLPLanguage(snlp)

    # preprocess with spaCy if needed
    if type(terms[0]) != spacy.tokens.doc.Doc:
        terms = [nlp(term) for term in terms]
    if (type(text) != spacy.tokens.doc.Doc and type(text) != spacy.tokens.span.Span):
        text = nlp(text)

    results = tag_terms(text, terms, nlp)
    tokenized_text = results["tokenized_text"]
    tagged_text = results["tags"]
    found_terms_info = results["found_terms"]

    found_terms = list(found_terms_info.keys())
    for i in range(len(found_terms) - 1):
        for j in range(i + 1, len(found_terms)):
            term_pair = (found_terms[i], found_terms[j])
            bags = add_relation(term_pair, found_terms_info, tokenized_text, bags)
            term_pair_reverse = (found_terms[j], found_terms[i])
            bags = add_relation(term_pair_reverse, found_terms_info, tokenized_text, bags)

    return bags
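# Hedged usage sketch: tag_terms and add_relation come from the surrounding
# module, so this only illustrates the calling convention for new, unlabeled
# text (the sentence and term list are illustrative):
#
#   bags = {}
#   bags = tag_relations("Mitochondria produce ATP.", ["mitochondria", "ATP"], bags)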
def tokenize(fpath, lang):
    content = open(fpath).read()
    paragraphs = re.split(r'\n+', content)
    res_sents = []
    res_pars = []
    res_pos = []
    start_par = 0
    # build the pipeline once; constructing it per paragraph is very slow
    nlp = stanfordnlp.Pipeline(lang=lang, processors="tokenize,mwt,pos")
    for par in paragraphs:
        par = par.strip()
        if not par:
            continue
        doc = stanfordnlp.Document(par)
        doc = nlp(doc)
        # print(doc.conll_file.conll_as_string())
        # print(doc.conll_file.sents)
        # skip multi-word token ranges (ids like '3-4') so tokens and tags stay aligned
        sents = [[token[1] for token in sent if '-' not in token[0]]
                 for sent in doc.conll_file.sents]
        pos = [[token[3] for token in sent if '-' not in token[0]]
               for sent in doc.conll_file.sents]
        res_sents.extend(sents)
        res_pos.extend(pos)
        length = sum(len(s) for s in sents)
        res_pars.append([start_par, start_par + length - 1])
        start_par = start_par + length
    return res_sents, res_pos, res_pars
def load_stanfordnlp_model(path, lang='en'):
    nlp = stanfordnlp.Pipeline(lang=lang, use_gpu=False,
                               processors='tokenize,mwt,pos,lemma,depparse',
                               pos_batch_size=1000, models_dir=path)
    return nlp
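# Quick usage sketch (assumes the English models were fetched beforehand,
# e.g. with stanfordnlp.download('en', resource_dir='/path/to/models')):
#
#   nlp = load_stanfordnlp_model('/path/to/models', lang='en')
#   doc = nlp("Barack Obama was born in Hawaii.")
#   for sent in doc.sentences:
#       for word in sent.words:
#           print(word.text, word.lemma, word.upos)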
def __clean_text(self, df):
    # paths to the Russian SynTagRus models; factored out to avoid repetition
    models = ('C:\\1Vadim\\IT\\Repositories\\KURSACH\\Project\\stanfordnlp'
              '\\stanfordnlp_resources\\ru_syntagrus_models')
    config = {
        'processors': 'tokenize,pos,lemma,depparse',  # comma-separated list of processors to use
        'lang': 'ru',  # language code for the language to build the Pipeline in
        'tokenize_model_path': models + '\\ru_syntagrus_tokenizer.pt',
        'pos_model_path': models + '\\ru_syntagrus_tagger.pt',
        'pos_pretrain_path': models + '\\ru_syntagrus.pretrain.pt',
        'lemma_model_path': models + '\\ru_syntagrus_lemmatizer.pt',
        'depparse_model_path': models + '\\ru_syntagrus_parser.pt',
        'depparse_pretrain_path': models + '\\ru_syntagrus.pretrain.pt'
    }
    snlp = stanfordnlp.Pipeline(**config)
    nlp = StanfordNLPLanguage(snlp)

    text_list = df["Text"].values
    lower_text_list = [text.lower() for text in text_list]

    clean_text_list = []
    for text in lower_text_list:
        doc = nlp(text)
        tokens = [token.lemma_ for token in doc
                  if not (token.is_punct or token.is_stop)]
        clean_text_list.append(tokens)
    return clean_text_list
def truecase(text):
    # tokenize the text into sentences
    sentences = sent_tokenize(text, language='english')
    # capitalize each sentence
    sentences_capitalized = [s.capitalize() for s in sentences]
    # join the sentences back, removing spaces before punctuation
    capitalized_text = re.sub(r" (?=[\.,'!?:;])", "", ' '.join(sentences_capitalized))
    # create a stanfordnlp pipeline with the following processors
    stf_nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos')
    # process the text
    doc = stf_nlp(capitalized_text)
    # capitalize words with the following parts of speech
    lst = [w.text.capitalize() if w.upos in ["PROPN", "NNS"] else w.text
           for sent in doc.sentences for w in sent.words]
    # join the list of words
    pos_capitalized = ' '.join(lst)
    # replace i, i'll, i'm, i've with the capitalized variants
    for pat, repl in replacements:
        pos_capitalized = re.sub(pat, repl, pos_capitalized)
    # remove the spaces before punctuation
    result = re.sub(r'\s+([?.!"\'])', r'\1', pos_capitalized)
    return result
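# Example call (assumes NLTK's punkt data and the English stanfordnlp models
# are installed, and that `replacements` is defined elsewhere as a list of
# (pattern, replacement) pairs):
#
#   print(truecase("yesterday i met dr. smith in new york."))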
def test_pretokenized():
    nlp = stanfordnlp.Pipeline(**{'processors': 'tokenize', 'models_dir': '.',
                                  'lang': 'en', 'tokenize_pretokenized': True})
    doc = nlp(EN_DOC_PRETOKENIZED)
    assert EN_DOC_PRETOKENIZED_GOLD_TOKENS == '\n\n'.join(
        [sent.tokens_string() for sent in doc.sentences])
    doc = nlp(EN_DOC_PRETOKENIZED_LIST)
    assert EN_DOC_PRETOKENIZED_LIST_GOLD_TOKENS == '\n\n'.join(
        [sent.tokens_string() for sent in doc.sentences])
def test_spacy_stanfordnlp(lang, models_dir):
    snlp = stanfordnlp.Pipeline(lang=lang, models_dir=models_dir)
    nlp = StanfordNLPLanguage(snlp)
    assert nlp.lang == "stanfordnlp_" + lang

    doc = nlp("Hello world! This is a test.")
    # fmt: off
    assert [t.text for t in doc] == ["Hello", "world", "!", "This", "is", "a", "test", "."]
    assert [t.lemma_ for t in doc] == ["hello", "world", "!", "this", "be", "a", "test", "."]
    assert [t.pos_ for t in doc] == ["INTJ", "NOUN", "PUNCT", "DET", "VERB", "DET", "NOUN", "PUNCT"]
    assert [t.tag_ for t in doc] == ["UH", "NN", ".", "DT", "VBZ", "DT", "NN", "."]
    assert [t.dep_ for t in doc] == ["root", "vocative", "punct", "nsubj", "cop", "det", "root", "punct"]
    assert [t.is_sent_start for t in doc] == [True, None, None, True, None, None, None, None]
    assert any([t.is_stop for t in doc])
    # fmt: on
    assert len(list(doc.sents)) == 2
    assert doc.is_tagged
    assert doc.is_parsed
    assert doc.is_sentenced

    docs = list(nlp.pipe(["Hello world", "This is a test"]))
    assert docs[0].text == "Hello world"
    assert [t.pos_ for t in docs[0]] == ["INTJ", "NOUN"]
    assert docs[1].text == "This is a test"
    assert [t.pos_ for t in docs[1]] == ["DET", "VERB", "DET", "NOUN"]
def _init_nlp(model_or_lang, is_tokenized, disable_sbd, use_stanfordnlp):
    if model_or_lang is None:
        model_or_lang = 'en' if use_stanfordnlp else 'en_core_web_sm'

    nlp = None
    if use_stanfordnlp:
        from spacy_stanfordnlp import StanfordNLPLanguage
        import stanfordnlp

        snlp = stanfordnlp.Pipeline(lang=model_or_lang, tokenize_pretokenized=is_tokenized)
        nlp = StanfordNLPLanguage(snlp)
    else:
        # initialize the spaCy model with a custom pipe,
        # taking 'is_tokenized' and 'disable_sbd' into account
        nlp = spacy.load(model_or_lang)
        if is_tokenized:
            nlp.tokenizer = nlp.tokenizer.tokens_from_list
        if disable_sbd:
            nlp.add_pipe(_prevent_sbd, name='prevent-sbd', before='parser')

    conllformatter = ConllFormatter(nlp)
    nlp.add_pipe(conllformatter, last=True)
    return nlp
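# Usage sketch: build a StanfordNLP-backed spaCy pipeline for raw English text
# and read back a CoNLL-U rendering via the ConllFormatter extension (the exact
# extension attribute name depends on the spacy_conll version in use):
#
#   nlp = _init_nlp(None, is_tokenized=False, disable_sbd=False, use_stanfordnlp=True)
#   doc = nlp("This is a test.")
#   print(doc._.conll_str)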
def conll_parse(sentences, file_path):
    config = {
        'processors': 'tokenize,lemma,pos,depparse',
        'tokenize_pretokenized': True
    }
    stanford_parser = stanfordnlp.Pipeline(**config)

    n_steps = len(sentences)
    import time
    start_time = time.time()
    with open(str(file_path), 'w') as f:
        for i, sentence in enumerate(list(sentences)):
            doc_obj = stanford_parser(sentence)
            f.write(doc_obj.conll_file.conll_as_string())
            if i % 100 == 0:
                completion = (i + 1) / n_steps
                comp_time = time.time() - start_time
                print(f'{completion*100:.1f}% complete. Elapsed time: {comp_time:.2f}s')
    print(f'Finished. Total computation time: {time.time() - start_time}')
def demo_nlp():
    global start_state
    global lang_model_dir
    print("StanfordNLP")
    print("Please stand by while we initialize the StanfordNLP pipeline...")
    pipeline = stanfordnlp.Pipeline(models_dir=lang_model_dir, lang='en', use_gpu=False)
    print()
    print("Enter sentence(s) to parse, or just press Return to exit.")
    while True:
        inp = input("Input: ")
        if inp == "":
            return
        if inp == "PLOT":
            plot.plot.present(start_state)
            continue
        inp = preprocess(inp)
        doc = pipeline(inp)
        goal = Requirements(start_state)
        for i in range(0, len(doc.sentences)):
            cmd = patterns.handle_input(doc.sentences[i])
            get_requirement(cmd, goal)
        take_actions(goal)
        if recent_objspec:
            print("recent_objspec:", recent_objspec)
def __init__(self):
    self.nlp = stanfordnlp.Pipeline(lang="id", use_gpu=False, silent=True)
    self.stemmer = StemmerFactory().create_stemmer()
    self.ner = get_entities
    # set up the POS tagger
    self.pos_tagger = nltk.tag.CRFTagger()
    self.pos_tagger.set_model_file('pretrained/pos_tagger/all_indo_man_tag_corpus_model.crf.tagger')
def __init__(self):
    self.tools = []
    self._n = 0
    self.pk_seg = pkuseg.pkuseg()
    self.thu_seg = thulac.thulac(seg_only=True)
    self.stanford_pipeline = stanfordnlp.Pipeline(processors='tokenize', lang='zh')
def pos_tagger(bundled_data, threshold):
    # combine the per-review lists of sentences into one flat list
    reviews = []
    for review in bundled_data:
        reviews.extend(review)

    # parser
    parser = stanfordnlp.Pipeline(processors="tokenize,mwt,lemma,pos")

    # extract sentences from reviews
    sentences_list = []
    for review in reviews:
        doc = parser(review)
        sentences_list.extend([sentence for sentence in doc.sentences])

    sampled_sentences = random.sample(sentences_list, threshold)

    parsed_texts = []
    for sentence in sampled_sentences:
        parsed_text = {}
        for i in range(len(sentence.words)):
            parsed_text[i] = (sentence.words[i].text, sentence.words[i].xpos)
        parsed_texts.append(parsed_text)

    with open("out/d_pos_tagged.json", "w") as json_file:
        json.dump(parsed_texts, json_file, indent=4, sort_keys=True)
def mrr_pairs(generated_captions_fn, occurrences_dir, heldout_pairs, split):
    # TODO
    with open(generated_captions_fn) as f:
        generated_captions = json.load(f)
    id2captions = {meta['image_id']: meta['captions'] for meta in generated_captions}

    config = {'use_gpu': False, 'tokenize_pretokenized': True}
    nlp_pipeline = stanfordnlp.Pipeline(lang='en', models_dir=STANFORDNLP_DIR, **config)

    mrr_scores = {}
    for pair in heldout_pairs:
        occurrences_fn = os.path.join(occurrences_dir, pair + ".json")
        occurrences_data = json.load(open(occurrences_fn, "r"))

        _, val_indices, test_indices = get_occurrences_splits([occurrences_fn])
        if split == "val2014":
            eval_indices = test_indices
        else:
            eval_indices = val_indices

        nouns = set(occurrences_data[NOUNS])
        if ADJECTIVES in occurrences_data:
            others = set(occurrences_data[ADJECTIVES])
            concept_type = "adj-noun"
        elif VERBS in occurrences_data:
            others = set(occurrences_data[VERBS])
            concept_type = "verb-noun"
        else:
            raise ValueError("No adjectives or verbs found in occurrences data!")

        mrr_score = calc_mrr(id2captions, eval_indices, nouns, others,
                             concept_type, occurrences_data, nlp_pipeline)
        pair = os.path.basename(occurrences_fn).split(".")[0]
        mrr_scores[pair] = mrr_score
    return mrr_scores
def making_parsed_tree(sentiment_code, file_name):
    # df_amazon, text, label, and sent_json are assumed to be module-level data.
    splited_sentence_first = []
    parsed_sentence_first = []
    # The CoreNLP server address was redacted in the source.
    pcn = StanfordCoreNLP('http://*****:*****')
    # Assumption: a stanfordnlp tokenization pipeline was built in the redacted
    # span above, since `nlp` is used below.
    nlp = stanfordnlp.Pipeline(processors='tokenize', lang='en')

    def about_symbol(text):
        # strip @ symbols and URLs from the tweet text
        text = re.sub(r"@", '', text)
        text = re.sub(r'http\S+', '', text)
        return text

    for a in tqdm(range(len(df_amazon))):
        tweet_txt = about_symbol(text[a])
        if label[a] == sentiment_code:
            if len(tweet_txt) > 3:
                tweet_txt = " ".join(tweet_txt.split())
                tweet_txt = contractions.fix(tweet_txt)
                doc = nlp(tweet_txt)
                splited_sentence_second = []
                parsed_sentence_second = []
                for sentence in doc.sentences:
                    temp = []
                    for token in sentence.tokens:
                        temp.append(token.text)
                    sum_text = " ".join(temp)
                    sum_text = about_symbol(sum_text)
                    output = pcn.annotate(sum_text, properties={
                        'annotators': 'parse',
                        'outputFormat': 'json'
                    })
                    parsed_sent = output['sentences'][0]['parse']
                    parsed_sent = " ".join(parsed_sent.split())
                    parsed_sent = parsed_sent.replace('(', '<')
                    parsed_sent = parsed_sent.replace(')', '>')
                    parsed_sentence_second.append(parsed_sent)
                    splited_sentence_second.append(sum_text)
                    # print(parsed_sent)
                splited_sentence_first.append(splited_sentence_second)
                parsed_sentence_first.append(parsed_sentence_second)

    sent_json['splited_sentence'] = []
    sent_json['parsed_sentence'] = []
    sent_json['original_sentence'] = []
    sent_json['splited_sentence'].append(splited_sentence_first)
    sent_json['parsed_sentence'].append(parsed_sentence_first)
    sent_json['original_sentence'].append(tweet_txt)

    with open(file_name, 'w') as out_file:
        json.dump(sent_json, out_file, indent=4)
def tokenizer(gene_texts, MAX_NB_WORDS):
    global nlp  # reuse the module-level pipeline if one was already built
    word_index = {}
    docs = []
    txt_count = 0
    index = 1
    if nlp is None:
        nlp = stanfordnlp.Pipeline(use_gpu=False)
    for text in gene_texts:
        txt_count += 1
        if txt_count % 100 == 0:
            print('[tokenized txt]:', txt_count)
        doc = nlp(text)
        docs.append(doc)
        txt_matrix = NLP.get_text_matrix(doc)  # doc matrix (array)
        # process this text matrix
        for i in range(len(txt_matrix)):
            sent_arr = txt_matrix[i]
            for j in range(len(sent_arr)):
                token = sent_arr[j].lower()
                # add to word_index
                if len(word_index) < MAX_NB_WORDS:
                    if token not in word_index:
                        word_index[token] = index
                        index += 1
    return word_index, docs
def get_org_name(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        # extract text from paragraphs and table rows
        doc_text = soup.find_all(['p', 'tr'])
        doc_text = ''.join([x.text for x in doc_text])
        doc = clean_text(doc_text, lower=False)
        nlp = spacy.load('en_core_web_sm')
        stf_nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos')
        doc = doc[:250].split()
        doc = ' '.join([x for x in doc if x.lower().find('newswire') == -1])
        orgs = set()
        spacy_doc = ' '.join(doc.split()).strip()
        nlp_doc = nlp(spacy_doc)
        for ent in nlp_doc.ents:
            if ent.label_ == 'ORG':
                orgs.add(ent.text)
        sf_doc = stf_nlp(doc)
        for sent in sf_doc.sentences:
            for word in sent.words:
                if word.xpos == 'NNP' and word.upos == 'PROPN':
                    orgs.add(word.text)
        return ', '.join(list(orgs))
def __init__(self) -> None:
    snlp = stanfordnlp.Pipeline(lang='en')  # stanfordnlp Python pipeline
    self.nlp = StanfordNLPLanguage(snlp)    # spaCy wrapper for snlp
    conllformatter = ConllFormatter(self.nlp)
    self.nlp.add_pipe(conllformatter, last=True)
    self.detokenizer = MosesDetokenizer()
    self.vanila_preprocessor = PreprocessorBase()
def lemmatize_texts(lemmatizer):
    entries = Entry.objects.filter(lemmatized='')
    if lemmatizer == 'stanford':
        texts = [(entry.text, entry.id) for entry in entries]
        snlp = stanfordnlp.Pipeline(lang='ru')
        nlp = StanfordNLPLanguage(snlp)
        for doc in tqdm.tqdm(nlp.pipe(texts, batch_size=100, as_tuples=True,
                                      disable=["tagger", "parser", "pos", "depparse"])):
            id = doc[1]
            lemmatized = ' '.join([token.lemma_ for token in doc[0]])
            entry = Entry.objects.get(id=id)
            entry.lemmatized = lemmatized
            entry.save()
    if lemmatizer == 'mystem':
        m = Mystem()
        for entry in tqdm.tqdm(entries):
            lemmas = m.lemmatize(entry.text)
            lemmatized = ''.join(lemmas)
            entry = Entry.objects.get(id=entry.id)
            entry.lemmatized = lemmatized
            entry.save()
def parseToconllu(data):
    data = data[:5]
    # this sets up a default neural pipeline in English
    nlp = stanfordnlp.Pipeline(models_dir='E:/anaconda/setup/Lib/site-packages/stanfordnlp')
    doc_text = ''
    for data_ in data:
        for i in range(len(data_['questions'])):
            # ensure each question ends with '?' and starts with a capital letter
            if not data_['questions'][i]['input_text'][-1] == '?':
                data_['questions'][i]['input_text'] += '?'
            if not data_['questions'][i]['input_text'][0].isupper():
                data_['questions'][i]['input_text'] = list(data_['questions'][i]['input_text'])
                data_['questions'][i]['input_text'][0] = data_['questions'][i]['input_text'][0].upper()
                data_['questions'][i]['input_text'] = ''.join(data_['questions'][i]['input_text'])
            doc_text += data_['questions'][i]['input_text'] + ' '
            # ensure each answer ends with '.' and starts with a capital letter
            if not data_['answers'][i]['input_text'][-1] == '.':
                data_['answers'][i]['input_text'] += '.'
            if not data_['answers'][i]['input_text'][0].isupper():
                data_['answers'][i]['input_text'] = list(data_['answers'][i]['input_text'])
                data_['answers'][i]['input_text'][0] = data_['answers'][i]['input_text'][0].upper()
                data_['answers'][i]['input_text'] = ''.join(data_['answers'][i]['input_text'])
            doc_text += data_['answers'][i]['input_text'] + ' '
        # doc_text += '\n'
    print('start parsing...')
    doc = nlp(doc_text)
    doc.write_conll_to_file('example.conllu')
def __init__(self):
    file = open("assign4_reviews.txt", "r")
    result = []
    input_text = file.readlines()
    # stanfordnlp.download('en')
    nlp = stanfordnlp.Pipeline()
    for i in range(0, len(input_text) - 1):
        doc = nlp(input_text[i])
        midresult = doc.conll_file.conll_as_string()
        midresult = midresult.split('\n')
        last_PROPN = ''
        last_Adj = ''
        for x in midresult:
            x = x.split('\t')
            if len(x) > 4 and x[3] != 'PUNCT':
                if x[3] == 'PROPN' or x[3] == 'NOUN':
                    last_PROPN = last_PROPN + " " + x[1]
                    splitted = last_PROPN.split(" ")
                    # keep only the most recent words of the noun phrase
                    if len(splitted) > 2:
                        splitted = splitted[1:]
                        last_PROPN = " ".join(splitted)
                if x[3] == 'ADJ':
                    # only pair the adjective with a noun seen since the last match
                    if last_PROPN != '':
                        result.append((last_PROPN, x[1]))
                        last_PROPN = ''
    self.result = result
def preprocess(models_dir, processors, extractive, cnn_dm_file, article_file,
               summary_file, output_train_files, output_test_files, split_ratio):
    """
    Runs the entire preprocessing pipeline, producing files ready to be used
    as input to the graph neural network.

    :param models_dir: The path to the stanfordnlp directory.
    :param processors: List of processors to use. Options: tokenize, mwt, pos, lemma, depparse.
    :param extractive: Whether to make an extractive summary. If True, the input file should contain the best ids.
    :param cnn_dm_file: File containing the articles and the summaries in jsonl format.
    :param article_file: Output file for the article graphs.
    :param summary_file: Output file for the summary graphs.
    :param output_train_files: Paths to save the training files. The first element is the training input, the second the expected output.
    :param output_test_files: Paths to save the validation files. The first element is the validation input, the second the expected output.
    :param split_ratio: The ratio of data used for training vs. validation.
    :return: None
    """
    from graph_transformations.preprocessor import main as stanford_preprocess
    if extractive:
        from graph_transformations.cnn_extractive_parser import main as cnn_process
    else:
        from graph_transformations.cnn_parser import main as cnn_process
    from graph_transformations.train_test_split import train_test_split
    import stanfordnlp

    if not os.path.exists(models_dir):
        stanfordnlp.download('en', resource_dir=models_dir)

    correct_processors = ["tokenize", "mwt", "pos", "lemma", "depparse"]
    incorrect = [i for i in processors if i not in correct_processors]
    if len(incorrect) != 0:
        raise ValueError("The following processor values are incorrect: {}".format(incorrect))
    if not os.path.exists(cnn_dm_file):
        raise FileNotFoundError("The input file is not found. {} not found".format(cnn_dm_file))

    pipeline = stanfordnlp.Pipeline(models_dir=models_dir, processors=processors)
    processed_file = "{}_processed.jsonl".format(os.path.splitext(cnn_dm_file)[0])
    stanford_preprocess(pipeline, cnn_dm_file, processed_file)

    dependency_file = "dep.jsonl"
    word_file = "words.jsonl"
    pos_file = "pos.jsonl"
    cnn_process(processed_file, article_file, summary_file, dependency_file,
                word_file, pos_file, dependency_file[:-1], word_file[:-1], pos_file[:-1])

    ratio = int(split_ratio) if split_ratio >= 1.0 else int(split_ratio * 100)
    train_test_split(article_file, output_train_files[0], output_test_files[0], ratio)
    train_test_split(summary_file, output_train_files[1], output_test_files[1], ratio)
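# Hypothetical invocation (all file names below are illustrative, not from the
# source; processors is passed as a list to satisfy the validation step above):
#
#   preprocess(models_dir="stanfordnlp_resources",
#              processors=["tokenize", "pos", "lemma", "depparse"],
#              extractive=True,
#              cnn_dm_file="cnn_dm.jsonl",
#              article_file="articles.jsonl",
#              summary_file="summaries.jsonl",
#              output_train_files=("train_in.jsonl", "train_out.jsonl"),
#              output_test_files=("val_in.jsonl", "val_out.jsonl"),
#              split_ratio=0.8)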
def test_missing_requirements():
    """
    Try to build several pipelines with bad configs and check thrown exceptions
    against gold exceptions.
    :return: None
    """
    # list of (bad config, list of gold ProcessorRequirementsExceptions that should be thrown) pairs
    bad_config_lists = [
        # missing tokenize
        (
            # input config
            {'processors': 'pos,depparse', 'models_dir': TEST_MODELS_DIR, 'lang': 'en'},
            # 2 expected exceptions
            [
                {'processor_type': 'POSProcessor',
                 'processors_list': ['pos', 'depparse'],
                 'provided_reqs': set([]),
                 'requires': set(['tokenize'])},
                {'processor_type': 'DepparseProcessor',
                 'processors_list': ['pos', 'depparse'],
                 'provided_reqs': set([]),
                 'requires': set(['tokenize', 'pos'])}
            ]
        ),
        # no pos when lemma_pos set to True
        (
            # input config
            {'processors': 'tokenize,mwt,lemma', 'models_dir': TEST_MODELS_DIR,
             'lang': 'en', 'lemma_pos': True},
            # 1 expected exception
            [
                {'processor_type': 'LemmaProcessor',
                 'processors_list': ['tokenize', 'mwt', 'lemma'],
                 'provided_reqs': set(['tokenize', 'mwt']),
                 'requires': set(['tokenize', 'pos'])}
            ]
        )
    ]

    # try to build each bad config, catch exceptions, check against gold
    pipeline_fails = 0
    for bad_config, gold_exceptions in bad_config_lists:
        try:
            stanfordnlp.Pipeline(**bad_config)
        except PipelineRequirementsException as e:
            pipeline_fails += 1
            assert isinstance(e, PipelineRequirementsException)
            assert len(e.processor_req_fails) == len(gold_exceptions)
            for processor_req_e, gold_exception in zip(e.processor_req_fails, gold_exceptions):
                # compare the thrown ProcessorRequirementsExceptions against gold
                check_exception_vals(processor_req_e, gold_exception)

    # check that pipeline building failed twice
    assert pipeline_fails == 2
def __init__(self):
    self.translator = Translator()
    self.nlp_de = stanfordnlp.Pipeline(lang='de')
    self.nlp_en = stanfordnlp.Pipeline(lang='en')
    self.prep_akk = ['bis', 'durch', 'für', 'gegen', 'ohne', 'um']
    self.prep_dat = ['aus', 'ausser', 'bei', 'nach', 'mit', 'seit', 'von', 'zu']
    self.prep_acc_dat = ['an', 'auf', 'hinter', 'in', 'neben', 'über', 'unter',
                         'von', 'zwischen']
    self.nominative = {'Masc': 'der', 'Fem': 'die', 'Neut': 'das'}
    self.accusative = {'Masc': 'den', 'Fem': 'die', 'Neut': 'das'}
    self.dative = {'Masc': 'dem', 'Fem': 'der', 'Neut': 'den'}
    self.genitive = {'Masc': 'des', 'Fem': 'der', 'Neut': 'des'}
def norm_seeds(lst, lang="da"):
    nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos,lemma', lang=lang)
    seeds = " ".join(lst)
    doc = nlp(seeds)
    seeds = [word.lemma.lower() for sent in doc.sentences for word in sent.words]
    return sorted(list(set(seeds)))
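# Example (assumes the Danish models have been downloaded, e.g. with
# stanfordnlp.download('da'); the seed words are illustrative):
#
#   print(norm_seeds(["hundene", "hunden"], lang="da"))
#   # -> a sorted, deduplicated list of lowercased lemmas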