def predict(self, prompt: str) -> Dict[str, float]:
    vocab_size = len(self.unigram_counts)
    prompt_tokens = tokenize(prompt)[(-self.history - 1):]
    prompt_tokens = [""] * max(0, self.n - 1 - len(prompt_tokens)) + prompt_tokens

    def mle(token: str) -> float:
        p_log = 0.0  # log(1.0); probabilities are accumulated as natural logs
        tokens = [*prompt_tokens, token]
        for ngram in ngrams(tokens, self.n):
            unigram_count = self.unigram_counts.get(ngram[0], 0)
            ngram_count = self.ngram_counts.get(ngram, 0)
            if self.smoothing is Smoothing.LAPLACE:
                p_log += math.log((ngram_count + 1) / (unigram_count + vocab_size))
            elif self.smoothing is Smoothing.GOOD_TURING:
                if unigram_count == 0:
                    p_log += math.log(1 / len(self.tokens))
                else:
                    p_log += math.log((ngram_count or (1 / len(self.tokens))) / unigram_count)
            else:
                if ngram_count == 0:
                    return 0.0
                p_log += math.log(ngram_count / unigram_count)
        return math.exp(p_log)  # invert the natural-log accumulation

    follower_odds = {k: mle(k) for k in self.unigram_counts}
    return collections.OrderedDict(
        sorted(follower_odds.items(), key=lambda item: item[1], reverse=True)
    )
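# A hypothetical round trip through the predict method above. The NGramModel
# class name and its constructor are assumptions; only fit/predict appear in
# this section (the matching fit that fills unigram_counts/ngram_counts is
# shown further down).
def example_mle_predict_usage():
    model = NGramModel(n=2, smoothing=Smoothing.LAPLACE)  # assumed class
    model.fit("the cat sat on the mat . the cat ran away .")
    ranked = model.predict("the cat")
    for token, p in list(ranked.items())[:3]:
        print(token, round(p, 4))  # most probable followers first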
def fit(self, text: str) -> None:
    tokens = tokenize(text)
    self.ngram_follower_counts = defaultdict(lambda: defaultdict(int))
    for ngram in ngrams(tokens, self.n + 1):
        ngram, follower = ngram[:self.n], ngram[-1]
        self.ngram_follower_counts[ngram][follower] += 1
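# Both n-gram models in this section lean on tokenize and ngrams helpers that
# are never shown. A minimal sketch of what they might look like, assuming
# plain whitespace tokenization and tuple-valued n-grams (names and behavior
# are assumptions, not the project's actual helpers):
from typing import Iterator, List, Tuple

def tokenize(text: str) -> List[str]:
    # Assumed: lowercase, whitespace-delimited tokens.
    return text.lower().split()

def ngrams(tokens: List[str], n: int) -> Iterator[Tuple[str, ...]]:
    # Assumed: sliding window of n consecutive tokens, yielded as tuples.
    for i in range(len(tokens) - n + 1):
        yield tuple(tokens[i : i + n])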
def tokenize_simple_test_2():
    string = "this is a simple test"
    gold = ["this", "is", "a", "simple", "test"]
    tokens = nlp.tokenize(string)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
def ptb_inputs_test11(self):
    string = "1. Buy a new Chevrolet (37%-owned in the U.S..) . 15%"
    gold = [
        "1", ".", "Buy", "a", "new", "Chevrolet", "-LRB-", "37", "%", "-",
        "owned", "in", "the", "U.S.", ".", "-RRB-", ".", "15", "%",
    ]
    tokens = nlp.tokenize(string)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def get_skipgram(tweets, out_folder, nIn, kIn):
    # Tokenization and preprocessing (if not yet done) must happen here: when
    # the analyzer receives a callable, TfidfVectorizer performs no
    # tokenization of its own (see the scikit-learn documentation).
    tweet_tokenized = []
    for t in tweets:
        tweet_tokenized.append(nlp.tokenize(t))
    skipper = functools.partial(skipgrams, n=nIn, k=kIn)
    vectorizer = TfidfVectorizer(
        analyzer=skipper,
        # stop_words=nlp.stopwords,  # We do better when we keep stopwords
        use_idf=True,
        smooth_idf=False,
        norm=None,  # no normalization is applied to the output vectors
        decode_error='replace',
        max_features=10000,
        min_df=5,
        max_df=0.501)

    # for t in cleaned_tweets:
    #     tweetTokens = word_tokenize(t)
    #     skipgram_feature_matrix.append(list(skipper(tweetTokens)))

    # Fit the text into the vectorizer.
    logger.info("\tgenerating skip-gram vectors, n={}, k={}, {}".format(
        nIn, kIn, datetime.datetime.now()))
    tfidf = vectorizer.fit_transform(tweet_tokenized).toarray()
    logger.info("\t\t complete, dim={}, {}".format(tfidf.shape,
                                                   datetime.datetime.now()))

    vocab = {v: i for i, v in enumerate(vectorizer.get_feature_names())}
    idf_vals = vectorizer.idf_
    idf_dict = {i: idf_vals[i] for i in vocab.values()}  # keys are indices; values are IDF scores
    pickle.dump(vocab,
                open(out_folder + "/" + SKIPGRAM_FEATURES_VOCAB + ".pk", "wb"))
    return tfidf, vocab
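# A usage sketch for get_skipgram, assuming nltk.util.skipgrams is the
# `skipgrams` callable it partially applies (the tweet list and output folder
# are invented for illustration). Note the min_df=5 / max_df=0.501 settings:
# a toy corpus has to be large and varied enough for any features to survive.
def example_get_skipgram_usage():
    tweets = ["the cat sat on the mat"] * 6 + ["a dog ran over a hill"] * 6
    # n=2, k=1: bigrams allowed to skip at most one intervening token.
    tfidf, vocab = get_skipgram(tweets, out_folder=".", nIn=2, kIn=1)
    print(tfidf.shape, len(vocab))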
def ptb_inputs_test13(self):
    string = "Diamond (``Not even the chair'') lives near Udaipur " \
             "(84km). {1. A potential Palmer trade:}"
    gold = [
        "Diamond", "-LRB-", "``", "Not", "even", "the", "chair", "''",
        "-RRB-", "lives", "near", "Udaipur", "-LRB-", "84km", "-RRB-", ".",
        "-LCB-", "1", ".", "A", "potential", "Palmer", "trade", ":", "-RCB-",
    ]
    tokens = nlp.tokenize(string)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def ptb_inputs_test12(self):
    string = "I like you ;-) but do you care :(. I'm happy ^_^ but " \
             "shy (x.x)!"
    gold = [
        "I", "like", "you", ";--RRB-", "but", "do", "you", "care",
        ":-LRB-", ".", "I", "'m", "happy", "^_^", "but", "shy",
        "-LRB-x.x-RRB-", "!",
    ]
    tokens = nlp.tokenize(string)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def ptb_inputs_test17(self):
    string = "Kenneth liked Windows 3.1, Windows 3.x, and Mesa A.B " \
             "as I remember things."
    gold = [
        "Kenneth", "liked", "Windows", "3.1", ",", "Windows", "3.x", ",",
        "and", "Mesa", "A.B", "as", "I", "remember", "things", ".",
    ]
    tokens = nlp.tokenize(string)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def ptb_inputs_test15(self):
    string = "You can get a B.S. or a B. A. or a Ph.D (sometimes a " \
             "Ph. D) from Stanford."
    gold = [
        "You", "can", "get", "a", "B.S.", "or", "a", "B.", "A.", "or",
        "a", "Ph.D", "-LRB-", "sometimes", "a", "Ph.", "D", "-RRB-",
        "from", "Stanford", ".",
    ]
    tokens = nlp.tokenize(string)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def ptb_inputs_test22(self):
    string = "I like: \u2022wine, \u0095cheese, \u2023salami, & " \
             "\u2043speck."
    gold = [
        "I", "like", ":", "\u2022", "wine", ",", "\u2022", "cheese", ",",
        "\u2023", "salami", ",", "&", "\u2043", "speck", ".",
    ]
    tokens = nlp.tokenize(string)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def tokenize_sgml_test_2_no_normalize():
    nlp.get_global_PTB_config().strict_ptb3 = False
    sent2 = "Panasonic brand products are produced by Samsung Electronics " \
            "Co. Ltd. Sanyo products aren't."
    gold = [
        "Panasonic", "brand", "products", "are", "produced", "by",
        "Samsung", "Electronics", "Co.", "Ltd.", ".", "Sanyo", "products",
        "are", "n't", ".",
    ]
    tokens = nlp.tokenize(sent2)
    print(tokens)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def process_nlp_sentence(s_in):
    out = []
    # Get tokenization and dependencies
    tokens = nlp.tokenize(s_in)
    dependencies = dependency(s_in)
    # All tokens are TOKEN; tokens relate concepts
    for t in tokens:
        out.append("<\"{}\" --> TOKEN>.".format(t))
        out.append("<(*,\"{0}\",{0}) --> RELATES>.".format(t))
    # The tokens together are a sentence
    sent = "(*,\"{}\")".format("\", \"".join(tokens))
    out.append("<{} --> SENTENCE>.".format(sent))
    out.append("<{} --> (*,USER,SAYS)>. :|:".format(sent))
    out.append("<{} <-> {}>.".format(sent, quote(s_in)))  # `raw` was undefined; s_in is the raw input
    # Process dependencies
    for D in dependencies:
        (t1, pos1), d, (t2, pos2) = D
        # Get all terms separately
        i1 = "<(*,{},{}) --> INSIDE>".format(t1, sent)  # term for t1 in sentence
        i2 = "<(*,{},{}) --> INSIDE>".format(t2, sent)  # term for t2 in sentence
        p1 = "<{} --] {}>.".format(i1, pos1)  # i1 has part of speech pos1
        p2 = "<{} --] {}>.".format(i2, pos2)  # i2 has part of speech pos2
        d0 = "<(&&,{},{}) --> {}>.".format(i1, i2, d)  # i1 and i2 have dependency d
        out += [p1, p2, d0]  # output p1, p2, and d0
    return out
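# To make the Narsese-style output concrete, a hypothetical trace for a
# two-token sentence. The tokens and the single nsubj dependency are invented;
# the real output depends on the nlp.tokenize and dependency implementations.
def example_process_nlp_sentence_output():
    # Suppose nlp.tokenize("cats sleep") -> ["cats", "sleep"] and
    # dependency("cats sleep") -> [(("cats", "NOUN"), "nsubj", ("sleep", "VERB"))].
    for statement in process_nlp_sentence("cats sleep"):
        print(statement)
    # <"cats" --> TOKEN>.
    # <(*,"cats",cats) --> RELATES>.
    # <"sleep" --> TOKEN>.
    # <(*,"sleep",sleep) --> RELATES>.
    # <(*,"cats", "sleep") --> SENTENCE>.
    # ... followed by the USER-SAYS event, the <-> equivalence, and the
    # part-of-speech and dependency statements for the nsubj pair.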
def testTokenize_singleSentenceWithPunctuation(self):
    tokens = nlp.tokenize(
        "As far as I can see, this is a pipe made in 1965.")
    self.assertEqual(tokens, [
        "As", "far", "as", "I", "can", "see", ",", "this", "is", "a",
        "pipe", "made", "in", "1965."
    ])
def ptb_inputs_test14(self):
    string = "No. I like No. 24 and no.47."
    gold = ["No", ".", "I", "like", "No.", "24", "and", "no.", "47", "."]
    tokens = nlp.tokenize(string)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def ptb_inputs_test1(self):
    string = "This is a sentence."
    gold = ["This", "is", "a", "sentence", "."]
    tokens = nlp.tokenize(string)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def vocabulary(data):
    result = set()
    for record in data:
        text = str(record['text']) + ' ' + str(record['summary'])
        tokens = nlp.tokenize(text)
        for token in tokens:
            result.add(token.lower())
    return result
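# A quick sketch of how vocabulary might be called, assuming each record is a
# dict with 'text' and 'summary' keys (the sample data is invented).
def example_vocabulary_usage():
    data = [
        {"text": "The cat sat.", "summary": "Cat sits"},
        {"text": "Dogs bark loudly!", "summary": "Dog noise"},
    ]
    vocab = vocabulary(data)
    print(sorted(vocab))  # lowercased union of tokens from both fields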
def ptb_inputs_test4(self):
    string = "The Iron Age (ca. 1300 – ca. 300 BC)."
    gold = ["The", "Iron", "Age", "-LRB-", "ca.", "1300", "--", "ca.",
            "300", "BC", "-RRB-", "."]
    tokens = nlp.tokenize(string)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def ptb_inputs_test18(self):
    string = "I like programming in F# more than C#."
    gold = ["I", "like", "programming", "in", "F#", "more", "than", "C#", "."]
    tokens = nlp.tokenize(string)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def ptb_inputs_test16(self):
    string = "@Harry_Styles didn`t like Mu`ammar al-Qaddafi"
    gold = ["@Harry_Styles", "did", "n`t", "like", "Mu`ammar", "al-Qaddafi"]
    tokens = nlp.tokenize(string)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def ptb_inputs_test20(self):
    string = "I lived in O\u2019Malley and read OK! Magazine."
    gold = ["I", "lived", "in", "O'Malley", "and", "read", "OK!", "Magazine", "."]
    tokens = nlp.tokenize(string)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def ptb_inputs_test5(self):
    string = "Indo\u00ADnesian ship\u00ADping \u00AD"
    gold = ["Indonesian", "shipping", "-"]
    tokens = nlp.tokenize(string)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def get_website_html_pages_tokens(website_names, filter_type):
    result = {}
    html_doc_id = 0
    for website_name in website_names:
        with open(f'classifier/html_docs/{website_name}_html_docs_complete.json',
                  'r', encoding='utf-8') as f:
            html_docs_list = json.load(f)  # the with block closes the file
        for html_doc in html_docs_list:
            if filter_type == 'true' and html_doc[0] and '<!DOCTYPE' in html_doc[0] and html_doc[1] == True:
                result[html_doc_id] = [html_doc[2], html_doc[3], nlp.tokenize(html_doc[0])]
            elif filter_type == 'all' and html_doc[0] and '<!DOCTYPE' in html_doc[0]:
                result[html_doc_id] = [html_doc[2], html_doc[3], nlp.tokenize(html_doc[0])]
            html_doc_id += 1
        print(f'Get {website_name.upper()} tokens.')
    return result
def ptb_inputs_test6(self):
    string = "Gimme a phone, I'm gonna call."
    gold = ["Gim", "me", "a", "phone", ",", "I", "'m", "gon", "na", "call", "."]
    tokens = nlp.tokenize(string)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def ptb_inputs_test21(self):
    # \u0092 is an invalid Unicode codepoint, but it is interpreted as in cp1252.
    string = "I lived in O\u0092Malley and read OK! Magazine."
    gold = ["I", "lived", "in", "O'Malley", "and", "read", "OK!", "Magazine", "."]
    tokens = nlp.tokenize(string)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def tokenize_sgml_test_1_no_normalize():
    nlp.get_global_PTB_config().normalize_parentheses = False
    nlp.get_global_PTB_config().normalize_brackets = False
    sent1 = (
        "Significant improvements in peak FEV1 were demonstrated "
        "with tiotropium/olodaterol 5/2 \u03BCg (p = 0.008), 5/5 \u03BCg "
        "(p = 0.012), and 5/10 \u03BCg (p < 0.0001) versus tiotropium "
        "monotherapy [51]."
    )
    gold = [
        "Significant", "improvements", "in", "peak", "FEV1", "were",
        "demonstrated", "with", "tiotropium/olodaterol", "5/2", "\u03BCg",
        "(", "p", "=", "0.008", ")", ",", "5/5", "\u03BCg", "(", "p", "=",
        "0.012", ")", ",", "and", "5/10", "\u03BCg", "(", "p", "<",
        "0.0001", ")", "versus", "tiotropium", "monotherapy", "[", "51",
        "]", ".",
    ]
    tokens = nlp.tokenize(sent1)
    # assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        print(token, gold_token.encode("utf-8"))
        assert bytes(token) == gold_token.encode("utf-8")
def tokenize_sgml_test_10_no_normalize():
    sent10 = "<[email protected]> [email protected] " \
             "<*****@*****.**>"
    gold = ["<[email protected]>", "*****@*****.**", "<*****@*****.**>"]
    tokens = nlp.tokenize(sent10)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
def filter_english(data, min_english=MIN_ENGLISH):
    """Remove songs that are mostly non-English."""
    rows = []
    song_words = []
    for i, row in data.iterrows():
        text = row.song_darklyrics.strip()
        words = tokenize(text)
        english_words = tokenize(text, english_only=True)
        is_english = len(english_words) > min_english * len(words)
        if is_english:
            rows.append(i)
            song_words.append(' '.join(english_words))
    print('Non-English songs removed:', len(data) - len(rows))
    data = data.loc[rows]
    data['song_words'] = song_words
    return data
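# A usage sketch with a toy DataFrame. The song_darklyrics column name comes
# from the function itself; the english_only flag on tokenize and the
# MIN_ENGLISH default are assumed from the surrounding project.
def example_filter_english_usage():
    import pandas as pd

    songs = pd.DataFrame({
        "song_darklyrics": [
            "fire and steel in the night",   # mostly English: kept
            "feuer und stahl in der nacht",  # mostly German: dropped
        ]
    })
    filtered = filter_english(songs, min_english=0.5)
    print(filtered["song_words"].tolist())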
def fit(self, text: str) -> None:
    self.tokens = tokenize(text)
    self.unigram_counts = dict()
    for unigram in self.tokens:
        self.unigram_counts[unigram] = self.unigram_counts.get(unigram, 0) + 1
    self.ngram_counts = dict()
    for ngram in ngrams(self.tokens, self.n):
        self.ngram_counts[ngram] = self.ngram_counts.get(ngram, 0) + 1
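# The same counting can be written more compactly with collections.Counter.
# An equivalent sketch (behavior-identical, assuming the same tokenize and
# ngrams helpers; Counter is a dict subclass, so the .get() lookups in
# predict still work unchanged):
from collections import Counter

def fit(self, text: str) -> None:
    self.tokens = tokenize(text)
    self.unigram_counts = Counter(self.tokens)
    self.ngram_counts = Counter(ngrams(self.tokens, self.n))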
def ptb_inputs_test8(self):
    string = "I said at 4:45pm."
    gold = ["I", "said", "at", "4:45", "pm", "."]
    tokens = nlp.tokenize(string)
    print("SYSTEM:", tokens)
    print("GOLD :", gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def ptb_inputs_test23(self):
    string = "I don't give a f**k about your sh*tty life."
    gold = ["I", "do", "n't", "give", "a", "f", "**", "k", "about",
            "your", "sh", "*", "tty", "life", "."]
    tokens = nlp.tokenize(string)
    print("SYSTEM:", tokens)
    print("GOLD :", gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def tokenize_sgml_test_4_no_normalize():
    nlp.get_global_PTB_config().normalize_spaces = False
    sent4 = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Strict//EN" ' \
            '"http://www.w3.org/TR/html4/strict.dtd">'
    gold = ['<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Strict//EN" '
            '"http://www.w3.org/TR/html4/strict.dtd">']  # spaces go to \u00A0
    tokens = nlp.tokenize(sent4)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
def ptb_inputs_test10(self):
    string = "You `paid' US$170,000?!\nYou should've paid only$16.75."
    gold = ["You", "`", "paid", "'", "US$", "170,000", "?!", "You",
            "should", "'ve", "paid", "only", "$", "16.75", "."]
    tokens = nlp.tokenize(string)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def get_website_html_pages_tokens(website_name):
    with open(f'classifier/html_docs/{website_name}_html_docs.json',
              'r', encoding='utf-8') as f:
        html_docs_list = json.load(f)  # the with block closes the file
    for html_doc in html_docs_list:
        if html_doc[0] and '<!DOCTYPE' in html_doc[0]:
            X_TRAIN.append(nlp.tokenize(html_doc[0]))
            Y_TRAIN.append(html_doc[1])
def tokenize():
    data = request.get_data()
    if not data:
        return json_error('empty request')
    try:
        text = data.decode('utf-8')
    except UnicodeDecodeError as err:
        return json_error(str(err))
    return jsonify(tokens=nlp.tokenize(text))
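# A client-side sketch for exercising the Flask view above, assuming it is
# mounted at /tokenize on a local development server (the URL and port are
# assumptions; the route is not shown in this section).
def example_tokenize_endpoint_usage():
    import requests

    resp = requests.post("http://localhost:5000/tokenize",
                         data="This is a sentence.".encode("utf-8"))
    print(resp.json()["tokens"])  # e.g. ["This", "is", "a", "sentence", "."]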
def ptb_inputs_test24(self):
    string = "First sentence.... Second sentence."
    # The tests that come with CoreNLP state that ". . . . Second" should
    # be "...", ".", "Second". However, the actual CoreNLP tokenizer
    # and our tokenizer produce "...", "Second", so I am going with that.
    gold = ["First", "sentence", "...", "Second", "sentence", "."]
    tokens = nlp.tokenize(string)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def tokenize_sgml_test_8_no_normalize():
    nlp.get_global_PTB_config().escape_forward_slash_asterisk = False
    sent8 = "<a href=\"http:\\\\it's\\here\"> <quote orig_author='some " \
            "\"dude'/> <not sgmltag"
    gold = ['<a href="http:\\\\it\'s\\here">',
            "<quote orig_author='some \"dude'/>",
            "<", "not", "sgmltag"]
    tokens = nlp.tokenize(sent8)
    print(tokens)
    print(gold)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
def predict(self, prompt: str) -> Dict[str, float]:
    prompt_ngram = tuple(tokenize(prompt))[-self.n:]
    prompt_ngram = ("",) * max(0, self.n - len(prompt_ngram)) + prompt_ngram
    follower_odds: Dict[str, int] = defaultdict(int)
    for neighbor in self.ngram_follower_counts.keys():
        neighbor_distance = ngram_distance(prompt_ngram, neighbor, self.metrics)
        for follower, follower_count in self.ngram_follower_counts[neighbor].items():
            follower_odds[follower] += follower_count * (self.n - neighbor_distance)
    return collections.OrderedDict(
        sorted(follower_odds.items(), key=lambda item: item[1], reverse=True)
    )
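# This predict depends on an ngram_distance helper that never appears in this
# section. One plausible minimal sketch, assuming a simple positional mismatch
# count (the real helper presumably uses the `metrics` argument to weight
# per-position distances):
from typing import Tuple

def ngram_distance(a: Tuple[str, ...], b: Tuple[str, ...], metrics=None) -> int:
    # Assumed behavior: count positions where the two contexts disagree,
    # yielding a distance in [0, n]. `metrics` is accepted but unused here.
    return sum(1 for x, y in zip(a, b) if x != y)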
def tokenize_sgml_test_5_no_normalize():
    sent5 = "Hi! <foo bar=\"baz xy = foo !$*) 422\" > <?PITarget " \
            "PIContent?> <?PITarget PIContent> Hi!"
    gold = [
        "Hi", "!", "<foo bar=\"baz xy = foo !$*) 422\" >",
        "<?PITarget PIContent?>", "<?PITarget PIContent>", "Hi", "!",
    ]
    tokens = nlp.tokenize(sent5)
    print(tokens)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def tokenize_sgml_test_6_no_space_normalize():
    sent6 = (
        '<?xml version="1.0" encoding="UTF-8" ?>\n<?xml-stylesheet '
        'type="text/xsl" href="style.xsl"?>\n<book '
        'xml:id="simple_book" '
        'xmlns="http://docbook.org/ns/docbook" version="5.0">\n'
    )
    gold = [
        '<?xml version="1.0" encoding="UTF-8" ?>',
        '<?xml-stylesheet type="text/xsl" href="style.xsl"?>',
        '<book xml:id="simple_book" '
        'xmlns="http://docbook.org/ns/docbook" version="5.0">',
    ]
    tokens = nlp.tokenize(sent6)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
def tokenize_sgml_test_3_no_normalize():
    nlp.get_global_PTB_config().normalize_parentheses = False
    sent3 = (
        "Oesophageal acid exposure (% time <pH 4) was similar in "
        "patients with or without complications (19.2% v 19.3% p>0.05)."
    )
    gold = [
        "Oesophageal", "acid", "exposure", "(", "%", "time", "<", "pH",
        "4", ")", "was", "similar", "in", "patients", "with", "or",
        "without", "complications", "(", "19.2", "%", "v", "19.3", "%",
        "p", ">", "0.05", ")", ".",
    ]
    tokens = nlp.tokenize(sent3)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
def tokenize_sgml_test_7_no_normalize():
    sent7 = (
        '<chapter xml:id="chapter_1"><?php echo $a; ?>\n<!-- This '
        'is an SGML/XML comment "Hi!" -->\n<p> </p> <p-fix / >'
    )
    gold = [
        '<chapter xml:id="chapter_1">',
        "<?php echo $a; ?>",
        '<!-- This is an SGML/XML comment "Hi!" -->',
        "<p>", "</p>", "<p-fix / >",
    ]
    tokens = nlp.tokenize(sent7)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
def ptb_inputs_test7(self):
    string = '"John & Mary\'s dog," Jane thought (to herself).\n"' \
             "What a #$%!\na- ``I like AT&T''.\""
    gold = [
        "``", "John", "&", "Mary", "'s", "dog", ",", "''", "Jane",
        "thought", "-LRB-", "to", "herself", "-RRB-", ".", "``", "What",
        "a", "#", "$", "%", "!", "a", "-", "``", "I", "like", "AT&T",
        "''", ".", "''",
    ]
    tokens = nlp.tokenize(string)
    print("SYSTEM:", tokens)
    print("GOLD :", gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
def tokenize_sgml_test_11_no_normalize():
    # This is a MUC7 document.
    sent11 = "<DOC> <DOCID> nyt960102.0516 </DOCID><STORYID cat=w " \
             "pri=u> A0264 </STORYID> <SLUG fv=ttj-z> "
    gold = [
        "<DOC>", "<DOCID>", "nyt960102", ".0516", "</DOCID>",
        "<STORYID cat=w pri=u>", "A0264", "</STORYID>", "<SLUG fv=ttj-z>",
    ]
    tokens = nlp.tokenize(sent11)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")