def timexTagAndTokenizeText(self, altText=None):
    """Two steps are required in this method, so if altText is specified,
    all steps are done inside the if statement, so that incorrect dict
    entries aren't stored."""
    if altText is not None:
        raw = altText
        altOutputStep1 = self.timexTagText(raw)
        altOutputStep2 = self.wordTokenizeText(altOutputStep1)
        time_tagged_and_tokenizedText = MWETokenizer(
            mwes=[('<', '/TIMEX2', '>'), ('<', 'TIMEX2', '>')],
            separator='').tokenize(altOutputStep2)
        return time_tagged_and_tokenizedText
    else:
        # Tag all temporal expressions with TIMEX2 tags. No need to open the
        # file here, because it is opened in timexTagText().
        tagged = self.timexTagText()
        # Word-tokenize all of the tagged text.
        word_tagged = self.wordTokenizeText(tagged)
        # Consolidate the broken-apart TIMEX2 tags into single "words".
        if self.textList.get('timexTagAndTokenizeText') is None:
            self.textList['timexTagAndTokenizeText'] = [
                MWETokenizer(mwes=[('<', '/TIMEX2', '>'), ('<', 'TIMEX2', '>')],
                             separator='').tokenize(x)
                for x in word_tagged
            ]
        print(self.textList.get('timexTagAndTokenizeText'))
        return self.textList.get('timexTagAndTokenizeText')
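# A minimal sketch of why the TIMEX2 tags above need consolidating: word
# tokenization splits "<TIMEX2>" into '<', 'TIMEX2', '>', and an MWETokenizer
# with separator='' fuses those pieces back into one token. The sample token
# list is hypothetical.
from nltk.tokenize import MWETokenizer

merger = MWETokenizer(mwes=[('<', '/TIMEX2', '>'), ('<', 'TIMEX2', '>')],
                      separator='')
tokens = ['<', 'TIMEX2', '>', 'yesterday', '<', '/TIMEX2', '>', 'he', 'left']
print(merger.tokenize(tokens))
# ['<TIMEX2>', 'yesterday', '</TIMEX2>', 'he', 'left']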
def tokenize(multi_word_queries, text):
    """Returns a list of words that make up the text.
    Params: {multi_word_queries: List[str], text: String}
    Returns: List
    """
    lower_case = text.lower()
    tokenizer = RegexpTokenizer(
        r'not\s+very\s+[a-z]+|not\s+[a-z]+|no\s+[a-z]+|[a-z]+')
    result = tokenizer.tokenize(lower_case)
    multi_tokenizer = MWETokenizer([('working', 'out'), ('coffee', 'shops'),
                                    ('average', 'prices'), ('union', 'square'),
                                    ('real', 'estate'), ('ice', 'cream'),
                                    ('whole', 'foods'), ('co', 'op'),
                                    ('wall', 'street'), ('world', 'trade'),
                                    ('high', 'school'), ('dim', 'sum'),
                                    ('empire', 'state'), ('high', 'rise'),
                                    ('walk', 'ups')])
    if len(multi_word_queries) > 0:
        for tok in multi_word_queries:
            if len(tok.split('_')) > 1:
                multi_tokenizer.add_mwe(tuple(tok.split('_')))
    # Add neighborhood names.
    for n in neighborhood_name_phrases:
        multi_tokenizer.add_mwe(tuple(n.split('_')))
    result2 = multi_tokenizer.tokenize(result)
    return result2
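# A small check of the negation-aware pattern above (alternation order
# matters: 'not very X' must be tried before 'not X'). Sketch with a
# hypothetical input sentence.
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(
    r'not\s+very\s+[a-z]+|not\s+[a-z]+|no\s+[a-z]+|[a-z]+')
print(tokenizer.tokenize("the coffee is not very good and no fun"))
# ['the', 'coffee', 'is', 'not very good', 'and', 'no fun']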
def __init__(self, locations):
    self.tokenizer = MWETokenizer()
    self.time_tagger = TimeTagger()
    for a in locations:
        self.tokenizer.add_mwe(a.split())
    # Rules defined
    self.specials = {
        "ACTIVITY": activities.union(["driving", "flight"]),
        "REGION": regions,
        "KEYWORD": [word for word in all_keywords if ' ' in word],
        "LOCATION": locations,
        "QUANTITY": ["at least", "more than", "less than", "at most",
                     "not more than", "a number of"],
        "IN": ["in front of", "called"],
        "NN": [phrase.replace("_", " ") for phrase in list(phrases.keys())],
        "SPACE": ["living room", "fastfood restaurant", "restaurant kitchen",
                  "restaurant", "dining hall", "food court", "butchers shop",
                  "restaurant patio", "coffee shop", "room", "hotel room",
                  "kitchen", "office", "airport", "salon"],
        "POSITION": ["side", "foreground", "background", "right", "left",
                     "image"],
        "TOBE": ["am", "is", "are", "be", "is being", "am being",
                 "are being", "being"],
        "WAS": ["was", "were", "had been", "have been"],
        "TIMEPREP": ["prior to", "then", "earlier than", "later than",
                     "sooner than"],
        "POSITION_PREP": ["near", "distance to"],
    }
    for tag in self.specials:
        for keyword in self.specials[tag]:
            if ' ' in keyword:
                self.tokenizer.add_mwe(keyword.split())
def __init__(self, ents=None, tag2ent=None,
             collocations=special_collocations, appos=collocations.appos):
    self.__tokenizer = TweetTokenizer(reduce_len=True)
    self.__collocations = collocations
    self.__tknzr = MWETokenizer(self.__collocations)
    self.__lemm = WordNetLemmatizer()
    self.__nlp = spacy.load("en_core_web_sm")
    if ents is None:
        self.__ents = {}
        if tag2ent is not None:
            raise ValueError(
                "ents and tag2ent should both be None or both not None")
        self.__tag2ent = {}
    else:
        if tag2ent is None:
            raise ValueError(
                "ents and tag2ent should both be None or both not None")
        self.__ents = ents
        self.__tag2ent = tag2ent
    self.__appos = appos
    for a in appos:
        self.__appos[a] = '_'.join(self.__appos[a].split())
    self.__punctuation = punctuation + "“”‘’‚"
    self.__stop_symbols = '←↓→↑'
def segment(text, userdict_filepath="userdict2.txt",
            stopwords_filepath='stopwords.txt'):
    import nltk
    stopwords = [
        line.strip().lower()
        for line in open(stopwords_filepath, 'r', encoding='utf-8').readlines()
    ]
    final_list = []
    temp_list = []
    with open(userdict_filepath, 'r', encoding='utf-8') as f:
        for line in f:
            temp_list.append(line.strip(' ').strip('\n'))
    # The file is closed automatically when the `with` block exits.
    temp = []
    for line in temp_list:
        for li in line.lower().split(' '):
            if len(li) != 0:
                temp.append(li.strip('\t'))
        final_list.append(tuple(temp))
        temp.clear()
    userdict_list = final_list
    tokenizer = MWETokenizer(userdict_list, separator=' ')
    seg_list = tokenizer.tokenize(
        nltk.word_tokenize(remove_symbols(text).lower()))
    seg_list_without_stopwords = []
    for word in seg_list:
        if word not in stopwords and word != '\t':
            seg_list_without_stopwords.append(word)
    return seg_list_without_stopwords
def get_context(self, query_str, text, k=10):
    if query_str in text:
        tokenizer = MWETokenizer()
        query_str_tokens = tuple(query_str.split())
        query_str_dashed = "_".join(query_str_tokens)
        tokenizer.add_mwe(query_str_tokens)
        text_token = tokenizer.tokenize(text.split())
        try:
            t_start = text_token.index(query_str_dashed)
        except ValueError:
            return None, None, None
        t_end = t_start + 1
        start_index = max(t_start - k, 0)
        end_index = min(t_end + k, len(text_token))
        text_token_query = (text_token[start_index:t_start]
                            + text_token[t_end + 1:end_index])
        context = " ".join(text_token_query)
        context_mention = (text_token[start_index:t_start] + [query_str]
                           + text_token[t_end + 1:end_index])
        context_mention = " ".join(context_mention)
        return context, text_token_query, context_mention
    else:
        logging.info('error, query not in text')
        return None, None, None
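# A minimal sketch of the windowing trick above: merging the multi-word query
# into one token lets list.index() locate it, so a +/-k token window can be
# sliced around the mention. Sample text and query are hypothetical.
from nltk.tokenize import MWETokenizer

query = "new york"
tokenizer = MWETokenizer()
tokenizer.add_mwe(tuple(query.split()))
tokens = tokenizer.tokenize("he moved to new york last spring".split())
print(tokens)                       # ['he', 'moved', 'to', 'new_york', 'last', 'spring']
pos = tokens.index("_".join(query.split()))
print(tokens[max(pos - 2, 0):pos])  # ['moved', 'to']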
def TokenizeDocs(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    # tokenizer = RegexpTokenizer(r'\w+')
    if fo.exists(filename):
        # Load the previously tokenized documents from file.
        combineddocuments = fo.LoadFile(filename)
    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)
            tmp = []
            for sentence in sentences:
                tokens = tokenizer.tokenize(
                    regtokenizer.tokenize(sentence.lower()))
                token_lowercase = [x.lower() for x in tokens]
                tmp.append(token_lowercase)
            tokenizeddocs.append(tmp)
        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            for sent in doc:
                tokdoc.extend(sent)
            combineddocuments.append(tokdoc)
        # Save the tokenized documents.
        fo.SaveFile(filename, combineddocuments, mode='wb')
    del fo
    return combineddocuments
def tokenization(docs):
    documents = {}
    for doc in docs:
        document_plain = docs[doc]
        document_plain = document_plain.replace("/", "").replace("-", " ")
        # re.sub(r'\([^)]*\)', '', document_plain)
        document_plain = re.sub(r'\([0-9]*\)', '', document_plain)
        relevant_words = []
        mwetokenizer = MWETokenizer()
        document_ner = spacy_nlp(document_plain)
        for element in document_ner.ents:
            # Don't consider numbers.
            if element.label_ != "CARDINAL":
                relevant_words.append(element)
        # For each relevant word that contains whitespace, register a single
        # multi-word token covering all of its words.
        for word in relevant_words:
            token = str(word).split()
            if len(token) > 1:
                mwetokenizer.add_mwe(tuple(token))
        document_tokenized = word_tokenize(document_plain)
        document_retokenized = mwetokenizer.tokenize(document_tokenized)
        documents[doc] = document_retokenized
    return documents
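# A sketch of the same idea with the spaCy NER step stubbed out: any
# multi-word entity string becomes a single token after retokenization.
# The entity list here is hypothetical rather than real model output.
from nltk.tokenize import MWETokenizer

relevant_words = ["New York", "World Trade Center"]
mwetokenizer = MWETokenizer()
for word in relevant_words:
    parts = word.split()
    if len(parts) > 1:
        mwetokenizer.add_mwe(tuple(parts))
tokens = "the World Trade Center is in New York".split()
print(mwetokenizer.tokenize(tokens))
# ['the', 'World_Trade_Center', 'is', 'in', 'New_York']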
def __init__(self, filename):
    """Initializes a LyricsCleaner object."""
    self._filename = filename
    self._tokenizer = MWETokenizer()
    for word in SIGNAL_WORDS:
        self._tokenizer.add_mwe(('[', word, ']'))
    self._stemmer = LancasterStemmer()
def trim_bio(text):
    # Keywords to return.
    keywords = []
    # Load from file after custom edit.
    df_keyword = pd.read_csv(local_data + "data/keywords/df.csv")
    # Convert the DataFrame column to a list.
    important_words = df_keyword["Unnamed: 0"].tolist()
    # Split the important words so they can be registered with the tokenizer.
    important_words = [x.split() for x in important_words]
    # Initialize the tokenizer and register the important words.
    tokenizer = MWETokenizer()
    for iw in important_words:
        tokenizer.add_mwe(iw)
    # Tokenize the bio.
    tokens = tokenizer.tokenize([word.lower() for word in text.split()])
    # Find the important words among the tokens and collect them as keywords.
    for iw in important_words:
        iw_joined = "_".join(iw)
        if iw_joined in tokens:
            keywords.append(iw_joined)
    return keywords
def init_base_order_tokenizer():
    p = nltk.PorterStemmer()
    food_tokenizer = MWETokenizer()
    food_items = {}
    prices_items = {}
    image_items = {}
    cal_items = {}
    with open('sheet1.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            food_item = row['Menu Item'].replace(' ', '_').lower()
            price = float(row['Price'])
            image = row['Image']
            cal = float(row['Calories'])
            image_items[food_item] = image
            food_items[food_item] = 0
            prices_items[food_item] = price
            cal_items[food_item] = cal
            items_stem = [p.stem(i)
                          for i in row['Menu Item'].lower().split(' ')]
            if len(items_stem) > 1:
                food_tokenizer.add_mwe(tuple(items_stem))
    with open('mwe.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            items_stem = [p.stem(i)
                          for i in row['Menu Item'].lower().split(' ')]
            if len(items_stem) > 1:
                food_tokenizer.add_mwe(tuple(items_stem))
    return food_tokenizer, food_items, prices_items, cal_items, image_items
def initialize_known_phrase_tokenization(phrases):
    from nltk.tokenize import MWETokenizer
    tokenizer = MWETokenizer()
    for phrase in phrases:
        if phrase:
            phrase_as_list = phrase.replace("_", " ").split()
            tokenizer.add_mwe(phrase_as_list)
    return tokenizer
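# Usage sketch for the initializer above: phrases arrive underscore-joined,
# are registered as space-split MWEs, and come back out re-joined with the
# tokenizer's default '_' separator. The phrase list is hypothetical.
tokenizer = initialize_known_phrase_tokenization(["machine_learning", ""])
print(tokenizer.tokenize("i study machine learning daily".split()))
# ['i', 'study', 'machine_learning', 'daily']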
def multiword_tokenizer(token_list, bigram_list):
    """Tokenize a list of unigram tokens into bigram tokens, given a list of
    bigrams. The parts of each bigram are joined with "__"."""
    mwetokenizer = MWETokenizer(bigram_list, separator="__")
    return mwetokenizer.tokenize(token_list)
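# Usage sketch for the helper above, showing the "__" separator on a
# hypothetical token list.
print(multiword_tokenizer(["new", "york", "city"], [("new", "york")]))
# ['new__york', 'city']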
def _init_mwe_tokenizer(self):
    def multi_word_expressions():
        for entity in self.vocab:
            if entity.find(self._PHRASE_DELIMITER) != -1:
                yield entity.split(self._PHRASE_DELIMITER)

    it = multi_word_expressions()
    self._mwe_tokenizer = MWETokenizer(it)
def fit(self, X, **fit_params):
    """
    Procedure to iteratively contract bigrams (up to max_collocation_iterations
    times) that score higher on the collocation_function than the
    min_collocation_score (and satisfy other criteria set out by the optional
    parameters).
    """
    self.tokenization_ = X
    n_tokens = sum([len(x) for x in X])
    for i in range(self.max_iterations):
        bigramer = BigramCollocationFinder.from_documents(self.tokenization_)

        if self.ignored_tokens is not None:
            ignore_fn = lambda w: w in self.ignored_tokens
            bigramer.apply_word_filter(ignore_fn)

        if self.excluded_token_regex is not None:
            exclude_fn = (
                lambda w: re.fullmatch(self.excluded_token_regex, w) is not None
            )
            bigramer.apply_word_filter(exclude_fn)

        if self.min_token_occurrences is not None:
            minocc_fn = lambda w: bigramer.word_fd[w] < self.min_token_occurrences
            bigramer.apply_word_filter(minocc_fn)

        if self.max_token_occurrences is not None:
            maxocc_fn = lambda w: bigramer.word_fd[w] > self.max_token_occurrences
            bigramer.apply_word_filter(maxocc_fn)

        if self.min_token_frequency is not None:
            minfreq_fn = (
                lambda w: bigramer.word_fd[w] < self.min_token_frequency * n_tokens
            )
            bigramer.apply_word_filter(minfreq_fn)

        if self.max_token_frequency is not None:
            maxfreq_fn = (
                lambda w: bigramer.word_fd[w] > self.max_token_frequency * n_tokens
            )
            bigramer.apply_word_filter(maxfreq_fn)

        if self.min_ngram_occurrences is not None:
            bigramer.apply_freq_filter(self.min_ngram_occurrences)

        new_grams = list(bigramer.above_score(self.score_function,
                                              self.min_score))

        if len(new_grams) == 0:
            break

        self.mtes_.append(new_grams)

        contracter = MWETokenizer(new_grams)
        self.tokenization_ = tuple(
            tuple(contracter.tokenize(doc)) for doc in self.tokenization_
        )

    return self
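# A minimal, standalone sketch of one contraction round from fit(): score
# bigrams over the corpus, pick the strongest, and contract it with an
# MWETokenizer. fit() uses above_score() with a threshold; nbest() is used
# here only to keep the toy corpus deterministic.
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.tokenize import MWETokenizer

docs = [["new", "york", "is", "big"], ["i", "love", "new", "york"]]
bigramer = BigramCollocationFinder.from_documents(docs)
new_grams = bigramer.nbest(BigramAssocMeasures.likelihood_ratio, 1)
contracter = MWETokenizer(new_grams)
print([contracter.tokenize(doc) for doc in docs])
# [['new_york', 'is', 'big'], ['i', 'love', 'new_york']]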
def tokenize_and_remove_punct(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    mtokenizer = MWETokenizer()
    mwe = mtokenizer.tokenize(text.split())
    words = []
    for t in mwe:
        if t.isalpha():
            words.append(t)
    return words
def phrase_eval(params):
    list_phrases, unigram_set, target_token, idf, agg_score, pid = params
    idf_list = [*idf]
    idf_set = set(idf_list)
    tokenizer = MWETokenizer(separator=' ')
    for e in unigram_set:
        tokenizer.add_mwe(nltk.word_tokenize(e))
    phrases_score = {}
    for phrase in tqdm(list_phrases, desc='phrase-eval-{}'.format(pid),
                       mininterval=10):
        score = 0
        tokens = nltk.word_tokenize(phrase)
        if not set(tokens).issubset(idf_set):
            continue
        nonstop_tokens = [token for token in tokens if token not in stop]
        if len(nonstop_tokens) / len(tokens) <= 0.5:
            continue
        raw_tokenized = tokenizer.tokenize(tokens)
        tokenized_set = set(raw_tokenized)
        keywords = tokenized_set.intersection(unigram_set)
        for token in keywords:
            score += agg_score[token]
        score /= (1 + np.log(len(nonstop_tokens)))
        vocab = set(target_token).union(set(tokens))
        vocab = list(vocab.intersection(idf_set))
        target_vec = [0] * len(vocab)
        phrase_vec = [0] * len(vocab)
        target_token_freq = dict(Counter(target_token))
        target_token_subset = list(set(vocab).intersection(set(target_token)))
        for token in target_token_subset:
            index = vocab.index(token)
            target_vec[index] = (target_token_freq[token] / len(target_token)
                                 * idf[token])
        phrase_token_freq = dict(Counter(tokens))
        for token in tokens:
            index = vocab.index(token)
            phrase_vec[index] = (phrase_token_freq[token] / len(tokens)
                                 * idf[token])
        tfidf_sim = 1 - spatial.distance.cosine(target_vec, phrase_vec)
        phrases_score.update({phrase: {'score': score, 'eval': tfidf_sim}})
    rearrange = {}
    for k, v in phrases_score.items():
        rearrange.update({k: v['score']})
    top_10 = nlargest(10, rearrange, key=rearrange.get)
    return {key: phrases_score[key] for key in top_10}
def LoadTokenizer():
    global tokenizer
    tokenizer = MWETokenizer(separator=' ')
    for spword in WordDict:
        if ' ' in spword:
            tupleword = tuple(spword.split(' '))
            tokenizer.add_mwe(tupleword)
        if ':' in spword:
            tupleword = tuple(re.split(r"(:)", spword))
            tokenizer.add_mwe(tupleword)
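# Why re.split(r"(:)", ...) above: the capturing group keeps the delimiter,
# so the colon survives as its own token inside the registered MWE. A quick
# check on a hypothetical dictionary word:
import re
print(tuple(re.split(r"(:)", "re:invent")))
# ('re', ':', 'invent')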
def form_mwe_tokenizer(self):
    mwes = []
    for _, entry in self.text_entries.items():
        term = entry["lemma"]
        splitted = term.split()
        if len(splitted) > 1:
            mwes.append(tuple(splitted))
    return MWETokenizer(mwes=mwes, separator=" ")
def timexTagAndTokenizeText(self, altText=None):
    """Tags temporal expressions with nltk timex2, and tokenizes the
    resultant text.

    Args:
        altText (str): The text to be tagged, if it is not the same as the
            whole narrative the preprocessor was created with. This text
            won't be stored.

    Returns:
        Tokenized text (nested list, by sentence), e.g.
        [['This', 'is', 'a', 'sentence', '.'], ['And', 'maybe', 'another']]
    """
    # Two steps are required in this method, so if altText is specified,
    # all steps are done inside the if statement, so that incorrect dict
    # entries aren't stored.
    if altText is not None:
        raw = altText
        altOutputStep1 = self.timexTagText(raw)
        altOutputStep2 = self.wordTokenizeText(altOutputStep1)
        time_tagged_and_tokenizedText = MWETokenizer(
            mwes=[('<', '/TIMEX2', '>'), ('<', 'TIMEX2', '>')],
            separator='').tokenize(altOutputStep2)
        return time_tagged_and_tokenizedText
    else:
        # Tag all temporal expressions with TIMEX2 tags. No need to open the
        # file here, because it is opened in timexTagText().
        tagged = self.timexTagText()
        # Word-tokenize all of the tagged text.
        word_tagged = self.wordTokenizeText(tagged)
        # Consolidate the broken-apart TIMEX2 tags into single "words".
        if Preprocessor.textList.get('timexTagAndTokenizeText') is None:
            nestedListOutput = [
                MWETokenizer(mwes=[('<', '/TIMEX2', '>'), ('<', 'TIMEX2', '>')],
                             separator='').tokenize(x)
                for x in word_tagged
            ]
            # Remove or change this line if we don't want a flattened
            # (one-dimensional) list. See the comment below.
            Preprocessor.textList['timexTagAndTokenizeText'] = [
                item for sublist in nestedListOutput for item in sublist
            ]
        # Currently the output is a flattened list; we need to decide whether
        # to keep the sentence structure (making the output a list of lists).
        # That throws off the AEExtractor and the SuspectExtractor, which
        # would then need to be fixed.
        return Preprocessor.textList.get('timexTagAndTokenizeText')
def multiword_tokenize(text, mwe):
    # The tokenizer splits contractions ("Don't" => 'Do', "n't"); the
    # sentence-delimiting ',' and '.' become tokens of their own.
    protected_tuples = [word_tokenize(word) for word in mwe]
    protected_tuples_underscore = ['_'.join(word) for word in protected_tuples]
    tokenizer = MWETokenizer(protected_tuples)
    # Tokenize the text.
    tokenized_text = tokenizer.tokenize(word_tokenize(text))
    # Replace the underscored protected words with the original MWEs.
    for i, token in enumerate(tokenized_text):
        if token in protected_tuples_underscore:
            tokenized_text[i] = mwe[protected_tuples_underscore.index(token)]
    return tokenized_text
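# Usage sketch for multiword_tokenize() above: the protected phrase is merged
# by the MWETokenizer, then mapped back to its original surface form. The
# sample MWE list is hypothetical; nltk's punkt data is assumed installed.
print(multiword_tokenize("I work in New York these days", ["New York"]))
# ['I', 'work', 'in', 'New York', 'these', 'days']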
def TokenizeDocsNew(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    # tokenizer = RegexpTokenizer(r'\w+')
    if fo.exists(filename):
        # Load the previously tokenized documents from file.
        combineddocuments = fo.LoadFile(filename)
    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        lmtzr = WordNetLemmatizer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stop_words = stopwords.words('english')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)
            tmp = []
            for sentence in sentences:
                # For each sentence: tokenize with the regex tokenizer, then
                # merge glossary phrases with the MWETokenizer.
                tokens = tokenizer.tokenize(
                    regtokenizer.tokenize(sentence.lower()))
                # Lower the case of all the tokens.
                token_lowercase = [x.lower() for x in tokens]
                # Lemmatize the sentence: find the POS tags, then lemmatize.
                tokens_lowercase_tagged = nltk.pos_tag(token_lowercase)
                lemmatized_sentence = [
                    lmtzr.lemmatize(wrd, pos=get_wordnet_pos(tag))
                    for wrd, tag in tokens_lowercase_tagged
                ]
                # Stem the sentence.
                stemmed_sentence = [stemmer.stem(wrd)
                                    for wrd in lemmatized_sentence]
                # Remove the stop words.
                processed_sentence = [word for word in stemmed_sentence
                                      if word not in stop_words]
                tmp.append(processed_sentence)
            tokenizeddocs.append(tmp)
        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            for sent in doc:
                tokdoc.extend(sent)
            combineddocuments.append(tokdoc)
        # Save the tokenized documents.
        fo.SaveFile(filename, combineddocuments, mode='wb')
    del fo
    return combineddocuments
def tokenizer_sent(dataset):
    tokenizer = MWETokenizer()
    aspect_tokenized = []
    sentence_tokenized = []
    for i in range(len(dataset.index)):
        aspect_split = tuple(dataset['aspect_term'][i].lower().split())
        # add_mwe() returns None, so record the registered tuple itself.
        tokenizer.add_mwe(aspect_split)
        aspect_tokenized.append(aspect_split)
    for j in range(len(dataset.index)):
        tok = nltk.pos_tag(
            tokenizer.tokenize(dataset['text'][j].lower().split()))
        sentence_tokenized.append(tok)
    return aspect_tokenized, sentence_tokenized
def sentence_filter(self, sentence):
    # Preliminary tokenization and cleaning of the sentence.
    if self.language == 'chinese':
        import jieba.posseg as psg
        # jieba's POS interface tokenizes and cleans in a single step.
        return psg.cut(sentence)
    elif self.language == 'english':
        from nltk.tokenize import MWETokenizer  # the MWE tokenizer
        # Add custom phrases, joined with underscores '_'.
        tokenizer = MWETokenizer(self.userdict)
        # Build the spaCy tokenizer.
        nlp = spacy.load('en_core_web_sm')
        # for word in self.userdict:  # adding custom words to spaCy; seemingly has no effect
        #     lex = nlp.vocab[word]
        # Clean up punctuation.
        quote_double_pattern = re.compile('“|”')
        quote_single_pattern = re.compile('‘|’')
        punc_pattern = re.compile(
            "\"|\xa0|\t|\n|\:|\;| — | - |–-|\!|\@|\#|\$|\%|\^|\*|\_|\?|?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|\/|,")
        sentence = re.sub(quote_double_pattern, '"', sentence)
        # Keep apostrophes: the 's and s' cases can't simply be deleted.
        sentence = re.sub(quote_single_pattern, "'", sentence)
        sentence = re.sub(punc_pattern, ' ', sentence)
        # nltk + spaCy: merge phrases with nltk first, then tokenize with spaCy.
        # For spaCy alone: return nlp(' '.join(sentence.split()))
        # For pke: return the full sentence unchanged.
        return nlp(' '.join(tokenizer.tokenize(sentence.lower().split())))
    elif self.language == 'japanese':
        # Use MeCab's tokenizer to get the result directly; a custom dictionary
        # can't be added for now, so some proper nouns aren't recognized
        # (e.g. 比特/币).
        mecab = MeCab.Tagger('')
        # Clean up punctuation.
        punc_pattern = re.compile(
            "\xa0|\t|\n|\:|\;| — | - |\!|\@|\#|\$|\%|\^|\&|\*|\_|\?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|/|・|「|」|•|※")
        sentence = re.sub(punc_pattern, ' ', sentence)
        # Extract the surface form and the POS from each parsed chunk.
        sentence = [
            (chunk.split('\t')[0], chunk.split('\t')[1].split(',')[0])
            for chunk in mecab.parse(sentence).splitlines()[:-1]
        ]
        return sentence
def parseWordsFromEntry(entry, vocab_cap=10000):
    """Tokenizes an entry into a list of words and calculates their indices
    relative to their frequencies."""
    unknown = "UNKNOWN_WORD"
    tokenizer = MWETokenizer()
    words = entry.split()
    # words = tokenizer.tokenize(entry.split())
    frequencies = findWordFrequencyDists(words)
    vocab = frequencies.most_common(vocab_cap - 1)
    index_to_word = [x[0] for x in vocab]
    index_to_word.append(unknown)
    word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])
    return word_to_index
def text_process_group(mess):
    """
    1. Lower-case the input
    2. Remove punctuation except '-'
    3. Apply the custom tokenizer
    4. Return the list of clean text words
    """
    # \P{P} is a Unicode property class: the pattern strips punctuation
    # except '-'. It requires the third-party `regex` module (e.g.
    # `import regex as re`); the standard-library `re` rejects \P{P}.
    pattern = r"[^\P{P}-]+"
    new_mess = re.sub(pattern, " ", mess, 0)
    tokenizer = MWETokenizer(all_list, separator=' ')
    token = tokenizer.tokenize(new_mess.lower().split())
    sw = [x for x in token if x not in stopwords.words('english')]
    return sw
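# A quick check of the Unicode-property trick above, using the third-party
# `regex` module: [^\P{P}-]+ matches runs of punctuation that are not
# hyphens, so hyphenated words survive.
import regex
print(regex.sub(r"[^\P{P}-]+", " ", "well-known, right?"))
# 'well-known  right '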
def sentence_filter(self, sentence):
    # Add custom phrases, joined with underscores '_'.
    tokenizer = MWETokenizer(self.userdict)
    # Build the spaCy tokenizer.
    nlp = spacy.load('en_core_web_sm')
    quote_double_pattern = re.compile('“|”')
    quote_single_pattern = re.compile('‘|’')
    punc_pattern = re.compile(
        "\"|\xa0|\t|\n|\:|\;| — | - |–-|\!|\@|\#|\$|\%|\^|\*|\_|\?|?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|\/|,"
    )
    sentence = re.sub(quote_double_pattern, '"', sentence)
    # Keep apostrophes: the 's and s' cases can't simply be deleted.
    sentence = re.sub(quote_single_pattern, "'", sentence)
    sentence = re.sub(punc_pattern, ' ', sentence)
    # nltk + spaCy: merge phrases with nltk first, then tokenize with spaCy.
    return nlp(' '.join(tokenizer.tokenize(sentence.lower().split())))
def multi_word_tokenizer(relevant_words, text):
    mwetokenizer = MWETokenizer()
    # Register each multi-word expression as a tuple of its words.
    for word in relevant_words:
        token = str(word).split()
        mwetokenizer.add_mwe(tuple(token))
    # Run the multi-word tokenization.
    return mwetokenizer.tokenize(text)
def tokenize_sentence(self, string, max_sentence_len, with_labels=False):
    merger = MWETokenizer([('<', 'unk', '>')], separator='')
    sentence = word_tokenize(string.strip())  # tokenize the sentence
    sentence = merger.tokenize(sentence)  # merge '<', 'unk', '>' back into '<unk>'
    if with_labels:
        sentence = sentence[1:]
    sentence = [token.lower() for token in sentence]
    # Cut the sentence at max_sentence_len, leaving room for the tags.
    sentence = sentence[:max_sentence_len - 2]
    # Add start- and end-of-sentence tags.
    sentence = ['<sos>'] + sentence + ['<eos>']
    # Pad the rest of the sentence.
    padded_sentence = sentence.copy()
    padded_sentence.extend(['<pad>'] * (max_sentence_len - len(sentence)))
    return sentence, padded_sentence
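# A quick check of the <unk> repair above: word_tokenize() splits '<unk>'
# into three tokens, and the separator='' merger glues them back together.
from nltk.tokenize import MWETokenizer, word_tokenize

merger = MWETokenizer([('<', 'unk', '>')], separator='')
print(merger.tokenize(word_tokenize("the <unk> barked")))
# ['the', '<unk>', 'barked']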
def create_tokenizer():
    with open('mwe-prep-ru-final.txt', 'r') as f:
        lines = f.read().split('\n')
    mwe_list = [tuple(line.split(' '))
                for line in lines if 'lemma' not in line and line != '']
    with open('mwes-prep-en.html', 'r') as f:
        lines = f.read().split('\n')
    mwe_list_en = []
    for line in lines:
        if '</b>:' in line:
            mwe_list_en.append(tuple(
                line.split('</b>: ')[1].split(' <td align=right>')[0].split(' ')))
    mwe_list.extend(mwe_list_en)
    return MWETokenizer(mwe_list)