def preproc(xtrain, xtest):
    xtrain_1 = []
    xtest_1 = []
    stemmer = PorterStemmer()
    for X in xtrain:
        Y = str(X).replace('\n', '')
        X = WhitespaceTokenizer().tokenize(str(Y))
        X = re.sub(r'[^\w]', ' ', str(X))
        X = word_tokenize(str(X))
        stems = [stemmer.stem(token) for token in X]
        xtrain_1.append(str(stems))
    for X in xtest:
        Y = str(X).replace('\n', '')
        X = WhitespaceTokenizer().tokenize(str(Y))
        X = re.sub(r'[^\w]', ' ', str(X))
        X = word_tokenize(str(X))
        stems = [stemmer.stem(token) for token in X]
        xtest_1.append(str(stems))
    vectorizer = TfidfVectorizer(stop_words="english", max_df=0.2)
    Xtrain = vectorizer.fit_transform(xtrain_1)
    Xtest = vectorizer.transform(xtest_1)
    return (Xtrain, Xtest)

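# A minimal usage sketch for preproc above, not part of the original snippet.
# Assumptions: nltk and scikit-learn are installed, the punkt tokenizer data has been
# fetched via nltk.download('punkt'), and the toy documents below are illustrative only.
# Note that max_df=0.2 needs enough documents that terms appearing once survive pruning.
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import WhitespaceTokenizer, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

train_docs = [
    "Cats are chasing the mice.\nDogs bark loudly.",
    "Birds are singing in the garden.",
    "The stock market fell sharply today.",
    "Rain is expected over the weekend.",
    "The new phone has a larger battery.",
    "She is reading a novel about pirates.",
]
test_docs = ["A dog chased a cat across the yard."]
Xtrain, Xtest = preproc(train_docs, test_docs)
print(Xtrain.shape, Xtest.shape)  # sparse TF-IDF matrices over the stemmed documents
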
def __init__(self, vocab_size=30522, pct_bpe=1, word_tokenizer=None, silent=True,
             ngram_min=2, ngram_max=2, required_tokens=None, strict=False, lowercase=True,
             EOW=DEFAULT_EOW, SOW=DEFAULT_SOW, UNK=DEFAULT_UNK, PAD=DEFAULT_PAD,
             MASK=DEFAULT_MASK, CLS=DEFAULT_CLS, SEP=DEFAULT_SEP):
    if vocab_size < 1:
        raise ValueError('vocab size must be greater than 0.')
    self.EOW = EOW
    self.SOW = SOW
    self.eow_len = len(EOW)
    self.sow_len = len(SOW)
    self.UNK = UNK
    self.PAD = PAD
    self.cls_token = CLS
    self.sep_token = SEP
    self.mask_token = MASK
    self.required_tokens = list(
        set(required_tokens or []).union(
            {self.UNK, self.PAD, self.mask_token, self.cls_token, self.sep_token}))
    self.vocab_size = vocab_size
    self.pct_bpe = pct_bpe
    self.word_vocab_size = max([int(vocab_size * (1 - pct_bpe)), len(self.required_tokens or [])])
    self.bpe_vocab_size = vocab_size - self.word_vocab_size
    # Default to whitespace tokenization; alternatives tried here included
    # ucto_tokenize and wordpunct_tokenize.
    self.word_tokenizer = word_tokenizer if word_tokenizer is not None else WhitespaceTokenizer().tokenize
    self.word_tokenizer_fitting = WhitespaceTokenizer().tokenize
    self.custom_tokenizer = word_tokenizer is not None
    self.word_vocab = {}  # type: Dict[str, int]
    self.bpe_vocab = {}  # type: Dict[str, int]
    self.inverse_word_vocab = {}  # type: Dict[int, str]
    self.inverse_bpe_vocab = {}  # type: Dict[int, str]
    self._progress_bar = iter if silent else tqdm
    self.ngram_min = ngram_min
    self.ngram_max = ngram_max
    self.strict = strict
    self.lowercase = lowercase

def resultwords(self, user_view, gm_view):
    # Gather words and positions from the text
    words_index = WhitespaceTokenizer().span_tokenize(self.text)
    words_text = WhitespaceTokenizer().tokenize(self.text)
    words = zip(words_index, words_text)
    # Add counters for consensus count and personal annotation
    # ((start, stop), 'Word string itself', Intensity, GM Ann ID, User Ann ID, Did user annotate)
    words = [w + (0, None, None, False,) for w in words]
    # Gather other annotations from GM and users for this section
    gm_anns = Annotation.objects.filter(view=gm_view).values_list('pk', 'start', 'text')
    # Build the running counter of times a word was annotated
    for gm_pk, start, text in gm_anns:
        length = len(text)
        for idx, word in enumerate(words):
            word_start = word[0][0]
            counter = word[2]
            if word_start >= start and word_start <= start + length:
                counter += 1
                words[idx] = (word[0], word[1], counter, gm_pk, word[3], word[4])
    user_anns = Annotation.objects.filter(view=user_view).values_list('pk', 'start', 'text')
    # Mark the words annotated by this user
    for user_pk, start, text in user_anns:
        length = len(text)
        for idx, word in enumerate(words):
            word_start = word[0][0]
            if word_start >= start and word_start <= start + length:
                words[idx] = (word[0], word[1], word[2], word[3], user_pk, True)
    return words

def My_tokenizer(self, sentence, charoff=False):
    # I removed an earlier version of this tokenizer that contained a set of rules
    # that could capture named entities. Now it's just an NLTK tokenizer.
    numbers = [str(i) for i in range(1000)]
    tokens = []
    char_o = []
    tokenized = word_tokenize(sentence)
    for i in tokenized:
        if i[-1] in string.punctuation:
            tokens.append(i[:-1])
        else:
            tokens.append(i)
    c_tokens = [token for token in tokens
                if not token in string.punctuation if not token in numbers]
    l_tokens = " ".join([t + " " for t in c_tokens])
    b = WhitespaceTokenizer().tokenize(l_tokens)
    if charoff:
        if len(c_tokens) > 1:
            m = list(WhitespaceTokenizer().span_tokenize(l_tokens))
            char_o = list(zip(b, m))
        return char_o
    else:
        return b

def getWhitespaceTokens(file_path): file = open(file_path, "r") raw_text = file.read() ## Testing the replacement of all "=" signs by spaces before tokenizing. text = raw_text.translate(str.maketrans("=", ' ')) ## Tokenize the sentences sentences = sent_tokenize(text) ## Get spans of the sentences sent_spans = align_tokens(sentences, text) ## create empty arrays for white space tokens and sentence delimiters tokenized_text = [] text_spans = [] ## Loop through each sentence and get the tokens and token spans for s in range(0, len(sentences)): # get the tokens and token spans within the sentence toks = WhitespaceTokenizer().tokenize(sentences[s]) span_generator = WhitespaceTokenizer().span_tokenize(sentences[s]) rel_spans = [span for span in span_generator] # convert the relative spans into absolute spans abs_spans = [] for start, end in rel_spans: abs_spans = abs_spans + [ (sent_spans[s][0] + start, sent_spans[s][0] + end) ] tokenized_text = tokenized_text + toks text_spans = text_spans + abs_spans ## Now we have the token list and the spans. We should be able to continue finding sentnence boundaries as before tags = nltk.pos_tag(tokenized_text) sent_boundaries = [0] * len(tokenized_text) ## figure out which tokens are at the end of a sentence tok_counter = 0 for s in range(0, len(sentences)): sent = sentences[s] if "\n" in sent: sent_newline = sent.split("\n") for sn in sent_newline: sent_split = WhitespaceTokenizer().tokenize(sn) nw_idx = len(sent_split) + tok_counter - 1 sent_boundaries[nw_idx] = 1 tok_counter = tok_counter + len(sent_split) else: sent_split = WhitespaceTokenizer().tokenize(sent) nw_idx = len(sent_split) + tok_counter - 1 sent_boundaries[nw_idx] = 1 tok_counter = tok_counter + len(sent_split) return raw_text, text, tokenized_text, text_spans, tags, sent_boundaries
def getWhitespaceTokens(file_path):
    file = open(file_path, "r")
    text = file.read()
    text = text.replace("\n", "\n\n")
    span_generator = WhitespaceTokenizer().span_tokenize(text)
    spans = [span for span in span_generator]
    tokenized_text = WhitespaceTokenizer().tokenize(text)
    tags = nltk.pos_tag(tokenized_text)
    return text, tokenized_text, spans, tags

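# A minimal usage sketch for the four-value getWhitespaceTokens variant directly above,
# not part of the original snippet. Assumptions: nltk is installed, the
# averaged_perceptron_tagger data has been downloaded, and sample.txt is an
# illustrative file name.
import nltk
from nltk.tokenize import WhitespaceTokenizer

with open("sample.txt", "w") as f:
    f.write("Patient admitted on 2015-03-01.\nDischarged two days later.")

text, tokens, spans, tags = getWhitespaceTokens("sample.txt")
for (start, end), (token, pos) in zip(spans, tags):
    # every span indexes back into the newline-doubled text returned by the function
    assert text[start:end] == token
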
def getWhitespaceTokens(file_path):
    file = open(file_path, "r")
    text = file.read()
    ## Testing the replacement of all "=" signs by spaces before tokenizing.
    text = text.translate(str.maketrans("=", ' '))
    span_generator = WhitespaceTokenizer().span_tokenize(text)
    spans = [span for span in span_generator]
    tokenized_text = WhitespaceTokenizer().tokenize(text)
    tags = nltk.pos_tag(tokenized_text)
    sent_tokenize_list = sent_tokenize(text)
    sent_boundaries = [0] * len(tokenized_text)
    ## Figure out which tokens are at the end of a sentence
    tok_counter = 0
    for s in range(0, len(sent_tokenize_list)):
        sent = sent_tokenize_list[s]
        if "\n" in sent:
            sent_newline = sent.split("\n")
            for sn in sent_newline:
                sent_split = WhitespaceTokenizer().tokenize(sn)
                nw_idx = len(sent_split) + tok_counter - 1
                sent_boundaries[nw_idx] = 1
                tok_counter = tok_counter + len(sent_split)
        else:
            sent_split = WhitespaceTokenizer().tokenize(sent)
            nw_idx = len(sent_split) + tok_counter - 1
            sent_boundaries[nw_idx] = 1
            tok_counter = tok_counter + len(sent_split)
    return text, tokenized_text, spans, tags, sent_boundaries

def initialize_berita(judul, isi):
    return {
        'token_judul': WhitespaceTokenizer().tokenize(stem(preprocess(judul))),
        'isi': isi,
        'list_isi': sent_tokenize(isi),
        'token_isi': [
            WhitespaceTokenizer().tokenize(stem(preprocess(kalimat)))
            for kalimat in sent_tokenize(isi)
        ]
    }

def fun_1_1_5():
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from nltk.tokenize import regexp_tokenize
    tokenizer = RegexpTokenizer("[\w]+")
    print("RegexpTokenizer:", tokenizer.tokenize("Don't hesitate to ask questions"))
    print("regexp_tokenize:", regexp_tokenize("Don't hesitate to ask questions",
                                              pattern="\w+|\$[\d\.]+|\S+"))
    # Split on whitespace
    tokenizer = RegexpTokenizer('\s+', gaps=True)
    print("RegexpTokenizer:", tokenizer.tokenize("Don't hesitate to ask questions"))
    # Filter for words that start with a capital letter
    sent = " She secured 90.56 % in class X \n. She is a meritorious student"
    capt = RegexpTokenizer('[A-Z]\w+')
    print("RegexpTokenizer:", capt.tokenize(sent))
    # BlanklineTokenizer is a subclass of RegexpTokenizer that uses a predefined regular expression
    from nltk.tokenize import BlanklineTokenizer
    print("BlanklineTokenizer:", BlanklineTokenizer().tokenize(sent))
    # Strings can also be split on whitespace, gaps, newlines, etc.
    from nltk.tokenize import WhitespaceTokenizer
    print("WhitespaceTokenizer:", WhitespaceTokenizer().tokenize(sent))
    # WordPunctTokenizer uses the regular expression \w+|[^\w\s]+ to split the text
    # into alphabetic and non-alphabetic characters
    from nltk.tokenize import WordPunctTokenizer
    print("WordPunctTokenizer:", WordPunctTokenizer().tokenize(sent))
    # Splitting with the split() method
    print("split():", sent.split())
    print("split(' '):", sent.split(' '))
    print("split('\\n'):", sent.split('\n'))
    # Similar to sent.split('\n'), LineTokenizer splits the text into lines
    from nltk.tokenize import LineTokenizer
    print("LineTokenizer:", LineTokenizer().tokenize(sent))
    print("LineTokenizer:", LineTokenizer(blanklines='keep').tokenize(sent))
    print("LineTokenizer:", LineTokenizer(blanklines='discard').tokenize(sent))
    # SpaceTokenizer works like sent.split(' ')
    from nltk.tokenize import SpaceTokenizer
    print("SpaceTokenizer:", SpaceTokenizer().tokenize(sent))
    # The nltk.tokenize.util module tokenizes by returning a sequence of tuples giving
    # the position and offset of each token in the sentence
    print("Token spans:", list(WhitespaceTokenizer().span_tokenize(sent)))
    # Given a sequence of token spans, the relative spans can be returned
    from nltk.tokenize.util import spans_to_relative
    print("Relative spans:", list(spans_to_relative(WhitespaceTokenizer().span_tokenize(sent))))
    # By splitting at each occurrence of the separator,
    # nltk.tokenize.util.string_span_tokenize(sent, separator) returns the offsets
    # of the tokens in sent
    from nltk.tokenize.util import string_span_tokenize
    print("Token spans:", list(string_span_tokenize(sent, " ")))

def get_file_sentence_length_stats():
    fen = codecs.open('file.en', 'rb', encoding='utf-8')
    fde = codecs.open('file.de', 'rb', encoding='utf-8')
    enlen = []
    for s in fen:
        enlen.append(len(WhitespaceTokenizer().tokenize(s)))
    delen = []
    for s in fde:
        delen.append(len(WhitespaceTokenizer().tokenize(s)))
    print('English mean: ', np.mean(enlen))
    print('English std: ', np.std(enlen))
    print('German mean: ', np.mean(delen))
    print('German std: ', np.std(delen))
    return True

def get_phrase_length_stats():
    p = pickle.load(open('counts_phrase_lr_dl.p', 'rb'))
    de_len = []
    en_len = []
    for de_p, en_p in p.keys():
        de_len.append(len(WhitespaceTokenizer().tokenize(de_p)))
        en_len.append(len(WhitespaceTokenizer().tokenize(en_p)))
    print('English phrase mean: ', np.mean(en_len))
    print('English phrase std: ', np.std(en_len))
    print('German phrase mean: ', np.mean(de_len))
    print('German phrase std: ', np.std(de_len))
    return True

def read_session(lines):
    """
    Takes an open transcription file and returns the list of words it contains.
    :param lines: <class '_io.TextIOWrapper'>
    remember: *v: non-Dutch words, *n: new non-existing words, *s: street words,
    *a: incomplete words, *u: distorted words, *x: unclear word, xxx: unclear utterances,
    vvv: non-Dutch sentences, ggg: sounds made by the speaker
    """
    lines_to_words = lines.read()
    lines_to_words = re.sub('[0-9]*\.[0-9]*\t', '', lines_to_words)  # to remove timestamps
    lines_to_words = re.sub('[A-Za-z]*\*[anuxANUX]{1}', '', lines_to_words)  # to remove words with *n, *a, *u, and *x
    lines_to_words = re.sub('[A-Za-z]*\*[etV]{1}', '', lines_to_words)  # unknown notation
    lines_to_words = re.sub('[A-Za-z]*\*op', '', lines_to_words)  # a mistake?
    lines_to_words = lines_to_words.replace('start\tend\ttext\n', '').replace('.', '').replace('-', ' ')\
        .replace('?', '').replace('\n', ' ').replace('xxx', '').replace('ggg', '').replace('vvv', '')\
        .replace('*v', '').replace('*s', '')
    lines_to_words = re.sub('[A-Za-z]*\*', '', lines_to_words)  # for words with missing notation
    # s = lines_to_words.translate({ord(c): None for c in string.punctuation if c != '*'})
    tk = WhitespaceTokenizer()
    words = tk.tokenize(lines_to_words)
    return words

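# A minimal usage sketch for read_session above, not part of the original snippet.
# Assumption: the tiny transcript below mimics the "start\tend\ttext" format described
# in the docstring; real session files would be opened with open(path).
import re
from io import StringIO
from nltk.tokenize import WhitespaceTokenizer

sample = "start\tend\ttext\n0.00\t1.25\tdag allemaal xxx\n1.25\t2.10\tik ben er weer ggg\n"
print(read_session(StringIO(sample)))  # -> ['dag', 'allemaal', 'ik', 'ben', 'er', 'weer']
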
def clean_googlengram(line):
    """Removes speech tags from line, specific to the googlengram module

    Param: line (unicode)
    Returns: line (unicode)
    """
    return_line = line.split("\t")[0]  # Get the ngram, remove year, counter, etc.
    clean = []
    words = WhitespaceTokenizer().tokenize(return_line)
    for word in words:
        # in >1-grams, transitions to specific tags are written as:
        # The_ADJ _NOUN_ (meaning from "The" there is a transition to a noun)
        # We remove those
        if word[0] != '_' and word[-1] != '_':
            # Split the token and the tag based on the '_'
            token, tag = str2tuple(word, '_')
            # Punct will be added using rules.
            if len(token) > 1:
                if tag not in ('PUNCT', '.', ''):
                    clean.append(token)
            elif token not in punctuation:
                clean.append(token)
    return_line = ' '.join(clean)
    if return_line != line:
        return True, return_line
    else:
        return False, line

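# A minimal usage sketch for clean_googlengram above, not part of the original snippet.
# Assumptions: str2tuple comes from nltk.tag and punctuation from the string module,
# as the function's free variables suggest; the ngram line below is illustrative.
from string import punctuation
from nltk.tag import str2tuple
from nltk.tokenize import WhitespaceTokenizer

changed, cleaned = clean_googlengram("The_DET quick_ADJ fox_NOUN _NOUN_\t1999\t12\t7")
print(changed, cleaned)  # -> True "The quick fox" (tags and the _NOUN_ transition marker stripped)
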
def main(tweet):
    # tweet = input("enter tweet here: ")
    tk = WhitespaceTokenizer()
    words = tk.tokenize(tweet)
    words_with_pos = pos_tag(words)
    queries = formQueries(words_with_pos)
    return scrapeWebForEachQuery(queries)

def lemmatize_and_tag(text):
    '''Tokenize the text and lemmatize each word by its tag.
    Add possible high-frequency character names to the global list nnp_to_remove.
    '''
    global nnp_to_remove
    lemmatizer = WordNetLemmatizer()
    words = []
    for w, p in pos_tag(WhitespaceTokenizer().tokenize(text)):
        w = w.strip('1234567890"' + string.punctuation).lower()
        if w.lower() in stop_word or not w:
            continue
        if p in pos_to_wornet_dict.keys():
            if p == 'NNP' and 'V' in pos_tag([w.lower()])[0][1]:
                # Verbs at the beginning of a sentence are sometimes classified as NNP. Discard them.
                continue
            words.append((lemmatizer.lemmatize(w, pos_to_wornet_dict[p]).lower(), p))
    # Find the words with the highest frequency. If they are NNP, add them to the list of
    # words to remove, as those words are most likely a character's name.
    bow = Counter(words)
    nnp_to_remove.update({
        x[0][0]
        for x in sorted(bow.items(), key=lambda x: x[1], reverse=True)[:8]
        if x[0][1] == 'NNP'
    })
    return ' '.join([
        x for x in list(zip(*words))[0]
        if (x not in nnp_to_remove and len(x) > 1)
    ])

def __init__(self, name, config):
    """
    Initializes the component.

    :param name: Component name (read from configuration file).
    :type name: str

    :param config: Dictionary of parameters (read from the configuration ``.yaml`` file).
    :type config: :py:class:`ptp.configuration.ConfigInterface`
    """
    # Call constructors of parent classes.
    Component.__init__(self, name, SentenceTokenizer, config)

    # Read the actual configuration.
    self.mode_detokenize = config['detokenize']

    # Tokenizer.
    self.tokenizer = WhitespaceTokenizer()

    # Set key mappings.
    self.key_inputs = self.stream_keys["inputs"]
    self.key_outputs = self.stream_keys["outputs"]

    if self.mode_detokenize:
        # list of strings -> sentence.
        self.processor = self.detokenize_sample
    else:
        # sentence -> list of strings.
        self.processor = self.tokenize_sample

def __init__(self, filename: str, concat_glove: bool, glove_vectors: Vectors,
             elmo_model: ELMoModel, lowercase_sentences: bool = False,
             tokenize_sentences: bool = True, only_news: bool = False):
    assert os.path.splitext(filename)[1] == '.csv', 'Metaphor dataset file should be of type CSV'

    self.concat_glove = concat_glove
    self.glove_vectors = glove_vectors
    self.tokenizer = WhitespaceTokenizer()
    self.lowercase_sentences = lowercase_sentences
    self.tokenize_sentences = tokenize_sentences
    self.only_news = only_news
    self.elmo_model = elmo_model

    self._sentences, self._labels = self._parse_csv_file(filename)
    self.pos_weight = 1 / (
        sum([sum([label for label in labels]) for labels in self._labels]) /
        sum([sum([1 for label in labels]) for labels in self._labels]))
    self.elmo_filename = self._assert_elmo_vectors_file(filename, self._sentences)
    self._data_size = len(self._sentences)

def N_gram_Tokenizer(pathToFiles, n):
    stemmer = PorterStemmer()
    ngrams = {}  # n-grams per file
    # check if the file has a gold standard; if not, continue to the next file
    for filename in os.listdir(pathToFiles):
        if filename not in filename_overlap:
            continue
        with open(os.path.join(pathToFiles, filename)) as currentFile:
            ngrams[filename] = {}
            tokens_in_window = []
            for line in currentFile:
                # Tokenize on whitespace
                tokens = WhitespaceTokenizer().tokenize(line)
                for token in tokens:
                    token = token.split("_")
                    token = token[0].lower()
                    token = stemmer.stem(token)
                    tokens_in_window.append(token)
                    if len(tokens_in_window) > n:
                        tokens_in_window = tokens_in_window[1:]
                    newNGram = ''
                    if len(tokens_in_window) == n:
                        for currentToken in tokens_in_window:
                            newNGram = '{} {}'.format(newNGram, currentToken)
                        newNGram = newNGram.strip()
                    if newNGram:
                        ngrams[filename][newNGram] = 0
    return ngrams

def extract_tokens(row, lemmatize=True, use_tag=True):
    tokenizer = WhitespaceTokenizer()
    if lemmatize:
        # reduce words to lemmas
        pattern = '[().*+,?!\'\";:]*'
        token_list = list()
        if use_tag:
            # use POS tags to obtain more accurate lemmas
            pos_tags = PerceptronTagger().tag(tokenizer.tokenize(row['text']))
            lemmatizer_input = map(lambda x: (x[0], nltk_to_wordnet.get(x[1][0])), pos_tags)
            lemmatizer = WordNetLemmatizer()
            for word, tag in lemmatizer_input:
                if word != 'urlLink' and 'http:' not in word:
                    word = word.lower()
                    if tag is None:
                        tok = lemmatizer.lemmatize(word)
                        tok = re.sub(pattern, '', tok)
                        if not tok.isdigit():
                            token_list.append(tok)
                    else:
                        tok = lemmatizer.lemmatize(word, tag)
                        tok = re.sub(pattern, '', tok)
                        if not tok.isdigit():
                            token_list.append(tok)
        else:
            # do not use a tagger if not specified and speed up computation
            lemmatizer_input = tokenizer.tokenize(row['text'])
            lemmatizer = WordNetLemmatizer()
            for word in lemmatizer_input:
                if word != 'urlLink' and 'http:' not in word:
                    tok = lemmatizer.lemmatize(word.lower())
                    tok = re.sub(pattern, '', tok)
                    if not tok.isdigit():
                        token_list.append(tok)
    else:
        # simply tokenize based on whitespace
        token_list = tokenizer.tokenize(row['text'])
    return token_list

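# A minimal usage sketch for extract_tokens above, not part of the original snippet.
# It uses the tag-free path so the external nltk_to_wordnet mapping is not needed.
# Assumption: the wordnet corpus has been downloaded via nltk.download('wordnet').
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer

row = {'text': 'The cats were running quickly, chasing 3 mice!'}
print(extract_tokens(row, lemmatize=True, use_tag=False))
# -> lowercased lemmas with trailing punctuation and pure digits stripped,
#    e.g. ['the', 'cat', 'were', 'running', 'quickly', 'chasing', 'mice']
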
def tweet_clean(tweet):
    # Remove HTML special entities (e.g. &amp;)
    tweet_c1 = re.sub(r'\&\w*;', '', tweet)
    # Remove hyperlinks
    tweet_c2 = re.sub(r'https?:\/\/.*\/\w*', '', tweet_c1)
    # Remove punctuation
    tweet_c3 = re.sub(r'[' + punctuation2.replace('@', '') + ']+', ' ', tweet_c2)
    # Conversion to lowercase
    tweet_c4 = tweet_c3.lower()
    # Remove emoticons
    tweet_c5 = emoji_pattern.sub(r'', tweet_c4)
    # Tokenize with WhitespaceTokenizer to handle hashtags
    tokens = WhitespaceTokenizer().tokenize(tweet_c5)
    # Remove stopwords
    stop_words = set(nltk.corpus.stopwords.words('italian'))
    filt_words = [w for w in tokens if not w in stop_words]
    # Stem words
    stemmer = SnowballStemmer("italian")
    stem_words = [stemmer.stem(w) for w in filt_words]
    return stem_words

def tokenize(documents):
    tokenizer = WhitespaceTokenizer()

    def tokenize_doc(document):
        return tokenizer.tokenize(document)

    """
    Ingests content, converts to lowercase, removes special characters except for _, ?, and %,
    replaces dashes and hyphens. Returns full or unique list of cleaned words in content.

    :param content: String of text to tokenize.
    :param unique: Boolean indicating whether to make the output list of words unique or not.
    :return: list of cleaned and tokenized input content.
    """
    if documents is None:
        return None
    documents = list(map(tokenize_doc, documents))
    # Return an occurrence matrix instead of a frequency matrix
    # if unique is True:
    #     # set() removes duplicates and returns a dict, convert back into list
    #     words = list(set(words))
    return documents

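# A minimal usage sketch for the document-level tokenize above, not part of the
# original snippet: plain whitespace tokenization applied to each document.
from nltk.tokenize import WhitespaceTokenizer

docs = ["first tiny document", "second tiny document"]
print(tokenize(docs))  # -> [['first', 'tiny', 'document'], ['second', 'tiny', 'document']]
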
def tokenize(self, text):
    """
    tokenize text into a list of Token objects

    :param text: text to be tokenized (might contain several sentences)
    :type text: str
    :return: List of Token objects
    :rtype: list(Token)
    """
    tokens = []
    if self.tokenizer_type == "SpaceTokenizer":
        operator = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
        for counter, span in enumerate(operator.span_tokenize(text)):
            new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
            tokens.append(new_token)
    elif self.tokenizer_type == "NLTKWhiteSpaceTokenizer":
        operator = WhitespaceTokenizer()
        for counter, span in enumerate(operator.span_tokenize(text)):
            new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
            tokens.append(new_token)
    elif self.tokenizer_type == "PTBTokenizer":
        ptb_tokens = word_tokenize(text)
        counter = 0
        for token, span in self._penn_treebank_tokens_with_spans(text, ptb_tokens):
            new_token = Token(counter, token, span[0], span[1])
            counter += 1
            tokens.append(new_token)
    return tokens

def tokenize(text):
    """Case-normalize, lemmatize, and tokenize text using nltk.

    This function is used in the machine learning pipeline to vectorize and then
    apply TF-IDF to the text.

    Args:
        text (str): A disaster message.

    Returns:
        processed_tokens (list): list of cleaned tokens in the message.
    """
    # get tokens from text
    tokens = WhitespaceTokenizer().tokenize(text)
    lemmatizer = WordNetLemmatizer()

    # clean tokens
    processed_tokens = []
    for token in tokens:
        token = lemmatizer.lemmatize(token).lower().strip('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
        token = re.sub(r'\[[^.,;:]]*\]', '', token)
        # add token to compiled list if not empty
        if token != '':
            processed_tokens.append(token)
    return processed_tokens

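# A minimal sketch of plugging the message-level tokenize above into a TF-IDF
# vectorizer, as its docstring describes; not part of the original snippet.
# Assumptions: scikit-learn is installed, the wordnet data for WordNetLemmatizer has
# been downloaded via nltk.download('wordnet'), and the messages below are illustrative.
from sklearn.feature_extraction.text import TfidfVectorizer

messages = ["Water is urgently needed at the shelter!", "Roads are blocked, please send help."]
vectorizer = TfidfVectorizer(tokenizer=tokenize, lowercase=False)
tfidf = vectorizer.fit_transform(messages)
print(tfidf.shape)  # one row per message, one column per cleaned token
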
def generate_n_gram(corpus):
    # In this example I'm using a corpus from NLTK - Gutenberg Project
    # Sara Bryant - Stories to Tell to
    path = "./corpora/" + corpus
    reader = PlaintextCorpusReader(path, '.*\.txt', WhitespaceTokenizer())
    sentences = reader.sents()

    # Process text and collect reverse N-grams sentence by sentence.
    # Do not do this word by word or you'll have incoherent N-grams that span sentences.
    def processText(sentence):
        tokens = []
        for word in sentence:
            valid = True
            for c in word:
                if c in string.punctuation and c != "'":
                    valid = False
            if valid:
                tokens.append(word.lower())
        return tokens

    ngrams = []
    for sentence in sentences:
        tokens = processText(sentence)
        ngrams += reverseNgrams(tokens, 3)

    # print string.punctuation
    model = setupModel(ngrams)
    with open(corpus + '.json', 'w') as outfile:
        json.dump(model, outfile)

def MRR(pathToFiles):
    stemmer = PorterStemmer()
    ngrams = {}
    for filename in os.listdir(pathToFiles):
        with open(os.path.join(pathToFiles, filename), encoding="utf8", errors="ignore") as currentFile:
            lines = [line.strip('\n') for line in currentFile]
            ngrams[filename] = {}
            tokens_in_window = []
            for line in lines:
                # tokenize on whitespace
                tokens = WhitespaceTokenizer().tokenize(line)
                tokens_in_window = []
                for token in tokens:
                    token = token.split("_")
                    token = token[0].lower()
                    # stemming using Porter Stemmer
                    token = stemmer.stem(token)
                    if not token:
                        continue
                    tokens_in_window.append(token)
                newNGram = ''
                for currentToken in tokens_in_window:
                    newNGram = '{} {}'.format(newNGram, currentToken)
                newNGram = newNGram.strip()
                if newNGram:
                    ngrams[filename][newNGram] = 0
    return ngrams

def preprocess_txt(doc):
    # lowercase
    doc = doc.lower()
    # remove "{html}" strings
    doc = re.sub('\{html\}', '', doc)
    # remove html tags
    doc = BeautifulSoup(doc, 'html.parser').get_text()
    # remove all paths/urls/--keys
    pattern = re.compile(r'[/\-+\\+]')
    doc_split = [token for token in WhitespaceTokenizer().tokenize(doc)
                 if not pattern.findall(token)]
    doc = " ".join(doc_split)
    # tokenize and remove stop words, punctuation symbols and spaces using spaCy; use lemmas
    doc_spacy = sp(doc)
    doc_tokenized_spacy = [token.lemma_ for token in doc_spacy
                           if not token.is_stop and not token.is_punct and not token.is_space]
    # preprocessing additionally with nltk gives much better results
    doc_nltk = " ".join(doc_tokenized_spacy)
    # tokenize and remove stop words and punctuation symbols using nltk; remove numerics
    doc_tokenized_spacy_nltk = [token for token in nltk.word_tokenize(doc_nltk)
                                if token.isalpha()]
    return doc_tokenized_spacy_nltk

class VNTokenizer(nn.Module):
    def __init__(self, config):
        super(VNTokenizer, self).__init__()
        self.max_emb_words = config.get('max_emb_words')
        self.embedding_dim = config.get('embedding_dim', EMBEDDING_DIM)
        self.char_embedding_dim = config.get('char_embedding_dim', CHAR_EMBEDDING_DIM)
        self.hidden_dim = config.get('hidden_dim', 1200)
        self.num_layers = config.get('num_layers', 3)
        self.dropout_prob = config.get('dropout_prob', .2)
        self.is_cuda = is_cuda if is_cuda is not None else torch.cuda.is_available()
        self.word_encoder = to_gpu(BRNNWordEncoder(self.char_embedding_dim, rnn_type='LSTM'))
        self.dropout = nn.Dropout(self.dropout_prob)

        # 0: reserved index by Keras tokenizer
        # num_words + 1: index for oov token
        self.embedding = nn.Embedding(self.max_emb_words + 2, self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim + self.char_embedding_dim,
                            self.hidden_dim // 2,
                            num_layers=self.num_layers,
                            bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(self.hidden_dim, 1)

        # Set tokenizer
        self.tokenizer = tokenizer
        self.tokenize_fn = WhitespaceTokenizer().tokenize

def nltk_tokenizer(tweets):
    tokenizers = {
        "TreebankWordTokenizer": {"tokens": [TreebankWordTokenizer().tokenize(tweet) for tweet in tweets]},
        "WordPunctTokenizer": {"tokens": [WordPunctTokenizer().tokenize(tweet) for tweet in tweets]},
        "WhitespaceTokenizer": {"tokens": [WhitespaceTokenizer().tokenize(tweet) for tweet in tweets]},
    }
    tokenizers = analyzer(tokenizers, tweets)
    return tokenizers

def __init__(
    self,
    root,
    fileids,
    sep='/',
    word_tokenizer=WhitespaceTokenizer(),
    sent_tokenizer=RegexpTokenizer('\n', gaps=True),
    alignedsent_block_reader=read_alignedsent_block,
    encoding='latin1',
):
    """
    Construct a new Aligned Corpus reader for a set of documents
    located at the given root directory. Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    self._sep = sep
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._alignedsent_block_reader = alignedsent_block_reader

def clean_text(text):
    """Removes punctuation, capitalization, numbers, and stop words, and stems words."""
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    text = contractions.expandContractions(text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)   # remove punctuation
    text = re.sub('\s+', ' ', text)  # collapse whitespace
    text = re.sub('\d+', ' ', text)  # remove numbers
    # letters repeated 3 or more times in a row are reduced to two
    text = re.sub(r'(.)\1\1+', r'\1\1', text)
    text = re.sub(r'(ha)\1\1+', r'haha', text)
    text = re.sub(r'(lo)\1\1+', r'lol', text)
    text = text.strip(' ')
    # stem words
    tokenizer = WhitespaceTokenizer()
    tokenized_comment = tokenizer.tokenize(text)
    filtered_sentence = [w for w in tokenized_comment if not w in stop_words]
    stemmed_comment = [ps.stem(word) for word in filtered_sentence]
    text = " ".join(stemmed_comment)
    return text

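# A minimal usage sketch for clean_text above, not part of the original snippet.
# Assumptions: the nltk stopwords corpus has been downloaded via nltk.download('stopwords')
# and the project-specific contractions module (expandContractions) is importable;
# the sample comment below is illustrative.
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WhitespaceTokenizer

print(clean_text("I can't believe it is sooooo good!!! hahahaha"))
# prints the lowercased, stemmed, stop-word-free version of the comment,
# with elongated words like "sooooo" and "hahahaha" shortened
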