import nltk
from nltk.tokenize import PunktSentenceTokenizer
from termcolor import colored  # assumed source of colored()


def __index_content(url_id, db, soup):
    # note: Python 2 code (uses basestring/unicode)
    title = soup.title.text
    if title is not None:
        if isinstance(title, basestring):
            title.encode('utf8')
        else:
            unicode(title).encode('utf8')
    content = soup.find("div", {"id": "mw-content-text"}).text
    if isinstance(content, basestring):
        content.encode('utf8')
    else:
        unicode(content).encode('utf8')
    # content = soup.text
    custom_tokenizer = PunktSentenceTokenizer()
    tokenized_sentences = custom_tokenizer.tokenize(unicode(content))
    page = dict()
    page["title"] = title
    hints_list = list()
    try:
        for sentence in tokenized_sentences:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            grammar = r"""NP: {<DT|PP\$>?<JJ>*<NN>}
                              {<NNP>+}"""
            chunk_parser = nltk.RegexpParser(grammar)
            chunked = chunk_parser.parse(tagged)
            for chunk in chunked.subtrees():
                if chunk.label() == "NP":
                    line = list()
                    for each in chunk.leaves():
                        if len(each[0]) > 2:
                            line.append(each[0])
                    if len(line) > 0:
                        final_value = (" ".join(line)).lower()
                        hints_list.append(final_value)
        page["hints"] = hints_list
    except Exception as e:
        print(str(e))
    db.known_urls.update_one({"_id": url_id}, {"$set": {"content": page}})
    page_content_size = len(page["hints"])
    print(colored("\t\tUpdated With Indexed Content", "yellow"))
    # current_dir = os.getcwd()
    # files_dir = current_dir + "/Originals/"
    # file_name = url_id
    # file_path = files_dir + str(file_name)
    # created_file = open(file_path, "w")
    # created_file.write(content.encode("utf-8"))
    # created_file.close()
    # print("\t\tOriginal Content Is Saved")
    return page_content_size
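# Standalone sketch of the NP-chunking step used above (the sample sentence is
# made up): it extracts lowercased noun phrases, keeping only words longer than
# two characters. Requires the "punkt" and "averaged_perceptron_tagger" NLTK data.
import nltk

sample_sentence = "The quick experiment produced surprising results in Geneva"
tagged = nltk.pos_tag(nltk.word_tokenize(sample_sentence))
grammar = r"""NP: {<DT|PP\$>?<JJ>*<NN>}
                  {<NNP>+}"""
chunked = nltk.RegexpParser(grammar).parse(tagged)
for chunk in chunked.subtrees():
    if chunk.label() == "NP":
        words = [w for w, tag in chunk.leaves() if len(w) > 2]
        if words:
            print(" ".join(words).lower())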
from nltk.tokenize import BlanklineTokenizer, PunktSentenceTokenizer, WhitespaceTokenizer


def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False
                    if len(lines) >= lines_per_subtitle:
                        end_list.append(lines)
                        lines = []
            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)
    return end_list
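# Usage sketch (the dialogue text is an assumption): each item returned by
# tokenize_english_document() is one subtitle, i.e. a list of at most two lines
# of roughly 38 characters each.
if __name__ == "__main__":
    dialogue = ("Hello there, how are you doing on this fine morning?\n\n"
                "I am doing well, thanks for asking. The weather has been lovely all week.")
    for subtitle in tokenize_english_document(dialogue):
        print(subtitle)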
import re

from nltk.tokenize import PunktSentenceTokenizer


def pre_segment(doc):
    """Set sentence boundaries with nltk instead of spacy."""
    if len(str(doc.text).split()) > 3:
        tokenizer = PunktSentenceTokenizer(doc.text)
        sentences = tokenizer.tokenize(doc.text)
        for nltk_sentence in sentences:
            words = re.findall(r"[\w]+|[^\s\w]", nltk_sentence)
            for i in range(len(doc) - len(words) + 1):
                token_list = [str(token) for token in doc[i:i + len(words)]]
                if token_list == words:
                    doc[i].is_sent_start = True
                    for token in doc[i + 1:i + len(words)]:
                        token.is_sent_start = False
    return doc
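# Usage sketch for the component above, not taken from the source: assumes
# spaCy v2.x (where a plain callable can be added to the pipeline; spaCy v3
# would need @Language.component registration) and that en_core_web_sm is installed.
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(pre_segment, before="parser")  # set nltk-based boundaries before the parser runs
doc = nlp("Mr. Smith went to Washington. He stayed for a week.")
print([sent.text for sent in doc.sents])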
from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer


def sentence_tokenizer(text):
    """
    Tokenizes sentences.
    :param text:
    :return: list of sentences (a sentence is a string)
    """
    punkt_param = PunktParameters()
    # German abbreviations that should not be treated as sentence endings
    punkt_param.abbrev_types = {
        'zzgl', 'prof', 'ca', 'vj', 't', 'mio', 'sro', 'lv', 'io', 'ihv',
        'bzw', 'usw', 'inkl', 'zt', 'vh', 'dr', 'entspr', 'dem', 'fort',
        'co', 'kg', 'zb', 'bspw', 'ua', 'rd', 'abs', 'etc', 'tsd', 'z.b',
        'evtl', '1', '2', '3', '4', '5', '6', '7', '8', '9', '19', '20', '21'
    }
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    return sentence_splitter.tokenize(text)
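# Usage sketch with a made-up German snippet (the sample text is an assumption):
# periods after abbreviations registered above, e.g. "ca." and "bzw.", should not
# be treated as sentence boundaries.
if __name__ == "__main__":
    sample = "Der Umsatz stieg um ca. 5 Prozent. Das Ergebnis blieb bzw. wurde stabil."
    for sentence in sentence_tokenizer(sample):
        print(sentence)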
def _extract_text_from_elements(elements: List[Element], punkt: bool, keep_xml: bool) -> List[str]:
    # List, Element, tostring, innertext, _filter and config are assumed to be
    # imported/defined in the enclosing module
    examples = []
    if keep_xml:
        for e in elements:
            xml_str = tostring(e).decode('utf-8')  # tostring returns bytes
            length = len(innertext(e))
            if length > config.min_char_length:
                examples.append(xml_str)
    else:
        for e in elements:
            text = innertext(e)
            if punkt:
                sentences = PunktSentenceTokenizer().tokenize(text=text)
                filtered_sentences = [s for s in sentences if _filter(s)]
                examples += filtered_sentences
            else:
                if _filter(text):
                    examples.append(text)
    return examples
from bs4 import BeautifulSoup
from nltk.tokenize import PunktSentenceTokenizer


def sentence_split(input_text):
    input_text = "<root>" + input_text + "</root>"
    soup = BeautifulSoup(input_text, "xml")
    paragraphs = []
    for doc in soup.find('root').findAll('DOC'):
        if doc['type'] == 'story':
            headlines = doc('HEADLINE')
            for h in headlines:
                paragraphs.append(h.contents[0])
            p_blocks = doc.find('TEXT').findAll('P')
            for p in p_blocks:
                paragraphs.append(p.contents[0])
        elif doc['type'] == 'multi':
            paragraphs.append(doc.find('TEXT').contents[0])
    sentences = []
    punkt = PunktSentenceTokenizer()
    for parag in paragraphs:
        for sent in punkt.sentences_from_text(parag, realign_boundaries=True):
            # "replace" is a compiled regex assumed to be defined elsewhere in the module
            sentences.append(replace.sub(' ', sent).strip())
    return sentences
import pickle
import string
from os.path import dirname, join

from nltk.tokenize import PunktSentenceTokenizer


def sent_tokenize(text):
    model_path = join(dirname(__file__), 'sent_tokenize_model_v1.0.pkl')
    with open(model_path, 'rb') as fs:
        punkt_param = pickle.load(fs)
    punkt_param.sent_starters = {}
    abbrev_types = [
        'g.m.t', 'e.g', 'dr', 'dr', 'vs', "000", 'mr', 'mrs', 'prof', 'inc',
        'tp', 'ts', 'ths', 'th', 'vs', 'tp', 'k.l', 'a.w.a.k.e', 't', 'a.i',
        '</i', 'g.w', 'ass', 'u.n.c.l.e', 't.e.s.t', 'ths', 'd.c', 've…',
        'ts', 'f.t', 'b.b', 'z.e', 's.g', 'm.p', 'g.u.y', 'l.c', 'g.i',
        'j.f', 'r.r', 'v.i', 'm.h', 'a.s', 'bs', 'c.k', 'aug', 't.d.q',
        'b…', 'ph', 'j.k', 'e.l', 'o.t', 's.a'
    ]
    abbrev_types.extend(string.ascii_uppercase)
    for abbrev_type in abbrev_types:
        punkt_param.abbrev_types.add(abbrev_type)
    for abbrev_type in string.ascii_lowercase:
        punkt_param.abbrev_types.add(abbrev_type)
    tokenizer = PunktSentenceTokenizer(punkt_param)
    sentences = tokenizer.sentences_from_text(text)
    return sentences
import pickle
import string
from os.path import dirname, join

from nltk.tokenize import PunktSentenceTokenizer

sentence_tokenizer = None  # module-level cache used by _load_model()


def _load_model():
    global sentence_tokenizer
    if sentence_tokenizer is not None:
        return
    model_path = join(dirname(__file__), 'st_kiss-strunk-2006_2019_01_13.pkl')
    with open(model_path, 'rb') as fs:
        punkt_param = pickle.load(fs)
    punkt_param.sent_starters = {}
    abbrev_types = [
        'g.m.t', 'e.g', 'dr', 'dr', 'vs', "000", 'mr', 'mrs', 'prof', 'inc',
        'tp', 'ts', 'ths', 'th', 'vs', 'tp', 'k.l', 'a.w.a.k.e', 't', 'a.i',
        '</i', 'g.w', 'ass', 'u.n.c.l.e', 't.e.s.t', 'ths', 'd.c', 've…',
        'ts', 'f.t', 'b.b', 'z.e', 's.g', 'm.p', 'g.u.y', 'l.c', 'g.i',
        'j.f', 'r.r', 'v.i', 'm.h', 'a.s', 'bs', 'c.k', 'aug', 't.d.q',
        'b…', 'ph', 'j.k', 'e.l', 'o.t', 's.a'
    ]
    abbrev_types.extend(string.ascii_uppercase)
    for abbrev_type in abbrev_types:
        punkt_param.abbrev_types.add(abbrev_type)
    for abbrev_type in string.ascii_lowercase:
        punkt_param.abbrev_types.add(abbrev_type)
    sentence_tokenizer = PunktSentenceTokenizer(punkt_param)
for line in lines:
    line_dict = dict(line)
    if "EKYWD" in line_dict and "EABST" in line_dict:
        keywords = re.split("\t", line_dict["EKYWD"])
        all_keywords.update(set(keywords))

sys.stderr.write("Tokenize keywords\n")
keywords = []
for keyword in all_keywords:
    keywords.append(word_tokenize(keyword))

sys.stderr.write("All keywords: " + str(len(all_keywords)) + "\n")

p = PunktSentenceTokenizer()
for i in range(len(lines)):
    if i % 10 == 0:
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = dict(lines[i])
    if not ("EKYWD" in line and "EABST" in line):
        continue
    abstract = line["EABST"]
    abstract = p.tokenize(abstract)
    abstract = [word_tokenize(sentence) for sentence in abstract]
    for sentence in abstract:
        j = 0
        while j < len(sentence):
            found = False
            for k in range(len(keywords)):
def tokenize_to_sentences(self, paragraph):
    tokenizer = PunktSentenceTokenizer()
    sentences = tokenizer.tokenize(paragraph)
    return sentences
def get_sentence_tokenizer():
    # https://textminingonline.com/dive-into-nltk-part-ii-sentence-tokenize-and-word-tokenize
    return PunktSentenceTokenizer()
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
import nltk

example = "Hello Mr. Holmes. How are you doing? The weather is nice Holmes and Python is amazing. I hope you like it too!"
sen_list = sent_tokenize(example)
sen = sen_list[2]
print(sen)

stop_words = set(stopwords.words('english'))

'''words = word_tokenize(sen)
filtered_words = []
for w in words:
    if w not in stop_words:  # tokenizing
        filtered_words.append(w)
print(filtered_words)
'''

tokenize = PunktSentenceTokenizer(sen)
tokenized = tokenize.tokenize(sen)

# Speech tagging
print(tokenized)
for i in tokenized:
    words = word_tokenize(i)
    tagged = nltk.pos_tag(words)

    # Chunking
    '''
    using regex here:
    . matches any single character, ? makes the preceding item optional (0 or 1).
    For further info see the chunking tutorial on pythonprogramming.net
    '''
    # RB, VB, NNP etc. are POS tags (e.g. VB = verb); we are selecting certain types of words into a chunk
    chunkgram = r"""Chunk: {<RB.?>*<VB.?>*<NNP><NN>?} """
    chunkparser = nltk.RegexpParser(chunkgram)
    chunked = chunkparser.parse(tagged)
    print(chunked)
from nltk import PunktSentenceTokenizer, WordPunctTokenizer
from collections import Counter

vocab_size = 1000
sentTokenizer = PunktSentenceTokenizer()
wordTokenizer = WordPunctTokenizer()

filename = 'data/formatted_movie_lines.txt'
string = open(filename, mode='r', encoding='utf8').read()
string = string.replace("'t", "")
string = string.replace("'s", "")

words = wordTokenizer.tokenize(string)
sentences = set(sentTokenizer.tokenize(string))

vocab = Counter(words).most_common(vocab_size)
# keep only the words themselves for fast membership tests
vocab_words = {word for word, count in vocab}

sentences = [wordTokenizer.tokenize(sentence) for sentence in sentences]
new_sentences = []
with open("lines.txt", mode='w', encoding='utf8') as file:
    for sentence in sentences:
        write = True
        for word in sentence:
            if word in vocab_words:
                write = False
                break
        if write:
            file.write(" ".join(sentence) + "\n")
            new_sentences.append(sentence)
def __init__(self):
    super().__init__()
    self.sent_splitter = PunktSentenceTokenizer()
# Representing the words with their Parts of Speech
import nltk
from nltk.corpus import state_union

'''
PunktSentenceTokenizer is an unsupervised, trainable sentence tokenizer.
NLTK ships pretrained Punkt models (used by sent_tokenize), and we can also
train our own on raw text, as below.
'''
from nltk import PunktSentenceTokenizer

train = state_union.raw("2005-GWBush.txt")
text = state_union.raw("2006-GWBush.txt")

SentenceTokenizer = PunktSentenceTokenizer(train)
tokenized = SentenceTokenizer.tokenize(text)


def process():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))


process()
def __init__(self, stop_words, vector_len=1000):
    self.vocab = []
    self.stop_words = stop_words
    self.vector_len = vector_len
    self.tokenizer = PunktSentenceTokenizer()
class McChineseTokenizer(object):
    """Chinese language tokenizer that uses jieba."""

    # Path to jieba dictionary(ies)
    __dict_path = os.path.join(mc_root_path(), 'lib/MediaWords/Languages/resources/zh/')
    __jieba_dict_path = os.path.join(__dict_path, 'dict.txt.big')
    __jieba_userdict_path = os.path.join(__dict_path, 'userdict.txt')

    # jieba instance
    __jieba = None

    # Text -> sentence tokenizer for Chinese text
    __chinese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Chinese text
        discard_empty=True,
    )

    # Text -> sentence tokenizer for non-Chinese (e.g. English) text
    __non_chinese_sentence_tokenizer = PunktSentenceTokenizer()

    def __init__(self):
        """Initialize jieba tokenizer."""
        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__dict_path):
            raise McChineseTokenizerException("""
                jieba dictionary directory was not found: %s
                Maybe you forgot to initialize Git submodules?
            """ % self.__dict_path)

        if not os.path.isfile(self.__jieba_dict_path):
            raise McChineseTokenizerException("""
                Default dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
            """ % self.__dict_path)

        if not os.path.isfile(self.__jieba_userdict_path):
            raise McChineseTokenizerException("""
                User dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
            """ % self.__dict_path)

        try:
            # loading dictionary is part of the init process
            self.__jieba.set_dictionary(os.path.join(self.__jieba_dict_path))
            self.__jieba.load_userdict(os.path.join(self.__jieba_userdict_path))
        except Exception as ex:
            raise McChineseTokenizerException("Unable to initialize jieba: %s" % str(ex))

    def tokenize_text_to_sentences(self, text: str) -> list:
        """Tokenize Chinese text into sentences."""
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text to tokenize into sentences is None.")
            return []

        text = text.strip()
        if len(text) == 0:
            return []

        # First split Chinese text
        chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in chinese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Chinese text
                    non_chinese_sentences = self.__non_chinese_sentence_tokenizer.tokenize(list_item)
                    sentences += non_chinese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]
        return sentences

    def tokenize_sentence_to_words(self, sentence: str) -> list:
        """Tokenize Chinese sentence into words. Removes punctuation."""
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()
        if len(sentence) == 0:
            return []

        parsed_text = self.__jieba.lcut(sentence, cut_all=False)
        parsed_tokens = [x for x in parsed_text if x.strip()]
        words = []
        for parsed_token in parsed_tokens:
            if re.search(r'\w+', parsed_token, flags=re.UNICODE) is not None:
                words.append(parsed_token)
            else:
                pass
        return words
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 11 12:49:39 2020

@author: alex.a.murray
"""
import nltk
from nltk.corpus import state_union
from nltk import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))


process_content()
sentences = sent_tokenize(example_text)

for w in words:
    print(w)
print()

for s in sentences:
    print(s)
print()

# Using PunktSentenceTokenizer and training it
train_text = state_union.raw("2005-GWBush.txt")
custom_sentence_tokenizer_trained = PunktSentenceTokenizer(train_text)
sentences = custom_sentence_tokenizer_trained.tokenize(example_text)
for s in sentences:
    print(s)
print()

# Using PunktSentenceTokenizer with default parameters (no custom training text)
custom_sentence_tokenizer_untrained = PunktSentenceTokenizer()
sentences = custom_sentence_tokenizer_untrained.tokenize(example_text)
for s in sentences:
    print(s)
def __init__(self):
    self.parser = CoreNLPDependencyParser(url=self.corenlp_server())
    self.sentence_tokenizer = PunktSentenceTokenizer()
class McJapaneseTokenizer(object):
    """Japanese language tokenizer that uses MeCab."""

    # Paths where mecab-ipadic-neologd might be located
    __MECAB_DICTIONARY_PATHS = [
        # Ubuntu / Debian
        '/var/lib/mecab/dic/ipadic-neologd',
        # CentOS / Fedora
        '/usr/lib64/mecab/dic/ipadic-neologd/',
        # OS X
        '/usr/local/opt/mecab-ipadic-neologd/lib/mecab/dic/ipadic-neologd/',
    ]

    # MeCab instance
    __mecab = None

    # Text -> sentence tokenizer for Japanese text
    __japanese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Japanese text
        discard_empty=True,
    )

    # Text -> sentence tokenizer for non-Japanese (e.g. English) text
    __non_japanese_sentence_tokenizer = PunktSentenceTokenizer()

    __MECAB_TOKEN_POS_SEPARATOR = random_string(length=16)  # for whatever reason tab doesn't work
    __MECAB_EOS_MARK = 'EOS'

    def __init__(self):
        """Initialize MeCab tokenizer."""
        mecab_dictionary_path = McJapaneseTokenizer._mecab_ipadic_neologd_path()
        try:
            self.__mecab = MeCab.Tagger(
                '--dicdir=%(dictionary_path)s '
                '--node-format=%%m%(token_pos_separator)s%%h\\n '
                '--eos-format=%(eos_mark)s\\n' % {
                    'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR,
                    'eos_mark': self.__MECAB_EOS_MARK,
                    'dictionary_path': mecab_dictionary_path,
                })
        except Exception as ex:
            raise McJapaneseTokenizerException("Unable to initialize MeCab: %s" % str(ex))

    @staticmethod
    def _mecab_ipadic_neologd_path() -> str:  # (protected and not private because used by the unit test)
        """Return path to mecab-ipadic-neologd dictionary installed on system."""
        mecab_dictionary_path = None
        candidate_paths = McJapaneseTokenizer.__MECAB_DICTIONARY_PATHS
        for candidate_path in candidate_paths:
            if os.path.isdir(candidate_path):
                if os.path.isfile(os.path.join(candidate_path, 'sys.dic')):
                    mecab_dictionary_path = candidate_path
                    break
        if mecab_dictionary_path is None:
            raise McJapaneseTokenizerException(
                "mecab-ipadic-neologd was not found in paths: %s" % str(candidate_paths))
        return mecab_dictionary_path

    def tokenize_text_to_sentences(self, text: str) -> list:
        """Tokenize Japanese text into sentences."""
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text to tokenize into sentences is None.")
            return []

        text = text.strip()
        if len(text) == 0:
            return []

        # First split Japanese text
        japanese_sentences = self.__japanese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in japanese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Japanese text
                    non_japanese_sentences = self.__non_japanese_sentence_tokenizer.tokenize(list_item)
                    sentences += non_japanese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]
        return sentences

    @staticmethod
    def _mecab_allowed_pos_ids() -> Dict[int, str]:
        """Return allowed MeCab part-of-speech IDs and their definitions from pos-id.def.

        Definitions don't do much in the language module itself, they're used by unit tests to verify that
        pos-id.def didn't change in some unexpected way and we're not missing out on newly defined POSes.
        """
        return {
            36: '名詞,サ変接続,*,*',  # noun-verbal
            38: '名詞,一般,*,*',  # noun
            40: '名詞,形容動詞語幹,*,*',  # adjectival nouns or quasi-adjectives
            41: '名詞,固有名詞,一般,*',  # proper nouns
            42: '名詞,固有名詞,人名,一般',  # proper noun, names of people
            43: '名詞,固有名詞,人名,姓',  # proper noun, family name (surname)
            44: '名詞,固有名詞,人名,名',  # proper noun, given name
            45: '名詞,固有名詞,組織,*',  # proper noun, organization
            46: '名詞,固有名詞,地域,一般',  # proper noun in general
            47: '名詞,固有名詞,地域,国',  # proper noun, country name
        }

    def tokenize_sentence_to_words(self, sentence: str) -> list:
        """Tokenize Japanese sentence into words.

        Removes punctuation and words that don't belong to part-of-speech whitelist."""
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()
        if len(sentence) == 0:
            return []

        parsed_text = self.__mecab.parse(sentence).strip()
        parsed_tokens = parsed_text.split("\n")

        allowed_pos_ids = self._mecab_allowed_pos_ids()

        words = []
        for parsed_token_line in parsed_tokens:
            if self.__MECAB_TOKEN_POS_SEPARATOR in parsed_token_line:

                primary_form_and_pos_number = parsed_token_line.split(self.__MECAB_TOKEN_POS_SEPARATOR)

                primary_form = primary_form_and_pos_number[0]
                pos_number = primary_form_and_pos_number[1]

                if pos_number.isdigit():
                    pos_number = int(pos_number)
                    if pos_number in allowed_pos_ids:
                        words.append(primary_form)

            else:
                # Ignore all the "EOS" stuff
                pass

        return words
import re

from nltk.tokenize import RegexpTokenizer, PunktSentenceTokenizer

PATTERN = re.compile(r'''(?x)        # set flag to allow verbose regexps: ignores spaces and newlines
      (?:[A-Z]\.)+                   # abbreviations, e.g. U.S.A.
    | \$?\d+(?:\.\d+)?%?             # currency and percentages, e.g. $12.40, 82%
    | '(?:s|nt)\b                    # 's, 'nt
    # | \w+(?:-\w+)*                 # words with optional internal hyphens
    | [a-zA-Z0-9]+                   # words without internal hyphens, e.g. Type1_gene (Attention: \w contains _)
    | \.\.\.                         # ellipsis
    | [.,;"?:]                       # these are separate tokens
    | [][()_`\|\n-]+                 # these tokens are grouped; includes ] and [ and -
    | [^a-zA-Z0-9_\s]                # find these characters in the AGAC training data
''')

# default tokenizers
word_tokenizer = RegexpTokenizer(PATTERN)
sent_tokenizer = PunktSentenceTokenizer()


class Text(object):
    """
    Abstract text, e.g. a paragraph, abstract or an essay.
    """

    def __init__(self, text, s_tokenizer=sent_tokenizer, w_tokenizer=word_tokenizer):
        """
        :param text: str
        :param s_tokenizer: sentence_tokenizer
        :param w_tokenizer: word_tokenizer
        """
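# Usage sketch for the module-level tokenizers above (the sample sentence is an
# assumption): the verbose pattern keeps abbreviations, prices and percentages
# as single tokens while splitting ordinary punctuation.
if __name__ == "__main__":
    sample = "The U.S.A. spent $12.40 (82%) on Type1_gene research... Really?"
    print(sent_tokenizer.tokenize(sample))
    print(word_tokenizer.tokenize(sample))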
import pickle
import string
from os.path import dirname, join

from nltk.tokenize import PunktSentenceTokenizer

sentence_tokenizer = None  # module-level cache used by _load_model()


def _load_model():
    global sentence_tokenizer
    if sentence_tokenizer is not None:
        return
    model_path = join(dirname(__file__), "st_kiss-strunk-2006_2019_01_13.pkl")
    with open(model_path, "rb") as fs:
        punkt_param = pickle.load(fs)
    punkt_param.sent_starters = {}
    abbrev_types = [
        "g.m.t", "e.g", "dr", "dr", "vs", "000", "mr", "mrs", "prof", "inc",
        "tp", "tp.", "ts", "ths", "th", "vs", "tp", "k.l", "a.w.a.k.e", "t",
        "a.i", "</i", "g.w", "ass", "u.n.c.l.e", "t.e.s.t", "ths", "d.c",
        "ve…", "ts", "f.t", "b.b", "z.e", "s.g", "m.p", "g.u.y", "l.c",
        "g.i", "j.f", "r.r", "v.i", "m.h", "a.s", "bs", "c.k", "aug",
        "t.d.q", "b…", "ph", "j.k", "e.l", "o.t", "s.a",
    ]
    abbrev_types.extend(string.ascii_uppercase)
    for abbrev_type in abbrev_types:
        punkt_param.abbrev_types.add(abbrev_type)
    for abbrev_type in string.ascii_lowercase:
        punkt_param.abbrev_types.add(abbrev_type)
    sentence_tokenizer = PunktSentenceTokenizer(punkt_param)