def __index_content(url_id, db, soup):
    title = soup.title.text if soup.title is not None else ""
    content = soup.find("div", {"id": "mw-content-text"}).text
    # content = soup.text
    custom_tokenizer = PunktSentenceTokenizer()
    tokenized_sentences = custom_tokenizer.tokenize(content)

    page = dict()
    page["title"] = title
    hints_list = list()
    try:
        for sentence in tokenized_sentences:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            # Chunk noun phrases: optional determiner/possessive plus adjectives and a noun,
            # or a run of proper nouns.
            grammar = r"""NP: {<DT|PP\$>?<JJ>*<NN>}
                              {<NNP>+}"""
            chunk_parser = nltk.RegexpParser(grammar)
            chunked = chunk_parser.parse(tagged)
            for chunk in chunked.subtrees():
                if chunk.label() == "NP":
                    line = [leaf[0] for leaf in chunk.leaves() if len(leaf[0]) > 2]
                    if line:
                        hints_list.append(" ".join(line).lower())
    except Exception as e:
        print(str(e))
    page["hints"] = hints_list

    db.known_urls.update_one({"_id": url_id}, {"$set": {"content": page}})
    page_content_size = len(page["hints"])
    print(colored("\t\tUpdated With Indexed Content", "yellow"))

    # current_dir = os.getcwd()
    # files_dir = current_dir + "/Originals/"
    # file_name = url_id
    # file_path = files_dir + str(file_name)
    # created_file = open(file_path, "w")
    # created_file.write(content.encode("utf-8"))
    # created_file.close()
    # print("\t\tOriginal Content Is Saved")

    return page_content_size
from nltk.tokenize import BlanklineTokenizer, PunktSentenceTokenizer, WhitespaceTokenizer


def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False
                    if len(lines) >= lines_per_subtitle:
                        end_list.append(lines)
                        lines = []
            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)
    return end_list
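# Minimal usage sketch for the subtitle tokenizer above (the dialogue text is
# illustrative only): each returned element is one subtitle block, i.e. a list of
# at most two lines of roughly 38 characters each.
dialogue = (
    "Hello there. I was hoping you could tell me the way to the station.\n\n"
    "Of course. Take the second left and keep going until you reach the bridge."
)
for subtitle in tokenize_english_document(dialogue):
    print(subtitle)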
def pre_segment(doc):
    """Set sentence boundaries with nltk instead of spacy."""
    if len(str(doc.text).split()) > 3:
        tokenizer = PunktSentenceTokenizer(doc.text)
        sentences = tokenizer.tokenize(doc.text)
        for nltk_sentence in sentences:
            words = re.findall(r"[\w]+|[^\s\w]", nltk_sentence)
            for i in range(len(doc) - len(words) + 1):
                token_list = [str(token) for token in doc[i:i + len(words)]]
                if token_list == words:
                    doc[i].is_sent_start = True
                    for token in doc[i + 1:i + len(words)]:
                        token.is_sent_start = False
    return doc
def get_nltk_sents(txt: str, tokenizer: nltk.PunktSentenceTokenizer,
                   extra_abbreviations: Set[str] = None) -> List[str]:
    if extra_abbreviations is not None:
        tokenizer._params.abbrev_types.update(extra_abbreviations)
    return tokenizer.tokenize(txt)
def sentence_tokenizer(text):
    """
    Tokenizes sentences.

    :param text:
    :return: list of sentences (a sentence is a string)
    """
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = {
        'zzgl', 'prof', 'ca', 'vj', 't', 'mio', 'sro', 'lv', 'io', 'ihv',
        'bzw', 'usw', 'inkl', 'zt', 'vh', 'dr', 'entspr', 'dem', 'fort',
        'co', 'kg', 'zb', 'bspw', 'ua', 'rd', 'abs', 'etc', 'tsd', 'z.b',
        'evtl', '1', '2', '3', '4', '5', '6', '7', '8', '9', '19', '20', '21'
    }
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    return sentence_splitter.tokenize(text)
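# Minimal usage sketch for the German tokenizer above (the sample sentence is
# illustrative only; assumes PunktParameters and PunktSentenceTokenizer are
# imported from nltk.tokenize.punkt). Because "ca" and "mio" are listed as
# abbreviations, the periods after them do not end the sentence.
text = "Der Umsatz stieg um ca. 5 Mio. Euro. Das Ergebnis blieb stabil."
print(sentence_tokenizer(text))
# Expected: two sentences, split only after "Euro."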
class LanguageModel:
    """
    N-gram model
    """

    def __init__(self, n_gram=2, missed_value=0.99):
        """
        :param n_gram: length of n-gram
        :param missed_value: default value for all unseen n-gram
        """
        self.n = n_gram
        self.n_grams = {}
        self.context = {}
        self.sentence_tokenizer = SentenceTokenizer()
        self.tokenizer = Tokenizer()
        self.missed_value = missed_value

    def build_model(self, text):
        sentences = self.sentence_tokenizer.tokenize(text)
        words = [
            list(
                filter(
                    lambda s: s.isalpha(),
                    self.tokenizer.tokenize(sentence.strip())
                )
            )
            for sentence in sentences
        ]
        for sentence in words:
            if len(sentence) < self.n:
                key = " ".join(sentence)
                self.context.update({key: self.context.get(key, 0) + 1})
            else:
                for i in range(len(sentence) - self.n + 1):
                    context_key = " ".join(sentence[i:i + self.n - 1])
                    n_gram_key = " ".join(sentence[i:i + self.n])
                    self.context.update({context_key: self.context.get(context_key, 0) + 1})
                    self.n_grams.update({n_gram_key: self.n_grams.get(n_gram_key, 0) + 1})

    def calculate_proba(self, sentence):
        words = list(
            filter(
                lambda s: s.isalpha(),
                self.tokenizer.tokenize(sentence.strip())
            )
        )
        result = 1
        for i in range(min(self.n - 2, len(words) - 1), len(words)):
            if i < self.n - 1:
                size = sum([val for key, val in self.context.items() if len(key.split(" ")) == i + 1])
                result *= self.context.get(" ".join(words[:i + 1]),
                                           self.missed_value if i == self.n - 2 else 0) / size
            elif i > self.n - 2:
                context_key = " ".join(words[i - self.n + 1:i])
                n_gram_key = " ".join(words[i - self.n + 1:i + 1])
                context_val = self.context.get(context_key, self.missed_value)
                n_gram_val = self.n_grams.get(n_gram_key, self.missed_value)
                p = n_gram_val / context_val
                result *= p
        return result
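# Minimal usage sketch for the LanguageModel above, assuming SentenceTokenizer and
# Tokenizer are wrappers around NLTK's sentence and word tokenizers (they are not
# defined in this snippet). The training text is illustrative only.
lm = LanguageModel(n_gram=2)
lm.build_model("The cat sat on the mat. The dog sat on the rug.")
print(lm.calculate_proba("The cat sat on the rug."))  # relative bigram probability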
def handle(self, *app_labels, **options):
    print(app_labels)
    print(options)
    for article in BwogArticle.objects.all():
        sentence_tokenizer = PunktSentenceTokenizer()
        sentences = sentence_tokenizer.tokenize(article.body)
        for sentence_index in range(len(sentences)):
            sentence = sentences[sentence_index]
            sentence_words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(sentence_words)
            for tup_index in range(len(tagged)):
                tup = tagged[tup_index]
                article_word = tup[0]
                article_tag = tup[1]
                p = ParsedItem(content_object=article, word=article_word, tag=article_tag,
                               sentence_sequence=sentence_index, word_sequence=tup_index)
                p.save()
                print(p)
class NLTKSentenceSegmenter(PackProcessor):
    r"""A wrapper of NLTK sentence tokenizer.
    """

    def __init__(self):
        super().__init__()
        self.sent_splitter = PunktSentenceTokenizer()

    def _process(self, input_pack: DataPack):
        for begin, end in self.sent_splitter.span_tokenize(input_pack.text):
            Sentence(input_pack, begin, end)
def name_ent_recog(post):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = post
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    namedEnt = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt.append(nltk.ne_chunk(tagged))
            # chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP.?>*<NN>?}"""
            # # chunkGram = r"""Chunk: {<.*>+}
            # #                        }<VB.?|IN|DT>+{"""
            # chunkParser = nltk.RegexpParser(chunkGram)
            # chunked = chunkParser.parse(tagged)
            # print(chunked)
            # # print(tagged)
    except Exception as e:
        print(str(e))
    return namedEnt
def __init__(self, n_gram=2, missed_value=0.99):
    """
    :param n_gram: length of n-gram
    :param missed_value: default value for all unseen n-gram
    """
    self.n = n_gram
    self.n_grams = {}
    self.context = {}
    self.sentence_tokenizer = SentenceTokenizer()
    self.tokenizer = Tokenizer()
    self.missed_value = missed_value
def sentence_split(input_text):
    input_text = "<root>" + input_text + "</root>"
    soup = BeautifulSoup(input_text, "xml")
    paragraphs = []
    for doc in soup.find('root').findAll('DOC'):
        if doc['type'] == 'story':
            headlines = doc('HEADLINE')
            for h in headlines:
                paragraphs.append(h.contents[0])
            p_blocks = doc.find('TEXT').findAll('P')
            for p in p_blocks:
                paragraphs.append(p.contents[0])
        elif doc['type'] == 'multi':
            paragraphs.append(doc.find('TEXT').contents[0])

    sentences = []
    punkt = PunktSentenceTokenizer()
    for parag in paragraphs:
        for sent in punkt.sentences_from_text(parag, realign_boundaries=True):
            # `replace` is a module-level compiled regex (defined elsewhere in this
            # module), presumably used here to normalise whitespace.
            sentences.append(replace.sub(' ', sent).strip())
    return sentences
def sent_tokenize(text):
    model_path = join(dirname(__file__), 'sent_tokenize_model_v1.0.pkl')
    with open(model_path, 'rb') as fs:
        punkt_param = pickle.load(fs)

    punkt_param.sent_starters = {}
    abbrev_types = [
        'g.m.t', 'e.g', 'dr', 'dr', 'vs', "000", 'mr', 'mrs', 'prof', 'inc',
        'tp', 'ts', 'ths', 'th', 'vs', 'tp', 'k.l', 'a.w.a.k.e', 't', 'a.i',
        '</i', 'g.w', 'ass', 'u.n.c.l.e', 't.e.s.t', 'ths', 'd.c', 've…',
        'ts', 'f.t', 'b.b', 'z.e', 's.g', 'm.p', 'g.u.y', 'l.c', 'g.i',
        'j.f', 'r.r', 'v.i', 'm.h', 'a.s', 'bs', 'c.k', 'aug', 't.d.q',
        'b…', 'ph', 'j.k', 'e.l', 'o.t', 's.a'
    ]
    abbrev_types.extend(string.ascii_uppercase)
    for abbrev_type in abbrev_types:
        punkt_param.abbrev_types.add(abbrev_type)
    for abbrev_type in string.ascii_lowercase:
        punkt_param.abbrev_types.add(abbrev_type)

    tokenizer = PunktSentenceTokenizer(punkt_param)
    sentences = tokenizer.sentences_from_text(text)
    return sentences
class SentenceToVec(BaseEstimator, TransformerMixin):
    def __init__(self, stop_words, vector_len=1000):
        self.vocab = []
        self.stop_words = stop_words
        self.vector_len = vector_len
        self.tokenizer = PunktSentenceTokenizer()

    def format_word(self, word):
        if word.isdigit():
            return "0"
        elif word in self.stop_words:
            return ""
        else:
            return word.strip()

    def tokenize(self, sentence):
        res_tokens = []
        tokens_temp = self.tokenizer.tokenize(sentence)
        for tokens in tokens_temp:
            tokens = nltk.word_tokenize(tokens)
            tokens = [self.format_word(t) for t in tokens]
            res_tokens += [t for t in tokens if t]
        return res_tokens

    def fit(self, X, y=None):
        self.vocab = []
        word_freq = Counter()
        for i in range(X.shape[0]):
            for w in self.tokenize(X[i]):
                if w not in self.stop_words:
                    word_freq[w] += 1
        for term, freq in word_freq.most_common():
            if len(self.vocab) < self.vector_len:
                self.vocab.append(term)
        return self

    def _vectorize(self, words):
        freq = dict(Counter(words))
        vector = []
        for v in self.vocab:
            vector.append(freq[v] if v in words else 0)
        return np.array(vector)

    def transform(self, X, copy=True):
        _X = np.zeros((X.shape[0], len(self.vocab)))
        for i in range(X.shape[0]):
            _X[i] = self._vectorize(self.tokenize(X[i]))
        return _X
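# Minimal usage sketch for SentenceToVec above (the corpus and stop-word set are
# illustrative only; assumes the imports used by the class, i.e. nltk, numpy as np,
# Counter and scikit-learn, are available). fit() learns the most frequent terms,
# transform() returns a bag-of-words count matrix with one column per vocabulary term.
import numpy as np
from nltk.corpus import stopwords

corpus = np.array([
    "The cat sat on the mat.",
    "Dogs chase cats. Cats chase mice.",
])
vectorizer = SentenceToVec(stop_words=set(stopwords.words("english")), vector_len=50)
features = vectorizer.fit(corpus).transform(corpus)
print(features.shape)  # (2, number of vocabulary terms kept)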
class CoreNLP:
    def __init__(self):
        self.parser = CoreNLPDependencyParser(url=self.corenlp_server())
        self.sentence_tokenizer = PunktSentenceTokenizer()

    @staticmethod
    def corenlp_server():
        return getenv('CORENLP_SERVER')

    def dep_parse(self, text: str, conll_version=10) -> str:
        """Get a CoreNLP depparse,lemma"""
        def get_conll(t):
            deps, = self.parser.raw_parse(t)
            return deps.to_conll(conll_version)  # xrenner requires conll10

        sentences = self.sentence_tokenizer.sentences_from_text(text)
        return '\n'.join(map(get_conll, sentences))
def _extract_text_from_elements(elements: List[Element], punkt: bool, keep_xml: bool) -> List[str]:
    examples = []
    if keep_xml:
        for e in elements:
            xml_str = tostring(e).decode('utf-8')  # tostring returns bytes
            length = len(innertext(e))
            if length > config.min_char_length:
                examples.append(xml_str)
    else:
        for e in elements:
            text = innertext(e)
            if punkt:
                sentences = PunktSentenceTokenizer().tokenize(text=text)
                filtered_sentences = [s for s in sentences if _filter(s)]
                examples += filtered_sentences
            else:
                if _filter(text):
                    examples.append(text)
    return examples
def _load_model():
    global sentence_tokenizer
    if sentence_tokenizer is not None:
        return
    model_path = join(dirname(__file__), 'st_kiss-strunk-2006_2019_01_13.pkl')
    with open(model_path, 'rb') as fs:
        punkt_param = pickle.load(fs)

    punkt_param.sent_starters = {}
    abbrev_types = [
        'g.m.t', 'e.g', 'dr', 'dr', 'vs', "000", 'mr', 'mrs', 'prof', 'inc',
        'tp', 'ts', 'ths', 'th', 'vs', 'tp', 'k.l', 'a.w.a.k.e', 't', 'a.i',
        '</i', 'g.w', 'ass', 'u.n.c.l.e', 't.e.s.t', 'ths', 'd.c', 've…',
        'ts', 'f.t', 'b.b', 'z.e', 's.g', 'm.p', 'g.u.y', 'l.c', 'g.i',
        'j.f', 'r.r', 'v.i', 'm.h', 'a.s', 'bs', 'c.k', 'aug', 't.d.q',
        'b…', 'ph', 'j.k', 'e.l', 'o.t', 's.a'
    ]
    abbrev_types.extend(string.ascii_uppercase)
    for abbrev_type in abbrev_types:
        punkt_param.abbrev_types.add(abbrev_type)
    for abbrev_type in string.ascii_lowercase:
        punkt_param.abbrev_types.add(abbrev_type)

    sentence_tokenizer = PunktSentenceTokenizer(punkt_param)
class NLTKSentenceSegmenter(PackProcessor):
    r"""A wrapper of NLTK sentence tokenizer."""

    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        nltk.download("punkt")

    def __init__(self):
        super().__init__()
        self.sent_splitter = PunktSentenceTokenizer()

    def _process(self, input_pack: DataPack):
        for begin, end in self.sent_splitter.span_tokenize(input_pack.text):
            Sentence(input_pack, begin, end)

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Method to add output type record of `NLTKSentenceSegmenter`, which is
        `ft.onto.base_ontology.Sentence`, to :attr:`forte.data.data_pack.Meta.record`.

        Args:
            record_meta: the field in the datapack for type record that need to
                fill in for consistency checking.
        """
        record_meta["ft.onto.base_ontology.Sentence"] = set()
from string import punctuation
from nltk import PunktSentenceTokenizer, PorterStemmer
from textblob import TextBlob
from bs4 import BeautifulSoup as BS
import multiprocessing
from textwrap import dedent
from itertools import zip_longest
from itertools import chain, combinations_with_replacement
import urllib.request

# for punctuation
pp = punctuation
T = str.maketrans(pp, ' ' * len(pp))
tknr = PunktSentenceTokenizer()

# download training data from this dude's github repo
url = "https://raw.githubusercontent.com/rhasan/nltk-try/532e51035b509c10b08bef4666307a37ca5409ec/ngram/simple_wikipedia_plaintext.txt"
req = urllib.request.Request(url)
raw = urllib.request.urlopen(req).read().decode('utf-8').split('\n')
# split each line into rough sentences on '.', then strip and lower-case them
raw = list(chain(*[[s.strip().lower() for s in x.split('.')] for x in raw if x != '']))
raw = [removeNonAscii(x) for x in raw]  # removeNonAscii is assumed to be defined elsewhere
raw = [x for x in raw if len(x) > 1]
with open('train_sentences.txt', 'w') as f:
    for line in raw:
        f.write(line + '\n')
del raw
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 11 12:49:39 2020

@author: alex.a.murray
"""
import nltk
from nltk.corpus import state_union
from nltk import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))


process_content()
def tokenize_to_sentences(self, paragraph):
    tokenizer = PunktSentenceTokenizer()
    sentences = tokenizer.tokenize(paragraph)
    return sentences
import sys
import re
from nltk import word_tokenize, pos_tag, PunktSentenceTokenizer, tag

if len(sys.argv) < 2:
    raw = sys.stdin.read()
else:
    f = open(sys.argv[1])
    raw = f.read()

# `lib` is a project-local module providing the SGML reader.
lines = lib.get_dat_sgml(raw)
sys.stderr.write(str(len(lines)) + " entries\n")

p = PunktSentenceTokenizer()
for i in range(len(lines)):
    if i % 100 == 0:
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = dict(lines[i])
    if not ("EKYWD" in line and "EABST" in line):
        continue
    abstract = line["EABST"]
    abstract = p.tokenize(abstract)
    abstract = [word_tokenize(sentence) for sentence in abstract]
    keywords = re.split("\t", line["EKYWD"])
    keywords = [word_tokenize(keyword) for keyword in keywords]
    for sentence in abstract:
        pos_sentence = pos_tag(sentence)
        # simplify_wsj_tag is only available in NLTK 2.x; it was removed in NLTK 3.
        pos_sentence = [(word, tag.simplify.simplify_wsj_tag(t)) for word, t in pos_sentence]
for line in lines:
    line_dict = dict(line)
    if "EKYWD" in line_dict and "EABST" in line_dict:
        keywords = re.split("\t", line_dict["EKYWD"])
        all_keywords.update(set(keywords))

sys.stderr.write("Tokenize keywords\n")
keywords = []
for keyword in all_keywords:
    keywords.append(word_tokenize(keyword))
sys.stderr.write("All keywords: " + str(len(all_keywords)) + "\n")

p = PunktSentenceTokenizer()
for i in range(len(lines)):
    if i % 10 == 0:
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = dict(lines[i])
    if not ("EKYWD" in line and "EABST" in line):
        continue
    abstract = line["EABST"]
    abstract = p.tokenize(abstract)
    abstract = [word_tokenize(sentence) for sentence in abstract]
    for sentence in abstract:
        j = 0
        while j < len(sentence):
            found = False
            for k in range(len(keywords)):
# Representing the words with their Parts of Speech
import nltk
from nltk.corpus import state_union
'''
PunktSentenceTokenizer is an unsupervised ML sentence tokenizer.
It comes pretrained and we can also further train it.
'''
from nltk import PunktSentenceTokenizer

train = state_union.raw("2005-GWBush.txt")
text = state_union.raw("2006-GWBush.txt")

SentenceTokenizer = PunktSentenceTokenizer(train)
tokenized = SentenceTokenizer.tokenize(text)


def process():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))


process()
class McChineseTokenizer(object):
    """Chinese language tokenizer that uses jieba."""

    # Path to jieba dictionary(ies)
    __dict_path = os.path.join(mc_root_path(), 'lib/MediaWords/Languages/resources/zh/')
    __jieba_dict_path = os.path.join(__dict_path, 'dict.txt.big')
    __jieba_userdict_path = os.path.join(__dict_path, 'userdict.txt')

    # jieba instance
    __jieba = None

    # Text -> sentence tokenizer for Chinese text
    __chinese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Chinese text
        discard_empty=True,
    )

    # Text -> sentence tokenizer for non-Chinese (e.g. English) text
    __non_chinese_sentence_tokenizer = PunktSentenceTokenizer()

    def __init__(self):
        """Initialize jieba tokenizer."""
        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__dict_path):
            raise McChineseTokenizerException("""
                jieba dictionary directory was not found: %s
                Maybe you forgot to initialize Git submodules?
                """ % self.__dict_path)

        if not os.path.isfile(self.__jieba_dict_path):
            raise McChineseTokenizerException("""
                Default dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
                """ % self.__dict_path)

        if not os.path.isfile(self.__jieba_userdict_path):
            raise McChineseTokenizerException("""
                User dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
                """ % self.__dict_path)

        try:
            # loading dictionary is part of the init process
            self.__jieba.set_dictionary(os.path.join(self.__jieba_dict_path))
            self.__jieba.load_userdict(os.path.join(self.__jieba_userdict_path))
        except Exception as ex:
            raise McChineseTokenizerException("Unable to initialize jieba: %s" % str(ex))

    def tokenize_text_to_sentences(self, text: str) -> list:
        """Tokenize Chinese text into sentences."""
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text to tokenize into sentences is None.")
            return []

        text = text.strip()
        if len(text) == 0:
            return []

        # First split Chinese text
        chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in chinese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split("\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split("\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Chinese text
                    non_chinese_sentences = self.__non_chinese_sentence_tokenizer.tokenize(list_item)
                    sentences += non_chinese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]
        return sentences

    def tokenize_sentence_to_words(self, sentence: str) -> list:
        """Tokenize Chinese sentence into words. Removes punctuation."""
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()
        if len(sentence) == 0:
            return []

        parsed_text = self.__jieba.lcut(sentence, cut_all=False)
        parsed_tokens = [x for x in parsed_text if x.strip()]
        words = []
        for parsed_token in parsed_tokens:
            if re.search(r'\w+', parsed_token, flags=re.UNICODE) is not None:
                words.append(parsed_token)
            else:
                pass
        return words
def get_sentence_tokenizer():
    # https://textminingonline.com/dive-into-nltk-part-ii-sentence-tokenize-and-word-tokenize
    return PunktSentenceTokenizer()
def __init__(self):
    super().__init__()
    self.sent_splitter = PunktSentenceTokenizer()
def __init__(self):
    self.parser = CoreNLPDependencyParser(url=self.corenlp_server())
    self.sentence_tokenizer = PunktSentenceTokenizer()
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer

example = "Hello Mr. Holmes. How are you doing? The weather is nice Holmes and Python is amazing. I hope you like it too!"
sen_list = sent_tokenize(example)
sen = sen_list[2]
print(sen)

stop_words = set(stopwords.words('english'))
'''
words = word_tokenize(sen)
filtered_words = []
for w in words:
    if w not in stop_words:
        filtered_words.append(w)
print(filtered_words)
'''

tokenize = PunktSentenceTokenizer(sen)
tokenized = tokenize.tokenize(sen)

# Speech tagging
print(tokenized)
for i in tokenized:
    words = word_tokenize(i)
    tagged = nltk.pos_tag(words)

# Chunking
'''
using regex here:
. matches any character
? makes the preceding tag optional (zero or one occurrence)
for further info see the tutorial on pythonprogramming.net
'''
# RB, VB, NNP etc. are tags (e.g. VB = verb); here we are selecting certain
# types of words to form a chunk
chunkgram = r"""Chunk: {<RB.?>*<VB.?>*<NNP><NN>?}"""
chunkparser = nltk.RegexpParser(chunkgram)
chunked = chunkparser.parse(tagged)
print(chunked)
sentences = sent_tokenize(example_text)

for w in words:
    print(w)
print()
for s in sentences:
    print(s)
print()

# Using PunktSentenceTokenizer and training it
train_text = state_union.raw("2005-GWBush.txt")
custom_sentence_tokenizer_trained = PunktSentenceTokenizer(train_text)
sentences = custom_sentence_tokenizer_trained.tokenize(example_text)
for s in sentences:
    print(s)
print()

# Using PunktSentenceTokenizer with no training (it comes pretrained)
custom_sentence_tokenizer_untrained = PunktSentenceTokenizer()
sentences = custom_sentence_tokenizer_untrained.tokenize(example_text)
for s in sentences:
    print(s)
import nltk
from nltk import PunktSentenceTokenizer
from nltk.corpus import state_union

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

# Train the Punkt tokenizer on the 2005 address by passing the text to the constructor.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            named_entity = nltk.ne_chunk(tagged, binary=True)
            named_entity.draw()
    except Exception as e:
        print(str(e))
from nltk import PunktSentenceTokenizer, WordPunctTokenizer
from collections import Counter

vocab_size = 1000
sentTokenizer = PunktSentenceTokenizer()
wordTokenizer = WordPunctTokenizer()

filename = 'data/formatted_movie_lines.txt'
string = open(filename, mode='r', encoding='utf8').read()
string = string.replace("'t", "")
string = string.replace("'s", "")

words = wordTokenizer.tokenize(string)
sentences = set(sentTokenizer.tokenize(string))

# Keep the `vocab_size` most common words as a {word: count} mapping.
vocab = Counter(words).most_common(vocab_size)
word_dict = dict(vocab)

sentences = [wordTokenizer.tokenize(sentence) for sentence in sentences]
new_sentences = []
with open("lines.txt", mode='w', encoding='utf8') as file:
    for sentence in sentences:
        write = True
        for word in sentence:
            if word in word_dict.keys():
                write = False
                break
        if write:
            file.writelines(" ".join(sentence) + "\n")
            new_sentences.append(sentence)
import re
from nltk.tokenize import RegexpTokenizer, PunktSentenceTokenizer

PATTERN = re.compile(r'''(?x)          # set flag to allow verbose regexps: ignores spaces and newlines
      (?:[A-Z]\.)+                     # abbreviations, e.g. U.S.A.
    | \$?\d+(?:\.\d+)?%?               # currency and percentages, e.g. $12.40, 82%
    | '(?:s|nt)\b                      # 's, 'nt
    # | \w+(?:-\w+)*                   # words with optional internal hyphens
    | [a-zA-Z0-9]+                     # words without internal hyphens, e.g. Type1_gene (Attention: \w contains _)
    | \.\.\.                           # ellipsis
    | [.,;"?:]                         # these are separate tokens
    | [][()_`\|\n-]+                   # these tokens are grouped; includes ] and [ and -
    | [^a-zA-Z0-9_\s]                  # find these characters in the AGAC training data.
    ''')

# default tokenizers
word_tokenizer = RegexpTokenizer(PATTERN)
sent_tokenizer = PunktSentenceTokenizer()


class Text(object):
    """
    Abstract text, e.g. a paragraph, an abstract or an essay.
    """

    def __init__(self, text, s_tokenizer=sent_tokenizer, w_tokenizer=word_tokenizer):
        """
        :param text: str
        :param s_tokenizer: sentence_tokenizer
        :param w_tokenizer: word_tokenizer
        """
class McJapaneseTokenizer(object):
    """Japanese language tokenizer that uses MeCab."""

    # Paths where mecab-ipadic-neologd might be located
    __MECAB_DICTIONARY_PATHS = [
        # Ubuntu / Debian
        '/var/lib/mecab/dic/ipadic-neologd',

        # CentOS / Fedora
        '/usr/lib64/mecab/dic/ipadic-neologd/',

        # OS X
        '/usr/local/opt/mecab-ipadic-neologd/lib/mecab/dic/ipadic-neologd/',
    ]

    # MeCab instance
    __mecab = None

    # Text -> sentence tokenizer for Japanese text
    __japanese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Japanese text
        discard_empty=True,
    )

    # Text -> sentence tokenizer for non-Japanese (e.g. English) text
    __non_japanese_sentence_tokenizer = PunktSentenceTokenizer()

    __MECAB_TOKEN_POS_SEPARATOR = random_string(length=16)  # for whatever reason tab doesn't work
    __MECAB_EOS_MARK = 'EOS'

    def __init__(self):
        """Initialize MeCab tokenizer."""
        mecab_dictionary_path = McJapaneseTokenizer._mecab_ipadic_neologd_path()

        try:
            self.__mecab = MeCab.Tagger(
                '--dicdir=%(dictionary_path)s '
                '--node-format=%%m%(token_pos_separator)s%%h\\n '
                '--eos-format=%(eos_mark)s\\n' % {
                    'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR,
                    'eos_mark': self.__MECAB_EOS_MARK,
                    'dictionary_path': mecab_dictionary_path,
                })
        except Exception as ex:
            raise McJapaneseTokenizerException("Unable to initialize MeCab: %s" % str(ex))

    @staticmethod
    def _mecab_ipadic_neologd_path() -> str:  # (protected and not private because used by the unit test)
        """Return path to mecab-ipadic-neologd dictionary installed on system."""
        mecab_dictionary_path = None
        candidate_paths = McJapaneseTokenizer.__MECAB_DICTIONARY_PATHS
        for candidate_path in candidate_paths:
            if os.path.isdir(candidate_path):
                if os.path.isfile(os.path.join(candidate_path, 'sys.dic')):
                    mecab_dictionary_path = candidate_path
                    break

        if mecab_dictionary_path is None:
            raise McJapaneseTokenizerException(
                "mecab-ipadic-neologd was not found in paths: %s" % str(candidate_paths))

        return mecab_dictionary_path

    def tokenize_text_to_sentences(self, text: str) -> list:
        """Tokenize Japanese text into sentences."""
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text to tokenize into sentences is None.")
            return []

        text = text.strip()
        if len(text) == 0:
            return []

        # First split Japanese text
        japanese_sentences = self.__japanese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in japanese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split("\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split("\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Japanese text
                    non_japanese_sentences = self.__non_japanese_sentence_tokenizer.tokenize(list_item)
                    sentences += non_japanese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]
        return sentences

    @staticmethod
    def _mecab_allowed_pos_ids() -> Dict[int, str]:
        """Return allowed MeCab part-of-speech IDs and their definitions from pos-id.def.

        Definitions don't do much in the language module itself, they're used by unit
        tests to verify that pos-id.def didn't change in some unexpected way and we're
        not missing out on newly defined POSes.
        """
        return {
            36: '名詞,サ変接続,*,*',  # noun-verbal
            38: '名詞,一般,*,*',  # noun
            40: '名詞,形容動詞語幹,*,*',  # adjectival nouns or quasi-adjectives
            41: '名詞,固有名詞,一般,*',  # proper nouns
            42: '名詞,固有名詞,人名,一般',  # proper noun, names of people
            43: '名詞,固有名詞,人名,姓',  # proper noun, surname
            44: '名詞,固有名詞,人名,名',  # proper noun, given name
            45: '名詞,固有名詞,組織,*',  # proper noun, organization
            46: '名詞,固有名詞,地域,一般',  # proper noun in general
            47: '名詞,固有名詞,地域,国',  # proper noun, country name
        }

    def tokenize_sentence_to_words(self, sentence: str) -> list:
        """Tokenize Japanese sentence into words.

        Removes punctuation and words that don't belong to part-of-speech whitelist.
        """
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()
        if len(sentence) == 0:
            return []

        parsed_text = self.__mecab.parse(sentence).strip()
        parsed_tokens = parsed_text.split("\n")
        allowed_pos_ids = self._mecab_allowed_pos_ids()

        words = []
        for parsed_token_line in parsed_tokens:
            if self.__MECAB_TOKEN_POS_SEPARATOR in parsed_token_line:
                primary_form_and_pos_number = parsed_token_line.split(self.__MECAB_TOKEN_POS_SEPARATOR)
                primary_form = primary_form_and_pos_number[0]
                pos_number = primary_form_and_pos_number[1]
                if pos_number.isdigit():
                    pos_number = int(pos_number)
                    if pos_number in allowed_pos_ids:
                        words.append(primary_form)
            else:
                # Ignore all the "EOS" stuff
                pass

        return words