def train_tokenizer():
    trainer = punkt.PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    print 'Training the tokenizer on SemEval'
    for semeval_file in semeval_files:
        print 'File', semeval_file
        try:
            with open(semeval_file, 'r') as f:
                st = []
                for line in f:
                    st += [line.strip()]
                text = read_visit_sem(st)
                trainer.train(text, finalize=False)
        except IOError:
            pass
    trainer2 = copy.deepcopy(trainer)
    trainer2.finalize_training()
    tokenizer = punkt.PunktSentenceTokenizer(trainer2.get_params())
    out = open("tokenizer.pk", "wb")
    pickle.dump(tokenizer, out, -1)
    out.close()
    tokenizer = None
    trainer2 = None
    print 'Wrote tokenizer.'
    print 'Training the tokenizer on MIMIC'
    for notes_file in subset(notes_files, 15):  # 15 random MIMIC files
        print 'File', notes_file
        try:
            with open(notes_file, 'r') as f:
                ct = 0
                st = []
                for line in f:
                    ct += 1
                    if ct % 50000 == 0:
                        print ct
                    if line.strip() == '</VISIT>':
                        text = read_visit(st)
                        trainer.train(text, finalize=False)
                        st = []
                    elif line.strip() != '<VISIT>':
                        st += [line.strip()]
        except IOError:
            continue
    trainer2 = copy.deepcopy(trainer)
    trainer2.finalize_training()
    tokenizer = punkt.PunktSentenceTokenizer(trainer2.get_params())
    out = open("tokenizer.pk", "wb")
    pickle.dump(tokenizer, out, -1)
    out.close()
    print 'Wrote tokenizer.'
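# Round-trip sketch (assumed, not from the original source): restore the
# tokenizer that train_tokenizer() pickled above and use it directly.
import pickle

with open("tokenizer.pk", "rb") as f:
    tokenizer = pickle.load(f)

# split a clinical-style note into sentences
print(tokenizer.tokenize("Pt given 5 mg i.v. push. Tolerated well."))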
def punkt_debug_decisions(self, input_text, n_sents, n_splits, lang_vars=None):
    tokenizer = punkt.PunktSentenceTokenizer()
    if lang_vars is not None:
        tokenizer._lang_vars = lang_vars
    assert len(tokenizer.tokenize(input_text)) == n_sents
    assert len(list(tokenizer.debug_decisions(input_text))) == n_splits
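# Quick illustration (not part of the test helper above): debug_decisions()
# yields one record per candidate sentence break, explaining why Punkt did or
# did not split there.
from nltk.tokenize import punkt

tok = punkt.PunktSentenceTokenizer()
for decision in tok.debug_decisions("Mr. Smith arrived. He sat down."):
    print(decision)  # a dict describing the break decision and its reason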
def getSentiment(text, language="en"):
    ct = pkt.PunktSentenceTokenizer(lang_vars=CustomLanguageVars())
    sentences = ct.tokenize(text)
    # sentences = [sentence.strip() for sentence in sentences if sentence != ""]
    documents = []
    for i, sentence in enumerate(sentences):
        documents.append({"language": language, "id": str(i), "text": sentence})
    r = requests.post(API_URL,
                      json={"documents": documents},
                      headers={"Ocp-Apim-Subscription-Key": API_KEY,
                               "content-type": "application/json"})
    if r.status_code != 200:
        print("Something went wrong: %s" % r.text)
        return [(r.text, 0)]
    results = json.loads(r.text)
    scores = [None] * len(sentences)
    for tup in results["documents"]:
        scores[int(tup["id"])] = tup["score"]
    return zip(sentences, scores)
def clean_and_tag(row, text_col, csv_writer):
    """
    Clean given text and write each sentence to CSV
    """
    # set up sentence splitter with custom parameters
    punkt_params = punkt.PunktParameters()
    # sentences are not split after the given abbreviations; {} creates a set literal
    punkt_params.abbrev_types = {'inc', 'inc ', '.tm', 'tm', 'no', 'i.v', 'drs', 'u.s'}
    # the tokenizer has to be unpickled, so better to do it once here than every time it is used
    sentence_splitter = punkt.PunktSentenceTokenizer(punkt_params)

    # clean up html tags
    plaintext = nltk.clean_html(row[text_col])

    # TODO coreference resolution to find more relevant sentences
    sentences = sentence_splitter.tokenize(plaintext)

    # maybe unnecessary defensiveness...
    if len(sentences) > 0:
        for s in sentences:
            # remove punctuation, still want to add original sentence to CSV though
            #no_punct = re.findall(r'[\w\$\xc2()-]+', s)
            #no_punct = ' '.join(no_punct)
            tokens = nltk.word_tokenize(s)
            tags = nltk.pos_tag(tokens)
            # TODO parse tree info, something to do with stemming?
            # write row to file for each sentence
            row.append(tags)
            csv_writer.writerow(row)
def __init__(self):
    """Constructor."""
    super(EnglishPunktTokenizer, self).__init__()
    self._sentence_tokenizer = punkt.PunktSentenceTokenizer()
    # note: PunktWordTokenizer exists only in older NLTK releases (removed in 3.0)
    self._word_tokenizer = punkt.PunktWordTokenizer()
def test_punkt_tokenize_no_custom_lang_vars(self):
    obj = punkt.PunktSentenceTokenizer()

    # We expect this text NOT to be split, as the Bengali full stop '।' is
    # not included in the default language vars' sentence-ending characters.
    sentences = u"উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
    expected = [sentences]  # the whole text comes back as one "sentence"

    assert obj.tokenize(sentences) == expected
def getSentencesWithWord(allNodes, noun):
    # Sentence fragmenter trained on real_estate (arbitrarily). Pass the file's
    # contents as train_text at construction time; in NLTK, a post-hoc call to
    # train() returns the parameters instead of storing them, and a bare path
    # string would be treated as the training text itself.
    with open("data/real_estate.txt") as f:
        trainer = punkt.PunktSentenceTokenizer(train_text=f.read())
    sentences = []
    for node in allNodes:
        name = node.get("name")
        tokens = trainer.tokenize(node.get("text"))
        for token in tokens:
            if noun in token:
                sentences.append(token)
    return sentences
def test_punkt_tokenize_custom_lang_vars(self):
    # Create LangVars including a full stop end character as used in Bengali
    class BengaliLanguageVars(punkt.PunktLanguageVars):
        sent_end_chars = ('.', '?', '!', '\u0964')

    obj = punkt.PunktSentenceTokenizer(lang_vars=BengaliLanguageVars())

    # We now expect the text to be split up into the individual sentences
    sentences = u"উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
    expected = [
        "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন।",
        "অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন।",
        "এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।",
    ]

    assert obj.tokenize(sentences) == expected
def set_up_tokenizer():
    """
    Set up sentence splitter with custom parameters and return to caller
    """
    punkt_params = punkt.PunktParameters()
    # sentences are not split after the given abbreviations; {} creates a set literal
    punkt_params.abbrev_types = {'inc', '.tm', 'tm', 'no', 'i.v', 'dr', 'drs', 'u.s', 'u.k',
                                 'ltd', 'vs', 'vol', 'corp', 'jan', 'feb', 'mar', 'apr',
                                 'jun', 'jul', 'aug', 'sep', 'sept', 'oct', 'nov', 'dec',
                                 'pm', 'p.m', 'am', 'a.m', 'mr', 'mrs', 'ms', 'i.e', 'e.g',
                                 # above is from reuters, below for eu-adr specifically
                                 'spp'}
    return punkt.PunktSentenceTokenizer(punkt_params)
def set_up_tokenizer():
    """
    Set up sentence splitter with custom parameters and return to caller
    """
    punkt_params = punkt.PunktParameters()
    # sentences are not split after the given abbreviations; {} creates a set literal
    punkt_params.abbrev_types = {'inc', '.tm', 'tm', 'no', 'i.v', 'dr', 'drs', 'u.s', 'u.k',
                                 'ltd', 'vs', 'vol', 'corp', 'jan', 'feb', 'mar', 'apr',
                                 'jun', 'jul', 'aug', 'sep', 'sept', 'oct', 'nov', 'dec',
                                 'pm', 'p.m', 'am', 'a.m', 'mr', 'mrs', 'ms', 'i.e'}
    # the tokenizer has to be unpickled, so better to do it once here than every time it is used
    return punkt.PunktSentenceTokenizer(punkt_params)
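# Illustrative usage sketch (not from the original source): with the abbreviation
# list above, tokens such as 'dr', 'p.m' and 'i.e' should no longer end sentences.
splitter = set_up_tokenizer()
text = "Dr. Smith saw the patient at 3 p.m. yesterday. Symptoms improved, i.e. less nausea."
for sent in splitter.tokenize(text):
    print(sent)
# expected: two sentences, with no breaks after 'Dr.', 'p.m.' or 'i.e.'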
def read_all(self):
    """A wrapper to read all abstracts and annotations"""
    self.train_txt = self.read_abstracts(self._train_text_path)
    self.dev_txt = self.read_abstracts(self._dev_text_path)
    self.eval_txt = self.read_abstracts(self._eval_text_path)
    print(f"Finished reading abstracts.\n"
          f"# of sentences read: Train: {len(self.train_txt)}, "
          f"Dev: {len(self.dev_txt)}, Eval: {len(self.eval_txt)}")

    self.train_anno = self.read_annotations(self._train_label_path)
    self.dev_anno = self.read_annotations(self._dev_label_path)
    self.eval_anno = self.read_annotations(self._eval_label_path)
    print("Finished reading annotations")

    self.punkt_tokenizer = punkt.PunktSentenceTokenizer(self.all_texts)
def __init__(self, text):
    '''
    Initialise the NLTK PunktSentenceTokenizer with our custom language
    variables for sentence splitting. Initialise a RegEx pattern for use
    in the NLTK RegexpTokenizer.

    :param text: string of raw continuous text
    '''
    self.text = text
    self.custom_tknzr = pkt.PunktSentenceTokenizer(
        lang_vars=CustomLanguageVars.CustomLanguageVars())
    self.pattern = r'''(?x)
        \b[a-zA-Z0-9._%+-]+@\s*?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b  # email addresses
        |\s+                     # any consecutive whitespace characters
        |\b(?:[a-zA-Z]\.)+\b     # abbreviations, e.g. U.S.A.
        |\d+(?:\.\d+)?%?         # numbers, incl. percentages
        |\w+(?:[-']\w+)*         # words with optional internal hyphen/apostrophe
        |[.,;:!?"'()\[\]]        # specific punctuation characters
        |\S+                     # any consecutive non-whitespace characters
        '''
    self.regex_tknzr = ret.RegexpTokenizer(self.pattern)
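# Sanity check for the pattern above (illustrative, not from the original
# source; it feeds the same verbose regex straight into NLTK's RegexpTokenizer;
# note that whitespace runs come back as tokens too, by design).
from nltk.tokenize import RegexpTokenizer

pattern = r'''(?x)
    \b[a-zA-Z0-9._%+-]+@\s*?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b  # email addresses
    |\s+                     # any consecutive whitespace characters
    |\b(?:[a-zA-Z]\.)+\b     # abbreviations, e.g. U.S.A.
    |\d+(?:\.\d+)?%?         # numbers, incl. percentages
    |\w+(?:[-']\w+)*         # words with optional internal hyphen/apostrophe
    |[.,;:!?"'()\[\]]        # specific punctuation characters
    |\S+                     # any consecutive non-whitespace characters
    '''
print(RegexpTokenizer(pattern).tokenize("Contact [email protected]: sales rose 4.2% in the U.S.A."))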
def punkt(self, text):
    """
    Sentence Segmentation using the Punkt Tokenizer

    Parameters
    ----------
    text : str
        A string (a bunch of sentences)

    Returns
    -------
    list
        A list of strings where each string is a single sentence
    """
    sent_splitter = punkt.PunktSentenceTokenizer()
    segmented_text = sent_splitter.tokenize(text)
    return segmented_text
def getContext(allNodes, links):
    """ Gets the sentence before and the sentence after each citation """
    # Sentence fragmenter trained on real_estate (arbitrarily). Pass the file's
    # contents as train_text at construction time; in NLTK, a post-hoc call to
    # train() returns the parameters instead of storing them, and a bare path
    # string would be treated as the training text itself.
    with open("data/real_estate.txt") as f:
        trainer = punkt.PunktSentenceTokenizer(train_text=f.read())
    for srcIndex, node in enumerate(allNodes):
        name = node.get("name")
        tokens = trainer.tokenize(node.get("text"))
        for link in links:
            if link.get("source") == srcIndex:
                target = allNodes[link.get("target")].get("name")
                for sentIndex, sentence in enumerate(tokens):
                    if target in sentence:
                        prevSent = ''
                        i = 1
                        # If the previous sentence was too short, add more
                        while len(prevSent.split(' ')) < 10:
                            if sentIndex - i < 0:  # don't wrap around to the end of the list
                                break
                            prevSent = tokens[sentIndex - i] + prevSent
                            i += 1
                        print 'PREV SENTENCE: ' + prevSent
                        print 'CURRENT SENTENCE: ' + sentence
                        nextSent = ''
                        i = 1  # reset the offset before walking forward
                        # Same with the next sentence
                        while len(nextSent.split(' ')) < 10:
                            try:
                                nextSent += tokens[sentIndex + i]
                            except IndexError:
                                break
                            i += 1
                        print 'NEXT SENTENCE: ' + nextSent
def __init__(self, name="", features=None):
    """ Initializes a feature set. """
    # Load various libraries / dictionaries if they haven't been
    if FeatureSet.pronouns is None:
        FeatureSet.pronouns = loadDictionary(PRONOUN_FILENAME)
    if FeatureSet.words is None:
        FeatureSet.words = loadDictionary(DICT_FILENAME)
    if FeatureSet.stop_words is None:
        FeatureSet.stop_words = loadDictionary(STOP_FILENAME)
    if FeatureSet.st is None:
        # FeatureSet.st = punkt.PunktSentenceTokenizer(gutenberg.raw(gutenberg.files()))
        FeatureSet.st = punkt.PunktSentenceTokenizer()
    if FeatureSet.wt is None:
        FeatureSet.wt = punkt.PunktWordTokenizer()

    # predefined set of features?
    if features is None:
        self.features = {}
    else:
        self.features = features

    # article name
    self.name = name
#!/usr/bin/env python3
# punktgen.py
import sys, pickle
from nltk.tokenize import punkt

if len(sys.argv) < 3:
    print("Usage: %s infile outfile" % sys.argv[0])
    sys.exit()

inf = open(sys.argv[1])
outf = open(sys.argv[2], 'wb')
# Train at construction time: in NLTK, PunktSentenceTokenizer.train() returns
# the learned parameters rather than storing them on the instance, so calling
# it after construction would leave the pickled tokenizer untrained.
tk = punkt.PunktSentenceTokenizer(train_text=inf.read())
pickle.dump(tk, outf)
outf.close()
inf.close()
print(sys.argv[2] + " saved.")
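#!/usr/bin/env python3
# Companion sketch (assumed, not from the original source): load the pickle
# that punktgen.py wrote and split some text with it.
import pickle, sys

with open(sys.argv[1], 'rb') as f:
    tk = pickle.load(f)

print(tk.tokenize("The model was trained above. Now it splits sentences."))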
#! /usr/bin/env python
# Author: Kapil Thadani ([email protected])

from __future__ import division, with_statement

from nltk.tokenize import punkt

punkt_splitter = punkt.PunktSentenceTokenizer()

# Suffixes observed to frequently cause incorrect splits
# (derived from observations in parsing WikiNews).
bad_suffixes = ['Mr.',
                'Ms.',
                'Mrs.',
                'Dr.',
                'Lt.',
                'Sgt.',
                'Maj.',
                # 'Col.',
                # 'Gen.',
                # 'Adm.',
                'Sen.',
                'Rep.',
                'U.S.',
                'U.N.',
                'U.K.',
                'E.U.',
                'Jan.',
                'Feb.',
                'Mar.',
                'Apr.',
                ]
def build():
    # this custom tokenizer doesn't handle abbrevs as well, need to add them:
    my_punkt_param = PunktParameters()
    my_punkt_param.abbrev_types = set([
        'dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'd.c', 'a.d', 'b.c',
        'r.s.v.p', 'p.s', 'a.s.a.p', 'e.t.a', 'd.i.y', 'r.i.p', 'e.g'
    ])
    # passing the PunktParameters object as train_text works because NLTK's
    # PunktSentenceTokenizer.train() returns non-string arguments unchanged
    my_sent_tokenizer = pkt.PunktSentenceTokenizer(lang_vars=MyLanguageVars(),
                                                   train_text=my_punkt_param)
    reader = MyCorpusReader("flask_query_api/api/query/files/", r".*\.txt",
                            para_block_reader=my_read_blankline_block,
                            sent_tokenizer=my_sent_tokenizer)
    # the "\n" (or, "%0A") passed from url
    # Try to add line into dictionary
    lineno = 1
    occurrences = []
    keys = []
    values = []
    for para in reader.paras():
        if not para:
            lineno += 1
        else:
            # keep track of the column position of the last sentence in the last paragraph
            col_pos = 0
            for sent in para:
                count_line = 0
                sent_no_linebreak = re.sub(r'\n|\r', ' ', sent)
                lines = sent.split('\n')
                for line in lines:
                    if count_line != 0:
                        lineno += 1
                        col_pos = len(line)
                        for length in range(1, len(line) + 1):
                            for start_pos in range(0, len(line) - length + 1):
                                key = line[start_pos:start_pos + length]
                                value = (lineno, start_pos + 1,
                                         start_pos + length + 1,
                                         bytearray(sent_no_linebreak.strip(), 'utf-8'))
                                keys.append(key)
                                values.append(value)  # (line, start, end, in_sentence)
                    else:
                        for length in range(1, len(line) + 1):
                            for start_pos in range(0, len(line) - length + 1):
                                key = line[start_pos:start_pos + length]
                                value = (lineno, start_pos + 1 + col_pos,
                                         start_pos + length + 1 + col_pos,
                                         bytearray(sent_no_linebreak.strip(), 'utf-8'))
                                keys.append(key)
                                values.append(value)  # (line, start, end, in_sentence)
                        col_pos += len(line)
                    count_line += 1
            lineno += 2
    value_format = ">LLL512s"
    data = zip(keys, values)
    dictionary = marisa_trie.RecordTrie(value_format, data)
    # dictionary = dawg.RecordDAWG(value_format, data)
    return dictionary
nltk.download('punkt')


class CustomLanguageVars(pkt.PunktLanguageVars):

    _period_context_fmt = r"""
        \S*                          # some word material
        %(SentEndChars)s             # a potential sentence ending
        \s*                          # <-- THIS is what I changed
        (?=(?P<after_tok>
            %(NonWord)s              # either other punctuation
            |
            (?P<next_tok>\S+)        # <-- Normally you would have \s+ here
        ))"""


sentence_tokenizer = pkt.PunktSentenceTokenizer(lang_vars=CustomLanguageVars())


@app.route('/analyze', methods=['POST'])
def analyze():
    review_text = request.form.get('review_text')
    if review_text and len(review_text) > 10000:
        return jsonify({
            "error": "Only reviews of less than 10K characters are supported."
        })
    elif review_text:
        review_lines = [
            process_line(l) for l in sentence_tokenizer.tokenize(review_text)
        ]
        # use first model for short reviews
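# Illustration of the tweak above (an assumption based on the comments in the
# pattern): moving the '\s*' before the lookahead makes each sentence keep its
# trailing whitespace, so joining the pieces reconstructs the original text.
tok = pkt.PunktSentenceTokenizer(lang_vars=CustomLanguageVars())
sents = tok.tokenize("First sentence.  Second one.\nThird.")
print(sents)           # e.g. ['First sentence.  ', 'Second one.\n', 'Third.']
print("".join(sents))  # concatenation should give back the original string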
    # tail of a word-frequency helper (the enclosing def is not shown here)
    sd = statistics.stdev(count_list)
    max_word_list = []
    for key, value in freq_tup:
        if value >= (mean_count + 1.5 * sd):
            max_word_list.append(key)
    print(max_word_list)
    return max_word_list
    print(mean_count)  # note: unreachable, follows the return above
    print(sd)


import nltk.tokenize.punkt as punk

punk_cl = punk.PunktSentenceTokenizer()
sent_break = punk_cl.sentences_from_text(clean_text_read)
# clean each sentence in place; locating sentences with list.index() while
# mutating the list misbehaves once duplicate sentences appear
for ind, sentence in enumerate(sent_break):
    sent_break[ind] = clean_word(sentence)

word_read = open('combined_biology_wordlist.txt', 'r')
bio_word = word_read.readlines()
clean_bio_word = []
for i in bio_word:
    new = i.strip()
    clean_bio_word.append(new)
def __init__(self):
    """Initialize the bad prefixes and suffixes surrounding splits."""
    self.punkt_splitter = punkt.PunktSentenceTokenizer()
    self.whitespace = set(WHITESPACE)
def clean_and_tag():
    """
    Create new CSV containing all relevant sentences
    """
    # set filepath to input
    basepath = os.path.dirname(__file__)
    file_in = 'data/reuters/press_releases/PR_drug_company_500.csv'
    file_in = os.path.abspath(os.path.join(basepath, '..', '..', file_in))
    file_out = os.path.abspath(os.path.join(basepath, '..', 'reuters/sentences_POS.csv'))

    # set up sentence splitter with custom parameters
    punkt_params = punkt.PunktParameters()
    # sentences are not split after the given abbreviations; {} creates a set literal
    punkt_params.abbrev_types = {'inc', 'inc ', '.tm', 'tm', 'no', 'i.v', 'drs', 'u.s'}
    # the tokenizer has to be unpickled, so better to do it once here than every time it is used
    sentence_splitter = punkt.PunktSentenceTokenizer(punkt_params)

    with open(file_in, 'rb') as csv_in:
        with open(file_out, 'wb') as csv_out:
            # TODO use dictionary reader to avoid using magic numbers for columns
            csv_reader = csv.reader(csv_in, delimiter=',')
            csv_writer = csv.writer(csv_out, delimiter=',')

            # write column headers on first row
            row = csv_reader.next()
            row.append('POS TAGS')
            csv_writer.writerow(row)

            for row in csv_reader:
                # use stdout to avoid spaces and newlines
                sys.stdout.write('.')
                # need to flush the buffer to display immediately
                sys.stdout.flush()

                # clean up html tags
                plaintext = nltk.clean_html(row[1])
                drug = row[3]
                company = row[5]
                src = row[0]

                # only consider texts containing both the drug and company
                if drug in plaintext and company in plaintext:
                    sentences = sentence_splitter.tokenize(plaintext)
                    # filter for only sentences mentioning drug, company or both
                    # TODO coreference resolution to find more relevant sentences
                    sentences = [s for s in sentences if drug in s or company in s]

                    if len(sentences) > 0:
                        for s in sentences:
                            # remove punctuation, still want to add original sentence to CSV though
                            no_punct = re.findall(r'[\w\$\xc2()-]+', s)
                            no_punct = ' '.join(no_punct)
                            tokens = nltk.word_tokenize(no_punct)
                            tags = nltk.pos_tag(tokens)
                            # TODO parse tree info, something to do with stemming?
                            # write row to file for each sentence
                            row.append(tags)
                            csv_writer.writerow([src, s, row[2], drug, row[4], company, tags])
def text_search_sequential(search_term):
    # this custom tokenizer doesn't handle abbrevs as well, need to add them:
    my_punkt_param = PunktParameters()
    my_punkt_param.abbrev_types = set([
        'dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'd.c', 'a.d', 'b.c',
        'r.s.v.p', 'p.s', 'a.s.a.p', 'e.t.a', 'd.i.y', 'r.i.p', 'e.g'
    ])
    my_sent_tokenizer = pkt.PunktSentenceTokenizer(lang_vars=MyLanguageVars(),
                                                   train_text=my_punkt_param)
    reader = MyCorpusReader("flask_query_api/api/query/files/", r".*\.txt",
                            para_block_reader=my_read_blankline_block,
                            sent_tokenizer=my_sent_tokenizer)
    # the "\n" (or, "%0A") passed from url
    search_two_lines = re.search(r'\n', search_term)
    lineno = 1
    occurrences = []
    for para in reader.paras():
        if not para:
            lineno += 1
        else:
            # keep track of the column position of the last sentence in the last paragraph
            col_pos = 0
            for sent in para:
                count_line = 0
                sent_no_linebreak = re.sub(r'\n|\r', ' ', sent)
                results_sent_list = []
                for m in re.finditer(re.escape(search_term), sent):
                    results_sent_list.append({
                        'start': m.start() + 1,
                        'end': m.end() + 1,
                        'in_sentence': sent_no_linebreak.strip()
                    })
                result = None
                if results_sent_list:
                    result = results_sent_list.pop(0)
                sent_pos = 0
                lines = sent.split('\n')
                for line in lines:
                    if count_line != 0:
                        lineno += 1
                        col_pos = len(line)
                        # use while because there may be multiple matches per line
                        while result and (sent_pos + len(line) + count_line > result['start']):
                            result['line'] = lineno
                            result['start'] -= (sent_pos + count_line)
                            result['end'] -= (sent_pos + count_line)
                            if search_two_lines:
                                result['end'] -= (len(line) + 1)
                            occurrences.append(result)
                            if results_sent_list:
                                result = results_sent_list.pop(0)
                            else:
                                result = None
                        sent_pos += len(line)
                    else:
                        # search found on current line
                        while result and (len(line) > result['start']):
                            result['line'] = lineno
                            result['start'] += col_pos
                            result['end'] += col_pos
                            if search_two_lines:
                                result['end'] -= (len(line) + 1)
                            occurrences.append(result)
                            if results_sent_list:
                                result = results_sent_list.pop(0)
                            else:
                                result = None
                        col_pos += len(line)
                        sent_pos += len(line)
                    count_line += 1
            lineno += 2
    response = {
        "query_text": search_term,
        "number_of_occurrences": len(occurrences),
        "occurences": occurrences
    }
    return response
def __init__(self, dataset: List[str]):
    self.text = "\n".join(dataset)
    self.tokenizer = punkt.PunktSentenceTokenizer(train_text=self.text)
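# Hedged usage sketch: the enclosing class is not shown in the source, so
# CorpusSentenceSplitter below is a hypothetical stand-in built around the
# __init__ above.
from typing import List

from nltk.tokenize import punkt


class CorpusSentenceSplitter:
    def __init__(self, dataset: List[str]):
        self.text = "\n".join(dataset)
        # training at construction adapts Punkt's abbreviation and
        # collocation statistics to the corpus itself
        self.tokenizer = punkt.PunktSentenceTokenizer(train_text=self.text)


docs = ["Dr. Ada wrote this. It has two sentences.", "A second document."]
splitter = CorpusSentenceSplitter(docs)
print(splitter.tokenizer.tokenize(docs[0]))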