import os
import re

import nltk


def GetData(path, to_lower=False, remove_proper_nouns=False):
    files = os.listdir(path)
    corpus = ""
    for file_name in files:
        with open(os.path.join(path, file_name), "r") as f:
            for line in f.readlines():
                corpus += line
    # Treat blank lines as sentence breaks, then flatten remaining newlines.
    corpus = re.sub(r"\n\n+", ". ", corpus)
    corpus = re.sub(r"\n", " ", corpus)
    tokenizer = nltk.PunktSentenceTokenizer()
    corpus = tokenizer.tokenize(corpus)
    data = []
    count_dict = {}
    for sentence in corpus:
        sentence = nltk.word_tokenize(sentence)
        if remove_proper_nouns:
            sentence = RemoveProperNouns(sentence)
        sentence = RemovePunctuations(sentence, to_lower=to_lower)
        if len(sentence) > 0:
            for word in sentence:
                if word in count_dict:
                    count_dict[word] += 1
                else:
                    count_dict[word] = 1
            data.append(sentence)
    return data, count_dict

def _tokenize_by_sentence(text, return_spans):
    tokenizer = nltk.PunktSentenceTokenizer()
    if return_spans:
        return tokenizer.span_tokenize(text)
    else:
        return tokenizer.tokenize(text)

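# The two modes above differ only in their return type: tokenize() yields the
# sentence strings themselves, while span_tokenize() yields (start, end)
# character offsets into the original text, which is useful when sentences
# must be mapped back to source positions. A minimal sketch (the sample text
# is invented; the offsets shown are illustrative):
text = "Punkt is unsupervised. It learns abbreviations from raw text."
print(_tokenize_by_sentence(text, return_spans=False))
# ['Punkt is unsupervised.', 'It learns abbreviations from raw text.']
for start, end in _tokenize_by_sentence(text, return_spans=True):
    print(start, end, repr(text[start:end]))
# 0 22 'Punkt is unsupervised.'
# 23 61 'It learns abbreviations from raw text.'
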
def file_read(self, input_text):
    # Read the file as UTF-8, replacing undecodable bytes instead of raising.
    with open(input_text, encoding='utf-8', errors='replace') as docfile:
        doc = docfile.read()
    # Sentence tokenizing
    doc = ' '.join(doc.strip().split('\n'))
    sentence_tokenizer = nltk.PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(doc)
    return sentences

def process_all_reviews(reviews):
    me.connect("data-mining")
    for review in reviews:
        # Passing the review text to the constructor trains a Punkt model
        # on that text before tokenizing it.
        custom_sent_tokenizer = nltk.PunktSentenceTokenizer(review.review)
        sentences = custom_sent_tokenizer.tokenize(review.review)
        sent_list = review_description_sentence_list(sentences)
        item = MongoGansevoortReview(corresponding_id=review.id,
                                     paragraph=sent_list,
                                     description=review.review,
                                     date=review.date)
        item.save()

def nltk_punkt_sentence_tokenizer(input_dict):
    """
    A sentence tokenizer which uses an unsupervised algorithm to build a model
    for abbreviation words, collocations, and words that start sentences, and
    then uses that model to find sentence boundaries. This approach has been
    shown to work well for many European languages.

    :param input_dict (default): {}
    :returns tokenizer: A Python dictionary containing the Tokenizer object and its arguments.
    """
    return {'tokenizer': {'object': nltk.PunktSentenceTokenizer()}}

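# The wrapper returns the tokenizer inside a nested dictionary, so callers
# unpack it before use. A short sketch (the sample input is invented):
widget_output = nltk_punkt_sentence_tokenizer({})
tok = widget_output['tokenizer']['object']
print(tok.tokenize("First sentence. Second sentence."))
# ['First sentence.', 'Second sentence.']
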
def get_important_sent(html_content):
    # Seed the tokenizer with known abbreviations so it does not split
    # sentences after tokens like "dr." or "vol.".
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set([
        'dr', 'vs', 'mr', 'mrs', 'miss', 'prof', 'inc', 'no', 'cap', 'nos',
        'vol', 'para', 'exh'
    ])
    tokenizer = nltk.PunktSentenceTokenizer(punkt_param)

    soup = BeautifulSoup(html_content, 'html.parser')
    content = soup.get_text()
    paras = get_paras(content)
    sents = []
    for para in paras:
        para_content = content[para[0]:para[1] + 1]
        for sent in tokenizer.span_tokenize(para_content):
            sents.append(para_content[sent[0]:sent[1] + 1])
    sents = np.array(sents)

    BertTokenizer = bert.bert_tokenization.FullTokenizer(VOCAB_FILE,
                                                         do_lower_case=True)
    input_ids, input_mask, segment_ids = convert_all_sentences(
        clean_data(sents), BertTokenizer)
    model = tf.keras.models.load_model("bert_model")
    input_X = {
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids
    }
    # Keep only the sentences the classifier scores above the 0.4 threshold.
    sents = sents[(model.predict(input_X, batch_size=1) > 0.4).reshape(-1, )]

    # Locate each important sentence (or the longest matchable prefix of it)
    # back in the raw HTML and mark it.
    for sent in sents:
        segs = filter(lambda seg: seg != "", sent.split("\n"))
        for seg in segs:
            seg = seg.replace("\xa0", " ")
            while seg:
                cur = len(seg)
                while True:
                    if not cur:
                        return html_content
                    cur_str = seg[:cur]
                    res = html_content.find(cur_str)
                    if res == -1:
                        cur -= 1
                    else:
                        html_content = add_important_class(
                            html_content, res, res + len(cur_str))
                        seg = seg[cur:]
                        break
    return html_content

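# The effect of abbrev_types can be seen in isolation: without the seeded
# abbreviations, an untrained Punkt tokenizer tends to treat "Dr." followed
# by a capitalized word as a sentence boundary. A minimal sketch (the sample
# sentence is invented; exact splits can vary across NLTK versions):
from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

sample = "Dr. Smith examined the sample. It was inert."
print(PunktSentenceTokenizer().tokenize(sample))
# ['Dr.', 'Smith examined the sample.', 'It was inert.']

params = PunktParameters()
params.abbrev_types = {'dr'}
print(PunktSentenceTokenizer(params).tokenize(sample))
# ['Dr. Smith examined the sample.', 'It was inert.']
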
def __init__(self, gid, gdata):
    assert self.TYPE
    self.gid = gid
    self.gdata = gdata
    self.tags = []
    self.media = []
    self.comments = []
    self.content = None
    self.title = None
    self.published = date_parse(self.gdata['published'])
    self.updated = date_parse(self.gdata['updated'])

    # Convert the HTML content to plain text so we can:
    #  * Determine if the page has content
    #  * Create a better title
    H2T = html2text.HTML2Text()
    H2T.ignore_links = True
    H2T.ignore_images = True
    H2T.ignore_emphasis = True
    H2T.body_width = 0
    txtcontent = H2T.handle(self.gdata['object']['content'])
    lines = [x for x in txtcontent.split('\n') if x.strip()]
    if not lines:
        self.has_content = False
        self.title = None
    else:
        # Take the first sentence as the title
        tokenizer = nltk.PunktSentenceTokenizer()
        sentences = tokenizer.tokenize(lines[0])
        self.title = sentences[0].strip()
        # If we just have a link, guess we don't have a title
        if self.title.startswith('http://') \
                or self.title.startswith('https://'):
            self.title = None
        self.has_content = bool(sentences[1:]) or bool(lines[1:])
    # FIXME: Should we strip the title from the content?
    self.content = self.gdata['object']['content']

def trainSentenceTokenizer():
    """
    Trains a custom sentence tokenizer using Punkt. At the moment it performs
    worse than the plain English one (most likely because there is not much
    training data).
    """
    collection = database["crawled-data"]
    text = ""
    for record in collection.find({ABSTRACT_DOCUMENT: {"$ne": None}}):
        text += record[ABSTRACT_DOCUMENT] + " "

    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    trainer.train(text)

    model = nltk.PunktSentenceTokenizer(trainer.get_params())
    with open("latvianPunkt2.pickle", mode='wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)

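# The pickled model can be loaded back later and used like any other
# tokenizer. A short sketch (the Latvian sample sentences are invented):
import pickle

with open("latvianPunkt2.pickle", mode='rb') as fin:
    tokenizer = pickle.load(fin)
print(tokenizer.tokenize("Pirmais teikums. Otrais teikums."))
# ['Pirmais teikums.', 'Otrais teikums.']
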
def tokenizeDocument(self, doc, tokenize='sentence', returnDoc=False):
    if tokenize == 'sentence':
        spt = nltk.PunktSentenceTokenizer()
        if isinstance(doc, (pd.DataFrame, pd.Series)):
            tokens = doc.apply(lambda row: spt.tokenize(' '.join(row))).values
        elif isinstance(doc, list):
            tokens = list()
            for each in doc:
                tokens.append(spt.tokenize(each))
        else:
            tokens = spt.tokenize(doc)
    elif tokenize == 'word':
        wpt = nltk.WordPunctTokenizer()
        if isinstance(doc, (pd.DataFrame, pd.Series)):
            tokens = doc.apply(lambda row: wpt.tokenize(' '.join(row))).values
        elif isinstance(doc, list):
            tokens = list()
            for each in doc:
                tokens.append(wpt.tokenize(each))
        else:
            tokens = wpt.tokenize(doc)

    preProcObj = TextPreprocessor()
    # Alternative pipelines:
    # final_tokens = preProcObj.preprocess_text(tokens, [preProcObj.removeStopWords, preProcObj.removeNumbers, preProcObj.removeEmptyString], strFlag=False)
    # final_tokens = preProcObj.preprocess_text(tokens, [preProcObj.lowercase, preProcObj.lemmatize, preProcObj.removePunctuation, preProcObj.removeEmptyString, preProcObj.removehypen], strFlag=False)
    final_tokens = preProcObj.preprocess_text(tokens, [preProcObj.lowercase],
                                              strFlag=False)
    if returnDoc:
        # Re-create the document from the filtered tokens.
        doc = ' '.join(final_tokens)
        return final_tokens, doc
    else:
        return final_tokens

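# Usage sketch -- `proc` stands in for a hypothetical instance of the class
# that defines tokenizeDocument; the exact output depends on TextPreprocessor,
# which is defined elsewhere and not shown here.
word_tokens = proc.tokenizeDocument("Hello, World!", tokenize='word')
# WordPunctTokenizer splits on word/punctuation boundaries, and the
# `lowercase` preprocessing step then lowercases the tokens:
# ['hello', ',', 'world', '!']
sent_tokens = proc.tokenizeDocument(["One. Two.", "Three."], tokenize='sentence')
# One token list per input document, e.g. [['one. two.' split], ['three.']]
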
def make_text_clear(text_in, synonymizer, morph):
    sentences = nltk.PunktSentenceTokenizer().tokenize(text_in.lower())
    text_items_in = []
    for sentence in sentences:
        for word in nltk.WordPunctTokenizer().tokenize(sentence):
            text_items_in.append(word)

    text_out = u''
    punct = set(string.punctuation)
    for item in text_items_in:
        if item not in punct:
            word_info = morph.parse(item)[0]
            norm = word_info.normal_form
            synonyms = synonymizer.synonymize(norm)
            print("-----------------")
            print("Word: %s" % norm)
            print("Synonyms:")
            for syn, freq in synonyms:
                print("%s : %s" % (syn, freq))
            print("-----------------")
            if len(synonyms) != 0:
                best_synonym = synonyms[0][0]
                if synonymizer.need_replace(norm, best_synonym):
                    # Carry the grammatical tags of the original word over to
                    # the synonym so it can be inflected to match.
                    grammemes = [
                        word_info.tag.POS, word_info.tag.aspect,
                        word_info.tag.case, word_info.tag.mood,
                        word_info.tag.number, word_info.tag.person,
                        word_info.tag.tense, word_info.tag.voice
                    ]
                    tag = set(gram for gram in grammemes if gram is not None)
                    syn_info = morph.parse(best_synonym)[0].inflect(tag)
                    if syn_info is not None:
                        text_out += syn_info.word
                        text_out += ' '
                        continue
        text_out += item
        text_out += ' '
    return text_out

def split_text(text):
    tokenizer = nltk.PunktSentenceTokenizer()
    return tokenizer.tokenize(text)

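# Quick usage check for the wrapper above (sample string invented):
print(split_text("One sentence. Another? A third!"))
# ['One sentence.', 'Another?', 'A third!']
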
import nltk

# Load two State of the Union speeches by George W. Bush: one to train the
# tokenizer, one to tokenize.
trainText = nltk.corpus.state_union.raw("2005-GWBush.txt")
sampleText = nltk.corpus.state_union.raw("2006-GWBush.txt")

# Punkt sentence tokeniser, trained on the 2005 speech.
punktSentenceTokeniser = nltk.PunktSentenceTokenizer(trainText)
sentTokens = punktSentenceTokeniser.tokenize(sampleText)


def processContent():
    try:
        for tokens in sentTokens:
            wordTokens = nltk.word_tokenize(tokens)
            tagged = nltk.pos_tag(wordTokens)
            # Passing binary=True to ne_chunk marks chunks simply as named
            # entities instead of classifying them as PERSON, GPE, MONEY, etc.
            # nameEnt = nltk.ne_chunk(tagged, binary=True)
            nameEnt = nltk.ne_chunk(tagged)
            nameEnt.draw()
    except Exception as e:
        print(str(e))


processContent()

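# draw() opens one Tkinter window per sentence, which is awkward for batch
# runs. A non-interactive sketch that prints the entities instead (assumes
# the NLTK chunker models are downloaded; labels depend on the model):
for tokens in sentTokens[:3]:
    tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(tokens)))
    for subtree in tree.subtrees():
        if subtree.label() != 'S':
            entity = ' '.join(word for word, tag in subtree.leaves())
            print(subtree.label(), entity)
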
from reviewsentencescore import maximalsentencenumber, sentencenumber, location, nw, \
    phraseindicator, reviewsentencescore


def sentenceimportance(ch, cr, rca):
    # Average the three component scores.
    return (ch + cr + rca) / 3


dbname = "data-mining"
connect(dbname)
ReviewSentences.drop_collection()
Sentence.drop_collection()

for index, rev in enumerate(GansevoortReview.objects):
    custom_sent_tokenizer = nltk.PunktSentenceTokenizer(rev.description)
    sentences = custom_sent_tokenizer.tokenize(rev.description)
    sent_list = review_description_sentence_list(sentences)
    maximal = maximalsentencenumber(sent_list)
    CH = ReviewHelpfulness.objects.get(reviewId=rev.id).value
    CR = ReviewRecency.objects.get(reviewId=rev.id).value
    try:
        JOIN = UserReviews.objects.get(name=rev.name).id
        RCA = ReviewAuthorRepresentativeness.objects.get(authorId=JOIN).value
    except DoesNotExist:
        RCA = None
    except MultipleObjectsReturned:
        RCA = None
        print("two items were returned")

def cut_text(full_text):
    list_of_sentences = nltk.PunktSentenceTokenizer().tokenize(full_text)
    sen_num = float(len(list_of_sentences))
    for s in list_of_sentences:
        cut_sentence(s)
    letter_sum = 0.0  # renamed from `sum` to avoid shadowing the builtin
    for word in all_words:
        letter_sum += len(word)  # total number of letters in the sentence
        all_words_in_text.append(word)
        lemmatise(word)
    sym_count = 0
    word5 = word6 = word7 = word8 = word9 = 0
    word10 = word11 = word12 = word13 = 0
    for w in all_words_in_text:
        sym_count += len(w)
        if len(w) > 4:
            word5 += 1
        if len(w) > 5:
            word6 += 1
        if len(w) > 6:
            word7 += 1
        if len(w) > 7:
            word8 += 1
        if len(w) > 8:
            word9 += 1
        if len(w) > 9:
            word10 += 1
        if len(w) > 10:
            word11 += 1
        if len(w) > 11:
            word12 += 1
        if len(w) > 12:
            word13 += 1
    final_list.append(sym_count / sen_num)
    final_list.append(sym_count / float(len(all_words_in_text)))
    final_list.append(word5 / float(len(all_words_in_text)))
    final_list.append(word6 / float(len(all_words_in_text)))
    final_list.append(word7 / float(len(all_words_in_text)))
    final_list.append(word8 / float(len(all_words_in_text)))
    final_list.append(word9 / float(len(all_words_in_text)))
    final_list.append(word10 / float(len(all_words_in_text)))
    final_list.append(word11 / float(len(all_words_in_text)))
    final_list.append(word12 / float(len(all_words_in_text)))
    final_list.append(word13 / float(len(all_words_in_text)))
    del all_words_in_text[:]
    count = 0
    for vow_num in sent_words_in_vowels:
        count += vow_num
    final_list.append(count / sen_num)
    del sent_words_in_vowels[:]
    summ = 0
    for number in sum_sen_length:
        summ += number
    w = 0
    vow3 = vow4 = vow5 = vow6 = 0
    for vow in all_words_in_vowels:
        w += vow
        if vow > 2:
            vow3 += 1
        if vow > 3:
            vow4 += 1
        if vow > 4:
            vow5 += 1
        if vow > 5:
            vow6 += 1
    words_in_sent = float(len(all_words))
    final_list.append(w / letter_sum)
    final_list.append(vow3 / words_in_sent)
    final_list.append(vow4 / words_in_sent)
    final_list.append(vow5 / words_in_sent)
    final_list.append(vow6 / words_in_sent)
    av_sen_length = summ / sen_num
    final_list.append(av_sen_length)
    av_word = letter_sum / len(all_words)
    final_list.append(av_word)
    final_list.append(letter_sum)
    final_list.append(len(all_words))
    finding(poses)
    # Feature order: [av vowels in sent, av vowels in text, % of word5 .. % of
    # word13, % of vow3 .. % of vow6, av len of words in sent, av len of words
    # in symbols, av # words in sent, av word len in words, text in symbols,
    # text in words, pos, pos, ..., pos]
    # Basically all the parameters here except the lexical-minimum check.
    print(final_list)
    del final_list[:], sum_sen_length[:], all_words[:], poses[:], all_words_in_vowels[:]

def cut_sentence(self, full_text):
    list_of_sentences = nltk.PunktSentenceTokenizer().tokenize(full_text)
    for s in list_of_sentences:
        self.cut_words(s)

import os

import nltk
import pandas as pd
from gensim import models

wvec = models.Word2Vec

MODEL_NAME = 'w2vmodel'

if os.path.exists(MODEL_NAME):
    model = wvec.load(MODEL_NAME)
else:
    # os.chdir('TrippyMain/AI')
    df_all = pd.read_pickle('../Warehouse/UdpRevFin.pkl')
    all_text = '. '.join(list(df_all['Text']))
    pk = nltk.PunktSentenceTokenizer()
    sentences = [nltk.word_tokenize(i) for i in pk.tokenize(all_text)]
    # `size` is the pre-gensim-4 name of the vector_size parameter.
    model = wvec(sentences, min_count=5, size=500, workers=3)
    model.save(MODEL_NAME)


def sim(a, b):
    return model.wv.similarity(a, b)

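# Usage sketch for the similarity helper. The example words are invented and
# must occur at least min_count times in the training corpus to be in the
# vocabulary; the scores shown are purely illustrative.
print(sim('hotel', 'room'))     # e.g. 0.62 -- cosine similarity in [-1, 1]
print(sim('hotel', 'weather'))  # typically lower for unrelated terms
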
    # Tail of a paragraph-splitting helper (its def line is missing from this
    # excerpt): turn the paragraph-marker matches into (start, end) spans.
    paras, last_start = [], 0
    for sign in all_sign:
        paras.append((last_start, sign.span()[0] - 1))
        last_start = sign.span()[1]
    paras.append((last_start, len(content) - 1))
    return paras


RAW_DATA_FILE = 'raw/drug_labeled_20200408.json'
OUTPUT_DATA_FILE = 'data/drug_features_classification.csv'

# Seed the tokenizer with abbreviations common in legal text so they are not
# mistaken for sentence endings.
punkt_param = PunktParameters()
punkt_param.abbrev_types = set([
    'dr', 'vs', 'mr', 'mrs', 'miss', 'prof', 'inc', 'no', 'cap', 'nos',
    'vol', 'para', 'exh'
])
tokenizer = nltk.PunktSentenceTokenizer(punkt_param)

with open(RAW_DATA_FILE, 'r', encoding='UTF-8') as input_f:
    lines = input_f.readlines()

with open(OUTPUT_DATA_FILE, 'w', encoding='UTF-8') as output_f:
    csv_writer = csv.writer(output_f)
    csv_writer.writerow(['neutral_citation', 'sentence_id', 'sentence'] + labels)
    with tqdm(total=len(lines), unit_scale=True) as pbar:
        for document_count, line in enumerate(lines):
            data = json.loads(line)
            neutral_citation, annotations = extract_annotations(data["annotation"])
            if not neutral_citation:
                neutral_citation = document_count
            paras = get_paragraphs(data["content"])
            count = 0

def extract_case_refs(self, referenced_by: Case, content: str, key: int = 0):
    """
    Examples of the citation formats this needs to handle:

    BVerwG, Urteil vom 20. Februar 2013, - 10 C 23.12 -
    BVerwG, Urteil vom 27. April 2010 - 10 C 5.09 -
    BVerfG, Beschluss vom 10.07.1989, - 2 BvR 502, 1000, 961/86 -
    BVerwG, Urteil vom 20.02.2013, - 10 C 23.12 -
    OVG Nordrhein-Westfalen, Urteil vom 21.2.2017, - 14 A 2316/16.A -
    OVG Nordrhein-Westfalen, Urteil vom 29.10.2012 – 2 A 723/11 -
    OVG NRW, Urteil vom 14.08.2013 – 1 A 1481/10, Rn. 81 –
    OVG Saarland, Urteil vom 2.2.2017, - 2 A 515/16 -
    OVG Rheinland-Pfalz, Urteil vom 16.12.2016, -1A 10922/16 -
    Bayrischer VGH, Urteil vom 12.12.16, - 21 B 16.30364
    Bayrischer VGH, Urteil vom 12.12.2016, - 21 B 16.30372 -
    VG Minden, Urteil vom 22.12.2016, - 1 K 5137/16.A -
    VG Gießen, Urteil vom 23.11.2016, - 2 K 969/16.GI.A
    VG Düsseldorf, Urteil vom 24.1.2017, - 17 K 9400/16.A
    VG Köln, Beschluss vom 25.03.2013 – 23 L 287/12 -
    OVG Schleswig, Beschluss vom 20.07.2006 – 1 MB 13/06 -
    Schleswig-Holsteinisches Verwaltungsgericht, Urteil vom 05.082014 – 11 A 7/14, Rn. 37 –
    Entscheidung des Bundesverwaltungsgerichts vom 24.01.2012 (2 C 24/10)
    EuGH Urteil vom 25.07.2002 – C-459/99 -

    TODO all court codes + case types
    - look for (Entscheidung|Beschluss|Urteil)
    - +/- 50 chars
    - find VG|OVG|Verwaltungsgericht|BVerwG|...
    - find location
    - find file number
    - ... or (...)

    TODO Sentence tokenizer
    - remove all "special endings" \\s([0-9]+|[a-zA-Z]|sog|Abs)\\.
    - remove all dates

    :param key:
    :param content:
    :return:
    """
    refs = []
    original = content
    text = content

    # Clean up the text, replacing all chars that can lead to wrong sentence
    # splits.
    text = self.clean_text_for_tokenizer(text)

    # TODO currently unused custom abbreviation set
    from nltk.tokenize.punkt import PunktParameters
    punkt_param = PunktParameters()
    abbreviation = ['1', 'e', 'i']
    punkt_param.abbrev_types = set(abbreviation)
    # tokenizer = PunktSentenceTokenizer(punkt_param)

    offset = 0
    marker_offset = 0
    for start, end in nltk.PunktSentenceTokenizer().span_tokenize(text):
        length = end - start
        sentence = text[start:end]
        original_sentence = original[start:end]
        matches = list(re.finditer(r'\((.*?)\)', original_sentence))
        logger.debug('Sentence (matches: %i): %s' % (len(matches), sentence))
        logger.debug('Sentence (original): %s' % original_sentence)

        for m in matches:
            logger.debug('Full sentence // UNMANGLED: ' + original_sentence)
            # Everything inside the parentheses, split on commas.
            focus_all = original_sentence[m.start(1):m.end(1)].split(',')
            logger.debug('In parentheses = %s' % focus_all)

            for focus in focus_all:
                # Search for a file number
                fns_matches = list(
                    re.finditer(self.get_file_number_regex(), focus))
                if len(fns_matches) == 1:
                    fn = fns_matches[0].group(0)
                    pos = fns_matches[0].start(0)
                    logger.debug('File number found: %s' % fn)

                    # Find court
                    court_name = None
                    court_pos = 999999
                    court_matches = list(
                        re.finditer(self.get_court_name_regex(),
                                    original_sentence))
                    if len(court_matches) == 1:
                        court_name = court_matches[0].group(0)
                    elif len(court_matches) > 0:
                        # Multiple results: choose the one closest to the
                        # file number.
                        for cm in court_matches:
                            if court_name is None or abs(
                                    pos - cm.start()) < court_pos:
                                court_name = cm.group(0)
                                court_pos = abs(pos - cm.start())
                    else:
                        # No court found; guess by search query.
                        # Probably the court of the current case? Test for
                        # "die Kammer".
                        pass

                    # Find date
                    # TODO
                    logger.debug('Filename = %s' % fn)
                    logger.debug('Courtname = %s' % court_name)

                    ref_start = start + m.start(1) + pos
                    ref_end = ref_start + len(fn)

                    if court_name is None:
                        # TODO Probably the same court as the current case
                        # (use case validation).
                        logger.error(
                            AmbiguousReferenceError(
                                'No court name found - FN: %s' % fn))
                        continue

                    ref_ids = [{
                        'type': 'case',
                        'ecli': 'ecli://de/' + slugify(court_name) + '/'
                                + slugify(fn.replace('/', '-'))
                    }]
                    # TODO maintain order for case+law refs
                    ref = CaseReferenceMarker(referenced_by=referenced_by,
                                              text=focus,
                                              start=ref_start,
                                              end=ref_end,
                                              line=0)  # TODO line number
                    ref.set_uuid()
                    ref.set_references(ref_ids)
                    refs.append(ref)
                    content, marker_offset = ref.replace_content(
                        content, marker_offset, key + len(refs))
                elif len(fns_matches) > 1:
                    logger.warning('Multiple file numbers found: %s'
                                   % fns_matches)
                else:
                    logger.debug('No file number found')
    return content, refs